1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * NVMe over PCIe transport
36 */
37
38 #include <sys/mman.h>
39 #include <signal.h>
40 #include <sys/syscall.h>
41 #include <sys/types.h>
42
43 #include "nvme_internal.h"
44 #include "nvme_uevent.h"
45
46 #define NVME_ADMIN_ENTRIES (128)
47 #define NVME_ADMIN_TRACKERS (64)
48
49 /*
50 * NVME_IO_ENTRIES defines the size of an I/O qpair's submission and completion
51 * queues, while NVME_IO_TRACKERS defines the maximum number of I/O that we
52 * will allow outstanding on an I/O qpair at any time. The only advantage in
53 * having IO_ENTRIES > IO_TRACKERS is for debugging purposes - when dumping
54 * the contents of the submission and completion queues, it will show a longer
55 * history of data.
56 */
57 #define NVME_IO_ENTRIES (256)
58 #define NVME_IO_TRACKERS (128)
59
60 /*
61 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
62 * segment.
63 */
64 #define NVME_MAX_SGL_DESCRIPTORS (253)
65
66 #define NVME_MAX_PRP_LIST_ENTRIES (506)
67
68 /*
69 * For commands requiring more than 2 PRP entries, one PRP will be
70 * embedded in the command (prp1), and the rest of the PRP entries
71 * will be in a list pointed to by the command (prp2). This means
72 * that the real max number of PRP entries we support is 506+1, which
73 * results in a max xfer size of 506*PAGE_SIZE.
74 */
75 #define NVME_MAX_XFER_SIZE (NVME_MAX_PRP_LIST_ENTRIES * PAGE_SIZE)
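/*
 * Added note (not in the upstream file): with the common 4 KiB PAGE_SIZE this
 * works out to 506 * 4096 = 2,072,576 bytes, i.e. just under 2 MiB of payload
 * per command.
 */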
76
77 struct nvme_pcie_enum_ctx {
78 spdk_nvme_probe_cb probe_cb;
79 void *cb_ctx;
80 struct spdk_pci_addr pci_addr;
81 bool has_pci_addr;
82 };
83
84 /* PCIe transport extensions for spdk_nvme_ctrlr */
85 struct nvme_pcie_ctrlr {
86 struct spdk_nvme_ctrlr ctrlr;
87
88 /** NVMe MMIO register space */
89 volatile struct spdk_nvme_registers *regs;
90
91 /** NVMe MMIO register size */
92 uint64_t regs_size;
93
94 /* BAR mapping address which contains controller memory buffer */
95 void *cmb_bar_virt_addr;
96
97 /* BAR physical address which contains controller memory buffer */
98 uint64_t cmb_bar_phys_addr;
99
100 /* Controller memory buffer size in Bytes */
101 uint64_t cmb_size;
102
103 /* Current offset of controller memory buffer */
104 uint64_t cmb_current_offset;
105
106 /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
107 uint32_t doorbell_stride_u32;
108
109 /* Opaque handle to associated PCI device. */
110 struct spdk_pci_device *devhandle;
111
112 /* Flag to indicate the MMIO register has been remapped */
113 bool is_remapped;
114 };
115
116 struct nvme_tracker {
117 TAILQ_ENTRY(nvme_tracker) tq_list;
118
119 struct nvme_request *req;
120 uint16_t cid;
121
122 uint16_t rsvd1: 14;
123 uint16_t timed_out: 1;
124 uint16_t active: 1;
125
126 uint32_t rsvd2;
127
128 /* The value of spdk_get_ticks() when the tracker was submitted to the hardware. */
129 uint64_t submit_tick;
130
131 uint64_t prp_sgl_bus_addr;
132
133 union {
134 uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
135 struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
136 } u;
137 };
138 /*
139 * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
140 * and so that there is no padding required to meet alignment requirements.
141 */
142 SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
143 SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
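/*
 * Added note: the 506/253 limits above follow from the 4 KiB tracker size.
 * On a typical LP64 build the fields ahead of the union occupy 48 bytes
 * (16-byte TAILQ_ENTRY, 8-byte req pointer, 2 + 2 + 4 bytes for cid, flags
 * and rsvd2, plus two 8-byte fields), leaving 4096 - 48 = 4048 bytes: exactly
 * 506 8-byte PRP entries or 253 16-byte SGL descriptors.
 */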
144
145 /* PCIe transport extensions for spdk_nvme_qpair */
146 struct nvme_pcie_qpair {
147 /* Submission queue tail doorbell */
148 volatile uint32_t *sq_tdbl;
149
150 /* Completion queue head doorbell */
151 volatile uint32_t *cq_hdbl;
152
153 /* Submission queue */
154 struct spdk_nvme_cmd *cmd;
155
156 /* Completion queue */
157 struct spdk_nvme_cpl *cpl;
158
159 TAILQ_HEAD(, nvme_tracker) free_tr;
160 TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
161
162 /* Array of trackers indexed by command ID. */
163 struct nvme_tracker *tr;
164
165 uint16_t num_entries;
166
167 uint16_t sq_tail;
168 uint16_t cq_head;
169
170 uint8_t phase;
171
172 bool is_enabled;
173
174 /*
175 * Base qpair structure.
176 * This is located after the hot data in this structure so that the important parts of
177 * nvme_pcie_qpair are in the same cache line.
178 */
179 struct spdk_nvme_qpair qpair;
180
181 /*
182 * Fields below this point should not be touched on the normal I/O path.
183 */
184
185 bool sq_in_cmb;
186
187 uint64_t cmd_bus_addr;
188 uint64_t cpl_bus_addr;
189 };
190
191 static int nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx,
192 struct spdk_pci_addr *pci_addr);
193 static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair);
194 static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
195
196 __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
197 static volatile uint16_t g_signal_lock;
198 static bool g_sigset = false;
199 static int hotplug_fd = -1;
200
201 static void
202 nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
203 {
204 void *map_address;
205
206 if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) {
207 return;
208 }
209
210 assert(g_thread_mmio_ctrlr != NULL);
211
212 if (!g_thread_mmio_ctrlr->is_remapped) {
213 map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
214 PROT_READ | PROT_WRITE,
215 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
216 if (map_address == MAP_FAILED) {
217 SPDK_ERRLOG("mmap failed\n");
218 g_signal_lock = 0;
219 return;
220 }
221 memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
222 g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
223 g_thread_mmio_ctrlr->is_remapped = true;
224 }
225 g_signal_lock = 0;
226 return;
227 }
228
229 static void
230 nvme_pcie_ctrlr_setup_signal(void)
231 {
232 struct sigaction sa;
233
234 sa.sa_sigaction = nvme_sigbus_fault_sighandler;
235 sigemptyset(&sa.sa_mask);
236 sa.sa_flags = SA_SIGINFO;
237 sigaction(SIGBUS, &sa, NULL);
238 }
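/*
 * Added note: if the device is surprise-removed, any access to its MMIO BAR
 * raises SIGBUS. The handler above swaps the register mapping of the
 * controller recorded in g_thread_mmio_ctrlr (set around every MMIO access
 * below) for an anonymous page filled with 0xFF, so the faulting access
 * completes and later register reads return all ones - the condition that
 * nvme_pcie_ctrlr_get_reg_4/8 detect with their ~(*value) == 0 checks.
 */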
239
240 static int
241 _nvme_pcie_hotplug_monitor(void *cb_ctx, spdk_nvme_probe_cb probe_cb,
242 spdk_nvme_remove_cb remove_cb)
243 {
244 struct spdk_nvme_ctrlr *ctrlr;
245 struct spdk_uevent event;
246 struct spdk_pci_addr pci_addr;
247
248 while (spdk_get_uevent(hotplug_fd, &event) > 0) {
249 if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO) {
250 if (event.action == SPDK_NVME_UEVENT_ADD) {
251 SPDK_TRACELOG(SPDK_TRACE_NVME, "add nvme address: %s\n",
252 event.traddr);
253 if (spdk_process_is_primary()) {
254 if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
255 nvme_pcie_ctrlr_attach(probe_cb, cb_ctx, &pci_addr);
256 }
257 }
258 } else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
259 bool in_list = false;
260
261 TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->attached_ctrlrs, tailq) {
262 if (strcmp(event.traddr, ctrlr->trid.traddr) == 0) {
263 in_list = true;
264 break;
265 }
266 }
267 if (in_list == false) {
268 return 0;
269 }
270 SPDK_TRACELOG(SPDK_TRACE_NVME, "remove nvme address: %s\n",
271 event.traddr);
272
273 nvme_ctrlr_fail(ctrlr, true);
274
275 /* get the user app to clean up and stop I/O */
276 if (remove_cb) {
277 nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
278 remove_cb(cb_ctx, ctrlr);
279 nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
280 }
281 }
282 }
283 }
284 return 0;
285 }
286
287 static inline struct nvme_pcie_ctrlr *
288 nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
289 {
290 assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
291 return (struct nvme_pcie_ctrlr *)((uintptr_t)ctrlr - offsetof(struct nvme_pcie_ctrlr, ctrlr));
292 }
293
294 static inline struct nvme_pcie_qpair *
295 nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
296 {
297 assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
298 return (struct nvme_pcie_qpair *)((uintptr_t)qpair - offsetof(struct nvme_pcie_qpair, qpair));
299 }
300
301 static volatile void *
302 nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
303 {
304 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
305
306 return (volatile void *)((uintptr_t)pctrlr->regs + offset);
307 }
308
309 int
310 nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
311 {
312 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
313
314 assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
315 g_thread_mmio_ctrlr = pctrlr;
316 spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
317 g_thread_mmio_ctrlr = NULL;
318 return 0;
319 }
320
321 int
322 nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
323 {
324 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
325
326 assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
327 g_thread_mmio_ctrlr = pctrlr;
328 spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
329 g_thread_mmio_ctrlr = NULL;
330 return 0;
331 }
332
333 int
334 nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
335 {
336 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
337
338 assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
339 assert(value != NULL);
340 g_thread_mmio_ctrlr = pctrlr;
341 *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
342 g_thread_mmio_ctrlr = NULL;
343 if (~(*value) == 0) {
344 return -1;
345 }
346
347 return 0;
348 }
349
350 int
351 nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
352 {
353 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
354
355 assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
356 assert(value != NULL);
357 g_thread_mmio_ctrlr = pctrlr;
358 *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
359 g_thread_mmio_ctrlr = NULL;
360 if (~(*value) == 0) {
361 return -1;
362 }
363
364 return 0;
365 }
366
367 static int
368 nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
369 {
370 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
371 value);
372 }
373
374 static int
375 nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
376 {
377 return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
378 value);
379 }
380
381 static int
382 nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
383 {
384 return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
385 aqa->raw);
386 }
387
388 static int
389 nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
390 {
391 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
392 &cmbloc->raw);
393 }
394
395 static int
396 nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
397 {
398 return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
399 &cmbsz->raw);
400 }
401
402 uint32_t
403 nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
404 {
405 return NVME_MAX_XFER_SIZE;
406 }
407
408 uint32_t
409 nvme_pcie_ctrlr_get_max_io_queue_size(struct spdk_nvme_ctrlr *ctrlr)
410 {
411 return NVME_IO_ENTRIES;
412 }
413
414 static void
415 nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
416 {
417 int rc;
418 void *addr;
419 uint32_t bir;
420 union spdk_nvme_cmbsz_register cmbsz;
421 union spdk_nvme_cmbloc_register cmbloc;
422 uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
423
424 if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
425 nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
426 SPDK_ERRLOG("get registers failed\n");
427 goto exit;
428 }
429
430 if (!cmbsz.bits.sz)
431 goto exit;
432
433 bir = cmbloc.bits.bir;
434 /* Values 0, 2, 3, 4 and 5 are valid for BAR */
435 if (bir > 5 || bir == 1)
436 goto exit;
437
438 /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
439 unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
440 /* controller memory buffer size in Bytes */
441 size = unit_size * cmbsz.bits.sz;
442 /* controller memory buffer offset from BAR in Bytes */
443 offset = unit_size * cmbloc.bits.ofst;
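	/*
	 * Added example: CMBSZ.SZU selects the granularity as 4 KiB << (4 * szu),
	 * so szu = 0 means 4 KiB units and szu = 2 means 1 MiB units. With
	 * szu = 2 and sz = 16 the buffer is 16 MiB; cmbloc.bits.ofst is
	 * expressed in the same units.
	 */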
444
445 rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
446 &bar_phys_addr, &bar_size);
447 if ((rc != 0) || addr == NULL) {
448 goto exit;
449 }
450
451 if (offset > bar_size) {
452 goto exit;
453 }
454
455 if (size > bar_size - offset) {
456 goto exit;
457 }
458
459 pctrlr->cmb_bar_virt_addr = addr;
460 pctrlr->cmb_bar_phys_addr = bar_phys_addr;
461 pctrlr->cmb_size = size;
462 pctrlr->cmb_current_offset = offset;
463
464 if (!cmbsz.bits.sqs) {
465 pctrlr->ctrlr.opts.use_cmb_sqs = false;
466 }
467
468 return;
469 exit:
470 pctrlr->cmb_bar_virt_addr = NULL;
471 pctrlr->ctrlr.opts.use_cmb_sqs = false;
472 return;
473 }
474
475 static int
476 nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
477 {
478 int rc = 0;
479 union spdk_nvme_cmbloc_register cmbloc;
480 void *addr = pctrlr->cmb_bar_virt_addr;
481
482 if (addr) {
483 if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
484 SPDK_ERRLOG("get_cmbloc() failed\n");
485 return -EIO;
486 }
487 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
488 }
489 return rc;
490 }
491
492 static int
493 nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
494 uint64_t *offset)
495 {
496 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
497 uint64_t round_offset;
498
499 round_offset = pctrlr->cmb_current_offset;
500 round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
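	/*
	 * Added example: this rounds the current offset up to the requested
	 * alignment (assumed to be a power of two), e.g. 0x1234 rounded to a
	 * 0x1000 boundary becomes 0x2000.
	 */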
501
502 if (round_offset + length > pctrlr->cmb_size)
503 return -1;
504
505 *offset = round_offset;
506 pctrlr->cmb_current_offset = round_offset + length;
507
508 return 0;
509 }
510
511 static int
512 nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
513 {
514 int rc;
515 void *addr;
516 uint64_t phys_addr, size;
517
518 rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
519 &phys_addr, &size);
520 pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
521 if ((pctrlr->regs == NULL) || (rc != 0)) {
522 SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
523 rc, pctrlr->regs);
524 return -1;
525 }
526
527 pctrlr->regs_size = size;
528 nvme_pcie_ctrlr_map_cmb(pctrlr);
529
530 return 0;
531 }
532
533 static int
534 nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
535 {
536 int rc = 0;
537 void *addr = (void *)pctrlr->regs;
538
539 if (pctrlr->ctrlr.is_removed) {
540 return rc;
541 }
542
543 rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
544 if (rc != 0) {
545 SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
546 return -1;
547 }
548
549 if (addr) {
550 /* NOTE: addr may have been remapped here. We're relying on DPDK to call
551 * munmap internally.
552 */
553 rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
554 }
555 return rc;
556 }
557
558 static int
559 nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
560 {
561 struct nvme_pcie_qpair *pqpair;
562 int rc;
563
564 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL);
565 if (pqpair == NULL) {
566 return -ENOMEM;
567 }
568
569 pqpair->num_entries = NVME_ADMIN_ENTRIES;
570
571 ctrlr->adminq = &pqpair->qpair;
572
573 rc = nvme_qpair_init(ctrlr->adminq,
574 0, /* qpair ID */
575 ctrlr,
576 SPDK_NVME_QPRIO_URGENT,
577 NVME_ADMIN_ENTRIES);
578 if (rc != 0) {
579 return rc;
580 }
581
582 return nvme_pcie_qpair_construct(ctrlr->adminq);
583 }
584
585 /* This function must only be called while holding g_spdk_nvme_driver->lock */
586 static int
587 pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
588 {
589 struct spdk_nvme_transport_id trid = {};
590 struct nvme_pcie_enum_ctx *enum_ctx = ctx;
591 struct spdk_nvme_ctrlr *ctrlr;
592 int rc = 0;
593 struct spdk_pci_addr pci_addr;
594
595 pci_addr = spdk_pci_device_get_addr(pci_dev);
596
597 trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
598 spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
599
600 /* Verify that this controller is not already attached */
601 TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->attached_ctrlrs, tailq) {
602 * NOTE: In a multi-process environment the device handle can differ from
603 * one process to another, so we compare by BDF to determine whether it is the
604 * same controller.
605 */
606 if (strcmp(trid.traddr, ctrlr->trid.traddr) == 0) {
607 if (!spdk_process_is_primary()) {
608 rc = nvme_ctrlr_add_process(ctrlr, pci_dev);
609 }
610 return rc;
611 }
612 }
613
614 /* check whether the user passed a specific pci_addr */
615 if (enum_ctx->has_pci_addr &&
616 (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
617 return 1;
618 }
619
620 return nvme_ctrlr_probe(&trid, pci_dev,
621 enum_ctx->probe_cb, enum_ctx->cb_ctx);
622 }
623
624 int
625 nvme_pcie_ctrlr_scan(const struct spdk_nvme_transport_id *trid,
626 void *cb_ctx,
627 spdk_nvme_probe_cb probe_cb,
628 spdk_nvme_remove_cb remove_cb)
629 {
630 struct nvme_pcie_enum_ctx enum_ctx = {};
631
632 enum_ctx.probe_cb = probe_cb;
633 enum_ctx.cb_ctx = cb_ctx;
634
635 if (strlen(trid->traddr) != 0) {
636 if (spdk_pci_addr_parse(&enum_ctx.pci_addr, trid->traddr)) {
637 return -1;
638 }
639 enum_ctx.has_pci_addr = true;
640 }
641
642 if (hotplug_fd < 0) {
643 hotplug_fd = spdk_uevent_connect();
644 if (hotplug_fd < 0) {
645 SPDK_TRACELOG(SPDK_TRACE_NVME, "Failed to open uevent netlink socket\n");
646 }
647 } else {
648 _nvme_pcie_hotplug_monitor(cb_ctx, probe_cb, remove_cb);
649 }
650
651 if (enum_ctx.has_pci_addr == false) {
652 return spdk_pci_nvme_enumerate(pcie_nvme_enum_cb, &enum_ctx);
653 } else {
654 return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
655 }
656 }
657
658 static int
659 nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx, struct spdk_pci_addr *pci_addr)
660 {
661 struct nvme_pcie_enum_ctx enum_ctx;
662
663 enum_ctx.probe_cb = probe_cb;
664 enum_ctx.cb_ctx = cb_ctx;
665
666 return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, pci_addr);
667 }
668
669 struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
670 const struct spdk_nvme_ctrlr_opts *opts,
671 void *devhandle)
672 {
673 struct spdk_pci_device *pci_dev = devhandle;
674 struct nvme_pcie_ctrlr *pctrlr;
675 union spdk_nvme_cap_register cap;
676 uint32_t cmd_reg;
677 int rc;
678 struct spdk_pci_id pci_id;
679
680 pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL);
681 if (pctrlr == NULL) {
682 SPDK_ERRLOG("could not allocate ctrlr\n");
683 return NULL;
684 }
685
686 pctrlr->is_remapped = false;
687 pctrlr->ctrlr.is_removed = false;
688 pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
689 pctrlr->devhandle = devhandle;
690 pctrlr->ctrlr.opts = *opts;
691 memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));
692
693 rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
694 if (rc != 0) {
695 spdk_free(pctrlr);
696 return NULL;
697 }
698
699 /* Enable PCI busmaster and disable INTx */
700 spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
701 cmd_reg |= 0x404;
702 spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
703
704 if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
705 SPDK_ERRLOG("get_cap() failed\n");
706 spdk_free(pctrlr);
707 return NULL;
708 }
709
710 nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap);
711
712 /* Doorbell stride is 2 ^ (dstrd + 2) bytes,
713 * but we want it in uint32_t (4-byte) units, so drop the + 2 */
714 pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
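	/*
	 * Added example: CAP.DSTRD = 0 means doorbell registers are 4 bytes
	 * apart (stride of 1 uint32_t); DSTRD = 1 means 8 bytes apart
	 * (stride of 2), and so on.
	 */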
715
716 rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
717 if (rc != 0) {
718 nvme_ctrlr_destruct(&pctrlr->ctrlr);
719 return NULL;
720 }
721
722 pci_id = spdk_pci_device_get_id(pci_dev);
723 pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
724
725 rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
726 if (rc != 0) {
727 nvme_ctrlr_destruct(&pctrlr->ctrlr);
728 return NULL;
729 }
730
731 /* Construct the primary process properties */
732 rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
733 if (rc != 0) {
734 nvme_ctrlr_destruct(&pctrlr->ctrlr);
735 return NULL;
736 }
737
738 if (g_sigset != true) {
739 nvme_pcie_ctrlr_setup_signal();
740 g_sigset = true;
741 }
742
743 return &pctrlr->ctrlr;
744 }
745
746 int
747 nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
748 {
749 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
750 struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
751 union spdk_nvme_aqa_register aqa;
752
753 if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
754 SPDK_ERRLOG("set_asq() failed\n");
755 return -EIO;
756 }
757
758 if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
759 SPDK_ERRLOG("set_acq() failed\n");
760 return -EIO;
761 }
762
763 aqa.raw = 0;
764 /* acqs and asqs are 0-based. */
765 aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
766 aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
767
768 if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
769 SPDK_ERRLOG("set_aqa() failed\n");
770 return -EIO;
771 }
772
773 return 0;
774 }
775
776 int
777 nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
778 {
779 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
780
781 if (ctrlr->adminq) {
782 nvme_pcie_qpair_destroy(ctrlr->adminq);
783 }
784
785 nvme_ctrlr_free_processes(ctrlr);
786
787 nvme_pcie_ctrlr_free_bars(pctrlr);
788 spdk_pci_device_detach(pctrlr->devhandle);
789 spdk_free(pctrlr);
790
791 return 0;
792 }
793
794 static void
795 nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
796 {
797 tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
798 tr->cid = cid;
799 tr->active = false;
800 }
801
802 int
803 nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
804 {
805 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
806
807 pqpair->sq_tail = pqpair->cq_head = 0;
808
809 /*
810 * The first time through the completion queue, HW will set the phase
811 * bit on completions to 1. So set this to 1 here, indicating
812 * we're looking for a 1 to know which entries have completed.
813 * We'll toggle the expected bit each time the completion queue
814 * rolls over.
815 */
816 pqpair->phase = 1;
817
818 memset(pqpair->cmd, 0,
819 pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
820 memset(pqpair->cpl, 0,
821 pqpair->num_entries * sizeof(struct spdk_nvme_cpl));
822
823 return 0;
824 }
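/*
 * Added note on the phase mechanism: the completion queue is zeroed above, so
 * every entry starts with p = 0 and the first pass looks for p = 1. After the
 * queue wraps, stale entries still carry p = 1 while the controller writes new
 * completions with p = 0, so nvme_pcie_qpair_process_completions() toggles the
 * expected phase on every wrap.
 */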
825
826 static int
827 nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair)
828 {
829 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
830 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
831 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
832 struct nvme_tracker *tr;
833 uint16_t i;
834 volatile uint32_t *doorbell_base;
835 uint64_t phys_addr = 0;
836 uint64_t offset;
837 uint16_t num_trackers;
838
839 if (qpair->id == 0) {
840 num_trackers = NVME_ADMIN_TRACKERS;
841 } else {
842 /*
843 * No need to have more trackers than entries in the submit queue.
844 * Note also that for a queue size of N, we can only have (N-1)
845 * commands outstanding, hence the "-1" here.
846 */
847 num_trackers = spdk_min(NVME_IO_TRACKERS, pqpair->num_entries - 1);
848 }
849
850 assert(num_trackers != 0);
851
852 pqpair->sq_in_cmb = false;
853
854 /* cmd and cpl rings must be aligned on 4KB boundaries. */
855 if (ctrlr->opts.use_cmb_sqs) {
856 if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
857 0x1000, &offset) == 0) {
858 pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
859 pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
860 pqpair->sq_in_cmb = true;
861 }
862 }
863 if (pqpair->sq_in_cmb == false) {
864 pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
865 0x1000,
866 &pqpair->cmd_bus_addr);
867 if (pqpair->cmd == NULL) {
868 SPDK_ERRLOG("alloc qpair_cmd failed\n");
869 return -ENOMEM;
870 }
871 }
872
873 pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
874 0x1000,
875 &pqpair->cpl_bus_addr);
876 if (pqpair->cpl == NULL) {
877 SPDK_ERRLOG("alloc qpair_cpl failed\n");
878 return -ENOMEM;
879 }
880
881 doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
882 pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
883 pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
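	/*
	 * Added example: with a doorbell stride of one uint32_t, the admin
	 * queue (id 0) uses doorbell slots 0 and 1, I/O queue 1 uses slots 2
	 * (SQ tail) and 3 (CQ head), queue 2 uses slots 4 and 5, and so on.
	 */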
884
885 /*
886 * Reserve space for all of the trackers in a single allocation.
887 * struct nvme_tracker must be padded so that its size is already a power of 2.
888 * This ensures the PRP list embedded in the nvme_tracker object will not span a
889 * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
890 */
891 pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), &phys_addr);
892 if (pqpair->tr == NULL) {
893 SPDK_ERRLOG("nvme_tr failed\n");
894 return -ENOMEM;
895 }
896
897 TAILQ_INIT(&pqpair->free_tr);
898 TAILQ_INIT(&pqpair->outstanding_tr);
899
900 for (i = 0; i < num_trackers; i++) {
901 tr = &pqpair->tr[i];
902 nvme_qpair_construct_tracker(tr, i, phys_addr);
903 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
904 phys_addr += sizeof(struct nvme_tracker);
905 }
906
907 nvme_pcie_qpair_reset(qpair);
908
909 return 0;
910 }
911
912 static inline void
913 nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
914 {
915 /* dst and src are known to be non-overlapping and 64-byte aligned. */
916 #if defined(__AVX__)
917 __m256i *d256 = (__m256i *)dst;
918 const __m256i *s256 = (const __m256i *)src;
919
920 _mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
921 _mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
922 #elif defined(__SSE2__)
923 __m128i *d128 = (__m128i *)dst;
924 const __m128i *s128 = (const __m128i *)src;
925
926 _mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
927 _mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
928 _mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
929 _mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
930 #else
931 *dst = *src;
932 #endif
933 }
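/*
 * Added note: a struct spdk_nvme_cmd is 64 bytes, so the copy above is two
 * 32-byte AVX stores, four 16-byte SSE2 stores, or a plain struct assignment
 * as the fallback. The aligned-load/store intrinsics rely on the 64-byte
 * alignment noted in the function; the 4 KiB-aligned submission queue
 * allocation guarantees this for the destination entries.
 */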
934
935 /**
936 * Note: the ctrlr_lock must be held when calling this function.
937 */
938 static void
939 nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
940 struct nvme_request *req, struct spdk_nvme_cpl *cpl)
941 {
942 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
943 struct nvme_request *active_req = req;
944 struct spdk_nvme_ctrlr_process *active_proc;
945 bool pending_on_proc = false;
946
947 /*
948 * The admin request is from another process. Move it to the
949 * per-process list for that process to handle it later.
950 */
951 assert(nvme_qpair_is_admin_queue(qpair));
952 assert(active_req->pid != getpid());
953
954 TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) {
955 if (active_proc->pid == active_req->pid) {
956 /* Save the original completion information */
957 memcpy(&active_req->cpl, cpl, sizeof(*cpl));
958 STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
959 pending_on_proc = true;
960
961 break;
962 }
963 }
964
965 if (pending_on_proc == false) {
966 SPDK_ERRLOG("The owning process (pid %d) is not found. Drop the request.\n",
967 active_req->pid);
968
969 nvme_free_request(active_req);
970 }
971 }
972
973 /**
974 * Note: the ctrlr_lock must be held when calling this function.
975 */
976 static void
977 nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
978 {
979 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
980 struct nvme_request *req, *tmp_req;
981 bool proc_found = false;
982 pid_t pid = getpid();
983 struct spdk_nvme_ctrlr_process *proc;
984
985 /*
986 * Check whether there is any pending admin request from
987 * other active processes.
988 */
989 assert(nvme_qpair_is_admin_queue(qpair));
990
991 TAILQ_FOREACH(proc, &ctrlr->active_procs, tailq) {
992 if (proc->pid == pid) {
993 proc_found = true;
994
995 break;
996 }
997 }
998
999 if (proc_found == false) {
1000 SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
1001 assert(proc_found);
1002 }
1003
1004 STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
1005 STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
1006
1007 assert(req->pid == pid);
1008
1009 if (req->cb_fn) {
1010 req->cb_fn(req->cb_arg, &req->cpl);
1011 }
1012
1013 nvme_free_request(req);
1014 }
1015 }
1016
1017 static void
1018 nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1019 {
1020 struct nvme_request *req;
1021 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1022 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1023
1024 tr->submit_tick = spdk_get_ticks();
1025 tr->timed_out = 0;
1026
1027 req = tr->req;
1028 pqpair->tr[tr->cid].active = true;
1029
1030 /* Copy the command from the tracker to the submission queue. */
1031 nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
1032
1033 if (++pqpair->sq_tail == pqpair->num_entries) {
1034 pqpair->sq_tail = 0;
1035 }
1036
1037 spdk_wmb();
1038 g_thread_mmio_ctrlr = pctrlr;
1039 spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
1040 g_thread_mmio_ctrlr = NULL;
1041 }
1042
1043 static void
1044 nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
1045 struct spdk_nvme_cpl *cpl, bool print_on_error)
1046 {
1047 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1048 struct nvme_request *req;
1049 bool retry, error, was_active;
1050 bool req_from_current_proc = true;
1051
1052 req = tr->req;
1053
1054 assert(req != NULL);
1055
1056 error = spdk_nvme_cpl_is_error(cpl);
1057 retry = error && nvme_completion_is_retry(cpl) &&
1058 req->retries < spdk_nvme_retry_count;
1059
1060 if (error && print_on_error) {
1061 nvme_qpair_print_command(qpair, &req->cmd);
1062 nvme_qpair_print_completion(qpair, cpl);
1063 }
1064
1065 was_active = pqpair->tr[cpl->cid].active;
1066 pqpair->tr[cpl->cid].active = false;
1067
1068 assert(cpl->cid == req->cmd.cid);
1069
1070 if (retry) {
1071 req->retries++;
1072 nvme_pcie_qpair_submit_tracker(qpair, tr);
1073 } else {
1074 if (was_active) {
1075 /* Only check admin requests from different processes. */
1076 if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
1077 req_from_current_proc = false;
1078 nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
1079 } else {
1080 if (req->cb_fn) {
1081 req->cb_fn(req->cb_arg, cpl);
1082 }
1083 }
1084 }
1085
1086 if (req_from_current_proc == true) {
1087 nvme_free_request(req);
1088 }
1089
1090 tr->req = NULL;
1091
1092 TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
1093 TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
1094
1095 /*
1096 * If the controller is in the middle of resetting, don't
1097 * try to submit queued requests here - let the reset logic
1098 * handle that instead.
1099 */
1100 if (!STAILQ_EMPTY(&qpair->queued_req) &&
1101 !qpair->ctrlr->is_resetting) {
1102 req = STAILQ_FIRST(&qpair->queued_req);
1103 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1104 nvme_qpair_submit_request(qpair, req);
1105 }
1106 }
1107 }
1108
1109 static void
1110 nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
1111 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
1112 bool print_on_error)
1113 {
1114 struct spdk_nvme_cpl cpl;
1115
1116 memset(&cpl, 0, sizeof(cpl));
1117 cpl.sqid = qpair->id;
1118 cpl.cid = tr->cid;
1119 cpl.status.sct = sct;
1120 cpl.status.sc = sc;
1121 cpl.status.dnr = dnr;
1122 nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
1123 }
1124
1125 static void
1126 nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
1127 {
1128 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1129 struct nvme_tracker *tr, *temp;
1130
1131 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
1132 SPDK_ERRLOG("aborting outstanding command\n");
1133 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1134 SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
1135 }
1136 }
1137
1138 static void
1139 nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
1140 {
1141 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1142 struct nvme_tracker *tr;
1143
1144 tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1145 while (tr != NULL) {
1146 assert(tr->req != NULL);
1147 if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1148 nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
1149 SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
1150 false);
1151 tr = TAILQ_FIRST(&pqpair->outstanding_tr);
1152 } else {
1153 tr = TAILQ_NEXT(tr, tq_list);
1154 }
1155 }
1156 }
1157
1158 static void
1159 nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
1160 {
1161 nvme_pcie_admin_qpair_abort_aers(qpair);
1162 }
1163
1164 static int
1165 nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
1166 {
1167 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1168
1169 if (nvme_qpair_is_admin_queue(qpair)) {
1170 nvme_pcie_admin_qpair_destroy(qpair);
1171 }
1172 if (pqpair->cmd && !pqpair->sq_in_cmb) {
1173 spdk_free(pqpair->cmd);
1174 }
1175 if (pqpair->cpl) {
1176 spdk_free(pqpair->cpl);
1177 }
1178 if (pqpair->tr) {
1179 spdk_free(pqpair->tr);
1180 }
1181
1182 spdk_free(pqpair);
1183
1184 return 0;
1185 }
1186
1187 static void
1188 nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair)
1189 {
1190 /*
1191 * Manually abort each outstanding admin command. Do not retry
1192 * admin commands found here, since they will be left over from
1193 * a controller reset and it's likely the context in which the
1194 * command was issued no longer applies.
1195 */
1196 nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
1197 }
1198
1199 static void
1200 nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair)
1201 {
1202 /* Manually abort each outstanding I/O. */
1203 nvme_pcie_qpair_abort_trackers(qpair, 0);
1204 }
1205
1206 int
1207 nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair)
1208 {
1209 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1210
1211 pqpair->is_enabled = true;
1212 if (nvme_qpair_is_io_queue(qpair)) {
1213 nvme_pcie_io_qpair_enable(qpair);
1214 } else {
1215 nvme_pcie_admin_qpair_enable(qpair);
1216 }
1217
1218 return 0;
1219 }
1220
1221 static void
1222 nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair)
1223 {
1224 nvme_pcie_admin_qpair_abort_aers(qpair);
1225 }
1226
1227 static void
1228 nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair)
1229 {
1230 }
1231
1232 int
1233 nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair)
1234 {
1235 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1236
1237 pqpair->is_enabled = false;
1238 if (nvme_qpair_is_io_queue(qpair)) {
1239 nvme_pcie_io_qpair_disable(qpair);
1240 } else {
1241 nvme_pcie_admin_qpair_disable(qpair);
1242 }
1243
1244 return 0;
1245 }
1246
1247
1248 int
1249 nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair)
1250 {
1251 nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
1252
1253 return 0;
1254 }
1255
1256 static int
1257 nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
1258 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
1259 void *cb_arg)
1260 {
1261 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1262 struct nvme_request *req;
1263 struct spdk_nvme_cmd *cmd;
1264
1265 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1266 if (req == NULL) {
1267 return -ENOMEM;
1268 }
1269
1270 cmd = &req->cmd;
1271 cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
1272
1273 /*
1274 * TODO: create a create io completion queue command data
1275 * structure.
1276 */
1277 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1278 /*
1279 * 0x2 = interrupts enabled
1280 * 0x1 = physically contiguous
1281 */
1282 cmd->cdw11 = 0x1;
1283 cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
1284
1285 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1286 }
1287
1288 static int
1289 nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
1290 struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1291 {
1292 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
1293 struct nvme_request *req;
1294 struct spdk_nvme_cmd *cmd;
1295
1296 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1297 if (req == NULL) {
1298 return -ENOMEM;
1299 }
1300
1301 cmd = &req->cmd;
1302 cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
1303
1304 /*
1305 * TODO: create a create io submission queue command data
1306 * structure.
1307 */
1308 cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
1309 /* 0x1 = physically contiguous */
1310 cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
1311 cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
1312
1313 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1314 }
1315
1316 static int
1317 nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1318 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1319 {
1320 struct nvme_request *req;
1321 struct spdk_nvme_cmd *cmd;
1322
1323 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1324 if (req == NULL) {
1325 return -ENOMEM;
1326 }
1327
1328 cmd = &req->cmd;
1329 cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
1330 cmd->cdw10 = qpair->id;
1331
1332 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1333 }
1334
1335 static int
1336 nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1337 spdk_nvme_cmd_cb cb_fn, void *cb_arg)
1338 {
1339 struct nvme_request *req;
1340 struct spdk_nvme_cmd *cmd;
1341
1342 req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
1343 if (req == NULL) {
1344 return -ENOMEM;
1345 }
1346
1347 cmd = &req->cmd;
1348 cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
1349 cmd->cdw10 = qpair->id;
1350
1351 return nvme_ctrlr_submit_admin_request(ctrlr, req);
1352 }
1353
1354 static int
1355 _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
1356 uint16_t qid)
1357 {
1358 struct nvme_completion_poll_status status;
1359 int rc;
1360
1361 status.done = false;
1362 rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1363 if (rc != 0) {
1364 return rc;
1365 }
1366
1367 while (status.done == false) {
1368 spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1369 }
1370 if (spdk_nvme_cpl_is_error(&status.cpl)) {
1371 SPDK_ERRLOG("nvme_create_io_cq failed!\n");
1372 return -1;
1373 }
1374
1375 status.done = false;
1376 rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1377 if (rc != 0) {
1378 return rc;
1379 }
1380
1381 while (status.done == false) {
1382 spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1383 }
1384 if (spdk_nvme_cpl_is_error(&status.cpl)) {
1385 SPDK_ERRLOG("nvme_create_io_sq failed!\n");
1386 /* Attempt to delete the completion queue */
1387 status.done = false;
1388 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
1389 if (rc != 0) {
1390 return -1;
1391 }
1392 while (status.done == false) {
1393 spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1394 }
1395 return -1;
1396 }
1397
1398 nvme_pcie_qpair_reset(qpair);
1399
1400 return 0;
1401 }
1402
1403 struct spdk_nvme_qpair *
1404 nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
1405 enum spdk_nvme_qprio qprio)
1406 {
1407 struct nvme_pcie_qpair *pqpair;
1408 struct spdk_nvme_qpair *qpair;
1409 int rc;
1410
1411 assert(ctrlr != NULL);
1412
1413 pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL);
1414 if (pqpair == NULL) {
1415 return NULL;
1416 }
1417
1418 pqpair->num_entries = ctrlr->opts.io_queue_size;
1419
1420 qpair = &pqpair->qpair;
1421
1422 rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, ctrlr->opts.io_queue_requests);
1423 if (rc != 0) {
1424 nvme_pcie_qpair_destroy(qpair);
1425 return NULL;
1426 }
1427
1428 rc = nvme_pcie_qpair_construct(qpair);
1429 if (rc != 0) {
1430 nvme_pcie_qpair_destroy(qpair);
1431 return NULL;
1432 }
1433
1434 rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid);
1435
1436 if (rc != 0) {
1437 SPDK_ERRLOG("I/O queue creation failed\n");
1438 nvme_pcie_qpair_destroy(qpair);
1439 return NULL;
1440 }
1441
1442 return qpair;
1443 }
1444
1445 int
1446 nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1447 {
1448 return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
1449 }
1450
1451 int
1452 nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
1453 {
1454 struct nvme_completion_poll_status status;
1455 int rc;
1456
1457 assert(ctrlr != NULL);
1458
1459 if (ctrlr->is_removed) {
1460 goto free;
1461 }
1462
1463 /* Delete the I/O submission queue and then the completion queue */
1464
1465 status.done = false;
1466 rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1467 if (rc != 0) {
1468 return rc;
1469 }
1470 while (status.done == false) {
1471 spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1472 }
1473 if (spdk_nvme_cpl_is_error(&status.cpl)) {
1474 return -1;
1475 }
1476
1477 status.done = false;
1478 rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
1479 if (rc != 0) {
1480 return rc;
1481 }
1482 while (status.done == false) {
1483 spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
1484 }
1485 if (spdk_nvme_cpl_is_error(&status.cpl)) {
1486 return -1;
1487 }
1488
1489 free:
1490 nvme_pcie_qpair_destroy(qpair);
1491 return 0;
1492 }
1493
1494 static void
1495 nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
1496 {
1497 /*
1498 * Bad vtophys translation, so abort this request and return
1499 * immediately.
1500 */
1501 nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
1502 SPDK_NVME_SC_INVALID_FIELD,
1503 1 /* do not retry */, true);
1504 }
1505
1506 /**
1507 * Build PRP list describing physically contiguous payload buffer.
1508 */
1509 static int
1510 nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1511 struct nvme_tracker *tr)
1512 {
1513 uint64_t phys_addr;
1514 void *seg_addr;
1515 uint32_t nseg, cur_nseg, modulo, unaligned;
1516 void *md_payload;
1517 void *payload = req->payload.u.contig + req->payload_offset;
1518
1519 phys_addr = spdk_vtophys(payload);
1520 if (phys_addr == SPDK_VTOPHYS_ERROR) {
1521 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1522 return -1;
1523 }
1524 nseg = req->payload_size >> spdk_u32log2(PAGE_SIZE);
1525 modulo = req->payload_size & (PAGE_SIZE - 1);
1526 unaligned = phys_addr & (PAGE_SIZE - 1);
1527 if (modulo || unaligned) {
1528 nseg += 1 + ((modulo + unaligned - 1) >> spdk_u32log2(PAGE_SIZE));
1529 }
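	/*
	 * Added example: a 16 KiB payload whose physical address starts 512
	 * bytes into a page gives nseg = 4 from the shift, and the modulo /
	 * unaligned correction adds one more, for 5 PRP entries in total -
	 * matching the 5 pages the buffer actually touches.
	 */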
1530
1531 if (req->payload.md) {
1532 md_payload = req->payload.md + req->md_offset;
1533 tr->req->cmd.mptr = spdk_vtophys(md_payload);
1534 if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
1535 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1536 return -1;
1537 }
1538 }
1539
1540 tr->req->cmd.psdt = SPDK_NVME_PSDT_PRP;
1541 tr->req->cmd.dptr.prp.prp1 = phys_addr;
1542 if (nseg == 2) {
1543 seg_addr = payload + PAGE_SIZE - unaligned;
1544 tr->req->cmd.dptr.prp.prp2 = spdk_vtophys(seg_addr);
1545 } else if (nseg > 2) {
1546 cur_nseg = 1;
1547 tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
1548 while (cur_nseg < nseg) {
1549 seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
1550 phys_addr = spdk_vtophys(seg_addr);
1551 if (phys_addr == SPDK_VTOPHYS_ERROR) {
1552 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1553 return -1;
1554 }
1555 tr->u.prp[cur_nseg - 1] = phys_addr;
1556 cur_nseg++;
1557 }
1558 }
1559
1560 return 0;
1561 }
1562
1563 /**
1564 * Build SGL list describing scattered payload buffer.
1565 */
1566 static int
1567 nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1568 struct nvme_tracker *tr)
1569 {
1570 int rc;
1571 void *virt_addr;
1572 uint64_t phys_addr;
1573 uint32_t remaining_transfer_len, length;
1574 struct spdk_nvme_sgl_descriptor *sgl;
1575 uint32_t nseg = 0;
1576
1577 /*
1578 * Build scattered payloads.
1579 */
1580 assert(req->payload_size != 0);
1581 assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL);
1582 assert(req->payload.u.sgl.reset_sgl_fn != NULL);
1583 assert(req->payload.u.sgl.next_sge_fn != NULL);
1584 req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg, req->payload_offset);
1585
1586 sgl = tr->u.sgl;
1587 req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
1588 req->cmd.dptr.sgl1.unkeyed.subtype = 0;
1589
1590 remaining_transfer_len = req->payload_size;
1591
1592 while (remaining_transfer_len > 0) {
1593 if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
1594 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1595 return -1;
1596 }
1597
1598 rc = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg, &virt_addr, &length);
1599 if (rc) {
1600 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1601 return -1;
1602 }
1603
1604 phys_addr = spdk_vtophys(virt_addr);
1605 if (phys_addr == SPDK_VTOPHYS_ERROR) {
1606 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1607 return -1;
1608 }
1609
1610 length = spdk_min(remaining_transfer_len, length);
1611 remaining_transfer_len -= length;
1612
1613 sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1614 sgl->unkeyed.length = length;
1615 sgl->address = phys_addr;
1616 sgl->unkeyed.subtype = 0;
1617
1618 sgl++;
1619 nseg++;
1620 }
1621
1622 if (nseg == 1) {
1623 /*
1624 * The whole transfer can be described by a single SGL descriptor.
1625 * Use the special case described by the spec where SGL1's type is Data Block.
1626 * This means the SGL in the tracker is not used at all, so copy the first (and only)
1627 * SGL element into SGL1.
1628 */
1629 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
1630 req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
1631 req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
1632 } else {
1633 /* For now we can only support 1 SGL segment in the NVMe controller */
1634 req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
1635 req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
1636 req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
1637 }
1638
1639 return 0;
1640 }
1641
1642 /**
1643 * Build PRP list describing scattered payload buffer.
1644 */
1645 static int
1646 nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
1647 struct nvme_tracker *tr)
1648 {
1649 int rc;
1650 void *virt_addr;
1651 uint64_t phys_addr;
1652 uint32_t data_transferred, remaining_transfer_len, length;
1653 uint32_t nseg, cur_nseg, total_nseg, last_nseg, modulo, unaligned;
1654 uint32_t sge_count = 0;
1655 uint64_t prp2 = 0;
1656
1657 /*
1658 * Build scattered payloads.
1659 */
1660 assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL);
1661 assert(req->payload.u.sgl.reset_sgl_fn != NULL);
1662 req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg, req->payload_offset);
1663
1664 remaining_transfer_len = req->payload_size;
1665 total_nseg = 0;
1666 last_nseg = 0;
1667
1668 while (remaining_transfer_len > 0) {
1669 assert(req->payload.u.sgl.next_sge_fn != NULL);
1670 rc = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg, &virt_addr, &length);
1671 if (rc) {
1672 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1673 return -1;
1674 }
1675
1676 phys_addr = spdk_vtophys(virt_addr);
1677 if (phys_addr == SPDK_VTOPHYS_ERROR) {
1678 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1679 return -1;
1680 }
1681
1682 /*
1683 * Any incompatible SGEs should have been handled in the splitting routine,
1684 * but assert here as an additional check.
1685 */
1686 assert((phys_addr & 0x3) == 0); /* Address must be dword aligned. */
1687 /* All SGEs except the last must end on a page boundary. */
1688 assert((length >= remaining_transfer_len) || _is_page_aligned(phys_addr + length));
1689 /* All SGEs except the first must start on a page boundary. */
1690 assert((sge_count == 0) || _is_page_aligned(phys_addr));
1691
1692 data_transferred = spdk_min(remaining_transfer_len, length);
1693
1694 nseg = data_transferred >> spdk_u32log2(PAGE_SIZE);
1695 modulo = data_transferred & (PAGE_SIZE - 1);
1696 unaligned = phys_addr & (PAGE_SIZE - 1);
1697 if (modulo || unaligned) {
1698 nseg += 1 + ((modulo + unaligned - 1) >> spdk_u32log2(PAGE_SIZE));
1699 }
1700
1701 if (total_nseg == 0) {
1702 req->cmd.psdt = SPDK_NVME_PSDT_PRP;
1703 req->cmd.dptr.prp.prp1 = phys_addr;
1704 phys_addr -= unaligned;
1705 }
1706
1707 total_nseg += nseg;
1708 sge_count++;
1709 remaining_transfer_len -= data_transferred;
1710
1711 if (total_nseg == 2) {
1712 if (sge_count == 1)
1713 tr->req->cmd.dptr.prp.prp2 = phys_addr + PAGE_SIZE;
1714 else if (sge_count == 2)
1715 tr->req->cmd.dptr.prp.prp2 = phys_addr;
1716 /* save prp2 value */
1717 prp2 = tr->req->cmd.dptr.prp.prp2;
1718 } else if (total_nseg > 2) {
1719 if (sge_count == 1)
1720 cur_nseg = 1;
1721 else
1722 cur_nseg = 0;
1723
1724 tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
1725 while (cur_nseg < nseg) {
1726 if (prp2) {
1727 tr->u.prp[0] = prp2;
1728 tr->u.prp[last_nseg + 1] = phys_addr + cur_nseg * PAGE_SIZE;
1729 } else
1730 tr->u.prp[last_nseg] = phys_addr + cur_nseg * PAGE_SIZE;
1731
1732 last_nseg++;
1733 cur_nseg++;
1734 }
1735 }
1736 }
1737
1738 return 0;
1739 }
1740
1741 static inline bool
1742 nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
1743 {
1744 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1745
1746 if (!pqpair->is_enabled &&
1747 !qpair->ctrlr->is_resetting) {
1748 nvme_qpair_enable(qpair);
1749 }
1750 return pqpair->is_enabled;
1751 }
1752
1753 int
1754 nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
1755 {
1756 struct nvme_tracker *tr;
1757 int rc = 0;
1758 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1759 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1760
1761 nvme_pcie_qpair_check_enabled(qpair);
1762
1763 if (nvme_qpair_is_admin_queue(qpair)) {
1764 nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
1765 }
1766
1767 tr = TAILQ_FIRST(&pqpair->free_tr);
1768
1769 if (tr == NULL || !pqpair->is_enabled) {
1770 /*
1771 * No tracker is available, or the qpair is disabled due to
1772 * an in-progress controller-level reset.
1773 *
1774 * Put the request on the qpair's request queue to be
1775 * processed when a tracker frees up via a command
1776 * completion or when the controller reset is
1777 * completed.
1778 */
1779 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1780 goto exit;
1781 }
1782
1783 TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
1784 TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
1785 tr->req = req;
1786 req->cmd.cid = tr->cid;
1787
1788 if (req->payload_size == 0) {
1789 /* Null payload - leave PRP fields zeroed */
1790 rc = 0;
1791 } else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
1792 rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
1793 } else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
1794 if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
1795 rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
1796 } else {
1797 rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
1798 }
1799 } else {
1800 assert(0);
1801 nvme_pcie_fail_request_bad_vtophys(qpair, tr);
1802 rc = -EINVAL;
1803 }
1804
1805 if (rc < 0) {
1806 goto exit;
1807 }
1808
1809 nvme_pcie_qpair_submit_tracker(qpair, tr);
1810
1811 exit:
1812 if (nvme_qpair_is_admin_queue(qpair)) {
1813 nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
1814 }
1815
1816 return rc;
1817 }
1818
1819 static void
1820 nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
1821 {
1822 uint64_t t02;
1823 struct nvme_tracker *tr, *tmp;
1824 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1825 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1826
1827 /* We don't want to expose the admin queue to the user,
1828 * so when we're timing out admin commands, set the
1829 * qpair to NULL.
1830 */
1831 if (qpair == ctrlr->adminq) {
1832 qpair = NULL;
1833 }
1834
1835 t02 = spdk_get_ticks();
1836 TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
1837 if (tr->timed_out) {
1838 continue;
1839 }
1840
1841 if (qpair == NULL &&
1842 tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
1843 continue;
1844 }
1845
1846 if (tr->submit_tick + ctrlr->timeout_ticks > t02) {
1847 /* The trackers are in order, so as soon as one has not timed out,
1848 * stop iterating.
1849 */
1850 break;
1851 }
1852
1853 tr->timed_out = 1;
1854 ctrlr->timeout_cb_fn(ctrlr->timeout_cb_arg, ctrlr, qpair, tr->cid);
1855 }
1856 }
1857
1858 int32_t
1859 nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
1860 {
1861 struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
1862 struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
1863 struct nvme_tracker *tr;
1864 struct spdk_nvme_cpl *cpl;
1865 uint32_t num_completions = 0;
1866 struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
1867
1868 if (!nvme_pcie_qpair_check_enabled(qpair)) {
1869 /*
1870 * qpair is not enabled, likely because a controller reset
1871 * is in progress. Ignore the interrupt - any I/O that was
1872 * associated with this interrupt will get retried when the
1873 * reset is complete.
1874 */
1875 return 0;
1876 }
1877
1878 if (nvme_qpair_is_admin_queue(qpair)) {
1879 nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
1880 }
1881
1882 if (max_completions == 0 || (max_completions > (pqpair->num_entries - 1U))) {
1883
1884 /*
1885 * max_completions == 0 means unlimited, but complete at most one
1886 * queue depth batch of I/O at a time so that the completion
1887 * queue doorbells don't wrap around.
1888 */
1889 max_completions = pqpair->num_entries - 1;
1890 }
1891
1892 while (1) {
1893 cpl = &pqpair->cpl[pqpair->cq_head];
1894
1895 if (cpl->status.p != pqpair->phase)
1896 break;
1897
1898 tr = &pqpair->tr[cpl->cid];
1899
1900 if (tr->active) {
1901 nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
1902 } else {
1903 SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
1904 nvme_qpair_print_completion(qpair, cpl);
1905 assert(0);
1906 }
1907
1908 if (++pqpair->cq_head == pqpair->num_entries) {
1909 pqpair->cq_head = 0;
1910 pqpair->phase = !pqpair->phase;
1911 }
1912
1913 if (++num_completions == max_completions) {
1914 break;
1915 }
1916 }
1917
1918 if (num_completions > 0) {
1919 g_thread_mmio_ctrlr = pctrlr;
1920 spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
1921 g_thread_mmio_ctrlr = NULL;
1922 }
1923
1924 if (qpair->ctrlr->state == NVME_CTRLR_STATE_READY) {
1925 if (qpair->ctrlr->timeout_cb_fn) {
1926 /*
1927 * User registered for timeout callback
1928 */
1929 nvme_pcie_qpair_check_timeout(qpair);
1930 }
1931 }
1932
1933 /* Before returning, complete any pending admin request. */
1934 if (nvme_qpair_is_admin_queue(qpair)) {
1935 nvme_pcie_qpair_complete_pending_admin_request(qpair);
1936
1937 nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
1938 }
1939
1940 return num_completions;
1941 }