ceph/src/spdk/examples/nvme/perf/perf.c
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 * * Neither the name of Intel Corporation nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 #include "spdk/stdinc.h"
37
38 #include "spdk/env.h"
39 #include "spdk/fd.h"
40 #include "spdk/nvme.h"
41 #include "spdk/vmd.h"
42 #include "spdk/queue.h"
43 #include "spdk/string.h"
44 #include "spdk/nvme_intel.h"
45 #include "spdk/histogram_data.h"
46 #include "spdk/endian.h"
47 #include "spdk/dif.h"
48 #include "spdk/util.h"
49 #include "spdk/log.h"
50 #include "spdk/likely.h"
51
52 #ifdef SPDK_CONFIG_URING
53 #include <liburing.h>
54 #endif
55
56 #if HAVE_LIBAIO
57 #include <libaio.h>
58 #endif
59
60 struct ctrlr_entry {
61 struct spdk_nvme_ctrlr *ctrlr;
62 enum spdk_nvme_transport_type trtype;
63 struct spdk_nvme_intel_rw_latency_page *latency_page;
64
65 struct spdk_nvme_qpair **unused_qpairs;
66
67 struct ctrlr_entry *next;
68 char name[1024];
69 };
70
71 enum entry_type {
72 ENTRY_TYPE_NVME_NS,
73 ENTRY_TYPE_AIO_FILE,
74 ENTRY_TYPE_URING_FILE,
75 };
76
77 struct ns_fn_table;
78
79 struct ns_entry {
80 enum entry_type type;
81 const struct ns_fn_table *fn_table;
82
83 union {
84 struct {
85 struct spdk_nvme_ctrlr *ctrlr;
86 struct spdk_nvme_ns *ns;
87 } nvme;
88 #ifdef SPDK_CONFIG_URING
89 struct {
90 int fd;
91 } uring;
92 #endif
93 #if HAVE_LIBAIO
94 struct {
95 int fd;
96 } aio;
97 #endif
98 } u;
99
100 struct ns_entry *next;
101 uint32_t io_size_blocks;
102 uint32_t num_io_requests;
103 uint64_t size_in_ios;
104 uint32_t block_size;
105 uint32_t md_size;
106 bool md_interleave;
107 bool pi_loc;
108 enum spdk_nvme_pi_type pi_type;
109 uint32_t io_flags;
110 char name[1024];
111 };
112
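/*
 * Percentile cutoffs used when summarizing the software latency histogram
 * (-L); the list is terminated by -1.
 */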
113 static const double g_latency_cutoffs[] = {
114 0.01,
115 0.10,
116 0.25,
117 0.50,
118 0.75,
119 0.90,
120 0.95,
121 0.98,
122 0.99,
123 0.995,
124 0.999,
125 0.9999,
126 0.99999,
127 0.999999,
128 0.9999999,
129 -1,
130 };
131
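/*
 * Per-namespace, per-worker state: completion counters, latency bookkeeping
 * in TSC ticks, and the backend-specific queue/context used for the run.
 */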
132 struct ns_worker_ctx {
133 struct ns_entry *entry;
134 uint64_t io_completed;
135 uint64_t last_io_completed;
136 uint64_t total_tsc;
137 uint64_t min_tsc;
138 uint64_t max_tsc;
139 uint64_t current_queue_depth;
140 uint64_t offset_in_ios;
141 bool is_draining;
142
143 union {
144 struct {
145 int num_active_qpairs;
146 int num_all_qpairs;
147 struct spdk_nvme_qpair **qpair;
148 struct spdk_nvme_poll_group *group;
149 int last_qpair;
150 } nvme;
151
152 #ifdef SPDK_CONFIG_URING
153 struct {
154 struct io_uring ring;
155 uint64_t io_inflight;
156 uint64_t io_pending;
157 struct io_uring_cqe **cqes;
158
159 } uring;
160 #endif
161 #if HAVE_LIBAIO
162 struct {
163 struct io_event *events;
164 io_context_t ctx;
165 } aio;
166 #endif
167 } u;
168
169 struct ns_worker_ctx *next;
170
171 struct spdk_histogram_data *histogram;
172 };
173
174 struct perf_task {
175 struct ns_worker_ctx *ns_ctx;
176 struct iovec iov;
177 struct iovec md_iov;
178 uint64_t submit_tsc;
179 bool is_read;
180 struct spdk_dif_ctx dif_ctx;
181 #if HAVE_LIBAIO
182 struct iocb iocb;
183 #endif
184 };
185
186 struct worker_thread {
187 struct ns_worker_ctx *ns_ctx;
188 struct worker_thread *next;
189 unsigned lcore;
190 };
191
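/*
 * Backend dispatch table: each target type (NVMe namespace, libaio file,
 * io_uring file) supplies its own payload setup, submission, completion
 * polling, verification, and per-worker init/cleanup routines.
 */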
192 struct ns_fn_table {
193 void (*setup_payload)(struct perf_task *task, uint8_t pattern);
194
195 int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
196 struct ns_entry *entry, uint64_t offset_in_ios);
197
198 void (*check_io)(struct ns_worker_ctx *ns_ctx);
199
200 void (*verify_io)(struct perf_task *task, struct ns_entry *entry);
201
202 int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);
203
204 void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);
205 };
206
207 static int g_outstanding_commands;
208
209 static bool g_latency_ssd_tracking_enable;
210 static int g_latency_sw_tracking_level;
211
212 static bool g_vmd;
213 static const char *g_workload_type;
214 static struct ctrlr_entry *g_controllers;
215 static struct ns_entry *g_namespaces;
216 static int g_num_namespaces;
217 static struct worker_thread *g_workers;
218 static int g_num_workers;
219 static uint32_t g_master_core;
220
221 static uint64_t g_tsc_rate;
222
223 static uint32_t g_io_align = 0x200;
224 static uint32_t g_io_size_bytes;
225 static uint32_t g_max_io_md_size;
226 static uint32_t g_max_io_size_blocks;
227 static uint32_t g_metacfg_pract_flag;
228 static uint32_t g_metacfg_prchk_flags;
229 static int g_rw_percentage = -1;
230 static int g_is_random;
231 static int g_queue_depth;
232 static int g_nr_io_queues_per_ns = 1;
233 static int g_nr_unused_io_queues;
234 static int g_time_in_sec;
235 static uint32_t g_max_completions;
236 static int g_dpdk_mem;
237 static int g_shm_id = -1;
238 static uint32_t g_disable_sq_cmb;
239 static bool g_use_uring;
240 static bool g_no_pci;
241 static bool g_warn;
242 static bool g_header_digest;
243 static bool g_data_digest;
244 static bool g_no_shn_notification;
245 static bool g_mix_specified;
246 /* Default to 10 seconds for the keep alive value. This value is arbitrary. */
247 static uint32_t g_keep_alive_timeout_in_ms = 10000;
248
249 static const char *g_core_mask;
250
251 struct trid_entry {
252 struct spdk_nvme_transport_id trid;
253 uint16_t nsid;
254 TAILQ_ENTRY(trid_entry) tailq;
255 };
256
257 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);
258
259 static int g_file_optind; /* Index of first filename in argv */
260
261 static inline void
262 task_complete(struct perf_task *task);
263
264 #ifdef SPDK_CONFIG_URING
265
266 static void
267 uring_setup_payload(struct perf_task *task, uint8_t pattern)
268 {
269 task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
270 task->iov.iov_len = g_io_size_bytes;
271 if (task->iov.iov_base == NULL) {
272 fprintf(stderr, "spdk_dma_zmalloc() for task->iov.iov_base failed\n");
273 exit(1);
274 }
275 memset(task->iov.iov_base, pattern, task->iov.iov_len);
276 }
277
278 static int
279 uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
280 struct ns_entry *entry, uint64_t offset_in_ios)
281 {
282 struct io_uring_sqe *sqe;
283
284 sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring);
285 if (!sqe) {
286 fprintf(stderr, "Cannot get sqe\n");
287 return -1;
288 }
289
290 if (task->is_read) {
291 io_uring_prep_readv(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len);
292 } else {
293 io_uring_prep_writev(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len);
294 }
295
296 io_uring_sqe_set_data(sqe, task);
297 ns_ctx->u.uring.io_pending++;
298
299 return 0;
300 }
301
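/*
 * Flush any queued SQEs with io_uring_submit(), then reap completed CQEs
 * in a batch and complete the corresponding tasks.
 */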
302 static void
303 uring_check_io(struct ns_worker_ctx *ns_ctx)
304 {
305 int i, count, to_complete, to_submit, ret = 0;
306 struct perf_task *task;
307
308 to_submit = ns_ctx->u.uring.io_pending;
309
310 if (to_submit > 0) {
311 /* If there are I/Os to submit, use io_uring_submit() here.
312 * It will call io_uring_enter() internally as appropriate. */
313 ret = io_uring_submit(&ns_ctx->u.uring.ring);
314 if (ret < 0) {
315 return;
316 }
317 ns_ctx->u.uring.io_pending = 0;
318 ns_ctx->u.uring.io_inflight += to_submit;
319 }
320
321 to_complete = ns_ctx->u.uring.io_inflight;
322 if (to_complete > 0) {
323 count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete);
324 ns_ctx->u.uring.io_inflight -= count;
325 for (i = 0; i < count; i++) {
326 assert(ns_ctx->u.uring.cqes[i] != NULL);
327 task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data;
328 if (ns_ctx->u.uring.cqes[i]->res != (int)task->iov.iov_len) {
329 fprintf(stderr, "cqe[i]->status=%d\n", ns_ctx->u.uring.cqes[i]->res);
330 exit(0);
331 }
332 io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]);
333 task_complete(task);
334 }
335 }
336 }
337
338 static void
339 uring_verify_io(struct perf_task *task, struct ns_entry *entry)
340 {
341 }
342
343 static int
344 uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
345 {
346 if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) {
347 SPDK_ERRLOG("uring I/O context setup failure\n");
348 return -1;
349 }
350
351 ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *));
352 if (!ns_ctx->u.uring.cqes) {
353 io_uring_queue_exit(&ns_ctx->u.uring.ring);
354 return -1;
355 }
356
357 return 0;
358 }
359
360 static void
361 uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
362 {
363 io_uring_queue_exit(&ns_ctx->u.uring.ring);
364 free(ns_ctx->u.uring.cqes);
365 }
366
367 static const struct ns_fn_table uring_fn_table = {
368 .setup_payload = uring_setup_payload,
369 .submit_io = uring_submit_io,
370 .check_io = uring_check_io,
371 .verify_io = uring_verify_io,
372 .init_ns_worker_ctx = uring_init_ns_worker_ctx,
373 .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx,
374 };
375
376 #endif
377
378 #ifdef HAVE_LIBAIO
379 static void
380 aio_setup_payload(struct perf_task *task, uint8_t pattern)
381 {
382 task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
383 task->iov.iov_len = g_io_size_bytes;
384 if (task->iov.iov_base == NULL) {
385 fprintf(stderr, "spdk_dma_zmalloc() for task->buf failed\n");
386 exit(1);
387 }
388 memset(task->iov.iov_base, pattern, task->iov.iov_len);
389 }
390
391 static int
392 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd,
393 struct iovec *iov, uint64_t offset, void *cb_ctx)
394 {
395 iocb->aio_fildes = fd;
396 iocb->aio_reqprio = 0;
397 iocb->aio_lio_opcode = cmd;
398 iocb->u.c.buf = iov->iov_base;
399 iocb->u.c.nbytes = iov->iov_len;
400 iocb->u.c.offset = offset * iov->iov_len;
401 iocb->data = cb_ctx;
402
403 if (io_submit(aio_ctx, 1, &iocb) < 0) {
404 printf("io_submit");
405 return -1;
406 }
407
408 return 0;
409 }
410
411 static int
412 aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
413 struct ns_entry *entry, uint64_t offset_in_ios)
414 {
415 if (task->is_read) {
416 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD,
417 &task->iov, offset_in_ios, task);
418 } else {
419 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE,
420 &task->iov, offset_in_ios, task);
421 }
422 }
423
424 static void
425 aio_check_io(struct ns_worker_ctx *ns_ctx)
426 {
427 int count, i;
428 struct timespec timeout;
429
430 timeout.tv_sec = 0;
431 timeout.tv_nsec = 0;
432
433 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout);
434 if (count < 0) {
435 fprintf(stderr, "io_getevents error\n");
436 exit(1);
437 }
438
439 for (i = 0; i < count; i++) {
440 task_complete(ns_ctx->u.aio.events[i].data);
441 }
442 }
443
444 static void
445 aio_verify_io(struct perf_task *task, struct ns_entry *entry)
446 {
447 }
448
449 static int
450 aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
451 {
452 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event));
453 if (!ns_ctx->u.aio.events) {
454 return -1;
455 }
456 ns_ctx->u.aio.ctx = 0;
457 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) {
458 free(ns_ctx->u.aio.events);
459 perror("io_setup");
460 return -1;
461 }
462 return 0;
463 }
464
465 static void
466 aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
467 {
468 io_destroy(ns_ctx->u.aio.ctx);
469 free(ns_ctx->u.aio.events);
470 }
471
472 static const struct ns_fn_table aio_fn_table = {
473 .setup_payload = aio_setup_payload,
474 .submit_io = aio_submit_io,
475 .check_io = aio_check_io,
476 .verify_io = aio_verify_io,
477 .init_ns_worker_ctx = aio_init_ns_worker_ctx,
478 .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx,
479 };
480
481 #endif /* HAVE_LIBAIO */
482
483 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING)
484
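/*
 * Register a kernel block device or file for the libaio/io_uring paths.
 * The file is opened with O_DIRECT; its size and block length are used to
 * derive the I/O alignment and the size_in_ios for the run.
 */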
485 static int
486 register_file(const char *path)
487 {
488 struct ns_entry *entry;
489
490 int flags, fd;
491 uint64_t size;
492 uint32_t blklen;
493
494 if (g_rw_percentage == 100) {
495 flags = O_RDONLY;
496 } else if (g_rw_percentage == 0) {
497 flags = O_WRONLY;
498 } else {
499 flags = O_RDWR;
500 }
501
502 flags |= O_DIRECT;
503
504 fd = open(path, flags);
505 if (fd < 0) {
506 fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno));
507 return -1;
508 }
509
510 size = spdk_fd_get_size(fd);
511 if (size == 0) {
512 fprintf(stderr, "Could not determine size of device %s\n", path);
513 close(fd);
514 return -1;
515 }
516
517 blklen = spdk_fd_get_blocklen(fd);
518 if (blklen == 0) {
519 fprintf(stderr, "Could not determine block size of device %s\n", path);
520 close(fd);
521 return -1;
522 }
523
524 /*
525 * TODO: This should really calculate the LCM of the current g_io_align and blklen.
526 * For now, it's fairly safe to just assume all block sizes are powers of 2.
527 */
528 if (g_io_align < blklen) {
529 g_io_align = blklen;
530 }
531
532 entry = malloc(sizeof(struct ns_entry));
533 if (entry == NULL) {
534 close(fd);
535 perror("ns_entry malloc");
536 return -1;
537 }
538
539 if (g_use_uring) {
540 #ifdef SPDK_CONFIG_URING
541 entry->type = ENTRY_TYPE_URING_FILE;
542 entry->fn_table = &uring_fn_table;
543 entry->u.uring.fd = fd;
544 #endif
545 } else {
546 #if HAVE_LIBAIO
547 entry->type = ENTRY_TYPE_AIO_FILE;
548 entry->fn_table = &aio_fn_table;
549 entry->u.aio.fd = fd;
550 #endif
551 }
552 entry->size_in_ios = size / g_io_size_bytes;
553 entry->io_size_blocks = g_io_size_bytes / blklen;
554
555 snprintf(entry->name, sizeof(entry->name), "%s", path);
556
557 g_num_namespaces++;
558 entry->next = g_namespaces;
559 g_namespaces = entry;
560
561 return 0;
562 }
563
564 static int
565 register_files(int argc, char **argv)
566 {
567 int i;
568
569 /* Treat everything after the options as files for AIO/URING */
570 for (i = g_file_optind; i < argc; i++) {
571 if (register_file(argv[i]) != 0) {
572 return 1;
573 }
574 }
575
576 return 0;
577 }
578 #endif
579
580 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl);
581
582 static void
583 nvme_setup_payload(struct perf_task *task, uint8_t pattern)
584 {
585 uint32_t max_io_size_bytes, max_io_md_size;
586
587 /* Maximum extended LBA format size across all active namespaces;
588 * it equals g_io_size_bytes for namespaces without metadata.
589 */
590 max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks;
591 task->iov.iov_base = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL);
592 task->iov.iov_len = max_io_size_bytes;
593 if (task->iov.iov_base == NULL) {
594 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
595 exit(1);
596 }
597 memset(task->iov.iov_base, pattern, task->iov.iov_len);
598
599 max_io_md_size = g_max_io_md_size * g_max_io_size_blocks;
600 if (max_io_md_size != 0) {
601 task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL);
602 task->md_iov.iov_len = max_io_md_size;
603 if (task->md_iov.iov_base == NULL) {
604 fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n");
605 spdk_dma_free(task->iov.iov_base);
606 exit(1);
607 }
608 }
609 }
610
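/*
 * Submit one read or write to the namespace, selecting a qpair in
 * round-robin order. When end-to-end data protection is enabled without
 * PRACT, a DIF (interleaved) or DIX (separate metadata) context is set up
 * and protection information is generated for writes.
 */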
611 static int
612 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
613 struct ns_entry *entry, uint64_t offset_in_ios)
614 {
615 uint64_t lba;
616 int rc;
617 int qp_num;
618
619 enum dif_mode {
620 DIF_MODE_NONE = 0,
621 DIF_MODE_DIF = 1,
622 DIF_MODE_DIX = 2,
623 } mode = DIF_MODE_NONE;
624
625 lba = offset_in_ios * entry->io_size_blocks;
626
627 if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) {
628 if (entry->md_interleave) {
629 mode = DIF_MODE_DIF;
630 } else {
631 mode = DIF_MODE_DIX;
632 }
633 }
634
635 qp_num = ns_ctx->u.nvme.last_qpair;
636 ns_ctx->u.nvme.last_qpair++;
637 if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_active_qpairs) {
638 ns_ctx->u.nvme.last_qpair = 0;
639 }
640
641 if (mode != DIF_MODE_NONE) {
642 rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size,
643 entry->md_interleave, entry->pi_loc,
644 (enum spdk_dif_type)entry->pi_type, entry->io_flags,
645 lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0);
646 if (rc != 0) {
647 fprintf(stderr, "Initialization of DIF context failed\n");
648 exit(1);
649 }
650 }
651
652 if (task->is_read) {
653 return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num],
654 task->iov.iov_base, task->md_iov.iov_base,
655 lba,
656 entry->io_size_blocks, io_complete,
657 task, entry->io_flags,
658 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag);
659 } else {
660 switch (mode) {
661 case DIF_MODE_DIF:
662 rc = spdk_dif_generate(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx);
663 if (rc != 0) {
664 fprintf(stderr, "Generation of DIF failed\n");
665 return rc;
666 }
667 break;
668 case DIF_MODE_DIX:
669 rc = spdk_dix_generate(&task->iov, 1, &task->md_iov, entry->io_size_blocks,
670 &task->dif_ctx);
671 if (rc != 0) {
672 fprintf(stderr, "Generation of DIX failed\n");
673 return rc;
674 }
675 break;
676 default:
677 break;
678 }
679
680 return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num],
681 task->iov.iov_base, task->md_iov.iov_base,
682 lba,
683 entry->io_size_blocks, io_complete,
684 task, entry->io_flags,
685 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag);
686 }
687 }
688
689 static void
690 perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx)
691 {
692
693 }
694
695 static void
696 nvme_check_io(struct ns_worker_ctx *ns_ctx)
697 {
698 int64_t rc;
699
700 rc = spdk_nvme_poll_group_process_completions(ns_ctx->u.nvme.group, 0, perf_disconnect_cb);
701 if (rc < 0) {
702 fprintf(stderr, "NVMe io qpair process completion error\n");
703 exit(1);
704 }
705 }
706
707 static void
708 nvme_verify_io(struct perf_task *task, struct ns_entry *entry)
709 {
710 struct spdk_dif_error err_blk = {};
711 int rc;
712
713 if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) {
714 return;
715 }
716
717 if (entry->md_interleave) {
718 rc = spdk_dif_verify(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx,
719 &err_blk);
720 if (rc != 0) {
721 fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
722 err_blk.err_type, err_blk.err_offset);
723 }
724 } else {
725 rc = spdk_dix_verify(&task->iov, 1, &task->md_iov, entry->io_size_blocks,
726 &task->dif_ctx, &err_blk);
727 if (rc != 0) {
728 fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
729 err_blk.err_type, err_blk.err_offset);
730 }
731 }
732 }
733
734 /*
735 * TODO: If a controller has multiple namespaces, they could all use the same queue.
736 * For now, give each namespace/thread combination its own queue.
737 */
738 static int
739 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
740 {
741 struct spdk_nvme_io_qpair_opts opts;
742 struct ns_entry *entry = ns_ctx->entry;
743 struct spdk_nvme_poll_group *group;
744 struct spdk_nvme_qpair *qpair;
745 int i;
746
747 ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns;
748 ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues;
749 ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_all_qpairs, sizeof(struct spdk_nvme_qpair *));
750 if (!ns_ctx->u.nvme.qpair) {
751 return -1;
752 }
753
754 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts));
755 if (opts.io_queue_requests < entry->num_io_requests) {
756 opts.io_queue_requests = entry->num_io_requests;
757 }
758 opts.delay_cmd_submit = true;
759 opts.create_only = true;
760
761 ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(NULL);
762 if (ns_ctx->u.nvme.group == NULL) {
763 goto poll_group_failed;
764 }
765
766 group = ns_ctx->u.nvme.group;
767 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) {
768 ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts,
769 sizeof(opts));
770 qpair = ns_ctx->u.nvme.qpair[i];
771 if (!qpair) {
772 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
773 goto qpair_failed;
774 }
775
776 if (spdk_nvme_poll_group_add(group, qpair)) {
777 printf("ERROR: unable to add I/O qpair to poll group.\n");
778 spdk_nvme_ctrlr_free_io_qpair(qpair);
779 goto qpair_failed;
780 }
781
782 if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) {
783 printf("ERROR: unable to connect I/O qpair.\n");
784 spdk_nvme_poll_group_remove(group, qpair);
785 spdk_nvme_ctrlr_free_io_qpair(qpair);
786 goto qpair_failed;
787 }
788 }
789
790 return 0;
791
792 qpair_failed:
793 for (; i > 0; --i) {
794 spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i - 1]);
795 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]);
796 }
797
798 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group);
799 poll_group_failed:
800 free(ns_ctx->u.nvme.qpair);
801 return -1;
802 }
803
804 static void
805 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
806 {
807 int i;
808
809 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) {
810 spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i]);
811 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]);
812 }
813
814 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group);
815 free(ns_ctx->u.nvme.qpair);
816 }
817
818 static const struct ns_fn_table nvme_fn_table = {
819 .setup_payload = nvme_setup_payload,
820 .submit_io = nvme_submit_io,
821 .check_io = nvme_check_io,
822 .verify_io = nvme_verify_io,
823 .init_ns_worker_ctx = nvme_init_ns_worker_ctx,
824 .cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx,
825 };
826
827 static int
828 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr)
829 {
830 const struct spdk_nvme_transport_id *trid;
831 int res = 0;
832
833 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
834
835 switch (trid->trtype) {
836 case SPDK_NVME_TRANSPORT_PCIE:
837 res = snprintf(name, length, "PCIE (%s)", trid->traddr);
838 break;
839 case SPDK_NVME_TRANSPORT_RDMA:
840 res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
841 break;
842 case SPDK_NVME_TRANSPORT_TCP:
843 res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
844 break;
845
846 default:
847 fprintf(stderr, "Unknown transport type %d\n", trid->trtype);
848 break;
849 }
850 return res;
851 }
852
853 static void
854 build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
855 {
856 int res = 0;
857
858 res = build_nvme_name(name, length, ctrlr);
859 if (res > 0) {
860 snprintf(name + res, length - res, " NSID %u", nsid);
861 }
862
863 }
864
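/*
 * Validate a namespace against the requested I/O size and record its
 * geometry (block size, metadata size, PI settings) in a new ns_entry.
 */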
865 static void
866 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
867 {
868 struct ns_entry *entry;
869 const struct spdk_nvme_ctrlr_data *cdata;
870 uint32_t max_xfer_size, entries, sector_size;
871 uint64_t ns_size;
872 struct spdk_nvme_io_qpair_opts opts;
873
874 cdata = spdk_nvme_ctrlr_get_data(ctrlr);
875
876 if (!spdk_nvme_ns_is_active(ns)) {
877 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
878 cdata->mn, cdata->sn,
879 spdk_nvme_ns_get_id(ns));
880 g_warn = true;
881 return;
882 }
883
884 ns_size = spdk_nvme_ns_get_size(ns);
885 sector_size = spdk_nvme_ns_get_sector_size(ns);
886
887 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) {
888 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
889 "ns size %" PRIu64 " / block size %u for I/O size %u\n",
890 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
891 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
892 g_warn = true;
893 return;
894 }
895
896 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
897 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
898 /* The NVMe driver may add additional entries based on the
899 * stripe size and maximum transfer size; we assume one more
900 * entry is used for the stripe.
901 */
902 entries = (g_io_size_bytes - 1) / max_xfer_size + 2;
903 if ((g_queue_depth * entries) > opts.io_queue_size) {
904 printf("controller IO queue size %u less than required\n",
905 opts.io_queue_size);
906 printf("Consider using lower queue depth or small IO size because "
907 "IO requests may be queued at the NVMe driver.\n");
908 }
909 /* For requests which have child requests, the parent request
910 * itself also occupies one entry.
911 */
912 entries += 1;
913
914 entry = calloc(1, sizeof(struct ns_entry));
915 if (entry == NULL) {
916 perror("ns_entry malloc");
917 exit(1);
918 }
919
920 entry->type = ENTRY_TYPE_NVME_NS;
921 entry->fn_table = &nvme_fn_table;
922 entry->u.nvme.ctrlr = ctrlr;
923 entry->u.nvme.ns = ns;
924 entry->num_io_requests = g_queue_depth * entries;
925
926 entry->size_in_ios = ns_size / g_io_size_bytes;
927 entry->io_size_blocks = g_io_size_bytes / sector_size;
928
929 entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns);
930 entry->md_size = spdk_nvme_ns_get_md_size(ns);
931 entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns);
932 entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start;
933 entry->pi_type = spdk_nvme_ns_get_pi_type(ns);
934
935 if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
936 entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags;
937 }
938
939 /* If the metadata size is 8 bytes, PI is stripped (read) or inserted (write),
940 * so subtract the metadata size from the block size. (If the metadata size is
941 * greater than 8 bytes, PI is passed through (read) or replaced (write), so
942 * the block size does not need to change.)
943 */
944 if ((entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) && (entry->md_size == 8)) {
945 entry->block_size = spdk_nvme_ns_get_sector_size(ns);
946 }
947
948 if (g_max_io_md_size < entry->md_size) {
949 g_max_io_md_size = entry->md_size;
950 }
951
952 if (g_max_io_size_blocks < entry->io_size_blocks) {
953 g_max_io_size_blocks = entry->io_size_blocks;
954 }
955
956 build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns));
957
958 g_num_namespaces++;
959 entry->next = g_namespaces;
960 g_namespaces = entry;
961 }
962
963 static void
964 unregister_namespaces(void)
965 {
966 struct ns_entry *entry = g_namespaces;
967
968 while (entry) {
969 struct ns_entry *next = entry->next;
970 free(entry);
971 entry = next;
972 }
973 }
974
975 static void
976 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
977 {
978 if (spdk_nvme_cpl_is_error(cpl)) {
979 printf("enable_latency_tracking_complete failed\n");
980 }
981 g_outstanding_commands--;
982 }
983
984 static void
985 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
986 {
987 int res;
988 union spdk_nvme_intel_feat_latency_tracking latency_tracking;
989
990 if (enable) {
991 latency_tracking.bits.enable = 0x01;
992 } else {
993 latency_tracking.bits.enable = 0x00;
994 }
995
996 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
997 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
998 if (res) {
999 printf("fail to allocate nvme request.\n");
1000 return;
1001 }
1002 g_outstanding_commands++;
1003
1004 while (g_outstanding_commands) {
1005 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1006 }
1007 }
1008
1009 static void
1010 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry)
1011 {
1012 struct spdk_nvme_ns *ns;
1013 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
1014 uint32_t nsid;
1015
1016 if (entry == NULL) {
1017 perror("ctrlr_entry malloc");
1018 exit(1);
1019 }
1020
1021 entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page),
1022 4096, NULL);
1023 if (entry->latency_page == NULL) {
1024 printf("Allocation error (latency page)\n");
1025 exit(1);
1026 }
1027
1028 build_nvme_name(entry->name, sizeof(entry->name), ctrlr);
1029
1030 entry->ctrlr = ctrlr;
1031 entry->trtype = trid_entry->trid.trtype;
1032 entry->next = g_controllers;
1033 g_controllers = entry;
1034
1035 if (g_latency_ssd_tracking_enable &&
1036 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
1037 set_latency_tracking_feature(ctrlr, true);
1038 }
1039
1040 if (trid_entry->nsid == 0) {
1041 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
1042 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
1043 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1044 if (ns == NULL) {
1045 continue;
1046 }
1047 register_ns(ctrlr, ns);
1048 }
1049 } else {
1050 ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid);
1051 if (!ns) {
1052 perror("Namespace does not exist.");
1053 exit(1);
1054 }
1055
1056 register_ns(ctrlr, ns);
1057 }
1058 }
1059
1060 static __thread unsigned int seed = 0;
1061
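/*
 * Pick the next offset (random or sequential), decide read vs. write from
 * the -M mix, and hand the task to the backend's submit_io callback.
 */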
1062 static inline void
1063 submit_single_io(struct perf_task *task)
1064 {
1065 uint64_t offset_in_ios;
1066 int rc;
1067 struct ns_worker_ctx *ns_ctx = task->ns_ctx;
1068 struct ns_entry *entry = ns_ctx->entry;
1069
1070 if (g_is_random) {
1071 offset_in_ios = rand_r(&seed) % entry->size_in_ios;
1072 } else {
1073 offset_in_ios = ns_ctx->offset_in_ios++;
1074 if (ns_ctx->offset_in_ios == entry->size_in_ios) {
1075 ns_ctx->offset_in_ios = 0;
1076 }
1077 }
1078
1079 task->submit_tsc = spdk_get_ticks();
1080
1081 if ((g_rw_percentage == 100) ||
1082 (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
1083 task->is_read = true;
1084 } else {
1085 task->is_read = false;
1086 }
1087
1088 rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);
1089
1090 if (spdk_unlikely(rc != 0)) {
1091 fprintf(stderr, "starting I/O failed\n");
1092 } else {
1093 ns_ctx->current_queue_depth++;
1094 }
1095 }
1096
1097 static inline void
1098 task_complete(struct perf_task *task)
1099 {
1100 struct ns_worker_ctx *ns_ctx;
1101 uint64_t tsc_diff;
1102 struct ns_entry *entry;
1103
1104 ns_ctx = task->ns_ctx;
1105 entry = ns_ctx->entry;
1106 ns_ctx->current_queue_depth--;
1107 ns_ctx->io_completed++;
1108 tsc_diff = spdk_get_ticks() - task->submit_tsc;
1109 ns_ctx->total_tsc += tsc_diff;
1110 if (spdk_unlikely(ns_ctx->min_tsc > tsc_diff)) {
1111 ns_ctx->min_tsc = tsc_diff;
1112 }
1113 if (spdk_unlikely(ns_ctx->max_tsc < tsc_diff)) {
1114 ns_ctx->max_tsc = tsc_diff;
1115 }
1116 if (spdk_unlikely(g_latency_sw_tracking_level > 0)) {
1117 spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff);
1118 }
1119
1120 if (spdk_unlikely(entry->md_size > 0)) {
1121 /* add application level verification for end-to-end data protection */
1122 entry->fn_table->verify_io(task, entry);
1123 }
1124
1125 /*
1126 * is_draining indicates when time has expired for the test run
1127 * and we are just waiting for the previously submitted I/O
1128 * to complete. In this case, do not submit a new I/O to replace
1129 * the one just completed.
1130 */
1131 if (spdk_unlikely(ns_ctx->is_draining)) {
1132 spdk_dma_free(task->iov.iov_base);
1133 spdk_dma_free(task->md_iov.iov_base);
1134 free(task);
1135 } else {
1136 submit_single_io(task);
1137 }
1138 }
1139
1140 static void
1141 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
1142 {
1143 struct perf_task *task = ctx;
1144
1145 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
1146 fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n",
1147 task->is_read ? "Read" : "Write",
1148 cpl->status.sct, cpl->status.sc);
1149 }
1150
1151 task_complete(task);
1152 }
1153
1154 static struct perf_task *
1155 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth)
1156 {
1157 struct perf_task *task;
1158
1159 task = calloc(1, sizeof(*task));
1160 if (task == NULL) {
1161 fprintf(stderr, "Out of memory allocating tasks\n");
1162 exit(1);
1163 }
1164
1165 ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1);
1166
1167 task->ns_ctx = ns_ctx;
1168
1169 return task;
1170 }
1171
1172 static void
1173 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
1174 {
1175 struct perf_task *task;
1176
1177 while (queue_depth-- > 0) {
1178 task = allocate_task(ns_ctx, queue_depth);
1179 submit_single_io(task);
1180 }
1181 }
1182
1183 static int
1184 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
1185 {
1186 return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx);
1187 }
1188
1189 static void
1190 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
1191 {
1192 ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx);
1193 }
1194
1195 static void
1196 print_periodic_performance(void)
1197 {
1198 uint64_t io_this_second;
1199 double mb_this_second;
1200 struct worker_thread *worker;
1201 struct ns_worker_ctx *ns_ctx;
1202
1203 if (!isatty(STDOUT_FILENO)) {
1204 /* Don't print periodic stats if output is not going
1205 * to a terminal.
1206 */
1207 return;
1208 }
1209
1210 io_this_second = 0;
1211 worker = g_workers;
1212 while (worker) {
1213 ns_ctx = worker->ns_ctx;
1214 while (ns_ctx) {
1215 io_this_second += ns_ctx->io_completed - ns_ctx->last_io_completed;
1216 ns_ctx->last_io_completed = ns_ctx->io_completed;
1217 ns_ctx = ns_ctx->next;
1218 }
1219 worker = worker->next;
1220 }
1221
1222 mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024);
1223 printf("%9ju IOPS, %8.2f MiB/s\r", io_this_second, mb_this_second);
1224 fflush(stdout);
1225 }
1226
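/*
 * Per-core worker: create qpairs for each assigned namespace, prime the
 * queue depth with initial I/O, poll for completions until the test time
 * elapses, then drain outstanding I/O and clean up.
 */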
1227 static int
1228 work_fn(void *arg)
1229 {
1230 uint64_t tsc_end, tsc_current, tsc_next_print;
1231 struct worker_thread *worker = (struct worker_thread *)arg;
1232 struct ns_worker_ctx *ns_ctx = NULL;
1233 uint32_t unfinished_ns_ctx;
1234
1235 /* Allocate queue pairs for each namespace. */
1236 ns_ctx = worker->ns_ctx;
1237 while (ns_ctx != NULL) {
1238 if (init_ns_worker_ctx(ns_ctx) != 0) {
1239 printf("ERROR: init_ns_worker_ctx() failed\n");
1240 return 1;
1241 }
1242 ns_ctx = ns_ctx->next;
1243 }
1244
1245 tsc_current = spdk_get_ticks();
1246 tsc_end = tsc_current + g_time_in_sec * g_tsc_rate;
1247 tsc_next_print = tsc_current + g_tsc_rate;
1248
1249 /* Submit initial I/O for each namespace. */
1250 ns_ctx = worker->ns_ctx;
1251 while (ns_ctx != NULL) {
1252 submit_io(ns_ctx, g_queue_depth);
1253 ns_ctx = ns_ctx->next;
1254 }
1255
1256 while (1) {
1257 /*
1258 * Check for completed I/O for each controller. A new
1259 * I/O will be submitted in the io_complete callback
1260 * to replace each I/O that is completed.
1261 */
1262 ns_ctx = worker->ns_ctx;
1263 while (ns_ctx != NULL) {
1264 ns_ctx->entry->fn_table->check_io(ns_ctx);
1265 ns_ctx = ns_ctx->next;
1266 }
1267
1268 tsc_current = spdk_get_ticks();
1269
1270 if (worker->lcore == g_master_core && tsc_current > tsc_next_print) {
1271 tsc_next_print += g_tsc_rate;
1272 print_periodic_performance();
1273 }
1274
1275 if (tsc_current > tsc_end) {
1276 break;
1277 }
1278 }
1279
1280 /* Drain the I/O of each ns_ctx in round-robin order for fairness. */
1281 do {
1282 unfinished_ns_ctx = 0;
1283 ns_ctx = worker->ns_ctx;
1284 while (ns_ctx != NULL) {
1285 /* Only the first pass enters this block, marking the context as draining. */
1286 if (!ns_ctx->is_draining) {
1287 ns_ctx->is_draining = true;
1288 }
1289
1290 if (ns_ctx->current_queue_depth > 0) {
1291 ns_ctx->entry->fn_table->check_io(ns_ctx);
1292 if (ns_ctx->current_queue_depth == 0) {
1293 cleanup_ns_worker_ctx(ns_ctx);
1294 } else {
1295 unfinished_ns_ctx++;
1296 }
1297 }
1298 ns_ctx = ns_ctx->next;
1299 }
1300 } while (unfinished_ns_ctx > 0);
1301
1302 return 0;
1303 }
1304
1305 static void usage(char *program_name)
1306 {
1307 printf("%s options", program_name);
1308 #if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO)
1309 printf(" [Kernel device(s)]...");
1310 #endif
1311 printf("\n");
1312 printf("\t[-q io depth]\n");
1313 printf("\t[-o io size in bytes]\n");
1314 printf("\t[-P number of io queues per namespace. default: 1]\n");
1315 printf("\t[-U number of unused io queues per controller. default: 0]\n");
1316 printf("\t[-w io pattern type, must be one of\n");
1317 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
1318 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
1319 printf("\t[-L enable latency tracking via sw, default: disabled]\n");
1320 printf("\t\t-L for latency summary, -LL for detailed histogram\n");
1321 printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
1322 printf("\t[-t time in seconds]\n");
1323 printf("\t[-c core mask for I/O submission/completion.]\n");
1324 printf("\t\t(default: 1)\n");
1325 printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n");
1326 printf("\t[-H enable header digest for TCP transport, default: disabled]\n");
1327 printf("\t[-I enable data digest for TCP transport, default: disabled]\n");
1328 printf("\t[-N no shutdown notification process for controllers, default: disabled]\n");
1329 printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
1330 printf("\t Format: 'key:value [key:value] ...'\n");
1331 printf("\t Keys:\n");
1332 printf("\t trtype Transport type (e.g. PCIe, RDMA)\n");
1333 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n");
1334 printf("\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
1335 printf("\t trsvcid Transport service identifier (e.g. 4420)\n");
1336 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
1337 printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
1338 printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
1339 printf("\t[-e metadata configuration]\n");
1340 printf("\t Keys:\n");
1341 printf("\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n");
1342 printf("\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
1343 printf("\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
1344 printf("\t -e 'PRACT=1,PRCHK=GUARD'\n");
1345 printf("\t[-k keep alive timeout period in millisecond]\n");
1346 printf("\t[-s DPDK huge memory size in MB.]\n");
1347 printf("\t[-C max completions per poll]\n");
1348 printf("\t\t(default: 0 - unlimited)\n");
1349 printf("\t[-i shared memory group ID]\n");
1350 printf("\t");
1351 spdk_log_usage(stdout, "-T");
1352 #ifdef SPDK_CONFIG_URING
1353 printf("\t[-R enable using liburing to drive kernel devices (Default: libaio)]\n");
1354 #endif
1355 #ifdef DEBUG
1356 printf("\t[-G enable debug logging]\n");
1357 #else
1358 printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n");
1359 #endif
1360 }
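/*
 * Example invocation (illustrative only; the PCIe address and sizes are
 * placeholders): 4 KiB random reads at queue depth 128 for 60 seconds on
 * core 0 against a local PCIe device:
 *
 *   ./perf -q 128 -o 4096 -w randread -t 60 -c 0x1 \
 *          -r 'trtype:PCIe traddr:0000:04:00.0'
 */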
1361
1362 static void
1363 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
1364 uint64_t total, uint64_t so_far)
1365 {
1366 double so_far_pct;
1367 double **cutoff = ctx;
1368
1369 if (count == 0) {
1370 return;
1371 }
1372
1373 so_far_pct = (double)so_far / total;
1374 while (so_far_pct >= **cutoff && **cutoff > 0) {
1375 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate);
1376 (*cutoff)++;
1377 }
1378 }
1379
1380 static void
1381 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
1382 uint64_t total, uint64_t so_far)
1383 {
1384 double so_far_pct;
1385
1386 if (count == 0) {
1387 return;
1388 }
1389
1390 so_far_pct = (double)so_far * 100 / total;
1391 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
1392 (double)start * 1000 * 1000 / g_tsc_rate,
1393 (double)end * 1000 * 1000 / g_tsc_rate,
1394 so_far_pct, count);
1395 }
1396
1397 static void
1398 print_performance(void)
1399 {
1400 uint64_t total_io_completed, total_io_tsc;
1401 double io_per_second, mb_per_second, average_latency, min_latency, max_latency;
1402 double sum_ave_latency, min_latency_so_far, max_latency_so_far;
1403 double total_io_per_second, total_mb_per_second;
1404 int ns_count;
1405 struct worker_thread *worker;
1406 struct ns_worker_ctx *ns_ctx;
1407 uint32_t max_strlen;
1408
1409 total_io_per_second = 0;
1410 total_mb_per_second = 0;
1411 total_io_completed = 0;
1412 total_io_tsc = 0;
1413 min_latency_so_far = (double)UINT64_MAX;
1414 max_latency_so_far = 0;
1415 ns_count = 0;
1416
1417 max_strlen = 0;
1418 worker = g_workers;
1419 while (worker) {
1420 ns_ctx = worker->ns_ctx;
1421 while (ns_ctx) {
1422 max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen);
1423 ns_ctx = ns_ctx->next;
1424 }
1425 worker = worker->next;
1426 }
1427
1428 printf("========================================================\n");
1429 printf("%*s\n", max_strlen + 60, "Latency(us)");
1430 printf("%-*s: %10s %10s %10s %10s %10s\n",
1431 max_strlen + 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max");
1432
1433 worker = g_workers;
1434 while (worker) {
1435 ns_ctx = worker->ns_ctx;
1436 while (ns_ctx) {
1437 if (ns_ctx->io_completed != 0) {
1438 io_per_second = (double)ns_ctx->io_completed / g_time_in_sec;
1439 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024);
1440 average_latency = ((double)ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate;
1441 min_latency = (double)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate;
1442 if (min_latency < min_latency_so_far) {
1443 min_latency_so_far = min_latency;
1444 }
1445
1446 max_latency = (double)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate;
1447 if (max_latency > max_latency_so_far) {
1448 max_latency_so_far = max_latency;
1449 }
1450
1451 printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1452 max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore,
1453 io_per_second, mb_per_second,
1454 average_latency, min_latency, max_latency);
1455 total_io_per_second += io_per_second;
1456 total_mb_per_second += mb_per_second;
1457 total_io_completed += ns_ctx->io_completed;
1458 total_io_tsc += ns_ctx->total_tsc;
1459 ns_count++;
1460 }
1461 ns_ctx = ns_ctx->next;
1462 }
1463 worker = worker->next;
1464 }
1465
1466 if (ns_count != 0 && total_io_completed) {
1467 sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate;
1468 printf("========================================================\n");
1469 printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1470 max_strlen + 13, "Total", total_io_per_second, total_mb_per_second,
1471 sum_ave_latency, min_latency_so_far, max_latency_so_far);
1472 printf("\n");
1473 }
1474
1475 if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) {
1476 return;
1477 }
1478
1479 worker = g_workers;
1480 while (worker) {
1481 ns_ctx = worker->ns_ctx;
1482 while (ns_ctx) {
1483 const double *cutoff = g_latency_cutoffs;
1484
1485 printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
1486 printf("=================================================================================\n");
1487
1488 spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff);
1489
1490 printf("\n");
1491 ns_ctx = ns_ctx->next;
1492 }
1493 worker = worker->next;
1494 }
1495
1496 if (g_latency_sw_tracking_level == 1) {
1497 return;
1498 }
1499
1500 worker = g_workers;
1501 while (worker) {
1502 ns_ctx = worker->ns_ctx;
1503 while (ns_ctx) {
1504 printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
1505 printf("==============================================================================\n");
1506 printf(" Range in us Cumulative IO count\n");
1507
1508 spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL);
1509 printf("\n");
1510 ns_ctx = ns_ctx->next;
1511 }
1512 worker = worker->next;
1513 }
1514
1515 }
1516
1517 static void
1518 print_latency_page(struct ctrlr_entry *entry)
1519 {
1520 int i;
1521
1522 printf("\n");
1523 printf("%s\n", entry->name);
1524 printf("--------------------------------------------------------\n");
1525
1526 for (i = 0; i < 32; i++) {
1527 if (entry->latency_page->buckets_32us[i]) {
1528 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]);
1529 }
1530 }
1531 for (i = 0; i < 31; i++) {
1532 if (entry->latency_page->buckets_1ms[i]) {
1533 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]);
1534 }
1535 }
1536 for (i = 0; i < 31; i++) {
1537 if (entry->latency_page->buckets_32ms[i])
1538 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
1539 entry->latency_page->buckets_32ms[i]);
1540 }
1541 }
1542
1543 static void
1544 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
1545 {
1546 struct ctrlr_entry *ctrlr;
1547
1548 printf("%s Latency Statistics:\n", op_name);
1549 printf("========================================================\n");
1550 ctrlr = g_controllers;
1551 while (ctrlr) {
1552 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
1553 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG,
1554 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0,
1555 enable_latency_tracking_complete,
1556 NULL)) {
1557 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
1558 exit(1);
1559 }
1560
1561 g_outstanding_commands++;
1562 } else {
1563 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name);
1564 }
1565 ctrlr = ctrlr->next;
1566 }
1567
1568 while (g_outstanding_commands) {
1569 ctrlr = g_controllers;
1570 while (ctrlr) {
1571 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
1572 ctrlr = ctrlr->next;
1573 }
1574 }
1575
1576 ctrlr = g_controllers;
1577 while (ctrlr) {
1578 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
1579 print_latency_page(ctrlr);
1580 }
1581 ctrlr = ctrlr->next;
1582 }
1583 printf("\n");
1584 }
1585
1586 static void
1587 print_stats(void)
1588 {
1589 print_performance();
1590 if (g_latency_ssd_tracking_enable) {
1591 if (g_rw_percentage != 0) {
1592 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
1593 }
1594 if (g_rw_percentage != 100) {
1595 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
1596 }
1597 }
1598 }
1599
1600 static void
1601 unregister_trids(void)
1602 {
1603 struct trid_entry *trid_entry, *tmp;
1604
1605 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
1606 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq);
1607 free(trid_entry);
1608 }
1609 }
1610
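/*
 * Parse a -r transport ID string into a trid_entry. PCIe is the default
 * transport, and an optional "ns:<id>" token restricts the run to a single
 * namespace on that controller.
 */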
1611 static int
1612 add_trid(const char *trid_str)
1613 {
1614 struct trid_entry *trid_entry;
1615 struct spdk_nvme_transport_id *trid;
1616 char *ns;
1617
1618 trid_entry = calloc(1, sizeof(*trid_entry));
1619 if (trid_entry == NULL) {
1620 return -1;
1621 }
1622
1623 trid = &trid_entry->trid;
1624 trid->trtype = SPDK_NVME_TRANSPORT_PCIE;
1625 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
1626
1627 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
1628 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
1629 free(trid_entry);
1630 return 1;
1631 }
1632
1633 spdk_nvme_transport_id_populate_trstring(trid,
1634 spdk_nvme_transport_id_trtype_str(trid->trtype));
1635
1636 ns = strcasestr(trid_str, "ns:");
1637 if (ns) {
1638 char nsid_str[6]; /* 5 digits maximum in an nsid */
1639 int len;
1640 int nsid;
1641
1642 ns += 3;
1643
1644 len = strcspn(ns, " \t\n");
1645 if (len > 5) {
1646 fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n");
1647 free(trid_entry);
1648 return 1;
1649 }
1650
1651 memcpy(nsid_str, ns, len);
1652 nsid_str[len] = '\0';
1653
1654 nsid = spdk_strtol(nsid_str, 10);
1655 if (nsid <= 0 || nsid > 65535) {
1656 fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n");
1657 free(trid_entry);
1658 return 1;
1659 }
1660
1661 trid_entry->nsid = (uint16_t)nsid;
1662 }
1663
1664 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
1665 return 0;
1666 }
1667
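/*
 * Extract the next "key=value" token from a comma/whitespace separated
 * string (used for the -e metadata configuration); returns the value
 * length, or 0 on parse error.
 */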
1668 static size_t
1669 parse_next_key(const char **str, char *key, char *val, size_t key_buf_size,
1670 size_t val_buf_size)
1671 {
1672 const char *sep;
1673 const char *separator = ", \t\n";
1674 size_t key_len, val_len;
1675
1676 *str += strspn(*str, separator);
1677
1678 sep = strchr(*str, '=');
1679 if (!sep) {
1680 fprintf(stderr, "Key without '=' separator\n");
1681 return 0;
1682 }
1683
1684 key_len = sep - *str;
1685 if (key_len >= key_buf_size) {
1686 fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n",
1687 key_len, key_buf_size - 1);
1688 return 0;
1689 }
1690
1691 memcpy(key, *str, key_len);
1692 key[key_len] = '\0';
1693
1694 *str += key_len + 1; /* Skip key */
1695 val_len = strcspn(*str, separator);
1696 if (val_len == 0) {
1697 fprintf(stderr, "Key without value\n");
1698 return 0;
1699 }
1700
1701 if (val_len >= val_buf_size) {
1702 fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n",
1703 val_len, val_buf_size - 1);
1704 return 0;
1705 }
1706
1707 memcpy(val, *str, val_len);
1708 val[val_len] = '\0';
1709
1710 *str += val_len;
1711
1712 return val_len;
1713 }
1714
1715 static int
1716 parse_metadata(const char *metacfg_str)
1717 {
1718 const char *str;
1719 size_t val_len;
1720 char key[32];
1721 char val[1024];
1722
1723 if (metacfg_str == NULL) {
1724 return -EINVAL;
1725 }
1726
1727 str = metacfg_str;
1728
1729 while (*str != '\0') {
1730 val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val));
1731 if (val_len == 0) {
1732 fprintf(stderr, "Failed to parse metadata\n");
1733 return -EINVAL;
1734 }
1735
1736 if (strcmp(key, "PRACT") == 0) {
1737 if (*val == '1') {
1738 g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
1739 }
1740 } else if (strcmp(key, "PRCHK") == 0) {
1741 if (strstr(val, "GUARD") != NULL) {
1742 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
1743 }
1744 if (strstr(val, "REFTAG") != NULL) {
1745 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
1746 }
1747 if (strstr(val, "APPTAG") != NULL) {
1748 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
1749 }
1750 } else {
1751 fprintf(stderr, "Unknown key '%s'\n", key);
1752 }
1753 }
1754
1755 return 0;
1756 }
1757
1758 static int
1759 parse_args(int argc, char **argv)
1760 {
1761 int op;
1762 long int val;
1763 int rc;
1764
1765 while ((op = getopt(argc, argv, "c:e:i:lo:q:r:k:s:t:w:C:DGHILM:NP:RT:U:V")) != -1) {
1766 switch (op) {
1767 case 'i':
1768 case 'C':
1769 case 'P':
1770 case 'o':
1771 case 'q':
1772 case 'k':
1773 case 's':
1774 case 't':
1775 case 'M':
1776 case 'U':
1777 val = spdk_strtol(optarg, 10);
1778 if (val < 0) {
1779 fprintf(stderr, "Converting a string to integer failed\n");
1780 return val;
1781 }
1782 switch (op) {
1783 case 'i':
1784 g_shm_id = val;
1785 break;
1786 case 'C':
1787 g_max_completions = val;
1788 break;
1789 case 'P':
1790 g_nr_io_queues_per_ns = val;
1791 break;
1792 case 'o':
1793 g_io_size_bytes = val;
1794 break;
1795 case 'q':
1796 g_queue_depth = val;
1797 break;
1798 case 'k':
1799 g_keep_alive_timeout_in_ms = val;
1800 break;
1801 case 's':
1802 g_dpdk_mem = val;
1803 break;
1804 case 't':
1805 g_time_in_sec = val;
1806 break;
1807 case 'M':
1808 g_rw_percentage = val;
1809 g_mix_specified = true;
1810 break;
1811 case 'U':
1812 g_nr_unused_io_queues = val;
1813 break;
1814 }
1815 break;
1816 case 'c':
1817 g_core_mask = optarg;
1818 break;
1819 case 'e':
1820 if (parse_metadata(optarg)) {
1821 usage(argv[0]);
1822 return 1;
1823 }
1824 break;
1825 case 'l':
1826 g_latency_ssd_tracking_enable = true;
1827 break;
1828 case 'r':
1829 if (add_trid(optarg)) {
1830 usage(argv[0]);
1831 return 1;
1832 }
1833 break;
1834 case 'w':
1835 g_workload_type = optarg;
1836 break;
1837 case 'D':
1838 g_disable_sq_cmb = 1;
1839 break;
1840 case 'G':
1841 #ifndef DEBUG
1842 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
1843 argv[0]);
1844 usage(argv[0]);
1845 return 1;
1846 #else
1847 spdk_log_set_flag("nvme");
1848 spdk_log_set_print_level(SPDK_LOG_DEBUG);
1849 break;
1850 #endif
1851 case 'H':
1852 g_header_digest = 1;
1853 break;
1854 case 'I':
1855 g_data_digest = 1;
1856 break;
1857 case 'L':
1858 g_latency_sw_tracking_level++;
1859 break;
1860 case 'N':
1861 g_no_shn_notification = true;
1862 break;
1863 case 'R':
1864 #ifndef SPDK_CONFIG_URING
1865 fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n",
1866 argv[0]);
1867 usage(argv[0]);
1868 return 1;
1869 #endif
1870 g_use_uring = true;
1871 break;
1872 case 'T':
1873 rc = spdk_log_set_flag(optarg);
1874 if (rc < 0) {
1875 fprintf(stderr, "unknown flag\n");
1876 usage(argv[0]);
1877 exit(EXIT_FAILURE);
1878 }
1879 spdk_log_set_print_level(SPDK_LOG_DEBUG);
1880 #ifndef DEBUG
1881 fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n",
1882 argv[0]);
1883 usage(argv[0]);
1884 return 1;
1885 #endif
1886 break;
1887 case 'V':
1888 g_vmd = true;
1889 break;
1890 default:
1891 usage(argv[0]);
1892 return 1;
1893 }
1894 }
1895
1896 if (!g_nr_io_queues_per_ns) {
1897 usage(argv[0]);
1898 return 1;
1899 }
1900
1901 if (!g_queue_depth) {
1902 fprintf(stderr, "missing -q (queue size) operand\n");
1903 usage(argv[0]);
1904 return 1;
1905 }
1906 if (!g_io_size_bytes) {
1907 fprintf(stderr, "missing -o (block size) operand\n");
1908 usage(argv[0]);
1909 return 1;
1910 }
1911 if (!g_workload_type) {
1912 fprintf(stderr, "missing -w (io pattern type) operand\n");
1913 usage(argv[0]);
1914 return 1;
1915 }
1916 if (!g_time_in_sec) {
1917 fprintf(stderr, "missing -t (test time in seconds) operand\n");
1918 usage(argv[0]);
1919 return 1;
1920 }
1921
1922 if (strncmp(g_workload_type, "rand", 4) == 0) {
1923 g_is_random = 1;
1924 g_workload_type = &g_workload_type[4];
1925 }
1926
1927 if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) {
1928 g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0;
1929 if (g_mix_specified) {
1930 fprintf(stderr, "Ignoring -M option... Please use -M option"
1931 " only when using rw or randrw.\n");
1932 }
1933 } else if (strcmp(g_workload_type, "rw") == 0) {
1934 if (g_rw_percentage < 0 || g_rw_percentage > 100) {
1935 fprintf(stderr,
1936 "-M must be specified to value from 0 to 100 "
1937 "for rw or randrw.\n");
1938 return 1;
1939 }
1940 } else {
1941 fprintf(stderr,
1942 "io pattern type must be one of\n"
1943 "(read, write, randread, randwrite, rw, randrw)\n");
1944 return 1;
1945 }
1946
1947 if (TAILQ_EMPTY(&g_trid_list)) {
1948 /* If no transport IDs specified, default to enumerating all local PCIe devices */
1949 add_trid("trtype:PCIe");
1950 } else {
1951 struct trid_entry *trid_entry, *trid_entry_tmp;
1952
1953 g_no_pci = true;
1954 /* check whether there is local PCIe type */
1955 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
1956 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
1957 g_no_pci = false;
1958 break;
1959 }
1960 }
1961 }
1962
1963 g_file_optind = optind;
1964
1965 return 0;
1966 }
1967
1968 static int
1969 register_workers(void)
1970 {
1971 uint32_t i;
1972 struct worker_thread *worker;
1973
1974 g_workers = NULL;
1975 g_num_workers = 0;
1976
1977 SPDK_ENV_FOREACH_CORE(i) {
1978 worker = calloc(1, sizeof(*worker));
1979 if (worker == NULL) {
1980 fprintf(stderr, "Unable to allocate worker\n");
1981 return -1;
1982 }
1983
1984 worker->lcore = i;
1985 worker->next = g_workers;
1986 g_workers = worker;
1987 g_num_workers++;
1988 }
1989
1990 return 0;
1991 }
1992
1993 static void
1994 unregister_workers(void)
1995 {
1996 struct worker_thread *worker = g_workers;
1997
1998 /* Free namespace context and worker thread */
1999 while (worker) {
2000 struct worker_thread *next_worker = worker->next;
2001 struct ns_worker_ctx *ns_ctx = worker->ns_ctx;
2002
2003 while (ns_ctx) {
2004 struct ns_worker_ctx *next_ns_ctx = ns_ctx->next;
2005 spdk_histogram_data_free(ns_ctx->histogram);
2006 free(ns_ctx);
2007 ns_ctx = next_ns_ctx;
2008 }
2009
2010 free(worker);
2011 worker = next_worker;
2012 }
2013 }
2014
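/*
 * Probe callback, invoked once per discovered controller before attaching.
 * Applies the command-line overrides (CMB SQs, shutdown notification,
 * digests, keep-alive timeout) to the controller options and returns true
 * so the controller is attached.
 */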
2015 static bool
2016 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2017 struct spdk_nvme_ctrlr_opts *opts)
2018 {
2019 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2020 if (g_disable_sq_cmb) {
2021 opts->use_cmb_sqs = false;
2022 }
2023 if (g_no_shn_notification) {
2024 opts->no_shn_notification = true;
2025 }
2026 }
2027
2031 2028 /* Set io_queue_size to UINT16_MAX; the NVMe driver
2032 2029 * will then reduce it to the controller's MQES limit,
2033 2030 * making each I/O queue as large as possible.
2034 2031 */
2032 opts->io_queue_size = UINT16_MAX;
2033
2034 /* Set the header and data_digest */
2035 opts->header_digest = g_header_digest;
2036 opts->data_digest = g_data_digest;
2037 opts->keep_alive_timeout_ms = g_keep_alive_timeout_in_ms;
2038
2039 return true;
2040 }
2041
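/*
 * Attach callback, invoked once a controller has been attached. Prints
 * where the controller was found (fabrics address and subsystem NQN, or
 * PCIe address with vendor/device ID) and registers it for the test run.
 */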
2042 static void
2043 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2044 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2045 {
2046 struct trid_entry *trid_entry = cb_ctx;
2047 struct spdk_pci_addr pci_addr;
2048 struct spdk_pci_device *pci_dev;
2049 struct spdk_pci_id pci_id;
2050
2051 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
2052 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
2053 trid->traddr, trid->trsvcid,
2054 trid->subnqn);
2055 } else {
2056 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
2057 return;
2058 }
2059
2060 pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr);
2061 if (!pci_dev) {
2062 return;
2063 }
2064
2065 pci_id = spdk_pci_device_get_id(pci_dev);
2066
2067 printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
2068 trid->traddr,
2069 pci_id.vendor_id, pci_id.device_id);
2070 }
2071
2072 register_ctrlr(ctrlr, trid_entry);
2073 }
2074
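/*
 * Probe every transport ID collected during argument parsing and attach
 * to the controllers found there via probe_cb()/attach_cb().
 */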
2075 static int
2076 register_controllers(void)
2077 {
2078 struct trid_entry *trid_entry;
2079
2080 printf("Initializing NVMe Controllers\n");
2081
2082 if (g_vmd && spdk_vmd_init()) {
2083 fprintf(stderr, "Failed to initialize VMD."
2084 " Some NVMe devices can be unavailable.\n");
2085 }
2086
2087 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
2088 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) {
2089 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
2090 trid_entry->trid.traddr);
2091 return -1;
2092 }
2093 }
2094
2095 return 0;
2096 }
2097
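/*
 * Detach all registered controllers, freeing their latency pages and any
 * deliberately allocated-but-unused qpairs, and switch Intel latency
 * tracking back off if it was enabled for the run.
 */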
2098 static void
2099 unregister_controllers(void)
2100 {
2101 struct ctrlr_entry *entry = g_controllers;
2102
2103 while (entry) {
2104 struct ctrlr_entry *next = entry->next;
2105 spdk_dma_free(entry->latency_page);
2106 if (g_latency_ssd_tracking_enable &&
2107 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
2108 set_latency_tracking_feature(entry->ctrlr, false);
2109 }
2110
2111 if (g_nr_unused_io_queues) {
2112 int i;
2113
2114 for (i = 0; i < g_nr_unused_io_queues; i++) {
2115 spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]);
2116 }
2117
2118 free(entry->unused_qpairs);
2119 }
2120
2121 spdk_nvme_detach(entry->ctrlr);
2122 free(entry);
2123 entry = next;
2124 }
2125
2126 if (g_vmd) {
2127 spdk_vmd_fini();
2128 }
2129 }
2130
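/*
 * Pair namespaces with worker threads round-robin. Both lists wrap, so
 * every worker gets at least one namespace and every namespace is served
 * by at least one worker.
 */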
2131 static int
2132 associate_workers_with_ns(void)
2133 {
2134 struct ns_entry *entry = g_namespaces;
2135 struct worker_thread *worker = g_workers;
2136 struct ns_worker_ctx *ns_ctx;
2137 int i, count;
2138
2139 count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
2140
2141 for (i = 0; i < count; i++) {
2142 if (entry == NULL) {
2143 break;
2144 }
2145
2146 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx));
2147 if (!ns_ctx) {
2148 return -1;
2149 }
2150
2151 printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
2152 ns_ctx->min_tsc = UINT64_MAX;
2153 ns_ctx->entry = entry;
2154 ns_ctx->next = worker->ns_ctx;
2155 ns_ctx->histogram = spdk_histogram_data_alloc();
2156 worker->ns_ctx = ns_ctx;
2157
2158 worker = worker->next;
2159 if (worker == NULL) {
2160 worker = g_workers;
2161 }
2162
2163 entry = entry->next;
2164 if (entry == NULL) {
2165 entry = g_namespaces;
2166 }
2167
2168 }
2169
2170 return 0;
2171 }
2172
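/*
 * Dedicated thread that polls admin completions on non-PCIe (fabrics)
 * controllers once per second so that keep-alive and other admin events
 * are serviced; cancelled from main() during cleanup.
 */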
2173 static void *
2174 nvme_poll_ctrlrs(void *arg)
2175 {
2176 struct ctrlr_entry *entry;
2177 int oldstate;
2178
2179 spdk_unaffinitize_thread();
2180
2181 while (true) {
2182 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
2183
2184 entry = g_controllers;
2185 while (entry) {
2186 if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) {
2187 spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr);
2188 }
2189 entry = entry->next;
2190 }
2191
2192 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
2193
2194 /* This is a pthread cancellation point and cannot be removed. */
2195 sleep(1);
2196 }
2197
2198 return NULL;
2199 }
2200
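/*
 * Start-up sequence: parse arguments, initialize the SPDK environment,
 * register workers, files (when AIO/uring support is built in), and
 * controllers, pair namespaces with workers, then run the workload on
 * every core and print the collected statistics.
 *
 * A representative invocation (values are illustrative): 4096-byte random
 * reads at queue depth 128 for 60 seconds against all local PCIe devices:
 *
 *   ./perf -q 128 -o 4096 -w randread -t 60
 */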
2201 int main(int argc, char **argv)
2202 {
2203 int rc;
2204 struct worker_thread *worker, *master_worker;
2205 struct spdk_env_opts opts;
2206 pthread_t thread_id = 0;
2207
2208 rc = parse_args(argc, argv);
2209 if (rc != 0) {
2210 return rc;
2211 }
2212
2213 spdk_env_opts_init(&opts);
2214 opts.name = "perf";
2215 opts.shm_id = g_shm_id;
2216 if (g_core_mask) {
2217 opts.core_mask = g_core_mask;
2218 }
2219
2220 if (g_dpdk_mem) {
2221 opts.mem_size = g_dpdk_mem;
2222 }
2223 if (g_no_pci) {
2224 opts.no_pci = g_no_pci;
2225 }
2226 if (spdk_env_init(&opts) < 0) {
2227 fprintf(stderr, "Unable to initialize SPDK env\n");
2228 rc = -1;
2229 goto cleanup;
2230 }
2231
2232 g_tsc_rate = spdk_get_ticks_hz();
2233
2234 if (register_workers() != 0) {
2235 rc = -1;
2236 goto cleanup;
2237 }
2238
2239 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING)
2240 if (register_files(argc, argv) != 0) {
2241 rc = -1;
2242 goto cleanup;
2243 }
2244 #endif
2245
2246 if (register_controllers() != 0) {
2247 rc = -1;
2248 goto cleanup;
2249 }
2250
2251 if (g_warn) {
2252 printf("WARNING: Some requested NVMe devices were skipped\n");
2253 }
2254
2255 if (g_num_namespaces == 0) {
2256 fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n");
rc = -1;
2257 goto cleanup;
2258 }
2259
2260 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
2261 if (rc != 0) {
2262 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");
2263 goto cleanup;
2264 }
2265
2266 if (associate_workers_with_ns() != 0) {
2267 rc = -1;
2268 goto cleanup;
2269 }
2270
2271 printf("Initialization complete. Launching workers.\n");
2272
2273 /* Launch all of the slave workers */
2274 g_master_core = spdk_env_get_current_core();
2275 master_worker = NULL;
2276 worker = g_workers;
2277 while (worker != NULL) {
2278 if (worker->lcore != g_master_core) {
2279 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
2280 } else {
2281 assert(master_worker == NULL);
2282 master_worker = worker;
2283 }
2284 worker = worker->next;
2285 }
2286
2287 assert(master_worker != NULL);
2288 rc = work_fn(master_worker);
2289
2290 spdk_env_thread_wait_all();
2291
2292 print_stats();
2293
2294 cleanup:
2295 if (thread_id && pthread_cancel(thread_id) == 0) {
2296 pthread_join(thread_id, NULL);
2297 }
2298 unregister_trids();
2299 unregister_namespaces();
2300 unregister_controllers();
2301 unregister_workers();
2302
2303 if (rc != 0) {
2304 fprintf(stderr, "%s: errors occured\n", argv[0]);
2305 }
2306
2307 return rc;
2308 }