/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/nvme.h"
#include "spdk/queue.h"
#include "spdk/string.h"
#include "spdk/nvme_intel.h"
#include "spdk/histogram_data.h"
#include "spdk/endian.h"
#include "spdk/dif.h"
#include "spdk/util.h"
#include "spdk/log.h"
#include "spdk/likely.h"

#if HAVE_LIBAIO
#include <libaio.h>
#endif

struct ctrlr_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	enum spdk_nvme_transport_type trtype;
	struct spdk_nvme_intel_rw_latency_page *latency_page;

	struct spdk_nvme_qpair **unused_qpairs;

	struct ctrlr_entry *next;
	char name[1024];
};

enum entry_type {
	ENTRY_TYPE_NVME_NS,
	ENTRY_TYPE_AIO_FILE,
};

struct ns_fn_table;

struct ns_entry {
	enum entry_type type;
	const struct ns_fn_table *fn_table;

	union {
		struct {
			struct spdk_nvme_ctrlr *ctrlr;
			struct spdk_nvme_ns *ns;
		} nvme;
#if HAVE_LIBAIO
		struct {
			int fd;
		} aio;
#endif
	} u;

	struct ns_entry *next;
	uint32_t io_size_blocks;
	uint32_t num_io_requests;
	uint64_t size_in_ios;
	uint32_t block_size;
	uint32_t md_size;
	bool md_interleave;
	bool pi_loc;
	enum spdk_nvme_pi_type pi_type;
	uint32_t io_flags;
	char name[1024];
};

static const double g_latency_cutoffs[] = {
	0.01,
	0.10,
	0.25,
	0.50,
	0.75,
	0.90,
	0.95,
	0.98,
	0.99,
	0.995,
	0.999,
	0.9999,
	0.99999,
	0.999999,
	0.9999999,
	-1,
};

struct ns_worker_ctx {
	struct ns_entry *entry;
	uint64_t io_completed;
	uint64_t total_tsc;
	uint64_t min_tsc;
	uint64_t max_tsc;
	uint64_t current_queue_depth;
	uint64_t offset_in_ios;
	bool is_draining;

	union {
		struct {
			int num_qpairs;
			struct spdk_nvme_qpair **qpair;
			int last_qpair;
		} nvme;

#if HAVE_LIBAIO
		struct {
			struct io_event *events;
			io_context_t ctx;
		} aio;
#endif
	} u;

	struct ns_worker_ctx *next;

	struct spdk_histogram_data *histogram;
};

struct perf_task {
	struct ns_worker_ctx *ns_ctx;
	struct iovec iov;
	struct iovec md_iov;
	uint64_t submit_tsc;
	bool is_read;
	struct spdk_dif_ctx dif_ctx;
#if HAVE_LIBAIO
	struct iocb iocb;
#endif
};

struct worker_thread {
	struct ns_worker_ctx *ns_ctx;
	struct worker_thread *next;
	unsigned lcore;
};

struct ns_fn_table {
	void (*setup_payload)(struct perf_task *task, uint8_t pattern);

	int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
			 struct ns_entry *entry, uint64_t offset_in_ios);

	void (*check_io)(struct ns_worker_ctx *ns_ctx);

	void (*verify_io)(struct perf_task *task, struct ns_entry *entry);

	int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);

	void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);
};
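
/*
 * ns_fn_table is a small per-backend dispatch table: the NVMe and AIO
 * paths each provide one instance, and the generic I/O loop below only
 * ever calls through entry->fn_table, so both backends share the same
 * submit/poll hot path.
 */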

static int g_outstanding_commands;

static bool g_latency_ssd_tracking_enable = false;
static int g_latency_sw_tracking_level = 0;

static struct ctrlr_entry *g_controllers = NULL;
static int g_controllers_found = 0;
static struct ns_entry *g_namespaces = NULL;
static int g_num_namespaces = 0;
static struct worker_thread *g_workers = NULL;
static int g_num_workers = 0;

static uint64_t g_tsc_rate;

static uint32_t g_io_align = 0x200;
static uint32_t g_io_size_bytes;
static uint32_t g_max_io_md_size;
static uint32_t g_max_io_size_blocks;
static uint32_t g_metacfg_pract_flag;
static uint32_t g_metacfg_prchk_flags;
static int g_rw_percentage;
static int g_is_random;
static int g_queue_depth;
static int g_nr_io_queues_per_ns = 1;
static int g_nr_unused_io_queues = 0;
static int g_time_in_sec;
static uint32_t g_max_completions;
static int g_dpdk_mem;
static int g_shm_id = -1;
static uint32_t g_disable_sq_cmb;
static bool g_no_pci;
static bool g_warn;
static bool g_header_digest;
static bool g_data_digest;
static uint32_t g_keep_alive_timeout_in_ms = 0;

static const char *g_core_mask;

struct trid_entry {
	struct spdk_nvme_transport_id trid;
	uint16_t nsid;
	TAILQ_ENTRY(trid_entry) tailq;
};

static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);

static int g_aio_optind; /* Index of first AIO filename in argv */

static inline void
task_complete(struct perf_task *task);

#if HAVE_LIBAIO
static void
aio_setup_payload(struct perf_task *task, uint8_t pattern)
{
	task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
	task->iov.iov_len = g_io_size_bytes;
	if (task->iov.iov_base == NULL) {
		fprintf(stderr, "spdk_dma_zmalloc() for task->buf failed\n");
		exit(1);
	}
	memset(task->iov.iov_base, pattern, task->iov.iov_len);
}

static int
aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd,
	   struct iovec *iov, uint64_t offset, void *cb_ctx)
{
	iocb->aio_fildes = fd;
	iocb->aio_reqprio = 0;
	iocb->aio_lio_opcode = cmd;
	iocb->u.c.buf = iov->iov_base;
	iocb->u.c.nbytes = iov->iov_len;
	iocb->u.c.offset = offset * iov->iov_len;
	iocb->data = cb_ctx;

	if (io_submit(aio_ctx, 1, &iocb) < 0) {
		perror("io_submit");
		return -1;
	}

	return 0;
}
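
/*
 * Note that aio_submit() takes an offset in I/O-size units, not bytes:
 * the byte offset handed to the kernel is offset * iov->iov_len. For
 * example, a task targeting I/O index 7 with a 4 KiB I/O size lands at
 * byte 28672 of the file.
 */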

static int
aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
	      struct ns_entry *entry, uint64_t offset_in_ios)
{
	if (task->is_read) {
		return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD,
				  &task->iov, offset_in_ios, task);
	} else {
		return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE,
				  &task->iov, offset_in_ios, task);
	}
}

static void
aio_check_io(struct ns_worker_ctx *ns_ctx)
{
	int count, i;
	struct timespec timeout;

	timeout.tv_sec = 0;
	timeout.tv_nsec = 0;

	count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout);
	if (count < 0) {
		fprintf(stderr, "io_getevents error\n");
		exit(1);
	}

	for (i = 0; i < count; i++) {
		task_complete(ns_ctx->u.aio.events[i].data);
	}
}

static void
aio_verify_io(struct perf_task *task, struct ns_entry *entry)
{
}

static int
aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event));
	if (!ns_ctx->u.aio.events) {
		return -1;
	}
	ns_ctx->u.aio.ctx = 0;
	if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) {
		free(ns_ctx->u.aio.events);
		perror("io_setup");
		return -1;
	}
	return 0;
}

static void
aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	io_destroy(ns_ctx->u.aio.ctx);
	free(ns_ctx->u.aio.events);
}

static const struct ns_fn_table aio_fn_table = {
	.setup_payload = aio_setup_payload,
	.submit_io = aio_submit_io,
	.check_io = aio_check_io,
	.verify_io = aio_verify_io,
	.init_ns_worker_ctx = aio_init_ns_worker_ctx,
	.cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx,
};

static int
register_aio_file(const char *path)
{
	struct ns_entry *entry;

	int flags, fd;
	uint64_t size;
	uint32_t blklen;

	if (g_rw_percentage == 100) {
		flags = O_RDONLY;
	} else if (g_rw_percentage == 0) {
		flags = O_WRONLY;
	} else {
		flags = O_RDWR;
	}

	flags |= O_DIRECT;

	fd = open(path, flags);
	if (fd < 0) {
		fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno));
		return -1;
	}

	size = spdk_fd_get_size(fd);
	if (size == 0) {
		fprintf(stderr, "Could not determine size of AIO device %s\n", path);
		close(fd);
		return -1;
	}

	blklen = spdk_fd_get_blocklen(fd);
	if (blklen == 0) {
		fprintf(stderr, "Could not determine block size of AIO device %s\n", path);
		close(fd);
		return -1;
	}

	/*
	 * TODO: This should really calculate the LCM of the current g_io_align and blklen.
	 * For now, it's fairly safe to just assume all block sizes are powers of 2.
	 */
	if (g_io_align < blklen) {
		g_io_align = blklen;
	}

	entry = malloc(sizeof(struct ns_entry));
	if (entry == NULL) {
		close(fd);
		perror("aio ns_entry malloc");
		return -1;
	}

	entry->type = ENTRY_TYPE_AIO_FILE;
	entry->fn_table = &aio_fn_table;
	entry->u.aio.fd = fd;
	entry->size_in_ios = size / g_io_size_bytes;
	entry->io_size_blocks = g_io_size_bytes / blklen;

	snprintf(entry->name, sizeof(entry->name), "%s", path);

	g_num_namespaces++;
	entry->next = g_namespaces;
	g_namespaces = entry;

	return 0;
}

static int
register_aio_files(int argc, char **argv)
{
	int i;

	/* Treat everything after the options as files for AIO */
	for (i = g_aio_optind; i < argc; i++) {
		if (register_aio_file(argv[i]) != 0) {
			return 1;
		}
	}

	return 0;
}
#endif /* HAVE_LIBAIO */

static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl);

static void
nvme_setup_payload(struct perf_task *task, uint8_t pattern)
{
	uint32_t max_io_size_bytes, max_io_md_size;

	/* Maximum extended LBA format size across all active namespaces;
	 * equal to g_io_size_bytes for namespaces without metadata.
	 */
	max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks;
	task->iov.iov_base = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL);
	task->iov.iov_len = max_io_size_bytes;
	if (task->iov.iov_base == NULL) {
		fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
		exit(1);
	}
	memset(task->iov.iov_base, pattern, task->iov.iov_len);

	max_io_md_size = g_max_io_md_size * g_max_io_size_blocks;
	if (max_io_md_size != 0) {
		task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL);
		task->md_iov.iov_len = max_io_md_size;
		if (task->md_iov.iov_base == NULL) {
			fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n");
			spdk_dma_free(task->iov.iov_base);
			exit(1);
		}
	}
}

static int
nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
	       struct ns_entry *entry, uint64_t offset_in_ios)
{
	uint64_t lba;
	int rc;
	int qp_num;

	enum dif_mode {
		DIF_MODE_NONE = 0,
		DIF_MODE_DIF = 1,
		DIF_MODE_DIX = 2,
	} mode = DIF_MODE_NONE;

	lba = offset_in_ios * entry->io_size_blocks;

	if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) {
		if (entry->md_interleave) {
			mode = DIF_MODE_DIF;
		} else {
			mode = DIF_MODE_DIX;
		}
	}

	qp_num = ns_ctx->u.nvme.last_qpair;
	ns_ctx->u.nvme.last_qpair++;
	if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_qpairs) {
		ns_ctx->u.nvme.last_qpair = 0;
	}

	if (mode != DIF_MODE_NONE) {
		rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size,
				       entry->md_interleave, entry->pi_loc,
				       (enum spdk_dif_type)entry->pi_type, entry->io_flags,
				       lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0);
		if (rc != 0) {
			fprintf(stderr, "Initialization of DIF context failed\n");
			exit(1);
		}
	}

	if (task->is_read) {
		return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num],
						     task->iov.iov_base, task->md_iov.iov_base,
						     lba,
						     entry->io_size_blocks, io_complete,
						     task, entry->io_flags,
						     task->dif_ctx.apptag_mask, task->dif_ctx.app_tag);
	} else {
		switch (mode) {
		case DIF_MODE_DIF:
			rc = spdk_dif_generate(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx);
			if (rc != 0) {
				fprintf(stderr, "Generation of DIF failed\n");
				return rc;
			}
			break;
		case DIF_MODE_DIX:
			rc = spdk_dix_generate(&task->iov, 1, &task->md_iov, entry->io_size_blocks,
					       &task->dif_ctx);
			if (rc != 0) {
				fprintf(stderr, "Generation of DIX failed\n");
				return rc;
			}
			break;
		default:
			break;
		}

		return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num],
						      task->iov.iov_base, task->md_iov.iov_base,
						      lba,
						      entry->io_size_blocks, io_complete,
						      task, entry->io_flags,
						      task->dif_ctx.apptag_mask, task->dif_ctx.app_tag);
	}
}
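
/*
 * DIF vs. DIX above: with an extended-LBA (interleaved) format the
 * protection information lives inside the data buffer itself (DIF), so
 * only task->iov is touched; with a separate metadata pointer the PI is
 * generated into task->md_iov instead (DIX). When PRACT is set the
 * controller inserts/strips PI itself and neither path runs.
 */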

static void
nvme_check_io(struct ns_worker_ctx *ns_ctx)
{
	int i, rc;

	for (i = 0; i < ns_ctx->u.nvme.num_qpairs; i++) {
		rc = spdk_nvme_qpair_process_completions(ns_ctx->u.nvme.qpair[i], g_max_completions);
		if (rc < 0) {
			fprintf(stderr, "NVMe io qpair process completion error\n");
			exit(1);
		}
	}
}

static void
nvme_verify_io(struct perf_task *task, struct ns_entry *entry)
{
	struct spdk_dif_error err_blk = {};
	int rc;

	if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) {
		return;
	}

	if (entry->md_interleave) {
		rc = spdk_dif_verify(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx,
				     &err_blk);
		if (rc != 0) {
			fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
				err_blk.err_type, err_blk.err_offset);
		}
	} else {
		rc = spdk_dix_verify(&task->iov, 1, &task->md_iov, entry->io_size_blocks,
				     &task->dif_ctx, &err_blk);
		if (rc != 0) {
			fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
				err_blk.err_type, err_blk.err_offset);
		}
	}
}

/*
 * TODO: If a controller has multiple namespaces, they could all use the same queue.
 * For now, give each namespace/thread combination its own queue.
 */
static int
nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	struct spdk_nvme_io_qpair_opts opts;
	struct ns_entry *entry = ns_ctx->entry;
	int i;

	ns_ctx->u.nvme.num_qpairs = g_nr_io_queues_per_ns;
	ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_qpairs, sizeof(struct spdk_nvme_qpair *));
	if (!ns_ctx->u.nvme.qpair) {
		return -1;
	}

	spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts));
	if (opts.io_queue_requests < entry->num_io_requests) {
		opts.io_queue_requests = entry->num_io_requests;
	}
	opts.delay_pcie_doorbell = true;

	for (i = 0; i < ns_ctx->u.nvme.num_qpairs; i++) {
		ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts,
					  sizeof(opts));
		if (!ns_ctx->u.nvme.qpair[i]) {
			printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
			return -1;
		}
	}

	return 0;
}

static void
nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	int i;

	for (i = 0; i < ns_ctx->u.nvme.num_qpairs; i++) {
		spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]);
	}

	free(ns_ctx->u.nvme.qpair);
}

static const struct ns_fn_table nvme_fn_table = {
	.setup_payload = nvme_setup_payload,
	.submit_io = nvme_submit_io,
	.check_io = nvme_check_io,
	.verify_io = nvme_verify_io,
	.init_ns_worker_ctx = nvme_init_ns_worker_ctx,
	.cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx,
};

static void
build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr)
{
	const struct spdk_nvme_transport_id *trid;

	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);

	switch (trid->trtype) {
	case SPDK_NVME_TRANSPORT_PCIE:
		snprintf(name, length, "PCIE (%s)", trid->traddr);
		break;
	case SPDK_NVME_TRANSPORT_RDMA:
		snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
		break;
	case SPDK_NVME_TRANSPORT_TCP:
		snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
		break;
	default:
		fprintf(stderr, "Unknown transport type %d\n", trid->trtype);
		break;
	}
}

static void
register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
{
	struct ns_entry *entry;
	const struct spdk_nvme_ctrlr_data *cdata;
	uint32_t max_xfer_size, entries, sector_size;
	uint64_t ns_size;
	struct spdk_nvme_io_qpair_opts opts;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!spdk_nvme_ns_is_active(ns)) {
		printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
		       cdata->mn, cdata->sn,
		       spdk_nvme_ns_get_id(ns));
		g_warn = true;
		return;
	}

	ns_size = spdk_nvme_ns_get_size(ns);
	sector_size = spdk_nvme_ns_get_sector_size(ns);

	if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) {
		printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
		       "ns size %" PRIu64 " / block size %u for I/O size %u\n",
		       cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
		       ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
		g_warn = true;
		return;
	}

	max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	/* The NVMe driver may add additional entries based on
	 * stripe size and maximum transfer size; we assume
	 * one extra entry is used for striping.
	 */
	entries = (g_io_size_bytes - 1) / max_xfer_size + 2;
	if ((g_queue_depth * entries) > opts.io_queue_size) {
		printf("controller IO queue size %u is less than required\n",
		       opts.io_queue_size);
		printf("Consider using a lower queue depth or smaller IO size, because "
		       "IO requests may be queued at the NVMe driver.\n");
		g_warn = true;
	}
	/* For requests that have child requests, the parent request itself
	 * also occupies one entry.
	 */
	entries += 1;

	entry = calloc(1, sizeof(struct ns_entry));
	if (entry == NULL) {
		perror("ns_entry malloc");
		exit(1);
	}

	entry->type = ENTRY_TYPE_NVME_NS;
	entry->fn_table = &nvme_fn_table;
	entry->u.nvme.ctrlr = ctrlr;
	entry->u.nvme.ns = ns;
	entry->num_io_requests = g_queue_depth * entries;

	entry->size_in_ios = ns_size / g_io_size_bytes;
	entry->io_size_blocks = g_io_size_bytes / sector_size;

	entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns);
	entry->md_size = spdk_nvme_ns_get_md_size(ns);
	entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns);
	entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start;
	entry->pi_type = spdk_nvme_ns_get_pi_type(ns);

	if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
		entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags;
	}

	if (g_max_io_md_size < entry->md_size) {
		g_max_io_md_size = entry->md_size;
	}

	if (g_max_io_size_blocks < entry->io_size_blocks) {
		g_max_io_size_blocks = entry->io_size_blocks;
	}

	build_nvme_name(entry->name, sizeof(entry->name), ctrlr);

	g_num_namespaces++;
	entry->next = g_namespaces;
	g_namespaces = entry;
}
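
/*
 * Worked example for the sizing above: with a 128 KiB I/O size and a
 * 32 KiB controller max transfer size, each I/O splits into
 * (131072 - 1) / 32768 + 2 = 5 entries, plus 1 for the parent request,
 * so a queue depth of 128 needs 128 * 6 = 768 io_queue_requests.
 */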

static void
unregister_namespaces(void)
{
	struct ns_entry *entry = g_namespaces;

	while (entry) {
		struct ns_entry *next = entry->next;
		free(entry);
		entry = next;
	}
}

static void
enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	if (spdk_nvme_cpl_is_error(cpl)) {
		printf("enable_latency_tracking_complete failed\n");
	}
	g_outstanding_commands--;
}

static void
set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
{
	int res;
	union spdk_nvme_intel_feat_latency_tracking latency_tracking;

	if (enable) {
		latency_tracking.bits.enable = 0x01;
	} else {
		latency_tracking.bits.enable = 0x00;
	}

	res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
					      latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
	if (res) {
		printf("failed to allocate nvme request.\n");
		return;
	}
	g_outstanding_commands++;

	while (g_outstanding_commands) {
		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
	}
}

static void
register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry)
{
	struct spdk_nvme_ns *ns;
	struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
	uint32_t nsid;

	if (entry == NULL) {
		perror("ctrlr_entry malloc");
		exit(1);
	}

	entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page),
					       4096, NULL);
	if (entry->latency_page == NULL) {
		printf("Allocation error (latency page)\n");
		exit(1);
	}

	build_nvme_name(entry->name, sizeof(entry->name), ctrlr);

	entry->ctrlr = ctrlr;
	entry->trtype = trid_entry->trid.trtype;
	entry->next = g_controllers;
	g_controllers = entry;

	if (g_latency_ssd_tracking_enable &&
	    spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
		set_latency_tracking_feature(ctrlr, true);
	}

	if (trid_entry->nsid == 0) {
		for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
		     nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
			if (ns == NULL) {
				continue;
			}
			register_ns(ctrlr, ns);
		}
	} else {
		ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid);
		if (!ns) {
			fprintf(stderr, "Namespace does not exist.\n");
			exit(1);
		}

		register_ns(ctrlr, ns);
	}

	if (g_nr_unused_io_queues) {
		int i;

		printf("Creating %d unused qpairs for controller %s\n", g_nr_unused_io_queues, entry->name);

		entry->unused_qpairs = calloc(g_nr_unused_io_queues, sizeof(struct spdk_nvme_qpair *));
		if (!entry->unused_qpairs) {
			fprintf(stderr, "Unable to allocate memory for qpair array\n");
			exit(1);
		}

		for (i = 0; i < g_nr_unused_io_queues; i++) {
			entry->unused_qpairs[i] = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
			if (!entry->unused_qpairs[i]) {
				fprintf(stderr, "Unable to allocate unused qpair. Did you request too many?\n");
				exit(1);
			}
		}
	}
}

static __thread unsigned int seed = 0;

static inline void
submit_single_io(struct perf_task *task)
{
	uint64_t offset_in_ios;
	int rc;
	struct ns_worker_ctx *ns_ctx = task->ns_ctx;
	struct ns_entry *entry = ns_ctx->entry;

	if (g_is_random) {
		offset_in_ios = rand_r(&seed) % entry->size_in_ios;
	} else {
		offset_in_ios = ns_ctx->offset_in_ios++;
		if (ns_ctx->offset_in_ios == entry->size_in_ios) {
			ns_ctx->offset_in_ios = 0;
		}
	}

	task->submit_tsc = spdk_get_ticks();

	if ((g_rw_percentage == 100) ||
	    (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
		task->is_read = true;
	} else {
		task->is_read = false;
	}

	rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);

	if (spdk_unlikely(rc != 0)) {
		fprintf(stderr, "starting I/O failed\n");
	} else {
		ns_ctx->current_queue_depth++;
	}
}

static inline void
task_complete(struct perf_task *task)
{
	struct ns_worker_ctx *ns_ctx;
	uint64_t tsc_diff;
	struct ns_entry *entry;

	ns_ctx = task->ns_ctx;
	entry = ns_ctx->entry;
	ns_ctx->current_queue_depth--;
	ns_ctx->io_completed++;
	tsc_diff = spdk_get_ticks() - task->submit_tsc;
	ns_ctx->total_tsc += tsc_diff;
	if (spdk_unlikely(ns_ctx->min_tsc > tsc_diff)) {
		ns_ctx->min_tsc = tsc_diff;
	}
	if (spdk_unlikely(ns_ctx->max_tsc < tsc_diff)) {
		ns_ctx->max_tsc = tsc_diff;
	}
	if (spdk_unlikely(g_latency_sw_tracking_level > 0)) {
		spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff);
	}

	if (spdk_unlikely(entry->md_size > 0)) {
		/* Add application-level verification for end-to-end data protection */
		entry->fn_table->verify_io(task, entry);
	}

	/*
	 * is_draining indicates when time has expired for the test run
	 * and we are just waiting for the previously submitted I/O
	 * to complete. In this case, do not submit a new I/O to replace
	 * the one just completed.
	 */
	if (spdk_unlikely(ns_ctx->is_draining)) {
		spdk_dma_free(task->iov.iov_base);
		spdk_dma_free(task->md_iov.iov_base);
		free(task);
	} else {
		submit_single_io(task);
	}
}

static void
io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct perf_task *task = ctx;

	if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
		fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n",
			task->is_read ? "Read" : "Write",
			cpl->status.sct, cpl->status.sc);
	}

	task_complete(task);
}

static void
check_io(struct ns_worker_ctx *ns_ctx)
{
	ns_ctx->entry->fn_table->check_io(ns_ctx);
}

static struct perf_task *
allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	task = calloc(1, sizeof(*task));
	if (task == NULL) {
		fprintf(stderr, "Out of memory allocating tasks\n");
		exit(1);
	}

	ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1);

	task->ns_ctx = ns_ctx;

	return task;
}

static void
submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	while (queue_depth-- > 0) {
		task = allocate_task(ns_ctx, queue_depth);
		submit_single_io(task);
	}
}

static int
init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx);
}

static void
cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx);
}

static int
work_fn(void *arg)
{
	uint64_t tsc_end;
	struct worker_thread *worker = (struct worker_thread *)arg;
	struct ns_worker_ctx *ns_ctx = NULL;
	uint32_t unfinished_ns_ctx;

	printf("Starting thread on core %u\n", worker->lcore);

	/* Allocate queue pairs for each namespace. */
	ns_ctx = worker->ns_ctx;
	while (ns_ctx != NULL) {
		if (init_ns_worker_ctx(ns_ctx) != 0) {
			printf("ERROR: init_ns_worker_ctx() failed\n");
			return 1;
		}
		ns_ctx = ns_ctx->next;
	}

	tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate;

	/* Submit initial I/O for each namespace. */
	ns_ctx = worker->ns_ctx;
	while (ns_ctx != NULL) {
		submit_io(ns_ctx, g_queue_depth);
		ns_ctx = ns_ctx->next;
	}

	while (1) {
		/*
		 * Check for completed I/O for each controller. A new
		 * I/O will be submitted in the io_complete callback
		 * to replace each I/O that is completed.
		 */
		ns_ctx = worker->ns_ctx;
		while (ns_ctx != NULL) {
			check_io(ns_ctx);
			ns_ctx = ns_ctx->next;
		}

		if (spdk_get_ticks() > tsc_end) {
			break;
		}
	}

	/* Drain the remaining I/O from each ns_ctx in round-robin fashion for fairness. */
	do {
		unfinished_ns_ctx = 0;
		ns_ctx = worker->ns_ctx;
		while (ns_ctx != NULL) {
			/* The first pass through this loop marks the context as draining. */
			if (!ns_ctx->is_draining) {
				ns_ctx->is_draining = true;
			}

			if (ns_ctx->current_queue_depth > 0) {
				check_io(ns_ctx);
				if (ns_ctx->current_queue_depth == 0) {
					cleanup_ns_worker_ctx(ns_ctx);
				} else {
					unfinished_ns_ctx++;
				}
			}
			ns_ctx = ns_ctx->next;
		}
	} while (unfinished_ns_ctx > 0);

	return 0;
}

static void usage(char *program_name)
{
	printf("%s options", program_name);
#if HAVE_LIBAIO
	printf(" [AIO device(s)]...");
#endif
	printf("\n");
	printf("\t[-q io depth]\n");
	printf("\t[-o io size in bytes]\n");
	printf("\t[-n number of io queues per namespace. default: 1]\n");
	printf("\t[-U number of unused io queues per controller. default: 0]\n");
	printf("\t[-w io pattern type, must be one of\n");
	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
	printf("\t[-L enable latency tracking via sw, default: disabled]\n");
	printf("\t\t-L for latency summary, -LL for detailed histogram\n");
	printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
	printf("\t[-t time in seconds]\n");
	printf("\t[-c core mask for I/O submission/completion.]\n");
	printf("\t\t(default: 1)\n");
	printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n");
	printf("\t[-H enable header digest for TCP transport, default: disabled]\n");
	printf("\t[-I enable data digest for TCP transport, default: disabled]\n");
	printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
	printf("\t Format: 'key:value [key:value] ...'\n");
	printf("\t Keys:\n");
	printf("\t  trtype      Transport type (e.g. PCIe, RDMA)\n");
	printf("\t  adrfam      Address family (e.g. IPv4, IPv6)\n");
	printf("\t  traddr      Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
	printf("\t  trsvcid     Transport service identifier (e.g. 4420)\n");
	printf("\t  subnqn      Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
	printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
	printf("\t          -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
	printf("\t[-e metadata configuration]\n");
	printf("\t Keys:\n");
	printf("\t  PRACT      Protection Information Action bit (PRACT=1 or PRACT=0)\n");
	printf("\t  PRCHK      Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
	printf("\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
	printf("\t          -e 'PRACT=1,PRCHK=GUARD'\n");
	printf("\t[-k keep alive timeout period in milliseconds]\n");
	printf("\t[-s DPDK huge memory size in MB.]\n");
	printf("\t[-m max completions per poll]\n");
	printf("\t\t(default: 0 - unlimited)\n");
	printf("\t[-i shared memory group ID]\n");
#ifdef DEBUG
	printf("\t[-G enable debug logging]\n");
#else
	printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)]\n");
#endif
}
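
/*
 * Illustrative invocations (not exhaustive; flags as parsed in
 * parse_args() below):
 *
 *   perf -q 128 -o 4096 -w randread -t 60 -c 0x3
 *       4 KiB random reads at queue depth 128 for 60 seconds on
 *       cores 0-1, against all local PCIe NVMe devices.
 *
 *   perf -q 32 -o 131072 -w rw -M 70 -t 30 \
 *        -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'
 *       128 KiB 70/30 read/write mix over NVMe-oF RDMA.
 */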

static void
check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	     uint64_t total, uint64_t so_far)
{
	double so_far_pct;
	double **cutoff = ctx;

	if (count == 0) {
		return;
	}

	so_far_pct = (double)so_far / total;
	while (so_far_pct >= **cutoff && **cutoff > 0) {
		printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate);
		(*cutoff)++;
	}
}

static void
print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	     uint64_t total, uint64_t so_far)
{
	double so_far_pct;

	if (count == 0) {
		return;
	}

	so_far_pct = (double)so_far * 100 / total;
	printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
	       (double)start * 1000 * 1000 / g_tsc_rate,
	       (double)end * 1000 * 1000 / g_tsc_rate,
	       so_far_pct, count);
}
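
/*
 * Both histogram callbacks convert TSC ticks to microseconds as
 * ticks * 1000 * 1000 / g_tsc_rate. For example, at a 2.4 GHz TSC
 * (g_tsc_rate = 2400000000), a 12000-tick bucket boundary prints as
 * 5.000 us.
 */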

static void
print_performance(void)
{
	uint64_t total_io_completed, total_io_tsc;
	double io_per_second, mb_per_second, average_latency, min_latency, max_latency;
	double sum_ave_latency, min_latency_so_far, max_latency_so_far;
	double total_io_per_second, total_mb_per_second;
	int ns_count;
	struct worker_thread *worker;
	struct ns_worker_ctx *ns_ctx;
	uint32_t max_strlen;

	total_io_per_second = 0;
	total_mb_per_second = 0;
	total_io_completed = 0;
	total_io_tsc = 0;
	min_latency_so_far = (double)UINT64_MAX;
	max_latency_so_far = 0;
	ns_count = 0;

	max_strlen = 0;
	worker = g_workers;
	while (worker) {
		ns_ctx = worker->ns_ctx;
		while (ns_ctx) {
			max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen);
			ns_ctx = ns_ctx->next;
		}
		worker = worker->next;
	}

	printf("========================================================\n");
	printf("%*s\n", max_strlen + 60, "Latency(us)");
	printf("%-*s: %10s %10s %10s %10s %10s\n",
	       max_strlen + 12, "Device Information", "IOPS", "MiB/s", "Average", "min", "max");

	worker = g_workers;
	while (worker) {
		ns_ctx = worker->ns_ctx;
		while (ns_ctx) {
			if (ns_ctx->io_completed != 0) {
				io_per_second = (double)ns_ctx->io_completed / g_time_in_sec;
				mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024);
				average_latency = ((double)ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate;
				min_latency = (double)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate;
				if (min_latency < min_latency_so_far) {
					min_latency_so_far = min_latency;
				}

				max_latency = (double)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate;
				if (max_latency > max_latency_so_far) {
					max_latency_so_far = max_latency;
				}

				printf("%-*.*s from core %u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
				       max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore,
				       io_per_second, mb_per_second,
				       average_latency, min_latency, max_latency);
				total_io_per_second += io_per_second;
				total_mb_per_second += mb_per_second;
				total_io_completed += ns_ctx->io_completed;
				total_io_tsc += ns_ctx->total_tsc;
				ns_count++;
			}
			ns_ctx = ns_ctx->next;
		}
		worker = worker->next;
	}

	if (ns_count != 0 && total_io_completed) {
		sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate;
		printf("========================================================\n");
		printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
		       max_strlen + 12, "Total", total_io_per_second, total_mb_per_second,
		       sum_ave_latency, min_latency_so_far, max_latency_so_far);
		printf("\n");
	}

	if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) {
		return;
	}

	worker = g_workers;
	while (worker) {
		ns_ctx = worker->ns_ctx;
		while (ns_ctx) {
			const double *cutoff = g_latency_cutoffs;

			printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
			printf("=================================================================================\n");

			spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff);

			printf("\n");
			ns_ctx = ns_ctx->next;
		}
		worker = worker->next;
	}

	if (g_latency_sw_tracking_level == 1) {
		return;
	}

	worker = g_workers;
	while (worker) {
		ns_ctx = worker->ns_ctx;
		while (ns_ctx) {
			printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
			printf("==============================================================================\n");
			printf("       Range in us     Cumulative    IO count\n");

			spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL);
			printf("\n");
			ns_ctx = ns_ctx->next;
		}
		worker = worker->next;
	}
}

static void
print_latency_page(struct ctrlr_entry *entry)
{
	int i;

	printf("\n");
	printf("%s\n", entry->name);
	printf("--------------------------------------------------------\n");

	for (i = 0; i < 32; i++) {
		if (entry->latency_page->buckets_32us[i]) {
			printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]);
		}
	}
	for (i = 0; i < 31; i++) {
		if (entry->latency_page->buckets_1ms[i]) {
			printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]);
		}
	}
	for (i = 0; i < 31; i++) {
		if (entry->latency_page->buckets_32ms[i]) {
			printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
			       entry->latency_page->buckets_32ms[i]);
		}
	}
}

static void
print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
{
	struct ctrlr_entry *ctrlr;

	printf("%s Latency Statistics:\n", op_name);
	printf("========================================================\n");
	ctrlr = g_controllers;
	while (ctrlr) {
		if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
			if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG,
							     ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0,
							     enable_latency_tracking_complete,
							     NULL)) {
				printf("nvme_ctrlr_cmd_get_log_page() failed\n");
				exit(1);
			}

			g_outstanding_commands++;
		} else {
			printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name);
		}
		ctrlr = ctrlr->next;
	}

	while (g_outstanding_commands) {
		ctrlr = g_controllers;
		while (ctrlr) {
			spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
			ctrlr = ctrlr->next;
		}
	}

	ctrlr = g_controllers;
	while (ctrlr) {
		if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
			print_latency_page(ctrlr);
		}
		ctrlr = ctrlr->next;
	}
	printf("\n");
}

static void
print_stats(void)
{
	print_performance();
	if (g_latency_ssd_tracking_enable) {
		if (g_rw_percentage != 0) {
			print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
		}
		if (g_rw_percentage != 100) {
			print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
		}
	}
}

static void
unregister_trids(void)
{
	struct trid_entry *trid_entry, *tmp;

	TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
		TAILQ_REMOVE(&g_trid_list, trid_entry, tailq);
		free(trid_entry);
	}
}

static int
add_trid(const char *trid_str)
{
	struct trid_entry *trid_entry;
	struct spdk_nvme_transport_id *trid;
	char *ns;

	trid_entry = calloc(1, sizeof(*trid_entry));
	if (trid_entry == NULL) {
		return -1;
	}

	trid = &trid_entry->trid;
	trid->trtype = SPDK_NVME_TRANSPORT_PCIE;
	snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);

	if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
		fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
		free(trid_entry);
		return 1;
	}

	ns = strcasestr(trid_str, "ns:");
	if (ns) {
		char nsid_str[6]; /* 5 digits maximum in an nsid */
		int len;
		int nsid;

		ns += 3;

		len = strcspn(ns, " \t\n");
		if (len > 5) {
			fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n");
			free(trid_entry);
			return 1;
		}

		memcpy(nsid_str, ns, len);
		nsid_str[len] = '\0';

		nsid = spdk_strtol(nsid_str, 10);
		if (nsid <= 0 || nsid > 65535) {
			fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n");
			free(trid_entry);
			return 1;
		}

		trid_entry->nsid = (uint16_t)nsid;
	}

	TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
	return 0;
}
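
/*
 * In addition to the standard transport ID keys, add_trid() accepts an
 * "ns:" key that pins the run to a single namespace, e.g.
 *   -r 'trtype:PCIe traddr:0000:04:00.0 ns:1'
 * A nsid of 0 (the default) means every active namespace on the
 * controller is benchmarked.
 */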

static size_t
parse_next_key(const char **str, char *key, char *val, size_t key_buf_size,
	       size_t val_buf_size)
{
	const char *sep;
	const char *separator = ", \t\n";
	size_t key_len, val_len;

	*str += strspn(*str, separator);

	sep = strchr(*str, '=');
	if (!sep) {
		fprintf(stderr, "Key without '=' separator\n");
		return 0;
	}

	key_len = sep - *str;
	if (key_len >= key_buf_size) {
		fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n",
			key_len, key_buf_size - 1);
		return 0;
	}

	memcpy(key, *str, key_len);
	key[key_len] = '\0';

	*str += key_len + 1; /* Skip key */
	val_len = strcspn(*str, separator);
	if (val_len == 0) {
		fprintf(stderr, "Key without value\n");
		return 0;
	}

	if (val_len >= val_buf_size) {
		fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n",
			val_len, val_buf_size - 1);
		return 0;
	}

	memcpy(val, *str, val_len);
	val[val_len] = '\0';

	*str += val_len;

	return val_len;
}
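
/*
 * parse_next_key() consumes one "KEY=VALUE" pair per call and advances
 * *str past it. Parsing the -e argument "PRACT=0,PRCHK=GUARD" therefore
 * takes two calls: the first yields key "PRACT" / val "0", the second
 * key "PRCHK" / val "GUARD"; parse_metadata() below stops once *str
 * reaches the terminating NUL.
 */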

static int
parse_metadata(const char *metacfg_str)
{
	const char *str;
	size_t val_len;
	char key[32];
	char val[1024];

	if (metacfg_str == NULL) {
		return -EINVAL;
	}

	str = metacfg_str;

	while (*str != '\0') {
		val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val));
		if (val_len == 0) {
			fprintf(stderr, "Failed to parse metadata\n");
			return -EINVAL;
		}

		if (strcmp(key, "PRACT") == 0) {
			if (*val == '1') {
				g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
			}
		} else if (strcmp(key, "PRCHK") == 0) {
			if (strstr(val, "GUARD") != NULL) {
				g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
			}
			if (strstr(val, "REFTAG") != NULL) {
				g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
			}
			if (strstr(val, "APPTAG") != NULL) {
				g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
			}
		} else {
			fprintf(stderr, "Unknown key '%s'\n", key);
		}
	}

	return 0;
}

static int
parse_args(int argc, char **argv)
{
	const char *workload_type;
	int op;
	bool mix_specified = false;
	long int val;

	/* default values */
	g_queue_depth = 0;
	g_io_size_bytes = 0;
	workload_type = NULL;
	g_time_in_sec = 0;
	g_rw_percentage = -1;
	g_core_mask = NULL;
	g_max_completions = 0;

	while ((op = getopt(argc, argv, "c:e:i:lm:n:o:q:r:k:s:t:w:DGHILM:U:")) != -1) {
		switch (op) {
		case 'i':
		case 'm':
		case 'n':
		case 'o':
		case 'q':
		case 'k':
		case 's':
		case 't':
		case 'M':
		case 'U':
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
				fprintf(stderr, "Failed to convert string to integer\n");
				return val;
			}
			switch (op) {
			case 'i':
				g_shm_id = val;
				break;
			case 'm':
				g_max_completions = val;
				break;
			case 'n':
				g_nr_io_queues_per_ns = val;
				break;
			case 'o':
				g_io_size_bytes = val;
				break;
			case 'q':
				g_queue_depth = val;
				break;
			case 'k':
				g_keep_alive_timeout_in_ms = val;
				break;
			case 's':
				g_dpdk_mem = val;
				break;
			case 't':
				g_time_in_sec = val;
				break;
			case 'M':
				g_rw_percentage = val;
				mix_specified = true;
				break;
			case 'U':
				g_nr_unused_io_queues = val;
				break;
			}
			break;
		case 'c':
			g_core_mask = optarg;
			break;
		case 'e':
			if (parse_metadata(optarg)) {
				usage(argv[0]);
				return 1;
			}
			break;
		case 'l':
			g_latency_ssd_tracking_enable = true;
			break;
		case 'r':
			if (add_trid(optarg)) {
				usage(argv[0]);
				return 1;
			}
			break;
		case 'w':
			workload_type = optarg;
			break;
		case 'D':
			g_disable_sq_cmb = 1;
			break;
		case 'G':
#ifndef DEBUG
			fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
				argv[0]);
			usage(argv[0]);
			return 1;
#else
			spdk_log_set_flag("nvme");
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
			break;
#endif
		case 'H':
			g_header_digest = 1;
			break;
		case 'I':
			g_data_digest = 1;
			break;
		case 'L':
			g_latency_sw_tracking_level++;
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (!g_nr_io_queues_per_ns) {
		usage(argv[0]);
		return 1;
	}

	if (!g_queue_depth) {
		usage(argv[0]);
		return 1;
	}
	if (!g_io_size_bytes) {
		usage(argv[0]);
		return 1;
	}
	if (!workload_type) {
		usage(argv[0]);
		return 1;
	}
	if (!g_time_in_sec) {
		usage(argv[0]);
		return 1;
	}

	if (strcmp(workload_type, "read") &&
	    strcmp(workload_type, "write") &&
	    strcmp(workload_type, "randread") &&
	    strcmp(workload_type, "randwrite") &&
	    strcmp(workload_type, "rw") &&
	    strcmp(workload_type, "randrw")) {
		fprintf(stderr,
			"io pattern type must be one of\n"
			"(read, write, randread, randwrite, rw, randrw)\n");
		return 1;
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "randread")) {
		g_rw_percentage = 100;
	}

	if (!strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "randwrite")) {
		g_rw_percentage = 0;
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "randread") ||
	    !strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "randwrite")) {
		if (mix_specified) {
			fprintf(stderr, "Ignoring -M option... Please use the -M option"
				" only with rw or randrw.\n");
		}
	}

	if (!strcmp(workload_type, "rw") ||
	    !strcmp(workload_type, "randrw")) {
		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
			fprintf(stderr,
				"-M must be set to a value from 0 to 100 "
				"for rw or randrw.\n");
			return 1;
		}
	}

	if (!strcmp(workload_type, "read") ||
	    !strcmp(workload_type, "write") ||
	    !strcmp(workload_type, "rw")) {
		g_is_random = 0;
	} else {
		g_is_random = 1;
	}

	if (TAILQ_EMPTY(&g_trid_list)) {
		/* If no transport IDs were specified, default to enumerating all local PCIe devices */
		add_trid("trtype:PCIe");
	} else {
		struct trid_entry *trid_entry, *trid_entry_tmp;

		g_no_pci = true;
		/* Check whether any local PCIe transport IDs were given */
		TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
			if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
				g_no_pci = false;
				break;
			}
		}
	}

	g_aio_optind = optind;

	return 0;
}

static int
register_workers(void)
{
	uint32_t i;
	struct worker_thread *worker;

	g_workers = NULL;
	g_num_workers = 0;

	SPDK_ENV_FOREACH_CORE(i) {
		worker = calloc(1, sizeof(*worker));
		if (worker == NULL) {
			fprintf(stderr, "Unable to allocate worker\n");
			return -1;
		}

		worker->lcore = i;
		worker->next = g_workers;
		g_workers = worker;
		g_num_workers++;
	}

	return 0;
}

static void
unregister_workers(void)
{
	struct worker_thread *worker = g_workers;

	/* Free namespace contexts and worker threads */
	while (worker) {
		struct worker_thread *next_worker = worker->next;
		struct ns_worker_ctx *ns_ctx = worker->ns_ctx;

		while (ns_ctx) {
			struct ns_worker_ctx *next_ns_ctx = ns_ctx->next;
			spdk_histogram_data_free(ns_ctx->histogram);
			free(ns_ctx);
			ns_ctx = next_ns_ctx;
		}

		free(worker);
		worker = next_worker;
	}
}

static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
		printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n",
		       trid->traddr, trid->trsvcid,
		       trid->subnqn);
	} else {
		if (g_disable_sq_cmb) {
			opts->use_cmb_sqs = false;
		}

		printf("Attaching to NVMe Controller at %s\n",
		       trid->traddr);
	}

	/* Set io_queue_size to UINT16_MAX; the NVMe driver
	 * will then reduce this to MQES to maximize
	 * the io_queue_size as much as possible.
	 */
	opts->io_queue_size = UINT16_MAX;

	/* Set the header and data digests */
	opts->header_digest = g_header_digest;
	opts->data_digest = g_data_digest;
	opts->keep_alive_timeout_ms = spdk_max(opts->keep_alive_timeout_ms,
					       g_keep_alive_timeout_in_ms);

	return true;
}

static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct trid_entry *trid_entry = cb_ctx;
	struct spdk_pci_addr pci_addr;
	struct spdk_pci_device *pci_dev;
	struct spdk_pci_id pci_id;

	g_controllers_found++;
	if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
		printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
		       trid->traddr, trid->trsvcid,
		       trid->subnqn);
	} else {
		if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
			return;
		}

		pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr);
		if (!pci_dev) {
			return;
		}

		pci_id = spdk_pci_device_get_id(pci_dev);

		printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
		       trid->traddr,
		       pci_id.vendor_id, pci_id.device_id);
	}

	register_ctrlr(ctrlr, trid_entry);
}

static int
register_controllers(void)
{
	struct trid_entry *trid_entry;

	printf("Initializing NVMe Controllers\n");

	TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
		if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) {
			fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
				trid_entry->trid.traddr);
			return -1;
		}
	}

	return 0;
}

static void
unregister_controllers(void)
{
	struct ctrlr_entry *entry = g_controllers;

	while (entry) {
		struct ctrlr_entry *next = entry->next;
		spdk_dma_free(entry->latency_page);
		if (g_latency_ssd_tracking_enable &&
		    spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
			set_latency_tracking_feature(entry->ctrlr, false);
		}

		if (g_nr_unused_io_queues) {
			int i;

			for (i = 0; i < g_nr_unused_io_queues; i++) {
				spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]);
			}

			free(entry->unused_qpairs);
		}

		spdk_nvme_detach(entry->ctrlr);
		free(entry);
		entry = next;
	}
}

static int
associate_workers_with_ns(void)
{
	struct ns_entry *entry = g_namespaces;
	struct worker_thread *worker = g_workers;
	struct ns_worker_ctx *ns_ctx;
	int i, count;

	count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;

	for (i = 0; i < count; i++) {
		if (entry == NULL) {
			break;
		}

		ns_ctx = calloc(1, sizeof(struct ns_worker_ctx));
		if (!ns_ctx) {
			return -1;
		}

		printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
		ns_ctx->min_tsc = UINT64_MAX;
		ns_ctx->entry = entry;
		ns_ctx->next = worker->ns_ctx;
		ns_ctx->histogram = spdk_histogram_data_alloc();
		worker->ns_ctx = ns_ctx;

		worker = worker->next;
		if (worker == NULL) {
			worker = g_workers;
		}

		entry = entry->next;
		if (entry == NULL) {
			entry = g_namespaces;
		}
	}

	return 0;
}

static void *
nvme_poll_ctrlrs(void *arg)
{
	struct ctrlr_entry *entry;
	int oldstate;

	spdk_unaffinitize_thread();

	while (true) {
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);

		entry = g_controllers;
		while (entry) {
			if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) {
				spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr);
			}
			entry = entry->next;
		}

		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);

		/* This is a pthread cancellation point and cannot be removed. */
		sleep(1);
	}

	return NULL;
}

int main(int argc, char **argv)
{
	int rc;
	struct worker_thread *worker, *master_worker;
	unsigned master_core;
	struct spdk_env_opts opts;
	pthread_t thread_id = 0;

	rc = parse_args(argc, argv);
	if (rc != 0) {
		return rc;
	}

	spdk_env_opts_init(&opts);
	opts.name = "perf";
	opts.shm_id = g_shm_id;
	if (g_core_mask) {
		opts.core_mask = g_core_mask;
	}

	if (g_dpdk_mem) {
		opts.mem_size = g_dpdk_mem;
	}
	if (g_no_pci) {
		opts.no_pci = g_no_pci;
	}
	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "Unable to initialize SPDK env\n");
		rc = -1;
		goto cleanup;
	}

	g_tsc_rate = spdk_get_ticks_hz();

	if (register_workers() != 0) {
		rc = -1;
		goto cleanup;
	}

#if HAVE_LIBAIO
	if (register_aio_files(argc, argv) != 0) {
		rc = -1;
		goto cleanup;
	}
#endif

	if (register_controllers() != 0) {
		rc = -1;
		goto cleanup;
	}

	if (g_warn) {
		printf("WARNING: Some requested NVMe devices were skipped\n");
	}

	if (g_num_namespaces == 0) {
		fprintf(stderr, "No valid NVMe controllers or AIO devices found\n");
		goto cleanup;
	}

	rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
	if (rc != 0) {
		fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");
		goto cleanup;
	}

	if (associate_workers_with_ns() != 0) {
		rc = -1;
		goto cleanup;
	}

	printf("Initialization complete. Launching workers.\n");

	/* Launch all of the slave workers */
	master_core = spdk_env_get_current_core();
	master_worker = NULL;
	worker = g_workers;
	while (worker != NULL) {
		if (worker->lcore != master_core) {
			spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
		} else {
			assert(master_worker == NULL);
			master_worker = worker;
		}
		worker = worker->next;
	}

	assert(master_worker != NULL);
	rc = work_fn(master_worker);

	spdk_env_thread_wait_all();

	print_stats();

cleanup:
	if (thread_id && pthread_cancel(thread_id) == 0) {
		pthread_join(thread_id, NULL);
	}
	unregister_trids();
	unregister_namespaces();
	unregister_controllers();
	unregister_workers();

	if (rc != 0) {
		fprintf(stderr, "%s: errors occurred\n", argv[0]);
	}

	return rc;
}