ceph/src/spdk/examples/nvme/perf/perf.c
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 * * Neither the name of Intel Corporation nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 #include "spdk/stdinc.h"
37
38 #include "spdk/env.h"
39 #include "spdk/fd.h"
40 #include "spdk/nvme.h"
41 #include "spdk/vmd.h"
42 #include "spdk/queue.h"
43 #include "spdk/string.h"
44 #include "spdk/nvme_intel.h"
45 #include "spdk/histogram_data.h"
46 #include "spdk/endian.h"
47 #include "spdk/dif.h"
48 #include "spdk/util.h"
49 #include "spdk/log.h"
50 #include "spdk/likely.h"
51
52 #ifdef SPDK_CONFIG_URING
53 #include <liburing.h>
54 #endif
55
56 #if HAVE_LIBAIO
57 #include <libaio.h>
58 #endif
59
60 struct ctrlr_entry {
61 struct spdk_nvme_ctrlr *ctrlr;
62 enum spdk_nvme_transport_type trtype;
63 struct spdk_nvme_intel_rw_latency_page *latency_page;
64
65 struct spdk_nvme_qpair **unused_qpairs;
66
67 struct ctrlr_entry *next;
68 char name[1024];
69 };
70
71 enum entry_type {
72 ENTRY_TYPE_NVME_NS,
73 ENTRY_TYPE_AIO_FILE,
74 ENTRY_TYPE_URING_FILE,
75 };
76
77 struct ns_fn_table;
78
79 struct ns_entry {
80 enum entry_type type;
81 const struct ns_fn_table *fn_table;
82
83 union {
84 struct {
85 struct spdk_nvme_ctrlr *ctrlr;
86 struct spdk_nvme_ns *ns;
87 } nvme;
88 #ifdef SPDK_CONFIG_URING
89 struct {
90 int fd;
91 } uring;
92 #endif
93 #if HAVE_LIBAIO
94 struct {
95 int fd;
96 } aio;
97 #endif
98 } u;
99
100 struct ns_entry *next;
101 uint32_t io_size_blocks;
102 uint32_t num_io_requests;
103 uint64_t size_in_ios;
104 uint32_t block_size;
105 uint32_t md_size;
106 bool md_interleave;
107 bool pi_loc;
108 enum spdk_nvme_pi_type pi_type;
109 uint32_t io_flags;
110 char name[1024];
111 };
112
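/*
 * Percentile cutoffs used when summarizing the software latency histogram
 * (-L); the list is terminated by -1.
 */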
113 static const double g_latency_cutoffs[] = {
114 0.01,
115 0.10,
116 0.25,
117 0.50,
118 0.75,
119 0.90,
120 0.95,
121 0.98,
122 0.99,
123 0.995,
124 0.999,
125 0.9999,
126 0.99999,
127 0.999999,
128 0.9999999,
129 -1,
130 };
131
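/*
 * Per-namespace, per-worker state: completion counters, latency bookkeeping
 * in TSC ticks, and the backend-specific queue/context used for the run.
 */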
132 struct ns_worker_ctx {
133 struct ns_entry *entry;
134 uint64_t io_completed;
135 uint64_t last_io_completed;
136 uint64_t total_tsc;
137 uint64_t min_tsc;
138 uint64_t max_tsc;
139 uint64_t current_queue_depth;
140 uint64_t offset_in_ios;
141 bool is_draining;
142
143 union {
144 struct {
145 int num_active_qpairs;
146 int num_all_qpairs;
147 struct spdk_nvme_qpair **qpair;
148 struct spdk_nvme_poll_group *group;
149 int last_qpair;
150 } nvme;
151
152 #ifdef SPDK_CONFIG_URING
153 struct {
154 struct io_uring ring;
155 uint64_t io_inflight;
156 uint64_t io_pending;
157 struct io_uring_cqe **cqes;
158
159 } uring;
160 #endif
161 #if HAVE_LIBAIO
162 struct {
163 struct io_event *events;
164 io_context_t ctx;
165 } aio;
166 #endif
167 } u;
168
169 struct ns_worker_ctx *next;
170
171 struct spdk_histogram_data *histogram;
172 };
173
174 struct perf_task {
175 struct ns_worker_ctx *ns_ctx;
176 struct iovec iov;
177 struct iovec md_iov;
178 uint64_t submit_tsc;
179 bool is_read;
180 struct spdk_dif_ctx dif_ctx;
181 #if HAVE_LIBAIO
182 struct iocb iocb;
183 #endif
184 };
185
186 struct worker_thread {
187 struct ns_worker_ctx *ns_ctx;
188 struct worker_thread *next;
189 unsigned lcore;
190 };
191
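/*
 * Backend dispatch table: each target type (NVMe namespace, libaio file,
 * io_uring file) supplies its own payload setup, submission, completion
 * polling, verification, and per-worker init/cleanup routines.
 */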
192 struct ns_fn_table {
193 void (*setup_payload)(struct perf_task *task, uint8_t pattern);
194
195 int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
196 struct ns_entry *entry, uint64_t offset_in_ios);
197
198 void (*check_io)(struct ns_worker_ctx *ns_ctx);
199
200 void (*verify_io)(struct perf_task *task, struct ns_entry *entry);
201
202 int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);
203
204 void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);
205 };
206
207 static int g_outstanding_commands;
208
209 static bool g_latency_ssd_tracking_enable;
210 static int g_latency_sw_tracking_level;
211
212 static bool g_vmd;
213 static const char *g_workload_type;
214 static struct ctrlr_entry *g_controllers;
215 static struct ns_entry *g_namespaces;
216 static int g_num_namespaces;
217 static struct worker_thread *g_workers;
218 static int g_num_workers;
219 static uint32_t g_master_core;
220
221 static uint64_t g_tsc_rate;
222
223 static uint32_t g_io_align = 0x200;
224 static uint32_t g_io_size_bytes;
225 static uint32_t g_max_io_md_size;
226 static uint32_t g_max_io_size_blocks;
227 static uint32_t g_metacfg_pract_flag;
228 static uint32_t g_metacfg_prchk_flags;
229 static int g_rw_percentage = -1;
230 static int g_is_random;
231 static int g_queue_depth;
232 static int g_nr_io_queues_per_ns = 1;
233 static int g_nr_unused_io_queues;
234 static int g_time_in_sec;
235 static uint32_t g_max_completions;
236 static int g_dpdk_mem;
237 static int g_shm_id = -1;
238 static uint32_t g_disable_sq_cmb;
239 static bool g_use_uring;
240 static bool g_no_pci;
241 static bool g_warn;
242 static bool g_header_digest;
243 static bool g_data_digest;
244 static bool g_no_shn_notification;
245 static bool g_mix_specified;
246 /* Default to 10 seconds for the keep alive value. This value is arbitrary. */
247 static uint32_t g_keep_alive_timeout_in_ms = 10000;
248
249 static const char *g_core_mask;
250
251 struct trid_entry {
252 struct spdk_nvme_transport_id trid;
253 uint16_t nsid;
254 TAILQ_ENTRY(trid_entry) tailq;
255 };
256
257 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);
258
259 static int g_file_optind; /* Index of first filename in argv */
260
261 static inline void
262 task_complete(struct perf_task *task);
263
264 #ifdef SPDK_CONFIG_URING
265
266 static void
267 uring_setup_payload(struct perf_task *task, uint8_t pattern)
268 {
269 task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
270 task->iov.iov_len = g_io_size_bytes;
271 if (task->iov.iov_base == NULL) {
272 fprintf(stderr, "spdk_dma_zmalloc() for task->iov.iov_base failed\n");
273 exit(1);
274 }
275 memset(task->iov.iov_base, pattern, task->iov.iov_len);
276 }
277
278 static int
279 uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
280 struct ns_entry *entry, uint64_t offset_in_ios)
281 {
282 struct io_uring_sqe *sqe;
283
284 sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring);
285 if (!sqe) {
286 fprintf(stderr, "Cannot get sqe\n");
287 return -1;
288 }
289
290 if (task->is_read) {
291 io_uring_prep_readv(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len);
292 } else {
293 io_uring_prep_writev(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len);
294 }
295
296 io_uring_sqe_set_data(sqe, task);
297 ns_ctx->u.uring.io_pending++;
298
299 return 0;
300 }
301
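/*
 * Flush any queued SQEs with io_uring_submit(), then reap completed CQEs
 * in a batch and complete the corresponding tasks.
 */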
302 static void
303 uring_check_io(struct ns_worker_ctx *ns_ctx)
304 {
305 int i, count, to_complete, to_submit, ret = 0;
306 struct perf_task *task;
307
308 to_submit = ns_ctx->u.uring.io_pending;
309
310 if (to_submit > 0) {
311 /* If there are I/Os to submit, use io_uring_submit() here.
312 * It will call io_uring_enter() internally as appropriate. */
313 ret = io_uring_submit(&ns_ctx->u.uring.ring);
314 if (ret < 0) {
315 return;
316 }
317 ns_ctx->u.uring.io_pending = 0;
318 ns_ctx->u.uring.io_inflight += to_submit;
319 }
320
321 to_complete = ns_ctx->u.uring.io_inflight;
322 if (to_complete > 0) {
323 count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete);
324 ns_ctx->u.uring.io_inflight -= count;
325 for (i = 0; i < count; i++) {
326 assert(ns_ctx->u.uring.cqes[i] != NULL);
327 task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data;
328 if (ns_ctx->u.uring.cqes[i]->res != (int)task->iov.iov_len) {
329 fprintf(stderr, "cqe[i]->status=%d\n", ns_ctx->u.uring.cqes[i]->res);
330 exit(0);
331 }
332 io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]);
333 task_complete(task);
334 }
335 }
336 }
337
338 static void
339 uring_verify_io(struct perf_task *task, struct ns_entry *entry)
340 {
341 }
342
343 static int
344 uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
345 {
346 if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) {
347 SPDK_ERRLOG("uring I/O context setup failure\n");
348 return -1;
349 }
350
351 ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *));
352 if (!ns_ctx->u.uring.cqes) {
353 io_uring_queue_exit(&ns_ctx->u.uring.ring);
354 return -1;
355 }
356
357 return 0;
358 }
359
360 static void
361 uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
362 {
363 io_uring_queue_exit(&ns_ctx->u.uring.ring);
364 free(ns_ctx->u.uring.cqes);
365 }
366
367 static const struct ns_fn_table uring_fn_table = {
368 .setup_payload = uring_setup_payload,
369 .submit_io = uring_submit_io,
370 .check_io = uring_check_io,
371 .verify_io = uring_verify_io,
372 .init_ns_worker_ctx = uring_init_ns_worker_ctx,
373 .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx,
374 };
375
376 #endif
377
378 #ifdef HAVE_LIBAIO
379 static void
380 aio_setup_payload(struct perf_task *task, uint8_t pattern)
381 {
382 task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL);
383 task->iov.iov_len = g_io_size_bytes;
384 if (task->iov.iov_base == NULL) {
385 fprintf(stderr, "spdk_dma_zmalloc() for task->buf failed\n");
386 exit(1);
387 }
388 memset(task->iov.iov_base, pattern, task->iov.iov_len);
389 }
390
391 static int
392 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd,
393 struct iovec *iov, uint64_t offset, void *cb_ctx)
394 {
395 iocb->aio_fildes = fd;
396 iocb->aio_reqprio = 0;
397 iocb->aio_lio_opcode = cmd;
398 iocb->u.c.buf = iov->iov_base;
399 iocb->u.c.nbytes = iov->iov_len;
400 iocb->u.c.offset = offset * iov->iov_len;
401 iocb->data = cb_ctx;
402
403 if (io_submit(aio_ctx, 1, &iocb) < 0) {
404 printf("io_submit");
405 return -1;
406 }
407
408 return 0;
409 }
410
411 static int
412 aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
413 struct ns_entry *entry, uint64_t offset_in_ios)
414 {
415 if (task->is_read) {
416 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD,
417 &task->iov, offset_in_ios, task);
418 } else {
419 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE,
420 &task->iov, offset_in_ios, task);
421 }
422 }
423
424 static void
425 aio_check_io(struct ns_worker_ctx *ns_ctx)
426 {
427 int count, i;
428 struct timespec timeout;
429
430 timeout.tv_sec = 0;
431 timeout.tv_nsec = 0;
432
433 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout);
434 if (count < 0) {
435 fprintf(stderr, "io_getevents error\n");
436 exit(1);
437 }
438
439 for (i = 0; i < count; i++) {
440 task_complete(ns_ctx->u.aio.events[i].data);
441 }
442 }
443
444 static void
445 aio_verify_io(struct perf_task *task, struct ns_entry *entry)
446 {
447 }
448
449 static int
450 aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
451 {
452 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event));
453 if (!ns_ctx->u.aio.events) {
454 return -1;
455 }
456 ns_ctx->u.aio.ctx = 0;
457 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) {
458 free(ns_ctx->u.aio.events);
459 perror("io_setup");
460 return -1;
461 }
462 return 0;
463 }
464
465 static void
466 aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
467 {
468 io_destroy(ns_ctx->u.aio.ctx);
469 free(ns_ctx->u.aio.events);
470 }
471
472 static const struct ns_fn_table aio_fn_table = {
473 .setup_payload = aio_setup_payload,
474 .submit_io = aio_submit_io,
475 .check_io = aio_check_io,
476 .verify_io = aio_verify_io,
477 .init_ns_worker_ctx = aio_init_ns_worker_ctx,
478 .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx,
479 };
480
481 #endif /* HAVE_LIBAIO */
482
483 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING)
484
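/*
 * Register a kernel block device or file for the libaio/io_uring paths.
 * The file is opened with O_DIRECT; its size and block length are used to
 * derive the I/O alignment and the size_in_ios for the run.
 */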
485 static int
486 register_file(const char *path)
487 {
488 struct ns_entry *entry;
489
490 int flags, fd;
491 uint64_t size;
492 uint32_t blklen;
493
494 if (g_rw_percentage == 100) {
495 flags = O_RDONLY;
496 } else if (g_rw_percentage == 0) {
497 flags = O_WRONLY;
498 } else {
499 flags = O_RDWR;
500 }
501
502 flags |= O_DIRECT;
503
504 fd = open(path, flags);
505 if (fd < 0) {
506 fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno));
507 return -1;
508 }
509
510 size = spdk_fd_get_size(fd);
511 if (size == 0) {
512 fprintf(stderr, "Could not determine size of device %s\n", path);
513 close(fd);
514 return -1;
515 }
516
517 blklen = spdk_fd_get_blocklen(fd);
518 if (blklen == 0) {
519 fprintf(stderr, "Could not determine block size of device %s\n", path);
520 close(fd);
521 return -1;
522 }
523
524 /*
525 * TODO: This should really calculate the LCM of the current g_io_align and blklen.
526 * For now, it's fairly safe to just assume all block sizes are powers of 2.
527 */
528 if (g_io_align < blklen) {
529 g_io_align = blklen;
530 }
531
532 entry = malloc(sizeof(struct ns_entry));
533 if (entry == NULL) {
534 close(fd);
535 perror("ns_entry malloc");
536 return -1;
537 }
538
539 if (g_use_uring) {
540 #ifdef SPDK_CONFIG_URING
541 entry->type = ENTRY_TYPE_URING_FILE;
542 entry->fn_table = &uring_fn_table;
543 entry->u.uring.fd = fd;
544 #endif
545 } else {
546 #if HAVE_LIBAIO
547 entry->type = ENTRY_TYPE_AIO_FILE;
548 entry->fn_table = &aio_fn_table;
549 entry->u.aio.fd = fd;
550 #endif
551 }
552 entry->size_in_ios = size / g_io_size_bytes;
553 entry->io_size_blocks = g_io_size_bytes / blklen;
554
555 snprintf(entry->name, sizeof(entry->name), "%s", path);
556
557 g_num_namespaces++;
558 entry->next = g_namespaces;
559 g_namespaces = entry;
560
561 return 0;
562 }
563
564 static int
565 register_files(int argc, char **argv)
566 {
567 int i;
568
569 /* Treat everything after the options as files for AIO/URING */
570 for (i = g_file_optind; i < argc; i++) {
571 if (register_file(argv[i]) != 0) {
572 return 1;
573 }
574 }
575
576 return 0;
577 }
578 #endif
579
580 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl);
581
582 static void
583 nvme_setup_payload(struct perf_task *task, uint8_t pattern)
584 {
585 uint32_t max_io_size_bytes, max_io_md_size;
586
587 /* Maximum extended LBA format size across all active namespaces;
588 * it equals g_io_size_bytes for namespaces without metadata.
589 */
590 max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks;
591 task->iov.iov_base = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL);
592 task->iov.iov_len = max_io_size_bytes;
593 if (task->iov.iov_base == NULL) {
594 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
595 exit(1);
596 }
597 memset(task->iov.iov_base, pattern, task->iov.iov_len);
598
599 max_io_md_size = g_max_io_md_size * g_max_io_size_blocks;
600 if (max_io_md_size != 0) {
601 task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL);
602 task->md_iov.iov_len = max_io_md_size;
603 if (task->md_iov.iov_base == NULL) {
604 fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n");
605 spdk_dma_free(task->iov.iov_base);
606 exit(1);
607 }
608 }
609 }
610
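/*
 * Submit one read or write to the namespace, selecting a qpair in
 * round-robin order. When end-to-end data protection is enabled without
 * PRACT, a DIF (interleaved) or DIX (separate metadata) context is set up
 * and protection information is generated for writes.
 */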
611 static int
612 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
613 struct ns_entry *entry, uint64_t offset_in_ios)
614 {
615 uint64_t lba;
616 int rc;
617 int qp_num;
618
619 enum dif_mode {
620 DIF_MODE_NONE = 0,
621 DIF_MODE_DIF = 1,
622 DIF_MODE_DIX = 2,
623 } mode = DIF_MODE_NONE;
624
625 lba = offset_in_ios * entry->io_size_blocks;
626
627 if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) {
628 if (entry->md_interleave) {
629 mode = DIF_MODE_DIF;
630 } else {
631 mode = DIF_MODE_DIX;
632 }
633 }
634
635 qp_num = ns_ctx->u.nvme.last_qpair;
636 ns_ctx->u.nvme.last_qpair++;
637 if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_active_qpairs) {
638 ns_ctx->u.nvme.last_qpair = 0;
639 }
640
641 if (mode != DIF_MODE_NONE) {
642 rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size,
643 entry->md_interleave, entry->pi_loc,
644 (enum spdk_dif_type)entry->pi_type, entry->io_flags,
645 lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0);
646 if (rc != 0) {
647 fprintf(stderr, "Initialization of DIF context failed\n");
648 exit(1);
649 }
650 }
651
652 if (task->is_read) {
653 return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num],
654 task->iov.iov_base, task->md_iov.iov_base,
655 lba,
656 entry->io_size_blocks, io_complete,
657 task, entry->io_flags,
658 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag);
659 } else {
660 switch (mode) {
661 case DIF_MODE_DIF:
662 rc = spdk_dif_generate(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx);
663 if (rc != 0) {
664 fprintf(stderr, "Generation of DIF failed\n");
665 return rc;
666 }
667 break;
668 case DIF_MODE_DIX:
669 rc = spdk_dix_generate(&task->iov, 1, &task->md_iov, entry->io_size_blocks,
670 &task->dif_ctx);
671 if (rc != 0) {
672 fprintf(stderr, "Generation of DIX failed\n");
673 return rc;
674 }
675 break;
676 default:
677 break;
678 }
679
680 return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num],
681 task->iov.iov_base, task->md_iov.iov_base,
682 lba,
683 entry->io_size_blocks, io_complete,
684 task, entry->io_flags,
685 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag);
686 }
687 }
688
689 static void
690 perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx)
691 {
692
693 }
694
695 static void
696 nvme_check_io(struct ns_worker_ctx *ns_ctx)
697 {
698 int64_t rc;
699
700 rc = spdk_nvme_poll_group_process_completions(ns_ctx->u.nvme.group, 0, perf_disconnect_cb);
701 if (rc < 0) {
702 fprintf(stderr, "NVMe io qpair process completion error\n");
703 exit(1);
704 }
705 }
706
707 static void
708 nvme_verify_io(struct perf_task *task, struct ns_entry *entry)
709 {
710 struct spdk_dif_error err_blk = {};
711 int rc;
712
713 if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) {
714 return;
715 }
716
717 if (entry->md_interleave) {
718 rc = spdk_dif_verify(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx,
719 &err_blk);
720 if (rc != 0) {
721 fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
722 err_blk.err_type, err_blk.err_offset);
723 }
724 } else {
725 rc = spdk_dix_verify(&task->iov, 1, &task->md_iov, entry->io_size_blocks,
726 &task->dif_ctx, &err_blk);
727 if (rc != 0) {
728 fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
729 err_blk.err_type, err_blk.err_offset);
730 }
731 }
732 }
733
734 /*
735 * TODO: If a controller has multiple namespaces, they could all use the same queue.
736 * For now, give each namespace/thread combination its own queue.
737 */
738 static int
739 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
740 {
741 struct spdk_nvme_io_qpair_opts opts;
742 struct ns_entry *entry = ns_ctx->entry;
743 struct spdk_nvme_poll_group *group;
744 struct spdk_nvme_qpair *qpair;
745 int i;
746
747 ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns;
748 ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues;
749 ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_all_qpairs, sizeof(struct spdk_nvme_qpair *));
750 if (!ns_ctx->u.nvme.qpair) {
751 return -1;
752 }
753
754 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts));
755 if (opts.io_queue_requests < entry->num_io_requests) {
756 opts.io_queue_requests = entry->num_io_requests;
757 }
758 opts.delay_cmd_submit = true;
759 opts.create_only = true;
760
761 ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(NULL);
762 if (ns_ctx->u.nvme.group == NULL) {
763 goto poll_group_failed;
764 }
765
766 group = ns_ctx->u.nvme.group;
767 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) {
768 ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts,
769 sizeof(opts));
770 qpair = ns_ctx->u.nvme.qpair[i];
771 if (!qpair) {
772 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
773 goto qpair_failed;
774 }
775
776 if (spdk_nvme_poll_group_add(group, qpair)) {
777 printf("ERROR: unable to add I/O qpair to poll group.\n");
778 spdk_nvme_ctrlr_free_io_qpair(qpair);
779 goto qpair_failed;
780 }
781
782 if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) {
783 printf("ERROR: unable to connect I/O qpair.\n");
784 spdk_nvme_poll_group_remove(group, qpair);
785 spdk_nvme_ctrlr_free_io_qpair(qpair);
786 goto qpair_failed;
787 }
788 }
789
790 return 0;
791
792 qpair_failed:
793 for (; i > 0; --i) {
794 spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i - 1]);
795 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]);
796 }
797
798 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group);
799 poll_group_failed:
800 free(ns_ctx->u.nvme.qpair);
801 return -1;
802 }
803
804 static void
805 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
806 {
807 int i;
808
809 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) {
810 spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i]);
811 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]);
812 }
813
814 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group);
815 free(ns_ctx->u.nvme.qpair);
816 }
817
818 static const struct ns_fn_table nvme_fn_table = {
819 .setup_payload = nvme_setup_payload,
820 .submit_io = nvme_submit_io,
821 .check_io = nvme_check_io,
822 .verify_io = nvme_verify_io,
823 .init_ns_worker_ctx = nvme_init_ns_worker_ctx,
824 .cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx,
825 };
826
827 static int
828 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr)
829 {
830 const struct spdk_nvme_transport_id *trid;
831 int res = 0;
832
833 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
834
835 switch (trid->trtype) {
836 case SPDK_NVME_TRANSPORT_PCIE:
837 res = snprintf(name, length, "PCIE (%s)", trid->traddr);
838 break;
839 case SPDK_NVME_TRANSPORT_RDMA:
840 res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
841 break;
842 case SPDK_NVME_TRANSPORT_TCP:
843 res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn);
844 break;
845
846 default:
847 fprintf(stderr, "Unknown transport type %d\n", trid->trtype);
848 break;
849 }
850 return res;
851 }
852
853 static void
854 build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
855 {
856 int res = 0;
857
858 res = build_nvme_name(name, length, ctrlr);
859 if (res > 0) {
860 snprintf(name + res, length - res, " NSID %u", nsid);
861 }
862
863 }
864
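/*
 * Validate a namespace against the requested I/O size and record its
 * geometry (block size, metadata size, PI settings) in a new ns_entry.
 */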
865 static void
866 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
867 {
868 struct ns_entry *entry;
869 const struct spdk_nvme_ctrlr_data *cdata;
870 uint32_t max_xfer_size, entries, sector_size;
871 uint64_t ns_size;
872 struct spdk_nvme_io_qpair_opts opts;
873
874 cdata = spdk_nvme_ctrlr_get_data(ctrlr);
875
876 if (!spdk_nvme_ns_is_active(ns)) {
877 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
878 cdata->mn, cdata->sn,
879 spdk_nvme_ns_get_id(ns));
880 g_warn = true;
881 return;
882 }
883
884 ns_size = spdk_nvme_ns_get_size(ns);
885 sector_size = spdk_nvme_ns_get_sector_size(ns);
886
887 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) {
888 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
889 "ns size %" PRIu64 " / block size %u for I/O size %u\n",
890 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
891 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
892 g_warn = true;
893 return;
894 }
895
896 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
897 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
898 /* The NVMe driver may add additional entries based on the
899 * stripe size and maximum transfer size; we assume one more
900 * entry is used for the stripe.
901 */
902 entries = (g_io_size_bytes - 1) / max_xfer_size + 2;
903 if ((g_queue_depth * entries) > opts.io_queue_size) {
904 printf("controller IO queue size %u less than required\n",
905 opts.io_queue_size);
906 printf("Consider using lower queue depth or small IO size because "
907 "IO requests may be queued at the NVMe driver.\n");
908 }
909 /* For requests which have child requests, the parent request
910 * itself also occupies one entry.
911 */
912 entries += 1;
913
914 entry = calloc(1, sizeof(struct ns_entry));
915 if (entry == NULL) {
916 perror("ns_entry malloc");
917 exit(1);
918 }
919
920 entry->type = ENTRY_TYPE_NVME_NS;
921 entry->fn_table = &nvme_fn_table;
922 entry->u.nvme.ctrlr = ctrlr;
923 entry->u.nvme.ns = ns;
924 entry->num_io_requests = g_queue_depth * entries;
925
926 entry->size_in_ios = ns_size / g_io_size_bytes;
927 entry->io_size_blocks = g_io_size_bytes / sector_size;
928
929 entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns);
930 entry->md_size = spdk_nvme_ns_get_md_size(ns);
931 entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns);
932 entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start;
933 entry->pi_type = spdk_nvme_ns_get_pi_type(ns);
934
935 if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
936 entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags;
937 }
938
939 /* If the metadata size is 8 bytes, PI is stripped (read) or inserted (write),
940 * so subtract the metadata size from the block size. (If the metadata size is
941 * greater than 8 bytes, PI is passed through (read) or replaced (write), so
942 * the block size does not need to change.)
943 */
944 if ((entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) && (entry->md_size == 8)) {
945 entry->block_size = spdk_nvme_ns_get_sector_size(ns);
946 }
947
948 if (g_max_io_md_size < entry->md_size) {
949 g_max_io_md_size = entry->md_size;
950 }
951
952 if (g_max_io_size_blocks < entry->io_size_blocks) {
953 g_max_io_size_blocks = entry->io_size_blocks;
954 }
955
956 build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns));
957
958 g_num_namespaces++;
959 entry->next = g_namespaces;
960 g_namespaces = entry;
961 }
962
963 static void
964 unregister_namespaces(void)
965 {
966 struct ns_entry *entry = g_namespaces;
967
968 while (entry) {
969 struct ns_entry *next = entry->next;
970 free(entry);
971 entry = next;
972 }
973 }
974
975 static void
976 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
977 {
978 if (spdk_nvme_cpl_is_error(cpl)) {
979 printf("enable_latency_tracking_complete failed\n");
980 }
981 g_outstanding_commands--;
982 }
983
984 static void
985 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
986 {
987 int res;
988 union spdk_nvme_intel_feat_latency_tracking latency_tracking;
989
990 if (enable) {
991 latency_tracking.bits.enable = 0x01;
992 } else {
993 latency_tracking.bits.enable = 0x00;
994 }
995
996 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
997 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
998 if (res) {
999 printf("fail to allocate nvme request.\n");
1000 return;
1001 }
1002 g_outstanding_commands++;
1003
1004 while (g_outstanding_commands) {
1005 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1006 }
1007 }
1008
1009 static void
1010 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry)
1011 {
1012 struct spdk_nvme_ns *ns;
1013 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
1014 uint32_t nsid;
1015
1016 if (entry == NULL) {
1017 perror("ctrlr_entry malloc");
1018 exit(1);
1019 }
1020
1021 entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page),
1022 4096, NULL);
1023 if (entry->latency_page == NULL) {
1024 printf("Allocation error (latency page)\n");
1025 exit(1);
1026 }
1027
1028 build_nvme_name(entry->name, sizeof(entry->name), ctrlr);
1029
1030 entry->ctrlr = ctrlr;
1031 entry->trtype = trid_entry->trid.trtype;
1032 entry->next = g_controllers;
1033 g_controllers = entry;
1034
1035 if (g_latency_ssd_tracking_enable &&
1036 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
1037 set_latency_tracking_feature(ctrlr, true);
1038 }
1039
1040 if (trid_entry->nsid == 0) {
1041 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
1042 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
1043 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
1044 if (ns == NULL) {
1045 continue;
1046 }
1047 register_ns(ctrlr, ns);
1048 }
1049 } else {
1050 ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid);
1051 if (!ns) {
1052 perror("Namespace does not exist.");
1053 exit(1);
1054 }
1055
1056 register_ns(ctrlr, ns);
1057 }
1058 }
1059
1060 static __thread unsigned int seed = 0;
1061
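/*
 * Pick the next offset (random or sequential), decide read vs. write from
 * the -M mix, and hand the task to the backend's submit_io callback.
 */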
1062 static inline void
1063 submit_single_io(struct perf_task *task)
1064 {
1065 uint64_t offset_in_ios;
1066 int rc;
1067 struct ns_worker_ctx *ns_ctx = task->ns_ctx;
1068 struct ns_entry *entry = ns_ctx->entry;
1069
1070 if (g_is_random) {
1071 offset_in_ios = rand_r(&seed) % entry->size_in_ios;
1072 } else {
1073 offset_in_ios = ns_ctx->offset_in_ios++;
1074 if (ns_ctx->offset_in_ios == entry->size_in_ios) {
1075 ns_ctx->offset_in_ios = 0;
1076 }
1077 }
1078
1079 task->submit_tsc = spdk_get_ticks();
1080
1081 if ((g_rw_percentage == 100) ||
1082 (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
1083 task->is_read = true;
1084 } else {
1085 task->is_read = false;
1086 }
1087
1088 rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios);
1089
1090 if (spdk_unlikely(rc != 0)) {
1091 fprintf(stderr, "starting I/O failed\n");
1092 } else {
1093 ns_ctx->current_queue_depth++;
1094 }
1095 }
1096
1097 static inline void
1098 task_complete(struct perf_task *task)
1099 {
1100 struct ns_worker_ctx *ns_ctx;
1101 uint64_t tsc_diff;
1102 struct ns_entry *entry;
1103
1104 ns_ctx = task->ns_ctx;
1105 entry = ns_ctx->entry;
1106 ns_ctx->current_queue_depth--;
1107 ns_ctx->io_completed++;
1108 tsc_diff = spdk_get_ticks() - task->submit_tsc;
1109 ns_ctx->total_tsc += tsc_diff;
1110 if (spdk_unlikely(ns_ctx->min_tsc > tsc_diff)) {
1111 ns_ctx->min_tsc = tsc_diff;
1112 }
1113 if (spdk_unlikely(ns_ctx->max_tsc < tsc_diff)) {
1114 ns_ctx->max_tsc = tsc_diff;
1115 }
1116 if (spdk_unlikely(g_latency_sw_tracking_level > 0)) {
1117 spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff);
1118 }
1119
1120 if (spdk_unlikely(entry->md_size > 0)) {
1121 /* add application level verification for end-to-end data protection */
1122 entry->fn_table->verify_io(task, entry);
1123 }
1124
1125 /*
1126 * is_draining indicates when time has expired for the test run
1127 * and we are just waiting for the previously submitted I/O
1128 * to complete. In this case, do not submit a new I/O to replace
1129 * the one just completed.
1130 */
1131 if (spdk_unlikely(ns_ctx->is_draining)) {
1132 spdk_dma_free(task->iov.iov_base);
1133 spdk_dma_free(task->md_iov.iov_base);
1134 free(task);
1135 } else {
1136 submit_single_io(task);
1137 }
1138 }
1139
1140 static void
1141 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
1142 {
1143 struct perf_task *task = ctx;
1144
1145 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
1146 fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n",
1147 task->is_read ? "Read" : "Write",
1148 cpl->status.sct, cpl->status.sc);
1149 }
1150
1151 task_complete(task);
1152 }
1153
1154 static struct perf_task *
1155 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth)
1156 {
1157 struct perf_task *task;
1158
1159 task = calloc(1, sizeof(*task));
1160 if (task == NULL) {
1161 fprintf(stderr, "Out of memory allocating tasks\n");
1162 exit(1);
1163 }
1164
1165 ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1);
1166
1167 task->ns_ctx = ns_ctx;
1168
1169 return task;
1170 }
1171
1172 static void
1173 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
1174 {
1175 struct perf_task *task;
1176
1177 while (queue_depth-- > 0) {
1178 task = allocate_task(ns_ctx, queue_depth);
1179 submit_single_io(task);
1180 }
1181 }
1182
1183 static int
1184 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
1185 {
1186 return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx);
1187 }
1188
1189 static void
1190 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
1191 {
1192 ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx);
1193 }
1194
1195 static void
1196 print_periodic_performance(void)
1197 {
1198 uint64_t io_this_second;
1199 double mb_this_second;
1200 struct worker_thread *worker;
1201 struct ns_worker_ctx *ns_ctx;
1202
1203 if (!isatty(STDOUT_FILENO)) {
1204 /* Don't print periodic stats if output is not going
1205 * to a terminal.
1206 */
1207 return;
1208 }
1209
1210 io_this_second = 0;
1211 worker = g_workers;
1212 while (worker) {
1213 ns_ctx = worker->ns_ctx;
1214 while (ns_ctx) {
1215 io_this_second += ns_ctx->io_completed - ns_ctx->last_io_completed;
1216 ns_ctx->last_io_completed = ns_ctx->io_completed;
1217 ns_ctx = ns_ctx->next;
1218 }
1219 worker = worker->next;
1220 }
1221
1222 mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024);
1223 printf("%9ju IOPS, %8.2f MiB/s\r", io_this_second, mb_this_second);
1224 fflush(stdout);
1225 }
1226
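/*
 * Per-core worker: create qpairs for each assigned namespace, prime the
 * queue depth with initial I/O, poll for completions until the test time
 * elapses, then drain outstanding I/O and clean up.
 */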
1227 static int
1228 work_fn(void *arg)
1229 {
1230 uint64_t tsc_end, tsc_current, tsc_next_print;
1231 struct worker_thread *worker = (struct worker_thread *)arg;
1232 struct ns_worker_ctx *ns_ctx = NULL;
1233 uint32_t unfinished_ns_ctx;
1234
1235 /* Allocate queue pairs for each namespace. */
1236 ns_ctx = worker->ns_ctx;
1237 while (ns_ctx != NULL) {
1238 if (init_ns_worker_ctx(ns_ctx) != 0) {
1239 printf("ERROR: init_ns_worker_ctx() failed\n");
1240 return 1;
1241 }
1242 ns_ctx = ns_ctx->next;
1243 }
1244
1245 tsc_current = spdk_get_ticks();
1246 tsc_end = tsc_current + g_time_in_sec * g_tsc_rate;
1247 tsc_next_print = tsc_current + g_tsc_rate;
1248
1249 /* Submit initial I/O for each namespace. */
1250 ns_ctx = worker->ns_ctx;
1251 while (ns_ctx != NULL) {
1252 submit_io(ns_ctx, g_queue_depth);
1253 ns_ctx = ns_ctx->next;
1254 }
1255
1256 while (1) {
1257 /*
1258 * Check for completed I/O for each controller. A new
1259 * I/O will be submitted in the io_complete callback
1260 * to replace each I/O that is completed.
1261 */
1262 ns_ctx = worker->ns_ctx;
1263 while (ns_ctx != NULL) {
1264 ns_ctx->entry->fn_table->check_io(ns_ctx);
1265 ns_ctx = ns_ctx->next;
1266 }
1267
1268 tsc_current = spdk_get_ticks();
1269
1270 if (worker->lcore == g_master_core && tsc_current > tsc_next_print) {
1271 tsc_next_print += g_tsc_rate;
1272 print_periodic_performance();
1273 }
1274
1275 if (tsc_current > tsc_end) {
1276 break;
1277 }
1278 }
1279
1280 /* Drain the I/O of each ns_ctx in round-robin order for fairness. */
1281 do {
1282 unfinished_ns_ctx = 0;
1283 ns_ctx = worker->ns_ctx;
1284 while (ns_ctx != NULL) {
1285 /* Only the first pass enters this block, marking the context as draining. */
1286 if (!ns_ctx->is_draining) {
1287 ns_ctx->is_draining = true;
1288 }
1289
1290 if (ns_ctx->current_queue_depth > 0) {
1291 ns_ctx->entry->fn_table->check_io(ns_ctx);
1292 if (ns_ctx->current_queue_depth == 0) {
1293 cleanup_ns_worker_ctx(ns_ctx);
1294 } else {
1295 unfinished_ns_ctx++;
1296 }
1297 }
1298 ns_ctx = ns_ctx->next;
1299 }
1300 } while (unfinished_ns_ctx > 0);
1301
1302 return 0;
1303 }
1304
1305 static void usage(char *program_name)
1306 {
1307 printf("%s options", program_name);
1308 #if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO)
1309 printf(" [Kernel device(s)]...");
1310 #endif
1311 printf("\n");
1312 printf("\t[-q io depth]\n");
1313 printf("\t[-o io size in bytes]\n");
1314 printf("\t[-P number of io queues per namespace. default: 1]\n");
1315 printf("\t[-U number of unused io queues per controller. default: 0]\n");
1316 printf("\t[-w io pattern type, must be one of\n");
1317 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
1318 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
1319 printf("\t[-L enable latency tracking via sw, default: disabled]\n");
1320 printf("\t\t-L for latency summary, -LL for detailed histogram\n");
1321 printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
1322 printf("\t[-t time in seconds]\n");
1323 printf("\t[-c core mask for I/O submission/completion.]\n");
1324 printf("\t\t(default: 1)\n");
1325 printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n");
1326 printf("\t[-H enable header digest for TCP transport, default: disabled]\n");
1327 printf("\t[-I enable data digest for TCP transport, default: disabled]\n");
1328 printf("\t[-N no shutdown notification process for controllers, default: disabled]\n");
1329 printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
1330 printf("\t Format: 'key:value [key:value] ...'\n");
1331 printf("\t Keys:\n");
1332 printf("\t trtype Transport type (e.g. PCIe, RDMA)\n");
1333 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n");
1334 printf("\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
1335 printf("\t trsvcid Transport service identifier (e.g. 4420)\n");
1336 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
1337 printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
1338 printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
1339 printf("\t[-e metadata configuration]\n");
1340 printf("\t Keys:\n");
1341 printf("\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n");
1342 printf("\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
1343 printf("\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
1344 printf("\t -e 'PRACT=1,PRCHK=GUARD'\n");
1345 printf("\t[-k keep alive timeout period in millisecond]\n");
1346 printf("\t[-s DPDK huge memory size in MB.]\n");
1347 printf("\t[-C max completions per poll]\n");
1348 printf("\t\t(default: 0 - unlimited)\n");
1349 printf("\t[-i shared memory group ID]\n");
1350 printf("\t");
1351 spdk_log_usage(stdout, "-T");
1352 #ifdef SPDK_CONFIG_URING
1353 printf("\t[-R enable using liburing to drive kernel devices (Default: libaio)]\n");
1354 #endif
1355 #ifdef DEBUG
1356 printf("\t[-G enable debug logging]\n");
1357 #else
1358 printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n");
1359 #endif
1360 }
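/*
 * Example invocation (illustrative only; the PCIe address and sizes are
 * placeholders): 4 KiB random reads at queue depth 128 for 60 seconds on
 * core 0 against a local PCIe device:
 *
 *   ./perf -q 128 -o 4096 -w randread -t 60 -c 0x1 \
 *          -r 'trtype:PCIe traddr:0000:04:00.0'
 */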
1361
1362 static void
1363 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
1364 uint64_t total, uint64_t so_far)
1365 {
1366 double so_far_pct;
1367 double **cutoff = ctx;
1368
1369 if (count == 0) {
1370 return;
1371 }
1372
1373 so_far_pct = (double)so_far / total;
1374 while (so_far_pct >= **cutoff && **cutoff > 0) {
1375 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate);
1376 (*cutoff)++;
1377 }
1378 }
1379
1380 static void
1381 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
1382 uint64_t total, uint64_t so_far)
1383 {
1384 double so_far_pct;
1385
1386 if (count == 0) {
1387 return;
1388 }
1389
1390 so_far_pct = (double)so_far * 100 / total;
1391 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
1392 (double)start * 1000 * 1000 / g_tsc_rate,
1393 (double)end * 1000 * 1000 / g_tsc_rate,
1394 so_far_pct, count);
1395 }
1396
1397 static void
1398 print_performance(void)
1399 {
1400 uint64_t total_io_completed, total_io_tsc;
1401 double io_per_second, mb_per_second, average_latency, min_latency, max_latency;
1402 double sum_ave_latency, min_latency_so_far, max_latency_so_far;
1403 double total_io_per_second, total_mb_per_second;
1404 int ns_count;
1405 struct worker_thread *worker;
1406 struct ns_worker_ctx *ns_ctx;
1407 uint32_t max_strlen;
1408
1409 total_io_per_second = 0;
1410 total_mb_per_second = 0;
1411 total_io_completed = 0;
1412 total_io_tsc = 0;
1413 min_latency_so_far = (double)UINT64_MAX;
1414 max_latency_so_far = 0;
1415 ns_count = 0;
1416
1417 max_strlen = 0;
1418 worker = g_workers;
1419 while (worker) {
1420 ns_ctx = worker->ns_ctx;
1421 while (ns_ctx) {
1422 max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen);
1423 ns_ctx = ns_ctx->next;
1424 }
1425 worker = worker->next;
1426 }
1427
1428 printf("========================================================\n");
1429 printf("%*s\n", max_strlen + 60, "Latency(us)");
1430 printf("%-*s: %10s %10s %10s %10s %10s\n",
1431 max_strlen + 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max");
1432
1433 worker = g_workers;
1434 while (worker) {
1435 ns_ctx = worker->ns_ctx;
1436 while (ns_ctx) {
1437 if (ns_ctx->io_completed != 0) {
1438 io_per_second = (double)ns_ctx->io_completed / g_time_in_sec;
1439 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024);
1440 average_latency = ((double)ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate;
1441 min_latency = (double)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate;
1442 if (min_latency < min_latency_so_far) {
1443 min_latency_so_far = min_latency;
1444 }
1445
1446 max_latency = (double)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate;
1447 if (max_latency > max_latency_so_far) {
1448 max_latency_so_far = max_latency;
1449 }
1450
1451 printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1452 max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore,
1453 io_per_second, mb_per_second,
1454 average_latency, min_latency, max_latency);
1455 total_io_per_second += io_per_second;
1456 total_mb_per_second += mb_per_second;
1457 total_io_completed += ns_ctx->io_completed;
1458 total_io_tsc += ns_ctx->total_tsc;
1459 ns_count++;
1460 }
1461 ns_ctx = ns_ctx->next;
1462 }
1463 worker = worker->next;
1464 }
1465
1466 if (ns_count != 0 && total_io_completed) {
1467 sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate;
1468 printf("========================================================\n");
1469 printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1470 max_strlen + 13, "Total", total_io_per_second, total_mb_per_second,
1471 sum_ave_latency, min_latency_so_far, max_latency_so_far);
1472 printf("\n");
1473 }
1474
1475 if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) {
1476 return;
1477 }
1478
1479 worker = g_workers;
1480 while (worker) {
1481 ns_ctx = worker->ns_ctx;
1482 while (ns_ctx) {
1483 const double *cutoff = g_latency_cutoffs;
1484
1485 printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
1486 printf("=================================================================================\n");
1487
1488 spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff);
1489
1490 printf("\n");
1491 ns_ctx = ns_ctx->next;
1492 }
1493 worker = worker->next;
1494 }
1495
1496 if (g_latency_sw_tracking_level == 1) {
1497 return;
1498 }
1499
1500 worker = g_workers;
1501 while (worker) {
1502 ns_ctx = worker->ns_ctx;
1503 while (ns_ctx) {
1504 printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
1505 printf("==============================================================================\n");
1506 printf(" Range in us Cumulative IO count\n");
1507
1508 spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL);
1509 printf("\n");
1510 ns_ctx = ns_ctx->next;
1511 }
1512 worker = worker->next;
1513 }
1514
1515 }
1516
1517 static void
1518 print_latency_page(struct ctrlr_entry *entry)
1519 {
1520 int i;
1521
1522 printf("\n");
1523 printf("%s\n", entry->name);
1524 printf("--------------------------------------------------------\n");
1525
1526 for (i = 0; i < 32; i++) {
1527 if (entry->latency_page->buckets_32us[i]) {
1528 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]);
1529 }
1530 }
1531 for (i = 0; i < 31; i++) {
1532 if (entry->latency_page->buckets_1ms[i]) {
1533 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]);
1534 }
1535 }
1536 for (i = 0; i < 31; i++) {
1537 if (entry->latency_page->buckets_32ms[i])
1538 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
1539 entry->latency_page->buckets_32ms[i]);
1540 }
1541 }
1542
1543 static void
1544 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
1545 {
1546 struct ctrlr_entry *ctrlr;
1547
1548 printf("%s Latency Statistics:\n", op_name);
1549 printf("========================================================\n");
1550 ctrlr = g_controllers;
1551 while (ctrlr) {
1552 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
1553 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG,
1554 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0,
1555 enable_latency_tracking_complete,
1556 NULL)) {
1557 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
1558 exit(1);
1559 }
1560
1561 g_outstanding_commands++;
1562 } else {
1563 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name);
1564 }
1565 ctrlr = ctrlr->next;
1566 }
1567
1568 while (g_outstanding_commands) {
1569 ctrlr = g_controllers;
1570 while (ctrlr) {
1571 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
1572 ctrlr = ctrlr->next;
1573 }
1574 }
1575
1576 ctrlr = g_controllers;
1577 while (ctrlr) {
1578 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
1579 print_latency_page(ctrlr);
1580 }
1581 ctrlr = ctrlr->next;
1582 }
1583 printf("\n");
1584 }
1585
1586 static void
1587 print_stats(void)
1588 {
1589 print_performance();
1590 if (g_latency_ssd_tracking_enable) {
1591 if (g_rw_percentage != 0) {
1592 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
1593 }
1594 if (g_rw_percentage != 100) {
1595 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
1596 }
1597 }
1598 }
1599
1600 static void
1601 unregister_trids(void)
1602 {
1603 struct trid_entry *trid_entry, *tmp;
1604
1605 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
1606 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq);
1607 free(trid_entry);
1608 }
1609 }
1610
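/*
 * Parse a -r transport ID string into a trid_entry. PCIe is the default
 * transport, and an optional "ns:<id>" token restricts the run to a single
 * namespace on that controller.
 */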
1611 static int
1612 add_trid(const char *trid_str)
1613 {
1614 struct trid_entry *trid_entry;
1615 struct spdk_nvme_transport_id *trid;
1616 char *ns;
1617
1618 trid_entry = calloc(1, sizeof(*trid_entry));
1619 if (trid_entry == NULL) {
1620 return -1;
1621 }
1622
1623 trid = &trid_entry->trid;
1624 trid->trtype = SPDK_NVME_TRANSPORT_PCIE;
1625 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
1626
1627 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
1628 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
1629 free(trid_entry);
1630 return 1;
1631 }
1632
1633 spdk_nvme_transport_id_populate_trstring(trid,
1634 spdk_nvme_transport_id_trtype_str(trid->trtype));
1635
1636 ns = strcasestr(trid_str, "ns:");
1637 if (ns) {
1638 char nsid_str[6]; /* 5 digits maximum in an nsid */
1639 int len;
1640 int nsid;
1641
1642 ns += 3;
1643
1644 len = strcspn(ns, " \t\n");
1645 if (len > 5) {
1646 fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n");
1647 free(trid_entry);
1648 return 1;
1649 }
1650
1651 memcpy(nsid_str, ns, len);
1652 nsid_str[len] = '\0';
1653
1654 nsid = spdk_strtol(nsid_str, 10);
1655 if (nsid <= 0 || nsid > 65535) {
1656 fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n");
1657 free(trid_entry);
1658 return 1;
1659 }
1660
1661 trid_entry->nsid = (uint16_t)nsid;
1662 }
1663
1664 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
1665 return 0;
1666 }
1667
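/*
 * Extract the next "key=value" token from a comma/whitespace separated
 * string (used for the -e metadata configuration); returns the value
 * length, or 0 on parse error.
 */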
1668 static size_t
1669 parse_next_key(const char **str, char *key, char *val, size_t key_buf_size,
1670 size_t val_buf_size)
1671 {
1672 const char *sep;
1673 const char *separator = ", \t\n";
1674 size_t key_len, val_len;
1675
1676 *str += strspn(*str, separator);
1677
1678 sep = strchr(*str, '=');
1679 if (!sep) {
1680 fprintf(stderr, "Key without '=' separator\n");
1681 return 0;
1682 }
1683
1684 key_len = sep - *str;
1685 if (key_len >= key_buf_size) {
1686 fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n",
1687 key_len, key_buf_size - 1);
1688 return 0;
1689 }
1690
1691 memcpy(key, *str, key_len);
1692 key[key_len] = '\0';
1693
1694 *str += key_len + 1; /* Skip key */
1695 val_len = strcspn(*str, separator);
1696 if (val_len == 0) {
1697 fprintf(stderr, "Key without value\n");
1698 return 0;
1699 }
1700
1701 if (val_len >= val_buf_size) {
1702 fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n",
1703 val_len, val_buf_size - 1);
1704 return 0;
1705 }
1706
1707 memcpy(val, *str, val_len);
1708 val[val_len] = '\0';
1709
1710 *str += val_len;
1711
1712 return val_len;
1713 }
1714
1715 static int
1716 parse_metadata(const char *metacfg_str)
1717 {
1718 const char *str;
1719 size_t val_len;
1720 char key[32];
1721 char val[1024];
1722
1723 if (metacfg_str == NULL) {
1724 return -EINVAL;
1725 }
1726
1727 str = metacfg_str;
1728
1729 while (*str != '\0') {
1730 val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val));
1731 if (val_len == 0) {
1732 fprintf(stderr, "Failed to parse metadata\n");
1733 return -EINVAL;
1734 }
1735
1736 if (strcmp(key, "PRACT") == 0) {
1737 if (*val == '1') {
1738 g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
1739 }
1740 } else if (strcmp(key, "PRCHK") == 0) {
1741 if (strstr(val, "GUARD") != NULL) {
1742 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
1743 }
1744 if (strstr(val, "REFTAG") != NULL) {
1745 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
1746 }
1747 if (strstr(val, "APPTAG") != NULL) {
1748 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
1749 }
1750 } else {
1751 fprintf(stderr, "Unknown key '%s'\n", key);
1752 }
1753 }
1754
1755 return 0;
1756 }
1757
1758 static int
1759 parse_args(int argc, char **argv)
1760 {
1761 int op;
1762 long int val;
1763 int rc;
1764
1765 while ((op = getopt(argc, argv, "c:e:i:lo:q:r:k:s:t:w:C:DGHILM:NP:RT:U:V")) != -1) {
1766 switch (op) {
1767 case 'i':
1768 case 'C':
1769 case 'P':
1770 case 'o':
1771 case 'q':
1772 case 'k':
1773 case 's':
1774 case 't':
1775 case 'M':
1776 case 'U':
1777 val = spdk_strtol(optarg, 10);
1778 if (val < 0) {
1779 fprintf(stderr, "Converting a string to integer failed\n");
1780 return val;
1781 }
1782 switch (op) {
1783 case 'i':
1784 g_shm_id = val;
1785 break;
1786 case 'C':
1787 g_max_completions = val;
1788 break;
1789 case 'P':
1790 g_nr_io_queues_per_ns = val;
1791 break;
1792 case 'o':
1793 g_io_size_bytes = val;
1794 break;
1795 case 'q':
1796 g_queue_depth = val;
1797 break;
1798 case 'k':
1799 g_keep_alive_timeout_in_ms = val;
1800 break;
1801 case 's':
1802 g_dpdk_mem = val;
1803 break;
1804 case 't':
1805 g_time_in_sec = val;
1806 break;
1807 case 'M':
1808 g_rw_percentage = val;
1809 g_mix_specified = true;
1810 break;
1811 case 'U':
1812 g_nr_unused_io_queues = val;
1813 break;
1814 }
1815 break;
1816 case 'c':
1817 g_core_mask = optarg;
1818 break;
1819 case 'e':
1820 if (parse_metadata(optarg)) {
1821 usage(argv[0]);
1822 return 1;
1823 }
1824 break;
1825 case 'l':
1826 g_latency_ssd_tracking_enable = true;
1827 break;
1828 case 'r':
1829 if (add_trid(optarg)) {
1830 usage(argv[0]);
1831 return 1;
1832 }
1833 break;
1834 case 'w':
1835 g_workload_type = optarg;
1836 break;
1837 case 'D':
1838 g_disable_sq_cmb = 1;
1839 break;
1840 case 'G':
1841 #ifndef DEBUG
1842 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
1843 argv[0]);
1844 usage(argv[0]);
1845 return 1;
1846 #else
1847 spdk_log_set_flag("nvme");
1848 spdk_log_set_print_level(SPDK_LOG_DEBUG);
1849 break;
1850 #endif
1851 case 'H':
1852 g_header_digest = 1;
1853 break;
1854 case 'I':
1855 g_data_digest = 1;
1856 break;
1857 case 'L':
1858 g_latency_sw_tracking_level++;
1859 break;
1860 case 'N':
1861 g_no_shn_notification = true;
1862 break;
1863 case 'R':
1864 #ifndef SPDK_CONFIG_URING
1865 fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n",
1866 argv[0]);
1867 usage(argv[0]);
1868 return 1;
1869 #endif
1870 g_use_uring = true;
1871 break;
1872 case 'T':
1873 rc = spdk_log_set_flag(optarg);
1874 if (rc < 0) {
1875 fprintf(stderr, "unknown flag\n");
1876 usage(argv[0]);
1877 exit(EXIT_FAILURE);
1878 }
1879 spdk_log_set_print_level(SPDK_LOG_DEBUG);
1880 #ifndef DEBUG
1881 fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n",
1882 argv[0]);
1883 usage(argv[0]);
1884 return 1;
1885 #endif
1886 break;
1887 case 'V':
1888 g_vmd = true;
1889 break;
1890 default:
1891 usage(argv[0]);
1892 return 1;
1893 }
1894 }
1895
1896 if (!g_nr_io_queues_per_ns) {
1897 usage(argv[0]);
1898 return 1;
1899 }
1900
1901 if (!g_queue_depth) {
1902 fprintf(stderr, "missing -q (queue size) operand\n");
1903 usage(argv[0]);
1904 return 1;
1905 }
1906 if (!g_io_size_bytes) {
1907 fprintf(stderr, "missing -o (block size) operand\n");
1908 usage(argv[0]);
1909 return 1;
1910 }
1911 if (!g_workload_type) {
1912 fprintf(stderr, "missing -w (io pattern type) operand\n");
1913 usage(argv[0]);
1914 return 1;
1915 }
1916 if (!g_time_in_sec) {
1917 fprintf(stderr, "missing -t (test time in seconds) operand\n");
1918 usage(argv[0]);
1919 return 1;
1920 }
1921
1922 if (strncmp(g_workload_type, "rand", 4) == 0) {
1923 g_is_random = 1;
1924 g_workload_type = &g_workload_type[4];
1925 }
1926
1927 if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) {
1928 g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0;
1929 if (g_mix_specified) {
1930 fprintf(stderr, "Ignoring -M option... Please use -M option"
1931 " only when using rw or randrw.\n");
1932 }
1933 } else if (strcmp(g_workload_type, "rw") == 0) {
1934 if (g_rw_percentage < 0 || g_rw_percentage > 100) {
1935 fprintf(stderr,
1936 "-M must be specified to value from 0 to 100 "
1937 "for rw or randrw.\n");
1938 return 1;
1939 }
1940 } else {
1941 fprintf(stderr,
1942 "io pattern type must be one of\n"
1943 "(read, write, randread, randwrite, rw, randrw)\n");
1944 return 1;
1945 }
1946
1947 if (TAILQ_EMPTY(&g_trid_list)) {
1948 /* If no transport IDs specified, default to enumerating all local PCIe devices */
1949 add_trid("trtype:PCIe");
1950 } else {
1951 struct trid_entry *trid_entry, *trid_entry_tmp;
1952
1953 g_no_pci = true;
1954 /* check whether there is local PCIe type */
1955 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
1956 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
1957 g_no_pci = false;
1958 break;
1959 }
1960 }
1961 }
1962
1963 g_file_optind = optind;
1964
1965 return 0;
1966 }
1967
1968 static int
1969 register_workers(void)
1970 {
1971 uint32_t i;
1972 struct worker_thread *worker;
1973
1974 g_workers = NULL;
1975 g_num_workers = 0;
1976
1977 SPDK_ENV_FOREACH_CORE(i) {
1978 worker = calloc(1, sizeof(*worker));
1979 if (worker == NULL) {
1980 fprintf(stderr, "Unable to allocate worker\n");
1981 return -1;
1982 }
1983
1984 worker->lcore = i;
1985 worker->next = g_workers;
1986 g_workers = worker;
1987 g_num_workers++;
1988 }
1989
1990 return 0;
1991 }
1992
1993 static void
1994 unregister_workers(void)
1995 {
1996 struct worker_thread *worker = g_workers;
1997
1998 /* Free namespace context and worker thread */
1999 while (worker) {
2000 struct worker_thread *next_worker = worker->next;
2001 struct ns_worker_ctx *ns_ctx = worker->ns_ctx;
2002
2003 while (ns_ctx) {
2004 struct ns_worker_ctx *next_ns_ctx = ns_ctx->next;
2005 spdk_histogram_data_free(ns_ctx->histogram);
2006 free(ns_ctx);
2007 ns_ctx = next_ns_ctx;
2008 }
2009
2010 free(worker);
2011 worker = next_worker;
2012 }
2013 }
2014
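/*
 * Probe callback, invoked once per discovered controller before attaching.
 * Applies the command-line overrides (CMB SQs, shutdown notification,
 * digests, keep-alive timeout) to the controller options and returns true
 * so the controller is attached.
 */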
2015 static bool
2016 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2017 struct spdk_nvme_ctrlr_opts *opts)
2018 {
2019 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
2020 if (g_disable_sq_cmb) {
2021 opts->use_cmb_sqs = false;
2022 }
2023 if (g_no_shn_notification) {
2024 opts->no_shn_notification = true;
2025 }
2026 }
2027
2031 2028 /* Set io_queue_size to UINT16_MAX; the NVMe driver
2032 2029 * will then reduce it to the controller's MQES limit,
2033 2030 * making each I/O queue as large as possible.
2034 2031 */
2032 opts->io_queue_size = UINT16_MAX;
2033
2034 /* Set the header and data_digest */
2035 opts->header_digest = g_header_digest;
2036 opts->data_digest = g_data_digest;
2037 opts->keep_alive_timeout_ms = g_keep_alive_timeout_in_ms;
2038
2039 return true;
2040 }
2041
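/*
 * Attach callback, invoked once a controller has been attached. Prints
 * where the controller was found (fabrics address and subsystem NQN, or
 * PCIe address with vendor/device ID) and registers it for the test run.
 */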
2042 static void
2043 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
2044 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
2045 {
2046 struct trid_entry *trid_entry = cb_ctx;
2047 struct spdk_pci_addr pci_addr;
2048 struct spdk_pci_device *pci_dev;
2049 struct spdk_pci_id pci_id;
2050
2051 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
2052 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
2053 trid->traddr, trid->trsvcid,
2054 trid->subnqn);
2055 } else {
2056 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
2057 return;
2058 }
2059
2060 pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr);
2061 if (!pci_dev) {
2062 return;
2063 }
2064
2065 pci_id = spdk_pci_device_get_id(pci_dev);
2066
2067 printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
2068 trid->traddr,
2069 pci_id.vendor_id, pci_id.device_id);
2070 }
2071
2072 register_ctrlr(ctrlr, trid_entry);
2073 }
2074
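/*
 * Probe every transport ID collected during argument parsing and attach
 * to the controllers found there via probe_cb()/attach_cb().
 */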
2075 static int
2076 register_controllers(void)
2077 {
2078 struct trid_entry *trid_entry;
2079
2080 printf("Initializing NVMe Controllers\n");
2081
2082 if (g_vmd && spdk_vmd_init()) {
2083 fprintf(stderr, "Failed to initialize VMD."
2084 " Some NVMe devices can be unavailable.\n");
2085 }
2086
2087 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
2088 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) {
2089 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
2090 trid_entry->trid.traddr);
2091 return -1;
2092 }
2093 }
2094
2095 return 0;
2096 }
2097
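/*
 * Detach all registered controllers, freeing their latency pages and any
 * deliberately allocated-but-unused qpairs, and switch Intel latency
 * tracking back off if it was enabled for the run.
 */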
2098 static void
2099 unregister_controllers(void)
2100 {
2101 struct ctrlr_entry *entry = g_controllers;
2102
2103 while (entry) {
2104 struct ctrlr_entry *next = entry->next;
2105 spdk_dma_free(entry->latency_page);
2106 if (g_latency_ssd_tracking_enable &&
2107 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
2108 set_latency_tracking_feature(entry->ctrlr, false);
2109 }
2110
2111 if (g_nr_unused_io_queues) {
2112 int i;
2113
2114 for (i = 0; i < g_nr_unused_io_queues; i++) {
2115 spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]);
2116 }
2117
2118 free(entry->unused_qpairs);
2119 }
2120
2121 spdk_nvme_detach(entry->ctrlr);
2122 free(entry);
2123 entry = next;
2124 }
2125
2126 if (g_vmd) {
2127 spdk_vmd_fini();
2128 }
2129 }
2130
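/*
 * Pair namespaces with worker threads round-robin. Both lists wrap, so
 * every worker gets at least one namespace and every namespace is served
 * by at least one worker.
 */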
2131 static int
2132 associate_workers_with_ns(void)
2133 {
2134 struct ns_entry *entry = g_namespaces;
2135 struct worker_thread *worker = g_workers;
2136 struct ns_worker_ctx *ns_ctx;
2137 int i, count;
2138
2139 count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
2140
2141 for (i = 0; i < count; i++) {
2142 if (entry == NULL) {
2143 break;
2144 }
2145
2146 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx));
2147 if (!ns_ctx) {
2148 return -1;
2149 }
2150
2151 printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
2152 ns_ctx->min_tsc = UINT64_MAX;
2153 ns_ctx->entry = entry;
2154 ns_ctx->next = worker->ns_ctx;
2155 ns_ctx->histogram = spdk_histogram_data_alloc();
2156 worker->ns_ctx = ns_ctx;
2157
2158 worker = worker->next;
2159 if (worker == NULL) {
2160 worker = g_workers;
2161 }
2162
2163 entry = entry->next;
2164 if (entry == NULL) {
2165 entry = g_namespaces;
2166 }
2167
2168 }
2169
2170 return 0;
2171 }
2172
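/*
 * Dedicated thread that polls admin completions on non-PCIe (fabrics)
 * controllers once per second so that keep-alive and other admin events
 * are serviced; cancelled from main() during cleanup.
 */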
2173 static void *
2174 nvme_poll_ctrlrs(void *arg)
2175 {
2176 struct ctrlr_entry *entry;
2177 int oldstate;
2178
2179 spdk_unaffinitize_thread();
2180
2181 while (true) {
2182 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
2183
2184 entry = g_controllers;
2185 while (entry) {
2186 if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) {
2187 spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr);
2188 }
2189 entry = entry->next;
2190 }
2191
2192 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
2193
2194 /* This is a pthread cancellation point and cannot be removed. */
2195 sleep(1);
2196 }
2197
2198 return NULL;
2199 }
2200
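/*
 * Start-up sequence: parse arguments, initialize the SPDK environment,
 * register workers, files (when AIO/uring support is built in), and
 * controllers, pair namespaces with workers, then run the workload on
 * every core and print the collected statistics.
 *
 * A representative invocation (values are illustrative): 4096-byte random
 * reads at queue depth 128 for 60 seconds against all local PCIe devices:
 *
 *   ./perf -q 128 -o 4096 -w randread -t 60
 */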
2201 int main(int argc, char **argv)
2202 {
2203 int rc;
2204 struct worker_thread *worker, *master_worker;
2205 struct spdk_env_opts opts;
2206 pthread_t thread_id = 0;
2207
2208 rc = parse_args(argc, argv);
2209 if (rc != 0) {
2210 return rc;
2211 }
2212
2213 spdk_env_opts_init(&opts);
2214 opts.name = "perf";
2215 opts.shm_id = g_shm_id;
2216 if (g_core_mask) {
2217 opts.core_mask = g_core_mask;
2218 }
2219
2220 if (g_dpdk_mem) {
2221 opts.mem_size = g_dpdk_mem;
2222 }
2223 if (g_no_pci) {
2224 opts.no_pci = g_no_pci;
2225 }
2226 if (spdk_env_init(&opts) < 0) {
2227 fprintf(stderr, "Unable to initialize SPDK env\n");
2228 rc = -1;
2229 goto cleanup;
2230 }
2231
2232 g_tsc_rate = spdk_get_ticks_hz();
2233
2234 if (register_workers() != 0) {
2235 rc = -1;
2236 goto cleanup;
2237 }
2238
2239 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING)
2240 if (register_files(argc, argv) != 0) {
2241 rc = -1;
2242 goto cleanup;
2243 }
2244 #endif
2245
2246 if (register_controllers() != 0) {
2247 rc = -1;
2248 goto cleanup;
2249 }
2250
2251 if (g_warn) {
2252 printf("WARNING: Some requested NVMe devices were skipped\n");
2253 }
2254
2255 if (g_num_namespaces == 0) {
2256 fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n");
rc = -1;
2257 goto cleanup;
2258 }
2259
2260 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL);
2261 if (rc != 0) {
2262 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n");
2263 goto cleanup;
2264 }
2265
2266 if (associate_workers_with_ns() != 0) {
2267 rc = -1;
2268 goto cleanup;
2269 }
2270
2271 printf("Initialization complete. Launching workers.\n");
2272
2273 /* Launch all of the slave workers */
2274 g_master_core = spdk_env_get_current_core();
2275 master_worker = NULL;
2276 worker = g_workers;
2277 while (worker != NULL) {
2278 if (worker->lcore != g_master_core) {
2279 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
2280 } else {
2281 assert(master_worker == NULL);
2282 master_worker = worker;
2283 }
2284 worker = worker->next;
2285 }
2286
2287 assert(master_worker != NULL);
2288 rc = work_fn(master_worker);
2289
2290 spdk_env_thread_wait_all();
2291
2292 print_stats();
2293
2294 cleanup:
2295 if (thread_id && pthread_cancel(thread_id) == 0) {
2296 pthread_join(thread_id, NULL);
2297 }
2298 unregister_trids();
2299 unregister_namespaces();
2300 unregister_controllers();
2301 unregister_workers();
2302
2303 if (rc != 0) {
2304 fprintf(stderr, "%s: errors occured\n", argv[0]);
2305 }
2306
2307 return rc;
2308 }