ceph/src/spdk/examples/nvme/perf/perf.c

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright (c) Intel Corporation.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #include "spdk/stdinc.h"
  35
  36 #include "spdk/env.h"
  37 #include "spdk/fd.h"
  38 #include "spdk/nvme.h"
  39 #include "spdk/env.h"
  40 #include "spdk/queue.h"
  41 #include "spdk/string.h"
  42 #include "spdk/nvme_intel.h"
  43 #include "spdk/histogram_data.h"
  44 #include "spdk/endian.h"
  45 #include "spdk/crc16.h"
  46
  47 #if HAVE_LIBAIO
  48 #include <libaio.h>
  49 #endif
  50
  51 struct ctrlr_entry {
  52         struct spdk_nvme_ctrlr                  *ctrlr;
  53         struct spdk_nvme_intel_rw_latency_page  *latency_page;
  54         struct ctrlr_entry                      *next;
  55         char                                    name[1024];
  56 };
  57
  58 enum entry_type {
  59         ENTRY_TYPE_NVME_NS,
  60         ENTRY_TYPE_AIO_FILE,
  61 };
  62
  63 struct ns_entry {
  64         enum entry_type         type;
  65
  66         union {
  67                 struct {
  68                         struct spdk_nvme_ctrlr  *ctrlr;
  69                         struct spdk_nvme_ns     *ns;
  70                 } nvme;
  71 #if HAVE_LIBAIO
  72                 struct {
  73                         int                     fd;
  74                 } aio;
  75 #endif
  76         } u;
  77
  78         struct ns_entry         *next;
  79         uint32_t                io_size_blocks;
  80         uint32_t                num_io_requests;
  81         uint64_t                size_in_ios;
  82         uint32_t                io_flags;
  83         uint16_t                apptag_mask;
  84         uint16_t                apptag;
  85         char                    name[1024];
  86         const struct spdk_nvme_ns_data  *nsdata;
  87 };
  88
  89 static const double g_latency_cutoffs[] = {
  90         0.01,
  91         0.10,
  92         0.25,
  93         0.50,
  94         0.75,
  95         0.90,
  96         0.95,
  97         0.98,
  98         0.99,
  99         0.995,
 100         0.999,
 101         0.9999,
 102         0.99999,
 103         0.999999,
 104         0.9999999,
 105         -1,
 106 };
 107
 108 struct ns_worker_ctx {
 109         struct ns_entry         *entry;
 110         uint64_t                io_completed;
 111         uint64_t                total_tsc;
 112         uint64_t                min_tsc;
 113         uint64_t                max_tsc;
 114         uint64_t                current_queue_depth;
 115         uint64_t                offset_in_ios;
 116         bool                    is_draining;
 117
 118         union {
 119                 struct {
 120                         struct spdk_nvme_qpair  *qpair;
 121                 } nvme;
 122
 123 #if HAVE_LIBAIO
 124                 struct {
 125                         struct io_event         *events;
 126                         io_context_t            ctx;
 127                 } aio;
 128 #endif
 129         } u;
 130
 131         struct ns_worker_ctx    *next;
 132
 133         struct spdk_histogram_data      *histogram;
 134 };
 135
 136 struct perf_task {
 137         struct ns_worker_ctx    *ns_ctx;
 138         void                    *buf;
 139         uint64_t                submit_tsc;
 140         uint16_t                appmask;
 141         uint16_t                apptag;
 142         uint64_t                lba;
 143         bool                    is_read;
 144 #if HAVE_LIBAIO
 145         struct iocb             iocb;
 146 #endif
 147 };
 148
 149 struct worker_thread {
 150         struct ns_worker_ctx    *ns_ctx;
 151         struct worker_thread    *next;
 152         unsigned                lcore;
 153 };
 154
 155 static int g_outstanding_commands;
 156
 157 static bool g_latency_ssd_tracking_enable = false;
 158 static int g_latency_sw_tracking_level = 0;
 159
 160 static struct ctrlr_entry *g_controllers = NULL;
 161 static int g_controllers_found = 0;
 162 static struct ns_entry *g_namespaces = NULL;
 163 static int g_num_namespaces = 0;
 164 static struct worker_thread *g_workers = NULL;
 165 static int g_num_workers = 0;
 166
 167 static uint64_t g_tsc_rate;
 168
 169 static uint32_t g_io_align = 0x200;
 170 static uint32_t g_io_size_bytes;
 171 static uint32_t g_max_io_md_size;
 172 static uint32_t g_max_io_size_blocks;
 173 static uint32_t g_metacfg_pract_flag;
 174 static uint32_t g_metacfg_prchk_flags;
 175 static int g_rw_percentage;
 176 static int g_is_random;
 177 static int g_queue_depth;
 178 static int g_time_in_sec;
 179 static uint32_t g_max_completions;
 180 static int g_dpdk_mem;
 181 static int g_shm_id = -1;
 182 static uint32_t g_disable_sq_cmb;
 183 static bool g_no_pci;
 184 static bool g_warn;
 185
 186 static const char *g_core_mask;
 187
 188 struct trid_entry {
 189         struct spdk_nvme_transport_id   trid;
 190         uint16_t                        nsid;
 191         TAILQ_ENTRY(trid_entry)         tailq;
 192 };
 193
 194 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);
 195
 196 static int g_aio_optind; /* Index of first AIO filename in argv */
 197
 198 static void
 199 task_complete(struct perf_task *task);
 200
 201 static void
 202 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
 203 {
 204         struct ns_entry *entry;
 205         const struct spdk_nvme_ctrlr_data *cdata;
 206         uint32_t max_xfer_size, entries;
 207         struct spdk_nvme_io_qpair_opts opts;
 208
 209         cdata = spdk_nvme_ctrlr_get_data(ctrlr);
 210
 211         if (!spdk_nvme_ns_is_active(ns)) {
 212                 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
 213                        cdata->mn, cdata->sn,
 214                        spdk_nvme_ns_get_id(ns));
 215                 g_warn = true;
 216                 return;
 217         }
 218
 219         if (spdk_nvme_ns_get_size(ns) < g_io_size_bytes ||
 220             spdk_nvme_ns_get_sector_size(ns) > g_io_size_bytes) {
 221                 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
 222                        "ns size %" PRIu64 " / block size %u for I/O size %u\n",
 223                        cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
 224                        spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
 225                 g_warn = true;
 226                 return;
 227         }
 228
 229         max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
 230         spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
 231         /* NVMe driver may add additional entries based on
 232          * stripe size and maximum transfer size, we assume
 233          * 1 more entry be used for stripe.
 234          */
 235         entries = (g_io_size_bytes - 1) / max_xfer_size + 2;
 236         if ((g_queue_depth * entries) > opts.io_queue_size) {
 237                 printf("controller IO queue size %u less than required\n",
 238                        opts.io_queue_size);
 239                 printf("Consider using lower queue depth or small IO size because "
 240                        "IO requests may be queued at the NVMe driver.\n");
 241                 g_warn = true;
 242         }
 243
 244         entry = calloc(1, sizeof(struct ns_entry));
 245         if (entry == NULL) {
 246                 perror("ns_entry malloc");
 247                 exit(1);
 248         }
 249
 250         entry->type = ENTRY_TYPE_NVME_NS;
 251         entry->u.nvme.ctrlr = ctrlr;
 252         entry->u.nvme.ns = ns;
 253         entry->num_io_requests = entries;
 254
 255         entry->size_in_ios = spdk_nvme_ns_get_size(ns) /
 256                              g_io_size_bytes;
 257         entry->io_size_blocks = g_io_size_bytes / spdk_nvme_ns_get_sector_size(ns);
 258
 259         if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
 260                 entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags;
 261         }
 262
 263         if (g_max_io_md_size < spdk_nvme_ns_get_md_size(ns)) {
 264                 g_max_io_md_size = spdk_nvme_ns_get_md_size(ns);
 265         }
 266
 267         if (g_max_io_size_blocks < entry->io_size_blocks) {
 268                 g_max_io_size_blocks = entry->io_size_blocks;
 269         }
 270
 271         entry->nsdata = spdk_nvme_ns_get_data(ns);
 272
 273         snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
 274
 275         g_num_namespaces++;
 276         entry->next = g_namespaces;
 277         g_namespaces = entry;
 278 }
 279
 280 static void
 281 unregister_namespaces(void)
 282 {
 283         struct ns_entry *entry = g_namespaces;
 284
 285         while (entry) {
 286                 struct ns_entry *next = entry->next;
 287                 free(entry);
 288                 entry = next;
 289         }
 290 }
 291
 292 static void
 293 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
 294 {
 295         if (spdk_nvme_cpl_is_error(cpl)) {
 296                 printf("enable_latency_tracking_complete failed\n");
 297         }
 298         g_outstanding_commands--;
 299 }
 300
 301 static void
 302 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
 303 {
 304         int res;
 305         union spdk_nvme_intel_feat_latency_tracking latency_tracking;
 306
 307         if (enable) {
 308                 latency_tracking.bits.enable = 0x01;
 309         } else {
 310                 latency_tracking.bits.enable = 0x00;
 311         }
 312
 313         res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
 314                                               latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
 315         if (res) {
 316                 printf("fail to allocate nvme request.\n");
 317                 return;
 318         }
 319         g_outstanding_commands++;
 320
 321         while (g_outstanding_commands) {
 322                 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
 323         }
 324 }
 325
 326 static void
 327 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry)
 328 {
 329         struct spdk_nvme_ns *ns;
 330         struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
 331         const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
 332         uint32_t nsid;
 333
 334         if (entry == NULL) {
 335                 perror("ctrlr_entry malloc");
 336                 exit(1);
 337         }
 338
 339         entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page),
 340                                                4096, NULL);
 341         if (entry->latency_page == NULL) {
 342                 printf("Allocation error (latency page)\n");
 343                 exit(1);
 344         }
 345
 346         snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
 347
 348         entry->ctrlr = ctrlr;
 349         entry->next = g_controllers;
 350         g_controllers = entry;
 351
 352         if (g_latency_ssd_tracking_enable &&
 353             spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
 354                 set_latency_tracking_feature(ctrlr, true);
 355         }
 356
 357         if (trid_entry->nsid == 0) {
 358                 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
 359                      nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
 360                         ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
 361                         if (ns == NULL) {
 362                                 continue;
 363                         }
 364                         register_ns(ctrlr, ns);
 365                 }
 366         } else {
 367                 ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid);
 368                 if (!ns) {
 369                         perror("Namespace does not exist.");
 370                         exit(1);
 371                 }
 372
 373                 register_ns(ctrlr, ns);
 374         }
 375
 376 }
 377
 378 #if HAVE_LIBAIO
 379 static int
 380 register_aio_file(const char *path)
 381 {
 382         struct ns_entry *entry;
 383
 384         int flags, fd;
 385         uint64_t size;
 386         uint32_t blklen;
 387
 388         if (g_rw_percentage == 100) {
 389                 flags = O_RDONLY;
 390         } else if (g_rw_percentage == 0) {
 391                 flags = O_WRONLY;
 392         } else {
 393                 flags = O_RDWR;
 394         }
 395
 396         flags |= O_DIRECT;
 397
 398         fd = open(path, flags);
 399         if (fd < 0) {
 400                 fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno));
 401                 return -1;
 402         }
 403
 404         size = spdk_fd_get_size(fd);
 405         if (size == 0) {
 406                 fprintf(stderr, "Could not determine size of AIO device %s\n", path);
 407                 close(fd);
 408                 return -1;
 409         }
 410
 411         blklen = spdk_fd_get_blocklen(fd);
 412         if (blklen == 0) {
 413                 fprintf(stderr, "Could not determine block size of AIO device %s\n", path);
 414                 close(fd);
 415                 return -1;
 416         }
 417
 418         /*
 419          * TODO: This should really calculate the LCM of the current g_io_align and blklen.
 420          * For now, it's fairly safe to just assume all block sizes are powers of 2.
 421          */
 422         if (g_io_align < blklen) {
 423                 g_io_align = blklen;
 424         }
 425
 426         entry = malloc(sizeof(struct ns_entry));
 427         if (entry == NULL) {
 428                 close(fd);
 429                 perror("aio ns_entry malloc");
 430                 return -1;
 431         }
 432
 433         entry->type = ENTRY_TYPE_AIO_FILE;
 434         entry->u.aio.fd = fd;
 435         entry->size_in_ios = size / g_io_size_bytes;
 436         entry->io_size_blocks = g_io_size_bytes / blklen;
 437
 438         snprintf(entry->name, sizeof(entry->name), "%s", path);
 439
 440         g_num_namespaces++;
 441         entry->next = g_namespaces;
 442         g_namespaces = entry;
 443
 444         return 0;
 445 }
 446
 447 static int
 448 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, void *buf,
 449            unsigned long nbytes, uint64_t offset, void *cb_ctx)
 450 {
 451         iocb->aio_fildes = fd;
 452         iocb->aio_reqprio = 0;
 453         iocb->aio_lio_opcode = cmd;
 454         iocb->u.c.buf = buf;
 455         iocb->u.c.nbytes = nbytes;
 456         iocb->u.c.offset = offset;
 457         iocb->data = cb_ctx;
 458
 459         if (io_submit(aio_ctx, 1, &iocb) < 0) {
 460                 printf("io_submit");
 461                 return -1;
 462         }
 463
 464         return 0;
 465 }
 466
 467 static void
 468 aio_check_io(struct ns_worker_ctx *ns_ctx)
 469 {
 470         int count, i;
 471         struct timespec timeout;
 472
 473         timeout.tv_sec = 0;
 474         timeout.tv_nsec = 0;
 475
 476         count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout);
 477         if (count < 0) {
 478                 fprintf(stderr, "io_getevents error\n");
 479                 exit(1);
 480         }
 481
 482         for (i = 0; i < count; i++) {
 483                 task_complete(ns_ctx->u.aio.events[i].data);
 484         }
 485 }
 486 #endif /* HAVE_LIBAIO */
 487
 488 static void
 489 task_extended_lba_setup_pi(struct ns_entry *entry, struct perf_task *task, uint64_t lba,
 490                            uint32_t lba_count, bool is_write)
 491 {
 492         struct spdk_nvme_protection_info *pi;
 493         uint32_t i, md_size, sector_size, pi_offset;
 494         uint16_t crc16;
 495
 496         task->appmask = 0;
 497         task->apptag = 0;
 498
 499         if (!spdk_nvme_ns_supports_extended_lba(entry->u.nvme.ns)) {
 500                 return;
 501         }
 502
 503         if (spdk_nvme_ns_get_pi_type(entry->u.nvme.ns) ==
 504             SPDK_NVME_FMT_NVM_PROTECTION_DISABLE) {
 505                 return;
 506         }
 507
 508         if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
 509                 return;
 510         }
 511
 512         /* Type3 don't support REFTAG */
 513         if (spdk_nvme_ns_get_pi_type(entry->u.nvme.ns) ==
 514             SPDK_NVME_FMT_NVM_PROTECTION_TYPE3) {
 515                 return;
 516         }
 517
 518         sector_size = spdk_nvme_ns_get_sector_size(entry->u.nvme.ns);
 519         md_size = spdk_nvme_ns_get_md_size(entry->u.nvme.ns);
 520
 521         /* PI locates at the first 8 bytes of metadata,
 522          * doesn't support now
 523          */
 524         if (entry->nsdata->dps.md_start) {
 525                 return;
 526         }
 527
 528         if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_APPTAG) {
 529                 /* Let's use number of lbas for application tag */
 530                 task->appmask = 0xffff;
 531                 task->apptag = lba_count;
 532         }
 533
 534         for (i = 0; i < lba_count; i++) {
 535                 pi_offset = ((sector_size + md_size) * (i + 1)) - 8;
 536                 pi = (struct spdk_nvme_protection_info *)(task->buf + pi_offset);
 537                 memset(pi, 0, sizeof(*pi));
 538
 539                 if (is_write) {
 540                         if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) {
 541                                 /* CRC buffer should not include PI */
 542                                 crc16 = spdk_crc16_t10dif(task->buf + (sector_size + md_size) * i,
 543                                                           sector_size + md_size - 8);
 544                                 to_be16(&pi->guard, crc16);
 545                         }
 546                         if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_APPTAG) {
 547                                 /* Let's use number of lbas for application tag */
 548                                 to_be16(&pi->app_tag, lba_count);
 549                         }
 550                         if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) {
 551                                 to_be32(&pi->ref_tag, (uint32_t)lba + i);
 552                         }
 553                 }
 554         }
 555 }
 556
 557 static void
 558 task_extended_lba_pi_verify(struct ns_entry *entry, struct perf_task *task,
 559                             uint64_t lba, uint32_t lba_count)
 560 {
 561         struct spdk_nvme_protection_info *pi;
 562         uint32_t i, md_size, sector_size, pi_offset, ref_tag;
 563         uint16_t crc16, guard, app_tag;
 564
 565         if (spdk_nvme_ns_get_pi_type(entry->u.nvme.ns) ==
 566             SPDK_NVME_FMT_NVM_PROTECTION_DISABLE) {
 567                 return;
 568         }
 569
 570         sector_size = spdk_nvme_ns_get_sector_size(entry->u.nvme.ns);
 571         md_size = spdk_nvme_ns_get_md_size(entry->u.nvme.ns);
 572
 573         /* PI locates at the first 8 bytes of metadata,
 574          * doesn't support now
 575          */
 576         if (entry->nsdata->dps.md_start) {
 577                 return;
 578         }
 579
 580         for (i = 0; i < lba_count; i++) {
 581                 pi_offset = ((sector_size + md_size) * (i + 1)) - 8;
 582                 pi = (struct spdk_nvme_protection_info *)(task->buf + pi_offset);
 583
 584                 if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) {
 585                         /* CRC buffer should not include last 8 bytes of PI */
 586                         crc16 = spdk_crc16_t10dif(task->buf + (sector_size + md_size) * i,
 587                                                   sector_size + md_size - 8);
 588                         to_be16(&guard, crc16);
 589                         if (pi->guard != guard) {
 590                                 fprintf(stdout, "Get Guard Error LBA 0x%16.16"PRIx64","
 591                                         " Preferred 0x%04x but returned with 0x%04x,"
 592                                         " may read the LBA without write it first\n",
 593                                         lba + i, guard, pi->guard);
 594                         }
 595
 596                 }
 597                 if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_APPTAG) {
 598                         /* Previously we used the number of lbas as
 599                          * application tag for writes
 600                          */
 601                         to_be16(&app_tag, lba_count);
 602                         if (pi->app_tag != app_tag) {
 603                                 fprintf(stdout, "Get Application Tag Error LBA 0x%16.16"PRIx64","
 604                                         " Preferred 0x%04x but returned with 0x%04x,"
 605                                         " may read the LBA without write it first\n",
 606                                         lba + i, app_tag, pi->app_tag);
 607                         }
 608                 }
 609                 if (entry->io_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) {
 610                         to_be32(&ref_tag, (uint32_t)lba + i);
 611                         if (pi->ref_tag != ref_tag) {
 612                                 fprintf(stdout, "Get Reference Tag Error LBA 0x%16.16"PRIx64","
 613                                         " Preferred 0x%08x but returned with 0x%08x,"
 614                                         " may read the LBA without write it first\n",
 615                                         lba + i, ref_tag, pi->ref_tag);
 616                         }
 617                 }
 618         }
 619 }
 620
 621 static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);
 622
 623 static __thread unsigned int seed = 0;
 624
 625 static void
 626 submit_single_io(struct perf_task *task)
 627 {
 628         uint64_t                offset_in_ios;
 629         int                     rc;
 630         struct ns_worker_ctx    *ns_ctx = task->ns_ctx;
 631         struct ns_entry         *entry = ns_ctx->entry;
 632
 633         if (g_is_random) {
 634                 offset_in_ios = rand_r(&seed) % entry->size_in_ios;
 635         } else {
 636                 offset_in_ios = ns_ctx->offset_in_ios++;
 637                 if (ns_ctx->offset_in_ios == entry->size_in_ios) {
 638                         ns_ctx->offset_in_ios = 0;
 639                 }
 640         }
 641
 642         task->is_read = false;
 643         task->submit_tsc = spdk_get_ticks();
 644         task->lba = offset_in_ios * entry->io_size_blocks;
 645
 646         if ((g_rw_percentage == 100) ||
 647             (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
 648 #if HAVE_LIBAIO
 649                 if (entry->type == ENTRY_TYPE_AIO_FILE) {
 650                         rc = aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, task->buf,
 651                                         g_io_size_bytes, offset_in_ios * g_io_size_bytes, task);
 652                 } else
 653 #endif
 654                 {
 655                         task_extended_lba_setup_pi(entry, task, task->lba,
 656                                                    entry->io_size_blocks, false);
 657                         task->is_read = true;
 658
 659                         rc = spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair,
 660                                                            task->buf, NULL,
 661                                                            task->lba,
 662                                                            entry->io_size_blocks, io_complete,
 663                                                            task, entry->io_flags,
 664                                                            task->appmask, task->apptag);
 665                 }
 666         } else {
 667 #if HAVE_LIBAIO
 668                 if (entry->type == ENTRY_TYPE_AIO_FILE) {
 669                         rc = aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, task->buf,
 670                                         g_io_size_bytes, offset_in_ios * g_io_size_bytes, task);
 671                 } else
 672 #endif
 673                 {
 674                         task_extended_lba_setup_pi(entry, task, task->lba,
 675                                                    entry->io_size_blocks, true);
 676
 677                         rc = spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair,
 678                                                             task->buf, NULL,
 679                                                             task->lba,
 680                                                             entry->io_size_blocks, io_complete,
 681                                                             task, entry->io_flags,
 682                                                             task->appmask, task->apptag);
 683                 }
 684         }
 685
 686         if (rc != 0) {
 687                 fprintf(stderr, "starting I/O failed\n");
 688         } else {
 689                 ns_ctx->current_queue_depth++;
 690         }
 691 }
 692
 693 static void
 694 task_complete(struct perf_task *task)
 695 {
 696         struct ns_worker_ctx    *ns_ctx;
 697         uint64_t                tsc_diff;
 698         struct ns_entry         *entry;
 699
 700         ns_ctx = task->ns_ctx;
 701         entry = ns_ctx->entry;
 702         ns_ctx->current_queue_depth--;
 703         ns_ctx->io_completed++;
 704         tsc_diff = spdk_get_ticks() - task->submit_tsc;
 705         ns_ctx->total_tsc += tsc_diff;
 706         if (ns_ctx->min_tsc > tsc_diff) {
 707                 ns_ctx->min_tsc = tsc_diff;
 708         }
 709         if (ns_ctx->max_tsc < tsc_diff) {
 710                 ns_ctx->max_tsc = tsc_diff;
 711         }
 712         if (g_latency_sw_tracking_level > 0) {
 713                 spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff);
 714         }
 715
 716         /* add application level verification for end-to-end data protection */
 717         if (entry->type == ENTRY_TYPE_NVME_NS) {
 718                 if (spdk_nvme_ns_supports_extended_lba(entry->u.nvme.ns) &&
 719                     task->is_read && !g_metacfg_pract_flag) {
 720                         task_extended_lba_pi_verify(entry, task, task->lba,
 721                                                     entry->io_size_blocks);
 722                 }
 723         }
 724
 725         /*
 726          * is_draining indicates when time has expired for the test run
 727          * and we are just waiting for the previously submitted I/O
 728          * to complete.  In this case, do not submit a new I/O to replace
 729          * the one just completed.
 730          */
 731         if (ns_ctx->is_draining) {
 732                 spdk_dma_free(task->buf);
 733                 free(task);
 734         } else {
 735                 submit_single_io(task);
 736         }
 737 }
 738
 739 static void
 740 io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
 741 {
 742         task_complete((struct perf_task *)ctx);
 743 }
 744
 745 static void
 746 check_io(struct ns_worker_ctx *ns_ctx)
 747 {
 748 #if HAVE_LIBAIO
 749         if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
 750                 aio_check_io(ns_ctx);
 751         } else
 752 #endif
 753         {
 754                 spdk_nvme_qpair_process_completions(ns_ctx->u.nvme.qpair, g_max_completions);
 755         }
 756 }
 757
 758 static void
 759 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
 760 {
 761         struct perf_task *task;
 762         uint32_t max_io_size_bytes;
 763
 764         while (queue_depth-- > 0) {
 765                 task = calloc(1, sizeof(*task));
 766                 if (task == NULL) {
 767                         fprintf(stderr, "Out of memory allocating tasks\n");
 768                         exit(1);
 769                 }
 770
 771                 /* maximum extended lba format size from all active
 772                  * namespace, it's same with g_io_size_bytes for
 773                  * namespace without metadata
 774                  */
 775                 max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks;
 776                 task->buf = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL);
 777                 if (task->buf == NULL) {
 778                         fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
 779                         exit(1);
 780                 }
 781                 memset(task->buf, queue_depth % 8 + 1, max_io_size_bytes);
 782
 783                 task->ns_ctx = ns_ctx;
 784
 785                 submit_single_io(task);
 786         }
 787 }
 788
 789 static void
 790 drain_io(struct ns_worker_ctx *ns_ctx)
 791 {
 792         ns_ctx->is_draining = true;
 793         while (ns_ctx->current_queue_depth > 0) {
 794                 check_io(ns_ctx);
 795         }
 796 }
 797
 798 static int
 799 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
 800 {
 801         if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
 802 #ifdef HAVE_LIBAIO
 803                 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event));
 804                 if (!ns_ctx->u.aio.events) {
 805                         return -1;
 806                 }
 807                 ns_ctx->u.aio.ctx = 0;
 808                 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) {
 809                         free(ns_ctx->u.aio.events);
 810                         perror("io_setup");
 811                         return -1;
 812                 }
 813 #endif
 814         } else {
 815                 /*
 816                  * TODO: If a controller has multiple namespaces, they could all use the same queue.
 817                  *  For now, give each namespace/thread combination its own queue.
 818                  */
 819                 struct spdk_nvme_io_qpair_opts opts;
 820
 821                 spdk_nvme_ctrlr_get_default_io_qpair_opts(ns_ctx->entry->u.nvme.ctrlr, &opts, sizeof(opts));
 822                 if (opts.io_queue_requests < ns_ctx->entry->num_io_requests) {
 823                         opts.io_queue_requests = ns_ctx->entry->num_io_requests;
 824                 }
 825
 826                 ns_ctx->u.nvme.qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_ctx->entry->u.nvme.ctrlr, &opts,
 827                                        sizeof(opts));
 828                 if (!ns_ctx->u.nvme.qpair) {
 829                         printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
 830                         return -1;
 831                 }
 832         }
 833
 834         return 0;
 835 }
 836
 837 static void
 838 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
 839 {
 840         if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
 841 #ifdef HAVE_LIBAIO
 842                 io_destroy(ns_ctx->u.aio.ctx);
 843                 free(ns_ctx->u.aio.events);
 844 #endif
 845         } else {
 846                 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair);
 847         }
 848 }
 849
 850 static int
 851 work_fn(void *arg)
 852 {
 853         uint64_t tsc_end;
 854         struct worker_thread *worker = (struct worker_thread *)arg;
 855         struct ns_worker_ctx *ns_ctx = NULL;
 856
 857         printf("Starting thread on core %u\n", worker->lcore);
 858
 859         /* Allocate a queue pair for each namespace. */
 860         ns_ctx = worker->ns_ctx;
 861         while (ns_ctx != NULL) {
 862                 if (init_ns_worker_ctx(ns_ctx) != 0) {
 863                         printf("ERROR: init_ns_worker_ctx() failed\n");
 864                         return 1;
 865                 }
 866                 ns_ctx = ns_ctx->next;
 867         }
 868
 869         tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate;
 870
 871         /* Submit initial I/O for each namespace. */
 872         ns_ctx = worker->ns_ctx;
 873         while (ns_ctx != NULL) {
 874                 submit_io(ns_ctx, g_queue_depth);
 875                 ns_ctx = ns_ctx->next;
 876         }
 877
 878         while (1) {
 879                 /*
 880                  * Check for completed I/O for each controller. A new
 881                  * I/O will be submitted in the io_complete callback
 882                  * to replace each I/O that is completed.
 883                  */
 884                 ns_ctx = worker->ns_ctx;
 885                 while (ns_ctx != NULL) {
 886                         check_io(ns_ctx);
 887                         ns_ctx = ns_ctx->next;
 888                 }
 889
 890                 if (spdk_get_ticks() > tsc_end) {
 891                         break;
 892                 }
 893         }
 894
 895         ns_ctx = worker->ns_ctx;
 896         while (ns_ctx != NULL) {
 897                 drain_io(ns_ctx);
 898                 cleanup_ns_worker_ctx(ns_ctx);
 899                 ns_ctx = ns_ctx->next;
 900         }
 901
 902         return 0;
 903 }
 904
 905 static void usage(char *program_name)
 906 {
 907         printf("%s options", program_name);
 908 #if HAVE_LIBAIO
 909         printf(" [AIO device(s)]...");
 910 #endif
 911         printf("\n");
 912         printf("\t[-q io depth]\n");
 913         printf("\t[-o io size in bytes]\n");
 914         printf("\t[-w io pattern type, must be one of\n");
 915         printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
 916         printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
 917         printf("\t[-L enable latency tracking via sw, default: disabled]\n");
 918         printf("\t\t-L for latency summary, -LL for detailed histogram\n");
 919         printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
 920         printf("\t[-t time in seconds]\n");
 921         printf("\t[-c core mask for I/O submission/completion.]\n");
 922         printf("\t\t(default: 1)]\n");
 923         printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n");
 924         printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
 925         printf("\t Format: 'key:value [key:value] ...'\n");
 926         printf("\t Keys:\n");
 927         printf("\t  trtype      Transport type (e.g. PCIe, RDMA)\n");
 928         printf("\t  adrfam      Address family (e.g. IPv4, IPv6)\n");
 929         printf("\t  traddr      Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
 930         printf("\t  trsvcid     Transport service identifier (e.g. 4420)\n");
 931         printf("\t  subnqn      Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
 932         printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
 933         printf("\t          -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
 934         printf("\t[-e metadata configuration]\n");
 935         printf("\t Keys:\n");
 936         printf("\t  PRACT      Protection Information Action bit (PRACT=1 or PRACT=0)\n");
 937         printf("\t  PRCHK      Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
 938         printf("\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
 939         printf("\t          -e 'PRACT=1,PRCHK=GUARD'\n");
 940         printf("\t[-s DPDK huge memory size in MB.]\n");
 941         printf("\t[-m max completions per poll]\n");
 942         printf("\t\t(default: 0 - unlimited)\n");
 943         printf("\t[-i shared memory group ID]\n");
 944 }
 945
 946 static void
 947 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
 948              uint64_t total, uint64_t so_far)
 949 {
 950         double so_far_pct;
 951         double **cutoff = ctx;
 952
 953         if (count == 0) {
 954                 return;
 955         }
 956
 957         so_far_pct = (double)so_far / total;
 958         while (so_far_pct >= **cutoff && **cutoff > 0) {
 959                 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate);
 960                 (*cutoff)++;
 961         }
 962 }
 963
 964 static void
 965 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
 966              uint64_t total, uint64_t so_far)
 967 {
 968         double so_far_pct;
 969
 970         if (count == 0) {
 971                 return;
 972         }
 973
 974         so_far_pct = (double)so_far * 100 / total;
 975         printf("%9.3f - %9.3f: %9.4f%%  (%9ju)\n",
 976                (double)start * 1000 * 1000 / g_tsc_rate,
 977                (double)end * 1000 * 1000 / g_tsc_rate,
 978                so_far_pct, count);
 979 }
 980
 981 static void
 982 print_performance(void)
 983 {
 984         uint64_t total_io_completed, total_io_tsc;
 985         double io_per_second, mb_per_second, average_latency, min_latency, max_latency;
 986         double sum_ave_latency, min_latency_so_far, max_latency_so_far;
 987         double total_io_per_second, total_mb_per_second;
 988         int ns_count;
 989         struct worker_thread    *worker;
 990         struct ns_worker_ctx    *ns_ctx;
 991
 992         total_io_per_second = 0;
 993         total_mb_per_second = 0;
 994         total_io_completed = 0;
 995         total_io_tsc = 0;
 996         min_latency_so_far = (double)UINT64_MAX;
 997         max_latency_so_far = 0;
 998         ns_count = 0;
 999
1000         printf("========================================================\n");
1001         printf("%103s\n", "Latency(us)");
1002         printf("%-55s: %10s %10s %10s %10s %10s\n",
1003                "Device Information", "IOPS", "MB/s", "Average", "min", "max");
1004
1005         worker = g_workers;
1006         while (worker) {
1007                 ns_ctx = worker->ns_ctx;
1008                 while (ns_ctx) {
1009                         if (ns_ctx->io_completed != 0) {
1010                                 io_per_second = (double)ns_ctx->io_completed / g_time_in_sec;
1011                                 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024);
1012                                 average_latency = ((double)ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate;
1013                                 min_latency = (double)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate;
1014                                 if (min_latency < min_latency_so_far) {
1015                                         min_latency_so_far = min_latency;
1016                                 }
1017
1018                                 max_latency = (double)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate;
1019                                 if (max_latency > max_latency_so_far) {
1020                                         max_latency_so_far = max_latency;
1021                                 }
1022
1023                                 printf("%-43.43s from core %u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1024                                        ns_ctx->entry->name, worker->lcore,
1025                                        io_per_second, mb_per_second,
1026                                        average_latency, min_latency, max_latency);
1027                                 total_io_per_second += io_per_second;
1028                                 total_mb_per_second += mb_per_second;
1029                                 total_io_completed += ns_ctx->io_completed;
1030                                 total_io_tsc += ns_ctx->total_tsc;
1031                                 ns_count++;
1032                         }
1033                         ns_ctx = ns_ctx->next;
1034                 }
1035                 worker = worker->next;
1036         }
1037
1038         if (ns_count != 0 && total_io_completed) {
1039                 sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate;
1040                 printf("========================================================\n");
1041                 printf("%-55s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1042                        "Total", total_io_per_second, total_mb_per_second,
1043                        sum_ave_latency, min_latency_so_far, max_latency_so_far);
1044                 printf("\n");
1045         }
1046
1047         if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) {
1048                 return;
1049         }
1050
1051         worker = g_workers;
1052         while (worker) {
1053                 ns_ctx = worker->ns_ctx;
1054                 while (ns_ctx) {
1055                         const double *cutoff = g_latency_cutoffs;
1056
1057                         printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
1058                         printf("=================================================================================\n");
1059
1060                         spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff);
1061
1062                         printf("\n");
1063                         ns_ctx = ns_ctx->next;
1064                 }
1065                 worker = worker->next;
1066         }
1067
1068         if (g_latency_sw_tracking_level == 1) {
1069                 return;
1070         }
1071
1072         worker = g_workers;
1073         while (worker) {
1074                 ns_ctx = worker->ns_ctx;
1075                 while (ns_ctx) {
1076                         printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
1077                         printf("==============================================================================\n");
1078                         printf("       Range in us     Cumulative    IO count\n");
1079
1080                         spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL);
1081                         printf("\n");
1082                         ns_ctx = ns_ctx->next;
1083                 }
1084                 worker = worker->next;
1085         }
1086
1087 }
1088
1089 static void
1090 print_latency_page(struct ctrlr_entry *entry)
1091 {
1092         int i;
1093
1094         printf("\n");
1095         printf("%s\n", entry->name);
1096         printf("--------------------------------------------------------\n");
1097
1098         for (i = 0; i < 32; i++) {
1099                 if (entry->latency_page->buckets_32us[i]) {
1100                         printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]);
1101                 }
1102         }
1103         for (i = 0; i < 31; i++) {
1104                 if (entry->latency_page->buckets_1ms[i]) {
1105                         printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]);
1106                 }
1107         }
1108         for (i = 0; i < 31; i++) {
1109                 if (entry->latency_page->buckets_32ms[i])
1110                         printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
1111                                entry->latency_page->buckets_32ms[i]);
1112         }
1113 }
1114
1115 static void
1116 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
1117 {
1118         struct ctrlr_entry      *ctrlr;
1119
1120         printf("%s Latency Statistics:\n", op_name);
1121         printf("========================================================\n");
1122         ctrlr = g_controllers;
1123         while (ctrlr) {
1124                 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
1125                         if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG,
1126                                                              ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0,
1127                                                              enable_latency_tracking_complete,
1128                                                              NULL)) {
1129                                 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
1130                                 exit(1);
1131                         }
1132
1133                         g_outstanding_commands++;
1134                 } else {
1135                         printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name);
1136                 }
1137                 ctrlr = ctrlr->next;
1138         }
1139
1140         while (g_outstanding_commands) {
1141                 ctrlr = g_controllers;
1142                 while (ctrlr) {
1143                         spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
1144                         ctrlr = ctrlr->next;
1145                 }
1146         }
1147
1148         ctrlr = g_controllers;
1149         while (ctrlr) {
1150                 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
1151                         print_latency_page(ctrlr);
1152                 }
1153                 ctrlr = ctrlr->next;
1154         }
1155         printf("\n");
1156 }
1157
1158 static void
1159 print_stats(void)
1160 {
1161         print_performance();
1162         if (g_latency_ssd_tracking_enable) {
1163                 if (g_rw_percentage != 0) {
1164                         print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
1165                 }
1166                 if (g_rw_percentage != 100) {
1167                         print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
1168                 }
1169         }
1170 }
1171
1172 static void
1173 unregister_trids(void)
1174 {
1175         struct trid_entry *trid_entry, *tmp;
1176
1177         TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
1178                 free(trid_entry);
1179         }
1180 }
1181
1182 static int
1183 add_trid(const char *trid_str)
1184 {
1185         struct trid_entry *trid_entry;
1186         struct spdk_nvme_transport_id *trid;
1187         char *ns;
1188
1189         trid_entry = calloc(1, sizeof(*trid_entry));
1190         if (trid_entry == NULL) {
1191                 return -1;
1192         }
1193
1194         trid = &trid_entry->trid;
1195         memset(trid, 0, sizeof(*trid));
1196         trid->trtype = SPDK_NVME_TRANSPORT_PCIE;
1197         snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
1198
1199         if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
1200                 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
1201                 free(trid_entry);
1202                 return 1;
1203         }
1204
1205         ns = strcasestr(trid_str, "ns:");
1206         if (ns) {
1207                 char nsid_str[6]; /* 5 digits maximum in an nsid */
1208                 int len;
1209                 int nsid;
1210
1211                 ns += 3;
1212
1213                 len = strcspn(ns, " \t\n");
1214                 if (len > 5) {
1215                         fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n");
1216                         free(trid_entry);
1217                         return 1;
1218                 }
1219
1220                 memcpy(nsid_str, ns, len);
1221                 nsid_str[len] = '\0';
1222
1223                 nsid = atoi(nsid_str);
1224                 if (nsid <= 0 || nsid > 65535) {
1225                         fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n");
1226                         free(trid_entry);
1227                         return 1;
1228                 }
1229
1230                 trid_entry->nsid = (uint16_t)nsid;
1231         }
1232
1233         TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
1234         return 0;
1235 }
1236
1237 static int
1238 parse_metadata(const char *metacfg_str)
1239 {
1240         const char *sep;
1241
1242         if (strstr(metacfg_str, "PRACT=1") != NULL) {
1243                 g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
1244         }
1245
1246         sep = strchr(metacfg_str, ',');
1247         if (!sep) {
1248                 return 0;
1249         }
1250
1251         if (strstr(sep, "PRCHK=") != NULL) {
1252                 if (strstr(sep, "GUARD") != NULL) {
1253                         g_metacfg_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
1254                 }
1255                 if (strstr(sep, "REFTAG") != NULL) {
1256                         g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
1257                 }
1258                 if (strstr(sep, "APPTAG") != NULL) {
1259                         g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
1260                 }
1261         }
1262
1263         return 0;
1264 }
1265
1266 static int
1267 parse_args(int argc, char **argv)
1268 {
1269         const char *workload_type;
1270         int op;
1271         bool mix_specified = false;
1272
1273         /* default value */
1274         g_queue_depth = 0;
1275         g_io_size_bytes = 0;
1276         workload_type = NULL;
1277         g_time_in_sec = 0;
1278         g_rw_percentage = -1;
1279         g_core_mask = NULL;
1280         g_max_completions = 0;
1281
1282         while ((op = getopt(argc, argv, "c:e:i:lm:o:q:r:s:t:w:DLM:")) != -1) {
1283                 switch (op) {
1284                 case 'c':
1285                         g_core_mask = optarg;
1286                         break;
1287                 case 'e':
1288                         if (parse_metadata(optarg)) {
1289                                 usage(argv[0]);
1290                                 return 1;
1291                         }
1292                         break;
1293                 case 'i':
1294                         g_shm_id = atoi(optarg);
1295                         break;
1296                 case 'l':
1297                         g_latency_ssd_tracking_enable = true;
1298                         break;
1299                 case 'm':
1300                         g_max_completions = atoi(optarg);
1301                         break;
1302                 case 'o':
1303                         g_io_size_bytes = atoi(optarg);
1304                         break;
1305                 case 'q':
1306                         g_queue_depth = atoi(optarg);
1307                         break;
1308                 case 'r':
1309                         if (add_trid(optarg)) {
1310                                 usage(argv[0]);
1311                                 return 1;
1312                         }
1313                         break;
1314                 case 's':
1315                         g_dpdk_mem = atoi(optarg);
1316                         break;
1317                 case 't':
1318                         g_time_in_sec = atoi(optarg);
1319                         break;
1320                 case 'w':
1321                         workload_type = optarg;
1322                         break;
1323                 case 'D':
1324                         g_disable_sq_cmb = 1;
1325                         break;
1326                 case 'L':
1327                         g_latency_sw_tracking_level++;
1328                         break;
1329                 case 'M':
1330                         g_rw_percentage = atoi(optarg);
1331                         mix_specified = true;
1332                         break;
1333                 default:
1334                         usage(argv[0]);
1335                         return 1;
1336                 }
1337         }
1338
1339         if (!g_queue_depth) {
1340                 usage(argv[0]);
1341                 return 1;
1342         }
1343         if (!g_io_size_bytes) {
1344                 usage(argv[0]);
1345                 return 1;
1346         }
1347         if (!workload_type) {
1348                 usage(argv[0]);
1349                 return 1;
1350         }
1351         if (!g_time_in_sec) {
1352                 usage(argv[0]);
1353                 return 1;
1354         }
1355
1356         if (strcmp(workload_type, "read") &&
1357             strcmp(workload_type, "write") &&
1358             strcmp(workload_type, "randread") &&
1359             strcmp(workload_type, "randwrite") &&
1360             strcmp(workload_type, "rw") &&
1361             strcmp(workload_type, "randrw")) {
1362                 fprintf(stderr,
1363                         "io pattern type must be one of\n"
1364                         "(read, write, randread, randwrite, rw, randrw)\n");
1365                 return 1;
1366         }
1367
1368         if (!strcmp(workload_type, "read") ||
1369             !strcmp(workload_type, "randread")) {
1370                 g_rw_percentage = 100;
1371         }
1372
1373         if (!strcmp(workload_type, "write") ||
1374             !strcmp(workload_type, "randwrite")) {
1375                 g_rw_percentage = 0;
1376         }
1377
1378         if (!strcmp(workload_type, "read") ||
1379             !strcmp(workload_type, "randread") ||
1380             !strcmp(workload_type, "write") ||
1381             !strcmp(workload_type, "randwrite")) {
1382                 if (mix_specified) {
1383                         fprintf(stderr, "Ignoring -M option... Please use -M option"
1384                                 " only when using rw or randrw.\n");
1385                 }
1386         }
1387
1388         if (!strcmp(workload_type, "rw") ||
1389             !strcmp(workload_type, "randrw")) {
1390                 if (g_rw_percentage < 0 || g_rw_percentage > 100) {
1391                         fprintf(stderr,
1392                                 "-M must be specified to value from 0 to 100 "
1393                                 "for rw or randrw.\n");
1394                         return 1;
1395                 }
1396         }
1397
1398         if (!strcmp(workload_type, "read") ||
1399             !strcmp(workload_type, "write") ||
1400             !strcmp(workload_type, "rw")) {
1401                 g_is_random = 0;
1402         } else {
1403                 g_is_random = 1;
1404         }
1405
1406         if (TAILQ_EMPTY(&g_trid_list)) {
1407                 /* If no transport IDs specified, default to enumerating all local PCIe devices */
1408                 add_trid("trtype:PCIe");
1409         } else {
1410                 struct trid_entry *trid_entry, *trid_entry_tmp;
1411
1412                 g_no_pci = true;
1413                 /* check whether there is local PCIe type */
1414                 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
1415                         if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
1416                                 g_no_pci = false;
1417                                 break;
1418                         }
1419                 }
1420         }
1421
1422         g_aio_optind = optind;
1423
1424         return 0;
1425 }
1426
1427 static int
1428 register_workers(void)
1429 {
1430         uint32_t i;
1431         struct worker_thread *worker;
1432
1433         g_workers = NULL;
1434         g_num_workers = 0;
1435
1436         SPDK_ENV_FOREACH_CORE(i) {
1437                 worker = calloc(1, sizeof(*worker));
1438                 if (worker == NULL) {
1439                         fprintf(stderr, "Unable to allocate worker\n");
1440                         return -1;
1441                 }
1442
1443                 worker->lcore = i;
1444                 worker->next = g_workers;
1445                 g_workers = worker;
1446                 g_num_workers++;
1447         }
1448
1449         return 0;
1450 }
1451
1452 static void
1453 unregister_workers(void)
1454 {
1455         struct worker_thread *worker = g_workers;
1456
1457         /* Free namespace context and worker thread */
1458         while (worker) {
1459                 struct worker_thread *next_worker = worker->next;
1460                 struct ns_worker_ctx *ns_ctx = worker->ns_ctx;
1461
1462                 while (ns_ctx) {
1463                         struct ns_worker_ctx *next_ns_ctx = ns_ctx->next;
1464                         spdk_histogram_data_free(ns_ctx->histogram);
1465                         free(ns_ctx);
1466                         ns_ctx = next_ns_ctx;
1467                 }
1468
1469                 free(worker);
1470                 worker = next_worker;
1471         }
1472 }
1473
1474 static bool
1475 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1476          struct spdk_nvme_ctrlr_opts *opts)
1477 {
1478         if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
1479                 printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n",
1480                        trid->traddr, trid->trsvcid,
1481                        trid->subnqn);
1482         } else {
1483                 if (g_disable_sq_cmb) {
1484                         opts->use_cmb_sqs = false;
1485                 }
1486
1487                 printf("Attaching to NVMe Controller at %s\n",
1488                        trid->traddr);
1489         }
1490
1491         /* Set io_queue_size to UINT16_MAX, NVMe driver
1492          * will then reduce this to MQES to maximize
1493          * the io_queue_size as much as possible.
1494          */
1495         opts->io_queue_size = UINT16_MAX;
1496
1497         return true;
1498 }
1499
1500 static void
1501 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1502           struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1503 {
1504         struct trid_entry       *trid_entry = cb_ctx;
1505         struct spdk_pci_addr    pci_addr;
1506         struct spdk_pci_device  *pci_dev;
1507         struct spdk_pci_id      pci_id;
1508
1509         g_controllers_found++;
1510         if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
1511                 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
1512                        trid->traddr, trid->trsvcid,
1513                        trid->subnqn);
1514         } else {
1515                 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
1516                         return;
1517                 }
1518
1519                 pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr);
1520                 if (!pci_dev) {
1521                         return;
1522                 }
1523
1524                 pci_id = spdk_pci_device_get_id(pci_dev);
1525
1526                 printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
1527                        trid->traddr,
1528                        pci_id.vendor_id, pci_id.device_id);
1529         }
1530
1531         register_ctrlr(ctrlr, trid_entry);
1532 }
1533
1534 static int
1535 register_controllers(void)
1536 {
1537         struct trid_entry *trid_entry;
1538
1539         printf("Initializing NVMe Controllers\n");
1540
1541         TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
1542                 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) {
1543                         fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
1544                                 trid_entry->trid.traddr);
1545                         return -1;
1546                 }
1547         }
1548
1549         return 0;
1550 }
1551
1552 static void
1553 unregister_controllers(void)
1554 {
1555         struct ctrlr_entry *entry = g_controllers;
1556
1557         while (entry) {
1558                 struct ctrlr_entry *next = entry->next;
1559                 spdk_dma_free(entry->latency_page);
1560                 if (g_latency_ssd_tracking_enable &&
1561                     spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
1562                         set_latency_tracking_feature(entry->ctrlr, false);
1563                 }
1564                 spdk_nvme_detach(entry->ctrlr);
1565                 free(entry);
1566                 entry = next;
1567         }
1568 }
1569
/*
 * Register every non-option command-line argument (from g_aio_optind
 * onwards) as an AIO file. Does nothing unless built with HAVE_LIBAIO.
 *
 * Returns 0 on success, 1 as soon as one file fails to register.
 */
static int
register_aio_files(int argc, char **argv)
{
#if HAVE_LIBAIO
	int idx = g_aio_optind;

	/* Treat everything after the options as files for AIO */
	while (idx < argc) {
		if (register_aio_file(argv[idx]) != 0) {
			return 1;
		}
		idx++;
	}
#endif /* HAVE_LIBAIO */

	return 0;
}
1586
1587 static int
1588 associate_workers_with_ns(void)
1589 {
1590         struct ns_entry         *entry = g_namespaces;
1591         struct worker_thread    *worker = g_workers;
1592         struct ns_worker_ctx    *ns_ctx;
1593         int                     i, count;
1594
1595         count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
1596
1597         for (i = 0; i < count; i++) {
1598                 if (entry == NULL) {
1599                         break;
1600                 }
1601
1602                 ns_ctx = malloc(sizeof(struct ns_worker_ctx));
1603                 if (!ns_ctx) {
1604                         return -1;
1605                 }
1606                 memset(ns_ctx, 0, sizeof(*ns_ctx));
1607
1608                 printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
1609                 ns_ctx->min_tsc = UINT64_MAX;
1610                 ns_ctx->entry = entry;
1611                 ns_ctx->next = worker->ns_ctx;
1612                 ns_ctx->histogram = spdk_histogram_data_alloc();
1613                 worker->ns_ctx = ns_ctx;
1614
1615                 worker = worker->next;
1616                 if (worker == NULL) {
1617                         worker = g_workers;
1618                 }
1619
1620                 entry = entry->next;
1621                 if (entry == NULL) {
1622                         entry = g_namespaces;
1623                 }
1624
1625         }
1626
1627         return 0;
1628 }
1629
1630 int main(int argc, char **argv)
1631 {
1632         int rc;
1633         struct worker_thread *worker, *master_worker;
1634         unsigned master_core;
1635         struct spdk_env_opts opts;
1636
1637         rc = parse_args(argc, argv);
1638         if (rc != 0) {
1639                 return rc;
1640         }
1641
1642         spdk_env_opts_init(&opts);
1643         opts.name = "perf";
1644         opts.shm_id = g_shm_id;
1645         if (g_core_mask) {
1646                 opts.core_mask = g_core_mask;
1647         }
1648
1649         if (g_dpdk_mem) {
1650                 opts.mem_size = g_dpdk_mem;
1651         }
1652         if (g_no_pci) {
1653                 opts.no_pci = g_no_pci;
1654         }
1655         if (spdk_env_init(&opts) < 0) {
1656                 fprintf(stderr, "Unable to initialize SPDK env\n");
1657                 rc = -1;
1658                 goto cleanup;
1659         }
1660
1661         g_tsc_rate = spdk_get_ticks_hz();
1662
1663         if (register_workers() != 0) {
1664                 rc = -1;
1665                 goto cleanup;
1666         }
1667
1668         if (register_aio_files(argc, argv) != 0) {
1669                 rc = -1;
1670                 goto cleanup;
1671         }
1672
1673         if (register_controllers() != 0) {
1674                 rc = -1;
1675                 goto cleanup;
1676         }
1677
1678         if (g_warn) {
1679                 printf("WARNING: Some requested NVMe devices were skipped\n");
1680         }
1681
1682         if (g_num_namespaces == 0) {
1683                 fprintf(stderr, "No valid NVMe controllers or AIO devices found\n");
1684                 return 0;
1685         }
1686
1687         if (associate_workers_with_ns() != 0) {
1688                 rc = -1;
1689                 goto cleanup;
1690         }
1691
1692         printf("Initialization complete. Launching workers.\n");
1693
1694         /* Launch all of the slave workers */
1695         master_core = spdk_env_get_current_core();
1696         master_worker = NULL;
1697         worker = g_workers;
1698         while (worker != NULL) {
1699                 if (worker->lcore != master_core) {
1700                         spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
1701                 } else {
1702                         assert(master_worker == NULL);
1703                         master_worker = worker;
1704                 }
1705                 worker = worker->next;
1706         }
1707
1708         assert(master_worker != NULL);
1709         rc = work_fn(master_worker);
1710
1711         spdk_env_thread_wait_all();
1712
1713         print_stats();
1714
1715 cleanup:
1716         unregister_trids();
1717         unregister_namespaces();
1718         unregister_controllers();
1719         unregister_workers();
1720
1721         if (rc != 0) {
1722                 fprintf(stderr, "%s: errors occured\n", argv[0]);
1723         }
1724
1725         return rc;
1726 }