ceph/src/spdk/examples/nvme/perf/perf.c
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <stdio.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <unistd.h>
38
39 #include <rte_config.h>
40 #include <rte_mempool.h>
41 #include <rte_lcore.h>
42
43 #include "spdk/fd.h"
44 #include "spdk/nvme.h"
45 #include "spdk/env.h"
46 #include "spdk/queue.h"
47 #include "spdk/string.h"
48 #include "spdk/nvme_intel.h"
49
50 #if HAVE_LIBAIO
51 #include <libaio.h>
52 #include <sys/stat.h>
53 #include <fcntl.h>
54 #endif
55
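/*
 * Overview: this example tool measures raw NVMe (and, when built with
 * libaio, Linux AIO file) I/O performance.  Each worker thread runs on its
 * own lcore, keeps a fixed queue depth of reads and/or writes outstanding
 * against its assigned namespaces for -t seconds, and the program then
 * reports per-namespace IOPS, bandwidth and TSC-derived latency.
 */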
56 struct ctrlr_entry {
57 struct spdk_nvme_ctrlr *ctrlr;
58 struct spdk_nvme_intel_rw_latency_page *latency_page;
59 struct ctrlr_entry *next;
60 char name[1024];
61 };
62
63 enum entry_type {
64 ENTRY_TYPE_NVME_NS,
65 ENTRY_TYPE_AIO_FILE,
66 };
67
68 struct ns_entry {
69 enum entry_type type;
70
71 union {
72 struct {
73 struct spdk_nvme_ctrlr *ctrlr;
74 struct spdk_nvme_ns *ns;
75 } nvme;
76 #if HAVE_LIBAIO
77 struct {
78 int fd;
79 } aio;
80 #endif
81 } u;
82
83 struct ns_entry *next;
84 uint32_t io_size_blocks;
85 uint64_t size_in_ios;
86 char name[1024];
87 };
88
89 struct ns_worker_ctx {
90 struct ns_entry *entry;
91 uint64_t io_completed;
92 uint64_t total_tsc;
93 uint64_t min_tsc;
94 uint64_t max_tsc;
95 uint64_t current_queue_depth;
96 uint64_t offset_in_ios;
97 bool is_draining;
98
99 union {
100 struct {
101 struct spdk_nvme_qpair *qpair;
102 } nvme;
103
104 #if HAVE_LIBAIO
105 struct {
106 struct io_event *events;
107 io_context_t ctx;
108 } aio;
109 #endif
110 } u;
111
112 struct ns_worker_ctx *next;
113 };
114
115 struct perf_task {
116 struct ns_worker_ctx *ns_ctx;
117 void *buf;
118 uint64_t submit_tsc;
119 #if HAVE_LIBAIO
120 struct iocb iocb;
121 #endif
122 };
123
124 struct worker_thread {
125 struct ns_worker_ctx *ns_ctx;
126 struct worker_thread *next;
127 unsigned lcore;
128 };
129
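/*
 * Global object model: g_controllers is a singly linked list of attached
 * controllers, g_namespaces a list of usable namespaces (or AIO files), and
 * g_workers a list with one worker_thread per enabled lcore.  Each worker
 * owns a list of ns_worker_ctx structures, one per namespace it drives,
 * holding that worker's queue pair (or AIO context) and its statistics.
 */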
130 static int g_outstanding_commands;
131
132 static bool g_latency_tracking_enable = false;
133
134 static struct rte_mempool *task_pool;
135
136 static struct ctrlr_entry *g_controllers = NULL;
137 static struct ns_entry *g_namespaces = NULL;
138 static int g_num_namespaces = 0;
139 static struct worker_thread *g_workers = NULL;
140 static int g_num_workers = 0;
141
142 static uint64_t g_tsc_rate;
143
144 static uint32_t g_io_align = 0x200;
145 static uint32_t g_io_size_bytes;
146 static int g_rw_percentage;
147 static int g_is_random;
148 static int g_queue_depth;
149 static int g_time_in_sec;
150 static uint32_t g_max_completions;
151 static int g_dpdk_mem;
152 static int g_shm_id = -1;
153
154 static const char *g_core_mask;
155
156 struct trid_entry {
157 struct spdk_nvme_transport_id trid;
158 TAILQ_ENTRY(trid_entry) tailq;
159 };
160
161 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);
162
163 static int g_aio_optind; /* Index of first AIO filename in argv */
164
165 static void
166 task_complete(struct perf_task *task);
167
168 static void
169 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
170 {
171 struct ns_entry *entry;
172 const struct spdk_nvme_ctrlr_data *cdata;
173
174 cdata = spdk_nvme_ctrlr_get_data(ctrlr);
175
176 if (!spdk_nvme_ns_is_active(ns)) {
177 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
178 cdata->mn, cdata->sn,
179 spdk_nvme_ns_get_id(ns));
180 return;
181 }
182
183 if (spdk_nvme_ns_get_size(ns) < g_io_size_bytes ||
184 spdk_nvme_ns_get_sector_size(ns) > g_io_size_bytes) {
185 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
186 "ns size %" PRIu64 " / block size %u for I/O size %u\n",
187 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
188 spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
189 return;
190 }
191
192 entry = malloc(sizeof(struct ns_entry));
193 if (entry == NULL) {
194 perror("ns_entry malloc");
195 exit(1);
196 }
197
198 entry->type = ENTRY_TYPE_NVME_NS;
199 entry->u.nvme.ctrlr = ctrlr;
200 entry->u.nvme.ns = ns;
201
202 entry->size_in_ios = spdk_nvme_ns_get_size(ns) /
203 g_io_size_bytes;
204 entry->io_size_blocks = g_io_size_bytes / spdk_nvme_ns_get_sector_size(ns);
205
206 snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
207
208 g_num_namespaces++;
209 entry->next = g_namespaces;
210 g_namespaces = entry;
211 }
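/*
 * Sizing example (illustrative): with -s 4096 against a namespace that has
 * 512-byte sectors, io_size_blocks above becomes 4096 / 512 = 8 blocks per
 * I/O, and size_in_ios is the namespace capacity divided by 4096, i.e. the
 * number of distinct offsets the offset generator may choose from.
 */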
212
213 static void
214 unregister_namespaces(void)
215 {
216 struct ns_entry *entry = g_namespaces;
217
218 while (entry) {
219 struct ns_entry *next = entry->next;
220 free(entry);
221 entry = next;
222 }
223 }
224
225 static void
226 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
227 {
228 if (spdk_nvme_cpl_is_error(cpl)) {
229 printf("enable_latency_tracking_complete failed\n");
230 }
231 g_outstanding_commands--;
232 }
233
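/*
 * Enable or disable the Intel vendor-specific latency tracking feature via a
 * Set Features admin command, then busy-poll the admin queue until the single
 * outstanding command completes.  Callers only invoke this after checking
 * that the controller reports the feature as supported.
 */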
234 static void
235 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
236 {
237 int res;
238 union spdk_nvme_intel_feat_latency_tracking latency_tracking;
239
240 if (enable) {
241 latency_tracking.bits.enable = 0x01;
242 } else {
243 latency_tracking.bits.enable = 0x00;
244 }
245
246 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
247 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
248 if (res) {
249 printf("fail to allocate nvme request.\n");
250 return;
251 }
252 g_outstanding_commands++;
253
254 while (g_outstanding_commands) {
255 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
256 }
257 }
258
259 static void
260 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
261 {
262 int nsid, num_ns;
263 struct spdk_nvme_ns *ns;
264 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
265 const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
266
267 if (entry == NULL) {
268 perror("ctrlr_entry malloc");
269 exit(1);
270 }
271
272 entry->latency_page = spdk_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page),
273 4096, NULL);
274 if (entry->latency_page == NULL) {
275 printf("Allocation error (latency page)\n");
276 exit(1);
277 }
278
279 snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
280
281 entry->ctrlr = ctrlr;
282 entry->next = g_controllers;
283 g_controllers = entry;
284
285 if (g_latency_tracking_enable &&
286 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
287 set_latency_tracking_feature(ctrlr, true);
288
289 num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
290 for (nsid = 1; nsid <= num_ns; nsid++) {
291 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
292 if (ns == NULL) {
293 continue;
294 }
295 register_ns(ctrlr, ns);
296 }
297
298 }
299
300 #if HAVE_LIBAIO
301 static int
302 register_aio_file(const char *path)
303 {
304 struct ns_entry *entry;
305
306 int flags, fd;
307 uint64_t size;
308 uint32_t blklen;
309
310 if (g_rw_percentage == 100) {
311 flags = O_RDONLY;
312 } else if (g_rw_percentage == 0) {
313 flags = O_WRONLY;
314 } else {
315 flags = O_RDWR;
316 }
317
318 flags |= O_DIRECT;
319
320 fd = open(path, flags);
321 if (fd < 0) {
322 fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno));
323 return -1;
324 }
325
326 size = spdk_fd_get_size(fd);
327 if (size == 0) {
328 fprintf(stderr, "Could not determine size of AIO device %s\n", path);
329 close(fd);
330 return -1;
331 }
332
333 blklen = spdk_fd_get_blocklen(fd);
334 if (blklen == 0) {
335 fprintf(stderr, "Could not determine block size of AIO device %s\n", path);
336 close(fd);
337 return -1;
338 }
339
340 /*
341 * TODO: This should really calculate the LCM of the current g_io_align and blklen.
342 * For now, it's fairly safe to just assume all block sizes are powers of 2.
343 */
344 if (g_io_align < blklen) {
345 g_io_align = blklen;
346 }
347
348 entry = malloc(sizeof(struct ns_entry));
349 if (entry == NULL) {
350 close(fd);
351 perror("aio ns_entry malloc");
352 return -1;
353 }
354
355 entry->type = ENTRY_TYPE_AIO_FILE;
356 entry->u.aio.fd = fd;
357 entry->size_in_ios = size / g_io_size_bytes;
358 entry->io_size_blocks = g_io_size_bytes / blklen;
359
360 snprintf(entry->name, sizeof(entry->name), "%s", path);
361
362 g_num_namespaces++;
363 entry->next = g_namespaces;
364 g_namespaces = entry;
365
366 return 0;
367 }
368
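/*
 * Build and submit one libaio request by hand (the equivalent of
 * io_prep_pread()/io_prep_pwrite() followed by io_submit()).  The perf_task
 * pointer is stashed in iocb->data so aio_check_io() can recover it from the
 * completed io_event and call task_complete().
 */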
369 static int
370 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, void *buf,
371 unsigned long nbytes, uint64_t offset, void *cb_ctx)
372 {
373 iocb->aio_fildes = fd;
374 iocb->aio_reqprio = 0;
375 iocb->aio_lio_opcode = cmd;
376 iocb->u.c.buf = buf;
377 iocb->u.c.nbytes = nbytes;
378 iocb->u.c.offset = offset;
379 iocb->data = cb_ctx;
380
381 if (io_submit(aio_ctx, 1, &iocb) < 0) {
382 		fprintf(stderr, "io_submit failed\n");
383 return -1;
384 }
385
386 return 0;
387 }
388
389 static void
390 aio_check_io(struct ns_worker_ctx *ns_ctx)
391 {
392 int count, i;
393 struct timespec timeout;
394
395 timeout.tv_sec = 0;
396 timeout.tv_nsec = 0;
397
398 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout);
399 if (count < 0) {
400 fprintf(stderr, "io_getevents error\n");
401 exit(1);
402 }
403
404 for (i = 0; i < count; i++) {
405 task_complete(ns_ctx->u.aio.events[i].data);
406 }
407 }
408 #endif /* HAVE_LIBAIO */
409
410 static void task_ctor(struct rte_mempool *mp, void *arg, void *__task, unsigned id)
411 {
412 struct perf_task *task = __task;
413 task->buf = spdk_zmalloc(g_io_size_bytes, g_io_align, NULL);
414 if (task->buf == NULL) {
415 fprintf(stderr, "task->buf spdk_zmalloc failed\n");
416 exit(1);
417 }
418 memset(task->buf, id % 8, g_io_size_bytes);
419 }
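/*
 * task_ctor runs once per element when the task pool is created in main(),
 * so every perf_task carries a pre-allocated, I/O-aligned buffer from
 * spdk_zmalloc() and the hot submission path never has to allocate memory.
 */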
420
421 static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);
422
423 static __thread unsigned int seed = 0;
424
425 static void
426 submit_single_io(struct ns_worker_ctx *ns_ctx)
427 {
428 struct perf_task *task = NULL;
429 uint64_t offset_in_ios;
430 int rc;
431 struct ns_entry *entry = ns_ctx->entry;
432
433 if (rte_mempool_get(task_pool, (void **)&task) != 0) {
434 fprintf(stderr, "task_pool rte_mempool_get failed\n");
435 exit(1);
436 }
437
438 task->ns_ctx = ns_ctx;
439
440 if (g_is_random) {
441 offset_in_ios = rand_r(&seed) % entry->size_in_ios;
442 } else {
443 offset_in_ios = ns_ctx->offset_in_ios++;
444 if (ns_ctx->offset_in_ios == entry->size_in_ios) {
445 ns_ctx->offset_in_ios = 0;
446 }
447 }
448
449 task->submit_tsc = spdk_get_ticks();
450
451 if ((g_rw_percentage == 100) ||
452 (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
453 #if HAVE_LIBAIO
454 if (entry->type == ENTRY_TYPE_AIO_FILE) {
455 rc = aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, task->buf,
456 g_io_size_bytes, offset_in_ios * g_io_size_bytes, task);
457 } else
458 #endif
459 {
460 rc = spdk_nvme_ns_cmd_read(entry->u.nvme.ns, ns_ctx->u.nvme.qpair, task->buf,
461 offset_in_ios * entry->io_size_blocks,
462 entry->io_size_blocks, io_complete, task, 0);
463 }
464 } else {
465 #if HAVE_LIBAIO
466 if (entry->type == ENTRY_TYPE_AIO_FILE) {
467 rc = aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, task->buf,
468 g_io_size_bytes, offset_in_ios * g_io_size_bytes, task);
469 } else
470 #endif
471 {
472 rc = spdk_nvme_ns_cmd_write(entry->u.nvme.ns, ns_ctx->u.nvme.qpair, task->buf,
473 offset_in_ios * entry->io_size_blocks,
474 entry->io_size_blocks, io_complete, task, 0);
475 }
476 }
477
478 if (rc != 0) {
479 fprintf(stderr, "starting I/O failed\n");
480 }
481
482 ns_ctx->current_queue_depth++;
483 }
484
485 static void
486 task_complete(struct perf_task *task)
487 {
488 struct ns_worker_ctx *ns_ctx;
489 uint64_t tsc_diff;
490
491 ns_ctx = task->ns_ctx;
492 ns_ctx->current_queue_depth--;
493 ns_ctx->io_completed++;
494 tsc_diff = spdk_get_ticks() - task->submit_tsc;
495 ns_ctx->total_tsc += tsc_diff;
496 if (ns_ctx->min_tsc > tsc_diff) {
497 ns_ctx->min_tsc = tsc_diff;
498 }
499 if (ns_ctx->max_tsc < tsc_diff) {
500 ns_ctx->max_tsc = tsc_diff;
501 }
502
503 rte_mempool_put(task_pool, task);
504
505 /*
506 * is_draining indicates when time has expired for the test run
507 * and we are just waiting for the previously submitted I/O
508 * to complete. In this case, do not submit a new I/O to replace
509 * the one just completed.
510 */
511 if (!ns_ctx->is_draining) {
512 submit_single_io(ns_ctx);
513 }
514 }
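/*
 * Latency accounting: each I/O records spdk_get_ticks() at submission and
 * again here at completion, accumulating total/min/max tick deltas per
 * ns_worker_ctx.  print_performance() later converts these tick counts to
 * microseconds using g_tsc_rate (ticks per second).
 */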
515
516 static void
517 io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
518 {
519 task_complete((struct perf_task *)ctx);
520 }
521
522 static void
523 check_io(struct ns_worker_ctx *ns_ctx)
524 {
525 #if HAVE_LIBAIO
526 if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
527 aio_check_io(ns_ctx);
528 } else
529 #endif
530 {
531 spdk_nvme_qpair_process_completions(ns_ctx->u.nvme.qpair, g_max_completions);
532 }
533 }
534
535 static void
536 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
537 {
538 while (queue_depth-- > 0) {
539 submit_single_io(ns_ctx);
540 }
541 }
542
543 static void
544 drain_io(struct ns_worker_ctx *ns_ctx)
545 {
546 ns_ctx->is_draining = true;
547 while (ns_ctx->current_queue_depth > 0) {
548 check_io(ns_ctx);
549 }
550 }
551
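/*
 * Each (worker, namespace) pair gets its own I/O queue pair or its own
 * libaio context sized to the configured queue depth, so the submission and
 * completion paths run without any cross-core locking.
 */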
552 static int
553 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
554 {
555 if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
556 #ifdef HAVE_LIBAIO
557 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event));
558 if (!ns_ctx->u.aio.events) {
559 return -1;
560 }
561 ns_ctx->u.aio.ctx = 0;
562 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) {
563 free(ns_ctx->u.aio.events);
564 perror("io_setup");
565 return -1;
566 }
567 #endif
568 } else {
569 /*
570 * TODO: If a controller has multiple namespaces, they could all use the same queue.
571 * For now, give each namespace/thread combination its own queue.
572 */
573 ns_ctx->u.nvme.qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_ctx->entry->u.nvme.ctrlr, 0);
574 if (!ns_ctx->u.nvme.qpair) {
575 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
576 return -1;
577 }
578 }
579
580 return 0;
581 }
582
583 static void
584 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
585 {
586 if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
587 #ifdef HAVE_LIBAIO
588 io_destroy(ns_ctx->u.aio.ctx);
589 free(ns_ctx->u.aio.events);
590 #endif
591 } else {
592 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair);
593 }
594 }
595
596 static int
597 work_fn(void *arg)
598 {
599 uint64_t tsc_end;
600 struct worker_thread *worker = (struct worker_thread *)arg;
601 struct ns_worker_ctx *ns_ctx = NULL;
602
603 printf("Starting thread on core %u\n", worker->lcore);
604
605 /* Allocate a queue pair for each namespace. */
606 ns_ctx = worker->ns_ctx;
607 while (ns_ctx != NULL) {
608 if (init_ns_worker_ctx(ns_ctx) != 0) {
609 printf("ERROR: init_ns_worker_ctx() failed\n");
610 return 1;
611 }
612 ns_ctx = ns_ctx->next;
613 }
614
615 tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate;
616
617 /* Submit initial I/O for each namespace. */
618 ns_ctx = worker->ns_ctx;
619 while (ns_ctx != NULL) {
620 submit_io(ns_ctx, g_queue_depth);
621 ns_ctx = ns_ctx->next;
622 }
623
624 while (1) {
625 /*
626 * Check for completed I/O for each controller. A new
627 * I/O will be submitted in the io_complete callback
628 * to replace each I/O that is completed.
629 */
630 ns_ctx = worker->ns_ctx;
631 while (ns_ctx != NULL) {
632 check_io(ns_ctx);
633 ns_ctx = ns_ctx->next;
634 }
635
636 if (spdk_get_ticks() > tsc_end) {
637 break;
638 }
639 }
640
641 ns_ctx = worker->ns_ctx;
642 while (ns_ctx != NULL) {
643 drain_io(ns_ctx);
644 cleanup_ns_worker_ctx(ns_ctx);
645 ns_ctx = ns_ctx->next;
646 }
647
648 return 0;
649 }
650
651 static void usage(char *program_name)
652 {
653 printf("%s options", program_name);
654 #if HAVE_LIBAIO
655 printf(" [AIO device(s)]...");
656 #endif
657 printf("\n");
658 printf("\t[-q io depth]\n");
659 printf("\t[-s io size in bytes]\n");
660 printf("\t[-w io pattern type, must be one of\n");
661 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
662 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
663 printf("\t[-l enable latency tracking, default: disabled]\n");
664 printf("\t[-t time in seconds]\n");
665 printf("\t[-c core mask for I/O submission/completion.]\n");
666 	printf("\t\t(default: 1)\n");
667 printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
668 printf("\t Format: 'key:value [key:value] ...'\n");
669 printf("\t Keys:\n");
670 printf("\t trtype Transport type (e.g. PCIe, RDMA)\n");
671 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n");
672 printf("\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
673 printf("\t trsvcid Transport service identifier (e.g. 4420)\n");
674 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
675 printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
676 printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
677 printf("\t[-d DPDK huge memory size in MB.]\n");
678 printf("\t[-m max completions per poll]\n");
679 printf("\t\t(default: 0 - unlimited)\n");
680 printf("\t[-i shared memory group ID]\n");
681 }
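/*
 * Example invocations (illustrative only; adjust devices, cores and targets
 * to your setup):
 *
 *   4 KiB random reads, queue depth 128, 60 seconds, core 0:
 *     perf -q 128 -s 4096 -w randread -t 60 -c 0x1
 *
 *   70/30 random read/write mix against an NVMe-oF RDMA target:
 *     perf -q 32 -s 4096 -w randrw -M 70 -t 60 \
 *          -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'
 */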
682
683 static void
684 print_performance(void)
685 {
686 uint64_t total_io_completed;
687 float io_per_second, mb_per_second, average_latency, min_latency, max_latency;
688 float total_io_per_second, total_mb_per_second;
689 float sum_ave_latency, sum_min_latency, sum_max_latency;
690 int ns_count;
691 struct worker_thread *worker;
692 struct ns_worker_ctx *ns_ctx;
693
694 total_io_per_second = 0;
695 total_mb_per_second = 0;
696 total_io_completed = 0;
697 sum_ave_latency = 0;
698 sum_min_latency = 0;
699 sum_max_latency = 0;
700 ns_count = 0;
701
702 printf("========================================================\n");
703 printf("%103s\n", "Latency(us)");
704 printf("%-55s: %10s %10s %10s %10s %10s\n",
705 "Device Information", "IOPS", "MB/s", "Average", "min", "max");
706
707 worker = g_workers;
708 while (worker) {
709 ns_ctx = worker->ns_ctx;
710 while (ns_ctx) {
711 io_per_second = (float)ns_ctx->io_completed / g_time_in_sec;
712 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024);
713 average_latency = (float)(ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate;
714 min_latency = (float)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate;
715 max_latency = (float)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate;
716 printf("%-43.43s from core %u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
717 ns_ctx->entry->name, worker->lcore,
718 io_per_second, mb_per_second,
719 average_latency, min_latency, max_latency);
720 total_io_per_second += io_per_second;
721 total_mb_per_second += mb_per_second;
722 total_io_completed += ns_ctx->io_completed;
723 sum_ave_latency += average_latency;
724 sum_min_latency += min_latency;
725 sum_max_latency += max_latency;
726 ns_count++;
727 ns_ctx = ns_ctx->next;
728 }
729 worker = worker->next;
730 }
731
732 assert(ns_count != 0);
733 printf("========================================================\n");
734 printf("%-55s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
735 "Total", total_io_per_second, total_mb_per_second,
736 sum_ave_latency / ns_count, sum_min_latency / ns_count,
737 sum_max_latency / ns_count);
738 printf("\n");
739 }
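/*
 * Note: the "Total" latency columns above are simple means of the
 * per-namespace average/min/max values (sum / ns_count); they are not
 * weighted by each namespace's completed I/O count.
 */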
740
741 static void
742 print_latency_page(struct ctrlr_entry *entry)
743 {
744 int i;
745
746 printf("\n");
747 printf("%s\n", entry->name);
748 printf("--------------------------------------------------------\n");
749
750 for (i = 0; i < 32; i++) {
751 if (entry->latency_page->buckets_32us[i])
752 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]);
753 }
754 for (i = 0; i < 31; i++) {
755 if (entry->latency_page->buckets_1ms[i])
756 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]);
757 }
758 for (i = 0; i < 31; i++) {
759 if (entry->latency_page->buckets_32ms[i])
760 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
761 entry->latency_page->buckets_32ms[i]);
762 }
763 }
764
765 static void
766 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
767 {
768 struct ctrlr_entry *ctrlr;
769
770 printf("%s Latency Statistics:\n", op_name);
771 printf("========================================================\n");
772 ctrlr = g_controllers;
773 while (ctrlr) {
774 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
775 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG,
776 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0,
777 enable_latency_tracking_complete,
778 NULL)) {
779 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
780 exit(1);
781 }
782
783 g_outstanding_commands++;
784 } else {
785 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name);
786 }
787 ctrlr = ctrlr->next;
788 }
789
790 while (g_outstanding_commands) {
791 ctrlr = g_controllers;
792 while (ctrlr) {
793 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
794 ctrlr = ctrlr->next;
795 }
796 }
797
798 ctrlr = g_controllers;
799 while (ctrlr) {
800 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
801 print_latency_page(ctrlr);
802 }
803 ctrlr = ctrlr->next;
804 }
805 printf("\n");
806 }
807
808 static void
809 print_stats(void)
810 {
811 print_performance();
812 if (g_latency_tracking_enable) {
813 if (g_rw_percentage != 0) {
814 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
815 }
816 if (g_rw_percentage != 100) {
817 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
818 }
819 }
820 }
821
822 static void
823 unregister_trids(void)
824 {
825 struct trid_entry *trid_entry, *tmp;
826
827 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
828 free(trid_entry);
829 }
830 }
831
832 static int
833 add_trid(const char *trid_str)
834 {
835 struct trid_entry *trid_entry;
836 struct spdk_nvme_transport_id *trid;
837
838 trid_entry = calloc(1, sizeof(*trid_entry));
839 if (trid_entry == NULL) {
840 return -1;
841 }
842
843 trid = &trid_entry->trid;
844 memset(trid, 0, sizeof(*trid));
845 trid->trtype = SPDK_NVME_TRANSPORT_PCIE;
846 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
847
848 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
849 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
850 free(trid_entry);
851 return 1;
852 }
853
854 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
855 return 0;
856 }
857
858 static int
859 parse_args(int argc, char **argv)
860 {
861 const char *workload_type;
862 int op;
863 bool mix_specified = false;
864
865 	/* default values */
866 g_queue_depth = 0;
867 g_io_size_bytes = 0;
868 workload_type = NULL;
869 g_time_in_sec = 0;
870 g_rw_percentage = -1;
871 g_core_mask = NULL;
872 g_max_completions = 0;
873
874 while ((op = getopt(argc, argv, "c:d:i:lm:q:r:s:t:w:M:")) != -1) {
875 switch (op) {
876 case 'c':
877 g_core_mask = optarg;
878 break;
879 case 'd':
880 g_dpdk_mem = atoi(optarg);
881 break;
882 case 'i':
883 g_shm_id = atoi(optarg);
884 break;
885 case 'l':
886 g_latency_tracking_enable = true;
887 break;
888 case 'm':
889 g_max_completions = atoi(optarg);
890 break;
891 case 'q':
892 g_queue_depth = atoi(optarg);
893 break;
894 case 'r':
895 if (add_trid(optarg)) {
896 usage(argv[0]);
897 return 1;
898 }
899 break;
900 case 's':
901 g_io_size_bytes = atoi(optarg);
902 break;
903 case 't':
904 g_time_in_sec = atoi(optarg);
905 break;
906 case 'w':
907 workload_type = optarg;
908 break;
909 case 'M':
910 g_rw_percentage = atoi(optarg);
911 mix_specified = true;
912 break;
913 default:
914 usage(argv[0]);
915 return 1;
916 }
917 }
918
919 if (!g_queue_depth) {
920 usage(argv[0]);
921 return 1;
922 }
923 if (!g_io_size_bytes) {
924 usage(argv[0]);
925 return 1;
926 }
927 if (!workload_type) {
928 usage(argv[0]);
929 return 1;
930 }
931 if (!g_time_in_sec) {
932 usage(argv[0]);
933 return 1;
934 }
935
936 if (strcmp(workload_type, "read") &&
937 strcmp(workload_type, "write") &&
938 strcmp(workload_type, "randread") &&
939 strcmp(workload_type, "randwrite") &&
940 strcmp(workload_type, "rw") &&
941 strcmp(workload_type, "randrw")) {
942 fprintf(stderr,
943 "io pattern type must be one of\n"
944 "(read, write, randread, randwrite, rw, randrw)\n");
945 return 1;
946 }
947
948 if (!strcmp(workload_type, "read") ||
949 !strcmp(workload_type, "randread")) {
950 g_rw_percentage = 100;
951 }
952
953 if (!strcmp(workload_type, "write") ||
954 !strcmp(workload_type, "randwrite")) {
955 g_rw_percentage = 0;
956 }
957
958 if (!strcmp(workload_type, "read") ||
959 !strcmp(workload_type, "randread") ||
960 !strcmp(workload_type, "write") ||
961 !strcmp(workload_type, "randwrite")) {
962 if (mix_specified) {
963 			fprintf(stderr, "Ignoring -M option; -M only applies to the"
964 				" rw and randrw workloads.\n");
965 }
966 }
967
968 if (!strcmp(workload_type, "rw") ||
969 !strcmp(workload_type, "randrw")) {
970 if (g_rw_percentage < 0 || g_rw_percentage > 100) {
971 fprintf(stderr,
972 "-M must be specified to value from 0 to 100 "
973 "for rw or randrw.\n");
974 return 1;
975 }
976 }
977
978 if (!strcmp(workload_type, "read") ||
979 !strcmp(workload_type, "write") ||
980 !strcmp(workload_type, "rw")) {
981 g_is_random = 0;
982 } else {
983 g_is_random = 1;
984 }
985
986 if (TAILQ_EMPTY(&g_trid_list)) {
987 /* If no transport IDs specified, default to enumerating all local PCIe devices */
988 add_trid("trtype:PCIe");
989 }
990
991 g_aio_optind = optind;
992 optind = 1;
993 return 0;
994 }
995
996 static int
997 register_workers(void)
998 {
999 uint32_t i;
1000 struct worker_thread *worker;
1001
1002 g_workers = NULL;
1003 g_num_workers = 0;
1004
1005 SPDK_ENV_FOREACH_CORE(i) {
1006 worker = calloc(1, sizeof(*worker));
1007 if (worker == NULL) {
1008 fprintf(stderr, "Unable to allocate worker\n");
1009 return -1;
1010 }
1011
1012 worker->lcore = i;
1013 worker->next = g_workers;
1014 g_workers = worker;
1015 g_num_workers++;
1016 }
1017
1018 return 0;
1019 }
1020
1021 static void
1022 unregister_workers(void)
1023 {
1024 struct worker_thread *worker = g_workers;
1025
1026 /* Free namespace context and worker thread */
1027 while (worker) {
1028 struct worker_thread *next_worker = worker->next;
1029 struct ns_worker_ctx *ns_ctx = worker->ns_ctx;
1030
1031 while (ns_ctx) {
1032 struct ns_worker_ctx *next_ns_ctx = ns_ctx->next;
1033 free(ns_ctx);
1034 ns_ctx = next_ns_ctx;
1035 }
1036
1037 free(worker);
1038 worker = next_worker;
1039 }
1040 }
1041
1042 static bool
1043 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1044 struct spdk_nvme_ctrlr_opts *opts)
1045 {
1046 struct spdk_pci_addr pci_addr;
1047 struct spdk_pci_device *pci_dev;
1048 struct spdk_pci_id pci_id;
1049
1050 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
1051 printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n",
1052 trid->traddr, trid->trsvcid,
1053 trid->subnqn);
1054 } else {
1055 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
1056 return false;
1057 }
1058
1059 pci_dev = spdk_pci_get_device(&pci_addr);
1060 if (!pci_dev) {
1061 return false;
1062 }
1063
1064 pci_id = spdk_pci_device_get_id(pci_dev);
1065
1066 printf("Attaching to NVMe Controller at %s [%04x:%04x]\n",
1067 trid->traddr,
1068 pci_id.vendor_id, pci_id.device_id);
1069 }
1070
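	/*
	 * Request an I/O queue one entry larger than the target queue depth;
	 * an NVMe queue of size N can hold at most N-1 outstanding commands,
	 * so the extra slot lets the full -q depth stay in flight.
	 */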
1071 opts->io_queue_size = g_queue_depth + 1;
1072
1073 return true;
1074 }
1075
1076 static void
1077 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1078 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1079 {
1080 struct spdk_pci_addr pci_addr;
1081 struct spdk_pci_device *pci_dev;
1082 struct spdk_pci_id pci_id;
1083
1084 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
1085 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
1086 trid->traddr, trid->trsvcid,
1087 trid->subnqn);
1088 } else {
1089 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
1090 return;
1091 }
1092
1093 pci_dev = spdk_pci_get_device(&pci_addr);
1094 if (!pci_dev) {
1095 return;
1096 }
1097
1098 pci_id = spdk_pci_device_get_id(pci_dev);
1099
1100 printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
1101 trid->traddr,
1102 pci_id.vendor_id, pci_id.device_id);
1103 }
1104
1105 register_ctrlr(ctrlr);
1106 }
1107
1108 static int
1109 register_controllers(void)
1110 {
1111 struct trid_entry *trid_entry;
1112
1113 printf("Initializing NVMe Controllers\n");
1114
1115 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
1116 if (spdk_nvme_probe(&trid_entry->trid, NULL, probe_cb, attach_cb, NULL) != 0) {
1117 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
1118 trid_entry->trid.traddr);
1119 return -1;
1120 }
1121 }
1122
1123 return 0;
1124 }
1125
1126 static void
1127 unregister_controllers(void)
1128 {
1129 struct ctrlr_entry *entry = g_controllers;
1130
1131 while (entry) {
1132 struct ctrlr_entry *next = entry->next;
1133 spdk_free(entry->latency_page);
1134 if (g_latency_tracking_enable &&
1135 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
1136 set_latency_tracking_feature(entry->ctrlr, false);
1137 spdk_nvme_detach(entry->ctrlr);
1138 free(entry);
1139 entry = next;
1140 }
1141 }
1142
1143 static int
1144 register_aio_files(int argc, char **argv)
1145 {
1146 #if HAVE_LIBAIO
1147 int i;
1148
1149 /* Treat everything after the options as files for AIO */
1150 for (i = g_aio_optind; i < argc; i++) {
1151 if (register_aio_file(argv[i]) != 0) {
1152 return 1;
1153 }
1154 }
1155 #endif /* HAVE_LIBAIO */
1156
1157 return 0;
1158 }
1159
1160 static int
1161 associate_workers_with_ns(void)
1162 {
1163 struct ns_entry *entry = g_namespaces;
1164 struct worker_thread *worker = g_workers;
1165 struct ns_worker_ctx *ns_ctx;
1166 int i, count;
1167
1168 count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
1169
1170 for (i = 0; i < count; i++) {
1171 if (entry == NULL) {
1172 break;
1173 }
1174
1175 ns_ctx = malloc(sizeof(struct ns_worker_ctx));
1176 if (!ns_ctx) {
1177 return -1;
1178 }
1179 memset(ns_ctx, 0, sizeof(*ns_ctx));
1180
1181 printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
1182 ns_ctx->min_tsc = UINT64_MAX;
1183 ns_ctx->entry = entry;
1184 ns_ctx->next = worker->ns_ctx;
1185 worker->ns_ctx = ns_ctx;
1186
1187 worker = worker->next;
1188 if (worker == NULL) {
1189 worker = g_workers;
1190 }
1191
1192 entry = entry->next;
1193 if (entry == NULL) {
1194 entry = g_namespaces;
1195 }
1196
1197 }
1198
1199 return 0;
1200 }
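/*
 * The loop above walks the namespace and worker lists in lock step for
 * max(num namespaces, num workers) iterations, wrapping whichever list is
 * shorter, so every namespace is assigned to at least one worker and every
 * worker gets at least one namespace when both lists are non-empty.
 */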
1201
1202 int main(int argc, char **argv)
1203 {
1204 int rc;
1205 struct worker_thread *worker, *master_worker;
1206 unsigned master_core;
1207 char task_pool_name[30];
1208 uint32_t task_count;
1209 struct spdk_env_opts opts;
1210
1211 rc = parse_args(argc, argv);
1212 if (rc != 0) {
1213 return rc;
1214 }
1215
1216 spdk_env_opts_init(&opts);
1217 opts.name = "perf";
1218 opts.shm_id = g_shm_id;
1219 if (g_core_mask) {
1220 opts.core_mask = g_core_mask;
1221 }
1222
1223 if (g_dpdk_mem) {
1224 opts.dpdk_mem_size = g_dpdk_mem;
1225 }
1226 spdk_env_init(&opts);
1227
1228 g_tsc_rate = spdk_get_ticks_hz();
1229
1230 if (register_workers() != 0) {
1231 rc = -1;
1232 goto cleanup;
1233 }
1234
1235 if (register_aio_files(argc, argv) != 0) {
1236 rc = -1;
1237 goto cleanup;
1238 }
1239
1240 if (register_controllers() != 0) {
1241 rc = -1;
1242 goto cleanup;
1243 }
1244
1245 if (associate_workers_with_ns() != 0) {
1246 rc = -1;
1247 goto cleanup;
1248 }
1249
1250 snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", getpid());
1251
1252 /*
1253 * The task_count will be dynamically calculated based on the
1254 * number of attached active namespaces(aio files), queue depth
1255 * and number of cores (workers) involved in the IO operations.
1256 */
1257 task_count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
1258 task_count *= g_queue_depth;
1259
1260 task_pool = rte_mempool_create(task_pool_name, task_count,
1261 sizeof(struct perf_task),
1262 0, 0, NULL, NULL, task_ctor, NULL,
1263 SOCKET_ID_ANY, 0);
1264 if (task_pool == NULL) {
1265 fprintf(stderr, "could not initialize task pool\n");
1266 rc = -1;
1267 goto cleanup;
1268 }
1269
1270 printf("Initialization complete. Launching workers.\n");
1271
1272 /* Launch all of the slave workers */
1273 master_core = rte_get_master_lcore();
1274 master_worker = NULL;
1275 worker = g_workers;
1276 while (worker != NULL) {
1277 if (worker->lcore != master_core) {
1278 rte_eal_remote_launch(work_fn, worker, worker->lcore);
1279 } else {
1280 assert(master_worker == NULL);
1281 master_worker = worker;
1282 }
1283 worker = worker->next;
1284 }
1285
1286 assert(master_worker != NULL);
1287 rc = work_fn(master_worker);
1288
1289 rte_eal_mp_wait_lcore();
1290
1291 print_stats();
1292
1293 cleanup:
1294 unregister_trids();
1295 unregister_namespaces();
1296 unregister_controllers();
1297 unregister_workers();
1298
1299 if (rc != 0) {
1300 		fprintf(stderr, "%s: errors occurred\n", argv[0]);
1301 }
1302
1303 return rc;
1304 }