ceph/src/spdk/examples/nvme/arbitration/arbitration.c

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright (c) Intel Corporation.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #include <stdio.h>
  35 #include <stdbool.h>
  36 #include <string.h>
  37 #include <unistd.h>
  38
  39 #include <rte_config.h>
  40 #include <rte_mempool.h>
  41 #include <rte_lcore.h>
  42
  43 #include "spdk/nvme.h"
  44 #include "spdk/env.h"
  45 #include "spdk/string.h"
  46 #include "spdk/nvme_intel.h"
  47
/* One attached NVMe controller; nodes are chained through `next` from g_controllers. */
struct ctrlr_entry {
        struct spdk_nvme_ctrlr                  *ctrlr;
        /* Buffer the latency log page is read into by print_latency_statistics(). */
        struct spdk_nvme_intel_rw_latency_page  latency_page;
        struct ctrlr_entry                      *next;
        /* Display name formatted from cdata->mn and cdata->sn in register_ctrlr(). */
        char                                    name[1024];
};
  54
/* One registered namespace; nodes are chained through `next` from g_namespaces. */
struct ns_entry {
        struct {
                struct spdk_nvme_ctrlr          *ctrlr;
                struct spdk_nvme_ns             *ns;
        } nvme;

        struct ns_entry                         *next;
        /* Configured I/O size divided by the namespace sector size. */
        uint32_t                                io_size_blocks;
        /* Namespace size divided by the configured I/O size. */
        uint64_t                                size_in_ios;
        /* Display name formatted from cdata->mn and cdata->sn in register_ns(). */
        char                                    name[1024];
};
  66
/* Per-worker state for one namespace; a worker's contexts are chained through `next`. */
struct ns_worker_ctx {
        struct ns_entry                         *entry;
        /* Incremented once per completed I/O in task_complete(). */
        uint64_t                                io_completed;
        /* Number of I/Os accounted as in flight on `qpair`. */
        uint64_t                                current_queue_depth;
        /* Next offset (in I/O-sized units) used by the sequential workload. */
        uint64_t                                offset_in_ios;
        /* Set by drain_io(); stops task_complete() from resubmitting. */
        bool                                    is_draining;
        /* I/O queue pair allocated in init_ns_worker_ctx(). */
        struct spdk_nvme_qpair                  *qpair;
        struct ns_worker_ctx                    *next;
};
  76
/* One in-flight I/O; instances come from task_pool. */
struct arb_task {
        /* Context the I/O was submitted on; set in submit_single_io(). */
        struct ns_worker_ctx                    *ns_ctx;
        /* Data buffer of io_size_bytes allocated by task_ctor(). */
        void                                    *buf;
};
  81
/* One worker; nodes are chained through `next` from g_workers. */
struct worker_thread {
        /* Head of this worker's list of namespace contexts. */
        struct ns_worker_ctx                    *ns_ctx;
        struct worker_thread                    *next;
        /* Core number reported when the worker starts. */
        unsigned                                lcore;
        /* Priority requested for every queue pair this worker allocates. */
        enum spdk_nvme_qprio                    qprio;
};
  88
/* Run-wide configuration and counters; the single instance is g_arbitration. */
struct arb_context {
        int                                     shm_id;                  /* -i option */
        /* Admin commands submitted whose completion callback has not yet run. */
        int                                     outstanding_commands;
        int                                     num_namespaces;          /* bumped by register_ns() */
        int                                     num_workers;
        int                                     rw_percentage;           /* -M option; 100 = reads only, 0 = writes only */
        int                                     is_random;               /* derived from -w in parse_args() */
        int                                     queue_depth;             /* -q option */
        int                                     time_in_sec;             /* -t option */
        int                                     io_count;                /* -n option */
        uint8_t                                 latency_tracking_enable; /* -l option */
        uint8_t                                 arbitration_mechanism;   /* -a option */
        uint8_t                                 arbitration_config;      /* -b option */
        uint32_t                                io_size_bytes;           /* -s option */
        uint32_t                                max_completions;         /* -m option */
        /* Multiplied by time_in_sec to get the run length in ticks (presumably ticks per second). */
        uint64_t                                tsc_rate;
        const char                              *core_mask;              /* -c option */
        const char                              *workload_type;          /* -w option */
};
 108
/*
 * One slot of the `features` table.
 * NOTE(review): presumably holds the result of a Get Features command and
 * whether it completed successfully; the code that fills it is not in this
 * part of the file, so confirm there.
 */
struct feature {
        uint32_t                                result;
        bool                                    valid;
};
 113
/* Pool that arb_task objects are taken from and returned to. */
static struct rte_mempool *task_pool            = NULL;

/* Heads of the singly linked lists of controllers, namespaces and workers. */
static struct ctrlr_entry *g_controllers        = NULL;
static struct ns_entry *g_namespaces            = NULL;
static struct worker_thread *g_workers          = NULL;

/* Table of `struct feature` slots (presumably indexed by feature identifier; confirm at the use site). */
static struct feature features[256];
 121
/*
 * Defaults for every run; parse_args() overrides them from the command line.
 * Members not listed here (for example is_random and tsc_rate) start as zero.
 */
static struct arb_context g_arbitration = {
        .shm_id                                 = -1,
        .outstanding_commands                   = 0,
        .num_workers                            = 0,
        .num_namespaces                         = 0,
        .rw_percentage                          = 50,
        .queue_depth                            = 64,
        .time_in_sec                            = 60,
        .io_count                               = 100000,
        .latency_tracking_enable                = 0,
        .arbitration_mechanism                  = SPDK_NVME_CC_AMS_RR,
        .arbitration_config                     = 0,
        .io_size_bytes                          = 131072,
        .max_completions                        = 0,
        /* Default 4 cores for urgent/high/medium/low */
        .core_mask                              = "0xf",
        .workload_type                          = "randrw",
};
 140
/*
 * For weighted round robin arbitration mechanism, the smaller value between
 * weight and burst will be picked to execute the commands in one queue.
 */
#define USER_SPECIFIED_HIGH_PRIORITY_WEIGHT     32
#define USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT   16
#define USER_SPECIFIED_LOW_PRIORITY_WEIGHT      8
#define USER_SPECIFIED_ARBITRATION_BURST        7       /* No limit */

/*
 * Description of dword for priority weight and arbitration burst
 * ------------------------------------------------------------------------------
 *     31 : 24      |       23 : 16      |    15 : 08      | 07 : 03  | 02 : 00
 * ------------------------------------------------------------------------------
 * High Prio Weight | Medium Prio Weight | Low Prio Weight | Reserved | Arb Burst
 * ------------------------------------------------------------------------------
 *
 * The priority weights are zero based value.
 */
/* Bit offsets of the three weight fields within the dword described above. */
#define SPDK_NVME_HIGH_PRIO_WEIGHT_SHIFT        24
#define SPDK_NVME_MED_PRIO_WEIGHT_SHIFT         16
#define SPDK_NVME_LOW_PRIO_WEIGHT_SHIFT         8
/* Field masks: each weight is 8 bits wide, the arbitration burst is 3 bits wide. */
#define SPDK_NVME_PRIO_WEIGHT_MASK              0xFF
#define SPDK_NVME_ARB_BURST_MASK                0x7

/* One past SPDK_NVME_QPRIO_LOW. */
#define SPDK_NVME_QPRIO_MAX                     (SPDK_NVME_QPRIO_LOW + 1)
 167
/* Forward declarations for functions that are used before their definitions. */

/* Per-I/O completion bookkeeping; resubmits a new I/O unless the context is draining. */
static void task_complete(struct arb_task *task);

/* NVMe completion callback for I/O commands; `ctx` is the struct arb_task. */
static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);

static void get_arb_feature(struct spdk_nvme_ctrlr *ctrlr);

static int set_arb_feature(struct spdk_nvme_ctrlr *ctrlr);

/* Returns a static human-readable name for a queue priority. */
static const char *print_qprio(enum spdk_nvme_qprio);
 177
 178
 179 static void
 180 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
 181 {
 182         struct ns_entry *entry;
 183         const struct spdk_nvme_ctrlr_data *cdata;
 184
 185         cdata = spdk_nvme_ctrlr_get_data(ctrlr);
 186
 187         if (!spdk_nvme_ns_is_active(ns)) {
 188                 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
 189                        cdata->mn, cdata->sn,
 190                        spdk_nvme_ns_get_id(ns));
 191                 return;
 192         }
 193
 194         if (spdk_nvme_ns_get_size(ns) < g_arbitration.io_size_bytes ||
 195             spdk_nvme_ns_get_sector_size(ns) > g_arbitration.io_size_bytes) {
 196                 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
 197                        "ns size %" PRIu64 " / block size %u for I/O size %u\n",
 198                        cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
 199                        spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns),
 200                        g_arbitration.io_size_bytes);
 201                 return;
 202         }
 203
 204         entry = malloc(sizeof(struct ns_entry));
 205         if (entry == NULL) {
 206                 perror("ns_entry malloc");
 207                 exit(1);
 208         }
 209
 210         entry->nvme.ctrlr = ctrlr;
 211         entry->nvme.ns = ns;
 212
 213         entry->size_in_ios = spdk_nvme_ns_get_size(ns) / g_arbitration.io_size_bytes;
 214         entry->io_size_blocks = g_arbitration.io_size_bytes / spdk_nvme_ns_get_sector_size(ns);
 215
 216         snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
 217
 218         g_arbitration.num_namespaces++;
 219         entry->next = g_namespaces;
 220         g_namespaces = entry;
 221 }
 222
 223 static void
 224 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
 225 {
 226         if (spdk_nvme_cpl_is_error(cpl)) {
 227                 printf("enable_latency_tracking_complete failed\n");
 228         }
 229         g_arbitration.outstanding_commands--;
 230 }
 231
 232 static void
 233 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
 234 {
 235         int res;
 236         union spdk_nvme_intel_feat_latency_tracking latency_tracking;
 237
 238         if (enable) {
 239                 latency_tracking.bits.enable = 0x01;
 240         } else {
 241                 latency_tracking.bits.enable = 0x00;
 242         }
 243
 244         res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
 245                                               latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
 246         if (res) {
 247                 printf("fail to allocate nvme request.\n");
 248                 return;
 249         }
 250         g_arbitration.outstanding_commands++;
 251
 252         while (g_arbitration.outstanding_commands) {
 253                 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
 254         }
 255 }
 256
 257 static void
 258 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
 259 {
 260         int nsid, num_ns;
 261         struct spdk_nvme_ns *ns;
 262         struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry));
 263         const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
 264
 265         if (entry == NULL) {
 266                 perror("ctrlr_entry malloc");
 267                 exit(1);
 268         }
 269
 270         snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
 271
 272         entry->ctrlr = ctrlr;
 273         entry->next = g_controllers;
 274         g_controllers = entry;
 275
 276         if ((g_arbitration.latency_tracking_enable != 0) &&
 277             spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
 278                 set_latency_tracking_feature(ctrlr, true);
 279
 280         num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
 281         for (nsid = 1; nsid <= num_ns; nsid++) {
 282                 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
 283                 if (ns == NULL) {
 284                         continue;
 285                 }
 286                 register_ns(ctrlr, ns);
 287         }
 288
 289         if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR) {
 290                 get_arb_feature(ctrlr);
 291
 292                 if (g_arbitration.arbitration_config != 0) {
 293                         set_arb_feature(ctrlr);
 294                         get_arb_feature(ctrlr);
 295                 }
 296         }
 297 }
 298
 299 static void
 300 task_ctor(struct rte_mempool *mp, void *arg, void *__task, unsigned id)
 301 {
 302         struct arb_task *task = __task;
 303         task->buf = spdk_zmalloc(g_arbitration.io_size_bytes, 0x200, NULL);
 304         if (task->buf == NULL) {
 305                 fprintf(stderr, "task->buf spdk_zmalloc failed\n");
 306                 exit(1);
 307         }
 308 }
 309
/* Per-thread state for rand_r(); used by submit_single_io() for offsets and the read/write mix. */
static __thread unsigned int seed = 0;
 311
 312 static void
 313 submit_single_io(struct ns_worker_ctx *ns_ctx)
 314 {
 315         struct arb_task         *task = NULL;
 316         uint64_t                offset_in_ios;
 317         int                     rc;
 318         struct ns_entry         *entry = ns_ctx->entry;
 319
 320         if (rte_mempool_get(task_pool, (void **)&task) != 0) {
 321                 fprintf(stderr, "task_pool rte_mempool_get failed\n");
 322                 exit(1);
 323         }
 324
 325         task->ns_ctx = ns_ctx;
 326
 327         if (g_arbitration.is_random) {
 328                 offset_in_ios = rand_r(&seed) % entry->size_in_ios;
 329         } else {
 330                 offset_in_ios = ns_ctx->offset_in_ios++;
 331                 if (ns_ctx->offset_in_ios == entry->size_in_ios) {
 332                         ns_ctx->offset_in_ios = 0;
 333                 }
 334         }
 335
 336         if ((g_arbitration.rw_percentage == 100) ||
 337             (g_arbitration.rw_percentage != 0 &&
 338              ((rand_r(&seed) % 100) < g_arbitration.rw_percentage))) {
 339                 rc = spdk_nvme_ns_cmd_read(entry->nvme.ns, ns_ctx->qpair, task->buf,
 340                                            offset_in_ios * entry->io_size_blocks,
 341                                            entry->io_size_blocks, io_complete, task, 0);
 342         } else {
 343                 rc = spdk_nvme_ns_cmd_write(entry->nvme.ns, ns_ctx->qpair, task->buf,
 344                                             offset_in_ios * entry->io_size_blocks,
 345                                             entry->io_size_blocks, io_complete, task, 0);
 346         }
 347
 348         if (rc != 0) {
 349                 fprintf(stderr, "starting I/O failed\n");
 350         }
 351
 352         ns_ctx->current_queue_depth++;
 353 }
 354
 355 static void
 356 task_complete(struct arb_task *task)
 357 {
 358         struct ns_worker_ctx    *ns_ctx;
 359
 360         ns_ctx = task->ns_ctx;
 361         ns_ctx->current_queue_depth--;
 362         ns_ctx->io_completed++;
 363
 364         rte_mempool_put(task_pool, task);
 365
 366         /*
 367          * is_draining indicates when time has expired for the test run
 368          * and we are just waiting for the previously submitted I/O
 369          * to complete.  In this case, do not submit a new I/O to replace
 370          * the one just completed.
 371          */
 372         if (!ns_ctx->is_draining) {
 373                 submit_single_io(ns_ctx);
 374         }
 375 }
 376
 377 static void
 378 io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
 379 {
 380         task_complete((struct arb_task *)ctx);
 381 }
 382
 383 static void
 384 check_io(struct ns_worker_ctx *ns_ctx)
 385 {
 386         spdk_nvme_qpair_process_completions(ns_ctx->qpair, g_arbitration.max_completions);
 387 }
 388
 389 static void
 390 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
 391 {
 392         while (queue_depth-- > 0) {
 393                 submit_single_io(ns_ctx);
 394         }
 395 }
 396
 397 static void
 398 drain_io(struct ns_worker_ctx *ns_ctx)
 399 {
 400         ns_ctx->is_draining = true;
 401         while (ns_ctx->current_queue_depth > 0) {
 402                 check_io(ns_ctx);
 403         }
 404 }
 405
 406 static int
 407 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx, enum spdk_nvme_qprio qprio)
 408 {
 409         ns_ctx->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_ctx->entry->nvme.ctrlr, qprio);
 410         if (!ns_ctx->qpair) {
 411                 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
 412                 return 1;
 413         }
 414
 415         return 0;
 416 }
 417
 418 static void
 419 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
 420 {
 421         spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair);
 422 }
 423
 424 static void
 425 cleanup(void)
 426 {
 427         struct ns_entry *entry                  = g_namespaces;
 428         struct ns_entry *next_entry             = NULL;
 429         struct worker_thread *worker            = g_workers;
 430         struct worker_thread *next_worker       = NULL;
 431         struct arb_task *task                   = NULL;
 432
 433         while (entry) {
 434                 next_entry = entry->next;
 435                 free(entry);
 436                 entry = next_entry;
 437         };
 438
 439         while (worker) {
 440                 next_worker = worker->next;
 441                 free(worker->ns_ctx);
 442                 free(worker);
 443                 worker = next_worker;
 444         };
 445
 446         if (rte_mempool_get(task_pool, (void **)&task) == 0) {
 447                 spdk_free(task->buf);
 448         }
 449
 450 }
 451
 452 static int
 453 work_fn(void *arg)
 454 {
 455         uint64_t tsc_end;
 456         struct worker_thread *worker = (struct worker_thread *)arg;
 457         struct ns_worker_ctx *ns_ctx = NULL;
 458
 459         printf("Starting thread on core %u with %s\n", worker->lcore, print_qprio(worker->qprio));
 460
 461         /* Allocate a queue pair for each namespace. */
 462         ns_ctx = worker->ns_ctx;
 463         while (ns_ctx != NULL) {
 464                 if (init_ns_worker_ctx(ns_ctx, worker->qprio) != 0) {
 465                         printf("ERROR: init_ns_worker_ctx() failed\n");
 466                         return 1;
 467                 }
 468                 ns_ctx = ns_ctx->next;
 469         }
 470
 471         tsc_end = spdk_get_ticks() + g_arbitration.time_in_sec * g_arbitration.tsc_rate;
 472
 473         /* Submit initial I/O for each namespace. */
 474         ns_ctx = worker->ns_ctx;
 475
 476         while (ns_ctx != NULL) {
 477                 submit_io(ns_ctx, g_arbitration.queue_depth);
 478                 ns_ctx = ns_ctx->next;
 479         }
 480
 481         while (1) {
 482                 /*
 483                  * Check for completed I/O for each controller. A new
 484                  * I/O will be submitted in the io_complete callback
 485                  * to replace each I/O that is completed.
 486                  */
 487                 ns_ctx = worker->ns_ctx;
 488                 while (ns_ctx != NULL) {
 489                         check_io(ns_ctx);
 490                         ns_ctx = ns_ctx->next;
 491                 }
 492
 493                 if (spdk_get_ticks() > tsc_end) {
 494                         break;
 495                 }
 496         }
 497
 498         ns_ctx = worker->ns_ctx;
 499         while (ns_ctx != NULL) {
 500                 drain_io(ns_ctx);
 501                 cleanup_ns_worker_ctx(ns_ctx);
 502                 ns_ctx = ns_ctx->next;
 503         }
 504
 505         return 0;
 506 }
 507
 508 static void
 509 usage(char *program_name)
 510 {
 511         printf("%s options", program_name);
 512         printf("\n");
 513         printf("\t[-q io depth]\n");
 514         printf("\t[-s io size in bytes]\n");
 515         printf("\t[-w io pattern type, must be one of\n");
 516         printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
 517         printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
 518         printf("\t[-l enable latency tracking, default: disabled]\n");
 519         printf("\t\t(0 - disabled; 1 - enabled)\n");
 520         printf("\t[-t time in seconds]\n");
 521         printf("\t[-c core mask for I/O submission/completion.]\n");
 522         printf("\t\t(default: 0xf - 4 cores)]\n");
 523         printf("\t[-m max completions per poll]\n");
 524         printf("\t\t(default: 0 - unlimited)\n");
 525         printf("\t[-a arbitration mechanism, must be one of below]\n");
 526         printf("\t\t(0, 1, 2)]\n");
 527         printf("\t\t(0: default round robin mechanism)]\n");
 528         printf("\t\t(1: weighted round robin mechanism)]\n");
 529         printf("\t\t(2: vendor specific mechanism)]\n");
 530         printf("\t[-b enable arbitration user configuration, default: disabled]\n");
 531         printf("\t\t(0 - disabled; 1 - enabled)\n");
 532         printf("\t[-n subjected IOs for performance comparison]\n");
 533         printf("\t[-i shared memory group ID]\n");
 534 }
 535
 536 static const char *
 537 print_qprio(enum spdk_nvme_qprio qprio)
 538 {
 539         switch (qprio) {
 540         case SPDK_NVME_QPRIO_URGENT:
 541                 return "urgent priority queue";
 542         case SPDK_NVME_QPRIO_HIGH:
 543                 return "high priority queue";
 544         case SPDK_NVME_QPRIO_MEDIUM:
 545                 return "medium priority queue";
 546         case SPDK_NVME_QPRIO_LOW:
 547                 return "low priority queue";
 548         default:
 549                 return "invalid priority queue";
 550         }
 551 }
 552
 553
 554 static void
 555 print_configuration(char *program_name)
 556 {
 557         printf("%s run with configuration:\n", program_name);
 558         printf("%s -q %d -s %d -w %s -M %d -l %d -t %d -c %s -m %d -a %d -b %d -i %d\n",
 559                program_name,
 560                g_arbitration.queue_depth,
 561                g_arbitration.io_size_bytes,
 562                g_arbitration.workload_type,
 563                g_arbitration.rw_percentage,
 564                g_arbitration.latency_tracking_enable,
 565                g_arbitration.time_in_sec,
 566                g_arbitration.core_mask,
 567                g_arbitration.max_completions,
 568                g_arbitration.arbitration_mechanism,
 569                g_arbitration.arbitration_config ,
 570                g_arbitration.io_count);
 571 }
 572
 573
 574 static void
 575 print_performance(void)
 576 {
 577         float io_per_second, sent_all_io_in_secs;
 578         struct worker_thread    *worker;
 579         struct ns_worker_ctx    *ns_ctx;
 580
 581         worker = g_workers;
 582         while (worker) {
 583                 ns_ctx = worker->ns_ctx;
 584                 while (ns_ctx) {
 585                         io_per_second = (float)ns_ctx->io_completed / g_arbitration.time_in_sec;
 586                         sent_all_io_in_secs = g_arbitration.io_count / io_per_second;
 587                         printf("%-43.43s core %u: %8.2f IO/s %8.2f secs/%d ios\n",
 588                                ns_ctx->entry->name, worker->lcore,
 589                                io_per_second, sent_all_io_in_secs, g_arbitration.io_count);
 590                         ns_ctx = ns_ctx->next;
 591                 }
 592                 worker = worker->next;
 593         }
 594         printf("========================================================\n");
 595
 596         printf("\n");
 597 }
 598
 599 static void
 600 print_latency_page(struct ctrlr_entry *entry)
 601 {
 602         int i;
 603
 604         printf("\n");
 605         printf("%s\n", entry->name);
 606         printf("--------------------------------------------------------\n");
 607
 608         for (i = 0; i < 32; i++) {
 609                 if (entry->latency_page.buckets_32us[i])
 610                         printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32,
 611                                entry->latency_page.buckets_32us[i]);
 612         }
 613         for (i = 0; i < 31; i++) {
 614                 if (entry->latency_page.buckets_1ms[i])
 615                         printf("Bucket %dms - %dms: %d\n", i + 1, i + 2,
 616                                entry->latency_page.buckets_1ms[i]);
 617         }
 618         for (i = 0; i < 31; i++) {
 619                 if (entry->latency_page.buckets_32ms[i])
 620                         printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
 621                                entry->latency_page.buckets_32ms[i]);
 622         }
 623 }
 624
 625 static void
 626 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
 627 {
 628         struct ctrlr_entry      *ctrlr;
 629
 630         printf("%s Latency Statistics:\n", op_name);
 631         printf("========================================================\n");
 632         ctrlr = g_controllers;
 633         while (ctrlr) {
 634                 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
 635                         if (spdk_nvme_ctrlr_cmd_get_log_page(
 636                                     ctrlr->ctrlr, log_page,
 637                                     SPDK_NVME_GLOBAL_NS_TAG,
 638                                     &ctrlr->latency_page,
 639                                     sizeof(struct spdk_nvme_intel_rw_latency_page),
 640                                     0,
 641                                     enable_latency_tracking_complete,
 642                                     NULL)) {
 643                                 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
 644                                 exit(1);
 645                         }
 646
 647                         g_arbitration.outstanding_commands++;
 648                 } else {
 649                         printf("Controller %s: %s latency statistics not supported\n",
 650                                ctrlr->name, op_name);
 651                 }
 652                 ctrlr = ctrlr->next;
 653         }
 654
 655         while (g_arbitration.outstanding_commands) {
 656                 ctrlr = g_controllers;
 657                 while (ctrlr) {
 658                         spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
 659                         ctrlr = ctrlr->next;
 660                 }
 661         }
 662
 663         ctrlr = g_controllers;
 664         while (ctrlr) {
 665                 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
 666                         print_latency_page(ctrlr);
 667                 }
 668                 ctrlr = ctrlr->next;
 669         }
 670         printf("\n");
 671 }
 672
 673 static void
 674 print_stats(void)
 675 {
 676         print_performance();
 677         if (g_arbitration.latency_tracking_enable) {
 678                 if (g_arbitration.rw_percentage != 0) {
 679                         print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
 680                 }
 681                 if (g_arbitration.rw_percentage != 100) {
 682                         print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
 683                 }
 684         }
 685 }
 686
 687 static int
 688 parse_args(int argc, char **argv)
 689 {
 690         const char *workload_type       = NULL;
 691         int op                          = 0;
 692         bool mix_specified              = false;
 693
 694         while ((op = getopt(argc, argv, "c:l:i:m:q:s:t:w:M:a:b:n:h")) != -1) {
 695                 switch (op) {
 696                 case 'c':
 697                         g_arbitration.core_mask = optarg;
 698                         break;
 699                 case 'i':
 700                         g_arbitration.shm_id = atoi(optarg);
 701                         break;
 702                 case 'l':
 703                         g_arbitration.latency_tracking_enable = atoi(optarg);
 704                         break;
 705                 case 'm':
 706                         g_arbitration.max_completions = atoi(optarg);
 707                         break;
 708                 case 'q':
 709                         g_arbitration.queue_depth = atoi(optarg);
 710                         break;
 711                 case 's':
 712                         g_arbitration.io_size_bytes = atoi(optarg);
 713                         break;
 714                 case 't':
 715                         g_arbitration.time_in_sec = atoi(optarg);
 716                         break;
 717                 case 'w':
 718                         g_arbitration.workload_type = optarg;
 719                         break;
 720                 case 'M':
 721                         g_arbitration.rw_percentage = atoi(optarg);
 722                         mix_specified = true;
 723                         break;
 724                 case 'a':
 725                         g_arbitration.arbitration_mechanism = atoi(optarg);
 726                         break;
 727                 case 'b':
 728                         g_arbitration.arbitration_config = atoi(optarg);
 729                         break;
 730                 case 'n':
 731                         g_arbitration.io_count = atoi(optarg);
 732                         break;
 733                 case 'h':
 734                 default:
 735                         usage(argv[0]);
 736                         return 1;
 737                 }
 738         }
 739
 740         workload_type = g_arbitration.workload_type;
 741
 742         if (strcmp(workload_type, "read") &&
 743             strcmp(workload_type, "write") &&
 744             strcmp(workload_type, "randread") &&
 745             strcmp(workload_type, "randwrite") &&
 746             strcmp(workload_type, "rw") &&
 747             strcmp(workload_type, "randrw")) {
 748                 fprintf(stderr,
 749                         "io pattern type must be one of\n"
 750                         "(read, write, randread, randwrite, rw, randrw)\n");
 751                 return 1;
 752         }
 753
 754         if (!strcmp(workload_type, "read") ||
 755             !strcmp(workload_type, "randread")) {
 756                 g_arbitration.rw_percentage = 100;
 757         }
 758
 759         if (!strcmp(workload_type, "write") ||
 760             !strcmp(workload_type, "randwrite")) {
 761                 g_arbitration.rw_percentage = 0;
 762         }
 763
 764         if (!strcmp(workload_type, "read") ||
 765             !strcmp(workload_type, "randread") ||
 766             !strcmp(workload_type, "write") ||
 767             !strcmp(workload_type, "randwrite")) {
 768                 if (mix_specified) {
 769                         fprintf(stderr, "Ignoring -M option... Please use -M option"
 770                                 " only when using rw or randrw.\n");
 771                 }
 772         }
 773
 774         if (!strcmp(workload_type, "rw") ||
 775             !strcmp(workload_type, "randrw")) {
 776                 if (g_arbitration.rw_percentage < 0 || g_arbitration.rw_percentage > 100) {
 777                         fprintf(stderr,
 778                                 "-M must be specified to value from 0 to 100 "
 779                                 "for rw or randrw.\n");
 780                         return 1;
 781                 }
 782         }
 783
 784         if (!strcmp(workload_type, "read") ||
 785             !strcmp(workload_type, "write") ||
 786             !strcmp(workload_type, "rw")) {
 787                 g_arbitration.is_random = 0;
 788         } else {
 789                 g_arbitration.is_random = 1;
 790         }
 791
 792         if (g_arbitration.latency_tracking_enable != 0 &&
 793             g_arbitration.latency_tracking_enable != 1) {
 794                 fprintf(stderr,
 795                         "-l must be specified to value 0 or 1.\n");
 796                 return 1;
 797         }
 798
 799         switch (g_arbitration.arbitration_mechanism) {
 800         case SPDK_NVME_CC_AMS_RR:
 801         case SPDK_NVME_CC_AMS_WRR:
 802         case SPDK_NVME_CC_AMS_VS:
 803                 break;
 804         default:
 805                 fprintf(stderr,
 806                         "-a must be specified to value 0, 1, or 7.\n");
 807                 return 1;
 808         }
 809
 810         if (g_arbitration.arbitration_config != 0 &&
 811             g_arbitration.arbitration_config != 1) {
 812                 fprintf(stderr,
 813                         "-b must be specified to value 0 or 1.\n");
 814                 return 1;
 815         } else if (g_arbitration.arbitration_config == 1 &&
 816                    g_arbitration.arbitration_mechanism != SPDK_NVME_CC_AMS_WRR) {
 817                 fprintf(stderr,
 818                         "-a must be specified to 1 (WRR) together.\n");
 819                 return 1;
 820         }
 821
 822         return 0;
 823 }
 824
 825 static int
 826 register_workers(void)
 827 {
 828         uint32_t i;
 829         struct worker_thread *worker;
 830         enum spdk_nvme_qprio qprio = SPDK_NVME_QPRIO_URGENT;
 831
 832         g_workers = NULL;
 833         g_arbitration.num_workers = 0;
 834
 835         SPDK_ENV_FOREACH_CORE(i) {
 836                 worker = calloc(1, sizeof(*worker));
 837                 if (worker == NULL) {
 838                         fprintf(stderr, "Unable to allocate worker\n");
 839                         return -1;
 840                 }
 841
 842                 worker->lcore = i;
 843                 worker->next = g_workers;
 844                 g_workers = worker;
 845                 g_arbitration.num_workers++;
 846
 847                 if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR) {
 848                         qprio++;
 849                 }
 850
 851                 worker->qprio = qprio % SPDK_NVME_QPRIO_MAX;
 852         }
 853
 854         return 0;
 855 }
 856
 857 static bool
 858 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
 859          struct spdk_nvme_ctrlr_opts *opts)
 860 {
 861         /* Update with user specified arbitration configuration */
 862         opts->arb_mechanism = g_arbitration.arbitration_mechanism;
 863
 864         printf("Attaching to %s\n", trid->traddr);
 865
 866         return true;
 867 }
 868
 869 static void
 870 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
 871           struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
 872 {
 873         printf("Attached to %s\n", trid->traddr);
 874
 875         /* Update with actual arbitration configuration in use */
 876         g_arbitration.arbitration_mechanism = opts->arb_mechanism;
 877
 878         register_ctrlr(ctrlr);
 879 }
 880
 881 static int
 882 register_controllers(void)
 883 {
 884         printf("Initializing NVMe Controllers\n");
 885
 886         if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL) != 0) {
 887                 fprintf(stderr, "spdk_nvme_probe() failed\n");
 888                 return 1;
 889         }
 890
 891         if (g_arbitration.num_namespaces == 0) {
 892                 fprintf(stderr, "No valid namespaces to continue IO testing\n");
 893                 return 1;
 894         }
 895
 896         return 0;
 897 }
 898
 899 static void
 900 unregister_controllers(void)
 901 {
 902         struct ctrlr_entry *entry = g_controllers;
 903
 904         while (entry) {
 905                 struct ctrlr_entry *next = entry->next;
 906                 if (g_arbitration.latency_tracking_enable &&
 907                     spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
 908                         set_latency_tracking_feature(entry->ctrlr, false);
 909                 spdk_nvme_detach(entry->ctrlr);
 910                 free(entry);
 911                 entry = next;
 912         }
 913 }
 914
 915 static int
 916 associate_workers_with_ns(void)
 917 {
 918         struct ns_entry         *entry = g_namespaces;
 919         struct worker_thread    *worker = g_workers;
 920         struct ns_worker_ctx    *ns_ctx;
 921         int                     i, count;
 922
 923         count = g_arbitration.num_namespaces > g_arbitration.num_workers ?
 924                 g_arbitration.num_namespaces : g_arbitration.num_workers;
 925
 926         for (i = 0; i < count; i++) {
 927                 if (entry == NULL) {
 928                         break;
 929                 }
 930
 931                 ns_ctx = malloc(sizeof(struct ns_worker_ctx));
 932                 if (!ns_ctx) {
 933                         return 1;
 934                 }
 935                 memset(ns_ctx, 0, sizeof(*ns_ctx));
 936
 937                 printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
 938                 ns_ctx->entry = entry;
 939                 ns_ctx->next = worker->ns_ctx;
 940                 worker->ns_ctx = ns_ctx;
 941
 942                 worker = worker->next;
 943                 if (worker == NULL) {
 944                         worker = g_workers;
 945                 }
 946
 947                 entry = entry->next;
 948                 if (entry == NULL) {
 949                         entry = g_namespaces;
 950                 }
 951
 952         }
 953
 954         return 0;
 955 }
 956
 957 static void
 958 get_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
 959 {
 960         struct feature *feature = cb_arg;
 961         int fid = feature - features;
 962
 963         if (spdk_nvme_cpl_is_error(cpl)) {
 964                 printf("get_feature(0x%02X) failed\n", fid);
 965         } else {
 966                 feature->result = cpl->cdw0;
 967                 feature->valid = true;
 968         }
 969
 970         g_arbitration.outstanding_commands--;
 971 }
 972
 973 static int
 974 get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t fid)
 975 {
 976         struct spdk_nvme_cmd cmd = {};
 977
 978         cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
 979         cmd.cdw10 = fid;
 980
 981         return spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0, get_feature_completion, &features[fid]);
 982 }
 983
 984 static void
 985 get_arb_feature(struct spdk_nvme_ctrlr *ctrlr)
 986 {
 987         get_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION);
 988
 989         g_arbitration.outstanding_commands++;
 990
 991         while (g_arbitration.outstanding_commands) {
 992                 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
 993         }
 994
 995         if (features[SPDK_NVME_FEAT_ARBITRATION].valid) {
 996                 uint32_t arb = features[SPDK_NVME_FEAT_ARBITRATION].result;
 997                 unsigned ab, lpw, mpw, hpw;
 998
 999                 ab = arb & SPDK_NVME_ARB_BURST_MASK;
1000                 lpw = ((arb >> SPDK_NVME_LOW_PRIO_WEIGHT_SHIFT) & SPDK_NVME_PRIO_WEIGHT_MASK) + 1;
1001                 mpw = ((arb >> SPDK_NVME_MED_PRIO_WEIGHT_SHIFT) & SPDK_NVME_PRIO_WEIGHT_MASK) + 1;
1002                 hpw = ((arb >> SPDK_NVME_HIGH_PRIO_WEIGHT_SHIFT) & SPDK_NVME_PRIO_WEIGHT_MASK) + 1;
1003
1004                 printf("Current Arbitration Configuration\n");
1005                 printf("===========\n");
1006                 printf("Arbitration Burst:           ");
1007                 if (ab == SPDK_NVME_ARB_BURST_MASK) {
1008                         printf("no limit\n");
1009                 } else {
1010                         printf("%u\n", 1u << ab);
1011                 }
1012
1013                 printf("Low Priority Weight:         %u\n", lpw);
1014                 printf("Medium Priority Weight:      %u\n", mpw);
1015                 printf("High Priority Weight:        %u\n", hpw);
1016                 printf("\n");
1017         }
1018 }
1019
1020 static void
1021 set_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
1022 {
1023         struct feature *feature = cb_arg;
1024         int fid = feature - features;
1025
1026         if (spdk_nvme_cpl_is_error(cpl)) {
1027                 printf("set_feature(0x%02X) failed\n", fid);
1028                 feature->valid = false;
1029         } else {
1030                 printf("Set Arbitration Feature Successfully\n");
1031         }
1032
1033         g_arbitration.outstanding_commands--;
1034 }
1035
1036 static int
1037 set_arb_feature(struct spdk_nvme_ctrlr *ctrlr)
1038 {
1039         int ret;
1040         struct spdk_nvme_cmd cmd = {};
1041         uint32_t arb = 0;
1042         unsigned ab, lpw, mpw, hpw;
1043
1044         cmd.opc = SPDK_NVME_OPC_SET_FEATURES;
1045         cmd.cdw10 = SPDK_NVME_FEAT_ARBITRATION;
1046
1047         g_arbitration.outstanding_commands = 0;
1048
1049         if (features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1050                 ab = USER_SPECIFIED_ARBITRATION_BURST & SPDK_NVME_ARB_BURST_MASK;
1051                 hpw = USER_SPECIFIED_HIGH_PRIORITY_WEIGHT << SPDK_NVME_HIGH_PRIO_WEIGHT_SHIFT;
1052                 mpw = USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT << SPDK_NVME_MED_PRIO_WEIGHT_SHIFT;
1053                 lpw = USER_SPECIFIED_LOW_PRIORITY_WEIGHT << SPDK_NVME_LOW_PRIO_WEIGHT_SHIFT;
1054                 arb = hpw | mpw | lpw | ab;
1055                 cmd.cdw11 = arb;
1056         }
1057
1058         ret = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0,
1059                                             set_feature_completion, &features[SPDK_NVME_FEAT_ARBITRATION]);
1060         if (ret) {
1061                 printf("Set Arbitration Feature: Failed 0x%x\n", ret);
1062                 return 1;
1063         }
1064
1065         g_arbitration.outstanding_commands++;
1066
1067         while (g_arbitration.outstanding_commands) {
1068                 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1069         }
1070
1071         if (!features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1072                 printf("Set Arbitration Feature failed and use default configuration\n");
1073         }
1074
1075         return 0;
1076 }
1077
1078 int
1079 main(int argc, char **argv)
1080 {
1081         int rc;
1082         struct worker_thread *worker, *master_worker;
1083         unsigned master_core;
1084         char task_pool_name[30];
1085         uint32_t task_count;
1086         struct spdk_env_opts opts;
1087
1088         rc = parse_args(argc, argv);
1089         if (rc != 0) {
1090                 return rc;
1091         }
1092
1093         spdk_env_opts_init(&opts);
1094         opts.name = "arb";
1095         opts.core_mask = g_arbitration.core_mask;
1096         opts.shm_id = g_arbitration.shm_id;
1097         spdk_env_init(&opts);
1098
1099         g_arbitration.tsc_rate = spdk_get_ticks_hz();
1100
1101         if (register_workers() != 0) {
1102                 return 1;
1103         }
1104
1105         if (register_controllers() != 0) {
1106                 return 1;
1107         }
1108
1109         if (associate_workers_with_ns() != 0) {
1110                 return 1;
1111         }
1112
1113         snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", getpid());
1114
1115         /*
1116          * The task_count will be dynamically calculated based on the
1117          * number of attached active namespaces, queue depth and number
1118          * of cores (workers) involved in the IO perations.
1119          */
1120         task_count = g_arbitration.num_namespaces > g_arbitration.num_workers ?
1121                      g_arbitration.num_namespaces : g_arbitration.num_workers;
1122         task_count *= g_arbitration.queue_depth;
1123
1124         task_pool = rte_mempool_create(task_pool_name, task_count,
1125                                        sizeof(struct arb_task),
1126                                        0, 0, NULL, NULL, task_ctor, NULL,
1127                                        SOCKET_ID_ANY, 0);
1128         if (task_pool == NULL) {
1129                 fprintf(stderr, "could not initialize task pool\n");
1130                 return 1;
1131         }
1132
1133         print_configuration(argv[0]);
1134
1135         printf("Initialization complete. Launching workers.\n");
1136
1137         /* Launch all of the slave workers */
1138         master_core = rte_get_master_lcore();
1139         master_worker = NULL;
1140         worker = g_workers;
1141         while (worker != NULL) {
1142                 if (worker->lcore != master_core) {
1143                         rte_eal_remote_launch(work_fn, worker, worker->lcore);
1144                 } else {
1145                         assert(master_worker == NULL);
1146                         master_worker = worker;
1147                 }
1148                 worker = worker->next;
1149         }
1150
1151         assert(master_worker != NULL);
1152         rc = work_fn(master_worker);
1153
1154         rte_eal_mp_wait_lcore();
1155
1156         print_stats();
1157
1158         unregister_controllers();
1159
1160         cleanup();
1161
1162         if (rc != 0) {
1163                 fprintf(stderr, "%s: errors occured\n", argv[0]);
1164         }
1165
1166         return rc;
1167 }