ceph/src/spdk/examples/nvme/perf/perf.c
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <stdio.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <unistd.h>
38
39 #include <rte_config.h>
40 #include <rte_mempool.h>
41 #include <rte_lcore.h>
42
43 #include "spdk/fd.h"
44 #include "spdk/nvme.h"
45 #include "spdk/env.h"
46 #include "spdk/queue.h"
47 #include "spdk/string.h"
48 #include "spdk/nvme_intel.h"
49
50 #if HAVE_LIBAIO
51 #include <libaio.h>
52 #include <sys/stat.h>
53 #include <fcntl.h>
54 #endif
55
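/*
 * Overview: this example tool measures raw NVMe (and, when built with
 * libaio, Linux AIO file) I/O performance.  Each worker thread runs on its
 * own lcore, keeps a fixed queue depth of reads and/or writes outstanding
 * against its assigned namespaces for -t seconds, and the program then
 * reports per-namespace IOPS, bandwidth and TSC-derived latency.
 */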
56 struct ctrlr_entry {
57 struct spdk_nvme_ctrlr *ctrlr;
58 struct spdk_nvme_intel_rw_latency_page *latency_page;
59 struct ctrlr_entry *next;
60 char name[1024];
61 };
62
63 enum entry_type {
64 ENTRY_TYPE_NVME_NS,
65 ENTRY_TYPE_AIO_FILE,
66 };
67
68 struct ns_entry {
69 enum entry_type type;
70
71 union {
72 struct {
73 struct spdk_nvme_ctrlr *ctrlr;
74 struct spdk_nvme_ns *ns;
75 } nvme;
76 #if HAVE_LIBAIO
77 struct {
78 int fd;
79 } aio;
80 #endif
81 } u;
82
83 struct ns_entry *next;
84 uint32_t io_size_blocks;
85 uint64_t size_in_ios;
86 char name[1024];
87 };
88
89 struct ns_worker_ctx {
90 struct ns_entry *entry;
91 uint64_t io_completed;
92 uint64_t total_tsc;
93 uint64_t min_tsc;
94 uint64_t max_tsc;
95 uint64_t current_queue_depth;
96 uint64_t offset_in_ios;
97 bool is_draining;
98
99 union {
100 struct {
101 struct spdk_nvme_qpair *qpair;
102 } nvme;
103
104 #if HAVE_LIBAIO
105 struct {
106 struct io_event *events;
107 io_context_t ctx;
108 } aio;
109 #endif
110 } u;
111
112 struct ns_worker_ctx *next;
113 };
114
115 struct perf_task {
116 struct ns_worker_ctx *ns_ctx;
117 void *buf;
118 uint64_t submit_tsc;
119 #if HAVE_LIBAIO
120 struct iocb iocb;
121 #endif
122 };
123
124 struct worker_thread {
125 struct ns_worker_ctx *ns_ctx;
126 struct worker_thread *next;
127 unsigned lcore;
128 };
129
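/*
 * Global object model: g_controllers is a singly linked list of attached
 * controllers, g_namespaces a list of usable namespaces (or AIO files), and
 * g_workers a list with one worker_thread per enabled lcore.  Each worker
 * owns a list of ns_worker_ctx structures, one per namespace it drives,
 * holding that worker's queue pair (or AIO context) and its statistics.
 */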
130 static int g_outstanding_commands;
131
132 static bool g_latency_tracking_enable = false;
133
134 static struct rte_mempool *task_pool;
135
136 static struct ctrlr_entry *g_controllers = NULL;
137 static struct ns_entry *g_namespaces = NULL;
138 static int g_num_namespaces = 0;
139 static struct worker_thread *g_workers = NULL;
140 static int g_num_workers = 0;
141
142 static uint64_t g_tsc_rate;
143
144 static uint32_t g_io_align = 0x200;
145 static uint32_t g_io_size_bytes;
146 static int g_rw_percentage;
147 static int g_is_random;
148 static int g_queue_depth;
149 static int g_time_in_sec;
150 static uint32_t g_max_completions;
151 static int g_dpdk_mem;
152 static int g_shm_id = -1;
153
154 static const char *g_core_mask;
155
156 struct trid_entry {
157 struct spdk_nvme_transport_id trid;
158 TAILQ_ENTRY(trid_entry) tailq;
159 };
160
161 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);
162
163 static int g_aio_optind; /* Index of first AIO filename in argv */
164
165 static void
166 task_complete(struct perf_task *task);
167
168 static void
169 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
170 {
171 struct ns_entry *entry;
172 const struct spdk_nvme_ctrlr_data *cdata;
173
174 cdata = spdk_nvme_ctrlr_get_data(ctrlr);
175
176 if (!spdk_nvme_ns_is_active(ns)) {
177 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
178 cdata->mn, cdata->sn,
179 spdk_nvme_ns_get_id(ns));
180 return;
181 }
182
183 if (spdk_nvme_ns_get_size(ns) < g_io_size_bytes ||
184 spdk_nvme_ns_get_sector_size(ns) > g_io_size_bytes) {
185 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
186 "ns size %" PRIu64 " / block size %u for I/O size %u\n",
187 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
188 spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes);
189 return;
190 }
191
192 entry = malloc(sizeof(struct ns_entry));
193 if (entry == NULL) {
194 perror("ns_entry malloc");
195 exit(1);
196 }
197
198 entry->type = ENTRY_TYPE_NVME_NS;
199 entry->u.nvme.ctrlr = ctrlr;
200 entry->u.nvme.ns = ns;
201
202 entry->size_in_ios = spdk_nvme_ns_get_size(ns) /
203 g_io_size_bytes;
204 entry->io_size_blocks = g_io_size_bytes / spdk_nvme_ns_get_sector_size(ns);
205
206 snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
207
208 g_num_namespaces++;
209 entry->next = g_namespaces;
210 g_namespaces = entry;
211 }
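/*
 * Sizing example (illustrative): with -s 4096 against a namespace that has
 * 512-byte sectors, io_size_blocks above becomes 4096 / 512 = 8 blocks per
 * I/O, and size_in_ios is the namespace capacity divided by 4096, i.e. the
 * number of distinct offsets the offset generator may choose from.
 */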
212
213 static void
214 unregister_namespaces(void)
215 {
216 struct ns_entry *entry = g_namespaces;
217
218 while (entry) {
219 struct ns_entry *next = entry->next;
220 free(entry);
221 entry = next;
222 }
223 }
224
225 static void
226 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
227 {
228 if (spdk_nvme_cpl_is_error(cpl)) {
229 printf("enable_latency_tracking_complete failed\n");
230 }
231 g_outstanding_commands--;
232 }
233
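/*
 * Enable or disable the Intel vendor-specific latency tracking feature via a
 * Set Features admin command, then busy-poll the admin queue until the single
 * outstanding command completes.  Callers only invoke this after checking
 * that the controller reports the feature as supported.
 */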
234 static void
235 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
236 {
237 int res;
238 union spdk_nvme_intel_feat_latency_tracking latency_tracking;
239
240 if (enable) {
241 latency_tracking.bits.enable = 0x01;
242 } else {
243 latency_tracking.bits.enable = 0x00;
244 }
245
246 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
247 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
248 if (res) {
249 printf("fail to allocate nvme request.\n");
250 return;
251 }
252 g_outstanding_commands++;
253
254 while (g_outstanding_commands) {
255 spdk_nvme_ctrlr_process_admin_completions(ctrlr);
256 }
257 }
258
259 static void
260 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
261 {
262 int nsid, num_ns;
263 struct spdk_nvme_ns *ns;
264 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry));
265 const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
266
267 if (entry == NULL) {
268 perror("ctrlr_entry malloc");
269 exit(1);
270 }
271
272 entry->latency_page = spdk_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page),
273 4096, NULL);
274 if (entry->latency_page == NULL) {
275 printf("Allocation error (latency page)\n");
276 exit(1);
277 }
278
279 snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
280
281 entry->ctrlr = ctrlr;
282 entry->next = g_controllers;
283 g_controllers = entry;
284
285 if (g_latency_tracking_enable &&
286 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
287 set_latency_tracking_feature(ctrlr, true);
288
289 num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
290 for (nsid = 1; nsid <= num_ns; nsid++) {
291 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
292 if (ns == NULL) {
293 continue;
294 }
295 register_ns(ctrlr, ns);
296 }
297
298 }
299
300 #if HAVE_LIBAIO
301 static int
302 register_aio_file(const char *path)
303 {
304 struct ns_entry *entry;
305
306 int flags, fd;
307 uint64_t size;
308 uint32_t blklen;
309
310 if (g_rw_percentage == 100) {
311 flags = O_RDONLY;
312 } else if (g_rw_percentage == 0) {
313 flags = O_WRONLY;
314 } else {
315 flags = O_RDWR;
316 }
317
318 flags |= O_DIRECT;
319
320 fd = open(path, flags);
321 if (fd < 0) {
322 fprintf(stderr, "Could not open AIO device %s: %s\n", path, strerror(errno));
323 return -1;
324 }
325
326 size = spdk_fd_get_size(fd);
327 if (size == 0) {
328 fprintf(stderr, "Could not determine size of AIO device %s\n", path);
329 close(fd);
330 return -1;
331 }
332
333 blklen = spdk_fd_get_blocklen(fd);
334 if (blklen == 0) {
335 fprintf(stderr, "Could not determine block size of AIO device %s\n", path);
336 close(fd);
337 return -1;
338 }
339
340 /*
341 * TODO: This should really calculate the LCM of the current g_io_align and blklen.
342 * For now, it's fairly safe to just assume all block sizes are powers of 2.
343 */
344 if (g_io_align < blklen) {
345 g_io_align = blklen;
346 }
347
348 entry = malloc(sizeof(struct ns_entry));
349 if (entry == NULL) {
350 close(fd);
351 perror("aio ns_entry malloc");
352 return -1;
353 }
354
355 entry->type = ENTRY_TYPE_AIO_FILE;
356 entry->u.aio.fd = fd;
357 entry->size_in_ios = size / g_io_size_bytes;
358 entry->io_size_blocks = g_io_size_bytes / blklen;
359
360 snprintf(entry->name, sizeof(entry->name), "%s", path);
361
362 g_num_namespaces++;
363 entry->next = g_namespaces;
364 g_namespaces = entry;
365
366 return 0;
367 }
368
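/*
 * Build and submit one libaio request by hand (the equivalent of
 * io_prep_pread()/io_prep_pwrite() followed by io_submit()).  The perf_task
 * pointer is stashed in iocb->data so aio_check_io() can recover it from the
 * completed io_event and call task_complete().
 */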
369 static int
370 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, void *buf,
371 unsigned long nbytes, uint64_t offset, void *cb_ctx)
372 {
373 iocb->aio_fildes = fd;
374 iocb->aio_reqprio = 0;
375 iocb->aio_lio_opcode = cmd;
376 iocb->u.c.buf = buf;
377 iocb->u.c.nbytes = nbytes;
378 iocb->u.c.offset = offset;
379 iocb->data = cb_ctx;
380
381 if (io_submit(aio_ctx, 1, &iocb) < 0) {
382 		fprintf(stderr, "io_submit failed\n");
383 return -1;
384 }
385
386 return 0;
387 }
388
389 static void
390 aio_check_io(struct ns_worker_ctx *ns_ctx)
391 {
392 int count, i;
393 struct timespec timeout;
394
395 timeout.tv_sec = 0;
396 timeout.tv_nsec = 0;
397
398 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout);
399 if (count < 0) {
400 fprintf(stderr, "io_getevents error\n");
401 exit(1);
402 }
403
404 for (i = 0; i < count; i++) {
405 task_complete(ns_ctx->u.aio.events[i].data);
406 }
407 }
408 #endif /* HAVE_LIBAIO */
409
410 static void task_ctor(struct rte_mempool *mp, void *arg, void *__task, unsigned id)
411 {
412 struct perf_task *task = __task;
413 task->buf = spdk_zmalloc(g_io_size_bytes, g_io_align, NULL);
414 if (task->buf == NULL) {
415 fprintf(stderr, "task->buf spdk_zmalloc failed\n");
416 exit(1);
417 }
418 memset(task->buf, id % 8, g_io_size_bytes);
419 }
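/*
 * task_ctor runs once per element when the task pool is created in main(),
 * so every perf_task carries a pre-allocated, I/O-aligned buffer from
 * spdk_zmalloc() and the hot submission path never has to allocate memory.
 */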
420
421 static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);
422
423 static __thread unsigned int seed = 0;
424
425 static void
426 submit_single_io(struct ns_worker_ctx *ns_ctx)
427 {
428 struct perf_task *task = NULL;
429 uint64_t offset_in_ios;
430 int rc;
431 struct ns_entry *entry = ns_ctx->entry;
432
433 if (rte_mempool_get(task_pool, (void **)&task) != 0) {
434 fprintf(stderr, "task_pool rte_mempool_get failed\n");
435 exit(1);
436 }
437
438 task->ns_ctx = ns_ctx;
439
440 if (g_is_random) {
441 offset_in_ios = rand_r(&seed) % entry->size_in_ios;
442 } else {
443 offset_in_ios = ns_ctx->offset_in_ios++;
444 if (ns_ctx->offset_in_ios == entry->size_in_ios) {
445 ns_ctx->offset_in_ios = 0;
446 }
447 }
448
449 task->submit_tsc = spdk_get_ticks();
450
451 if ((g_rw_percentage == 100) ||
452 (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) {
453 #if HAVE_LIBAIO
454 if (entry->type == ENTRY_TYPE_AIO_FILE) {
455 rc = aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, task->buf,
456 g_io_size_bytes, offset_in_ios * g_io_size_bytes, task);
457 } else
458 #endif
459 {
460 rc = spdk_nvme_ns_cmd_read(entry->u.nvme.ns, ns_ctx->u.nvme.qpair, task->buf,
461 offset_in_ios * entry->io_size_blocks,
462 entry->io_size_blocks, io_complete, task, 0);
463 }
464 } else {
465 #if HAVE_LIBAIO
466 if (entry->type == ENTRY_TYPE_AIO_FILE) {
467 rc = aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, task->buf,
468 g_io_size_bytes, offset_in_ios * g_io_size_bytes, task);
469 } else
470 #endif
471 {
472 rc = spdk_nvme_ns_cmd_write(entry->u.nvme.ns, ns_ctx->u.nvme.qpair, task->buf,
473 offset_in_ios * entry->io_size_blocks,
474 entry->io_size_blocks, io_complete, task, 0);
475 }
476 }
477
478 if (rc != 0) {
479 fprintf(stderr, "starting I/O failed\n");
480 }
481
482 ns_ctx->current_queue_depth++;
483 }
484
485 static void
486 task_complete(struct perf_task *task)
487 {
488 struct ns_worker_ctx *ns_ctx;
489 uint64_t tsc_diff;
490
491 ns_ctx = task->ns_ctx;
492 ns_ctx->current_queue_depth--;
493 ns_ctx->io_completed++;
494 tsc_diff = spdk_get_ticks() - task->submit_tsc;
495 ns_ctx->total_tsc += tsc_diff;
496 if (ns_ctx->min_tsc > tsc_diff) {
497 ns_ctx->min_tsc = tsc_diff;
498 }
499 if (ns_ctx->max_tsc < tsc_diff) {
500 ns_ctx->max_tsc = tsc_diff;
501 }
502
503 rte_mempool_put(task_pool, task);
504
505 /*
506 * is_draining indicates when time has expired for the test run
507 * and we are just waiting for the previously submitted I/O
508 * to complete. In this case, do not submit a new I/O to replace
509 * the one just completed.
510 */
511 if (!ns_ctx->is_draining) {
512 submit_single_io(ns_ctx);
513 }
514 }
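/*
 * Latency accounting: each I/O records spdk_get_ticks() at submission and
 * again here at completion, accumulating total/min/max tick deltas per
 * ns_worker_ctx.  print_performance() later converts these tick counts to
 * microseconds using g_tsc_rate (ticks per second).
 */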
515
516 static void
517 io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
518 {
519 task_complete((struct perf_task *)ctx);
520 }
521
522 static void
523 check_io(struct ns_worker_ctx *ns_ctx)
524 {
525 #if HAVE_LIBAIO
526 if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
527 aio_check_io(ns_ctx);
528 } else
529 #endif
530 {
531 spdk_nvme_qpair_process_completions(ns_ctx->u.nvme.qpair, g_max_completions);
532 }
533 }
534
535 static void
536 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
537 {
538 while (queue_depth-- > 0) {
539 submit_single_io(ns_ctx);
540 }
541 }
542
543 static void
544 drain_io(struct ns_worker_ctx *ns_ctx)
545 {
546 ns_ctx->is_draining = true;
547 while (ns_ctx->current_queue_depth > 0) {
548 check_io(ns_ctx);
549 }
550 }
551
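/*
 * Each (worker, namespace) pair gets its own I/O queue pair or its own
 * libaio context sized to the configured queue depth, so the submission and
 * completion paths run without any cross-core locking.
 */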
552 static int
553 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
554 {
555 if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
556 #ifdef HAVE_LIBAIO
557 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event));
558 if (!ns_ctx->u.aio.events) {
559 return -1;
560 }
561 ns_ctx->u.aio.ctx = 0;
562 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) {
563 free(ns_ctx->u.aio.events);
564 perror("io_setup");
565 return -1;
566 }
567 #endif
568 } else {
569 /*
570 * TODO: If a controller has multiple namespaces, they could all use the same queue.
571 * For now, give each namespace/thread combination its own queue.
572 */
573 ns_ctx->u.nvme.qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_ctx->entry->u.nvme.ctrlr, 0);
574 if (!ns_ctx->u.nvme.qpair) {
575 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
576 return -1;
577 }
578 }
579
580 return 0;
581 }
582
583 static void
584 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
585 {
586 if (ns_ctx->entry->type == ENTRY_TYPE_AIO_FILE) {
587 #ifdef HAVE_LIBAIO
588 io_destroy(ns_ctx->u.aio.ctx);
589 free(ns_ctx->u.aio.events);
590 #endif
591 } else {
592 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair);
593 }
594 }
595
596 static int
597 work_fn(void *arg)
598 {
599 uint64_t tsc_end;
600 struct worker_thread *worker = (struct worker_thread *)arg;
601 struct ns_worker_ctx *ns_ctx = NULL;
602
603 printf("Starting thread on core %u\n", worker->lcore);
604
605 /* Allocate a queue pair for each namespace. */
606 ns_ctx = worker->ns_ctx;
607 while (ns_ctx != NULL) {
608 if (init_ns_worker_ctx(ns_ctx) != 0) {
609 printf("ERROR: init_ns_worker_ctx() failed\n");
610 return 1;
611 }
612 ns_ctx = ns_ctx->next;
613 }
614
615 tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate;
616
617 /* Submit initial I/O for each namespace. */
618 ns_ctx = worker->ns_ctx;
619 while (ns_ctx != NULL) {
620 submit_io(ns_ctx, g_queue_depth);
621 ns_ctx = ns_ctx->next;
622 }
623
624 while (1) {
625 /*
626 * Check for completed I/O for each controller. A new
627 * I/O will be submitted in the io_complete callback
628 * to replace each I/O that is completed.
629 */
630 ns_ctx = worker->ns_ctx;
631 while (ns_ctx != NULL) {
632 check_io(ns_ctx);
633 ns_ctx = ns_ctx->next;
634 }
635
636 if (spdk_get_ticks() > tsc_end) {
637 break;
638 }
639 }
640
641 ns_ctx = worker->ns_ctx;
642 while (ns_ctx != NULL) {
643 drain_io(ns_ctx);
644 cleanup_ns_worker_ctx(ns_ctx);
645 ns_ctx = ns_ctx->next;
646 }
647
648 return 0;
649 }
650
651 static void usage(char *program_name)
652 {
653 printf("%s options", program_name);
654 #if HAVE_LIBAIO
655 printf(" [AIO device(s)]...");
656 #endif
657 printf("\n");
658 printf("\t[-q io depth]\n");
659 printf("\t[-s io size in bytes]\n");
660 printf("\t[-w io pattern type, must be one of\n");
661 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
662 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
663 printf("\t[-l enable latency tracking, default: disabled]\n");
664 printf("\t[-t time in seconds]\n");
665 printf("\t[-c core mask for I/O submission/completion.]\n");
666 	printf("\t\t(default: 1)\n");
667 printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
668 printf("\t Format: 'key:value [key:value] ...'\n");
669 printf("\t Keys:\n");
670 printf("\t trtype Transport type (e.g. PCIe, RDMA)\n");
671 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n");
672 printf("\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
673 printf("\t trsvcid Transport service identifier (e.g. 4420)\n");
674 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
675 printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
676 printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
677 printf("\t[-d DPDK huge memory size in MB.]\n");
678 printf("\t[-m max completions per poll]\n");
679 printf("\t\t(default: 0 - unlimited)\n");
680 printf("\t[-i shared memory group ID]\n");
681 }
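/*
 * Example invocations (illustrative only; adjust devices, cores and targets
 * to your setup):
 *
 *   4 KiB random reads, queue depth 128, 60 seconds, core 0:
 *     perf -q 128 -s 4096 -w randread -t 60 -c 0x1
 *
 *   70/30 random read/write mix against an NVMe-oF RDMA target:
 *     perf -q 32 -s 4096 -w randrw -M 70 -t 60 \
 *          -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'
 */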
682
683 static void
684 print_performance(void)
685 {
686 uint64_t total_io_completed;
687 float io_per_second, mb_per_second, average_latency, min_latency, max_latency;
688 float total_io_per_second, total_mb_per_second;
689 float sum_ave_latency, sum_min_latency, sum_max_latency;
690 int ns_count;
691 struct worker_thread *worker;
692 struct ns_worker_ctx *ns_ctx;
693
694 total_io_per_second = 0;
695 total_mb_per_second = 0;
696 total_io_completed = 0;
697 sum_ave_latency = 0;
698 sum_min_latency = 0;
699 sum_max_latency = 0;
700 ns_count = 0;
701
702 printf("========================================================\n");
703 printf("%103s\n", "Latency(us)");
704 printf("%-55s: %10s %10s %10s %10s %10s\n",
705 "Device Information", "IOPS", "MB/s", "Average", "min", "max");
706
707 worker = g_workers;
708 while (worker) {
709 ns_ctx = worker->ns_ctx;
710 while (ns_ctx) {
711 io_per_second = (float)ns_ctx->io_completed / g_time_in_sec;
712 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024);
713 average_latency = (float)(ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate;
714 min_latency = (float)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate;
715 max_latency = (float)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate;
716 printf("%-43.43s from core %u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
717 ns_ctx->entry->name, worker->lcore,
718 io_per_second, mb_per_second,
719 average_latency, min_latency, max_latency);
720 total_io_per_second += io_per_second;
721 total_mb_per_second += mb_per_second;
722 total_io_completed += ns_ctx->io_completed;
723 sum_ave_latency += average_latency;
724 sum_min_latency += min_latency;
725 sum_max_latency += max_latency;
726 ns_count++;
727 ns_ctx = ns_ctx->next;
728 }
729 worker = worker->next;
730 }
731
732 assert(ns_count != 0);
733 printf("========================================================\n");
734 printf("%-55s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
735 "Total", total_io_per_second, total_mb_per_second,
736 sum_ave_latency / ns_count, sum_min_latency / ns_count,
737 sum_max_latency / ns_count);
738 printf("\n");
739 }
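/*
 * Note: the "Total" latency columns above are simple means of the
 * per-namespace average/min/max values (sum / ns_count); they are not
 * weighted by each namespace's completed I/O count.
 */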
740
741 static void
742 print_latency_page(struct ctrlr_entry *entry)
743 {
744 int i;
745
746 printf("\n");
747 printf("%s\n", entry->name);
748 printf("--------------------------------------------------------\n");
749
750 for (i = 0; i < 32; i++) {
751 if (entry->latency_page->buckets_32us[i])
752 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]);
753 }
754 for (i = 0; i < 31; i++) {
755 if (entry->latency_page->buckets_1ms[i])
756 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]);
757 }
758 for (i = 0; i < 31; i++) {
759 if (entry->latency_page->buckets_32ms[i])
760 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
761 entry->latency_page->buckets_32ms[i]);
762 }
763 }
764
765 static void
766 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
767 {
768 struct ctrlr_entry *ctrlr;
769
770 printf("%s Latency Statistics:\n", op_name);
771 printf("========================================================\n");
772 ctrlr = g_controllers;
773 while (ctrlr) {
774 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
775 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG,
776 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0,
777 enable_latency_tracking_complete,
778 NULL)) {
779 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
780 exit(1);
781 }
782
783 g_outstanding_commands++;
784 } else {
785 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name);
786 }
787 ctrlr = ctrlr->next;
788 }
789
790 while (g_outstanding_commands) {
791 ctrlr = g_controllers;
792 while (ctrlr) {
793 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
794 ctrlr = ctrlr->next;
795 }
796 }
797
798 ctrlr = g_controllers;
799 while (ctrlr) {
800 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
801 print_latency_page(ctrlr);
802 }
803 ctrlr = ctrlr->next;
804 }
805 printf("\n");
806 }
807
808 static void
809 print_stats(void)
810 {
811 print_performance();
812 if (g_latency_tracking_enable) {
813 if (g_rw_percentage != 0) {
814 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
815 }
816 if (g_rw_percentage != 100) {
817 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
818 }
819 }
820 }
821
822 static void
823 unregister_trids(void)
824 {
825 struct trid_entry *trid_entry, *tmp;
826
827 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) {
828 free(trid_entry);
829 }
830 }
831
832 static int
833 add_trid(const char *trid_str)
834 {
835 struct trid_entry *trid_entry;
836 struct spdk_nvme_transport_id *trid;
837
838 trid_entry = calloc(1, sizeof(*trid_entry));
839 if (trid_entry == NULL) {
840 return -1;
841 }
842
843 trid = &trid_entry->trid;
844 memset(trid, 0, sizeof(*trid));
845 trid->trtype = SPDK_NVME_TRANSPORT_PCIE;
846 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
847
848 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) {
849 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str);
850 free(trid_entry);
851 return 1;
852 }
853
854 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq);
855 return 0;
856 }
857
858 static int
859 parse_args(int argc, char **argv)
860 {
861 const char *workload_type;
862 int op;
863 bool mix_specified = false;
864
865 	/* default values */
866 g_queue_depth = 0;
867 g_io_size_bytes = 0;
868 workload_type = NULL;
869 g_time_in_sec = 0;
870 g_rw_percentage = -1;
871 g_core_mask = NULL;
872 g_max_completions = 0;
873
874 while ((op = getopt(argc, argv, "c:d:i:lm:q:r:s:t:w:M:")) != -1) {
875 switch (op) {
876 case 'c':
877 g_core_mask = optarg;
878 break;
879 case 'd':
880 g_dpdk_mem = atoi(optarg);
881 break;
882 case 'i':
883 g_shm_id = atoi(optarg);
884 break;
885 case 'l':
886 g_latency_tracking_enable = true;
887 break;
888 case 'm':
889 g_max_completions = atoi(optarg);
890 break;
891 case 'q':
892 g_queue_depth = atoi(optarg);
893 break;
894 case 'r':
895 if (add_trid(optarg)) {
896 usage(argv[0]);
897 return 1;
898 }
899 break;
900 case 's':
901 g_io_size_bytes = atoi(optarg);
902 break;
903 case 't':
904 g_time_in_sec = atoi(optarg);
905 break;
906 case 'w':
907 workload_type = optarg;
908 break;
909 case 'M':
910 g_rw_percentage = atoi(optarg);
911 mix_specified = true;
912 break;
913 default:
914 usage(argv[0]);
915 return 1;
916 }
917 }
918
919 if (!g_queue_depth) {
920 usage(argv[0]);
921 return 1;
922 }
923 if (!g_io_size_bytes) {
924 usage(argv[0]);
925 return 1;
926 }
927 if (!workload_type) {
928 usage(argv[0]);
929 return 1;
930 }
931 if (!g_time_in_sec) {
932 usage(argv[0]);
933 return 1;
934 }
935
936 if (strcmp(workload_type, "read") &&
937 strcmp(workload_type, "write") &&
938 strcmp(workload_type, "randread") &&
939 strcmp(workload_type, "randwrite") &&
940 strcmp(workload_type, "rw") &&
941 strcmp(workload_type, "randrw")) {
942 fprintf(stderr,
943 "io pattern type must be one of\n"
944 "(read, write, randread, randwrite, rw, randrw)\n");
945 return 1;
946 }
947
948 if (!strcmp(workload_type, "read") ||
949 !strcmp(workload_type, "randread")) {
950 g_rw_percentage = 100;
951 }
952
953 if (!strcmp(workload_type, "write") ||
954 !strcmp(workload_type, "randwrite")) {
955 g_rw_percentage = 0;
956 }
957
958 if (!strcmp(workload_type, "read") ||
959 !strcmp(workload_type, "randread") ||
960 !strcmp(workload_type, "write") ||
961 !strcmp(workload_type, "randwrite")) {
962 if (mix_specified) {
963 			fprintf(stderr, "Ignoring -M option; -M only applies to the"
964 				" rw and randrw workloads.\n");
965 }
966 }
967
968 if (!strcmp(workload_type, "rw") ||
969 !strcmp(workload_type, "randrw")) {
970 if (g_rw_percentage < 0 || g_rw_percentage > 100) {
971 fprintf(stderr,
972 "-M must be specified to value from 0 to 100 "
973 "for rw or randrw.\n");
974 return 1;
975 }
976 }
977
978 if (!strcmp(workload_type, "read") ||
979 !strcmp(workload_type, "write") ||
980 !strcmp(workload_type, "rw")) {
981 g_is_random = 0;
982 } else {
983 g_is_random = 1;
984 }
985
986 if (TAILQ_EMPTY(&g_trid_list)) {
987 /* If no transport IDs specified, default to enumerating all local PCIe devices */
988 add_trid("trtype:PCIe");
989 }
990
991 g_aio_optind = optind;
992 optind = 1;
993 return 0;
994 }
995
996 static int
997 register_workers(void)
998 {
999 uint32_t i;
1000 struct worker_thread *worker;
1001
1002 g_workers = NULL;
1003 g_num_workers = 0;
1004
1005 SPDK_ENV_FOREACH_CORE(i) {
1006 worker = calloc(1, sizeof(*worker));
1007 if (worker == NULL) {
1008 fprintf(stderr, "Unable to allocate worker\n");
1009 return -1;
1010 }
1011
1012 worker->lcore = i;
1013 worker->next = g_workers;
1014 g_workers = worker;
1015 g_num_workers++;
1016 }
1017
1018 return 0;
1019 }
1020
1021 static void
1022 unregister_workers(void)
1023 {
1024 struct worker_thread *worker = g_workers;
1025
1026 /* Free namespace context and worker thread */
1027 while (worker) {
1028 struct worker_thread *next_worker = worker->next;
1029 struct ns_worker_ctx *ns_ctx = worker->ns_ctx;
1030
1031 while (ns_ctx) {
1032 struct ns_worker_ctx *next_ns_ctx = ns_ctx->next;
1033 free(ns_ctx);
1034 ns_ctx = next_ns_ctx;
1035 }
1036
1037 free(worker);
1038 worker = next_worker;
1039 }
1040 }
1041
1042 static bool
1043 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1044 struct spdk_nvme_ctrlr_opts *opts)
1045 {
1046 struct spdk_pci_addr pci_addr;
1047 struct spdk_pci_device *pci_dev;
1048 struct spdk_pci_id pci_id;
1049
1050 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
1051 printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n",
1052 trid->traddr, trid->trsvcid,
1053 trid->subnqn);
1054 } else {
1055 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
1056 return false;
1057 }
1058
1059 pci_dev = spdk_pci_get_device(&pci_addr);
1060 if (!pci_dev) {
1061 return false;
1062 }
1063
1064 pci_id = spdk_pci_device_get_id(pci_dev);
1065
1066 printf("Attaching to NVMe Controller at %s [%04x:%04x]\n",
1067 trid->traddr,
1068 pci_id.vendor_id, pci_id.device_id);
1069 }
1070
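	/*
	 * Request an I/O queue one entry larger than the target queue depth;
	 * an NVMe queue of size N can hold at most N-1 outstanding commands,
	 * so the extra slot lets the full -q depth stay in flight.
	 */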
1071 opts->io_queue_size = g_queue_depth + 1;
1072
1073 return true;
1074 }
1075
1076 static void
1077 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
1078 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
1079 {
1080 struct spdk_pci_addr pci_addr;
1081 struct spdk_pci_device *pci_dev;
1082 struct spdk_pci_id pci_id;
1083
1084 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
1085 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
1086 trid->traddr, trid->trsvcid,
1087 trid->subnqn);
1088 } else {
1089 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
1090 return;
1091 }
1092
1093 pci_dev = spdk_pci_get_device(&pci_addr);
1094 if (!pci_dev) {
1095 return;
1096 }
1097
1098 pci_id = spdk_pci_device_get_id(pci_dev);
1099
1100 printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
1101 trid->traddr,
1102 pci_id.vendor_id, pci_id.device_id);
1103 }
1104
1105 register_ctrlr(ctrlr);
1106 }
1107
1108 static int
1109 register_controllers(void)
1110 {
1111 struct trid_entry *trid_entry;
1112
1113 printf("Initializing NVMe Controllers\n");
1114
1115 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) {
1116 if (spdk_nvme_probe(&trid_entry->trid, NULL, probe_cb, attach_cb, NULL) != 0) {
1117 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n",
1118 trid_entry->trid.traddr);
1119 return -1;
1120 }
1121 }
1122
1123 return 0;
1124 }
1125
1126 static void
1127 unregister_controllers(void)
1128 {
1129 struct ctrlr_entry *entry = g_controllers;
1130
1131 while (entry) {
1132 struct ctrlr_entry *next = entry->next;
1133 spdk_free(entry->latency_page);
1134 if (g_latency_tracking_enable &&
1135 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING))
1136 set_latency_tracking_feature(entry->ctrlr, false);
1137 spdk_nvme_detach(entry->ctrlr);
1138 free(entry);
1139 entry = next;
1140 }
1141 }
1142
1143 static int
1144 register_aio_files(int argc, char **argv)
1145 {
1146 #if HAVE_LIBAIO
1147 int i;
1148
1149 /* Treat everything after the options as files for AIO */
1150 for (i = g_aio_optind; i < argc; i++) {
1151 if (register_aio_file(argv[i]) != 0) {
1152 return 1;
1153 }
1154 }
1155 #endif /* HAVE_LIBAIO */
1156
1157 return 0;
1158 }
1159
1160 static int
1161 associate_workers_with_ns(void)
1162 {
1163 struct ns_entry *entry = g_namespaces;
1164 struct worker_thread *worker = g_workers;
1165 struct ns_worker_ctx *ns_ctx;
1166 int i, count;
1167
1168 count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
1169
1170 for (i = 0; i < count; i++) {
1171 if (entry == NULL) {
1172 break;
1173 }
1174
1175 ns_ctx = malloc(sizeof(struct ns_worker_ctx));
1176 if (!ns_ctx) {
1177 return -1;
1178 }
1179 memset(ns_ctx, 0, sizeof(*ns_ctx));
1180
1181 printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
1182 ns_ctx->min_tsc = UINT64_MAX;
1183 ns_ctx->entry = entry;
1184 ns_ctx->next = worker->ns_ctx;
1185 worker->ns_ctx = ns_ctx;
1186
1187 worker = worker->next;
1188 if (worker == NULL) {
1189 worker = g_workers;
1190 }
1191
1192 entry = entry->next;
1193 if (entry == NULL) {
1194 entry = g_namespaces;
1195 }
1196
1197 }
1198
1199 return 0;
1200 }
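/*
 * The loop above walks the namespace and worker lists in lock step for
 * max(num namespaces, num workers) iterations, wrapping whichever list is
 * shorter, so every namespace is assigned to at least one worker and every
 * worker gets at least one namespace when both lists are non-empty.
 */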
1201
1202 int main(int argc, char **argv)
1203 {
1204 int rc;
1205 struct worker_thread *worker, *master_worker;
1206 unsigned master_core;
1207 char task_pool_name[30];
1208 uint32_t task_count;
1209 struct spdk_env_opts opts;
1210
1211 rc = parse_args(argc, argv);
1212 if (rc != 0) {
1213 return rc;
1214 }
1215
1216 spdk_env_opts_init(&opts);
1217 opts.name = "perf";
1218 opts.shm_id = g_shm_id;
1219 if (g_core_mask) {
1220 opts.core_mask = g_core_mask;
1221 }
1222
1223 if (g_dpdk_mem) {
1224 opts.dpdk_mem_size = g_dpdk_mem;
1225 }
1226 spdk_env_init(&opts);
1227
1228 g_tsc_rate = spdk_get_ticks_hz();
1229
1230 if (register_workers() != 0) {
1231 rc = -1;
1232 goto cleanup;
1233 }
1234
1235 if (register_aio_files(argc, argv) != 0) {
1236 rc = -1;
1237 goto cleanup;
1238 }
1239
1240 if (register_controllers() != 0) {
1241 rc = -1;
1242 goto cleanup;
1243 }
1244
1245 if (associate_workers_with_ns() != 0) {
1246 rc = -1;
1247 goto cleanup;
1248 }
1249
1250 snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", getpid());
1251
1252 /*
1253 * The task_count will be dynamically calculated based on the
1254 * number of attached active namespaces(aio files), queue depth
1255 * and number of cores (workers) involved in the IO operations.
1256 */
1257 task_count = g_num_namespaces > g_num_workers ? g_num_namespaces : g_num_workers;
1258 task_count *= g_queue_depth;
1259
1260 task_pool = rte_mempool_create(task_pool_name, task_count,
1261 sizeof(struct perf_task),
1262 0, 0, NULL, NULL, task_ctor, NULL,
1263 SOCKET_ID_ANY, 0);
1264 if (task_pool == NULL) {
1265 fprintf(stderr, "could not initialize task pool\n");
1266 rc = -1;
1267 goto cleanup;
1268 }
1269
1270 printf("Initialization complete. Launching workers.\n");
1271
1272 /* Launch all of the slave workers */
1273 master_core = rte_get_master_lcore();
1274 master_worker = NULL;
1275 worker = g_workers;
1276 while (worker != NULL) {
1277 if (worker->lcore != master_core) {
1278 rte_eal_remote_launch(work_fn, worker, worker->lcore);
1279 } else {
1280 assert(master_worker == NULL);
1281 master_worker = worker;
1282 }
1283 worker = worker->next;
1284 }
1285
1286 assert(master_worker != NULL);
1287 rc = work_fn(master_worker);
1288
1289 rte_eal_mp_wait_lcore();
1290
1291 print_stats();
1292
1293 cleanup:
1294 unregister_trids();
1295 unregister_namespaces();
1296 unregister_controllers();
1297 unregister_workers();
1298
1299 if (rc != 0) {
1300 		fprintf(stderr, "%s: errors occurred\n", argv[0]);
1301 }
1302
1303 return rc;
1304 }