4 * Copyright (c) Intel Corporation.
7 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
19 * * Neither the name of Intel Corporation nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "spdk/stdinc.h"
40 #include "spdk/nvme.h"
42 #include "spdk/queue.h"
43 #include "spdk/string.h"
44 #include "spdk/nvme_intel.h"
45 #include "spdk/histogram_data.h"
46 #include "spdk/endian.h"
48 #include "spdk/util.h"
50 #include "spdk/likely.h"
52 #ifdef SPDK_CONFIG_URING
61 struct spdk_nvme_ctrlr
*ctrlr
;
62 enum spdk_nvme_transport_type trtype
;
63 struct spdk_nvme_intel_rw_latency_page
*latency_page
;
65 struct spdk_nvme_qpair
**unused_qpairs
;
67 struct ctrlr_entry
*next
;
74 ENTRY_TYPE_URING_FILE
,
81 const struct ns_fn_table
*fn_table
;
85 struct spdk_nvme_ctrlr
*ctrlr
;
86 struct spdk_nvme_ns
*ns
;
88 #ifdef SPDK_CONFIG_URING
100 struct ns_entry
*next
;
101 uint32_t io_size_blocks
;
102 uint32_t num_io_requests
;
103 uint64_t size_in_ios
;
108 enum spdk_nvme_pi_type pi_type
;
113 static const double g_latency_cutoffs
[] = {
132 struct ns_worker_ctx
{
133 struct ns_entry
*entry
;
134 uint64_t io_completed
;
135 uint64_t last_io_completed
;
139 uint64_t current_queue_depth
;
140 uint64_t offset_in_ios
;
145 int num_active_qpairs
;
147 struct spdk_nvme_qpair
**qpair
;
148 struct spdk_nvme_poll_group
*group
;
152 #ifdef SPDK_CONFIG_URING
154 struct io_uring ring
;
155 uint64_t io_inflight
;
157 struct io_uring_cqe
**cqes
;
163 struct io_event
*events
;
169 struct ns_worker_ctx
*next
;
171 struct spdk_histogram_data
*histogram
;
175 struct ns_worker_ctx
*ns_ctx
;
180 struct spdk_dif_ctx dif_ctx
;
186 struct worker_thread
{
187 struct ns_worker_ctx
*ns_ctx
;
188 struct worker_thread
*next
;
193 void (*setup_payload
)(struct perf_task
*task
, uint8_t pattern
);
195 int (*submit_io
)(struct perf_task
*task
, struct ns_worker_ctx
*ns_ctx
,
196 struct ns_entry
*entry
, uint64_t offset_in_ios
);
198 void (*check_io
)(struct ns_worker_ctx
*ns_ctx
);
200 void (*verify_io
)(struct perf_task
*task
, struct ns_entry
*entry
);
202 int (*init_ns_worker_ctx
)(struct ns_worker_ctx
*ns_ctx
);
204 void (*cleanup_ns_worker_ctx
)(struct ns_worker_ctx
*ns_ctx
);
207 static int g_outstanding_commands
;
209 static bool g_latency_ssd_tracking_enable
;
210 static int g_latency_sw_tracking_level
;
213 static const char *g_workload_type
;
214 static struct ctrlr_entry
*g_controllers
;
215 static struct ns_entry
*g_namespaces
;
216 static int g_num_namespaces
;
217 static struct worker_thread
*g_workers
;
218 static int g_num_workers
;
219 static uint32_t g_master_core
;
221 static uint64_t g_tsc_rate
;
223 static uint32_t g_io_align
= 0x200;
224 static uint32_t g_io_size_bytes
;
225 static uint32_t g_max_io_md_size
;
226 static uint32_t g_max_io_size_blocks
;
227 static uint32_t g_metacfg_pract_flag
;
228 static uint32_t g_metacfg_prchk_flags
;
229 static int g_rw_percentage
= -1;
230 static int g_is_random
;
231 static int g_queue_depth
;
232 static int g_nr_io_queues_per_ns
= 1;
233 static int g_nr_unused_io_queues
;
234 static int g_time_in_sec
;
235 static uint32_t g_max_completions
;
236 static int g_dpdk_mem
;
237 static int g_shm_id
= -1;
238 static uint32_t g_disable_sq_cmb
;
239 static bool g_use_uring
;
240 static bool g_no_pci
;
242 static bool g_header_digest
;
243 static bool g_data_digest
;
244 static bool g_no_shn_notification
;
245 static bool g_mix_specified
;
246 /* Default to 10 seconds for the keep alive value. This value is arbitrary. */
247 static uint32_t g_keep_alive_timeout_in_ms
= 10000;
249 static const char *g_core_mask
;
252 struct spdk_nvme_transport_id trid
;
254 TAILQ_ENTRY(trid_entry
) tailq
;
257 static TAILQ_HEAD(, trid_entry
) g_trid_list
= TAILQ_HEAD_INITIALIZER(g_trid_list
);
259 static int g_file_optind
; /* Index of first filename in argv */
262 task_complete(struct perf_task
*task
);
264 #ifdef SPDK_CONFIG_URING
267 uring_setup_payload(struct perf_task
*task
, uint8_t pattern
)
269 task
->iov
.iov_base
= spdk_dma_zmalloc(g_io_size_bytes
, g_io_align
, NULL
);
270 task
->iov
.iov_len
= g_io_size_bytes
;
271 if (task
->iov
.iov_base
== NULL
) {
272 fprintf(stderr
, "spdk_dma_zmalloc() for task->iov.iov_base failed\n");
275 memset(task
->iov
.iov_base
, pattern
, task
->iov
.iov_len
);
279 uring_submit_io(struct perf_task
*task
, struct ns_worker_ctx
*ns_ctx
,
280 struct ns_entry
*entry
, uint64_t offset_in_ios
)
282 struct io_uring_sqe
*sqe
;
284 sqe
= io_uring_get_sqe(&ns_ctx
->u
.uring
.ring
);
286 fprintf(stderr
, "Cannot get sqe\n");
291 io_uring_prep_readv(sqe
, entry
->u
.uring
.fd
, &task
->iov
, 1, offset_in_ios
* task
->iov
.iov_len
);
293 io_uring_prep_writev(sqe
, entry
->u
.uring
.fd
, &task
->iov
, 1, offset_in_ios
* task
->iov
.iov_len
);
296 io_uring_sqe_set_data(sqe
, task
);
297 ns_ctx
->u
.uring
.io_pending
++;
303 uring_check_io(struct ns_worker_ctx
*ns_ctx
)
305 int i
, count
, to_complete
, to_submit
, ret
= 0;
306 struct perf_task
*task
;
308 to_submit
= ns_ctx
->u
.uring
.io_pending
;
311 /* If there are I/O to submit, use io_uring_submit here.
312 * It will automatically call spdk_io_uring_enter appropriately. */
313 ret
= io_uring_submit(&ns_ctx
->u
.uring
.ring
);
317 ns_ctx
->u
.uring
.io_pending
= 0;
318 ns_ctx
->u
.uring
.io_inflight
+= to_submit
;
321 to_complete
= ns_ctx
->u
.uring
.io_inflight
;
322 if (to_complete
> 0) {
323 count
= io_uring_peek_batch_cqe(&ns_ctx
->u
.uring
.ring
, ns_ctx
->u
.uring
.cqes
, to_complete
);
324 ns_ctx
->u
.uring
.io_inflight
-= count
;
325 for (i
= 0; i
< count
; i
++) {
326 assert(ns_ctx
->u
.uring
.cqes
[i
] != NULL
);
327 task
= (struct perf_task
*)ns_ctx
->u
.uring
.cqes
[i
]->user_data
;
328 if (ns_ctx
->u
.uring
.cqes
[i
]->res
!= (int)task
->iov
.iov_len
) {
329 fprintf(stderr
, "cqe[i]->status=%d\n", ns_ctx
->u
.uring
.cqes
[i
]->res
);
332 io_uring_cqe_seen(&ns_ctx
->u
.uring
.ring
, ns_ctx
->u
.uring
.cqes
[i
]);
/* No end-to-end data protection for plain files: verification is a no-op. */
static void
uring_verify_io(struct perf_task *task, struct ns_entry *entry)
{
}
344 uring_init_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
346 if (io_uring_queue_init(g_queue_depth
, &ns_ctx
->u
.uring
.ring
, 0) < 0) {
347 SPDK_ERRLOG("uring I/O context setup failure\n");
351 ns_ctx
->u
.uring
.cqes
= calloc(g_queue_depth
, sizeof(struct io_uring_cqe
*));
352 if (!ns_ctx
->u
.uring
.cqes
) {
353 io_uring_queue_exit(&ns_ctx
->u
.uring
.ring
);
361 uring_cleanup_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
363 io_uring_queue_exit(&ns_ctx
->u
.uring
.ring
);
364 free(ns_ctx
->u
.uring
.cqes
);
367 static const struct ns_fn_table uring_fn_table
= {
368 .setup_payload
= uring_setup_payload
,
369 .submit_io
= uring_submit_io
,
370 .check_io
= uring_check_io
,
371 .verify_io
= uring_verify_io
,
372 .init_ns_worker_ctx
= uring_init_ns_worker_ctx
,
373 .cleanup_ns_worker_ctx
= uring_cleanup_ns_worker_ctx
,
380 aio_setup_payload(struct perf_task
*task
, uint8_t pattern
)
382 task
->iov
.iov_base
= spdk_dma_zmalloc(g_io_size_bytes
, g_io_align
, NULL
);
383 task
->iov
.iov_len
= g_io_size_bytes
;
384 if (task
->iov
.iov_base
== NULL
) {
385 fprintf(stderr
, "spdk_dma_zmalloc() for task->buf failed\n");
388 memset(task
->iov
.iov_base
, pattern
, task
->iov
.iov_len
);
392 aio_submit(io_context_t aio_ctx
, struct iocb
*iocb
, int fd
, enum io_iocb_cmd cmd
,
393 struct iovec
*iov
, uint64_t offset
, void *cb_ctx
)
395 iocb
->aio_fildes
= fd
;
396 iocb
->aio_reqprio
= 0;
397 iocb
->aio_lio_opcode
= cmd
;
398 iocb
->u
.c
.buf
= iov
->iov_base
;
399 iocb
->u
.c
.nbytes
= iov
->iov_len
;
400 iocb
->u
.c
.offset
= offset
* iov
->iov_len
;
403 if (io_submit(aio_ctx
, 1, &iocb
) < 0) {
412 aio_submit_io(struct perf_task
*task
, struct ns_worker_ctx
*ns_ctx
,
413 struct ns_entry
*entry
, uint64_t offset_in_ios
)
416 return aio_submit(ns_ctx
->u
.aio
.ctx
, &task
->iocb
, entry
->u
.aio
.fd
, IO_CMD_PREAD
,
417 &task
->iov
, offset_in_ios
, task
);
419 return aio_submit(ns_ctx
->u
.aio
.ctx
, &task
->iocb
, entry
->u
.aio
.fd
, IO_CMD_PWRITE
,
420 &task
->iov
, offset_in_ios
, task
);
425 aio_check_io(struct ns_worker_ctx
*ns_ctx
)
428 struct timespec timeout
;
433 count
= io_getevents(ns_ctx
->u
.aio
.ctx
, 1, g_queue_depth
, ns_ctx
->u
.aio
.events
, &timeout
);
435 fprintf(stderr
, "io_getevents error\n");
439 for (i
= 0; i
< count
; i
++) {
440 task_complete(ns_ctx
->u
.aio
.events
[i
].data
);
/* No end-to-end data protection for plain files: verification is a no-op. */
static void
aio_verify_io(struct perf_task *task, struct ns_entry *entry)
{
}
450 aio_init_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
452 ns_ctx
->u
.aio
.events
= calloc(g_queue_depth
, sizeof(struct io_event
));
453 if (!ns_ctx
->u
.aio
.events
) {
456 ns_ctx
->u
.aio
.ctx
= 0;
457 if (io_setup(g_queue_depth
, &ns_ctx
->u
.aio
.ctx
) < 0) {
458 free(ns_ctx
->u
.aio
.events
);
466 aio_cleanup_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
468 io_destroy(ns_ctx
->u
.aio
.ctx
);
469 free(ns_ctx
->u
.aio
.events
);
472 static const struct ns_fn_table aio_fn_table
= {
473 .setup_payload
= aio_setup_payload
,
474 .submit_io
= aio_submit_io
,
475 .check_io
= aio_check_io
,
476 .verify_io
= aio_verify_io
,
477 .init_ns_worker_ctx
= aio_init_ns_worker_ctx
,
478 .cleanup_ns_worker_ctx
= aio_cleanup_ns_worker_ctx
,
481 #endif /* HAVE_LIBAIO */
483 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING)
486 register_file(const char *path
)
488 struct ns_entry
*entry
;
494 if (g_rw_percentage
== 100) {
496 } else if (g_rw_percentage
== 0) {
504 fd
= open(path
, flags
);
506 fprintf(stderr
, "Could not open device %s: %s\n", path
, strerror(errno
));
510 size
= spdk_fd_get_size(fd
);
512 fprintf(stderr
, "Could not determine size of device %s\n", path
);
517 blklen
= spdk_fd_get_blocklen(fd
);
519 fprintf(stderr
, "Could not determine block size of device %s\n", path
);
525 * TODO: This should really calculate the LCM of the current g_io_align and blklen.
526 * For now, it's fairly safe to just assume all block sizes are powers of 2.
528 if (g_io_align
< blklen
) {
532 entry
= malloc(sizeof(struct ns_entry
));
535 perror("ns_entry malloc");
540 #ifdef SPDK_CONFIG_URING
541 entry
->type
= ENTRY_TYPE_URING_FILE
;
542 entry
->fn_table
= &uring_fn_table
;
543 entry
->u
.uring
.fd
= fd
;
547 entry
->type
= ENTRY_TYPE_AIO_FILE
;
548 entry
->fn_table
= &aio_fn_table
;
549 entry
->u
.aio
.fd
= fd
;
552 entry
->size_in_ios
= size
/ g_io_size_bytes
;
553 entry
->io_size_blocks
= g_io_size_bytes
/ blklen
;
555 snprintf(entry
->name
, sizeof(entry
->name
), "%s", path
);
558 entry
->next
= g_namespaces
;
559 g_namespaces
= entry
;
565 register_files(int argc
, char **argv
)
569 /* Treat everything after the options as files for AIO/URING */
570 for (i
= g_file_optind
; i
< argc
; i
++) {
571 if (register_file(argv
[i
]) != 0) {
580 static void io_complete(void *ctx
, const struct spdk_nvme_cpl
*cpl
);
583 nvme_setup_payload(struct perf_task
*task
, uint8_t pattern
)
585 uint32_t max_io_size_bytes
, max_io_md_size
;
587 /* maximum extended lba format size from all active namespace,
588 * it's same with g_io_size_bytes for namespace without metadata.
590 max_io_size_bytes
= g_io_size_bytes
+ g_max_io_md_size
* g_max_io_size_blocks
;
591 task
->iov
.iov_base
= spdk_dma_zmalloc(max_io_size_bytes
, g_io_align
, NULL
);
592 task
->iov
.iov_len
= max_io_size_bytes
;
593 if (task
->iov
.iov_base
== NULL
) {
594 fprintf(stderr
, "task->buf spdk_dma_zmalloc failed\n");
597 memset(task
->iov
.iov_base
, pattern
, task
->iov
.iov_len
);
599 max_io_md_size
= g_max_io_md_size
* g_max_io_size_blocks
;
600 if (max_io_md_size
!= 0) {
601 task
->md_iov
.iov_base
= spdk_dma_zmalloc(max_io_md_size
, g_io_align
, NULL
);
602 task
->md_iov
.iov_len
= max_io_md_size
;
603 if (task
->md_iov
.iov_base
== NULL
) {
604 fprintf(stderr
, "task->md_buf spdk_dma_zmalloc failed\n");
605 spdk_dma_free(task
->iov
.iov_base
);
612 nvme_submit_io(struct perf_task
*task
, struct ns_worker_ctx
*ns_ctx
,
613 struct ns_entry
*entry
, uint64_t offset_in_ios
)
623 } mode
= DIF_MODE_NONE
;
625 lba
= offset_in_ios
* entry
->io_size_blocks
;
627 if (entry
->md_size
!= 0 && !(entry
->io_flags
& SPDK_NVME_IO_FLAGS_PRACT
)) {
628 if (entry
->md_interleave
) {
635 qp_num
= ns_ctx
->u
.nvme
.last_qpair
;
636 ns_ctx
->u
.nvme
.last_qpair
++;
637 if (ns_ctx
->u
.nvme
.last_qpair
== ns_ctx
->u
.nvme
.num_active_qpairs
) {
638 ns_ctx
->u
.nvme
.last_qpair
= 0;
641 if (mode
!= DIF_MODE_NONE
) {
642 rc
= spdk_dif_ctx_init(&task
->dif_ctx
, entry
->block_size
, entry
->md_size
,
643 entry
->md_interleave
, entry
->pi_loc
,
644 (enum spdk_dif_type
)entry
->pi_type
, entry
->io_flags
,
645 lba
, 0xFFFF, (uint16_t)entry
->io_size_blocks
, 0, 0);
647 fprintf(stderr
, "Initialization of DIF context failed\n");
653 return spdk_nvme_ns_cmd_read_with_md(entry
->u
.nvme
.ns
, ns_ctx
->u
.nvme
.qpair
[qp_num
],
654 task
->iov
.iov_base
, task
->md_iov
.iov_base
,
656 entry
->io_size_blocks
, io_complete
,
657 task
, entry
->io_flags
,
658 task
->dif_ctx
.apptag_mask
, task
->dif_ctx
.app_tag
);
662 rc
= spdk_dif_generate(&task
->iov
, 1, entry
->io_size_blocks
, &task
->dif_ctx
);
664 fprintf(stderr
, "Generation of DIF failed\n");
669 rc
= spdk_dix_generate(&task
->iov
, 1, &task
->md_iov
, entry
->io_size_blocks
,
672 fprintf(stderr
, "Generation of DIX failed\n");
680 return spdk_nvme_ns_cmd_write_with_md(entry
->u
.nvme
.ns
, ns_ctx
->u
.nvme
.qpair
[qp_num
],
681 task
->iov
.iov_base
, task
->md_iov
.iov_base
,
683 entry
->io_size_blocks
, io_complete
,
684 task
, entry
->io_flags
,
685 task
->dif_ctx
.apptag_mask
, task
->dif_ctx
.app_tag
);
/* Poll-group disconnect callback: this tool takes no action on disconnect. */
static void
perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx)
{
}
696 nvme_check_io(struct ns_worker_ctx
*ns_ctx
)
700 rc
= spdk_nvme_poll_group_process_completions(ns_ctx
->u
.nvme
.group
, 0, perf_disconnect_cb
);
702 fprintf(stderr
, "NVMe io qpair process completion error\n");
708 nvme_verify_io(struct perf_task
*task
, struct ns_entry
*entry
)
710 struct spdk_dif_error err_blk
= {};
713 if (!task
->is_read
|| (entry
->io_flags
& SPDK_NVME_IO_FLAGS_PRACT
)) {
717 if (entry
->md_interleave
) {
718 rc
= spdk_dif_verify(&task
->iov
, 1, entry
->io_size_blocks
, &task
->dif_ctx
,
721 fprintf(stderr
, "DIF error detected. type=%d, offset=%" PRIu32
"\n",
722 err_blk
.err_type
, err_blk
.err_offset
);
725 rc
= spdk_dix_verify(&task
->iov
, 1, &task
->md_iov
, entry
->io_size_blocks
,
726 &task
->dif_ctx
, &err_blk
);
728 fprintf(stderr
, "DIX error detected. type=%d, offset=%" PRIu32
"\n",
729 err_blk
.err_type
, err_blk
.err_offset
);
735 * TODO: If a controller has multiple namespaces, they could all use the same queue.
736 * For now, give each namespace/thread combination its own queue.
739 nvme_init_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
741 struct spdk_nvme_io_qpair_opts opts
;
742 struct ns_entry
*entry
= ns_ctx
->entry
;
743 struct spdk_nvme_poll_group
*group
;
744 struct spdk_nvme_qpair
*qpair
;
747 ns_ctx
->u
.nvme
.num_active_qpairs
= g_nr_io_queues_per_ns
;
748 ns_ctx
->u
.nvme
.num_all_qpairs
= g_nr_io_queues_per_ns
+ g_nr_unused_io_queues
;
749 ns_ctx
->u
.nvme
.qpair
= calloc(ns_ctx
->u
.nvme
.num_all_qpairs
, sizeof(struct spdk_nvme_qpair
*));
750 if (!ns_ctx
->u
.nvme
.qpair
) {
754 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry
->u
.nvme
.ctrlr
, &opts
, sizeof(opts
));
755 if (opts
.io_queue_requests
< entry
->num_io_requests
) {
756 opts
.io_queue_requests
= entry
->num_io_requests
;
758 opts
.delay_cmd_submit
= true;
759 opts
.create_only
= true;
761 ns_ctx
->u
.nvme
.group
= spdk_nvme_poll_group_create(NULL
);
762 if (ns_ctx
->u
.nvme
.group
== NULL
) {
763 goto poll_group_failed
;
766 group
= ns_ctx
->u
.nvme
.group
;
767 for (i
= 0; i
< ns_ctx
->u
.nvme
.num_all_qpairs
; i
++) {
768 ns_ctx
->u
.nvme
.qpair
[i
] = spdk_nvme_ctrlr_alloc_io_qpair(entry
->u
.nvme
.ctrlr
, &opts
,
770 qpair
= ns_ctx
->u
.nvme
.qpair
[i
];
772 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
776 if (spdk_nvme_poll_group_add(group
, qpair
)) {
777 printf("ERROR: unable to add I/O qpair to poll group.\n");
778 spdk_nvme_ctrlr_free_io_qpair(qpair
);
782 if (spdk_nvme_ctrlr_connect_io_qpair(entry
->u
.nvme
.ctrlr
, qpair
)) {
783 printf("ERROR: unable to connect I/O qpair.\n");
784 spdk_nvme_poll_group_remove(group
, qpair
);
785 spdk_nvme_ctrlr_free_io_qpair(qpair
);
794 spdk_nvme_poll_group_remove(ns_ctx
->u
.nvme
.group
, ns_ctx
->u
.nvme
.qpair
[i
- 1]);
795 spdk_nvme_ctrlr_free_io_qpair(ns_ctx
->u
.nvme
.qpair
[i
- 1]);
798 spdk_nvme_poll_group_destroy(ns_ctx
->u
.nvme
.group
);
800 free(ns_ctx
->u
.nvme
.qpair
);
805 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
809 for (i
= 0; i
< ns_ctx
->u
.nvme
.num_all_qpairs
; i
++) {
810 spdk_nvme_poll_group_remove(ns_ctx
->u
.nvme
.group
, ns_ctx
->u
.nvme
.qpair
[i
]);
811 spdk_nvme_ctrlr_free_io_qpair(ns_ctx
->u
.nvme
.qpair
[i
]);
814 spdk_nvme_poll_group_destroy(ns_ctx
->u
.nvme
.group
);
815 free(ns_ctx
->u
.nvme
.qpair
);
818 static const struct ns_fn_table nvme_fn_table
= {
819 .setup_payload
= nvme_setup_payload
,
820 .submit_io
= nvme_submit_io
,
821 .check_io
= nvme_check_io
,
822 .verify_io
= nvme_verify_io
,
823 .init_ns_worker_ctx
= nvme_init_ns_worker_ctx
,
824 .cleanup_ns_worker_ctx
= nvme_cleanup_ns_worker_ctx
,
828 build_nvme_name(char *name
, size_t length
, struct spdk_nvme_ctrlr
*ctrlr
)
830 const struct spdk_nvme_transport_id
*trid
;
833 trid
= spdk_nvme_ctrlr_get_transport_id(ctrlr
);
835 switch (trid
->trtype
) {
836 case SPDK_NVME_TRANSPORT_PCIE
:
837 res
= snprintf(name
, length
, "PCIE (%s)", trid
->traddr
);
839 case SPDK_NVME_TRANSPORT_RDMA
:
840 res
= snprintf(name
, length
, "RDMA (addr:%s subnqn:%s)", trid
->traddr
, trid
->subnqn
);
842 case SPDK_NVME_TRANSPORT_TCP
:
843 res
= snprintf(name
, length
, "TCP (addr:%s subnqn:%s)", trid
->traddr
, trid
->subnqn
);
847 fprintf(stderr
, "Unknown transport type %d\n", trid
->trtype
);
/*
 * Format "<controller name> NSID <n>" into `name`. The NSID suffix is only
 * appended when the controller-name part was written successfully.
 */
static void
build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
{
	int res = 0;

	res = build_nvme_name(name, length, ctrlr);
	if (res > 0) {
		snprintf(name + res, length - res, " NSID %u", nsid);
	}
}
866 register_ns(struct spdk_nvme_ctrlr
*ctrlr
, struct spdk_nvme_ns
*ns
)
868 struct ns_entry
*entry
;
869 const struct spdk_nvme_ctrlr_data
*cdata
;
870 uint32_t max_xfer_size
, entries
, sector_size
;
872 struct spdk_nvme_io_qpair_opts opts
;
874 cdata
= spdk_nvme_ctrlr_get_data(ctrlr
);
876 if (!spdk_nvme_ns_is_active(ns
)) {
877 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
878 cdata
->mn
, cdata
->sn
,
879 spdk_nvme_ns_get_id(ns
));
884 ns_size
= spdk_nvme_ns_get_size(ns
);
885 sector_size
= spdk_nvme_ns_get_sector_size(ns
);
887 if (ns_size
< g_io_size_bytes
|| sector_size
> g_io_size_bytes
) {
888 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
889 "ns size %" PRIu64
" / block size %u for I/O size %u\n",
890 cdata
->mn
, cdata
->sn
, spdk_nvme_ns_get_id(ns
),
891 ns_size
, spdk_nvme_ns_get_sector_size(ns
), g_io_size_bytes
);
896 max_xfer_size
= spdk_nvme_ns_get_max_io_xfer_size(ns
);
897 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr
, &opts
, sizeof(opts
));
898 /* NVMe driver may add additional entries based on
899 * stripe size and maximum transfer size, we assume
900 * 1 more entry be used for stripe.
902 entries
= (g_io_size_bytes
- 1) / max_xfer_size
+ 2;
903 if ((g_queue_depth
* entries
) > opts
.io_queue_size
) {
904 printf("controller IO queue size %u less than required\n",
906 printf("Consider using lower queue depth or small IO size because "
907 "IO requests may be queued at the NVMe driver.\n");
909 /* For requests which have children requests, parent request itself
910 * will also occupy 1 entry.
914 entry
= calloc(1, sizeof(struct ns_entry
));
916 perror("ns_entry malloc");
920 entry
->type
= ENTRY_TYPE_NVME_NS
;
921 entry
->fn_table
= &nvme_fn_table
;
922 entry
->u
.nvme
.ctrlr
= ctrlr
;
923 entry
->u
.nvme
.ns
= ns
;
924 entry
->num_io_requests
= g_queue_depth
* entries
;
926 entry
->size_in_ios
= ns_size
/ g_io_size_bytes
;
927 entry
->io_size_blocks
= g_io_size_bytes
/ sector_size
;
929 entry
->block_size
= spdk_nvme_ns_get_extended_sector_size(ns
);
930 entry
->md_size
= spdk_nvme_ns_get_md_size(ns
);
931 entry
->md_interleave
= spdk_nvme_ns_supports_extended_lba(ns
);
932 entry
->pi_loc
= spdk_nvme_ns_get_data(ns
)->dps
.md_start
;
933 entry
->pi_type
= spdk_nvme_ns_get_pi_type(ns
);
935 if (spdk_nvme_ns_get_flags(ns
) & SPDK_NVME_NS_DPS_PI_SUPPORTED
) {
936 entry
->io_flags
= g_metacfg_pract_flag
| g_metacfg_prchk_flags
;
939 /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write),
940 * and so reduce metadata size from block size. (If metadata size > 8 bytes,
941 * PI is passed (read) or replaced (write). So block size is not necessary
944 if ((entry
->io_flags
& SPDK_NVME_IO_FLAGS_PRACT
) && (entry
->md_size
== 8)) {
945 entry
->block_size
= spdk_nvme_ns_get_sector_size(ns
);
948 if (g_max_io_md_size
< entry
->md_size
) {
949 g_max_io_md_size
= entry
->md_size
;
952 if (g_max_io_size_blocks
< entry
->io_size_blocks
) {
953 g_max_io_size_blocks
= entry
->io_size_blocks
;
956 build_nvme_ns_name(entry
->name
, sizeof(entry
->name
), ctrlr
, spdk_nvme_ns_get_id(ns
));
959 entry
->next
= g_namespaces
;
960 g_namespaces
= entry
;
964 unregister_namespaces(void)
966 struct ns_entry
*entry
= g_namespaces
;
969 struct ns_entry
*next
= entry
->next
;
976 enable_latency_tracking_complete(void *cb_arg
, const struct spdk_nvme_cpl
*cpl
)
978 if (spdk_nvme_cpl_is_error(cpl
)) {
979 printf("enable_latency_tracking_complete failed\n");
981 g_outstanding_commands
--;
985 set_latency_tracking_feature(struct spdk_nvme_ctrlr
*ctrlr
, bool enable
)
988 union spdk_nvme_intel_feat_latency_tracking latency_tracking
;
991 latency_tracking
.bits
.enable
= 0x01;
993 latency_tracking
.bits
.enable
= 0x00;
996 res
= spdk_nvme_ctrlr_cmd_set_feature(ctrlr
, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING
,
997 latency_tracking
.raw
, 0, NULL
, 0, enable_latency_tracking_complete
, NULL
);
999 printf("fail to allocate nvme request.\n");
1002 g_outstanding_commands
++;
1004 while (g_outstanding_commands
) {
1005 spdk_nvme_ctrlr_process_admin_completions(ctrlr
);
1010 register_ctrlr(struct spdk_nvme_ctrlr
*ctrlr
, struct trid_entry
*trid_entry
)
1012 struct spdk_nvme_ns
*ns
;
1013 struct ctrlr_entry
*entry
= malloc(sizeof(struct ctrlr_entry
));
1016 if (entry
== NULL
) {
1017 perror("ctrlr_entry malloc");
1021 entry
->latency_page
= spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page
),
1023 if (entry
->latency_page
== NULL
) {
1024 printf("Allocation error (latency page)\n");
1028 build_nvme_name(entry
->name
, sizeof(entry
->name
), ctrlr
);
1030 entry
->ctrlr
= ctrlr
;
1031 entry
->trtype
= trid_entry
->trid
.trtype
;
1032 entry
->next
= g_controllers
;
1033 g_controllers
= entry
;
1035 if (g_latency_ssd_tracking_enable
&&
1036 spdk_nvme_ctrlr_is_feature_supported(ctrlr
, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING
)) {
1037 set_latency_tracking_feature(ctrlr
, true);
1040 if (trid_entry
->nsid
== 0) {
1041 for (nsid
= spdk_nvme_ctrlr_get_first_active_ns(ctrlr
);
1042 nsid
!= 0; nsid
= spdk_nvme_ctrlr_get_next_active_ns(ctrlr
, nsid
)) {
1043 ns
= spdk_nvme_ctrlr_get_ns(ctrlr
, nsid
);
1047 register_ns(ctrlr
, ns
);
1050 ns
= spdk_nvme_ctrlr_get_ns(ctrlr
, trid_entry
->nsid
);
1052 perror("Namespace does not exist.");
1056 register_ns(ctrlr
, ns
);
/* Per-thread PRNG state for rand_r(); thread-local so each worker core
 * generates random offsets without sharing (or locking) RNG state. */
static __thread unsigned int seed = 0;
1063 submit_single_io(struct perf_task
*task
)
1065 uint64_t offset_in_ios
;
1067 struct ns_worker_ctx
*ns_ctx
= task
->ns_ctx
;
1068 struct ns_entry
*entry
= ns_ctx
->entry
;
1071 offset_in_ios
= rand_r(&seed
) % entry
->size_in_ios
;
1073 offset_in_ios
= ns_ctx
->offset_in_ios
++;
1074 if (ns_ctx
->offset_in_ios
== entry
->size_in_ios
) {
1075 ns_ctx
->offset_in_ios
= 0;
1079 task
->submit_tsc
= spdk_get_ticks();
1081 if ((g_rw_percentage
== 100) ||
1082 (g_rw_percentage
!= 0 && ((rand_r(&seed
) % 100) < g_rw_percentage
))) {
1083 task
->is_read
= true;
1085 task
->is_read
= false;
1088 rc
= entry
->fn_table
->submit_io(task
, ns_ctx
, entry
, offset_in_ios
);
1090 if (spdk_unlikely(rc
!= 0)) {
1091 fprintf(stderr
, "starting I/O failed\n");
1093 ns_ctx
->current_queue_depth
++;
1098 task_complete(struct perf_task
*task
)
1100 struct ns_worker_ctx
*ns_ctx
;
1102 struct ns_entry
*entry
;
1104 ns_ctx
= task
->ns_ctx
;
1105 entry
= ns_ctx
->entry
;
1106 ns_ctx
->current_queue_depth
--;
1107 ns_ctx
->io_completed
++;
1108 tsc_diff
= spdk_get_ticks() - task
->submit_tsc
;
1109 ns_ctx
->total_tsc
+= tsc_diff
;
1110 if (spdk_unlikely(ns_ctx
->min_tsc
> tsc_diff
)) {
1111 ns_ctx
->min_tsc
= tsc_diff
;
1113 if (spdk_unlikely(ns_ctx
->max_tsc
< tsc_diff
)) {
1114 ns_ctx
->max_tsc
= tsc_diff
;
1116 if (spdk_unlikely(g_latency_sw_tracking_level
> 0)) {
1117 spdk_histogram_data_tally(ns_ctx
->histogram
, tsc_diff
);
1120 if (spdk_unlikely(entry
->md_size
> 0)) {
1121 /* add application level verification for end-to-end data protection */
1122 entry
->fn_table
->verify_io(task
, entry
);
1126 * is_draining indicates when time has expired for the test run
1127 * and we are just waiting for the previously submitted I/O
1128 * to complete. In this case, do not submit a new I/O to replace
1129 * the one just completed.
1131 if (spdk_unlikely(ns_ctx
->is_draining
)) {
1132 spdk_dma_free(task
->iov
.iov_base
);
1133 spdk_dma_free(task
->md_iov
.iov_base
);
1136 submit_single_io(task
);
1141 io_complete(void *ctx
, const struct spdk_nvme_cpl
*cpl
)
1143 struct perf_task
*task
= ctx
;
1145 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl
))) {
1146 fprintf(stderr
, "%s completed with error (sct=%d, sc=%d)\n",
1147 task
->is_read
? "Read" : "Write",
1148 cpl
->status
.sct
, cpl
->status
.sc
);
1151 task_complete(task
);
1154 static struct perf_task
*
1155 allocate_task(struct ns_worker_ctx
*ns_ctx
, int queue_depth
)
1157 struct perf_task
*task
;
1159 task
= calloc(1, sizeof(*task
));
1161 fprintf(stderr
, "Out of memory allocating tasks\n");
1165 ns_ctx
->entry
->fn_table
->setup_payload(task
, queue_depth
% 8 + 1);
1167 task
->ns_ctx
= ns_ctx
;
/*
 * Prime @ns_ctx with @queue_depth in-flight I/Os; each task resubmits
 * itself from task_complete() to maintain this depth for the whole run.
 */
static void
submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	while (queue_depth-- > 0) {
		task = allocate_task(ns_ctx, queue_depth);
		submit_single_io(task);
	}
}
1184 init_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
1186 return ns_ctx
->entry
->fn_table
->init_ns_worker_ctx(ns_ctx
);
1190 cleanup_ns_worker_ctx(struct ns_worker_ctx
*ns_ctx
)
1192 ns_ctx
->entry
->fn_table
->cleanup_ns_worker_ctx(ns_ctx
);
1196 print_periodic_performance(void)
1198 uint64_t io_this_second
;
1199 double mb_this_second
;
1200 struct worker_thread
*worker
;
1201 struct ns_worker_ctx
*ns_ctx
;
1203 if (!isatty(STDOUT_FILENO
)) {
1204 /* Don't print periodic stats if output is not going
1213 ns_ctx
= worker
->ns_ctx
;
1215 io_this_second
+= ns_ctx
->io_completed
- ns_ctx
->last_io_completed
;
1216 ns_ctx
->last_io_completed
= ns_ctx
->io_completed
;
1217 ns_ctx
= ns_ctx
->next
;
1219 worker
= worker
->next
;
1222 mb_this_second
= (double)io_this_second
* g_io_size_bytes
/ (1024 * 1024);
1223 printf("%9ju IOPS, %8.2f MiB/s\r", io_this_second
, mb_this_second
);
1230 uint64_t tsc_end
, tsc_current
, tsc_next_print
;
1231 struct worker_thread
*worker
= (struct worker_thread
*)arg
;
1232 struct ns_worker_ctx
*ns_ctx
= NULL
;
1233 uint32_t unfinished_ns_ctx
;
1235 /* Allocate queue pairs for each namespace. */
1236 ns_ctx
= worker
->ns_ctx
;
1237 while (ns_ctx
!= NULL
) {
1238 if (init_ns_worker_ctx(ns_ctx
) != 0) {
1239 printf("ERROR: init_ns_worker_ctx() failed\n");
1242 ns_ctx
= ns_ctx
->next
;
1245 tsc_current
= spdk_get_ticks();
1246 tsc_end
= tsc_current
+ g_time_in_sec
* g_tsc_rate
;
1247 tsc_next_print
= tsc_current
+ g_tsc_rate
;
1249 /* Submit initial I/O for each namespace. */
1250 ns_ctx
= worker
->ns_ctx
;
1251 while (ns_ctx
!= NULL
) {
1252 submit_io(ns_ctx
, g_queue_depth
);
1253 ns_ctx
= ns_ctx
->next
;
1258 * Check for completed I/O for each controller. A new
1259 * I/O will be submitted in the io_complete callback
1260 * to replace each I/O that is completed.
1262 ns_ctx
= worker
->ns_ctx
;
1263 while (ns_ctx
!= NULL
) {
1264 ns_ctx
->entry
->fn_table
->check_io(ns_ctx
);
1265 ns_ctx
= ns_ctx
->next
;
1268 tsc_current
= spdk_get_ticks();
1270 if (worker
->lcore
== g_master_core
&& tsc_current
> tsc_next_print
) {
1271 tsc_next_print
+= g_tsc_rate
;
1272 print_periodic_performance();
1275 if (tsc_current
> tsc_end
) {
1280 /* drain the io of each ns_ctx in round robin to make the fairness */
1282 unfinished_ns_ctx
= 0;
1283 ns_ctx
= worker
->ns_ctx
;
1284 while (ns_ctx
!= NULL
) {
1285 /* first time will enter into this if case */
1286 if (!ns_ctx
->is_draining
) {
1287 ns_ctx
->is_draining
= true;
1290 if (ns_ctx
->current_queue_depth
> 0) {
1291 ns_ctx
->entry
->fn_table
->check_io(ns_ctx
);
1292 if (ns_ctx
->current_queue_depth
== 0) {
1293 cleanup_ns_worker_ctx(ns_ctx
);
1295 unfinished_ns_ctx
++;
1298 ns_ctx
= ns_ctx
->next
;
1300 } while (unfinished_ns_ctx
> 0);
1305 static void usage(char *program_name
)
1307 printf("%s options", program_name
);
1308 #if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO)
1309 printf(" [Kernel device(s)]...");
1312 printf("\t[-q io depth]\n");
1313 printf("\t[-o io size in bytes]\n");
1314 printf("\t[-P number of io queues per namespace. default: 1]\n");
1315 printf("\t[-U number of unused io queues per controller. default: 0]\n");
1316 printf("\t[-w io pattern type, must be one of\n");
1317 printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
1318 printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
1319 printf("\t[-L enable latency tracking via sw, default: disabled]\n");
1320 printf("\t\t-L for latency summary, -LL for detailed histogram\n");
1321 printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n");
1322 printf("\t[-t time in seconds]\n");
1323 printf("\t[-c core mask for I/O submission/completion.]\n");
1324 printf("\t\t(default: 1)\n");
1325 printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n");
1326 printf("\t[-H enable header digest for TCP transport, default: disabled]\n");
1327 printf("\t[-I enable data digest for TCP transport, default: disabled]\n");
1328 printf("\t[-N no shutdown notification process for controllers, default: disabled]\n");
1329 printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n");
1330 printf("\t Format: 'key:value [key:value] ...'\n");
1331 printf("\t Keys:\n");
1332 printf("\t trtype Transport type (e.g. PCIe, RDMA)\n");
1333 printf("\t adrfam Address family (e.g. IPv4, IPv6)\n");
1334 printf("\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
1335 printf("\t trsvcid Transport service identifier (e.g. 4420)\n");
1336 printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN
);
1337 printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
1338 printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
1339 printf("\t[-e metadata configuration]\n");
1340 printf("\t Keys:\n");
1341 printf("\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n");
1342 printf("\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
1343 printf("\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
1344 printf("\t -e 'PRACT=1,PRCHK=GUARD'\n");
1345 printf("\t[-k keep alive timeout period in millisecond]\n");
1346 printf("\t[-s DPDK huge memory size in MB.]\n");
1347 printf("\t[-C max completions per poll]\n");
1348 printf("\t\t(default: 0 - unlimited)\n");
1349 printf("\t[-i shared memory group ID]\n");
1351 spdk_log_usage(stdout
, "-T");
1352 #ifdef SPDK_CONFIG_URING
1353 printf("\t[-R enable using liburing to drive kernel devices (Default: libaio)]\n");
1356 printf("\t[-G enable debug logging]\n");
1358 printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n");
1363 check_cutoff(void *ctx
, uint64_t start
, uint64_t end
, uint64_t count
,
1364 uint64_t total
, uint64_t so_far
)
1367 double **cutoff
= ctx
;
1373 so_far_pct
= (double)so_far
/ total
;
1374 while (so_far_pct
>= **cutoff
&& **cutoff
> 0) {
1375 printf("%9.5f%% : %9.3fus\n", **cutoff
* 100, (double)end
* 1000 * 1000 / g_tsc_rate
);
1381 print_bucket(void *ctx
, uint64_t start
, uint64_t end
, uint64_t count
,
1382 uint64_t total
, uint64_t so_far
)
1390 so_far_pct
= (double)so_far
* 100 / total
;
1391 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
1392 (double)start
* 1000 * 1000 / g_tsc_rate
,
1393 (double)end
* 1000 * 1000 / g_tsc_rate
,
1398 print_performance(void)
1400 uint64_t total_io_completed
, total_io_tsc
;
1401 double io_per_second
, mb_per_second
, average_latency
, min_latency
, max_latency
;
1402 double sum_ave_latency
, min_latency_so_far
, max_latency_so_far
;
1403 double total_io_per_second
, total_mb_per_second
;
1405 struct worker_thread
*worker
;
1406 struct ns_worker_ctx
*ns_ctx
;
1407 uint32_t max_strlen
;
1409 total_io_per_second
= 0;
1410 total_mb_per_second
= 0;
1411 total_io_completed
= 0;
1413 min_latency_so_far
= (double)UINT64_MAX
;
1414 max_latency_so_far
= 0;
1420 ns_ctx
= worker
->ns_ctx
;
1422 max_strlen
= spdk_max(strlen(ns_ctx
->entry
->name
), max_strlen
);
1423 ns_ctx
= ns_ctx
->next
;
1425 worker
= worker
->next
;
1428 printf("========================================================\n");
1429 printf("%*s\n", max_strlen
+ 60, "Latency(us)");
1430 printf("%-*s: %10s %10s %10s %10s %10s\n",
1431 max_strlen
+ 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max");
1435 ns_ctx
= worker
->ns_ctx
;
1437 if (ns_ctx
->io_completed
!= 0) {
1438 io_per_second
= (double)ns_ctx
->io_completed
/ g_time_in_sec
;
1439 mb_per_second
= io_per_second
* g_io_size_bytes
/ (1024 * 1024);
1440 average_latency
= ((double)ns_ctx
->total_tsc
/ ns_ctx
->io_completed
) * 1000 * 1000 / g_tsc_rate
;
1441 min_latency
= (double)ns_ctx
->min_tsc
* 1000 * 1000 / g_tsc_rate
;
1442 if (min_latency
< min_latency_so_far
) {
1443 min_latency_so_far
= min_latency
;
1446 max_latency
= (double)ns_ctx
->max_tsc
* 1000 * 1000 / g_tsc_rate
;
1447 if (max_latency
> max_latency_so_far
) {
1448 max_latency_so_far
= max_latency
;
1451 printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1452 max_strlen
, max_strlen
, ns_ctx
->entry
->name
, worker
->lcore
,
1453 io_per_second
, mb_per_second
,
1454 average_latency
, min_latency
, max_latency
);
1455 total_io_per_second
+= io_per_second
;
1456 total_mb_per_second
+= mb_per_second
;
1457 total_io_completed
+= ns_ctx
->io_completed
;
1458 total_io_tsc
+= ns_ctx
->total_tsc
;
1461 ns_ctx
= ns_ctx
->next
;
1463 worker
= worker
->next
;
1466 if (ns_count
!= 0 && total_io_completed
) {
1467 sum_ave_latency
= ((double)total_io_tsc
/ total_io_completed
) * 1000 * 1000 / g_tsc_rate
;
1468 printf("========================================================\n");
1469 printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
1470 max_strlen
+ 13, "Total", total_io_per_second
, total_mb_per_second
,
1471 sum_ave_latency
, min_latency_so_far
, max_latency_so_far
);
1475 if (g_latency_sw_tracking_level
== 0 || total_io_completed
== 0) {
1481 ns_ctx
= worker
->ns_ctx
;
1483 const double *cutoff
= g_latency_cutoffs
;
1485 printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx
->entry
->name
, worker
->lcore
);
1486 printf("=================================================================================\n");
1488 spdk_histogram_data_iterate(ns_ctx
->histogram
, check_cutoff
, &cutoff
);
1491 ns_ctx
= ns_ctx
->next
;
1493 worker
= worker
->next
;
1496 if (g_latency_sw_tracking_level
== 1) {
1502 ns_ctx
= worker
->ns_ctx
;
1504 printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx
->entry
->name
, worker
->lcore
);
1505 printf("==============================================================================\n");
1506 printf(" Range in us Cumulative IO count\n");
1508 spdk_histogram_data_iterate(ns_ctx
->histogram
, print_bucket
, NULL
);
1510 ns_ctx
= ns_ctx
->next
;
1512 worker
= worker
->next
;
1518 print_latency_page(struct ctrlr_entry
*entry
)
1523 printf("%s\n", entry
->name
);
1524 printf("--------------------------------------------------------\n");
1526 for (i
= 0; i
< 32; i
++) {
1527 if (entry
->latency_page
->buckets_32us
[i
]) {
1528 printf("Bucket %dus - %dus: %d\n", i
* 32, (i
+ 1) * 32, entry
->latency_page
->buckets_32us
[i
]);
1531 for (i
= 0; i
< 31; i
++) {
1532 if (entry
->latency_page
->buckets_1ms
[i
]) {
1533 printf("Bucket %dms - %dms: %d\n", i
+ 1, i
+ 2, entry
->latency_page
->buckets_1ms
[i
]);
1536 for (i
= 0; i
< 31; i
++) {
1537 if (entry
->latency_page
->buckets_32ms
[i
])
1538 printf("Bucket %dms - %dms: %d\n", (i
+ 1) * 32, (i
+ 2) * 32,
1539 entry
->latency_page
->buckets_32ms
[i
]);
1544 print_latency_statistics(const char *op_name
, enum spdk_nvme_intel_log_page log_page
)
1546 struct ctrlr_entry
*ctrlr
;
1548 printf("%s Latency Statistics:\n", op_name
);
1549 printf("========================================================\n");
1550 ctrlr
= g_controllers
;
1552 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr
->ctrlr
, log_page
)) {
1553 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr
->ctrlr
, log_page
, SPDK_NVME_GLOBAL_NS_TAG
,
1554 ctrlr
->latency_page
, sizeof(struct spdk_nvme_intel_rw_latency_page
), 0,
1555 enable_latency_tracking_complete
,
1557 printf("nvme_ctrlr_cmd_get_log_page() failed\n");
1561 g_outstanding_commands
++;
1563 printf("Controller %s: %s latency statistics not supported\n", ctrlr
->name
, op_name
);
1565 ctrlr
= ctrlr
->next
;
1568 while (g_outstanding_commands
) {
1569 ctrlr
= g_controllers
;
1571 spdk_nvme_ctrlr_process_admin_completions(ctrlr
->ctrlr
);
1572 ctrlr
= ctrlr
->next
;
1576 ctrlr
= g_controllers
;
1578 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr
->ctrlr
, log_page
)) {
1579 print_latency_page(ctrlr
);
1581 ctrlr
= ctrlr
->next
;
1589 print_performance();
1590 if (g_latency_ssd_tracking_enable
) {
1591 if (g_rw_percentage
!= 0) {
1592 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY
);
1594 if (g_rw_percentage
!= 100) {
1595 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY
);
1601 unregister_trids(void)
1603 struct trid_entry
*trid_entry
, *tmp
;
1605 TAILQ_FOREACH_SAFE(trid_entry
, &g_trid_list
, tailq
, tmp
) {
1606 TAILQ_REMOVE(&g_trid_list
, trid_entry
, tailq
);
1612 add_trid(const char *trid_str
)
1614 struct trid_entry
*trid_entry
;
1615 struct spdk_nvme_transport_id
*trid
;
1618 trid_entry
= calloc(1, sizeof(*trid_entry
));
1619 if (trid_entry
== NULL
) {
1623 trid
= &trid_entry
->trid
;
1624 trid
->trtype
= SPDK_NVME_TRANSPORT_PCIE
;
1625 snprintf(trid
->subnqn
, sizeof(trid
->subnqn
), "%s", SPDK_NVMF_DISCOVERY_NQN
);
1627 if (spdk_nvme_transport_id_parse(trid
, trid_str
) != 0) {
1628 fprintf(stderr
, "Invalid transport ID format '%s'\n", trid_str
);
1633 spdk_nvme_transport_id_populate_trstring(trid
,
1634 spdk_nvme_transport_id_trtype_str(trid
->trtype
));
1636 ns
= strcasestr(trid_str
, "ns:");
1638 char nsid_str
[6]; /* 5 digits maximum in an nsid */
1644 len
= strcspn(ns
, " \t\n");
1646 fprintf(stderr
, "NVMe namespace IDs must be 5 digits or less\n");
1651 memcpy(nsid_str
, ns
, len
);
1652 nsid_str
[len
] = '\0';
1654 nsid
= spdk_strtol(nsid_str
, 10);
1655 if (nsid
<= 0 || nsid
> 65535) {
1656 fprintf(stderr
, "NVMe namespace IDs must be less than 65536 and greater than 0\n");
1661 trid_entry
->nsid
= (uint16_t)nsid
;
1664 TAILQ_INSERT_TAIL(&g_trid_list
, trid_entry
, tailq
);
/*
 * Extract the next "key=value" pair from *str (keys/values separated by
 * any of ", \t\n").  Copies the NUL-terminated key and value into the
 * caller's buffers, advances *str past the consumed pair, and returns the
 * value length, or 0 on any error (missing '=', missing value, or either
 * piece too large for its buffer).
 */
static size_t
parse_next_key(const char **str, char *key, char *val, size_t key_buf_size,
	       size_t val_buf_size)
{
	const char *sep;
	const char *separator = ", \t\n";
	size_t key_len, val_len;

	/* Skip any leading separators before the key. */
	*str += strspn(*str, separator);

	sep = strchr(*str, '=');
	if (!sep) {
		fprintf(stderr, "Key without '=' separator\n");
		return 0;
	}

	key_len = sep - *str;
	if (key_len >= key_buf_size) {
		fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n",
			key_len, key_buf_size - 1);
		return 0;
	}

	memcpy(key, *str, key_len);
	key[key_len] = '\0';

	*str += key_len + 1; /* Skip key */
	val_len = strcspn(*str, separator);
	if (val_len == 0) {
		fprintf(stderr, "Key without value\n");
		return 0;
	}

	if (val_len >= val_buf_size) {
		fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n",
			val_len, val_buf_size - 1);
		return 0;
	}

	memcpy(val, *str, val_len);
	val[val_len] = '\0';

	*str += val_len;

	return val_len;
}
1716 parse_metadata(const char *metacfg_str
)
1723 if (metacfg_str
== NULL
) {
1729 while (*str
!= '\0') {
1730 val_len
= parse_next_key(&str
, key
, val
, sizeof(key
), sizeof(val
));
1732 fprintf(stderr
, "Failed to parse metadata\n");
1736 if (strcmp(key
, "PRACT") == 0) {
1738 g_metacfg_pract_flag
= SPDK_NVME_IO_FLAGS_PRACT
;
1740 } else if (strcmp(key
, "PRCHK") == 0) {
1741 if (strstr(val
, "GUARD") != NULL
) {
1742 g_metacfg_prchk_flags
|= SPDK_NVME_IO_FLAGS_PRCHK_GUARD
;
1744 if (strstr(val
, "REFTAG") != NULL
) {
1745 g_metacfg_prchk_flags
|= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG
;
1747 if (strstr(val
, "APPTAG") != NULL
) {
1748 g_metacfg_prchk_flags
|= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG
;
1751 fprintf(stderr
, "Unknown key '%s'\n", key
);
1759 parse_args(int argc
, char **argv
)
1765 while ((op
= getopt(argc
, argv
, "c:e:i:lo:q:r:k:s:t:w:C:DGHILM:NP:RT:U:V")) != -1) {
1777 val
= spdk_strtol(optarg
, 10);
1779 fprintf(stderr
, "Converting a string to integer failed\n");
1787 g_max_completions
= val
;
1790 g_nr_io_queues_per_ns
= val
;
1793 g_io_size_bytes
= val
;
1796 g_queue_depth
= val
;
1799 g_keep_alive_timeout_in_ms
= val
;
1805 g_time_in_sec
= val
;
1808 g_rw_percentage
= val
;
1809 g_mix_specified
= true;
1812 g_nr_unused_io_queues
= val
;
1817 g_core_mask
= optarg
;
1820 if (parse_metadata(optarg
)) {
1826 g_latency_ssd_tracking_enable
= true;
1829 if (add_trid(optarg
)) {
1835 g_workload_type
= optarg
;
1838 g_disable_sq_cmb
= 1;
1842 fprintf(stderr
, "%s must be configured with --enable-debug for -G flag\n",
1847 spdk_log_set_flag("nvme");
1848 spdk_log_set_print_level(SPDK_LOG_DEBUG
);
1852 g_header_digest
= 1;
1858 g_latency_sw_tracking_level
++;
1861 g_no_shn_notification
= true;
1864 #ifndef SPDK_CONFIG_URING
1865 fprintf(stderr
, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n",
1873 rc
= spdk_log_set_flag(optarg
);
1875 fprintf(stderr
, "unknown flag\n");
1879 spdk_log_set_print_level(SPDK_LOG_DEBUG
);
1881 fprintf(stderr
, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n",
1896 if (!g_nr_io_queues_per_ns
) {
1901 if (!g_queue_depth
) {
1902 fprintf(stderr
, "missing -q (queue size) operand\n");
1906 if (!g_io_size_bytes
) {
1907 fprintf(stderr
, "missing -o (block size) operand\n");
1911 if (!g_workload_type
) {
1912 fprintf(stderr
, "missing -w (io pattern type) operand\n");
1916 if (!g_time_in_sec
) {
1917 fprintf(stderr
, "missing -t (test time in seconds) operand\n");
1922 if (strncmp(g_workload_type
, "rand", 4) == 0) {
1924 g_workload_type
= &g_workload_type
[4];
1927 if (strcmp(g_workload_type
, "read") == 0 || strcmp(g_workload_type
, "write") == 0) {
1928 g_rw_percentage
= strcmp(g_workload_type
, "read") == 0 ? 100 : 0;
1929 if (g_mix_specified
) {
1930 fprintf(stderr
, "Ignoring -M option... Please use -M option"
1931 " only when using rw or randrw.\n");
1933 } else if (strcmp(g_workload_type
, "rw") == 0) {
1934 if (g_rw_percentage
< 0 || g_rw_percentage
> 100) {
1936 "-M must be specified to value from 0 to 100 "
1937 "for rw or randrw.\n");
1942 "io pattern type must be one of\n"
1943 "(read, write, randread, randwrite, rw, randrw)\n");
1947 if (TAILQ_EMPTY(&g_trid_list
)) {
1948 /* If no transport IDs specified, default to enumerating all local PCIe devices */
1949 add_trid("trtype:PCIe");
1951 struct trid_entry
*trid_entry
, *trid_entry_tmp
;
1954 /* check whether there is local PCIe type */
1955 TAILQ_FOREACH_SAFE(trid_entry
, &g_trid_list
, tailq
, trid_entry_tmp
) {
1956 if (trid_entry
->trid
.trtype
== SPDK_NVME_TRANSPORT_PCIE
) {
1963 g_file_optind
= optind
;
1969 register_workers(void)
1972 struct worker_thread
*worker
;
1977 SPDK_ENV_FOREACH_CORE(i
) {
1978 worker
= calloc(1, sizeof(*worker
));
1979 if (worker
== NULL
) {
1980 fprintf(stderr
, "Unable to allocate worker\n");
1985 worker
->next
= g_workers
;
1994 unregister_workers(void)
1996 struct worker_thread
*worker
= g_workers
;
1998 /* Free namespace context and worker thread */
2000 struct worker_thread
*next_worker
= worker
->next
;
2001 struct ns_worker_ctx
*ns_ctx
= worker
->ns_ctx
;
2004 struct ns_worker_ctx
*next_ns_ctx
= ns_ctx
->next
;
2005 spdk_histogram_data_free(ns_ctx
->histogram
);
2007 ns_ctx
= next_ns_ctx
;
2011 worker
= next_worker
;
2016 probe_cb(void *cb_ctx
, const struct spdk_nvme_transport_id
*trid
,
2017 struct spdk_nvme_ctrlr_opts
*opts
)
2019 if (trid
->trtype
== SPDK_NVME_TRANSPORT_PCIE
) {
2020 if (g_disable_sq_cmb
) {
2021 opts
->use_cmb_sqs
= false;
2023 if (g_no_shn_notification
) {
2024 opts
->no_shn_notification
= true;
2028 /* Set io_queue_size to UINT16_MAX, NVMe driver
2029 * will then reduce this to MQES to maximize
2030 * the io_queue_size as much as possible.
2032 opts
->io_queue_size
= UINT16_MAX
;
2034 /* Set the header and data_digest */
2035 opts
->header_digest
= g_header_digest
;
2036 opts
->data_digest
= g_data_digest
;
2037 opts
->keep_alive_timeout_ms
= g_keep_alive_timeout_in_ms
;
2043 attach_cb(void *cb_ctx
, const struct spdk_nvme_transport_id
*trid
,
2044 struct spdk_nvme_ctrlr
*ctrlr
, const struct spdk_nvme_ctrlr_opts
*opts
)
2046 struct trid_entry
*trid_entry
= cb_ctx
;
2047 struct spdk_pci_addr pci_addr
;
2048 struct spdk_pci_device
*pci_dev
;
2049 struct spdk_pci_id pci_id
;
2051 if (trid
->trtype
!= SPDK_NVME_TRANSPORT_PCIE
) {
2052 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n",
2053 trid
->traddr
, trid
->trsvcid
,
2056 if (spdk_pci_addr_parse(&pci_addr
, trid
->traddr
)) {
2060 pci_dev
= spdk_nvme_ctrlr_get_pci_device(ctrlr
);
2065 pci_id
= spdk_pci_device_get_id(pci_dev
);
2067 printf("Attached to NVMe Controller at %s [%04x:%04x]\n",
2069 pci_id
.vendor_id
, pci_id
.device_id
);
2072 register_ctrlr(ctrlr
, trid_entry
);
2076 register_controllers(void)
2078 struct trid_entry
*trid_entry
;
2080 printf("Initializing NVMe Controllers\n");
2082 if (g_vmd
&& spdk_vmd_init()) {
2083 fprintf(stderr
, "Failed to initialize VMD."
2084 " Some NVMe devices can be unavailable.\n");
2087 TAILQ_FOREACH(trid_entry
, &g_trid_list
, tailq
) {
2088 if (spdk_nvme_probe(&trid_entry
->trid
, trid_entry
, probe_cb
, attach_cb
, NULL
) != 0) {
2089 fprintf(stderr
, "spdk_nvme_probe() failed for transport address '%s'\n",
2090 trid_entry
->trid
.traddr
);
2099 unregister_controllers(void)
2101 struct ctrlr_entry
*entry
= g_controllers
;
2104 struct ctrlr_entry
*next
= entry
->next
;
2105 spdk_dma_free(entry
->latency_page
);
2106 if (g_latency_ssd_tracking_enable
&&
2107 spdk_nvme_ctrlr_is_feature_supported(entry
->ctrlr
, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING
)) {
2108 set_latency_tracking_feature(entry
->ctrlr
, false);
2111 if (g_nr_unused_io_queues
) {
2114 for (i
= 0; i
< g_nr_unused_io_queues
; i
++) {
2115 spdk_nvme_ctrlr_free_io_qpair(entry
->unused_qpairs
[i
]);
2118 free(entry
->unused_qpairs
);
2121 spdk_nvme_detach(entry
->ctrlr
);
2132 associate_workers_with_ns(void)
2134 struct ns_entry
*entry
= g_namespaces
;
2135 struct worker_thread
*worker
= g_workers
;
2136 struct ns_worker_ctx
*ns_ctx
;
2139 count
= g_num_namespaces
> g_num_workers
? g_num_namespaces
: g_num_workers
;
2141 for (i
= 0; i
< count
; i
++) {
2142 if (entry
== NULL
) {
2146 ns_ctx
= calloc(1, sizeof(struct ns_worker_ctx
));
2151 printf("Associating %s with lcore %d\n", entry
->name
, worker
->lcore
);
2152 ns_ctx
->min_tsc
= UINT64_MAX
;
2153 ns_ctx
->entry
= entry
;
2154 ns_ctx
->next
= worker
->ns_ctx
;
2155 ns_ctx
->histogram
= spdk_histogram_data_alloc();
2156 worker
->ns_ctx
= ns_ctx
;
2158 worker
= worker
->next
;
2159 if (worker
== NULL
) {
2163 entry
= entry
->next
;
2164 if (entry
== NULL
) {
2165 entry
= g_namespaces
;
2174 nvme_poll_ctrlrs(void *arg
)
2176 struct ctrlr_entry
*entry
;
2179 spdk_unaffinitize_thread();
2182 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE
, &oldstate
);
2184 entry
= g_controllers
;
2186 if (entry
->trtype
!= SPDK_NVME_TRANSPORT_PCIE
) {
2187 spdk_nvme_ctrlr_process_admin_completions(entry
->ctrlr
);
2189 entry
= entry
->next
;
2192 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE
, &oldstate
);
2194 /* This is a pthread cancellation point and cannot be removed. */
2201 int main(int argc
, char **argv
)
2204 struct worker_thread
*worker
, *master_worker
;
2205 struct spdk_env_opts opts
;
2206 pthread_t thread_id
= 0;
2208 rc
= parse_args(argc
, argv
);
2213 spdk_env_opts_init(&opts
);
2215 opts
.shm_id
= g_shm_id
;
2217 opts
.core_mask
= g_core_mask
;
2221 opts
.mem_size
= g_dpdk_mem
;
2224 opts
.no_pci
= g_no_pci
;
2226 if (spdk_env_init(&opts
) < 0) {
2227 fprintf(stderr
, "Unable to initialize SPDK env\n");
2232 g_tsc_rate
= spdk_get_ticks_hz();
2234 if (register_workers() != 0) {
2239 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING)
2240 if (register_files(argc
, argv
) != 0) {
2246 if (register_controllers() != 0) {
2252 printf("WARNING: Some requested NVMe devices were skipped\n");
2255 if (g_num_namespaces
== 0) {
2256 fprintf(stderr
, "No valid NVMe controllers or AIO or URING devices found\n");
2260 rc
= pthread_create(&thread_id
, NULL
, &nvme_poll_ctrlrs
, NULL
);
2262 fprintf(stderr
, "Unable to spawn a thread to poll admin queues.\n");
2266 if (associate_workers_with_ns() != 0) {
2271 printf("Initialization complete. Launching workers.\n");
2273 /* Launch all of the slave workers */
2274 g_master_core
= spdk_env_get_current_core();
2275 master_worker
= NULL
;
2277 while (worker
!= NULL
) {
2278 if (worker
->lcore
!= g_master_core
) {
2279 spdk_env_thread_launch_pinned(worker
->lcore
, work_fn
, worker
);
2281 assert(master_worker
== NULL
);
2282 master_worker
= worker
;
2284 worker
= worker
->next
;
2287 assert(master_worker
!= NULL
);
2288 rc
= work_fn(master_worker
);
2290 spdk_env_thread_wait_all();
2295 if (thread_id
&& pthread_cancel(thread_id
) == 0) {
2296 pthread_join(thread_id
, NULL
);
2299 unregister_namespaces();
2300 unregister_controllers();
2301 unregister_workers();
2304 fprintf(stderr
, "%s: errors occured\n", argv
[0]);