4 * Copyright (c) Intel Corporation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include "bdev_uring.h"
36 #include "spdk/stdinc.h"
38 #include "spdk/barrier.h"
39 #include "spdk/bdev.h"
40 #include "spdk/conf.h"
43 #include "spdk/likely.h"
44 #include "spdk/thread.h"
45 #include "spdk/json.h"
46 #include "spdk/util.h"
47 #include "spdk/string.h"
49 #include "spdk_internal/log.h"
/* I/O channel context of a single uring bdev: it only records which group
 * channel the bdev's requests are queued on. */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel	*group_ch;
};
57 struct bdev_uring_group_channel
{
60 struct spdk_poller
*poller
;
61 struct io_uring uring
;
/* Per-I/O driver context (its size is reported by bdev_uring_get_ctx_size). */
struct bdev_uring_task {
	/* Byte count the completion result is compared against. */
	uint64_t			len;
	/* Channel the request was issued on. */
	struct bdev_uring_io_channel	*ch;
	TAILQ_ENTRY(bdev_uring_task)	link;
};
71 struct spdk_bdev bdev
;
74 TAILQ_ENTRY(bdev_uring
) link
;
/* Functions that are referenced before their definition in this file. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);

/* Every bdev created by create_uring_bdev() is linked on this list. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head;

/* Entry count passed to io_uring_queue_init() for each group channel. */
#define SPDK_URING_QUEUE_DEPTH 512
#define MAX_EVENTS_PER_POLL 32
86 bdev_uring_get_ctx_size(void)
88 return sizeof(struct bdev_uring_task
);
/*
 * Module descriptor of the uring bdev module. The initializers visible here
 * install bdev_uring_init / bdev_uring_fini as the module init and fini hooks
 * and bdev_uring_get_ctx_size as the per-I/O context size callback; the
 * descriptor is then handed to SPDK_BDEV_MODULE_REGISTER.
 * NOTE(review): this listing is a lossy extraction (embedded line numbers,
 * dropped lines such as the closing brace and possibly further fields) -
 * confirm against the real file before relying on it.
 */
91 static struct spdk_bdev_module uring_if
= {
93 .module_init
= bdev_uring_init
,
94 .module_fini
= bdev_uring_fini
,
96 .get_ctx_size
= bdev_uring_get_ctx_size
,
99 SPDK_BDEV_MODULE_REGISTER(uring
, &uring_if
)
102 bdev_uring_open(struct bdev_uring
*bdev
)
106 fd
= open(bdev
->filename
, O_NOATIME
| O_DIRECT
);
108 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
109 bdev
->filename
, errno
, spdk_strerror(errno
));
120 bdev_uring_close(struct bdev_uring
*bdev
)
124 if (bdev
->fd
== -1) {
128 rc
= close(bdev
->fd
);
130 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
131 bdev
->fd
, errno
, spdk_strerror(errno
));
141 bdev_uring_readv(struct bdev_uring
*uring
, struct spdk_io_channel
*ch
,
142 struct bdev_uring_task
*uring_task
,
143 struct iovec
*iov
, int iovcnt
, uint64_t nbytes
, uint64_t offset
)
145 struct bdev_uring_io_channel
*uring_ch
= spdk_io_channel_get_ctx(ch
);
146 struct bdev_uring_group_channel
*group_ch
= uring_ch
->group_ch
;
147 struct io_uring_sqe
*sqe
;
149 sqe
= io_uring_get_sqe(&group_ch
->uring
);
150 io_uring_prep_readv(sqe
, uring
->fd
, iov
, iovcnt
, offset
);
151 io_uring_sqe_set_data(sqe
, uring_task
);
152 uring_task
->len
= nbytes
;
153 uring_task
->ch
= uring_ch
;
155 SPDK_DEBUGLOG(SPDK_LOG_URING
, "read %d iovs size %lu to off: %#lx\n",
156 iovcnt
, nbytes
, offset
);
158 group_ch
->io_pending
++;
163 bdev_uring_writev(struct bdev_uring
*uring
, struct spdk_io_channel
*ch
,
164 struct bdev_uring_task
*uring_task
,
165 struct iovec
*iov
, int iovcnt
, size_t nbytes
, uint64_t offset
)
167 struct bdev_uring_io_channel
*uring_ch
= spdk_io_channel_get_ctx(ch
);
168 struct bdev_uring_group_channel
*group_ch
= uring_ch
->group_ch
;
169 struct io_uring_sqe
*sqe
;
171 sqe
= io_uring_get_sqe(&group_ch
->uring
);
172 io_uring_prep_writev(sqe
, uring
->fd
, iov
, iovcnt
, offset
);
173 io_uring_sqe_set_data(sqe
, uring_task
);
174 uring_task
->ch
= uring_ch
;
176 SPDK_DEBUGLOG(SPDK_LOG_URING
, "write %d iovs size %lu from off: %#lx\n",
177 iovcnt
, nbytes
, offset
);
179 group_ch
->io_pending
++;
184 bdev_uring_destruct(void *ctx
)
186 struct bdev_uring
*uring
= ctx
;
189 TAILQ_REMOVE(&g_uring_bdev_head
, uring
, link
);
190 rc
= bdev_uring_close(uring
);
192 SPDK_ERRLOG("bdev_uring_close() failed\n");
194 spdk_io_device_unregister(uring
, NULL
);
195 uring_free_bdev(uring
);
/*
 * Reap completions from `ring`, iterating at most `max` times.
 * Each iteration peeks a CQE, recovers the bdev_uring_task stored in
 * cqe->user_data, and reports SUCCESS only when cqe->res equals the length
 * recorded in the task (FAILED otherwise). It then decrements the group
 * channel's io_inflight counter, marks the CQE as seen and completes the
 * corresponding bdev I/O.
 * NOTE(review): the handling of the io_uring_peek_cqe() result and the value
 * returned to the caller sit on lines that are not visible in this
 * extraction - confirm against the full source.
 */
200 bdev_uring_reap(struct io_uring
*ring
, int max
)
203 struct io_uring_cqe
*cqe
;
204 struct bdev_uring_task
*uring_task
;
205 enum spdk_bdev_io_status status
;
208 for (i
= 0; i
< max
; i
++) {
209 ret
= io_uring_peek_cqe(ring
, &cqe
);
218 uring_task
= (struct bdev_uring_task
*)cqe
->user_data
;
219 if (cqe
->res
!= (signed)uring_task
->len
) {
220 status
= SPDK_BDEV_IO_STATUS_FAILED
;
222 status
= SPDK_BDEV_IO_STATUS_SUCCESS
;
225 uring_task
->ch
->group_ch
->io_inflight
--;
226 io_uring_cqe_seen(ring
, cqe
);
227 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task
), status
);
/*
 * Poller callback of a group channel; `arg` is the
 * struct bdev_uring_group_channel.
 * It snapshots io_pending and io_inflight, then either submits the queued
 * SQEs with io_uring_submit() (resetting io_pending and adding the snapshot
 * to io_inflight) or, when only in-flight I/O exists, calls io_uring_enter()
 * with IORING_ENTER_GETEVENTS. If I/O was in flight at the start of the poll,
 * completions are reaped with bdev_uring_reap(). Returns count + to_submit.
 * NOTE(review): the condition guarding the submit branch and the handling of
 * `ret` sit on lines that are not visible in this extraction - confirm.
 */
235 bdev_uring_group_poll(void *arg
)
237 struct bdev_uring_group_channel
*group_ch
= arg
;
238 int to_complete
, to_submit
;
241 to_submit
= group_ch
->io_pending
;
242 to_complete
= group_ch
->io_inflight
;
246 /* If there are I/O to submit, use io_uring_submit here.
247 * It will automatically call io_uring_enter appropriately. */
248 ret
= io_uring_submit(&group_ch
->uring
);
249 group_ch
->io_pending
= 0;
250 group_ch
->io_inflight
+= to_submit
;
251 } else if (to_complete
> 0) {
252 /* If there are I/O in flight but none to submit, we need to
253 * call io_uring_enter ourselves. */
254 ret
= io_uring_enter(group_ch
->uring
.ring_fd
, 0, 0,
255 IORING_ENTER_GETEVENTS
, NULL
);
263 if (to_complete
> 0) {
264 count
= bdev_uring_reap(&group_ch
->uring
, to_complete
);
267 return (count
+ to_submit
);
270 static void bdev_uring_get_buf_cb(struct spdk_io_channel
*ch
, struct spdk_bdev_io
*bdev_io
,
274 spdk_bdev_io_complete(bdev_io
, SPDK_BDEV_IO_STATUS_FAILED
);
278 switch (bdev_io
->type
) {
279 case SPDK_BDEV_IO_TYPE_READ
:
280 bdev_uring_readv((struct bdev_uring
*)bdev_io
->bdev
->ctxt
,
282 (struct bdev_uring_task
*)bdev_io
->driver_ctx
,
283 bdev_io
->u
.bdev
.iovs
,
284 bdev_io
->u
.bdev
.iovcnt
,
285 bdev_io
->u
.bdev
.num_blocks
* bdev_io
->bdev
->blocklen
,
286 bdev_io
->u
.bdev
.offset_blocks
* bdev_io
->bdev
->blocklen
);
288 case SPDK_BDEV_IO_TYPE_WRITE
:
289 bdev_uring_writev((struct bdev_uring
*)bdev_io
->bdev
->ctxt
,
291 (struct bdev_uring_task
*)bdev_io
->driver_ctx
,
292 bdev_io
->u
.bdev
.iovs
,
293 bdev_io
->u
.bdev
.iovcnt
,
294 bdev_io
->u
.bdev
.num_blocks
* bdev_io
->bdev
->blocklen
,
295 bdev_io
->u
.bdev
.offset_blocks
* bdev_io
->bdev
->blocklen
);
298 SPDK_ERRLOG("Wrong io type\n");
303 static int _bdev_uring_submit_request(struct spdk_io_channel
*ch
, struct spdk_bdev_io
*bdev_io
)
305 switch (bdev_io
->type
) {
306 /* Read and write operations must be performed on buffers aligned to
307 * bdev->required_alignment. If user specified unaligned buffers,
308 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
309 case SPDK_BDEV_IO_TYPE_READ
:
310 case SPDK_BDEV_IO_TYPE_WRITE
:
311 spdk_bdev_io_get_buf(bdev_io
, bdev_uring_get_buf_cb
,
312 bdev_io
->u
.bdev
.num_blocks
* bdev_io
->bdev
->blocklen
);
319 static void bdev_uring_submit_request(struct spdk_io_channel
*ch
, struct spdk_bdev_io
*bdev_io
)
321 if (_bdev_uring_submit_request(ch
, bdev_io
) < 0) {
322 spdk_bdev_io_complete(bdev_io
, SPDK_BDEV_IO_STATUS_FAILED
);
327 bdev_uring_io_type_supported(void *ctx
, enum spdk_bdev_io_type io_type
)
330 case SPDK_BDEV_IO_TYPE_READ
:
331 case SPDK_BDEV_IO_TYPE_WRITE
:
339 bdev_uring_create_cb(void *io_device
, void *ctx_buf
)
341 struct bdev_uring_io_channel
*ch
= ctx_buf
;
343 ch
->group_ch
= spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if
));
349 bdev_uring_destroy_cb(void *io_device
, void *ctx_buf
)
351 struct bdev_uring_io_channel
*ch
= ctx_buf
;
353 spdk_put_io_channel(spdk_io_channel_from_ctx(ch
->group_ch
));
/* get_io_channel hook: the struct bdev_uring passed as `ctx` is itself the
 * registered io_device, so it is handed straight to spdk_get_io_channel(). */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *uring_bdev = ctx;

	return spdk_get_io_channel(uring_bdev);
}
365 static const struct spdk_bdev_fn_table uring_fn_table
= {
366 .destruct
= bdev_uring_destruct
,
367 .submit_request
= bdev_uring_submit_request
,
368 .io_type_supported
= bdev_uring_io_type_supported
,
369 .get_io_channel
= bdev_uring_get_io_channel
,
372 static void uring_free_bdev(struct bdev_uring
*uring
)
377 free(uring
->filename
);
378 free(uring
->bdev
.name
);
383 bdev_uring_group_create_cb(void *io_device
, void *ctx_buf
)
385 struct bdev_uring_group_channel
*ch
= ctx_buf
;
387 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH
, &ch
->uring
, IORING_SETUP_IOPOLL
) < 0) {
388 SPDK_ERRLOG("uring I/O context setup failure\n");
392 ch
->poller
= spdk_poller_register(bdev_uring_group_poll
, ch
, 0);
397 bdev_uring_group_destroy_cb(void *io_device
, void *ctx_buf
)
399 struct bdev_uring_group_channel
*ch
= ctx_buf
;
401 close(ch
->uring
.ring_fd
);
402 io_uring_queue_exit(&ch
->uring
);
404 spdk_poller_unregister(&ch
->poller
);
408 create_uring_bdev(const char *name
, const char *filename
)
410 struct bdev_uring
*uring
;
415 uring
= calloc(1, sizeof(*uring
));
417 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
421 uring
->filename
= strdup(filename
);
422 if (!uring
->filename
) {
426 if (bdev_uring_open(uring
)) {
427 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename
, uring
->fd
, errno
);
431 bdev_size
= spdk_fd_get_size(uring
->fd
);
433 uring
->bdev
.name
= strdup(name
);
434 if (!uring
->bdev
.name
) {
437 uring
->bdev
.product_name
= "URING bdev";
438 uring
->bdev
.module
= &uring_if
;
440 uring
->bdev
.write_cache
= 1;
442 block_size
= spdk_fd_get_blocklen(uring
->fd
);
443 if (block_size
== 0) {
444 SPDK_ERRLOG("Block size could not be auto-detected\n");
448 if (block_size
< 512) {
449 SPDK_ERRLOG("Invalid block size %" PRIu32
" (must be at least 512).\n", block_size
);
453 if (!spdk_u32_is_pow2(block_size
)) {
454 SPDK_ERRLOG("Invalid block size %" PRIu32
" (must be a power of 2.)\n", block_size
);
458 uring
->bdev
.blocklen
= block_size
;
459 uring
->bdev
.required_alignment
= spdk_u32log2(block_size
);
461 if (bdev_size
% uring
->bdev
.blocklen
!= 0) {
462 SPDK_ERRLOG("Disk size %" PRIu64
" is not a multiple of block size %" PRIu32
"\n",
463 bdev_size
, uring
->bdev
.blocklen
);
467 uring
->bdev
.blockcnt
= bdev_size
/ uring
->bdev
.blocklen
;
468 uring
->bdev
.ctxt
= uring
;
470 uring
->bdev
.fn_table
= &uring_fn_table
;
472 spdk_io_device_register(uring
, bdev_uring_create_cb
, bdev_uring_destroy_cb
,
473 sizeof(struct bdev_uring_io_channel
),
475 rc
= spdk_bdev_register(&uring
->bdev
);
477 spdk_io_device_unregister(uring
, NULL
);
481 TAILQ_INSERT_TAIL(&g_uring_bdev_head
, uring
, link
);
485 bdev_uring_close(uring
);
486 uring_free_bdev(uring
);
490 struct delete_uring_bdev_ctx
{
491 spdk_delete_uring_complete cb_fn
;
496 uring_bdev_unregister_cb(void *arg
, int bdeverrno
)
498 struct delete_uring_bdev_ctx
*ctx
= arg
;
500 ctx
->cb_fn(ctx
->cb_arg
, bdeverrno
);
505 delete_uring_bdev(struct spdk_bdev
*bdev
, spdk_delete_uring_complete cb_fn
, void *cb_arg
)
507 struct delete_uring_bdev_ctx
*ctx
;
509 if (!bdev
|| bdev
->module
!= &uring_if
) {
510 cb_fn(cb_arg
, -ENODEV
);
514 ctx
= calloc(1, sizeof(*ctx
));
516 cb_fn(cb_arg
, -ENOMEM
);
521 ctx
->cb_arg
= cb_arg
;
522 spdk_bdev_unregister(bdev
, uring_bdev_unregister_cb
, ctx
);
/*
 * Module init hook (installed as .module_init in uring_if).
 * From the lines visible here: it initializes the global bdev list, registers
 * &uring_if as an io_device with the group channel create/destroy callbacks,
 * looks up the "URING" configuration section and, for index i, reads value 0
 * of a "URING" entry as the backing file and value 1 as the bdev name before
 * calling create_uring_bdev(name, file). A missing name and a failed creation
 * are both logged.
 * NOTE(review): loop control, early exits, the io_device name string and the
 * return value sit on lines that are not visible in this extraction -
 * confirm against the full source.
 */
526 bdev_uring_init(void)
529 struct spdk_conf_section
*sp
;
530 struct spdk_bdev
*bdev
;
532 TAILQ_INIT(&g_uring_bdev_head
);
533 spdk_io_device_register(&uring_if
, bdev_uring_group_create_cb
, bdev_uring_group_destroy_cb
,
534 sizeof(struct bdev_uring_group_channel
),
537 sp
= spdk_conf_find_section(NULL
, "URING");
547 file
= spdk_conf_section_get_nmval(sp
, "URING", i
, 0);
552 name
= spdk_conf_section_get_nmval(sp
, "URING", i
, 1);
554 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file
);
559 bdev
= create_uring_bdev(name
, file
);
561 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file
);
573 bdev_uring_fini(void)
575 spdk_io_device_unregister(&uring_if
, NULL
);
578 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING
)