]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/lib/bdev/uring/bdev_uring.c
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / spdk / lib / bdev / uring / bdev_uring.c
CommitLineData
9f95a23c
TL
1/*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include "bdev_uring.h"
35
36#include "spdk/stdinc.h"
37
38#include "spdk/barrier.h"
39#include "spdk/bdev.h"
40#include "spdk/conf.h"
41#include "spdk/env.h"
42#include "spdk/fd.h"
43#include "spdk/likely.h"
44#include "spdk/thread.h"
45#include "spdk/json.h"
46#include "spdk/util.h"
47#include "spdk/string.h"
48
49#include "spdk_internal/log.h"
50
51#include <liburing.h>
52
/*
 * Channel context for a uring bdev io_device. It only records the group
 * channel (the one registered against uring_if) whose ring carries the
 * I/O issued through this channel.
 */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel		*group_ch;
};
56
57struct bdev_uring_group_channel {
58 uint64_t io_inflight;
59 uint64_t io_pending;
60 struct spdk_poller *poller;
61 struct io_uring uring;
62};
63
/*
 * Per-I/O driver context (its size is reported through get_ctx_size).
 * A pointer to it is attached to the SQE as user data and recovered from
 * the matching CQE.
 */
struct bdev_uring_task {
	/* Byte count the completion result is compared against. */
	uint64_t			len;
	/* Channel the request was issued on. */
	struct bdev_uring_io_channel	*ch;
	/* List linkage; not referenced by the code visible in this file. */
	TAILQ_ENTRY(bdev_uring_task)	link;
};
69
70struct bdev_uring {
71 struct spdk_bdev bdev;
72 char *filename;
73 int fd;
74 TAILQ_ENTRY(bdev_uring) link;
75};
76
/* Module hooks and helpers that are defined further down in this file. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);

/* Every uring bdev that has been registered and not yet destructed. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head;

/* Number of entries requested for each group channel's ring. */
#define SPDK_URING_QUEUE_DEPTH 512
/* Not referenced by the code visible in this file. */
#define MAX_EVENTS_PER_POLL 32
84
85static int
86bdev_uring_get_ctx_size(void)
87{
88 return sizeof(struct bdev_uring_task);
89}
90
91static struct spdk_bdev_module uring_if = {
92 .name = "uring",
93 .module_init = bdev_uring_init,
94 .module_fini = bdev_uring_fini,
95 .config_text = NULL,
96 .get_ctx_size = bdev_uring_get_ctx_size,
97};
98
99SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
100
101static int
102bdev_uring_open(struct bdev_uring *bdev)
103{
104 int fd;
105
106 fd = open(bdev->filename, O_NOATIME | O_DIRECT);
107 if (fd < 0) {
108 SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
109 bdev->filename, errno, spdk_strerror(errno));
110 bdev->fd = -1;
111 return -1;
112 }
113
114 bdev->fd = fd;
115
116 return 0;
117}
118
119static int
120bdev_uring_close(struct bdev_uring *bdev)
121{
122 int rc;
123
124 if (bdev->fd == -1) {
125 return 0;
126 }
127
128 rc = close(bdev->fd);
129 if (rc < 0) {
130 SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
131 bdev->fd, errno, spdk_strerror(errno));
132 return -1;
133 }
134
135 bdev->fd = -1;
136
137 return 0;
138}
139
140static int64_t
141bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
142 struct bdev_uring_task *uring_task,
143 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
144{
145 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
146 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
147 struct io_uring_sqe *sqe;
148
149 sqe = io_uring_get_sqe(&group_ch->uring);
150 io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
151 io_uring_sqe_set_data(sqe, uring_task);
152 uring_task->len = nbytes;
153 uring_task->ch = uring_ch;
154
155 SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n",
156 iovcnt, nbytes, offset);
157
158 group_ch->io_pending++;
159 return nbytes;
160}
161
162static int64_t
163bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
164 struct bdev_uring_task *uring_task,
165 struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
166{
167 struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
168 struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
169 struct io_uring_sqe *sqe;
170
171 sqe = io_uring_get_sqe(&group_ch->uring);
172 io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
173 io_uring_sqe_set_data(sqe, uring_task);
174 uring_task->ch = uring_ch;
175
176 SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n",
177 iovcnt, nbytes, offset);
178
179 group_ch->io_pending++;
180 return nbytes;
181}
182
183static int
184bdev_uring_destruct(void *ctx)
185{
186 struct bdev_uring *uring = ctx;
187 int rc = 0;
188
189 TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
190 rc = bdev_uring_close(uring);
191 if (rc < 0) {
192 SPDK_ERRLOG("bdev_uring_close() failed\n");
193 }
194 spdk_io_device_unregister(uring, NULL);
195 uring_free_bdev(uring);
196 return rc;
197}
198
199static int
200bdev_uring_reap(struct io_uring *ring, int max)
201{
202 int i, count, ret;
203 struct io_uring_cqe *cqe;
204 struct bdev_uring_task *uring_task;
205 enum spdk_bdev_io_status status;
206
207 count = 0;
208 for (i = 0; i < max; i++) {
209 ret = io_uring_peek_cqe(ring, &cqe);
210 if (ret != 0) {
211 return ret;
212 }
213
214 if (cqe == NULL) {
215 return count;
216 }
217
218 uring_task = (struct bdev_uring_task *)cqe->user_data;
219 if (cqe->res != (signed)uring_task->len) {
220 status = SPDK_BDEV_IO_STATUS_FAILED;
221 } else {
222 status = SPDK_BDEV_IO_STATUS_SUCCESS;
223 }
224
225 uring_task->ch->group_ch->io_inflight--;
226 io_uring_cqe_seen(ring, cqe);
227 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
228 count++;
229 }
230
231 return count;
232}
233
234static int
235bdev_uring_group_poll(void *arg)
236{
237 struct bdev_uring_group_channel *group_ch = arg;
238 int to_complete, to_submit;
239 int count, ret;
240
241 to_submit = group_ch->io_pending;
242 to_complete = group_ch->io_inflight;
243
244 ret = 0;
245 if (to_submit > 0) {
246 /* If there are I/O to submit, use io_uring_submit here.
247 * It will automatically call io_uring_enter appropriately. */
248 ret = io_uring_submit(&group_ch->uring);
249 group_ch->io_pending = 0;
250 group_ch->io_inflight += to_submit;
251 } else if (to_complete > 0) {
252 /* If there are I/O in flight but none to submit, we need to
253 * call io_uring_enter ourselves. */
254 ret = io_uring_enter(group_ch->uring.ring_fd, 0, 0,
255 IORING_ENTER_GETEVENTS, NULL);
256 }
257
258 if (ret < 0) {
259 return 1;
260 }
261
262 count = 0;
263 if (to_complete > 0) {
264 count = bdev_uring_reap(&group_ch->uring, to_complete);
265 }
266
267 return (count + to_submit);
268}
269
270static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
271 bool success)
272{
273 if (!success) {
274 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
275 return;
276 }
277
278 switch (bdev_io->type) {
279 case SPDK_BDEV_IO_TYPE_READ:
280 bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
281 ch,
282 (struct bdev_uring_task *)bdev_io->driver_ctx,
283 bdev_io->u.bdev.iovs,
284 bdev_io->u.bdev.iovcnt,
285 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
286 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
287 break;
288 case SPDK_BDEV_IO_TYPE_WRITE:
289 bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
290 ch,
291 (struct bdev_uring_task *)bdev_io->driver_ctx,
292 bdev_io->u.bdev.iovs,
293 bdev_io->u.bdev.iovcnt,
294 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
295 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
296 break;
297 default:
298 SPDK_ERRLOG("Wrong io type\n");
299 break;
300 }
301}
302
303static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
304{
305 switch (bdev_io->type) {
306 /* Read and write operations must be performed on buffers aligned to
307 * bdev->required_alignment. If user specified unaligned buffers,
308 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
309 case SPDK_BDEV_IO_TYPE_READ:
310 case SPDK_BDEV_IO_TYPE_WRITE:
311 spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
312 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
313 return 0;
314 default:
315 return -1;
316 }
317}
318
319static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
320{
321 if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
322 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
323 }
324}
325
326static bool
327bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
328{
329 switch (io_type) {
330 case SPDK_BDEV_IO_TYPE_READ:
331 case SPDK_BDEV_IO_TYPE_WRITE:
332 return true;
333 default:
334 return false;
335 }
336}
337
338static int
339bdev_uring_create_cb(void *io_device, void *ctx_buf)
340{
341 struct bdev_uring_io_channel *ch = ctx_buf;
342
343 ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
344
345 return 0;
346}
347
348static void
349bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
350{
351 struct bdev_uring_io_channel *ch = ctx_buf;
352
353 spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
354}
355
/*
 * fn_table hook: return an I/O channel for the bdev. The struct
 * bdev_uring itself is the io_device key, so ctx is passed straight on.
 */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *uring_bdev = ctx;
	struct spdk_io_channel *io_ch = spdk_get_io_channel(uring_bdev);

	return io_ch;
}
363
364
365static const struct spdk_bdev_fn_table uring_fn_table = {
366 .destruct = bdev_uring_destruct,
367 .submit_request = bdev_uring_submit_request,
368 .io_type_supported = bdev_uring_io_type_supported,
369 .get_io_channel = bdev_uring_get_io_channel,
370};
371
372static void uring_free_bdev(struct bdev_uring *uring)
373{
374 if (uring == NULL) {
375 return;
376 }
377 free(uring->filename);
378 free(uring->bdev.name);
379 free(uring);
380}
381
382static int
383bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
384{
385 struct bdev_uring_group_channel *ch = ctx_buf;
386
387 if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) {
388 SPDK_ERRLOG("uring I/O context setup failure\n");
389 return -1;
390 }
391
392 ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0);
393 return 0;
394}
395
396static void
397bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
398{
399 struct bdev_uring_group_channel *ch = ctx_buf;
400
401 close(ch->uring.ring_fd);
402 io_uring_queue_exit(&ch->uring);
403
404 spdk_poller_unregister(&ch->poller);
405}
406
407struct spdk_bdev *
408create_uring_bdev(const char *name, const char *filename)
409{
410 struct bdev_uring *uring;
411 uint32_t block_size;
412 uint64_t bdev_size;
413 int rc;
414
415 uring = calloc(1, sizeof(*uring));
416 if (!uring) {
417 SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
418 return NULL;
419 }
420
421 uring->filename = strdup(filename);
422 if (!uring->filename) {
423 goto error_return;
424 }
425
426 if (bdev_uring_open(uring)) {
427 SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
428 goto error_return;
429 }
430
431 bdev_size = spdk_fd_get_size(uring->fd);
432
433 uring->bdev.name = strdup(name);
434 if (!uring->bdev.name) {
435 goto error_return;
436 }
437 uring->bdev.product_name = "URING bdev";
438 uring->bdev.module = &uring_if;
439
440 uring->bdev.write_cache = 1;
441
442 block_size = spdk_fd_get_blocklen(uring->fd);
443 if (block_size == 0) {
444 SPDK_ERRLOG("Block size could not be auto-detected\n");
445 goto error_return;
446 }
447
448 if (block_size < 512) {
449 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
450 goto error_return;
451 }
452
453 if (!spdk_u32_is_pow2(block_size)) {
454 SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
455 goto error_return;
456 }
457
458 uring->bdev.blocklen = block_size;
459 uring->bdev.required_alignment = spdk_u32log2(block_size);
460
461 if (bdev_size % uring->bdev.blocklen != 0) {
462 SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
463 bdev_size, uring->bdev.blocklen);
464 goto error_return;
465 }
466
467 uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
468 uring->bdev.ctxt = uring;
469
470 uring->bdev.fn_table = &uring_fn_table;
471
472 spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
473 sizeof(struct bdev_uring_io_channel),
474 uring->bdev.name);
475 rc = spdk_bdev_register(&uring->bdev);
476 if (rc) {
477 spdk_io_device_unregister(uring, NULL);
478 goto error_return;
479 }
480
481 TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
482 return &uring->bdev;
483
484error_return:
485 bdev_uring_close(uring);
486 uring_free_bdev(uring);
487 return NULL;
488}
489
490struct delete_uring_bdev_ctx {
491 spdk_delete_uring_complete cb_fn;
492 void *cb_arg;
493};
494
495static void
496uring_bdev_unregister_cb(void *arg, int bdeverrno)
497{
498 struct delete_uring_bdev_ctx *ctx = arg;
499
500 ctx->cb_fn(ctx->cb_arg, bdeverrno);
501 free(ctx);
502}
503
504void
505delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg)
506{
507 struct delete_uring_bdev_ctx *ctx;
508
509 if (!bdev || bdev->module != &uring_if) {
510 cb_fn(cb_arg, -ENODEV);
511 return;
512 }
513
514 ctx = calloc(1, sizeof(*ctx));
515 if (ctx == NULL) {
516 cb_fn(cb_arg, -ENOMEM);
517 return;
518 }
519
520 ctx->cb_fn = cb_fn;
521 ctx->cb_arg = cb_arg;
522 spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx);
523}
524
525static int
526bdev_uring_init(void)
527{
528 size_t i;
529 struct spdk_conf_section *sp;
530 struct spdk_bdev *bdev;
531
532 TAILQ_INIT(&g_uring_bdev_head);
533 spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
534 sizeof(struct bdev_uring_group_channel),
535 "uring_module");
536
537 sp = spdk_conf_find_section(NULL, "URING");
538 if (!sp) {
539 return 0;
540 }
541
542 i = 0;
543 while (true) {
544 const char *file;
545 const char *name;
546
547 file = spdk_conf_section_get_nmval(sp, "URING", i, 0);
548 if (!file) {
549 break;
550 }
551
552 name = spdk_conf_section_get_nmval(sp, "URING", i, 1);
553 if (!name) {
554 SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file);
555 i++;
556 continue;
557 }
558
559 bdev = create_uring_bdev(name, file);
560 if (!bdev) {
561 SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file);
562 i++;
563 continue;
564 }
565
566 i++;
567 }
568
569 return 0;
570}
571
572static void
573bdev_uring_fini(void)
574{
575 spdk_io_device_unregister(&uring_if, NULL);
576}
577
578SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING)