/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h"
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h"

/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    size_t mem_region_alignment;

    /* Can we skip adding/deleting blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
    bool may_pin_mem_regions;
} BDRVBlkioState;

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }
    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }
    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

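/*
 * io_poll callback registered via aio_set_fd_handler(): returns true when a
 * completion is already available so the event loop can skip waiting for the
 * completion fd to become readable.
 */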
static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
static void blkio_submit_io(BlockDriverState *bs)
{
    if (qatomic_read(&bs->io_plugged) == 0) {
        BDRVBlkioState *s = bs->opaque;

        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

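/*
 * The request coroutines below all follow the same pattern: enqueue the
 * request with a BlkioCoData holding the current coroutine as user_data,
 * submit, and yield. The completion fd handler fills in the return value and
 * wakes the coroutine.
 */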
static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
        int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    qemu_coroutine_yield();
    return cod.ret;
}

static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_submit_io(bs);
    }
}

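/*
 * Result of translating a host buffer into a struct blkio_mem_region.
 * BMRR_SKIP means the buffer cannot be registered (e.g. no fd is available)
 * and should simply be ignored rather than treated as an error.
 */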
typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so mr lives at least
         * until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    return 0;
}

static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
                               Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    return 0;
}

static int blkio_virtio_blk_common_open(BlockDriverState *bs,
        QDict *options, int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }
    return 0;
}

static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Notify if libblkio drivers pin memory and prevent features like
     * virtio-mem from working.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        return -ret;
    }

    return capacity;
}

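/*
 * Only no-op truncation is supported: the device cannot be grown, and an
 * exact resize to a different length is rejected.
 */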
static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

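/*
 * Template for the BlockDriver definitions below. All libblkio-backed drivers
 * share these callbacks; per-driver fields are passed via __VA_ARGS__.
 */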
#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name = name, \
        .protocol_name = name, \
        .instance_size = sizeof(BDRVBlkioState), \
        .bdrv_file_open = blkio_file_open, \
        .bdrv_close = blkio_close, \
        .bdrv_co_getlength = blkio_co_getlength, \
        .bdrv_co_truncate = blkio_truncate, \
        .bdrv_co_get_info = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard = blkio_co_pdiscard, \
        .bdrv_co_preadv = blkio_co_preadv, \
        .bdrv_co_pwritev = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \
        .bdrv_co_io_unplug = blkio_co_io_unplug, \
        .bdrv_refresh_limits = blkio_refresh_limits, \
        .bdrv_register_buf = blkio_register_buf, \
        .bdrv_unregister_buf = blkio_unregister_buf, \
        __VA_ARGS__ \
    }

static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);