* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
-#include "qemu-common.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
+#include "qemu/defer-call.h"
#include "qapi/error.h"
+#include "sysemu/block-backend.h"
+
+/* Only used for assertions. */
+#include "qemu/coroutine_int.h"
#include <libaio.h>
* than this we will get EAGAIN from io_submit which is communicated to
* the guest as an I/O error.
*/
-#define MAX_EVENTS 128
+#define MAX_EVENTS 1024
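+
+/*
+ * MAX_EVENTS is also the capacity passed to io_setup() in laio_init(), so it
+ * bounds the number of simultaneously in-flight requests.
+ */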
+
+/* Default maximum number of requests in a batch; see laio_max_batch() */
+#define DEFAULT_MAX_BATCH 32
struct qemu_laiocb {
- BlockAIOCB common;
Coroutine *co;
LinuxAioState *ctx;
struct iocb iocb;
};
typedef struct {
- int plugged;
unsigned int in_queue;
unsigned int in_flight;
bool blocked;
io_context_t ctx;
EventNotifier e;
- /* io queue for submit at batch. Protected by AioContext lock. */
+ /* No locking required; only accessed from the AioContext home thread */
LaioQueue io_q;
-
- /* I/O completion processing. Only runs in I/O thread. */
QEMUBH *completion_bh;
int event_idx;
int event_max;
}
/*
- * Completes an AIO request (calls the callback and frees the ACB).
+ * Completes an AIO request.
*/
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
}
laiocb->ret = ret;
- if (laiocb->co) {
- /* If the coroutine is already entered it must be in ioq_submit() and
- * will notice laio->ret has been filled in when it eventually runs
- * later. Coroutines cannot be entered recursively so avoid doing
- * that!
- */
- if (!qemu_coroutine_entered(laiocb->co)) {
- aio_co_wake(laiocb->co);
- }
- } else {
- laiocb->common.cb(laiocb->common.opaque, ret);
- qemu_aio_unref(laiocb);
+
+ /*
+ * If the coroutine has already been entered it must be in ioq_submit()
+ * and will notice laiocb->ret has been filled in when it eventually runs
+ * later. Coroutines cannot be entered recursively so avoid doing
+ * that!
+ */
+ assert(laiocb->co->ctx == laiocb->ctx->aio_context);
+ if (!qemu_coroutine_entered(laiocb->co)) {
+ aio_co_wake(laiocb->co);
}
}
unsigned incompat_features;
unsigned header_length; /* size of aio_ring */
- struct io_event io_events[0];
+ struct io_event io_events[];
};
/**
{
struct io_event *events;
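+
+ /*
+ * Wrap completion processing in defer_call_begin()/defer_call_end() so
+ * that submissions triggered by the coroutines woken below are deferred
+ * (see laio_deferred_fn()) and flushed together when the outermost
+ * section ends.
+ */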
+ defer_call_begin();
+
/* Reschedule so nested event loops see currently pending completions */
qemu_bh_schedule(s->completion_bh);
/* If we are nested we have to notify the level above that we are done
- * by setting event_max to zero, upper level will then jump out of it's
- * own `for` loop. If we are the last all counters droped to zero. */
+ * by setting event_max to zero, the upper level will then jump out of its
+ * own `for` loop. If we are the last, all counters drop to zero. */
s->event_max = 0;
s->event_idx = 0;
+
+ defer_call_end();
}
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
qemu_laio_process_completions(s);
- aio_context_acquire(s->aio_context);
- if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
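+ /* No locking: this always runs in the AioContext home thread */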
+ if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
ioq_submit(s);
}
- aio_context_release(s->aio_context);
}
static void qemu_laio_completion_bh(void *opaque)
LinuxAioState *s = container_of(e, LinuxAioState, e);
struct io_event *events;
- if (!io_getevents_peek(s->ctx, &events)) {
- return false;
- }
-
- qemu_laio_process_completions_and_submit(s);
- return true;
+ return io_getevents_peek(s->ctx, &events);
}
-static void laio_cancel(BlockAIOCB *blockacb)
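+/*
+ * Ready handler run by the event loop when qemu_laio_poll_cb() has returned
+ * true: process completions directly, without a read of the event notifier.
+ */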
+static void qemu_laio_poll_ready(EventNotifier *opaque)
{
- struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
- struct io_event event;
- int ret;
-
- if (laiocb->ret != -EINPROGRESS) {
- return;
- }
- ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
- laiocb->ret = -ECANCELED;
- if (ret != 0) {
- /* iocb is not cancelled, cb will be called by the event loop later */
- return;
- }
+ EventNotifier *e = opaque;
+ LinuxAioState *s = container_of(e, LinuxAioState, e);
- laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
+ qemu_laio_process_completions_and_submit(s);
}
-static const AIOCBInfo laio_aiocb_info = {
- .aiocb_size = sizeof(struct qemu_laiocb),
- .cancel_async = laio_cancel,
-};
-
static void ioq_init(LaioQueue *io_q)
{
QSIMPLEQ_INIT(&io_q->pending);
- io_q->plugged = 0;
io_q->in_queue = 0;
io_q->in_flight = 0;
io_q->blocked = false;
}
}
-void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
+static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
- s->io_q.plugged++;
+ uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
+
+ /*
+ * An AioContext can be shared between multiple block devices, so
+ * `dev_max_batch` allows reducing the batch size for latency-sensitive
+ * devices.
+ */
+ max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);
+
+ /* limit the batch to the number of free event slots */
+ max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
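+
+ /*
+ * Example: with the AioContext aio_max_batch parameter unset (0) and
+ * dev_max_batch == 8, max_batch becomes MIN_NON_ZERO(8, DEFAULT_MAX_BATCH)
+ * == 8, then is clamped to the free event slots (MAX_EVENTS - in_flight).
+ */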
+
+ return max_batch;
}
-void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
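+/*
+ * Deferred-call callback registered via defer_call() in laio_do_submit():
+ * runs immediately when no defer_call_begin()/defer_call_end() section is
+ * active, otherwise from the outermost defer_call_end().
+ */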
+static void laio_deferred_fn(void *opaque)
{
- assert(s->io_q.plugged);
- if (--s->io_q.plugged == 0 &&
- !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+ LinuxAioState *s = opaque;
+
+ if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
ioq_submit(s);
}
}
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
- int type)
+ int type, uint64_t dev_max_batch)
{
LinuxAioState *s = laiocb->ctx;
struct iocb *iocbs = &laiocb->iocb;
switch (type) {
case QEMU_AIO_WRITE:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
- /* Currently Linux kernel does not support other operations */
+ /* Currently the Linux kernel does not support other operations */
default:
fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
s->io_q.in_queue++;
- if (!s->io_q.blocked &&
- (!s->io_q.plugged ||
- s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
- ioq_submit(s);
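+ /*
+ * Submit now if a full batch has accumulated; otherwise defer so that
+ * requests queued before the outermost defer_call_end() are submitted
+ * in a single io_submit() call.
+ */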
+ if (!s->io_q.blocked) {
+ if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+ ioq_submit(s);
+ } else {
+ defer_call(laio_deferred_fn, s);
+ }
}
return 0;
}
-int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
- uint64_t offset, QEMUIOVector *qiov, int type)
+int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
+ int type, uint64_t dev_max_batch)
{
int ret;
+ AioContext *ctx = qemu_get_current_aio_context();
struct qemu_laiocb laiocb = {
.co = qemu_coroutine_self(),
.nbytes = qiov->size,
- .ctx = s,
+ .ctx = aio_get_linux_aio(ctx),
.ret = -EINPROGRESS,
.is_read = (type == QEMU_AIO_READ),
.qiov = qiov,
};
- ret = laio_do_submit(fd, &laiocb, offset, type);
+ ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
if (ret < 0) {
return ret;
}
return laiocb.ret;
}
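+
+/*
+ * A typical caller (sketch only; see block/file-posix.c for real callers)
+ * runs in a coroutine in the current AioContext:
+ *
+ *     ret = laio_co_submit(s->fd, offset, qiov, QEMU_AIO_READ, 0);
+ *
+ * The calling coroutine yields until qemu_laio_process_completion() fills
+ * in laiocb.ret and wakes it with aio_co_wake().
+ */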
-BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
- int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque, int type)
-{
- struct qemu_laiocb *laiocb;
- off_t offset = sector_num * BDRV_SECTOR_SIZE;
- int ret;
-
- laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
- laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
- laiocb->ctx = s;
- laiocb->ret = -EINPROGRESS;
- laiocb->is_read = (type == QEMU_AIO_READ);
- laiocb->qiov = qiov;
-
- ret = laio_do_submit(fd, laiocb, offset, type);
- if (ret < 0) {
- qemu_aio_unref(laiocb);
- return NULL;
- }
-
- return &laiocb->common;
-}
-
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
- aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
+ aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
qemu_bh_delete(s->completion_bh);
s->aio_context = NULL;
}
{
s->aio_context = new_context;
s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
- aio_set_event_notifier(new_context, &s->e, false,
+ aio_set_event_notifier(new_context, &s->e,
qemu_laio_completion_cb,
- qemu_laio_poll_cb);
+ qemu_laio_poll_cb,
+ qemu_laio_poll_ready);
}
LinuxAioState *laio_init(Error **errp)
s = g_malloc0(sizeof(*s));
rc = event_notifier_init(&s->e, false);
if (rc < 0) {
- error_setg_errno(errp, -rc, "failed to to initialize event notifier");
+ error_setg_errno(errp, -rc, "failed to initialize event notifier");
goto out_free_state;
}