[mirror_qemu.git] / block / linux-aio.c

/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qemu/defer-call.h"
#include "qapi/error.h"
#include "sysemu/block-backend.h"

/* Only used for assertions.  */
#include "qemu/coroutine_int.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  ioq_submit() never keeps more than this many
 *      requests in flight; additional requests (and requests for which
 *      io_submit returns EAGAIN) stay on the pending queue and are
 *      resubmitted later instead of being failed.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch. (default value) */
#define DEFAULT_MAX_BATCH 32

/*
 * Per-request state.  laio_co_submit() keeps one of these on the stack of
 * the submitting coroutine for the lifetime of the request.
 */
struct qemu_laiocb {
    /* Coroutine that issued the request; woken when the request completes */
    Coroutine *co;
    /* AIO state the request is queued on */
    LinuxAioState *ctx;
    /* Control block passed to io_submit() */
    struct iocb iocb;
    /* -EINPROGRESS until completion, then the final result */
    ssize_t ret;
    /* Expected transfer size (qiov->size at submission time) */
    size_t nbytes;
    /* I/O vector describing the data buffers */
    QEMUIOVector *qiov;
    /* True for QEMU_AIO_READ; short reads get zero-padded on completion */
    bool is_read;
    /* Link in LaioQueue.pending */
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

/* Submission queue bookkeeping for one LinuxAioState. */
typedef struct {
    /* Number of requests on @pending, i.e. not yet accepted by io_submit() */
    unsigned int in_queue;
    /* Number of requests accepted by io_submit() and not yet completed */
    unsigned int in_flight;
    /*
     * Set by ioq_submit() when requests are still queued after a submission
     * attempt; while set, laio_do_submit() only queues and does not submit.
     */
    bool blocked;
    /* Requests waiting to be passed to io_submit(), in arrival order */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

/* State of one Linux AIO context and its attachment to an AioContext. */
struct LinuxAioState {
    /* Set by laio_attach_aio_context(), NULL after laio_detach_aio_context() */
    AioContext *aio_context;

    /* Kernel AIO context created by io_setup() */
    io_context_t ctx;
    /* Notifier whose fd is attached to every iocb via io_set_eventfd() */
    EventNotifier e;

    /* No locking required, only accessed from AioContext home thread */
    LaioQueue io_q;
    /* Bottom half running qemu_laio_completion_bh() */
    QEMUBH *completion_bh;
    /* Index of the next event to handle in the batch being processed */
    int event_idx;
    /* Size of the batch being processed; 0 when no batch is active */
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

/*
 * Combines the two result words of a completion event into one signed
 * value: res2 supplies the upper 32 bits, res is OR-ed into the result.
 */
static inline ssize_t io_event_ret(struct io_event *ev)
{
    uint64_t upper = (uint64_t)ev->res2 << 32;

    return (ssize_t)(upper | ev->res);
}

/*
 * Completes an AIO request.
 *
 * Translates the raw result stored in laiocb->ret into the value reported
 * to the submitter and wakes the owning coroutine unless it is currently
 * running:
 *  - -ECANCELED and other negative values are passed through unchanged;
 *  - a transfer of exactly nbytes becomes 0;
 *  - a short read zero-fills the rest of the buffer and keeps the byte count;
 *  - a short write becomes -ENOSPC.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret = laiocb->ret;

    if (ret != -ECANCELED && ret == laiocb->nbytes) {
        ret = 0;
    } else if (ret >= 0) {
        if (!laiocb->is_read) {
            ret = -ENOSPC;
        } else {
            /* Short reads mean EOF, pad with zeros. */
            qemu_iovec_memset(laiocb->qiov, ret, 0,
                              laiocb->qiov->size - ret);
        }
    }

    laiocb->ret = ret;

    assert(laiocb->co->ctx == laiocb->ctx->aio_context);

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laiocb->ret has been filled in when it eventually runs
     * later.  Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    if (qemu_coroutine_entered(laiocb->co)) {
        return;
    }
    aio_co_wake(laiocb->co);
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; no common header exists, but AIO
 * has existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned    id;    /* kernel internal index number */
    unsigned    nr;    /* number of io_events */
    unsigned    head;  /* Written to by userland or by kernel. */
    unsigned    tail;

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * on events array.  This function does not update the internal
 * ring buffer, only reads head and tail.  When @events has been
 * processed io_getevents_commit() must be called.
 *
 * When tail is behind head (the ring has wrapped) only the contiguous run
 * from head to the end of the ring is reported; the remainder becomes
 * visible on the next peek after the reported events are committed.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of (*events)[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events on which head should be advanced
 *
 * Advances head of a ring buffer by @nr entries, wrapping at the ring size.
 * The ring is not written when @nr is zero.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring;

    if (nr == 0) {
        return;
    }

    ring = (struct aio_ring *)ctx;
    ring->head = (ring->head + nr) % ring->nr;
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 * @nr: the number of events on which head should be advanced
 *
 * Advances head of a ring buffer by @nr (see io_getevents_commit()) and
 * then returns the number of events available at the new head, storing a
 * pointer to them in @events (see io_getevents_peek()).
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    /* The commit must come first so the peek starts at the advanced head */
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * indices are kept in LinuxAioState.  Function schedules BH completion so it
 * can be called again in a nested event loop.  When there are no events left
 * to complete the BH is being canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    defer_call_begin();

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /*
     * Each pass commits the s->event_idx events handled so far and peeks at
     * the next contiguous batch; the loop ends when the peek reports none.
     */
    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero, upper level will then jump out of its
     * own `for` loop.  If we are the last all counters dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;

    defer_call_end();
}

/*
 * Handles all available completions, then submits whatever is still waiting
 * on the pending queue.
 */
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }
    ioq_submit(s);
}

/* Bottom-half handler; @opaque is the LinuxAioState registered with the BH. */
static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *state = opaque;

    qemu_laio_process_completions_and_submit(state);
}

/*
 * Event notifier handler: processes completions only if the notifier was
 * actually set (clearing it in the process).
 */
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (!event_notifier_test_and_clear(&s->e)) {
        return;
    }
    qemu_laio_process_completions_and_submit(s);
}

static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

/* Poll-ready handler: processes completions and submits queued requests. */
static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    LinuxAioState *s = container_of(opaque, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}

/* Resets @io_q to an empty, unblocked queue with nothing in flight. */
static void ioq_init(LaioQueue *io_q)
{
    io_q->blocked = false;
    io_q->in_flight = 0;
    io_q->in_queue = 0;
    QSIMPLEQ_INIT(&io_q->pending);
}

/*
 * Passes requests from the pending queue to the kernel.
 *
 * Requests are submitted in queue order, in batches sized so that at most
 * MAX_EVENTS requests are in flight.  Submission stops when the queue is
 * empty, the in-flight limit is reached, io_submit() returns -EAGAIN, or
 * the kernel accepts only part of a batch.  Any other io_submit() error is
 * attributed to the first request of the batch: that request is completed
 * with the error and submission continues with the remaining requests.
 *
 * On return io_q.blocked tells whether requests are still queued.
 */
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    /* Receives the submitted prefix split off the pending queue; unused */
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    /*
     * The emptiness test is the loop condition (rather than a do/while tail
     * condition combined with "ret == len") so that "continue" after a
     * failed request really retries the rest of the queue, and so that
     * io_submit() is never called with an empty batch.
     */
    while (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            /* aiocb must not be touched after this: its owner may resume */
            qemu_laio_process_completion(aiocb);
            continue;
        }
        if (ret == 0) {
            /* Nothing was accepted; do not index iocbs[-1] below */
            break;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);

        if (ret != len) {
            /* Partial submission: the kernel cannot take more right now */
            break;
        }
    }
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even we have completed everything (in_flight == 0), the queue can
         * have still pended requests (in_queue > 0).  We do not attempt to
         * repeat submission to avoid IO hang.  The reason is simple: s->e is
         * still set and completion callback will be called shortly and all
         * pended requests will be submitted from there.
         */
    }
}

/*
 * Returns the batch size at which laio_do_submit() submits immediately.
 *
 * The starting point is the AioContext's aio_max_batch, or DEFAULT_MAX_BATCH
 * when that is zero.  It is then lowered to @dev_max_batch (if non-zero) and
 * to the number of event slots still free (if non-zero).
 */
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t limit = s->aio_context->aio_max_batch;

    if (!limit) {
        limit = DEFAULT_MAX_BATCH;
    }

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    limit = MIN_NON_ZERO(dev_max_batch, limit);

    /* limit the batch with the number of available events */
    return MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, limit);
}

/*
 * Deferred-call handler registered by laio_do_submit(); @opaque is the
 * LinuxAioState.  Submits the pending queue unless it is blocked or empty.
 */
static void laio_deferred_fn(void *opaque)
{
    LinuxAioState *s = opaque;

    if (s->io_q.blocked || QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }
    ioq_submit(s);
}

/*
 * Prepares the iocb of @laiocb for a vectored read or write on @fd at
 * @offset and appends the request to the pending queue.
 *
 * Unless the queue is blocked, the queue is submitted right away once it
 * holds at least laio_max_batch() requests; otherwise submission is left to
 * a deferred call.
 *
 * Returns 0 on success, or -EIO (without queueing anything) when @type is
 * not a supported request type.
 */
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *cb = &laiocb->iocb;
    QEMUIOVector *vec = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_READ:
        io_prep_preadv(cb, fd, vec->iov, vec->niov, offset);
        break;
    /* Zone append is prepared exactly like a plain vectored write */
    case QEMU_AIO_WRITE:
    case QEMU_AIO_ZONE_APPEND:
        io_prep_pwritev(cb, fd, vec->iov, vec->niov, offset);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        return -EIO;
    }
    io_set_eventfd(cb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;

    if (s->io_q.blocked) {
        return 0;
    }

    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
        ioq_submit(s);
    } else {
        defer_call(laio_deferred_fn, s);
    }

    return 0;
}

/*
 * Submits one request through the Linux AIO state of the current AioContext
 * and yields the calling coroutine until the request has completed.
 *
 * Returns a negative errno if the request could not be queued, otherwise
 * the result stored by qemu_laio_process_completion().
 */
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                int type, uint64_t dev_max_batch)
{
    AioContext *ctx = qemu_get_current_aio_context();
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        .nbytes     = qiov->size,
        .ctx        = aio_get_linux_aio(ctx),
        .ret        = -EINPROGRESS,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };
    int rc = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);

    if (rc < 0) {
        return rc;
    }

    /* The request may already have completed during submission */
    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return laiocb.ret;
}

/*
 * Detaches @s from @old_context: unregisters the handlers of the event
 * notifier, deletes the completion bottom half and clears s->aio_context.
 */
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    /* NULL handlers remove the notifier from @old_context */
    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

/*
 * Attaches @s to @new_context: records the context, creates the completion
 * bottom half in it and registers the event notifier together with its
 * completion, poll and poll-ready handlers.
 */
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

/*
 * Allocates a LinuxAioState, initializes its event notifier and creates a
 * kernel AIO context with room for MAX_EVENTS events.
 *
 * Returns the new state, or NULL with @errp set if the event notifier or
 * the AIO context could not be created.
 */
LinuxAioState *laio_init(Error **errp)
{
    LinuxAioState *s = g_malloc0(sizeof(*s));
    int rc;

    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto fail_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto fail_cleanup_notifier;
    }

    ioq_init(&s->io_q);
    return s;

    /* Error unwinding, in reverse order of acquisition */
fail_cleanup_notifier:
    event_notifier_cleanup(&s->e);
fail_free_state:
    g_free(s);
    return NULL;
}

/*
 * Releases everything owned by @s: the event notifier, the kernel AIO
 * context and the state structure itself.  A failure to destroy the AIO
 * context is reported on stderr and otherwise ignored.
 */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        /* %p requires a void * argument, hence the explicit cast */
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, (void *)&s->ctx);
    }
    g_free(s);
}
Commit	Line	Data
5c6c3a6c CH	1	/*
	2	* Linux native AIO support.
	3	*
	4	* Copyright (C) 2009 IBM, Corp.
	5	* Copyright (C) 2009 Red Hat, Inc.
	6	*
	7	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	8	* See the COPYING file in the top-level directory.
	9	*/
80c71a24	10	#include "qemu/osdep.h"
737e150e	11	#include "block/aio.h"
1de7afc9	12	#include "qemu/queue.h"
2174f12b	13	#include "block/block.h"
9f8540ec	14	#include "block/raw-aio.h"
1de7afc9	15	#include "qemu/event_notifier.h"
2174f12b	16	#include "qemu/coroutine.h"
433fcea4	17	#include "qemu/defer-call.h"
ed6e2161	18	#include "qapi/error.h"
07668288	19	#include "sysemu/block-backend.h"
5c6c3a6c	20
ab50533b EGE	21	/* Only used for assertions. */
	22	#include "qemu/coroutine_int.h"
	23
5c6c3a6c CH	24	#include <libaio.h>
	25
	26	/*
	27	* Queue size (per-device).
	28	*
	29	* XXX: eventually we need to communicate this to the guest and/or make it
	30	* tunable by the guest. If we get more outstanding requests at a time
	31	* than this we will get EAGAIN from io_submit which is communicated to
	32	* the guest as an I/O error.
	33	*/
2558cb8d	34	#define MAX_EVENTS 1024
5c6c3a6c	35
d7ddd0a1 SG	36	/* Maximum number of requests in a batch. (default value) */
	37	#define DEFAULT_MAX_BATCH 32
	38
5c6c3a6c	39	struct qemu_laiocb {
2174f12b	40	Coroutine *co;
dd7f7ed1	41	LinuxAioState *ctx;
5c6c3a6c CH	42	struct iocb iocb;
	43	ssize_t ret;
	44	size_t nbytes;
b161e2e4 KW	45	QEMUIOVector *qiov;
b161e2e4 KW	46	bool is_read;
28b24087	47	QSIMPLEQ_ENTRY(qemu_laiocb) next;
5c6c3a6c CH	48	};
5c6c3a6c CH	49
1b3abdcc	50	typedef struct {
5e1b34a3 RP	51	unsigned int in_queue;
5e1b34a3 RP	52	unsigned int in_flight;
43f2376e	53	bool blocked;
28b24087	54	QSIMPLEQ_HEAD(, qemu_laiocb) pending;
1b3abdcc ML	55	} LaioQueue;
1b3abdcc ML	56
dd7f7ed1	57	struct LinuxAioState {
0187f5c9 PB	58	AioContext *aio_context;
0187f5c9 PB	59
5c6c3a6c	60	io_context_t ctx;
c90caf25	61	EventNotifier e;
1b3abdcc	62
ab50533b	63	/* No locking required, only accessed from AioContext home thread */
1b3abdcc	64	LaioQueue io_q;
2cdff7f6	65	QEMUBH *completion_bh;
2cdff7f6 SH	66	int event_idx;
2cdff7f6 SH	67	int event_max;
5c6c3a6c CH	68	};
5c6c3a6c CH	69
dd7f7ed1	70	static void ioq_submit(LinuxAioState *s);
28b24087	71
5c6c3a6c CH	72	static inline ssize_t io_event_ret(struct io_event *ev)
	73	{
	74	return (ssize_t)(((uint64_t)ev->res2 << 32) \| ev->res);
	75	}
	76
db0ffc24	77	/*
2b02fd81	78	* Completes an AIO request.
db0ffc24	79	*/
dd7f7ed1	80	static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
db0ffc24 KW	81	{
	82	int ret;
	83
db0ffc24 KW	84	ret = laiocb->ret;
db0ffc24 KW	85	if (ret != -ECANCELED) {
b161e2e4	86	if (ret == laiocb->nbytes) {
db0ffc24	87	ret = 0;
b161e2e4 KW	88	} else if (ret >= 0) {
	89	/* Short reads mean EOF, pad with zeros. */
	90	if (laiocb->is_read) {
3d9b4925 MT	91	qemu_iovec_memset(laiocb->qiov, ret, 0,
3d9b4925 MT	92	laiocb->qiov->size - ret);
b161e2e4	93	} else {
1c42f149	94	ret = -ENOSPC;
b161e2e4 KW	95	}
b161e2e4 KW	96	}
db0ffc24 KW	97	}
db0ffc24 KW	98
2174f12b	99	laiocb->ret = ret;
2b02fd81 JS	100
	101	/*
	102	* If the coroutine is already entered it must be in ioq_submit() and
	103	* will notice laio->ret has been filled in when it eventually runs
	104	* later. Coroutines cannot be entered recursively so avoid doing
	105	* that!
	106	*/
ab50533b	107	assert(laiocb->co->ctx == laiocb->ctx->aio_context);
2b02fd81 JS	108	if (!qemu_coroutine_entered(laiocb->co)) {
2b02fd81 JS	109	aio_co_wake(laiocb->co);
2174f12b	110	}
db0ffc24 KW	111	}
db0ffc24 KW	112
9e909a58 RP	113	/**
	114	* aio_ring buffer which is shared between userspace and kernel.
	115	*
	116	* This copied from linux/fs/aio.c, common header does not exist
	117	* but AIO exists for ages so we assume ABI is stable.
	118	*/
	119	struct aio_ring {
	120	unsigned id; /* kernel internal index number */
	121	unsigned nr; /* number of io_events */
	122	unsigned head; /* Written to by userland or by kernel. */
	123	unsigned tail;
	124
	125	unsigned magic;
	126	unsigned compat_features;
	127	unsigned incompat_features;
	128	unsigned header_length; /* size of aio_ring */
	129
f7795e40	130	struct io_event io_events[];
9e909a58 RP	131	};
	132
	133	/**
	134	* io_getevents_peek:
	135	* @ctx: AIO context
	136	* @events: pointer on events array, output value
	137
	138	* Returns the number of completed events and sets a pointer
	139	* on events array. This function does not update the internal
	140	* ring buffer, only reads head and tail. When @events has been
	141	* processed io_getevents_commit() must be called.
	142	*/
	143	static inline unsigned int io_getevents_peek(io_context_t ctx,
	144	struct io_event **events)
	145	{
	146	struct aio_ring ring = (struct aio_ring )ctx;
	147	unsigned int head = ring->head, tail = ring->tail;
	148	unsigned int nr;
	149
	150	nr = tail >= head ? tail - head : ring->nr - head;
	151	*events = ring->io_events + head;
	152	/* To avoid speculative loads of s->events[i] before observing tail.
	153	Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
	154	smp_rmb();
	155
	156	return nr;
	157	}
	158
	159	/**
	160	* io_getevents_commit:
	161	* @ctx: AIO context
	162	* @nr: the number of events on which head should be advanced
	163	*
	164	* Advances head of a ring buffer.
	165	*/
	166	static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
	167	{
	168	struct aio_ring ring = (struct aio_ring )ctx;
	169
	170	if (nr) {
	171	ring->head = (ring->head + nr) % ring->nr;
	172	}
	173	}
	174
	175	/**
	176	* io_getevents_advance_and_peek:
	177	* @ctx: AIO context
	178	* @events: pointer on events array, output value
	179	* @nr: the number of events on which head should be advanced
	180	*
	181	* Advances head of a ring buffer and returns number of elements left.
	182	*/
	183	static inline unsigned int
	184	io_getevents_advance_and_peek(io_context_t ctx,
	185	struct io_event **events,
	186	unsigned int nr)
	187	{
	188	io_getevents_commit(ctx, nr);
	189	return io_getevents_peek(ctx, events);
	190	}
	191
3407de57 RP	192	/**
	193	* qemu_laio_process_completions:
	194	* @s: AIO state
	195	*
	196	* Fetches completed I/O requests and invokes their callbacks.
2cdff7f6 SH	197	*
	198	* The function is somewhat tricky because it supports nested event loops, for
	199	* example when a request callback invokes aio_poll(). In order to do this,
3407de57 RP	200	* indices are kept in LinuxAioState. Function schedules BH completion so it
	201	* can be called again in a nested event loop. When there are no events left
	202	* to complete the BH is being canceled.
2cdff7f6	203	*/
3407de57	204	static void qemu_laio_process_completions(LinuxAioState *s)
5c6c3a6c	205	{
9e909a58	206	struct io_event *events;
5c6c3a6c	207
84d61e5f SH	208	defer_call_begin();
84d61e5f SH	209
2cdff7f6 SH	210	/* Reschedule so nested event loops see currently pending completions */
2cdff7f6 SH	211	qemu_bh_schedule(s->completion_bh);
5c6c3a6c	212
9e909a58 RP	213	while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
	214	s->event_idx))) {
	215	for (s->event_idx = 0; s->event_idx < s->event_max; ) {
	216	struct iocb *iocb = events[s->event_idx].obj;
	217	struct qemu_laiocb *laiocb =
2cdff7f6 SH	218	container_of(iocb, struct qemu_laiocb, iocb);
2cdff7f6 SH	219
9e909a58	220	laiocb->ret = io_event_ret(&events[s->event_idx]);
2cdff7f6	221
9e909a58 RP	222	/* Change counters one-by-one because we can be nested. */
	223	s->io_q.in_flight--;
	224	s->event_idx++;
	225	qemu_laio_process_completion(laiocb);
	226	}
2cdff7f6	227	}
28b24087	228
9e909a58 RP	229	qemu_bh_cancel(s->completion_bh);
	230
	231	/* If we are nested we have to notify the level above that we are done
	232	* by setting event_max to zero, upper level will then jump out of it's
3202d8e4	233	* own `for` loop. If we are the last all counters dropped to zero. */
9e909a58 RP	234	s->event_max = 0;
9e909a58 RP	235	s->event_idx = 0;
84d61e5f SH	236
84d61e5f SH	237	defer_call_end();
3407de57	238	}
9e909a58	239
3407de57 RP	240	static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
	241	{
	242	qemu_laio_process_completions(s);
1919631e	243
07668288	244	if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
28b24087 PB	245	ioq_submit(s);
28b24087 PB	246	}
2cdff7f6 SH	247	}
2cdff7f6 SH	248
3407de57 RP	249	static void qemu_laio_completion_bh(void *opaque)
	250	{
	251	LinuxAioState *s = opaque;
	252
	253	qemu_laio_process_completions_and_submit(s);
	254	}
	255
2cdff7f6 SH	256	static void qemu_laio_completion_cb(EventNotifier *e)
2cdff7f6 SH	257	{
dd7f7ed1	258	LinuxAioState *s = container_of(e, LinuxAioState, e);
2cdff7f6 SH	259
2cdff7f6 SH	260	if (event_notifier_test_and_clear(&s->e)) {
3407de57	261	qemu_laio_process_completions_and_submit(s);
5c6c3a6c CH	262	}
	263	}
	264
ee686975 SH	265	static bool qemu_laio_poll_cb(void *opaque)
	266	{
	267	EventNotifier *e = opaque;
	268	LinuxAioState *s = container_of(e, LinuxAioState, e);
	269	struct io_event *events;
	270
826cc324 SH	271	return io_getevents_peek(s->ctx, &events);
	272	}
	273
	274	static void qemu_laio_poll_ready(EventNotifier *opaque)
	275	{
	276	EventNotifier *e = opaque;
	277	LinuxAioState *s = container_of(e, LinuxAioState, e);
ee686975 SH	278
ee686975 SH	279	qemu_laio_process_completions_and_submit(s);
ee686975 SH	280	}
ee686975 SH	281
1b3abdcc ML	282	static void ioq_init(LaioQueue *io_q)
1b3abdcc ML	283	{
28b24087	284	QSIMPLEQ_INIT(&io_q->pending);
5e1b34a3 RP	285	io_q->in_queue = 0;
5e1b34a3 RP	286	io_q->in_flight = 0;
43f2376e	287	io_q->blocked = false;
1b3abdcc ML	288	}
1b3abdcc ML	289
dd7f7ed1	290	static void ioq_submit(LinuxAioState *s)
1b3abdcc	291	{
82595da8	292	int ret, len;
28b24087	293	struct qemu_laiocb *aiocb;
5e1b34a3	294	struct iocb *iocbs[MAX_EVENTS];
82595da8	295	QSIMPLEQ_HEAD(, qemu_laiocb) completed;
1b3abdcc	296
43f2376e	297	do {
5e1b34a3 RP	298	if (s->io_q.in_flight >= MAX_EVENTS) {
	299	break;
	300	}
43f2376e PB	301	len = 0;
	302	QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
	303	iocbs[len++] = &aiocb->iocb;
5e1b34a3	304	if (s->io_q.in_flight + len >= MAX_EVENTS) {
43f2376e PB	305	break;
43f2376e PB	306	}
28b24087	307	}
1b3abdcc	308
43f2376e PB	309	ret = io_submit(s->ctx, len, iocbs);
43f2376e PB	310	if (ret == -EAGAIN) {
82595da8	311	break;
43f2376e PB	312	}
43f2376e PB	313	if (ret < 0) {
44713c9e KW	314	/* Fail the first request, retry the rest */
	315	aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
	316	QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
	317	s->io_q.in_queue--;
	318	aiocb->ret = ret;
	319	qemu_laio_process_completion(aiocb);
	320	continue;
43f2376e PB	321	}
43f2376e PB	322
5e1b34a3 RP	323	s->io_q.in_flight += ret;
5e1b34a3 RP	324	s->io_q.in_queue -= ret;
82595da8 PB	325	aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
82595da8 PB	326	QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
43f2376e	327	} while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
5e1b34a3	328	s->io_q.blocked = (s->io_q.in_queue > 0);
0ed93d84 RP	329
	330	if (s->io_q.in_flight) {
	331	/* We can try to complete something just right away if there are
	332	* still requests in-flight. */
	333	qemu_laio_process_completions(s);
	334	/*
	335	* Even we have completed everything (in_flight == 0), the queue can
	336	* have still pended requests (in_queue > 0). We do not attempt to
	337	* repeat submission to avoid IO hang. The reason is simple: s->e is
	338	* still set and completion callback will be called shortly and all
	339	* pended requests will be submitted from there.
	340	*/
	341	}
1b3abdcc ML	342	}
1b3abdcc ML	343
512da211 SG	344	static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
	345	{
	346	uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
	347
	348	/*
	349	* AIO context can be shared between multiple block devices, so
	350	* `dev_max_batch` allows reducing the batch size for latency-sensitive
	351	* devices.
	352	*/
	353	max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);
	354
	355	/* limit the batch with the number of available events */
	356	max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
	357
	358	return max_batch;
	359	}
	360
ccee48aa	361	static void laio_deferred_fn(void *opaque)
1b3abdcc	362	{
07668288	363	LinuxAioState *s = opaque;
f387cac5	364
07668288	365	if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
de354644	366	ioq_submit(s);
1b3abdcc	367	}
1b3abdcc ML	368	}
1b3abdcc ML	369
2174f12b	370	static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
512da211	371	int type, uint64_t dev_max_batch)
5c6c3a6c	372	{
2174f12b KW	373	LinuxAioState *s = laiocb->ctx;
	374	struct iocb *iocbs = &laiocb->iocb;
	375	QEMUIOVector *qiov = laiocb->qiov;
5c6c3a6c CH	376
	377	switch (type) {
	378	case QEMU_AIO_WRITE:
	379	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
7d37435b	380	break;
4751d09a SL	381	case QEMU_AIO_ZONE_APPEND:
	382	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
	383	break;
5c6c3a6c CH	384	case QEMU_AIO_READ:
5c6c3a6c CH	385	io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
7d37435b	386	break;
c30e624d	387	/* Currently Linux kernel does not support other operations */
5c6c3a6c CH	388	default:
	389	fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
	390	__func__, type);
2174f12b	391	return -EIO;
5c6c3a6c	392	}
c90caf25	393	io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
5c6c3a6c	394
28b24087	395	QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
5e1b34a3	396	s->io_q.in_queue++;
07668288 SH	397	if (!s->io_q.blocked) {
	398	if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
	399	ioq_submit(s);
	400	} else {
ccee48aa	401	defer_call(laio_deferred_fn, s);
07668288	402	}
1b3abdcc	403	}
5c6c3a6c	404
2174f12b KW	405	return 0;
	406	}
	407
ab50533b EGE	408	int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
ab50533b EGE	409	int type, uint64_t dev_max_batch)
2174f12b	410	{
2174f12b	411	int ret;
ab50533b	412	AioContext *ctx = qemu_get_current_aio_context();
2174f12b KW	413	struct qemu_laiocb laiocb = {
2174f12b KW	414	.co = qemu_coroutine_self(),
9d52aa3c	415	.nbytes = qiov->size,
ab50533b	416	.ctx = aio_get_linux_aio(ctx),
0ed93d84	417	.ret = -EINPROGRESS,
2174f12b KW	418	.is_read = (type == QEMU_AIO_READ),
	419	.qiov = qiov,
	420	};
	421
512da211	422	ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
2174f12b KW	423	if (ret < 0) {
	424	return ret;
	425	}
	426
0ed93d84 RP	427	if (laiocb.ret == -EINPROGRESS) {
	428	qemu_coroutine_yield();
	429	}
2174f12b KW	430	return laiocb.ret;
	431	}
	432
dd7f7ed1	433	void laio_detach_aio_context(LinuxAioState s, AioContext old_context)
c2f3426c	434	{
60f782b6	435	aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
2cdff7f6	436	qemu_bh_delete(s->completion_bh);
1919631e	437	s->aio_context = NULL;
c2f3426c SH	438	}
c2f3426c SH	439
dd7f7ed1	440	void laio_attach_aio_context(LinuxAioState s, AioContext new_context)
c2f3426c	441	{
0187f5c9	442	s->aio_context = new_context;
2cdff7f6	443	s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
60f782b6	444	aio_set_event_notifier(new_context, &s->e,
ee686975	445	qemu_laio_completion_cb,
826cc324 SH	446	qemu_laio_poll_cb,
826cc324 SH	447	qemu_laio_poll_ready);
c2f3426c SH	448	}
c2f3426c SH	449
ed6e2161	450	LinuxAioState laio_init(Error *errp)
5c6c3a6c	451	{
ed6e2161	452	int rc;
dd7f7ed1	453	LinuxAioState *s;
5c6c3a6c	454
7267c094	455	s = g_malloc0(sizeof(*s));
ed6e2161 NA	456	rc = event_notifier_init(&s->e, false);
ed6e2161 NA	457	if (rc < 0) {
7a21bee2	458	error_setg_errno(errp, -rc, "failed to initialize event notifier");
5c6c3a6c	459	goto out_free_state;
c90caf25	460	}
5c6c3a6c	461
ed6e2161 NA	462	rc = io_setup(MAX_EVENTS, &s->ctx);
	463	if (rc < 0) {
	464	error_setg_errno(errp, -rc, "failed to create linux AIO context");
5c6c3a6c	465	goto out_close_efd;
c90caf25	466	}
5c6c3a6c	467
1b3abdcc ML	468	ioq_init(&s->io_q);
1b3abdcc ML	469
5c6c3a6c CH	470	return s;
	471
	472	out_close_efd:
c90caf25	473	event_notifier_cleanup(&s->e);
5c6c3a6c	474	out_free_state:
7267c094	475	g_free(s);
5c6c3a6c CH	476	return NULL;
5c6c3a6c CH	477	}
abd269b7	478
dd7f7ed1	479	void laio_cleanup(LinuxAioState *s)
abd269b7	480	{
abd269b7	481	event_notifier_cleanup(&s->e);
a1abf40d GA	482
	483	if (io_destroy(s->ctx) != 0) {
	484	fprintf(stderr, "%s: destroy AIO context %p failed\n",
	485	__func__, &s->ctx);
	486	}
abd269b7 SH	487	g_free(s);
abd269b7 SH	488	}