[mirror_qemu.git] / block / linux-aio.c

/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"
#include "sysemu/block-backend.h"

/* Only used for assertions.  */
#include "qemu/coroutine_int.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  If we get more outstanding requests at a time
 *      than this we will get EAGAIN from io_submit which is communicated to
 *      the guest as an I/O error.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch. (default value) */
#define DEFAULT_MAX_BATCH 32

/*
 * Per-request state for one Linux AIO operation.
 *
 * laio_co_submit() builds one of these as a local variable of the submitting
 * coroutine; it is linked on LaioQueue.pending until io_submit() accepts it.
 */
struct qemu_laiocb {
    /* Coroutine that issued the request; woken when the request completes */
    Coroutine *co;
    /* AIO state the request was queued on */
    LinuxAioState *ctx;
    /* Control block handed to io_submit() */
    struct iocb iocb;
    /* -EINPROGRESS at submission, replaced by the result on completion */
    ssize_t ret;
    /* Expected transfer length (qiov->size when the request was created) */
    size_t nbytes;
    /* I/O vector of the request */
    QEMUIOVector *qiov;
    /* True for QEMU_AIO_READ; short reads are then zero-padded */
    bool is_read;
    /* Link in LaioQueue.pending */
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

/*
 * Submission queue bookkeeping for one LinuxAioState.
 */
typedef struct {
    /* Requests linked on @pending, not yet accepted by io_submit() */
    unsigned int in_queue;
    /* Requests accepted by io_submit() whose completion was not processed */
    unsigned int in_flight;
    /*
     * Set by ioq_submit() when requests are still queued after a submission
     * attempt.  While set, laio_do_submit() and laio_unplug_fn() do not
     * submit.
     */
    bool blocked;
    /* Queued requests in submission order */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

/*
 * State of one Linux AIO context together with the AioContext it is
 * currently attached to.
 */
struct LinuxAioState {
    /* Set by laio_attach_aio_context(), cleared by laio_detach_aio_context() */
    AioContext *aio_context;

    /* Kernel AIO context created by io_setup() in laio_init() */
    io_context_t ctx;
    /* Notifier whose fd is installed as the eventfd of every iocb */
    EventNotifier e;

    /* No locking required, only accessed from AioContext home thread */
    LaioQueue io_q;
    /* BH running qemu_laio_completion_bh(); created on attach */
    QEMUBH *completion_bh;
    /*
     * Position within, and size of, the batch of ring events currently being
     * processed.  Kept here rather than in locals so that nested invocations
     * of qemu_laio_process_completions() share them.
     */
    int event_idx;
    int event_max;
};

/* Forward declaration: the completion path calls this before its definition */
static void ioq_submit(LinuxAioState *s);

/*
 * Build the signed result of a completion event from its two result words:
 * res2 supplies the bits above bit 31, res is OR-ed into the low part.
 */
static inline ssize_t io_event_ret(struct io_event *ev)
{
    uint64_t upper = (uint64_t)ev->res2 << 32;

    return (ssize_t)(upper | ev->res);
}

/*
 * Completes an AIO request.
 *
 * Translates the raw result found in @laiocb->ret, stores the translated
 * value back and wakes the submitting coroutine:
 *  - a transfer of exactly @laiocb->nbytes becomes 0;
 *  - a short read has the rest of the buffer zeroed, the byte count is kept;
 *  - a short write becomes -ENOSPC;
 *  - -ECANCELED is left untouched.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int status = laiocb->ret;

    if (status != -ECANCELED) {
        if (status == laiocb->nbytes) {
            status = 0;
        } else if (status >= 0) {
            if (!laiocb->is_read) {
                status = -ENOSPC;
            } else {
                /* Short reads mean EOF, pad with zeros. */
                size_t missing = laiocb->qiov->size - status;

                qemu_iovec_memset(laiocb->qiov, status, 0, missing);
            }
        }
    }

    laiocb->ret = status;

    assert(laiocb->co->ctx == laiocb->ctx->aio_context);

    /*
     * A coroutine that is already entered must be inside ioq_submit(); it
     * will see the updated laiocb->ret once it continues.  Entering it again
     * here would be a recursive entry, which coroutines do not allow.
     */
    if (qemu_coroutine_entered(laiocb->co)) {
        return;
    }
    aio_co_wake(laiocb->co);
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c.  No common header exports it, but AIO
 * has existed for ages, so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned    id;    /* kernel internal index number */
    unsigned    nr;    /* number of io_events */
    unsigned    head;  /* Written to by userland or by kernel. */
    unsigned    tail;

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * on events array.  This function does not update the internal
 * ring buffer, only reads head and tail.  When @events has been
 * processed io_getevents_commit() must be called.
 *
 * Only the contiguous run starting at head is reported: when tail has
 * wrapped around behind head, the count stops at the end of the ring and
 * the remaining events are seen by a later call.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    /* The context handle is interpreted as the address of the shared ring */
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events on which head should be advanced
 *
 * Moves the ring head forward by @nr entries, wrapping at the ring size.
 * Nothing is written when @nr is zero.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring;

    if (nr == 0) {
        return;
    }

    ring = (struct aio_ring *)ctx;
    ring->head = (ring->head + nr) % ring->nr;
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 * @nr: the number of events on which head should be advanced
 *
 * Advances head of a ring buffer by @nr (the events already processed) and
 * returns the number of events available afterwards, with @events pointing
 * at the first of them.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * indices are kept in LinuxAioState.  Function schedules BH completion so it
 * can be called again in a nested event loop.  When there are no events left
 * to complete the BH is being canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /*
     * Each pass first commits the s->event_idx events consumed so far, then
     * peeks at the next contiguous batch; the loop ends when none are left.
     */
    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /*
             * Change counters one-by-one because we can be nested: they are
             * updated before the completion runs so a nested call starts
             * from the right position.
             */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero, upper level will then jump out of its
     * own `for` loop.  If we are the last all counters dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;
}

/*
 * Process every completion currently available, then submit queued
 * requests if any are pending.
 */
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

/*
 * Bottom-half handler installed by laio_attach_aio_context().
 * @opaque is the LinuxAioState.
 */
static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

/*
 * Event notifier handler installed by laio_attach_aio_context().
 *
 * Completions are processed only when event_notifier_test_and_clear()
 * returns true for the notifier.
 */
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

/*
 * Polling callback installed by laio_attach_aio_context().
 * @opaque is the EventNotifier embedded in the LinuxAioState.
 *
 * Returns true when the completion ring holds at least one event.  The ring
 * is only peeked at; no event is consumed here.
 */
static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

/*
 * Poll-ready callback installed by laio_attach_aio_context().
 * @opaque is the EventNotifier embedded in the LinuxAioState; completions
 * are processed and queued requests submitted.
 */
static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    LinuxAioState *state = container_of(opaque, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(state);
}

/* Reset @io_q to the empty, unblocked state. */
static void ioq_init(LaioQueue *io_q)
{
    io_q->blocked = false;
    io_q->in_flight = 0;
    io_q->in_queue = 0;
    QSIMPLEQ_INIT(&io_q->pending);
}

/*
 * Hand queued requests to the kernel.
 *
 * Requests are taken from the head of io_q.pending in batches sized so that
 * in_flight stays within MAX_EVENTS.  Submission stops when the queue is
 * empty, the in-flight limit is reached, io_submit() returns -EAGAIN, or the
 * kernel accepts only part of a batch.  Any other io_submit() error fails
 * the request at the head of the queue and submission continues with the
 * rest.
 *
 * On return io_q.blocked reflects whether requests are still queued.
 */
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    /*
     * Loop termination is spelled out with explicit breaks.  With a
     * do/while (ret == len ...) loop, the `continue` in the error path
     * would evaluate the condition with a negative ret and leave the loop,
     * so the remaining requests would never be retried.
     */
    while (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /*
             * Fail the first request, retry the rest.  The completion may
             * wake a coroutine that changes the pending queue, which is why
             * the batch is rebuilt from scratch on the next iteration.
             */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }
        if (ret == 0) {
            /* Nothing accepted: do not index iocbs[ret - 1] below */
            break;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        /* Drop the accepted requests from the front of the pending queue */
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);

        if (ret != len) {
            /* Partial submission: stop and leave the rest queued */
            break;
        }
    }
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0).  We do not attempt
         * to repeat submission to avoid IO hang.  The reason is simple: s->e
         * is still set and completion callback will be called shortly and
         * all pending requests will be submitted from there.
         */
    }
}

/*
 * Compute the batch size at which queued requests are submitted.
 *
 * Starts from the AioContext's aio_max_batch (DEFAULT_MAX_BATCH when that is
 * zero), then applies two further limits, each ignored when zero:
 * @dev_max_batch and the number of event slots still free.
 */
static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t limit = s->aio_context->aio_max_batch;

    if (!limit) {
        limit = DEFAULT_MAX_BATCH;
    }

    /*
     * An AioContext may serve several block devices; a latency-sensitive
     * device can lower the batch size through `dev_max_batch`.
     */
    limit = MIN_NON_ZERO(dev_max_batch, limit);

    /* never batch more than the events that are still available */
    return MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, limit);
}

/*
 * Deferred-submission callback registered through blk_io_plug_call().
 * @opaque is the LinuxAioState.  Submits the queued requests unless the
 * queue is blocked or empty.
 */
static void laio_unplug_fn(void *opaque)
{
    LinuxAioState *s = opaque;

    if (s->io_q.blocked || QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }
    ioq_submit(s);
}

/*
 * Prepare the iocb of @laiocb for @type and queue the request.
 *
 * Unless the queue is blocked, the request is either submitted at once
 * (the queue has reached the batch size) or left for laio_unplug_fn().
 *
 * Returns 0 when the request was queued, -EIO for an unsupported @type.
 */
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *cb = &laiocb->iocb;
    QEMUIOVector *vec = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
    case QEMU_AIO_ZONE_APPEND:
        /* Both request types are issued as a vectored write */
        io_prep_pwritev(cb, fd, vec->iov, vec->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(cb, fd, vec->iov, vec->niov, offset);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        return -EIO;
    }
    /* Completions of this request are signalled through the notifier's fd */
    io_set_eventfd(cb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;

    if (s->io_q.blocked) {
        return 0;
    }

    if (s->io_q.in_queue < laio_max_batch(s, dev_max_batch)) {
        blk_io_plug_call(laio_unplug_fn, s);
    } else {
        ioq_submit(s);
    }

    return 0;
}

/*
 * Submit one request of @type on @fd from coroutine context and wait for it.
 *
 * The request uses the Linux AIO state of the current AioContext.  The
 * coroutine yields only if the request is still in progress once it has
 * been queued.
 *
 * Returns a negative value if queueing failed, otherwise the result stored
 * by the completion path.
 */
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                int type, uint64_t dev_max_batch)
{
    AioContext *ctx = qemu_get_current_aio_context();
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        .nbytes     = qiov->size,
        .ctx        = aio_get_linux_aio(ctx),
        .ret        = -EINPROGRESS,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };
    int rc = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);

    if (rc >= 0) {
        /* The request may already have completed inside laio_do_submit() */
        if (laiocb.ret == -EINPROGRESS) {
            qemu_coroutine_yield();
        }
        rc = laiocb.ret;
    }
    return rc;
}

/*
 * Detach @s from @old_context: the notifier's handlers on @old_context are
 * replaced with NULL, the completion BH is deleted and s->aio_context is
 * cleared.
 */
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

/*
 * Attach @s to @new_context: records the context, creates the completion BH
 * in it and registers the notifier's read, poll and poll-ready handlers.
 */
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

/*
 * Allocate a LinuxAioState, create its event notifier and kernel AIO
 * context, and initialise its submission queue.
 *
 * Returns the new state, or NULL with @errp set when the notifier or the
 * AIO context cannot be created.  Nothing is leaked on failure.
 */
LinuxAioState *laio_init(Error **errp)
{
    LinuxAioState *s = g_malloc0(sizeof(*s));
    int rc = event_notifier_init(&s->e, false);

    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        g_free(s);
        return NULL;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        event_notifier_cleanup(&s->e);
        g_free(s);
        return NULL;
    }

    ioq_init(&s->io_q);

    return s;
}

/*
 * Release everything laio_init() created: the event notifier, the kernel
 * AIO context and the state itself.  A failure to destroy the AIO context
 * is reported on stderr; the state is freed regardless.
 */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        /*
         * Print the context handle itself, as a void pointer, rather than
         * the address of the field that stores it.
         */
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, (void *)s->ctx);
    }
    g_free(s);
}
Commit	Line	Data
5c6c3a6c CH	1	/*
	2	* Linux native AIO support.
	3	*
	4	* Copyright (C) 2009 IBM, Corp.
	5	* Copyright (C) 2009 Red Hat, Inc.
	6	*
	7	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	8	* See the COPYING file in the top-level directory.
	9	*/
80c71a24	10	#include "qemu/osdep.h"
737e150e	11	#include "block/aio.h"
1de7afc9	12	#include "qemu/queue.h"
2174f12b	13	#include "block/block.h"
9f8540ec	14	#include "block/raw-aio.h"
1de7afc9	15	#include "qemu/event_notifier.h"
2174f12b	16	#include "qemu/coroutine.h"
ed6e2161	17	#include "qapi/error.h"
07668288	18	#include "sysemu/block-backend.h"
5c6c3a6c	19
ab50533b EGE	20	/* Only used for assertions. */
	21	#include "qemu/coroutine_int.h"
	22
5c6c3a6c CH	23	#include <libaio.h>
	24
	25	/*
	26	* Queue size (per-device).
	27	*
	28	* XXX: eventually we need to communicate this to the guest and/or make it
	29	* tunable by the guest. If we get more outstanding requests at a time
	30	* than this we will get EAGAIN from io_submit which is communicated to
	31	* the guest as an I/O error.
	32	*/
2558cb8d	33	#define MAX_EVENTS 1024
5c6c3a6c	34
d7ddd0a1 SG	35	/* Maximum number of requests in a batch. (default value) */
	36	#define DEFAULT_MAX_BATCH 32
	37
5c6c3a6c	38	struct qemu_laiocb {
2174f12b	39	Coroutine *co;
dd7f7ed1	40	LinuxAioState *ctx;
5c6c3a6c CH	41	struct iocb iocb;
	42	ssize_t ret;
	43	size_t nbytes;
b161e2e4 KW	44	QEMUIOVector *qiov;
b161e2e4 KW	45	bool is_read;
28b24087	46	QSIMPLEQ_ENTRY(qemu_laiocb) next;
5c6c3a6c CH	47	};
5c6c3a6c CH	48
1b3abdcc	49	typedef struct {
5e1b34a3 RP	50	unsigned int in_queue;
5e1b34a3 RP	51	unsigned int in_flight;
43f2376e	52	bool blocked;
28b24087	53	QSIMPLEQ_HEAD(, qemu_laiocb) pending;
1b3abdcc ML	54	} LaioQueue;
1b3abdcc ML	55
dd7f7ed1	56	struct LinuxAioState {
0187f5c9 PB	57	AioContext *aio_context;
0187f5c9 PB	58
5c6c3a6c	59	io_context_t ctx;
c90caf25	60	EventNotifier e;
1b3abdcc	61
ab50533b	62	/* No locking required, only accessed from AioContext home thread */
1b3abdcc	63	LaioQueue io_q;
2cdff7f6	64	QEMUBH *completion_bh;
2cdff7f6 SH	65	int event_idx;
2cdff7f6 SH	66	int event_max;
5c6c3a6c CH	67	};
5c6c3a6c CH	68
dd7f7ed1	69	static void ioq_submit(LinuxAioState *s);
28b24087	70
5c6c3a6c CH	71	static inline ssize_t io_event_ret(struct io_event *ev)
	72	{
	73	return (ssize_t)(((uint64_t)ev->res2 << 32) \| ev->res);
	74	}
	75
db0ffc24	76	/*
2b02fd81	77	* Completes an AIO request.
db0ffc24	78	*/
dd7f7ed1	79	static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
db0ffc24 KW	80	{
	81	int ret;
	82
db0ffc24 KW	83	ret = laiocb->ret;
db0ffc24 KW	84	if (ret != -ECANCELED) {
b161e2e4	85	if (ret == laiocb->nbytes) {
db0ffc24	86	ret = 0;
b161e2e4 KW	87	} else if (ret >= 0) {
	88	/* Short reads mean EOF, pad with zeros. */
	89	if (laiocb->is_read) {
3d9b4925 MT	90	qemu_iovec_memset(laiocb->qiov, ret, 0,
3d9b4925 MT	91	laiocb->qiov->size - ret);
b161e2e4	92	} else {
1c42f149	93	ret = -ENOSPC;
b161e2e4 KW	94	}
b161e2e4 KW	95	}
db0ffc24 KW	96	}
db0ffc24 KW	97
2174f12b	98	laiocb->ret = ret;
2b02fd81 JS	99
	100	/*
	101	* If the coroutine is already entered it must be in ioq_submit() and
	102	* will notice laio->ret has been filled in when it eventually runs
	103	* later. Coroutines cannot be entered recursively so avoid doing
	104	* that!
	105	*/
ab50533b	106	assert(laiocb->co->ctx == laiocb->ctx->aio_context);
2b02fd81 JS	107	if (!qemu_coroutine_entered(laiocb->co)) {
2b02fd81 JS	108	aio_co_wake(laiocb->co);
2174f12b	109	}
db0ffc24 KW	110	}
db0ffc24 KW	111
9e909a58 RP	112	/**
	113	* aio_ring buffer which is shared between userspace and kernel.
	114	*
	115	* This copied from linux/fs/aio.c, common header does not exist
	116	* but AIO exists for ages so we assume ABI is stable.
	117	*/
	118	struct aio_ring {
	119	unsigned id; /* kernel internal index number */
	120	unsigned nr; /* number of io_events */
	121	unsigned head; /* Written to by userland or by kernel. */
	122	unsigned tail;
	123
	124	unsigned magic;
	125	unsigned compat_features;
	126	unsigned incompat_features;
	127	unsigned header_length; /* size of aio_ring */
	128
f7795e40	129	struct io_event io_events[];
9e909a58 RP	130	};
	131
	132	/**
	133	* io_getevents_peek:
	134	* @ctx: AIO context
	135	* @events: pointer on events array, output value
	136
	137	* Returns the number of completed events and sets a pointer
	138	* on events array. This function does not update the internal
	139	* ring buffer, only reads head and tail. When @events has been
	140	* processed io_getevents_commit() must be called.
	141	*/
	142	static inline unsigned int io_getevents_peek(io_context_t ctx,
	143	struct io_event **events)
	144	{
	145	struct aio_ring ring = (struct aio_ring )ctx;
	146	unsigned int head = ring->head, tail = ring->tail;
	147	unsigned int nr;
	148
	149	nr = tail >= head ? tail - head : ring->nr - head;
	150	*events = ring->io_events + head;
	151	/* To avoid speculative loads of s->events[i] before observing tail.
	152	Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
	153	smp_rmb();
	154
	155	return nr;
	156	}
	157
	158	/**
	159	* io_getevents_commit:
	160	* @ctx: AIO context
	161	* @nr: the number of events on which head should be advanced
	162	*
	163	* Advances head of a ring buffer.
	164	*/
	165	static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
	166	{
	167	struct aio_ring ring = (struct aio_ring )ctx;
	168
	169	if (nr) {
	170	ring->head = (ring->head + nr) % ring->nr;
	171	}
	172	}
	173
	174	/**
	175	* io_getevents_advance_and_peek:
	176	* @ctx: AIO context
	177	* @events: pointer on events array, output value
	178	* @nr: the number of events on which head should be advanced
	179	*
	180	* Advances head of a ring buffer and returns number of elements left.
	181	*/
	182	static inline unsigned int
	183	io_getevents_advance_and_peek(io_context_t ctx,
	184	struct io_event **events,
	185	unsigned int nr)
	186	{
	187	io_getevents_commit(ctx, nr);
	188	return io_getevents_peek(ctx, events);
	189	}
	190
3407de57 RP	191	/**
	192	* qemu_laio_process_completions:
	193	* @s: AIO state
	194	*
	195	* Fetches completed I/O requests and invokes their callbacks.
2cdff7f6 SH	196	*
	197	* The function is somewhat tricky because it supports nested event loops, for
	198	* example when a request callback invokes aio_poll(). In order to do this,
3407de57 RP	199	* indices are kept in LinuxAioState. Function schedules BH completion so it
	200	* can be called again in a nested event loop. When there are no events left
	201	* to complete the BH is being canceled.
2cdff7f6	202	*/
3407de57	203	static void qemu_laio_process_completions(LinuxAioState *s)
5c6c3a6c	204	{
9e909a58	205	struct io_event *events;
5c6c3a6c	206
2cdff7f6 SH	207	/* Reschedule so nested event loops see currently pending completions */
2cdff7f6 SH	208	qemu_bh_schedule(s->completion_bh);
5c6c3a6c	209
9e909a58 RP	210	while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
	211	s->event_idx))) {
	212	for (s->event_idx = 0; s->event_idx < s->event_max; ) {
	213	struct iocb *iocb = events[s->event_idx].obj;
	214	struct qemu_laiocb *laiocb =
2cdff7f6 SH	215	container_of(iocb, struct qemu_laiocb, iocb);
2cdff7f6 SH	216
9e909a58	217	laiocb->ret = io_event_ret(&events[s->event_idx]);
2cdff7f6	218
9e909a58 RP	219	/* Change counters one-by-one because we can be nested. */
	220	s->io_q.in_flight--;
	221	s->event_idx++;
	222	qemu_laio_process_completion(laiocb);
	223	}
2cdff7f6	224	}
28b24087	225
9e909a58 RP	226	qemu_bh_cancel(s->completion_bh);
	227
	228	/* If we are nested we have to notify the level above that we are done
	229	* by setting event_max to zero, upper level will then jump out of it's
3202d8e4	230	* own `for` loop. If we are the last all counters dropped to zero. */
9e909a58 RP	231	s->event_max = 0;
9e909a58 RP	232	s->event_idx = 0;
3407de57	233	}
9e909a58	234
3407de57 RP	235	static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
	236	{
	237	qemu_laio_process_completions(s);
1919631e	238
07668288	239	if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
28b24087 PB	240	ioq_submit(s);
28b24087 PB	241	}
2cdff7f6 SH	242	}
2cdff7f6 SH	243
3407de57 RP	244	static void qemu_laio_completion_bh(void *opaque)
	245	{
	246	LinuxAioState *s = opaque;
	247
	248	qemu_laio_process_completions_and_submit(s);
	249	}
	250
2cdff7f6 SH	251	static void qemu_laio_completion_cb(EventNotifier *e)
2cdff7f6 SH	252	{
dd7f7ed1	253	LinuxAioState *s = container_of(e, LinuxAioState, e);
2cdff7f6 SH	254
2cdff7f6 SH	255	if (event_notifier_test_and_clear(&s->e)) {
3407de57	256	qemu_laio_process_completions_and_submit(s);
5c6c3a6c CH	257	}
	258	}
	259
ee686975 SH	260	static bool qemu_laio_poll_cb(void *opaque)
	261	{
	262	EventNotifier *e = opaque;
	263	LinuxAioState *s = container_of(e, LinuxAioState, e);
	264	struct io_event *events;
	265
826cc324 SH	266	return io_getevents_peek(s->ctx, &events);
	267	}
	268
	269	static void qemu_laio_poll_ready(EventNotifier *opaque)
	270	{
	271	EventNotifier *e = opaque;
	272	LinuxAioState *s = container_of(e, LinuxAioState, e);
ee686975 SH	273
ee686975 SH	274	qemu_laio_process_completions_and_submit(s);
ee686975 SH	275	}
ee686975 SH	276
1b3abdcc ML	277	static void ioq_init(LaioQueue *io_q)
1b3abdcc ML	278	{
28b24087	279	QSIMPLEQ_INIT(&io_q->pending);
5e1b34a3 RP	280	io_q->in_queue = 0;
5e1b34a3 RP	281	io_q->in_flight = 0;
43f2376e	282	io_q->blocked = false;
1b3abdcc ML	283	}
1b3abdcc ML	284
dd7f7ed1	285	static void ioq_submit(LinuxAioState *s)
1b3abdcc	286	{
82595da8	287	int ret, len;
28b24087	288	struct qemu_laiocb *aiocb;
5e1b34a3	289	struct iocb *iocbs[MAX_EVENTS];
82595da8	290	QSIMPLEQ_HEAD(, qemu_laiocb) completed;
1b3abdcc	291
43f2376e	292	do {
5e1b34a3 RP	293	if (s->io_q.in_flight >= MAX_EVENTS) {
	294	break;
	295	}
43f2376e PB	296	len = 0;
	297	QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
	298	iocbs[len++] = &aiocb->iocb;
5e1b34a3	299	if (s->io_q.in_flight + len >= MAX_EVENTS) {
43f2376e PB	300	break;
43f2376e PB	301	}
28b24087	302	}
1b3abdcc	303
43f2376e PB	304	ret = io_submit(s->ctx, len, iocbs);
43f2376e PB	305	if (ret == -EAGAIN) {
82595da8	306	break;
43f2376e PB	307	}
43f2376e PB	308	if (ret < 0) {
44713c9e KW	309	/* Fail the first request, retry the rest */
	310	aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
	311	QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
	312	s->io_q.in_queue--;
	313	aiocb->ret = ret;
	314	qemu_laio_process_completion(aiocb);
	315	continue;
43f2376e PB	316	}
43f2376e PB	317
5e1b34a3 RP	318	s->io_q.in_flight += ret;
5e1b34a3 RP	319	s->io_q.in_queue -= ret;
82595da8 PB	320	aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
82595da8 PB	321	QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
43f2376e	322	} while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
5e1b34a3	323	s->io_q.blocked = (s->io_q.in_queue > 0);
0ed93d84 RP	324
	325	if (s->io_q.in_flight) {
	326	/* We can try to complete something just right away if there are
	327	* still requests in-flight. */
	328	qemu_laio_process_completions(s);
	329	/*
	330	* Even we have completed everything (in_flight == 0), the queue can
	331	* have still pended requests (in_queue > 0). We do not attempt to
	332	* repeat submission to avoid IO hang. The reason is simple: s->e is
	333	* still set and completion callback will be called shortly and all
	334	* pended requests will be submitted from there.
	335	*/
	336	}
1b3abdcc ML	337	}
1b3abdcc ML	338
512da211 SG	339	static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
	340	{
	341	uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
	342
	343	/*
	344	* AIO context can be shared between multiple block devices, so
	345	* `dev_max_batch` allows reducing the batch size for latency-sensitive
	346	* devices.
	347	*/
	348	max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);
	349
	350	/* limit the batch with the number of available events */
	351	max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
	352
	353	return max_batch;
	354	}
	355
07668288	356	static void laio_unplug_fn(void *opaque)
1b3abdcc	357	{
07668288	358	LinuxAioState *s = opaque;
f387cac5	359
07668288	360	if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
de354644	361	ioq_submit(s);
1b3abdcc	362	}
1b3abdcc ML	363	}
1b3abdcc ML	364
2174f12b	365	static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
512da211	366	int type, uint64_t dev_max_batch)
5c6c3a6c	367	{
2174f12b KW	368	LinuxAioState *s = laiocb->ctx;
	369	struct iocb *iocbs = &laiocb->iocb;
	370	QEMUIOVector *qiov = laiocb->qiov;
5c6c3a6c CH	371
	372	switch (type) {
	373	case QEMU_AIO_WRITE:
	374	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
7d37435b	375	break;
4751d09a SL	376	case QEMU_AIO_ZONE_APPEND:
	377	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
	378	break;
5c6c3a6c CH	379	case QEMU_AIO_READ:
5c6c3a6c CH	380	io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
7d37435b	381	break;
c30e624d	382	/* Currently Linux kernel does not support other operations */
5c6c3a6c CH	383	default:
	384	fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
	385	__func__, type);
2174f12b	386	return -EIO;
5c6c3a6c	387	}
c90caf25	388	io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
5c6c3a6c	389
28b24087	390	QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
5e1b34a3	391	s->io_q.in_queue++;
07668288 SH	392	if (!s->io_q.blocked) {
	393	if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
	394	ioq_submit(s);
	395	} else {
	396	blk_io_plug_call(laio_unplug_fn, s);
	397	}
1b3abdcc	398	}
5c6c3a6c	399
2174f12b KW	400	return 0;
	401	}
	402
ab50533b EGE	403	int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
ab50533b EGE	404	int type, uint64_t dev_max_batch)
2174f12b	405	{
2174f12b	406	int ret;
ab50533b	407	AioContext *ctx = qemu_get_current_aio_context();
2174f12b KW	408	struct qemu_laiocb laiocb = {
2174f12b KW	409	.co = qemu_coroutine_self(),
9d52aa3c	410	.nbytes = qiov->size,
ab50533b	411	.ctx = aio_get_linux_aio(ctx),
0ed93d84	412	.ret = -EINPROGRESS,
2174f12b KW	413	.is_read = (type == QEMU_AIO_READ),
	414	.qiov = qiov,
	415	};
	416
512da211	417	ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
2174f12b KW	418	if (ret < 0) {
	419	return ret;
	420	}
	421
0ed93d84 RP	422	if (laiocb.ret == -EINPROGRESS) {
	423	qemu_coroutine_yield();
	424	}
2174f12b KW	425	return laiocb.ret;
	426	}
	427
dd7f7ed1	428	void laio_detach_aio_context(LinuxAioState s, AioContext old_context)
c2f3426c	429	{
60f782b6	430	aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
2cdff7f6	431	qemu_bh_delete(s->completion_bh);
1919631e	432	s->aio_context = NULL;
c2f3426c SH	433	}
c2f3426c SH	434
dd7f7ed1	435	void laio_attach_aio_context(LinuxAioState s, AioContext new_context)
c2f3426c	436	{
0187f5c9	437	s->aio_context = new_context;
2cdff7f6	438	s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
60f782b6	439	aio_set_event_notifier(new_context, &s->e,
ee686975	440	qemu_laio_completion_cb,
826cc324 SH	441	qemu_laio_poll_cb,
826cc324 SH	442	qemu_laio_poll_ready);
c2f3426c SH	443	}
c2f3426c SH	444
ed6e2161	445	LinuxAioState laio_init(Error *errp)
5c6c3a6c	446	{
ed6e2161	447	int rc;
dd7f7ed1	448	LinuxAioState *s;
5c6c3a6c	449
7267c094	450	s = g_malloc0(sizeof(*s));
ed6e2161 NA	451	rc = event_notifier_init(&s->e, false);
ed6e2161 NA	452	if (rc < 0) {
7a21bee2	453	error_setg_errno(errp, -rc, "failed to initialize event notifier");
5c6c3a6c	454	goto out_free_state;
c90caf25	455	}
5c6c3a6c	456
ed6e2161 NA	457	rc = io_setup(MAX_EVENTS, &s->ctx);
	458	if (rc < 0) {
	459	error_setg_errno(errp, -rc, "failed to create linux AIO context");
5c6c3a6c	460	goto out_close_efd;
c90caf25	461	}
5c6c3a6c	462
1b3abdcc ML	463	ioq_init(&s->io_q);
1b3abdcc ML	464
5c6c3a6c CH	465	return s;
	466
	467	out_close_efd:
c90caf25	468	event_notifier_cleanup(&s->e);
5c6c3a6c	469	out_free_state:
7267c094	470	g_free(s);
5c6c3a6c CH	471	return NULL;
5c6c3a6c CH	472	}
abd269b7	473
dd7f7ed1	474	void laio_cleanup(LinuxAioState *s)
abd269b7	475	{
abd269b7	476	event_notifier_cleanup(&s->e);
a1abf40d GA	477
	478	if (io_destroy(s->ctx) != 0) {
	479	fprintf(stderr, "%s: destroy AIO context %p failed\n",
	480	__func__, &s->ctx);
	481	}
abd269b7 SH	482	g_free(s);
abd269b7 SH	483	}