/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest. If we get more outstanding requests at a time
 * than this we will get EAGAIN from io_submit which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch (default value). */
#define DEFAULT_MAX_BATCH 32

struct qemu_laiocb {
    Coroutine *co;
    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

typedef struct {
    int plugged;
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

struct LinuxAioState {
    AioContext *aio_context;

    io_context_t ctx;
    EventNotifier e;

    /* I/O queue for batched submission.  Protected by the AioContext lock. */
    LaioQueue io_q;

    /* I/O completion processing.  Only runs in the I/O thread. */
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}
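
/*
 * A worked example of the packing above, assuming the usual Linux AIO
 * convention (res carries the byte count or a negative errno, res2 is 0):
 * a fully successful 4096-byte request arrives as res = 4096, res2 = 0,
 * so io_event_ret() returns 4096; a failed request arrives with a negative
 * errno such as -EIO in res, which the cast propagates as a negative
 * ssize_t.
 */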

/*
 * Completes an AIO request.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                                  laiocb->qiov->size - ret);
            } else {
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;

    /*
     * If the coroutine has already been entered it must be in ioq_submit()
     * and will notice that laiocb->ret has been filled in when it eventually
     * runs later. Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}
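
/*
 * For illustration of the short-transfer handling above: a 4096-byte read
 * that hits end-of-file after 512 bytes arrives here with ret == 512, so
 * the remaining 3584 bytes of the iovec are zero-filled; a short *write*
 * is instead reported to the caller as -ENOSPC.
 */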

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; a common header does not exist,
 * but AIO has existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned    id;     /* kernel internal index number */
    unsigned    nr;     /* number of io_events */
    unsigned    head;   /* Written to by userland or by kernel. */
    unsigned    tail;

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer to the events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * to the events array. This function does not update the internal
 * ring buffer; it only reads head and tail. When @events has been
 * processed, io_getevents_commit() must be called.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr) {
        ring->head = (ring->head + nr) % ring->nr;
    }
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer to the events array, output value
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer and returns the number of
 * elements left.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}
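
/*
 * A minimal sketch of the peek/commit protocol above (handle() is a
 * hypothetical consumer, not part of this file):
 *
 *     struct io_event *ev;
 *     unsigned int nr = io_getevents_peek(ctx, &ev);
 *     for (unsigned int i = 0; i < nr; i++) {
 *         handle(&ev[i]);              // process without touching the ring
 *     }
 *     io_getevents_commit(ctx, nr);    // only now advance the head
 *
 * When the ring wraps, io_getevents_peek() only returns the events up to
 * the end of the array (ring->nr - head); a second peek after committing
 * picks up the wrapped part, which is what io_getevents_advance_and_peek()
 * chains together for the loop in qemu_laio_process_completions().
 */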

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops,
 * for example when a request callback invokes aio_poll(). In order to
 * support this, the completion indices are kept in LinuxAioState. The
 * function schedules a BH so that it can be called again from a nested
 * event loop. When there are no events left to complete, the BH is
 * canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero; the upper level will then jump out of
     * its own `for` loop. If we are the last, all counters drop to zero. */
    s->event_max = 0;
    s->event_idx = 0;
}
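
/*
 * A concrete nesting scenario (illustrative): while the loop above runs,
 * qemu_laio_process_completion() can wake a coroutine that calls aio_poll(),
 * which fires the rescheduled completion_bh and re-enters this function. The
 * inner call first commits the s->event_idx events the outer call already
 * processed, handles whatever remains, and zeroes event_max/event_idx on the
 * way out, so the outer invocation's loops terminate instead of touching
 * stale events.
 */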

static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    aio_context_acquire(s->aio_context);
    qemu_laio_process_completions(s);

    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
    aio_context_release(s->aio_context);
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}
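
/*
 * Note on the polling path: when the AioContext runs in polling mode,
 * qemu_laio_poll_cb() inspects the completion ring directly instead of
 * waiting for the eventfd, and qemu_laio_poll_ready() consumes the events;
 * the eventfd-driven qemu_laio_completion_cb() remains the fallback when
 * polling is disabled or times out.
 */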

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
}

static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something right away if there are
         * still requests in flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still contain pending requests (in_queue > 0). We do not
         * attempt to repeat submission to avoid an I/O hang. The reason is
         * simple: s->e is still set and the completion callback will be
         * called shortly, and all pending requests will be submitted from
         * there.
         */
    }
}
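
/*
 * A worked example of the partial-submission path above: with 100 requests
 * pending and an empty ring, io_submit() is handed len == 100 iocbs. If the
 * kernel accepts only 60 (ret == 60 < len), the loop exits with in_flight
 * increased by 60 and in_queue at 40, and blocked is set so laio_do_submit()
 * stops calling ioq_submit() until completions drain the ring and
 * qemu_laio_process_completions_and_submit() retries the remainder.
 */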

static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;

    /*
     * An AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);

    /* limit the batch with the number of available events */
    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);

    return max_batch;
}
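
/*
 * For illustration (values assumed): with the default aio_max_batch of 0,
 * max_batch starts at DEFAULT_MAX_BATCH (32); a device-specific
 * dev_max_batch of 16 lowers it to MIN_NON_ZERO(16, 32) == 16; and if 1016
 * of the MAX_EVENTS (1024) ring slots are already in flight, the free-slot
 * clamp MIN_NON_ZERO(1024 - 1016, 16) brings the result down to 8.
 */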

void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    s->io_q.plugged++;
}

void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
                    uint64_t dev_max_batch)
{
    assert(s->io_q.plugged);
    s->io_q.plugged--;

    /*
     * Why max batch checking is performed here:
     * Another BDS may have queued requests with a higher dev_max_batch and
     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
     * batch so we can honor our device's dev_max_batch.
     */
    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
        (!s->io_q.plugged &&
         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
        ioq_submit(s);
    }
}
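
/*
 * Intended usage (a sketch, simplified from how the block layer drives it):
 * a device emulator calls laio_io_plug() before dispatching a burst of
 * requests and laio_io_unplug() afterwards. Each request's coroutine calls
 * laio_co_submit(), which only queues its iocb while plugged; the final
 * unplug then issues the accumulated batch with a single io_submit(). While
 * plugged, laio_do_submit() still flushes early once in_queue reaches
 * laio_max_batch(), so the queue cannot grow without bound.
 */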

static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently the Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
    if (!s->io_q.blocked &&
        (!s->io_q.plugged ||
         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
        ioq_submit(s);
    }

    return 0;
}

int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type,
                                uint64_t dev_max_batch)
{
    int ret;
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        .nbytes     = qiov->size,
        .ctx        = s,
        .ret        = -EINPROGRESS,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
    if (ret < 0) {
        return ret;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}
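
/*
 * A hedged usage sketch (fd, aio and the surrounding driver are assumptions,
 * not from this file): a block driver in coroutine context submits a read
 * and sleeps until qemu_laio_process_completion() wakes it:
 *
 *     ret = laio_co_submit(bs, aio, fd, offset, &qiov, QEMU_AIO_READ, 0);
 *     if (ret < 0) {
 *         return ret;    // -EIO, -ENOSPC, or an io_submit() error
 *     }
 *
 * The yield is skipped when the request already completed inside
 * laio_do_submit() (laiocb.ret != -EINPROGRESS), avoiding a needless
 * reschedule.
 */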

void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, false, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

LinuxAioState *laio_init(Error **errp)
{
    int rc;
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto out_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}
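
/*
 * Lifecycle sketch (error handling elided): a LinuxAioState created here is
 * only usable once attached to an AioContext, and must be detached again
 * before destruction:
 *
 *     LinuxAioState *aio = laio_init(&err);
 *     laio_attach_aio_context(aio, ctx);
 *     ... laio_co_submit() from coroutines running in ctx ...
 *     laio_detach_aio_context(aio, ctx);
 *     laio_cleanup(aio);
 */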

void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, &s->ctx);
    }
    g_free(s);
}