[mirror_qemu.git] / block / linux-aio.c

/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  ioq_submit() stops submitting once this many
 *      requests are in flight; requests beyond that, and requests for which
 *      io_submit returns EAGAIN, stay on the pending queue and are submitted
 *      later rather than being reported to the guest as an I/O error.
 */
#define MAX_EVENTS 128

/*
 * State of a single Linux AIO request.
 */
struct qemu_laiocb {
    /* Must stay the first member: laio_cancel() casts BlockAIOCB * to this
     * structure. */
    BlockAIOCB common;
    /* Coroutine to re-enter on completion; when NULL the completion is
     * delivered through common.cb instead. */
    Coroutine *co;
    /* AIO state this request was submitted through. */
    LinuxAioState *ctx;
    /* Control block passed to io_submit(); completion events point back to
     * it and the request is recovered with container_of(). */
    struct iocb iocb;
    /* -EINPROGRESS while the request is outstanding, the result afterwards. */
    ssize_t ret;
    /* Number of bytes a complete transfer is expected to move. */
    size_t nbytes;
    /* I/O vector describing the data buffers. */
    QEMUIOVector *qiov;
    /* True for reads; short reads are zero-padded, short writes fail. */
    bool is_read;
    /* Link in LaioQueue.pending. */
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

/*
 * Queue of requests that have been prepared but not yet handed to the
 * kernel, plus the counters that drive batching.
 */
typedef struct {
    /* Nesting depth of laio_io_plug() calls; submission is deferred while
     * it is non-zero (unless the queue fills up). */
    int plugged;
    /* Number of requests currently linked on @pending. */
    unsigned int in_queue;
    /* Number of requests accepted by io_submit() whose completion has not
     * been processed yet. */
    unsigned int in_flight;
    /* Set by ioq_submit() when requests were left on @pending; while set,
     * laio_do_submit() and laio_io_unplug() do not call ioq_submit(). */
    bool blocked;
    /* Requests waiting for submission, in arrival order. */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

/*
 * Linux AIO state: one kernel AIO context together with the event notifier
 * and bottom half used to process its completions.
 */
struct LinuxAioState {
    /* AioContext recorded by laio_attach_aio_context(). */
    AioContext *aio_context;

    /* Kernel AIO context created by io_setup(). */
    io_context_t ctx;
    /* Event notifier whose fd is attached to every iocb via
     * io_set_eventfd(). */
    EventNotifier e;

    /* io queue for submit at batch */
    LaioQueue io_q;

    /* I/O completion processing */
    QEMUBH *completion_bh;
    /* Position in, and size of, the batch of events currently being
     * processed.  Kept here rather than in locals so that nested calls of
     * qemu_laio_process_completions() share the same progress. */
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

/*
 * Build the result of a completion event from its two result words:
 * res2 is shifted into the upper 32 bits and res is OR-ed in.
 */
static inline ssize_t io_event_ret(struct io_event *ev)
{
    uint64_t upper = (uint64_t)ev->res2 << 32;

    return (ssize_t)(upper | ev->res);
}

/*
 * Finish one AIO request.
 *
 * Translates the raw result stored in laiocb->ret into the value reported
 * to the caller, stores it back, and then either re-enters the waiting
 * coroutine or calls the completion callback and drops the ACB reference.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int result = laiocb->ret;

    if (result != -ECANCELED) {
        if (result == laiocb->nbytes) {
            /* Full transfer. */
            result = 0;
        } else if (result >= 0 && laiocb->is_read) {
            /* A short read means EOF: zero-fill the rest of the buffer. */
            qemu_iovec_memset(laiocb->qiov, result, 0,
                              laiocb->qiov->size - result);
        } else if (result >= 0) {
            /* A short write is reported as out of space. */
            result = -ENOSPC;
        }
    }

    laiocb->ret = result;

    if (!laiocb->co) {
        laiocb->common.cb(laiocb->common.opaque, result);
        qemu_aio_unref(laiocb);
        return;
    }

    /* A coroutine that is already entered must be inside ioq_submit(); it
     * will see laiocb->ret once it runs again.  Coroutines cannot be entered
     * recursively, so only enter it when it is not running.
     */
    if (!qemu_coroutine_entered(laiocb->co)) {
        qemu_coroutine_enter(laiocb->co);
    }
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; no common header exists, but AIO has
 * existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned    id;    /* kernel internal index number */
    unsigned    nr;    /* number of io_events */
    unsigned    head;  /* Written to by userland or by kernel. */
    unsigned    tail;

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    /* Event slots follow the header; a C99 flexible array member is used
     * instead of the GNU zero-length array extension. */
    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * on events array.  This function does not update the internal
 * ring buffer, only reads head and tail.  When @events has been
 * processed io_getevents_commit() must be called.
 *
 * Only the contiguous run starting at head is reported: when tail is
 * behind head, the count stops at the end of the ring (ring->nr - head)
 * and the remaining events are seen by a later call.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    /* The context handle is treated as the address of the shared ring. */
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of the event entries before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: number of consumed events
 *
 * Moves the ring head forward by @nr slots, wrapping at the ring size.
 * Nothing is written when @nr is zero.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring;

    if (nr == 0) {
        return;
    }

    ring = (struct aio_ring *)ctx;
    ring->head = (ring->head + nr) % ring->nr;
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: output, set to the first unprocessed event
 * @nr: number of events consumed since the previous peek
 *
 * Commits @nr consumed events and then peeks again, returning how many
 * events are available at the new head.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    unsigned int available;

    io_getevents_commit(ctx, nr);
    available = io_getevents_peek(ctx, events);

    return available;
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * the batch indices (event_idx, event_max) are kept in LinuxAioState rather
 * than in local variables.  The function schedules the completion BH on entry
 * so it can be called again from a nested event loop, and cancels the BH once
 * there are no events left to complete.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /* Each pass first commits the events consumed so far (event_idx) and
     * then peeks at the next contiguous run of completed events. */
    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested: they must
             * already be up to date when the completion below runs. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero; the upper level will then jump out of its
     * own `for` loop.  If we are the last level, all counters have dropped
     * to zero anyway. */
    s->event_max = 0;
    s->event_idx = 0;
}

/*
 * Process all available completions, then submit queued requests unless
 * the queue is plugged or has nothing pending.
 */
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (s->io_q.plugged || QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }
    ioq_submit(s);
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

/*
 * Event notifier handler: if the notifier was set, clear it and process
 * completions followed by any pending submissions.
 */
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *state = container_of(e, LinuxAioState, e);

    if (!event_notifier_test_and_clear(&state->e)) {
        return;
    }
    qemu_laio_process_completions_and_submit(state);
}

/*
 * Asynchronous cancel hook for Linux AIO requests.
 *
 * Requests that are no longer in progress are ignored.  Otherwise
 * io_cancel() is attempted and the request is marked -ECANCELED; the
 * callback is invoked here only when io_cancel() succeeded.
 */
static void laio_cancel(BlockAIOCB *blockacb)
{
    struct qemu_laiocb *req = (struct qemu_laiocb *)blockacb;
    struct io_event cancel_event;
    int cancel_rc;

    if (req->ret != -EINPROGRESS) {
        return;
    }

    cancel_rc = io_cancel(req->ctx->ctx, &req->iocb, &cancel_event);
    req->ret = -ECANCELED;

    if (cancel_rc == 0) {
        req->common.cb(req->common.opaque, req->ret);
    }
    /* Otherwise the iocb was not cancelled and the callback will be called
     * by the event loop later. */
}

/* ACB descriptor for callback-based requests allocated in laio_submit(). */
static const AIOCBInfo laio_aiocb_info = {
    .cancel_async = laio_cancel,
    .aiocb_size   = sizeof(struct qemu_laiocb),
};

/* Reset a submission queue: no pending requests, unplugged, not blocked. */
static void ioq_init(LaioQueue *io_q)
{
    io_q->in_flight = 0;
    io_q->in_queue = 0;
    io_q->plugged = 0;
    io_q->blocked = false;
    QSIMPLEQ_INIT(&io_q->pending);
}

/*
 * Hand queued requests to the kernel.
 *
 * Requests are taken from the head of s->io_q.pending in batches sized so
 * that in_flight does not exceed MAX_EVENTS.  Submission stops when the
 * queue is empty, when MAX_EVENTS requests are in flight, when io_submit()
 * returns -EAGAIN, or when io_submit() accepts only part of a batch.  Any
 * other io_submit() error fails the request at the head of the queue and
 * the remaining requests are retried.
 *
 * On return s->io_q.blocked is set iff requests are still queued.  If
 * anything is in flight, completions are processed right away.
 */
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    /*
     * Loop with explicit exits.  With a do { } while (ret == len && ...)
     * loop, the `continue` on the error path jumps to the condition, which
     * is false for a negative ret, so the rest of the queue would never be
     * retried.  Checking for an empty queue up front also guarantees
     * len >= 1 for every io_submit() call.
     */
    while (!QSIMPLEQ_EMPTY(&s->io_q.pending) &&
           s->io_q.in_flight < MAX_EVENTS) {
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        /* The first @ret requests were accepted: account for them and cut
         * them off the head of the pending queue. */
        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);

        if (ret != len) {
            /* Partial submission: stop and leave the rest queued. */
            break;
        }
    }
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0).  We do not attempt
         * to repeat submission to avoid IO hang.  The reason is simple: s->e
         * is still set and completion callback will be called shortly and
         * all pending requests will be submitted from there.
         */
    }
}

/*
 * Start (or nest) a plugged section: while the plug count is non-zero,
 * laio_do_submit() queues requests instead of submitting them, unless the
 * queue fills up.  @bs is not used.
 */
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    s->io_q.plugged += 1;
}

/*
 * Leave one level of plugging.  When the outermost plug is released and
 * the queue is neither blocked nor empty, the pending requests are
 * submitted.  @bs is not used.
 */
void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
{
    assert(s->io_q.plugged);

    s->io_q.plugged--;
    if (s->io_q.plugged != 0) {
        return;
    }
    if (s->io_q.blocked || QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        return;
    }
    ioq_submit(s);
}

/*
 * Prepare the iocb of @laiocb for a vectored read or write on @fd at
 * @offset, append the request to the pending queue and, when allowed,
 * kick off submission.
 *
 * Submission is attempted only if the queue is not blocked and either it
 * is not plugged or in-flight plus queued requests have reached MAX_EVENTS.
 *
 * Returns 0 once the request is queued, -EIO for an unsupported @type.
 */
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type)
{
    LinuxAioState *s = laiocb->ctx;
    QEMUIOVector *qiov = laiocb->qiov;
    struct iocb *cb = &laiocb->iocb;
    bool queue_full;

    if (type == QEMU_AIO_WRITE) {
        io_prep_pwritev(cb, fd, qiov->iov, qiov->niov, offset);
    } else if (type == QEMU_AIO_READ) {
        io_prep_preadv(cb, fd, qiov->iov, qiov->niov, offset);
    } else {
        /* Currently Linux kernel does not support other operations */
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        return -EIO;
    }
    io_set_eventfd(cb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;

    if (s->io_q.blocked) {
        return 0;
    }

    queue_full = s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS;
    if (!s->io_q.plugged || queue_full) {
        ioq_submit(s);
    }

    return 0;
}

/*
 * Coroutine interface: queue a read or write of @qiov on @fd at @offset
 * and yield until it has completed.
 *
 * The request lives on the coroutine stack.  If it is still -EINPROGRESS
 * after submission the coroutine yields; qemu_laio_process_completion()
 * re-enters it once the result is stored.  Returns the request result, or
 * the error from laio_do_submit().  @bs is not used.
 */
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type)
{
    struct qemu_laiocb laiocb = {
        .ctx        = s,
        .co         = qemu_coroutine_self(),
        .qiov       = qiov,
        .nbytes     = qiov->size,
        .is_read    = (type == QEMU_AIO_READ),
        .ret        = -EINPROGRESS,
    };
    int submit_rc = laio_do_submit(fd, &laiocb, offset, type);

    if (submit_rc < 0) {
        return submit_rc;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}

/*
 * Callback interface: queue a read or write of @nb_sectors sectors starting
 * at @sector_num on @fd.  @cb is invoked with @opaque when the request
 * completes.
 *
 * Returns the ACB of the queued request, or NULL if the request type is not
 * supported (the ACB is released in that case).
 */
BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laiocb *laiocb;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;
    int ret;

    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    /* qemu_laio_process_completion() picks the coroutine or the callback
     * path based on this field, so it must be initialised explicitly rather
     * than relying on the allocator to hand out zeroed memory. */
    laiocb->co = NULL;
    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    ret = laio_do_submit(fd, laiocb, offset, type);
    if (ret < 0) {
        qemu_aio_unref(laiocb);
        return NULL;
    }

    return &laiocb->common;
}

/*
 * Disconnect @s from @old_context: the event notifier handler is replaced
 * with NULL and the completion bottom half is deleted.
 *
 * s->completion_bh is not reset here; laio_attach_aio_context() assigns a
 * fresh bottom half.
 */
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    /* NOTE(review): a NULL handler presumably unregisters the notifier --
     * confirm against aio_set_event_notifier(). */
    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
}

/*
 * Connect @s to @new_context: remember the context, create the completion
 * bottom half in it and install qemu_laio_completion_cb() as the handler
 * of the event notifier.
 */
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    QEMUBH *bh;

    s->aio_context = new_context;

    bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    s->completion_bh = bh;

    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb);
}

/*
 * Allocate a zeroed LinuxAioState, create its event notifier and its kernel
 * AIO context (sized for MAX_EVENTS) and initialise the submission queue.
 *
 * Returns the new state, or NULL if the event notifier or the AIO context
 * could not be set up; everything acquired up to that point is released.
 */
LinuxAioState *laio_init(void)
{
    LinuxAioState *s = g_malloc0(sizeof(*s));

    if (event_notifier_init(&s->e, false) < 0) {
        g_free(s);
        return NULL;
    }

    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
        event_notifier_cleanup(&s->e);
        g_free(s);
        return NULL;
    }

    ioq_init(&s->io_q);

    return s;
}

/*
 * Release everything owned by @s: the event notifier, the kernel AIO
 * context and the state structure itself.  A failing io_destroy() is
 * reported on stderr and cleanup continues.
 */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        /* %p requires a void * argument, hence the cast. */
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, (void *)&s->ctx);
    }
    g_free(s);
}
Commit	Line	Data
5c6c3a6c CH	1	/*
	2	* Linux native AIO support.
	3	*
	4	* Copyright (C) 2009 IBM, Corp.
	5	* Copyright (C) 2009 Red Hat, Inc.
	6	*
	7	* This work is licensed under the terms of the GNU GPL, version 2 or later.
	8	* See the COPYING file in the top-level directory.
	9	*/
80c71a24	10	#include "qemu/osdep.h"
5c6c3a6c	11	#include "qemu-common.h"
737e150e	12	#include "block/aio.h"
1de7afc9	13	#include "qemu/queue.h"
2174f12b	14	#include "block/block.h"
9f8540ec	15	#include "block/raw-aio.h"
1de7afc9	16	#include "qemu/event_notifier.h"
2174f12b	17	#include "qemu/coroutine.h"
5c6c3a6c	18
5c6c3a6c CH	19	#include <libaio.h>
	20
	21	/*
	22	* Queue size (per-device).
	23	*
	24	* XXX: eventually we need to communicate this to the guest and/or make it
	25	* tunable by the guest. If we get more outstanding requests at a time
	26	* than this we will get EAGAIN from io_submit which is communicated to
	27	* the guest as an I/O error.
	28	*/
	29	#define MAX_EVENTS 128
	30
	31	struct qemu_laiocb {
7c84b1b8	32	BlockAIOCB common;
2174f12b	33	Coroutine *co;
dd7f7ed1	34	LinuxAioState *ctx;
5c6c3a6c CH	35	struct iocb iocb;
	36	ssize_t ret;
	37	size_t nbytes;
b161e2e4 KW	38	QEMUIOVector *qiov;
b161e2e4 KW	39	bool is_read;
28b24087	40	QSIMPLEQ_ENTRY(qemu_laiocb) next;
5c6c3a6c CH	41	};
5c6c3a6c CH	42
1b3abdcc	43	typedef struct {
1b3abdcc	44	int plugged;
5e1b34a3 RP	45	unsigned int in_queue;
5e1b34a3 RP	46	unsigned int in_flight;
43f2376e	47	bool blocked;
28b24087	48	QSIMPLEQ_HEAD(, qemu_laiocb) pending;
1b3abdcc ML	49	} LaioQueue;
1b3abdcc ML	50
dd7f7ed1	51	struct LinuxAioState {
0187f5c9 PB	52	AioContext *aio_context;
0187f5c9 PB	53
5c6c3a6c	54	io_context_t ctx;
c90caf25	55	EventNotifier e;
1b3abdcc ML	56
	57	/* io queue for submit at batch */
	58	LaioQueue io_q;
2cdff7f6 SH	59
	60	/* I/O completion processing */
	61	QEMUBH *completion_bh;
2cdff7f6 SH	62	int event_idx;
2cdff7f6 SH	63	int event_max;
5c6c3a6c CH	64	};
5c6c3a6c CH	65
dd7f7ed1	66	static void ioq_submit(LinuxAioState *s);
28b24087	67
5c6c3a6c CH	68	static inline ssize_t io_event_ret(struct io_event *ev)
	69	{
	70	return (ssize_t)(((uint64_t)ev->res2 << 32) \| ev->res);
	71	}
	72
db0ffc24 KW	73	/*
db0ffc24 KW	74	* Completes an AIO request (calls the callback and frees the ACB).
db0ffc24	75	*/
dd7f7ed1	76	static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
db0ffc24 KW	77	{
	78	int ret;
	79
db0ffc24 KW	80	ret = laiocb->ret;
db0ffc24 KW	81	if (ret != -ECANCELED) {
b161e2e4	82	if (ret == laiocb->nbytes) {
db0ffc24	83	ret = 0;
b161e2e4 KW	84	} else if (ret >= 0) {
	85	/* Short reads mean EOF, pad with zeros. */
	86	if (laiocb->is_read) {
3d9b4925 MT	87	qemu_iovec_memset(laiocb->qiov, ret, 0,
3d9b4925 MT	88	laiocb->qiov->size - ret);
b161e2e4	89	} else {
1c42f149	90	ret = -ENOSPC;
b161e2e4 KW	91	}
b161e2e4 KW	92	}
db0ffc24 KW	93	}
db0ffc24 KW	94
2174f12b KW	95	laiocb->ret = ret;
2174f12b KW	96	if (laiocb->co) {
fe121b9d SH	97	/* If the coroutine is already entered it must be in ioq_submit() and
	98	* will notice laio->ret has been filled in when it eventually runs
	99	* later. Coroutines cannot be entered recursively so avoid doing
	100	* that!
	101	*/
	102	if (!qemu_coroutine_entered(laiocb->co)) {
0ed93d84 RP	103	qemu_coroutine_enter(laiocb->co);
0ed93d84 RP	104	}
2174f12b KW	105	} else {
	106	laiocb->common.cb(laiocb->common.opaque, ret);
	107	qemu_aio_unref(laiocb);
	108	}
db0ffc24 KW	109	}
db0ffc24 KW	110
9e909a58 RP	111	/**
	112	* aio_ring buffer which is shared between userspace and kernel.
	113	*
	114	* This copied from linux/fs/aio.c, common header does not exist
	115	* but AIO exists for ages so we assume ABI is stable.
	116	*/
	117	struct aio_ring {
	118	unsigned id; /* kernel internal index number */
	119	unsigned nr; /* number of io_events */
	120	unsigned head; /* Written to by userland or by kernel. */
	121	unsigned tail;
	122
	123	unsigned magic;
	124	unsigned compat_features;
	125	unsigned incompat_features;
	126	unsigned header_length; /* size of aio_ring */
	127
	128	struct io_event io_events[0];
	129	};
	130
	131	/**
	132	* io_getevents_peek:
	133	* @ctx: AIO context
	134	* @events: pointer on events array, output value
	135
	136	* Returns the number of completed events and sets a pointer
	137	* on events array. This function does not update the internal
	138	* ring buffer, only reads head and tail. When @events has been
	139	* processed io_getevents_commit() must be called.
	140	*/
	141	static inline unsigned int io_getevents_peek(io_context_t ctx,
	142	struct io_event **events)
	143	{
	144	struct aio_ring ring = (struct aio_ring )ctx;
	145	unsigned int head = ring->head, tail = ring->tail;
	146	unsigned int nr;
	147
	148	nr = tail >= head ? tail - head : ring->nr - head;
	149	*events = ring->io_events + head;
	150	/* To avoid speculative loads of s->events[i] before observing tail.
	151	Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
	152	smp_rmb();
	153
	154	return nr;
	155	}
	156
	157	/**
	158	* io_getevents_commit:
	159	* @ctx: AIO context
	160	* @nr: the number of events on which head should be advanced
	161	*
	162	* Advances head of a ring buffer.
	163	*/
	164	static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
	165	{
	166	struct aio_ring ring = (struct aio_ring )ctx;
	167
	168	if (nr) {
	169	ring->head = (ring->head + nr) % ring->nr;
	170	}
	171	}
	172
	173	/**
	174	* io_getevents_advance_and_peek:
175	* @ctx: AIO context
176	* @events: pointer on events array, output value
177	* @nr: the number of events on which head should be advanced
178	*
179	* Advances head of a ring buffer and returns number of elements left.
180	*/
181	static inline unsigned int
182	io_getevents_advance_and_peek(io_context_t ctx,
183	struct io_event **events,
184	unsigned int nr)
185	{
186	io_getevents_commit(ctx, nr);
187	return io_getevents_peek(ctx, events);
188	}
189
3407de57 RP	190	/**
	191	* qemu_laio_process_completions:
	192	* @s: AIO state
	193	*
	194	* Fetches completed I/O requests and invokes their callbacks.
2cdff7f6 SH	195	*
	196	* The function is somewhat tricky because it supports nested event loops, for
	197	* example when a request callback invokes aio_poll(). In order to do this,
3407de57 RP	198	* indices are kept in LinuxAioState. Function schedules BH completion so it
	199	* can be called again in a nested event loop. When there are no events left
	200	* to complete the BH is being canceled.
2cdff7f6	201	*/
3407de57	202	static void qemu_laio_process_completions(LinuxAioState *s)
5c6c3a6c	203	{
9e909a58	204	struct io_event *events;
5c6c3a6c	205
2cdff7f6 SH	206	/* Reschedule so nested event loops see currently pending completions */
2cdff7f6 SH	207	qemu_bh_schedule(s->completion_bh);
5c6c3a6c	208
9e909a58 RP	209	while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
	210	s->event_idx))) {
	211	for (s->event_idx = 0; s->event_idx < s->event_max; ) {
	212	struct iocb *iocb = events[s->event_idx].obj;
	213	struct qemu_laiocb *laiocb =
2cdff7f6 SH	214	container_of(iocb, struct qemu_laiocb, iocb);
2cdff7f6 SH	215
9e909a58	216	laiocb->ret = io_event_ret(&events[s->event_idx]);
2cdff7f6	217
9e909a58 RP	218	/* Change counters one-by-one because we can be nested. */
	219	s->io_q.in_flight--;
	220	s->event_idx++;
	221	qemu_laio_process_completion(laiocb);
	222	}
2cdff7f6	223	}
28b24087	224
9e909a58 RP	225	qemu_bh_cancel(s->completion_bh);
	226
	227	/* If we are nested we have to notify the level above that we are done
	228	* by setting event_max to zero, upper level will then jump out of it's
	229	* own `for` loop. If we are the last all counters droped to zero. */
	230	s->event_max = 0;
	231	s->event_idx = 0;
3407de57	232	}
9e909a58	233
3407de57 RP	234	static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
	235	{
	236	qemu_laio_process_completions(s);
28b24087 PB	237	if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
	238	ioq_submit(s);
	239	}
2cdff7f6 SH	240	}
2cdff7f6 SH	241
3407de57 RP	242	static void qemu_laio_completion_bh(void *opaque)
	243	{
	244	LinuxAioState *s = opaque;
	245
	246	qemu_laio_process_completions_and_submit(s);
	247	}
	248
2cdff7f6 SH	249	static void qemu_laio_completion_cb(EventNotifier *e)
2cdff7f6 SH	250	{
dd7f7ed1	251	LinuxAioState *s = container_of(e, LinuxAioState, e);
2cdff7f6 SH	252
2cdff7f6 SH	253	if (event_notifier_test_and_clear(&s->e)) {
3407de57	254	qemu_laio_process_completions_and_submit(s);
5c6c3a6c CH	255	}
	256	}
	257
7c84b1b8	258	static void laio_cancel(BlockAIOCB *blockacb)
5c6c3a6c CH	259	{
	260	struct qemu_laiocb laiocb = (struct qemu_laiocb )blockacb;
	261	struct io_event event;
	262	int ret;
	263
771b64da	264	if (laiocb->ret != -EINPROGRESS) {
5c6c3a6c	265	return;
771b64da	266	}
5c6c3a6c	267	ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
771b64da FZ	268	laiocb->ret = -ECANCELED;
	269	if (ret != 0) {
	270	/* iocb is not cancelled, cb will be called by the event loop later */
5c6c3a6c CH	271	return;
	272	}
	273
771b64da	274	laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
5c6c3a6c CH	275	}
5c6c3a6c CH	276
d7331bed	277	static const AIOCBInfo laio_aiocb_info = {
5c6c3a6c	278	.aiocb_size = sizeof(struct qemu_laiocb),
771b64da	279	.cancel_async = laio_cancel,
5c6c3a6c CH	280	};
5c6c3a6c CH	281
1b3abdcc ML	282	static void ioq_init(LaioQueue *io_q)
1b3abdcc ML	283	{
28b24087	284	QSIMPLEQ_INIT(&io_q->pending);
1b3abdcc	285	io_q->plugged = 0;
5e1b34a3 RP	286	io_q->in_queue = 0;
5e1b34a3 RP	287	io_q->in_flight = 0;
43f2376e	288	io_q->blocked = false;
1b3abdcc ML	289	}
1b3abdcc ML	290
dd7f7ed1	291	static void ioq_submit(LinuxAioState *s)
1b3abdcc	292	{
82595da8	293	int ret, len;
28b24087	294	struct qemu_laiocb *aiocb;
5e1b34a3	295	struct iocb *iocbs[MAX_EVENTS];
82595da8	296	QSIMPLEQ_HEAD(, qemu_laiocb) completed;
1b3abdcc	297
43f2376e	298	do {
5e1b34a3 RP	299	if (s->io_q.in_flight >= MAX_EVENTS) {
	300	break;
	301	}
43f2376e PB	302	len = 0;
	303	QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
	304	iocbs[len++] = &aiocb->iocb;
5e1b34a3	305	if (s->io_q.in_flight + len >= MAX_EVENTS) {
43f2376e PB	306	break;
43f2376e PB	307	}
28b24087	308	}
1b3abdcc	309
43f2376e PB	310	ret = io_submit(s->ctx, len, iocbs);
43f2376e PB	311	if (ret == -EAGAIN) {
82595da8	312	break;
43f2376e PB	313	}
43f2376e PB	314	if (ret < 0) {
44713c9e KW	315	/* Fail the first request, retry the rest */
	316	aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
	317	QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
	318	s->io_q.in_queue--;
	319	aiocb->ret = ret;
	320	qemu_laio_process_completion(aiocb);
	321	continue;
43f2376e PB	322	}
43f2376e PB	323
5e1b34a3 RP	324	s->io_q.in_flight += ret;
5e1b34a3 RP	325	s->io_q.in_queue -= ret;
82595da8 PB	326	aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
82595da8 PB	327	QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
43f2376e	328	} while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
5e1b34a3	329	s->io_q.blocked = (s->io_q.in_queue > 0);
0ed93d84 RP	330
	331	if (s->io_q.in_flight) {
	332	/* We can try to complete something just right away if there are
	333	* still requests in-flight. */
	334	qemu_laio_process_completions(s);
	335	/*
	336	* Even we have completed everything (in_flight == 0), the queue can
	337	* have still pended requests (in_queue > 0). We do not attempt to
	338	* repeat submission to avoid IO hang. The reason is simple: s->e is
	339	* still set and completion callback will be called shortly and all
	340	* pended requests will be submitted from there.
	341	*/
	342	}
1b3abdcc ML	343	}
1b3abdcc ML	344
dd7f7ed1	345	void laio_io_plug(BlockDriverState bs, LinuxAioState s)
1b3abdcc	346	{
0187f5c9	347	s->io_q.plugged++;
1b3abdcc ML	348	}
1b3abdcc ML	349
dd7f7ed1	350	void laio_io_unplug(BlockDriverState bs, LinuxAioState s)
1b3abdcc	351	{
6b98bd64	352	assert(s->io_q.plugged);
0187f5c9 PB	353	if (--s->io_q.plugged == 0 &&
0187f5c9 PB	354	!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
de354644	355	ioq_submit(s);
1b3abdcc	356	}
1b3abdcc ML	357	}
1b3abdcc ML	358
2174f12b KW	359	static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
2174f12b KW	360	int type)
5c6c3a6c	361	{
2174f12b KW	362	LinuxAioState *s = laiocb->ctx;
	363	struct iocb *iocbs = &laiocb->iocb;
	364	QEMUIOVector *qiov = laiocb->qiov;
5c6c3a6c CH	365
	366	switch (type) {
	367	case QEMU_AIO_WRITE:
	368	io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
	369	break;
	370	case QEMU_AIO_READ:
	371	io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
	372	break;
c30e624d	373	/* Currently Linux kernel does not support other operations */
5c6c3a6c CH	374	default:
	375	fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
	376	__func__, type);
2174f12b	377	return -EIO;
5c6c3a6c	378	}
c90caf25	379	io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
5c6c3a6c	380
28b24087	381	QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
5e1b34a3	382	s->io_q.in_queue++;
43f2376e	383	if (!s->io_q.blocked &&
5e1b34a3 RP	384	(!s->io_q.plugged \|\|
5e1b34a3 RP	385	s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
28b24087	386	ioq_submit(s);
1b3abdcc	387	}
5c6c3a6c	388
2174f12b KW	389	return 0;
	390	}
	391
	392	int coroutine_fn laio_co_submit(BlockDriverState bs, LinuxAioState s, int fd,
9d52aa3c	393	uint64_t offset, QEMUIOVector *qiov, int type)
2174f12b	394	{
2174f12b	395	int ret;
2174f12b KW	396	struct qemu_laiocb laiocb = {
2174f12b KW	397	.co = qemu_coroutine_self(),
9d52aa3c	398	.nbytes = qiov->size,
2174f12b	399	.ctx = s,
0ed93d84	400	.ret = -EINPROGRESS,
2174f12b KW	401	.is_read = (type == QEMU_AIO_READ),
	402	.qiov = qiov,
	403	};
	404
	405	ret = laio_do_submit(fd, &laiocb, offset, type);
	406	if (ret < 0) {
	407	return ret;
	408	}
	409
0ed93d84 RP	410	if (laiocb.ret == -EINPROGRESS) {
	411	qemu_coroutine_yield();
	412	}
2174f12b KW	413	return laiocb.ret;
	414	}
	415
	416	BlockAIOCB laio_submit(BlockDriverState bs, LinuxAioState *s, int fd,
	417	int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
	418	BlockCompletionFunc cb, void opaque, int type)
	419	{
	420	struct qemu_laiocb *laiocb;
	421	off_t offset = sector_num * BDRV_SECTOR_SIZE;
	422	int ret;
	423
	424	laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
	425	laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
	426	laiocb->ctx = s;
	427	laiocb->ret = -EINPROGRESS;
	428	laiocb->is_read = (type == QEMU_AIO_READ);
	429	laiocb->qiov = qiov;
	430
	431	ret = laio_do_submit(fd, laiocb, offset, type);
	432	if (ret < 0) {
	433	qemu_aio_unref(laiocb);
	434	return NULL;
	435	}
	436
	437	return &laiocb->common;
5c6c3a6c CH	438	}
5c6c3a6c CH	439
dd7f7ed1	440	void laio_detach_aio_context(LinuxAioState s, AioContext old_context)
c2f3426c	441	{
dca21ef2	442	aio_set_event_notifier(old_context, &s->e, false, NULL);
2cdff7f6	443	qemu_bh_delete(s->completion_bh);
c2f3426c SH	444	}
c2f3426c SH	445
dd7f7ed1	446	void laio_attach_aio_context(LinuxAioState s, AioContext new_context)
c2f3426c	447	{
0187f5c9	448	s->aio_context = new_context;
2cdff7f6	449	s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
dca21ef2 FZ	450	aio_set_event_notifier(new_context, &s->e, false,
dca21ef2 FZ	451	qemu_laio_completion_cb);
c2f3426c SH	452	}
c2f3426c SH	453
dd7f7ed1	454	LinuxAioState *laio_init(void)
5c6c3a6c	455	{
dd7f7ed1	456	LinuxAioState *s;
5c6c3a6c	457
7267c094	458	s = g_malloc0(sizeof(*s));
c90caf25	459	if (event_notifier_init(&s->e, false) < 0) {
5c6c3a6c	460	goto out_free_state;
c90caf25	461	}
5c6c3a6c	462
c90caf25	463	if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
5c6c3a6c	464	goto out_close_efd;
c90caf25	465	}
5c6c3a6c	466
1b3abdcc ML	467	ioq_init(&s->io_q);
1b3abdcc ML	468
5c6c3a6c CH	469	return s;
	470
	471	out_close_efd:
c90caf25	472	event_notifier_cleanup(&s->e);
5c6c3a6c	473	out_free_state:
7267c094	474	g_free(s);
5c6c3a6c CH	475	return NULL;
5c6c3a6c CH	476	}
abd269b7	477
dd7f7ed1	478	void laio_cleanup(LinuxAioState *s)
abd269b7	479	{
abd269b7	480	event_notifier_cleanup(&s->e);
a1abf40d GA	481
	482	if (io_destroy(s->ctx) != 0) {
	483	fprintf(stderr, "%s: destroy AIO context %p failed\n",
	484	__func__, &s->ctx);
	485	}
abd269b7 SH	486	g_free(s);
abd269b7 SH	487	}