/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"
#ifdef CONFIG_EPOLL_CREATE1
#include <sys/epoll.h>
#endif

struct AioHandler
{
    GPollFD pfd;
    IOHandler *io_read;
    IOHandler *io_write;
    int deleted;
    void *opaque;
    bool is_external;
    QLIST_ENTRY(AioHandler) node;
};

#ifdef CONFIG_EPOLL_CREATE1

/* The fd number threshold to switch to epoll */
#define EPOLL_ENABLE_THRESHOLD 64

static void aio_epoll_disable(AioContext *ctx)
{
    ctx->epoll_available = false;
    if (!ctx->epoll_enabled) {
        return;
    }
    ctx->epoll_enabled = false;
    close(ctx->epollfd);
}

static inline int epoll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
}

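/* Try to register every active handler with the epoll instance.  Returns
 * false on the first epoll_ctl() failure, leaving the caller to fall back
 * to the ppoll-based path.
 */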
static bool aio_epoll_try_enable(AioContext *ctx)
{
    AioHandler *node;
    struct epoll_event event;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int r;
        if (node->deleted || !node->pfd.events) {
            continue;
        }
        event.events = epoll_events_from_pfd(node->pfd.events);
        event.data.ptr = node;
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
        if (r) {
            return false;
        }
    }
    ctx->epoll_enabled = true;
    return true;
}

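/* Keep the epoll set in sync with one handler's event mask: delete the fd
 * when no events are requested, add it when the handler is new, otherwise
 * modify the existing registration.  Any epoll_ctl() failure disables
 * epoll for this context.
 */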
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    struct epoll_event event;
    int r;

    if (!ctx->epoll_enabled) {
        return;
    }
    if (!node->pfd.events) {
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
        if (r) {
            aio_epoll_disable(ctx);
        }
    } else {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        if (is_new) {
            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
            if (r) {
                aio_epoll_disable(ctx);
            }
        } else {
            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
            if (r) {
                aio_epoll_disable(ctx);
            }
        }
    }
}

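/* Wait for events on the epoll fd.  qemu_poll_ns() is used first because
 * it offers nanosecond timeout resolution; epoll_wait() is only entered
 * once the epoll fd is readable or the timeout is non-positive, and it
 * translates the epoll events back into GLib poll flags on the handlers.
 */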
static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    AioHandler *node;
    int i, ret = 0;
    struct epoll_event events[128];

    assert(npfd == 1);
    assert(pfds[0].fd == ctx->epollfd);
    if (timeout > 0) {
        ret = qemu_poll_ns(pfds, npfd, timeout);
    }
    if (timeout <= 0 || ret > 0) {
        ret = epoll_wait(ctx->epollfd, events,
                         sizeof(events) / sizeof(events[0]),
                         timeout);
        if (ret <= 0) {
            goto out;
        }
        for (i = 0; i < ret; i++) {
            int ev = events[i].events;
            node = events[i].data.ptr;
            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
                (ev & EPOLLOUT ? G_IO_OUT : 0) |
                (ev & EPOLLHUP ? G_IO_HUP : 0) |
                (ev & EPOLLERR ? G_IO_ERR : 0);
        }
    }
out:
    return ret;
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    /* Fall back to ppoll when external clients are disabled. */
    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
}

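/* Decide whether this aio_poll() iteration can use epoll.  Once the number
 * of polled fds crosses EPOLL_ENABLE_THRESHOLD, we opportunistically try
 * to enable epoll; if that fails, epoll is disabled for good.
 */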
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    if (!ctx->epoll_available) {
        return false;
    }
    if (aio_epoll_enabled(ctx)) {
        return true;
    }
    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
        if (aio_epoll_try_enable(ctx)) {
            return true;
        } else {
            aio_epoll_disable(ctx);
        }
    }
    return false;
}

#else

static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}

static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    assert(false);
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    return false;
}

static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    return false;
}

#endif

static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd && !node->deleted) {
            return node;
        }
    }

    return NULL;
}

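/* Register, update or remove the read/write handlers for an fd.  Passing
 * NULL for both io_read and io_write unregisters the fd.
 *
 * A minimal usage sketch (sock_fd, my_read_cb and my_state are
 * hypothetical caller-side names, not part of this file):
 *
 *     aio_set_fd_handler(ctx, sock_fd, true, my_read_cb, NULL, my_state);
 *     ...
 *     aio_set_fd_handler(ctx, sock_fd, true, NULL, NULL, NULL);
 */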
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
{
    AioHandler *node;
    bool is_new = false;
    bool deleted = false;

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
        if (node) {
            g_source_remove_poll(&ctx->source, &node->pfd);

            /* If the lock is held, just mark the node as deleted */
            if (ctx->walking_handlers) {
                node->deleted = 1;
                node->pfd.revents = 0;
            } else {
                /* Otherwise, delete it for real.  We can't just mark it as
                 * deleted because deleted nodes are only cleaned up after
                 * releasing the walking_handlers lock.
                 */
                QLIST_REMOVE(node, node);
                deleted = true;
            }
        }
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
            is_new = true;
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;
        node->is_external = is_external;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

    aio_epoll_update(ctx, node, is_new);
    aio_notify(ctx);
    if (deleted) {
        g_free(node);
    }
}

void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
                       is_external, (IOHandler *)io_read, NULL, notifier);
}

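/* Nothing to prepare before polling on POSIX. */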
bool aio_prepare(AioContext *ctx)
{
    return false;
}

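/* Return true if at least one handler has fired revents that its callbacks
 * would consume, i.e. a subsequent aio_dispatch() would make progress.
 */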
bool aio_pending(AioContext *ctx)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
            return true;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
            return true;
        }
    }

    return false;
}

bool aio_dispatch(AioContext *ctx)
{
    AioHandler *node;
    bool progress = false;

    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not call select in this case, because it is possible that the caller
     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        progress = true;
    }

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
        int revents;

        ctx->walking_handlers++;

        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        if (!node->deleted &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }

        tmp = node;
        node = QLIST_NEXT(node, node);

        ctx->walking_handlers--;

        if (!ctx->walking_handlers && tmp->deleted) {
            QLIST_REMOVE(tmp, node);
            g_free(tmp);
        }
    }

    /* Run our timers */
    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call.  In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll().  And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

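/* Append one handler to the thread-local pollfds/nodes arrays, growing
 * both geometrically (and registering the cleanup notifier on first use)
 * when capacity runs out.
 */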
static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    int i, ret;
    bool progress;
    int64_t timeout;

    aio_context_acquire(ctx);
    progress = false;

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    ctx->walking_handlers++;

    assert(npfd == 0);

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (!node->deleted && node->pfd.events
            && !aio_epoll_enabled(ctx)
            && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* wait until next event */
    if (timeout) {
        aio_context_release(ctx);
    }
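    /* On the epoll path a stack-allocated dummy AioHandler stands in for
     * the epoll fd itself, so add_pollfd() can reuse the same thread-local
     * arrays as the ppoll path.
     */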
    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
        AioHandler epoll_handler;

        epoll_handler.pfd.fd = ctx->epollfd;
        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
        npfd = 0;
        add_pollfd(&epoll_handler);
        ret = aio_epoll(ctx, pollfds, npfd, timeout);
    } else {
        ret = qemu_poll_ns(pollfds, npfd, timeout);
    }
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
    if (timeout) {
        aio_context_acquire(ctx);
    }

    aio_notify_accept(ctx);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;
    ctx->walking_handlers--;

    /* Run dispatch even if there were no readable fds to run timers */
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    aio_context_release(ctx);

    return progress;
}

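/* One-time per-context initialization: create the epoll instance eagerly
 * and record whether it is available, so aio_poll() can silently fall
 * back to ppoll when epoll_create1() fails.
 */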
void aio_context_setup(AioContext *ctx, Error **errp)
{
#ifdef CONFIG_EPOLL_CREATE1
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        ctx->epoll_available = false;
    } else {
        ctx->epoll_available = true;
    }
#endif
}