/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"
#ifdef CONFIG_EPOLL
#include <sys/epoll.h>
#endif

struct AioHandler
{
    GPollFD pfd;
    IOHandler *io_read;
    IOHandler *io_write;
    int deleted;
    void *opaque;
    bool is_external;
    QLIST_ENTRY(AioHandler) node;
};

#ifdef CONFIG_EPOLL

/* The fd number threshold to switch to epoll */
#define EPOLL_ENABLE_THRESHOLD 64

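/* Give up on epoll for this context: clear epoll_available so no
 * further enable attempts are made, and close the epollfd if epoll had
 * already been switched on.
 */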
static void aio_epoll_disable(AioContext *ctx)
{
    ctx->epoll_available = false;
    if (!ctx->epoll_enabled) {
        return;
    }
    ctx->epoll_enabled = false;
    close(ctx->epollfd);
}

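/* Translate GLib poll event bits into the corresponding epoll bits. */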
static inline int epoll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
}

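/* Register every active handler with the epoll instance.  Returns
 * false if any fd is rejected by epoll_ctl() (e.g. an fd type that
 * epoll cannot watch), in which case the caller disables epoll.
 */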
static bool aio_epoll_try_enable(AioContext *ctx)
{
    AioHandler *node;
    struct epoll_event event;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int r;
        if (node->deleted || !node->pfd.events) {
            continue;
        }
        event.events = epoll_events_from_pfd(node->pfd.events);
        event.data.ptr = node;
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
        if (r) {
            return false;
        }
    }
    ctx->epoll_enabled = true;
    return true;
}

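/* Keep the epoll set in sync with a handler's pfd.events after
 * aio_set_fd_handler() has updated it.  On any epoll_ctl() failure,
 * epoll is disabled for this context and ppoll is used instead.
 */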
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    struct epoll_event event;
    int r;

    if (!ctx->epoll_enabled) {
        return;
    }
    if (!node->pfd.events) {
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
    } else {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        r = epoll_ctl(ctx->epollfd, is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
                      node->pfd.fd, &event);
    }
    if (r) {
        aio_epoll_disable(ctx);
    }
}

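/* Wait for events using the epoll instance.
 *
 * The caller has collapsed the poll set to the single epollfd, hence
 * npfd == 1.  A timeout of 0 or -1 can be passed straight to
 * epoll_wait(); a positive timeout is in nanoseconds, so it is spent in
 * qemu_poll_ns() on the epollfd first, and epoll_wait() then only runs
 * once events are already pending.
 */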
static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    AioHandler *node;
    int i, ret = 0;
    struct epoll_event events[128];

    assert(npfd == 1);
    assert(pfds[0].fd == ctx->epollfd);
    if (timeout > 0) {
        ret = qemu_poll_ns(pfds, npfd, timeout);
    }
    if (timeout <= 0 || ret > 0) {
        ret = epoll_wait(ctx->epollfd, events,
                         sizeof(events) / sizeof(events[0]),
                         timeout);
        if (ret <= 0) {
            goto out;
        }
        for (i = 0; i < ret; i++) {
            int ev = events[i].events;
            node = events[i].data.ptr;
            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
                                (ev & EPOLLOUT ? G_IO_OUT : 0) |
                                (ev & EPOLLHUP ? G_IO_HUP : 0) |
                                (ev & EPOLLERR ? G_IO_ERR : 0);
        }
    }
out:
    return ret;
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    /* Fall back to ppoll when external clients are disabled. */
    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
}

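/* Decide whether the current aio_poll() iteration can use epoll.
 *
 * epoll is enabled lazily: only once the poll set reaches
 * EPOLL_ENABLE_THRESHOLD fds is aio_epoll_try_enable() attempted, since
 * epoll only pays off with many file descriptors.
 */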
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    if (!ctx->epoll_available) {
        return false;
    }
    if (aio_epoll_enabled(ctx)) {
        return true;
    }
    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
        if (aio_epoll_try_enable(ctx)) {
            return true;
        } else {
            aio_epoll_disable(ctx);
        }
    }
    return false;
}

#else

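/* Without CONFIG_EPOLL these stubs keep the call sites simple: epoll is
 * never enabled and aio_poll() always takes the qemu_poll_ns() path.
 */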
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}

static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    assert(false);
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    return false;
}

static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    return false;
}

#endif

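/* Return the registered, non-deleted handler for fd, if any. */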
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd && !node->deleted) {
            return node;
        }
    }

    return NULL;
}

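/* Register or update the handlers for a file descriptor, or remove
 * them when both io_read and io_write are NULL.  While the handler
 * list is being walked (ctx->walking_handlers != 0), removed nodes are
 * only marked as deleted; aio_dispatch() frees them later.
 */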
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
{
    AioHandler *node;
    bool is_new = false;
    bool deleted = false;

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
        if (node == NULL) {
            /* Nothing to delete; do not pass a NULL node on below. */
            return;
        }

        g_source_remove_poll(&ctx->source, &node->pfd);

        /* If the lock is held, just mark the node as deleted */
        if (ctx->walking_handlers) {
            node->deleted = 1;
            node->pfd.revents = 0;
        } else {
            /* Otherwise, delete it for real.  We can't just mark it as
             * deleted because deleted nodes are only cleaned up after
             * releasing the walking_handlers lock.
             */
            QLIST_REMOVE(node, node);
            deleted = true;
        }
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
            is_new = true;
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;
        node->is_external = is_external;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

    aio_epoll_update(ctx, node, is_new);
    aio_notify(ctx);
    /* Only free the node once aio_epoll_update() is done with it. */
    if (deleted) {
        g_free(node);
    }
}

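/* An EventNotifier is just a file descriptor under POSIX, so this is a
 * thin wrapper around aio_set_fd_handler().
 */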
void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
                       is_external, (IOHandler *)io_read, NULL, notifier);
}

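/* POSIX has no pre-poll setup to do; this hook exists for interface
 * parity with the Windows implementation.
 */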
bool aio_prepare(AioContext *ctx)
{
    return false;
}

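/* Return true if some handler saw events (revents) that its callbacks
 * still need to service.
 */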
bool aio_pending(AioContext *ctx)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
            return true;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
            return true;
        }
    }

    return false;
}

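/* Run queued bottom halves, invoke read/write callbacks for handlers
 * whose fds reported events, and run expired timers.  Returns true if
 * any progress was made.
 */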
bool aio_dispatch(AioContext *ctx)
{
    AioHandler *node;
    bool progress = false;

    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not call select in this case, because it is possible that the caller
     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        progress = true;
    }

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
        int revents;

        ctx->walking_handlers++;

        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        if (!node->deleted &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }

        tmp = node;
        node = QLIST_NEXT(node, node);

        ctx->walking_handlers--;

        if (!ctx->walking_handlers && tmp->deleted) {
            QLIST_REMOVE(tmp, node);
            g_free(tmp);
        }
    }

    /* Run our timers */
    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call.  In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll().  And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

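/* Append node's fd to the thread-local poll arrays, growing them
 * geometrically when full.  The first use in a thread registers a
 * thread-exit notifier so the arrays are freed again.
 */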
static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

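/* Poll every registered fd once and dispatch whatever is ready,
 * including bottom halves and timers.  With blocking == true, sleep
 * until at least one event source fires.  The AioContext lock is
 * dropped around the blocking poll itself.  Returns true if any
 * handler or timer ran.
 */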
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    int i, ret;
    bool progress;
    int64_t timeout;

    aio_context_acquire(ctx);
    progress = false;

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    ctx->walking_handlers++;

    assert(npfd == 0);

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (!node->deleted && node->pfd.events
            && !aio_epoll_enabled(ctx)
            && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* wait until next event */
    if (timeout) {
        aio_context_release(ctx);
    }
    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
        AioHandler epoll_handler;

        epoll_handler.pfd.fd = ctx->epollfd;
        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
        npfd = 0;
        add_pollfd(&epoll_handler);
        ret = aio_epoll(ctx, pollfds, npfd, timeout);
    } else {
        ret = qemu_poll_ns(pollfds, npfd, timeout);
    }
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
    if (timeout) {
        aio_context_acquire(ctx);
    }

    aio_notify_accept(ctx);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;
    ctx->walking_handlers--;

    /* Run dispatch even if there were no readable fds to run timers */
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    aio_context_release(ctx);

    return progress;
}
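
/* Platform-specific AioContext initialisation, called once at context
 * creation.  When built with CONFIG_EPOLL this creates the epoll
 * instance and records in epoll_available whether that succeeded.
 */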
void aio_context_setup(AioContext *ctx, Error **errp)
{
#ifdef CONFIG_EPOLL
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        ctx->epoll_available = false;
    } else {
        ctx->epoll_available = true;
    }
#endif
}