[mirror_qemu.git] / aio-posix.c

/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"
#ifdef CONFIG_EPOLL
#include <sys/epoll.h>
#endif

/*
 * One registered file descriptor in an AioContext's handler list
 * (ctx->aio_handlers).
 */
struct AioHandler
{
    /* fd, requested event mask (events) and last reported mask (revents) */
    GPollFD pfd;
    /* invoked by aio_dispatch() when G_IO_IN, G_IO_HUP or G_IO_ERR is ready */
    IOHandler *io_read;
    /* invoked by aio_dispatch() when G_IO_OUT or G_IO_ERR is ready */
    IOHandler *io_write;
    /* non-zero once removed while the list was being walked; reaped later */
    int deleted;
    /* argument handed to io_read/io_write */
    void *opaque;
    /* forwarded to aio_node_check() when aio_poll() builds its fd set */
    bool is_external;
    /* linkage in ctx->aio_handlers */
    QLIST_ENTRY(AioHandler) node;
};

#ifdef CONFIG_EPOLL

/*
 * Number of polled fds (npfd) at or above which aio_epoll_check_poll()
 * tries to switch from poll to epoll.
 */
#define EPOLL_ENABLE_THRESHOLD 64

/*
 * Permanently turn off epoll for @ctx and release the epoll fd.
 *
 * This is reached both when epoll was in use (an epoll_ctl() call failed in
 * aio_epoll_update()) and when switching to epoll failed part-way
 * (aio_epoll_try_enable() returned false, so epoll_enabled was never set).
 * The fd has to be closed in both situations, therefore the close is keyed
 * on epoll_available, which aio_context_setup() sets only when the epoll fd
 * was created successfully and which is cleared nowhere else.
 */
static void aio_epoll_disable(AioContext *ctx)
{
    ctx->epoll_enabled = false;
    if (!ctx->epoll_available) {
        /* Never created or already closed: nothing left to release. */
        return;
    }
    ctx->epoll_available = false;
    close(ctx->epollfd);
}

/* Translate a G_IO_* event bitmask into the corresponding EPOLL* bitmask. */
static inline int epoll_events_from_pfd(int pfd_events)
{
    int epoll_mask = 0;

    if (pfd_events & G_IO_IN) {
        epoll_mask |= EPOLLIN;
    }
    if (pfd_events & G_IO_OUT) {
        epoll_mask |= EPOLLOUT;
    }
    if (pfd_events & G_IO_HUP) {
        epoll_mask |= EPOLLHUP;
    }
    if (pfd_events & G_IO_ERR) {
        epoll_mask |= EPOLLERR;
    }
    return epoll_mask;
}

/*
 * Register every live handler that has a non-empty event mask with the
 * context's epoll fd.
 *
 * Returns true and sets ctx->epoll_enabled when all registrations succeed;
 * returns false at the first epoll_ctl() failure, leaving epoll_enabled
 * untouched.
 */
static bool aio_epoll_try_enable(AioContext *ctx)
{
    AioHandler *handler;

    QLIST_FOREACH(handler, &ctx->aio_handlers, node) {
        struct epoll_event ev;

        /* Skip handlers that are going away or listen for nothing. */
        if (handler->deleted || !handler->pfd.events) {
            continue;
        }

        ev.events = epoll_events_from_pfd(handler->pfd.events);
        ev.data.ptr = handler;
        if (epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, handler->pfd.fd, &ev)) {
            return false;
        }
    }

    ctx->epoll_enabled = true;
    return true;
}

/*
 * Mirror a change of @node's event mask into the epoll set of @ctx.
 *
 * Does nothing unless epoll is currently enabled.  An empty event mask
 * removes the fd; otherwise the fd is added (@is_new) or modified.  Any
 * epoll_ctl() failure disables epoll for the context.
 */
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    struct epoll_event event;
    int op;

    if (!ctx->epoll_enabled) {
        return;
    }

    if (node->pfd.events) {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        op = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
    } else {
        /* event is passed along but not filled in for a removal */
        op = EPOLL_CTL_DEL;
    }

    if (epoll_ctl(ctx->epollfd, op, node->pfd.fd, &event)) {
        aio_epoll_disable(ctx);
    }
}

/*
 * Wait for events through the epoll fd of @ctx and store the results in the
 * revents field of each ready handler.
 *
 * @pfds/@npfd: must describe exactly one entry, the epoll fd itself.
 * @timeout:    passed unchanged to qemu_poll_ns() when positive (nanoseconds,
 *              judging by that function's name); zero means "do not block",
 *              negative means "block until an event arrives".
 *
 * Returns the number of ready handlers, 0 if nothing is ready, or the
 * negative/zero result of the underlying wait call.
 */
static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    AioHandler *node;
    struct epoll_event events[128];
    int wait_ms;
    int i, ret;

    assert(npfd == 1);
    assert(pfds[0].fd == ctx->epollfd);

    if (timeout > 0) {
        /* Do the timed wait with qemu_poll_ns() on the epoll fd itself. */
        ret = qemu_poll_ns(pfds, npfd, timeout);
        if (ret <= 0) {
            return ret;
        }
        /*
         * The epoll fd is readable, so only collect what is pending.
         * epoll_wait() takes an int in milliseconds; handing it the 64-bit
         * value meant for qemu_poll_ns() would truncate it and use the
         * wrong unit.
         */
        wait_ms = 0;
    } else {
        /* 0 = non-blocking, any negative value = wait indefinitely. */
        wait_ms = timeout < 0 ? -1 : 0;
    }

    ret = epoll_wait(ctx->epollfd, events,
                     sizeof(events) / sizeof(events[0]),
                     wait_ms);
    if (ret <= 0) {
        return ret;
    }

    for (i = 0; i < ret; i++) {
        int ev = events[i].events;

        /* data.ptr was set to the handler when the fd was registered. */
        node = events[i].data.ptr;
        node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
            (ev & EPOLLOUT ? G_IO_OUT : 0) |
            (ev & EPOLLHUP ? G_IO_HUP : 0) |
            (ev & EPOLLERR ? G_IO_ERR : 0);
    }
    return ret;
}

/*
 * Report whether aio_poll() should currently wait through epoll.
 * While external clients are disabled the answer is always false, so the
 * caller falls back to ppoll.
 */
static bool aio_epoll_enabled(AioContext *ctx)
{
    if (aio_external_disabled(ctx)) {
        return false;
    }
    return ctx->epoll_enabled;
}

/*
 * Decide whether this aio_poll() iteration should use epoll.
 *
 * Returns true if epoll is already active, or if @npfd has reached
 * EPOLL_ENABLE_THRESHOLD and enabling epoll just succeeded.  A failed
 * enable attempt disables epoll for the context.  @pfds and @timeout are
 * not used.
 */
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    if (!ctx->epoll_available) {
        return false;
    }
    if (aio_epoll_enabled(ctx)) {
        return true;
    }
    if (npfd < EPOLL_ENABLE_THRESHOLD) {
        return false;
    }
    if (!aio_epoll_try_enable(ctx)) {
        aio_epoll_disable(ctx);
        return false;
    }
    return true;
}

#else

/* Build without CONFIG_EPOLL: there is no epoll set to keep in sync. */
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    (void)ctx;
    (void)node;
    (void)is_new;
}

/*
 * Build without CONFIG_EPOLL: this must never run, because the matching
 * aio_epoll_check_poll() stub always returns false.  The explicit return
 * keeps the function well-defined when assert() is compiled out.
 */
static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    assert(false);
    return -1;
}

/* Build without CONFIG_EPOLL: epoll is never in use. */
static bool aio_epoll_enabled(AioContext *ctx)
{
    (void)ctx;
    return false;
}

/* Build without CONFIG_EPOLL: always tell aio_poll() to use plain poll. */
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    (void)ctx;
    (void)pfds;
    (void)npfd;
    (void)timeout;
    return false;
}

#endif

/*
 * Look up the handler registered for @fd in @ctx, ignoring entries that
 * are marked deleted.  Returns NULL when there is none.
 */
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *cur;

    QLIST_FOREACH(cur, &ctx->aio_handlers, node) {
        if (cur->pfd.fd == fd && !cur->deleted) {
            return cur;
        }
    }

    return NULL;
}

/*
 * Register, update or remove the handlers for @fd in @ctx.
 *
 * @is_external: stored on the handler; consulted through aio_node_check()
 *               when aio_poll() builds its fd set.
 * @io_read:     callback for G_IO_IN/G_IO_HUP/G_IO_ERR, or NULL.
 * @io_write:    callback for G_IO_OUT/G_IO_ERR, or NULL.
 * @opaque:      argument passed to both callbacks.
 *
 * Passing NULL for both callbacks removes the handler, if one exists.
 * The context is notified with aio_notify() in every case.
 */
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
{
    AioHandler *node;
    bool is_new = false;
    bool deleted = false;

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
        if (node) {
            g_source_remove_poll(&ctx->source, &node->pfd);

            /*
             * Clear the event mask so that aio_epoll_update() below removes
             * the fd from the epoll set (EPOLL_CTL_DEL).  With a stale mask
             * it would issue EPOLL_CTL_MOD and the epoll set would keep a
             * pointer to a node that is about to be freed.
             */
            node->pfd.events = 0;

            /* If the lock is held, just mark the node as deleted */
            if (ctx->walking_handlers) {
                node->deleted = 1;
                node->pfd.revents = 0;
            } else {
                /* Otherwise, delete it for real.  We can't just mark it as
                 * deleted because deleted nodes are only cleaned up after
                 * releasing the walking_handlers lock.
                 */
                QLIST_REMOVE(node, node);
                deleted = true;
            }
        }
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
            is_new = true;
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;
        node->is_external = is_external;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

    /*
     * node is NULL when asked to remove an fd that was never registered;
     * aio_epoll_update() dereferences it, so skip the call in that case.
     */
    if (node) {
        aio_epoll_update(ctx, node, is_new);
    }
    aio_notify(ctx);
    /* Free only after the epoll update, which still reads the node. */
    if (deleted) {
        g_free(node);
    }
}

/*
 * Install @io_read as the read handler for @notifier's file descriptor.
 * The notifier itself is passed to the callback as its opaque argument;
 * no write handler is installed.
 */
void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read)
{
    int fd = event_notifier_get_fd(notifier);
    IOHandler *read_cb = (IOHandler *)io_read;

    aio_set_fd_handler(ctx, fd, is_external, read_cb, NULL, notifier);
}

/* This implementation has no preparation step; it always returns false. */
bool aio_prepare(AioContext *ctx)
{
    (void)ctx;
    return false;
}

/*
 * Return true if any handler of @ctx has a reported event that it asked
 * for and a matching callback to run.
 */
bool aio_pending(AioContext *ctx)
{
    const int read_mask = G_IO_IN | G_IO_HUP | G_IO_ERR;
    const int write_mask = G_IO_OUT | G_IO_ERR;
    AioHandler *handler;

    QLIST_FOREACH(handler, &ctx->aio_handlers, node) {
        /* Only events that were both requested and reported count. */
        int ready = handler->pfd.revents & handler->pfd.events;

        if ((ready & read_mask) && handler->io_read) {
            return true;
        }
        if ((ready & write_mask) && handler->io_write) {
            return true;
        }
    }

    return false;
}

/*
 * Run everything that is ready on @ctx: queued bottom halves
 * (aio_bh_poll), fd handlers whose reported events intersect the events
 * they asked for, and timers.
 *
 * Returns true if any of these made progress.  A read callback whose
 * opaque is &ctx->notifier is deliberately not counted as progress.
 */
bool aio_dispatch(AioContext *ctx)
{
    AioHandler *node;
    bool progress = false;

    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not call select in this case, because it is possible that the caller
     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        progress = true;
    }

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
        int revents;

        /*
         * While this counter is non-zero aio_set_fd_handler() only marks
         * nodes as deleted instead of unlinking and freeing them.
         */
        ctx->walking_handlers++;

        /* Consume the reported events before running any callback. */
        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        /* deleted is re-checked: the read callback may have removed us. */
        if (!node->deleted &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }

        /* Advance first, so tmp can be freed below without losing our place. */
        tmp = node;
        node = QLIST_NEXT(node, node);

        ctx->walking_handlers--;

        /* Reap a node that was only marked deleted, once nobody is walking. */
        if (!ctx->walking_handlers && tmp->deleted) {
            QLIST_REMOVE(tmp, node);
            g_free(tmp);
        }
    }

    /* Run our timers */
    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call.  In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll().  And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 * has none of the disadvantages of these three options.
 */
/* fd array handed to qemu_poll_ns(); grown on demand by add_pollfd() */
static __thread GPollFD *pollfds;
/* nodes[i] is the handler that pollfds[i] was built from */
static __thread AioHandler **nodes;
/* npfd: entries in use; nalloc: allocated capacity of both arrays */
static __thread unsigned npfd, nalloc;
/* registered with qemu_thread_atexit_add() on the first allocation */
static __thread Notifier pollfds_cleanup_notifier;

/*
 * Thread-exit notifier (installed by add_pollfd()): release the
 * thread-local poll arrays.  Neither argument is used.
 */
static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    /*
     * Reset the pointers together with nalloc: add_pollfd() reallocates
     * from these pointers whenever nalloc is 0, so they must not be left
     * pointing at freed memory.
     */
    pollfds = NULL;
    nodes = NULL;
    nalloc = 0;
}

/*
 * Append @node to the thread-local poll arrays, growing them when full.
 * The first allocation also registers pollfds_cleanup() to run at thread
 * exit.  The new pollfds entry copies fd and events from @node and starts
 * with revents cleared.
 */
static void add_pollfd(AioHandler *node)
{
    GPollFD *slot;

    if (npfd == nalloc) {
        if (nalloc != 0) {
            /* Double the capacity. */
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        } else {
            /* First use on this thread. */
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }

    nodes[npfd] = node;

    slot = &pollfds[npfd];
    *slot = (GPollFD) { 0 };
    slot->fd = node->pfd.fd;
    slot->events = node->pfd.events;

    npfd++;
}

/*
 * Poll the file descriptors of @ctx once and dispatch whatever is ready.
 *
 * @blocking: when true the wait uses the timeout from
 *            aio_compute_timeout(); when false the timeout is 0.
 *
 * The context is acquired for the duration of the call, except around the
 * wait itself when the timeout is non-zero.
 *
 * Returns true if aio_dispatch() reported progress.
 */
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    /*
     * Stand-in handler describing the epoll fd.  add_pollfd() stores its
     * address in the thread-local nodes[] array and the revents copy loop
     * further down writes through that pointer, so the object has to live
     * for the whole function, not just inside the block that fills it in.
     */
    AioHandler epoll_handler;
    unsigned i;
    int ret;
    bool progress;
    int64_t timeout;

    aio_context_acquire(ctx);
    progress = false;

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    /* Keep handlers from being freed while we hold pointers to them. */
    ctx->walking_handlers++;

    assert(npfd == 0);

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (!node->deleted && node->pfd.events
            && !aio_epoll_enabled(ctx)
            && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* wait until next event */
    if (timeout) {
        aio_context_release(ctx);
    }
    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
        /* Replace whatever was collected with the single epoll fd. */
        epoll_handler.pfd.fd = ctx->epollfd;
        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
        npfd = 0;
        add_pollfd(&epoll_handler);
        ret = aio_epoll(ctx, pollfds, npfd, timeout);
    } else  {
        ret = qemu_poll_ns(pollfds, npfd, timeout);
    }
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
    if (timeout) {
        aio_context_acquire(ctx);
    }

    aio_notify_accept(ctx);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;
    ctx->walking_handlers--;

    /* Run dispatch even if there were no readable fds to run timers */
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    aio_context_release(ctx);

    return progress;
}

/*
 * Per-context initialisation.  With CONFIG_EPOLL this creates the epoll fd
 * and records in ctx->epoll_available whether that succeeded; a failure is
 * not reported through @errp.  Without CONFIG_EPOLL it does nothing.
 */
void aio_context_setup(AioContext *ctx, Error **errp)
{
#ifdef CONFIG_EPOLL
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    ctx->epoll_available = (ctx->epollfd != -1);
#endif
}
Commit	Line	Data
a76bab49 AL	1	/*
	2	* QEMU aio implementation
	3	*
	4	* Copyright IBM, Corp. 2008
	5	*
	6	* Authors:
	7	* Anthony Liguori <aliguori@us.ibm.com>
	8	*
	9	* This work is licensed under the terms of the GNU GPL, version 2. See
	10	* the COPYING file in the top-level directory.
	11	*
6b620ca3 PB	12	* Contributions after 2012-01-13 are licensed under the terms of the
6b620ca3 PB	13	* GNU GPL, version 2 or (at your option) any later version.
a76bab49 AL	14	*/
	15
	16	#include "qemu-common.h"
737e150e	17	#include "block/block.h"
1de7afc9 PB	18	#include "qemu/queue.h"
1de7afc9 PB	19	#include "qemu/sockets.h"
fbe3fc5c FZ	20	#ifdef CONFIG_EPOLL
	21	#include <sys/epoll.h>
	22	#endif
a76bab49	23
a76bab49 AL	24	struct AioHandler
a76bab49 AL	25	{
cd9ba1eb	26	GPollFD pfd;
a76bab49 AL	27	IOHandler *io_read;
a76bab49 AL	28	IOHandler *io_write;
a76bab49 AL	29	int deleted;
a76bab49 AL	30	void *opaque;
dca21ef2	31	bool is_external;
72cf2d4f	32	QLIST_ENTRY(AioHandler) node;
a76bab49 AL	33	};
a76bab49 AL	34
fbe3fc5c FZ	35	#ifdef CONFIG_EPOLL
	36
	37	/* The fd number threashold to switch to epoll */
	38	#define EPOLL_ENABLE_THRESHOLD 64
	39
	40	static void aio_epoll_disable(AioContext *ctx)
	41	{
	42	ctx->epoll_available = false;
	43	if (!ctx->epoll_enabled) {
	44	return;
	45	}
	46	ctx->epoll_enabled = false;
	47	close(ctx->epollfd);
	48	}
	49
	50	static inline int epoll_events_from_pfd(int pfd_events)
	51	{
	52	return (pfd_events & G_IO_IN ? EPOLLIN : 0) \|
	53	(pfd_events & G_IO_OUT ? EPOLLOUT : 0) \|
	54	(pfd_events & G_IO_HUP ? EPOLLHUP : 0) \|
	55	(pfd_events & G_IO_ERR ? EPOLLERR : 0);
	56	}
	57
	58	static bool aio_epoll_try_enable(AioContext *ctx)
	59	{
	60	AioHandler *node;
	61	struct epoll_event event;
	62
	63	QLIST_FOREACH(node, &ctx->aio_handlers, node) {
	64	int r;
	65	if (node->deleted \|\| !node->pfd.events) {
	66	continue;
	67	}
	68	event.events = epoll_events_from_pfd(node->pfd.events);
	69	event.data.ptr = node;
	70	r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
	71	if (r) {
	72	return false;
	73	}
	74	}
	75	ctx->epoll_enabled = true;
	76	return true;
	77	}
	78
	79	static void aio_epoll_update(AioContext ctx, AioHandler node, bool is_new)
	80	{
	81	struct epoll_event event;
	82	int r;
	83
	84	if (!ctx->epoll_enabled) {
	85	return;
	86	}
	87	if (!node->pfd.events) {
	88	r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
	89	if (r) {
	90	aio_epoll_disable(ctx);
	91	}
	92	} else {
	93	event.data.ptr = node;
	94	event.events = epoll_events_from_pfd(node->pfd.events);
	95	if (is_new) {
	96	r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
	97	if (r) {
	98	aio_epoll_disable(ctx);
99	}
100	} else {
101	r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
102	if (r) {
103	aio_epoll_disable(ctx);
104	}
105	}
106	}
107	}
108
109	static int aio_epoll(AioContext ctx, GPollFD pfds,
110	unsigned npfd, int64_t timeout)
111	{
112	AioHandler *node;
113	int i, ret = 0;
114	struct epoll_event events[128];
115
116	assert(npfd == 1);
117	assert(pfds[0].fd == ctx->epollfd);
118	if (timeout > 0) {
119	ret = qemu_poll_ns(pfds, npfd, timeout);
120	}
121	if (timeout <= 0 \|\| ret > 0) {
122	ret = epoll_wait(ctx->epollfd, events,
123	sizeof(events) / sizeof(events[0]),
124	timeout);
125	if (ret <= 0) {
126	goto out;
127	}
128	for (i = 0; i < ret; i++) {
129	int ev = events[i].events;
130	node = events[i].data.ptr;
131	node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) \|
132	(ev & EPOLLOUT ? G_IO_OUT : 0) \|
133	(ev & EPOLLHUP ? G_IO_HUP : 0) \|
134	(ev & EPOLLERR ? G_IO_ERR : 0);
135	}
136	}
137	out:
138	return ret;
139	}
140
141	static bool aio_epoll_enabled(AioContext *ctx)
142	{
143	/* Fall back to ppoll when external clients are disabled. */
144	return !aio_external_disabled(ctx) && ctx->epoll_enabled;
145	}
146
147	static bool aio_epoll_check_poll(AioContext ctx, GPollFD pfds,
148	unsigned npfd, int64_t timeout)
149	{
150	if (!ctx->epoll_available) {
151	return false;
152	}
153	if (aio_epoll_enabled(ctx)) {
154	return true;
155	}
156	if (npfd >= EPOLL_ENABLE_THRESHOLD) {
157	if (aio_epoll_try_enable(ctx)) {
158	return true;
159	} else {
160	aio_epoll_disable(ctx);
161	}
162	}
163	return false;
164	}
165
166	#else
167
168	static void aio_epoll_update(AioContext ctx, AioHandler node, bool is_new)
169	{
170	}
171
172	static int aio_epoll(AioContext ctx, GPollFD pfds,
173	unsigned npfd, int64_t timeout)
174	{
175	assert(false);
176	}
177
178	static bool aio_epoll_enabled(AioContext *ctx)
179	{
180	return false;
181	}
182
183	static bool aio_epoll_check_poll(AioContext ctx, GPollFD pfds,
184	unsigned npfd, int64_t timeout)
185	{
186	return false;
187	}
188
189	#endif
190
a915f4bc	191	static AioHandler find_aio_handler(AioContext ctx, int fd)
a76bab49 AL	192	{
	193	AioHandler *node;
	194
a915f4bc	195	QLIST_FOREACH(node, &ctx->aio_handlers, node) {
cd9ba1eb	196	if (node->pfd.fd == fd)
79d5ca56 AG	197	if (!node->deleted)
79d5ca56 AG	198	return node;
a76bab49 AL	199	}
	200
	201	return NULL;
	202	}
	203
a915f4bc PB	204	void aio_set_fd_handler(AioContext *ctx,
a915f4bc PB	205	int fd,
dca21ef2	206	bool is_external,
a915f4bc PB	207	IOHandler *io_read,
a915f4bc PB	208	IOHandler *io_write,
a915f4bc	209	void *opaque)
a76bab49 AL	210	{
a76bab49 AL	211	AioHandler *node;
fbe3fc5c	212	bool is_new = false;
0ed39f3d	213	bool deleted = false;
a76bab49	214
a915f4bc	215	node = find_aio_handler(ctx, fd);
a76bab49 AL	216
	217	/* Are we deleting the fd handler? */
	218	if (!io_read && !io_write) {
	219	if (node) {
e3713e00 PB	220	g_source_remove_poll(&ctx->source, &node->pfd);
e3713e00 PB	221
a76bab49	222	/* If the lock is held, just mark the node as deleted */
cd9ba1eb	223	if (ctx->walking_handlers) {
a76bab49	224	node->deleted = 1;
cd9ba1eb PB	225	node->pfd.revents = 0;
cd9ba1eb PB	226	} else {
a76bab49 AL	227	/* Otherwise, delete it for real. We can't just mark it as
	228	* deleted because deleted nodes are only cleaned up after
	229	* releasing the walking_handlers lock.
	230	*/
72cf2d4f	231	QLIST_REMOVE(node, node);
0ed39f3d	232	deleted = true;
a76bab49 AL	233	}
	234	}
	235	} else {
	236	if (node == NULL) {
	237	/* Alloc and insert if it's not already there */
3ba235a0	238	node = g_new0(AioHandler, 1);
cd9ba1eb	239	node->pfd.fd = fd;
a915f4bc	240	QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
e3713e00 PB	241
e3713e00 PB	242	g_source_add_poll(&ctx->source, &node->pfd);
fbe3fc5c	243	is_new = true;
a76bab49 AL	244	}
	245	/* Update handler with latest information */
	246	node->io_read = io_read;
	247	node->io_write = io_write;
a76bab49	248	node->opaque = opaque;
dca21ef2	249	node->is_external = is_external;
cd9ba1eb	250
b5a01a70 SH	251	node->pfd.events = (io_read ? G_IO_IN \| G_IO_HUP \| G_IO_ERR : 0);
b5a01a70 SH	252	node->pfd.events \|= (io_write ? G_IO_OUT \| G_IO_ERR : 0);
a76bab49	253	}
7ed2b24c	254
fbe3fc5c	255	aio_epoll_update(ctx, node, is_new);
7ed2b24c	256	aio_notify(ctx);
0ed39f3d FZ	257	if (deleted) {
	258	g_free(node);
	259	}
9958c351 PB	260	}
9958c351 PB	261
a915f4bc PB	262	void aio_set_event_notifier(AioContext *ctx,
a915f4bc PB	263	EventNotifier *notifier,
dca21ef2	264	bool is_external,
f2e5dca4	265	EventNotifierHandler *io_read)
a76bab49	266	{
a915f4bc	267	aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
dca21ef2	268	is_external, (IOHandler *)io_read, NULL, notifier);
a76bab49 AL	269	}
a76bab49 AL	270
a3462c65 PB	271	bool aio_prepare(AioContext *ctx)
	272	{
	273	return false;
	274	}
	275
cd9ba1eb PB	276	bool aio_pending(AioContext *ctx)
	277	{
	278	AioHandler *node;
	279
	280	QLIST_FOREACH(node, &ctx->aio_handlers, node) {
	281	int revents;
	282
cd9ba1eb PB	283	revents = node->pfd.revents & node->pfd.events;
	284	if (revents & (G_IO_IN \| G_IO_HUP \| G_IO_ERR) && node->io_read) {
	285	return true;
	286	}
	287	if (revents & (G_IO_OUT \| G_IO_ERR) && node->io_write) {
	288	return true;
	289	}
	290	}
	291
	292	return false;
	293	}
	294
e4c7e2d1	295	bool aio_dispatch(AioContext *ctx)
a76bab49	296	{
9eb0bfca	297	AioHandler *node;
d0c8d2c0	298	bool progress = false;
7c0628b2	299
e4c7e2d1 PB	300	/*
	301	* If there are callbacks left that have been queued, we need to call them.
	302	* Do not call select in this case, because it is possible that the caller
	303	* does not need a complete flush (as is the case for aio_poll loops).
	304	*/
	305	if (aio_bh_poll(ctx)) {
	306	progress = true;
	307	}
	308
cd9ba1eb	309	/*
87f68d31	310	* We have to walk very carefully in case aio_set_fd_handler is
cd9ba1eb PB	311	* called while we're walking.
	312	*/
	313	node = QLIST_FIRST(&ctx->aio_handlers);
	314	while (node) {
	315	AioHandler *tmp;
	316	int revents;
	317
	318	ctx->walking_handlers++;
	319
	320	revents = node->pfd.revents & node->pfd.events;
	321	node->pfd.revents = 0;
	322
d0c8d2c0 SH	323	if (!node->deleted &&
	324	(revents & (G_IO_IN \| G_IO_HUP \| G_IO_ERR)) &&
	325	node->io_read) {
cd9ba1eb	326	node->io_read(node->opaque);
164a101f SH	327
	328	/* aio_notify() does not count as progress */
	329	if (node->opaque != &ctx->notifier) {
	330	progress = true;
	331	}
cd9ba1eb	332	}
d0c8d2c0 SH	333	if (!node->deleted &&
	334	(revents & (G_IO_OUT \| G_IO_ERR)) &&
	335	node->io_write) {
cd9ba1eb PB	336	node->io_write(node->opaque);
	337	progress = true;
	338	}
	339
	340	tmp = node;
	341	node = QLIST_NEXT(node, node);
	342
	343	ctx->walking_handlers--;
	344
	345	if (!ctx->walking_handlers && tmp->deleted) {
	346	QLIST_REMOVE(tmp, node);
	347	g_free(tmp);
	348	}
	349	}
438e1f47 AB	350
	351	/* Run our timers */
	352	progress \|= timerlistgroup_run_timers(&ctx->tlg);
	353
d0c8d2c0 SH	354	return progress;
	355	}
	356
e98ab097 PB	357	/* These thread-local variables are used only in a small part of aio_poll
	358	* around the call to the poll() system call. In particular they are not
	359	* used while aio_poll is performing callbacks, which makes it much easier
	360	* to think about reentrancy!
	361	*
	362	* Stack-allocated arrays would be perfect but they have size limitations;
	363	* heap allocation is expensive enough that we want to reuse arrays across
	364	* calls to aio_poll(). And because poll() has to be called without holding
	365	* any lock, the arrays cannot be stored in AioContext. Thread-local data
	366	* has none of the disadvantages of these three options.
	367	*/
	368	static __thread GPollFD *pollfds;
	369	static __thread AioHandler **nodes;
	370	static __thread unsigned npfd, nalloc;
	371	static __thread Notifier pollfds_cleanup_notifier;
	372
	373	static void pollfds_cleanup(Notifier n, void unused)
	374	{
	375	g_assert(npfd == 0);
	376	g_free(pollfds);
	377	g_free(nodes);
	378	nalloc = 0;
	379	}
	380
	381	static void add_pollfd(AioHandler *node)
	382	{
	383	if (npfd == nalloc) {
	384	if (nalloc == 0) {
	385	pollfds_cleanup_notifier.notify = pollfds_cleanup;
	386	qemu_thread_atexit_add(&pollfds_cleanup_notifier);
	387	nalloc = 8;
	388	} else {
	389	g_assert(nalloc <= INT_MAX);
	390	nalloc *= 2;
	391	}
	392	pollfds = g_renew(GPollFD, pollfds, nalloc);
	393	nodes = g_renew(AioHandler *, nodes, nalloc);
	394	}
	395	nodes[npfd] = node;
	396	pollfds[npfd] = (GPollFD) {
	397	.fd = node->pfd.fd,
	398	.events = node->pfd.events,
	399	};
	400	npfd++;
	401	}
	402
d0c8d2c0 SH	403	bool aio_poll(AioContext *ctx, bool blocking)
d0c8d2c0 SH	404	{
d0c8d2c0	405	AioHandler *node;
e98ab097	406	int i, ret;
164a101f	407	bool progress;
e98ab097	408	int64_t timeout;
d0c8d2c0	409
49110174	410	aio_context_acquire(ctx);
d0c8d2c0 SH	411	progress = false;
d0c8d2c0 SH	412
0ceb849b PB	413	/* aio_notify can avoid the expensive event_notifier_set if
0ceb849b PB	414	* everything (file descriptors, bottom halves, timers) will
e4c7e2d1 PB	415	* be re-evaluated before the next blocking poll(). This is
e4c7e2d1 PB	416	* already true when aio_poll is called with blocking == false;
eabc9779 PB	417	* if blocking == true, it is only true after poll() returns,
eabc9779 PB	418	* so disable the optimization now.
0ceb849b	419	*/
eabc9779 PB	420	if (blocking) {
	421	atomic_add(&ctx->notify_me, 2);
	422	}
0ceb849b	423
a915f4bc	424	ctx->walking_handlers++;
a76bab49	425
e98ab097	426	assert(npfd == 0);
a76bab49	427
6b5f8762	428	/* fill pollfds */
a915f4bc	429	QLIST_FOREACH(node, &ctx->aio_handlers, node) {
c1e1e5fa	430	if (!node->deleted && node->pfd.events
fbe3fc5c	431	&& !aio_epoll_enabled(ctx)
c1e1e5fa	432	&& aio_node_check(ctx, node->is_external)) {
e98ab097	433	add_pollfd(node);
9eb0bfca PB	434	}
9eb0bfca PB	435	}
a76bab49	436
e98ab097	437	timeout = blocking ? aio_compute_timeout(ctx) : 0;
a76bab49	438
9eb0bfca	439	/* wait until next event */
49110174 PB	440	if (timeout) {
	441	aio_context_release(ctx);
	442	}
fbe3fc5c FZ	443	if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
	444	AioHandler epoll_handler;
	445
	446	epoll_handler.pfd.fd = ctx->epollfd;
	447	epoll_handler.pfd.events = G_IO_IN \| G_IO_OUT \| G_IO_HUP \| G_IO_ERR;
	448	npfd = 0;
	449	add_pollfd(&epoll_handler);
	450	ret = aio_epoll(ctx, pollfds, npfd, timeout);
	451	} else {
	452	ret = qemu_poll_ns(pollfds, npfd, timeout);
	453	}
eabc9779 PB	454	if (blocking) {
	455	atomic_sub(&ctx->notify_me, 2);
	456	}
49110174 PB	457	if (timeout) {
	458	aio_context_acquire(ctx);
	459	}
9eb0bfca	460
05e514b1	461	aio_notify_accept(ctx);
21a03d17	462
9eb0bfca PB	463	/* if we have any readable fds, dispatch event */
9eb0bfca PB	464	if (ret > 0) {
e98ab097 PB	465	for (i = 0; i < npfd; i++) {
e98ab097 PB	466	nodes[i]->pfd.revents = pollfds[i].revents;
a76bab49	467	}
438e1f47 AB	468	}
438e1f47 AB	469
e98ab097 PB	470	npfd = 0;
	471	ctx->walking_handlers--;
	472
438e1f47 AB	473	/* Run dispatch even if there were no readable fds to run timers */
	474	if (aio_dispatch(ctx)) {
	475	progress = true;
9eb0bfca	476	}
bcdc1857	477
49110174 PB	478	aio_context_release(ctx);
49110174 PB	479
164a101f	480	return progress;
a76bab49	481	}
37fcee5d FZ	482
	483	void aio_context_setup(AioContext ctx, Error *errp)
	484	{
fbe3fc5c FZ	485	#ifdef CONFIG_EPOLL
	486	assert(!ctx->epollfd);
	487	ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
	488	if (ctx->epollfd == -1) {
	489	ctx->epoll_available = false;
	490	} else {
	491	ctx->epoll_available = true;
	492	}
	493	#endif
37fcee5d	494	}