util/aio-posix.c (as of "aio-posix: make AioHandler deletion O(1)")
/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "block/block.h"
#include "qemu/rcu.h"
#include "qemu/rcu_queue.h"
#include "qemu/sockets.h"
#include "qemu/cutils.h"
#include "trace.h"
#ifdef CONFIG_EPOLL_CREATE1
#include <sys/epoll.h>
#endif

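/* One registered file descriptor and its callbacks; linked on
 * ctx->aio_handlers and, once unregistered, on ctx->deleted_aio_handlers
 * via node_deleted.
 */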
struct AioHandler
{
    GPollFD pfd;
    IOHandler *io_read;
    IOHandler *io_write;
    AioPollFn *io_poll;
    IOHandler *io_poll_begin;
    IOHandler *io_poll_end;
    void *opaque;
    bool is_external;
    QLIST_ENTRY(AioHandler) node;
    QLIST_ENTRY(AioHandler) node_deleted;
};

#ifdef CONFIG_EPOLL_CREATE1

/* The fd number threshold to switch to epoll */
#define EPOLL_ENABLE_THRESHOLD 64

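/* Stop using epoll and fall back to ppoll(); closes the epoll fd if one was
 * created.
 */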
static void aio_epoll_disable(AioContext *ctx)
{
    ctx->epoll_enabled = false;
    if (!ctx->epoll_available) {
        return;
    }
    ctx->epoll_available = false;
    close(ctx->epollfd);
}

static inline int epoll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
}

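/* Register every live fd handler with the epoll instance.  Returns false if
 * any epoll_ctl() call fails, leaving epoll disabled so the caller can clean
 * up with aio_epoll_disable().
 */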
static bool aio_epoll_try_enable(AioContext *ctx)
{
    AioHandler *node;
    struct epoll_event event;

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        int r;
        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
            continue;
        }
        event.events = epoll_events_from_pfd(node->pfd.events);
        event.data.ptr = node;
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
        if (r) {
            return false;
        }
    }
    ctx->epoll_enabled = true;
    return true;
}

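/* Mirror an aio_set_fd_handler() change into the epoll set: add, modify or
 * remove the fd.  On epoll_ctl() failure, give up on epoll entirely and fall
 * back to ppoll().
 */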
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    struct epoll_event event;
    int r;
    int ctl;

    if (!ctx->epoll_enabled) {
        return;
    }
    if (!node->pfd.events) {
        ctl = EPOLL_CTL_DEL;
    } else {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
    }

    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
    if (r) {
        aio_epoll_disable(ctx);
    }
}

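/* Wait for events via epoll.  epoll_wait() only offers millisecond timeouts,
 * so for positive timeouts the nanosecond-precision wait is done by
 * qemu_poll_ns() on the epoll fd itself; epoll_wait() then merely harvests
 * the ready events.
 */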
static int aio_epoll(AioContext *ctx, int64_t timeout)
{
    GPollFD pfd = {
        .fd = ctx->epollfd,
        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
    };
    AioHandler *node;
    int i, ret = 0;
    struct epoll_event events[128];

    if (timeout > 0) {
        ret = qemu_poll_ns(&pfd, 1, timeout);
        if (ret > 0) {
            timeout = 0;
        }
    }
    if (timeout <= 0 || ret > 0) {
        ret = epoll_wait(ctx->epollfd, events,
                         ARRAY_SIZE(events),
                         timeout);
        if (ret <= 0) {
            goto out;
        }
        for (i = 0; i < ret; i++) {
            int ev = events[i].events;
            node = events[i].data.ptr;
            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
                (ev & EPOLLOUT ? G_IO_OUT : 0) |
                (ev & EPOLLHUP ? G_IO_HUP : 0) |
                (ev & EPOLLERR ? G_IO_ERR : 0);
        }
    }
out:
    return ret;
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    /* Fall back to ppoll when external clients are disabled. */
    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
}

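/* Decide whether this aio_poll() iteration should use epoll.  Once the fd
 * count reaches EPOLL_ENABLE_THRESHOLD, epoll mode is enabled and stays on
 * unless a later epoll_ctl() failure disables it again.
 */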
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    if (!ctx->epoll_available) {
        return false;
    }
    if (aio_epoll_enabled(ctx)) {
        return true;
    }
    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
        if (aio_epoll_try_enable(ctx)) {
            return true;
        } else {
            aio_epoll_disable(ctx);
        }
    }
    return false;
}

#else

static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}

static int aio_epoll(AioContext *ctx, int64_t timeout)
{
    assert(false);
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    return false;
}

static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    return false;
}

#endif

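/* Return the handler registered for @fd, ignoring nodes already marked for
 * deletion.
 */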
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd) {
            if (!QLIST_IS_INSERTED(node, node_deleted)) {
                return node;
            }
        }
    }

    return NULL;
}

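/* Detach @node from the context.  Returns true if the caller may free it
 * immediately, false if it was only queued on deleted_aio_handlers because
 * someone is still walking the handler list.
 */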
static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
{
    /* If the GSource is in the process of being destroyed then
     * g_source_remove_poll() causes an assertion failure.  Skip
     * removal in that case, because glib cleans up its state during
     * destruction anyway.
     */
    if (!g_source_is_destroyed(&ctx->source)) {
        g_source_remove_poll(&ctx->source, &node->pfd);
    }

    /* If a read is in progress, just mark the node as deleted */
    if (qemu_lockcnt_count(&ctx->list_lock)) {
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        node->pfd.revents = 0;
        return false;
    }
    /* Otherwise, delete it for real.  We can't just mark it as
     * deleted because deleted nodes are only cleaned up while
     * no one is walking the handlers list.
     */
    QLIST_REMOVE(node, node);
    return true;
}

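/* Register, update or (when io_read, io_write and io_poll are all NULL)
 * unregister the handler for @fd.  Takes ctx->list_lock internally.
 */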
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        AioPollFn *io_poll,
                        void *opaque)
{
    AioHandler *node;
    AioHandler *new_node = NULL;
    bool is_new = false;
    bool deleted = false;
    int poll_disable_change;

    qemu_lockcnt_lock(&ctx->list_lock);

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write && !io_poll) {
        if (node == NULL) {
            qemu_lockcnt_unlock(&ctx->list_lock);
            return;
        }
        /* Clean events in order to unregister fd from the ctx epoll. */
        node->pfd.events = 0;

        poll_disable_change = -!node->io_poll;
    } else {
        poll_disable_change = !io_poll - (node && !node->io_poll);
        if (node == NULL) {
            is_new = true;
        }
        /* Alloc and insert if it's not already there */
        new_node = g_new0(AioHandler, 1);

        /* Update handler with latest information */
        new_node->io_read = io_read;
        new_node->io_write = io_write;
        new_node->io_poll = io_poll;
        new_node->opaque = opaque;
        new_node->is_external = is_external;

        if (is_new) {
            new_node->pfd.fd = fd;
        } else {
            new_node->pfd = node->pfd;
        }
        g_source_add_poll(&ctx->source, &new_node->pfd);

        new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);

        QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
    }
    if (node) {
        deleted = aio_remove_fd_handler(ctx, node);
    }

    /* No need to order poll_disable_cnt writes against other updates;
     * the counter is only used to avoid wasting time and latency on
     * iterated polling when the system call will be ultimately necessary.
     * Changing handlers is a rare event, and a little wasted polling until
     * the aio_notify below is not an issue.
     */
    atomic_set(&ctx->poll_disable_cnt,
               atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);

    if (new_node) {
        aio_epoll_update(ctx, new_node, is_new);
    } else if (node) {
        /* Unregister deleted fd_handler */
        aio_epoll_update(ctx, node, false);
    }
    qemu_lockcnt_unlock(&ctx->list_lock);
    aio_notify(ctx);

    if (deleted) {
        g_free(node);
    }
}

void aio_set_fd_poll(AioContext *ctx, int fd,
                     IOHandler *io_poll_begin,
                     IOHandler *io_poll_end)
{
    AioHandler *node = find_aio_handler(ctx, fd);

    if (!node) {
        return;
    }

    node->io_poll_begin = io_poll_begin;
    node->io_poll_end = io_poll_end;
}

void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read,
                            AioPollFn *io_poll)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
                       (IOHandler *)io_read, NULL, io_poll, notifier);
}

void aio_set_event_notifier_poll(AioContext *ctx,
                                 EventNotifier *notifier,
                                 EventNotifierHandler *io_poll_begin,
                                 EventNotifierHandler *io_poll_end)
{
    aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
                    (IOHandler *)io_poll_begin,
                    (IOHandler *)io_poll_end);
}

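/* Notify handlers that busy polling is starting or stopping by invoking their
 * io_poll_begin()/io_poll_end() hooks.
 */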
static void poll_set_started(AioContext *ctx, bool started)
{
    AioHandler *node;

    if (started == ctx->poll_started) {
        return;
    }

    ctx->poll_started = started;

    qemu_lockcnt_inc(&ctx->list_lock);
    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        IOHandler *fn;

        if (QLIST_IS_INSERTED(node, node_deleted)) {
            continue;
        }

        if (started) {
            fn = node->io_poll_begin;
        } else {
            fn = node->io_poll_end;
        }

        if (fn) {
            fn(node->opaque);
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);
}


bool aio_prepare(AioContext *ctx)
{
    /* Poll mode cannot be used with glib's event loop, disable it. */
    poll_set_started(ctx, false);

    return false;
}

bool aio_pending(AioContext *ctx)
{
    AioHandler *node;
    bool result = false;

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    qemu_lockcnt_inc(&ctx->list_lock);

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
            aio_node_check(ctx, node->is_external)) {
            result = true;
            break;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
            aio_node_check(ctx, node->is_external)) {
            result = true;
            break;
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    return result;
}

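/* Free handlers queued for deletion.  Only the outermost user of
 * ctx->list_lock performs the freeing; nested callers leave it to their
 * parent.
 */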
static void aio_free_deleted_handlers(AioContext *ctx)
{
    AioHandler *node;

    if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
        return;
    }
    if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
        return; /* we are nested, let the parent do the freeing */
    }

    while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
        QLIST_REMOVE(node, node);
        QLIST_REMOVE(node, node_deleted);
        g_free(node);
    }

    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
}

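/* Invoke the io_read/io_write callbacks of every handler whose fd is ready.
 * Returns true if progress was made; the context's own aio_notify() notifier
 * does not count.
 */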
static bool aio_dispatch_handlers(AioContext *ctx)
{
    AioHandler *node, *tmp;
    bool progress = false;

    QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!QLIST_IS_INSERTED(node, node_deleted) &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            aio_node_check(ctx, node->is_external) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        if (!QLIST_IS_INSERTED(node, node_deleted) &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            aio_node_check(ctx, node->is_external) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }
    }

    return progress;
}

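/* Dispatch pending bottom halves, fd handlers and timers.  This is the
 * dispatch path used by the glib main loop (GSource) integration.
 */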
void aio_dispatch(AioContext *ctx)
{
    qemu_lockcnt_inc(&ctx->list_lock);
    aio_bh_poll(ctx);
    aio_dispatch_handlers(ctx);
    aio_free_deleted_handlers(ctx);
    qemu_lockcnt_dec(&ctx->list_lock);

    timerlistgroup_run_timers(&ctx->tlg);
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call.  In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll().  And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

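/* Append @node to the thread-local pollfds[]/nodes[] arrays, doubling their
 * capacity when they are full.
 */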
static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

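/* Run every handler's io_poll() callback once.  A successful poll clears
 * *timeout so the caller skips the blocking system call.  Returns true if a
 * handler other than the context notifier made progress.
 */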
static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
{
    bool progress = false;
    AioHandler *node;

    /*
     * Optimization: ->io_poll() handlers often contain RCU read critical
     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
     * -> rcu_read_lock() -> ... sequences with expensive memory
     * synchronization primitives.  Make the entire polling loop an RCU
     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
     * are cheap.
     */
    RCU_READ_LOCK_GUARD();

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
            aio_node_check(ctx, node->is_external) &&
            node->io_poll(node->opaque)) {
            /*
             * Polling was successful, exit try_poll_mode immediately
             * to adjust the next polling time.
             */
            *timeout = 0;
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }

        /* Caller handles freeing deleted nodes.  Don't do it here. */
    }

    return progress;
}

/* run_poll_handlers:
 * @ctx: the AioContext
 * @max_ns: maximum time to poll for, in nanoseconds
 *
 * Polls for a given time.
 *
 * Note that ctx->notify_me must be non-zero so this function can detect
 * aio_notify().
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
{
    bool progress;
    int64_t start_time, elapsed_time;

    assert(ctx->notify_me);
    assert(qemu_lockcnt_count(&ctx->list_lock) > 0);

    trace_run_poll_handlers_begin(ctx, max_ns, *timeout);

    start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    do {
        progress = run_poll_handlers_once(ctx, timeout);
        elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
        max_ns = qemu_soonest_timeout(*timeout, max_ns);
        assert(!(max_ns && progress));
    } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));

    /* If time has passed with no successful polling, adjust *timeout to
     * keep the same ending time.
     */
    if (*timeout != -1) {
        *timeout -= MIN(*timeout, elapsed_time);
    }

    trace_run_poll_handlers_end(ctx, progress, *timeout);
    return progress;
}

/* try_poll_mode:
 * @ctx: the AioContext
 * @timeout: timeout for blocking wait, computed by the caller and updated if
 *           polling succeeds.
 *
 * ctx->notify_me must be non-zero so this function can detect aio_notify().
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
{
    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);

    if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
        poll_set_started(ctx, true);

        if (run_poll_handlers(ctx, max_ns, timeout)) {
            return true;
        }
    }

    poll_set_started(ctx, false);

    /* Even if we don't run busy polling, try polling once in case it can make
     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
     */
    return run_poll_handlers_once(ctx, timeout);
}

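/* Run one iteration of the event loop: optionally busy-poll, then block in
 * ppoll(2)/epoll_wait(2) if needed, and finally dispatch bottom halves, ready
 * fd handlers and timers.  Returns true if progress was made.
 */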
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    int i;
    int ret = 0;
    bool progress;
    int64_t timeout;
    int64_t start = 0;

    assert(in_aio_context_home_thread(ctx));

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    qemu_lockcnt_inc(&ctx->list_lock);

    if (ctx->poll_max_ns) {
        start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;
    progress = try_poll_mode(ctx, &timeout);
    assert(!(timeout && progress));

    /* If polling is allowed, non-blocking aio_poll does not need the
     * system call---a single round of run_poll_handlers_once suffices.
     */
    if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
        assert(npfd == 0);

        /* fill pollfds */

        if (!aio_epoll_enabled(ctx)) {
            QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
                if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
                    && aio_node_check(ctx, node->is_external)) {
                    add_pollfd(node);
                }
            }
        }

        /* wait until next event */
        if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
            npfd = 0; /* pollfds[] is not being used */
            ret = aio_epoll(ctx, timeout);
        } else {
            ret = qemu_poll_ns(pollfds, npfd, timeout);
        }
    }

    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
        aio_notify_accept(ctx);
    }

    /* Adjust polling time */
    if (ctx->poll_max_ns) {
        int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;

        if (block_ns <= ctx->poll_ns) {
            /* This is the sweet spot, no adjustment needed */
        } else if (block_ns > ctx->poll_max_ns) {
            /* We'd have to poll for too long, poll less */
            int64_t old = ctx->poll_ns;

            if (ctx->poll_shrink) {
                ctx->poll_ns /= ctx->poll_shrink;
            } else {
                ctx->poll_ns = 0;
            }

            trace_poll_shrink(ctx, old, ctx->poll_ns);
        } else if (ctx->poll_ns < ctx->poll_max_ns &&
                   block_ns < ctx->poll_max_ns) {
            /* There is room to grow, poll longer */
            int64_t old = ctx->poll_ns;
            int64_t grow = ctx->poll_grow;

            if (grow == 0) {
                grow = 2;
            }

            if (ctx->poll_ns) {
                ctx->poll_ns *= grow;
            } else {
                ctx->poll_ns = 4000; /* start polling at 4 microseconds */
            }

            if (ctx->poll_ns > ctx->poll_max_ns) {
                ctx->poll_ns = ctx->poll_max_ns;
            }

            trace_poll_grow(ctx, old, ctx->poll_ns);
        }
    }

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;

    progress |= aio_bh_poll(ctx);

    if (ret > 0) {
        progress |= aio_dispatch_handlers(ctx);
    }

    aio_free_deleted_handlers(ctx);

    qemu_lockcnt_dec(&ctx->list_lock);

    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

void aio_context_setup(AioContext *ctx)
{
#ifdef CONFIG_EPOLL_CREATE1
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
        ctx->epoll_available = false;
    } else {
        ctx->epoll_available = true;
    }
#endif
}

void aio_context_destroy(AioContext *ctx)
{
#ifdef CONFIG_EPOLL_CREATE1
    aio_epoll_disable(ctx);
#endif
}

void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                 int64_t grow, int64_t shrink, Error **errp)
{
    /* No thread synchronization here, it doesn't matter if an incorrect value
     * is used once.
     */
    ctx->poll_max_ns = max_ns;
    ctx->poll_ns = 0;
    ctx->poll_grow = grow;
    ctx->poll_shrink = shrink;

    aio_notify(ctx);
}