util/aio-posix.c

   1 /*
   2  * QEMU aio implementation
   3  *
   4  * Copyright IBM, Corp. 2008
   5  *
   6  * Authors:
   7  *  Anthony Liguori   <aliguori@us.ibm.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Contributions after 2012-01-13 are licensed under the terms of the
  13  * GNU GPL, version 2 or (at your option) any later version.
  14  */
  15
  16 #include "qemu/osdep.h"
  17 #include "block/block.h"
  18 #include "qemu/rcu.h"
  19 #include "qemu/rcu_queue.h"
  20 #include "qemu/sockets.h"
  21 #include "qemu/cutils.h"
  22 #include "trace.h"
  23 #ifdef CONFIG_EPOLL_CREATE1
  24 #include <sys/epoll.h>
  25 #endif
  26
  27 struct AioHandler
  28 {
  29     GPollFD pfd;
  30     IOHandler *io_read;
  31     IOHandler *io_write;
  32     AioPollFn *io_poll;
  33     IOHandler *io_poll_begin;
  34     IOHandler *io_poll_end;
  35     void *opaque;
  36     bool is_external;
  37     QLIST_ENTRY(AioHandler) node;
  38     QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
  39     QLIST_ENTRY(AioHandler) node_deleted;
  40 };
  41
  42 /* Add a handler to a ready list */
  43 static void add_ready_handler(AioHandlerList *ready_list,
  44                               AioHandler *node,
  45                               int revents)
  46 {
  47     QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
  48     node->pfd.revents = revents;
  49     QLIST_INSERT_HEAD(ready_list, node, node_ready);
  50 }
  51
  52 #ifdef CONFIG_EPOLL_CREATE1
  53
  54 /* The fd number threshold to switch to epoll */
  55 #define EPOLL_ENABLE_THRESHOLD 64
  56
  57 static void aio_epoll_disable(AioContext *ctx)
  58 {
  59     ctx->epoll_enabled = false;
  60     if (!ctx->epoll_available) {
  61         return;
  62     }
  63     ctx->epoll_available = false;
  64     close(ctx->epollfd);
  65 }
  66
  67 static inline int epoll_events_from_pfd(int pfd_events)
  68 {
  69     return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
  70            (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
  71            (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
  72            (pfd_events & G_IO_ERR ? EPOLLERR : 0);
  73 }
  74
  75 static bool aio_epoll_try_enable(AioContext *ctx)
  76 {
  77     AioHandler *node;
  78     struct epoll_event event;
  79
  80     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
  81         int r;
  82         if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
  83             continue;
  84         }
  85         event.events = epoll_events_from_pfd(node->pfd.events);
  86         event.data.ptr = node;
  87         r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
  88         if (r) {
  89             return false;
  90         }
  91     }
  92     ctx->epoll_enabled = true;
  93     return true;
  94 }
  95
  96 static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
  97 {
  98     struct epoll_event event;
  99     int r;
 100     int ctl;
 101
 102     if (!ctx->epoll_enabled) {
 103         return;
 104     }
 105     if (!node->pfd.events) {
 106         ctl = EPOLL_CTL_DEL;
 107     } else {
 108         event.data.ptr = node;
 109         event.events = epoll_events_from_pfd(node->pfd.events);
 110         ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
 111     }
 112
 113     r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
 114     if (r) {
 115         aio_epoll_disable(ctx);
 116     }
 117 }
 118
 119 static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
 120                      int64_t timeout)
 121 {
 122     GPollFD pfd = {
 123         .fd = ctx->epollfd,
 124         .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
 125     };
 126     AioHandler *node;
 127     int i, ret = 0;
 128     struct epoll_event events[128];
 129
 130     if (timeout > 0) {
 131         ret = qemu_poll_ns(&pfd, 1, timeout);
 132         if (ret > 0) {
 133             timeout = 0;
 134         }
 135     }
 136     if (timeout <= 0 || ret > 0) {
 137         ret = epoll_wait(ctx->epollfd, events,
 138                          ARRAY_SIZE(events),
 139                          timeout);
 140         if (ret <= 0) {
 141             goto out;
 142         }
 143         for (i = 0; i < ret; i++) {
 144             int ev = events[i].events;
 145             int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
 146                           (ev & EPOLLOUT ? G_IO_OUT : 0) |
 147                           (ev & EPOLLHUP ? G_IO_HUP : 0) |
 148                           (ev & EPOLLERR ? G_IO_ERR : 0);
 149
 150             node = events[i].data.ptr;
 151             add_ready_handler(ready_list, node, revents);
 152         }
 153     }
 154 out:
 155     return ret;
 156 }
 157
 158 static bool aio_epoll_enabled(AioContext *ctx)
 159 {
 160     /* Fall back to ppoll when external clients are disabled. */
 161     return !aio_external_disabled(ctx) && ctx->epoll_enabled;
 162 }
 163
 164 static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
 165                                  unsigned npfd, int64_t timeout)
 166 {
 167     if (!ctx->epoll_available) {
 168         return false;
 169     }
 170     if (aio_epoll_enabled(ctx)) {
 171         return true;
 172     }
 173     if (npfd >= EPOLL_ENABLE_THRESHOLD) {
 174         if (aio_epoll_try_enable(ctx)) {
 175             return true;
 176         } else {
 177             aio_epoll_disable(ctx);
 178         }
 179     }
 180     return false;
 181 }
 182
 183 #else
 184
/* Stub used when epoll support is not compiled in: nothing to update. */
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}
 188
 189 static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
 190                      int64_t timeout)
 191 {
 192     assert(false);
 193 }
 194
/* Stub used when epoll support is not compiled in: epoll is never enabled. */
static bool aio_epoll_enabled(AioContext *ctx)
{
    return false;
}
 199
/* Stub used when epoll support is not compiled in: always wait with ppoll. */
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                          unsigned npfd, int64_t timeout)
{
    return false;
}
 205
 206 #endif
 207
 208 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 209 {
 210     AioHandler *node;
 211
 212     QLIST_FOREACH(node, &ctx->aio_handlers, node) {
 213         if (node->pfd.fd == fd) {
 214             if (!QLIST_IS_INSERTED(node, node_deleted)) {
 215                 return node;
 216             }
 217         }
 218     }
 219
 220     return NULL;
 221 }
 222
 223 static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
 224 {
 225     /* If the GSource is in the process of being destroyed then
 226      * g_source_remove_poll() causes an assertion failure.  Skip
 227      * removal in that case, because glib cleans up its state during
 228      * destruction anyway.
 229      */
 230     if (!g_source_is_destroyed(&ctx->source)) {
 231         g_source_remove_poll(&ctx->source, &node->pfd);
 232     }
 233
 234     /* If a read is in progress, just mark the node as deleted */
 235     if (qemu_lockcnt_count(&ctx->list_lock)) {
 236         QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
 237         node->pfd.revents = 0;
 238         return false;
 239     }
 240     /* Otherwise, delete it for real.  We can't just mark it as
 241      * deleted because deleted nodes are only cleaned up while
 242      * no one is walking the handlers list.
 243      */
 244     QLIST_REMOVE(node, node);
 245     return true;
 246 }
 247
/*
 * Register, replace or remove the handlers associated with @fd in @ctx.
 *
 * When @io_read, @io_write and @io_poll are all NULL, the existing handler
 * for @fd (if any) is removed.  Otherwise a new AioHandler is allocated and
 * inserted, and any existing handler for @fd is removed in its favour.
 *
 * @is_external and @opaque are stored in the new handler; @opaque is the
 * argument later handed to the callbacks.
 */
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        AioPollFn *io_poll,
                        void *opaque)
{
    AioHandler *node;
    AioHandler *new_node = NULL;
    bool is_new = false;
    bool deleted = false;
    int poll_disable_change;

    qemu_lockcnt_lock(&ctx->list_lock);

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write && !io_poll) {
        if (node == NULL) {
            qemu_lockcnt_unlock(&ctx->list_lock);
            return;
        }
        /* Clean events in order to unregister fd from the ctx epoll. */
        node->pfd.events = 0;

        /* Removing a handler that had no io_poll lowers the counter by one */
        poll_disable_change = -!node->io_poll;
    } else {
        /*
         * +1 if the new handler has no io_poll, -1 if the handler being
         * replaced had none.
         */
        poll_disable_change = !io_poll - (node && !node->io_poll);
        if (node == NULL) {
            is_new = true;
        }
        /* Alloc and insert if it's not already there */
        new_node = g_new0(AioHandler, 1);

        /* Update handler with latest information */
        new_node->io_read = io_read;
        new_node->io_write = io_write;
        new_node->io_poll = io_poll;
        new_node->opaque = opaque;
        new_node->is_external = is_external;

        if (is_new) {
            new_node->pfd.fd = fd;
        } else {
            /* Start from the old handler's GPollFD (fd, events, revents) */
            new_node->pfd = node->pfd;
        }
        g_source_add_poll(&ctx->source, &new_node->pfd);

        new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);

        QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
    }
    if (node) {
        /* true means the old node was unlinked and is freed below */
        deleted = aio_remove_fd_handler(ctx, node);
    }

    /* No need to order poll_disable_cnt writes against other updates;
     * the counter is only used to avoid wasting time and latency on
     * iterated polling when the system call will be ultimately necessary.
     * Changing handlers is a rare event, and a little wasted polling until
     * the aio_notify below is not an issue.
     */
    atomic_set(&ctx->poll_disable_cnt,
               atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);

    if (new_node) {
        aio_epoll_update(ctx, new_node, is_new);
    } else if (node) {
        /* Unregister deleted fd_handler */
        aio_epoll_update(ctx, node, false);
    }
    qemu_lockcnt_unlock(&ctx->list_lock);
    aio_notify(ctx);

    /* Free the old node only after the list lock has been dropped */
    if (deleted) {
        g_free(node);
    }
}
 329
 330 void aio_set_fd_poll(AioContext *ctx, int fd,
 331                      IOHandler *io_poll_begin,
 332                      IOHandler *io_poll_end)
 333 {
 334     AioHandler *node = find_aio_handler(ctx, fd);
 335
 336     if (!node) {
 337         return;
 338     }
 339
 340     node->io_poll_begin = io_poll_begin;
 341     node->io_poll_end = io_poll_end;
 342 }
 343
 344 void aio_set_event_notifier(AioContext *ctx,
 345                             EventNotifier *notifier,
 346                             bool is_external,
 347                             EventNotifierHandler *io_read,
 348                             AioPollFn *io_poll)
 349 {
 350     aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
 351                        (IOHandler *)io_read, NULL, io_poll, notifier);
 352 }
 353
 354 void aio_set_event_notifier_poll(AioContext *ctx,
 355                                  EventNotifier *notifier,
 356                                  EventNotifierHandler *io_poll_begin,
 357                                  EventNotifierHandler *io_poll_end)
 358 {
 359     aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
 360                     (IOHandler *)io_poll_begin,
 361                     (IOHandler *)io_poll_end);
 362 }
 363
 364 static bool poll_set_started(AioContext *ctx, bool started)
 365 {
 366     AioHandler *node;
 367     bool progress = false;
 368
 369     if (started == ctx->poll_started) {
 370         return false;
 371     }
 372
 373     ctx->poll_started = started;
 374
 375     qemu_lockcnt_inc(&ctx->list_lock);
 376     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 377         IOHandler *fn;
 378
 379         if (QLIST_IS_INSERTED(node, node_deleted)) {
 380             continue;
 381         }
 382
 383         if (started) {
 384             fn = node->io_poll_begin;
 385         } else {
 386             fn = node->io_poll_end;
 387         }
 388
 389         if (fn) {
 390             fn(node->opaque);
 391         }
 392
 393         /* Poll one last time in case ->io_poll_end() raced with the event */
 394         if (!started) {
 395             progress = node->io_poll(node->opaque) || progress;
 396         }
 397     }
 398     qemu_lockcnt_dec(&ctx->list_lock);
 399
 400     return progress;
 401 }
 402
 403
 404 bool aio_prepare(AioContext *ctx)
 405 {
 406     /* Poll mode cannot be used with glib's event loop, disable it. */
 407     poll_set_started(ctx, false);
 408
 409     return false;
 410 }
 411
 412 bool aio_pending(AioContext *ctx)
 413 {
 414     AioHandler *node;
 415     bool result = false;
 416
 417     /*
 418      * We have to walk very carefully in case aio_set_fd_handler is
 419      * called while we're walking.
 420      */
 421     qemu_lockcnt_inc(&ctx->list_lock);
 422
 423     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 424         int revents;
 425
 426         revents = node->pfd.revents & node->pfd.events;
 427         if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
 428             aio_node_check(ctx, node->is_external)) {
 429             result = true;
 430             break;
 431         }
 432         if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
 433             aio_node_check(ctx, node->is_external)) {
 434             result = true;
 435             break;
 436         }
 437     }
 438     qemu_lockcnt_dec(&ctx->list_lock);
 439
 440     return result;
 441 }
 442
 443 static void aio_free_deleted_handlers(AioContext *ctx)
 444 {
 445     AioHandler *node;
 446
 447     if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
 448         return;
 449     }
 450     if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
 451         return; /* we are nested, let the parent do the freeing */
 452     }
 453
 454     while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
 455         QLIST_REMOVE(node, node);
 456         QLIST_REMOVE(node, node_deleted);
 457         g_free(node);
 458     }
 459
 460     qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
 461 }
 462
 463 static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
 464 {
 465     bool progress = false;
 466     int revents;
 467
 468     revents = node->pfd.revents & node->pfd.events;
 469     node->pfd.revents = 0;
 470
 471     if (!QLIST_IS_INSERTED(node, node_deleted) &&
 472         (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
 473         aio_node_check(ctx, node->is_external) &&
 474         node->io_read) {
 475         node->io_read(node->opaque);
 476
 477         /* aio_notify() does not count as progress */
 478         if (node->opaque != &ctx->notifier) {
 479             progress = true;
 480         }
 481     }
 482     if (!QLIST_IS_INSERTED(node, node_deleted) &&
 483         (revents & (G_IO_OUT | G_IO_ERR)) &&
 484         aio_node_check(ctx, node->is_external) &&
 485         node->io_write) {
 486         node->io_write(node->opaque);
 487         progress = true;
 488     }
 489
 490     return progress;
 491 }
 492
 493 /*
 494  * If we have a list of ready handlers then this is more efficient than
 495  * scanning all handlers with aio_dispatch_handlers().
 496  */
 497 static bool aio_dispatch_ready_handlers(AioContext *ctx,
 498                                         AioHandlerList *ready_list)
 499 {
 500     bool progress = false;
 501     AioHandler *node;
 502
 503     while ((node = QLIST_FIRST(ready_list))) {
 504         QLIST_REMOVE(node, node_ready);
 505         progress = aio_dispatch_handler(ctx, node) || progress;
 506     }
 507
 508     return progress;
 509 }
 510
 511 /* Slower than aio_dispatch_ready_handlers() but only used via glib */
 512 static bool aio_dispatch_handlers(AioContext *ctx)
 513 {
 514     AioHandler *node, *tmp;
 515     bool progress = false;
 516
 517     QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
 518         progress = aio_dispatch_handler(ctx, node) || progress;
 519     }
 520
 521     return progress;
 522 }
 523
/*
 * Run the pending work of @ctx: bottom halves, then fd handlers, then
 * expired timers.  Handlers queued for deletion are freed in between.
 */
void aio_dispatch(AioContext *ctx)
{
    /* Hold the list lock count while handlers are walked and freed */
    qemu_lockcnt_inc(&ctx->list_lock);
    aio_bh_poll(ctx);
    aio_dispatch_handlers(ctx);
    aio_free_deleted_handlers(ctx);
    qemu_lockcnt_dec(&ctx->list_lock);

    /* Timers run after the list lock count has been dropped */
    timerlistgroup_run_timers(&ctx->tlg);
}
 534
 535 /* These thread-local variables are used only in a small part of aio_poll
 536  * around the call to the poll() system call.  In particular they are not
 537  * used while aio_poll is performing callbacks, which makes it much easier
 538  * to think about reentrancy!
 539  *
 540  * Stack-allocated arrays would be perfect but they have size limitations;
 541  * heap allocation is expensive enough that we want to reuse arrays across
 542  * calls to aio_poll().  And because poll() has to be called without holding
 543  * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 544  * has none of the disadvantages of these three options.
 545  */
 546 static __thread GPollFD *pollfds;
 547 static __thread AioHandler **nodes;
 548 static __thread unsigned npfd, nalloc;
 549 static __thread Notifier pollfds_cleanup_notifier;
 550
 551 static void pollfds_cleanup(Notifier *n, void *unused)
 552 {
 553     g_assert(npfd == 0);
 554     g_free(pollfds);
 555     g_free(nodes);
 556     nalloc = 0;
 557 }
 558
 559 static void add_pollfd(AioHandler *node)
 560 {
 561     if (npfd == nalloc) {
 562         if (nalloc == 0) {
 563             pollfds_cleanup_notifier.notify = pollfds_cleanup;
 564             qemu_thread_atexit_add(&pollfds_cleanup_notifier);
 565             nalloc = 8;
 566         } else {
 567             g_assert(nalloc <= INT_MAX);
 568             nalloc *= 2;
 569         }
 570         pollfds = g_renew(GPollFD, pollfds, nalloc);
 571         nodes = g_renew(AioHandler *, nodes, nalloc);
 572     }
 573     nodes[npfd] = node;
 574     pollfds[npfd] = (GPollFD) {
 575         .fd = node->pfd.fd,
 576         .events = node->pfd.events,
 577     };
 578     npfd++;
 579 }
 580
 581 static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
 582 {
 583     bool progress = false;
 584     AioHandler *node;
 585
 586     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 587         if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
 588             aio_node_check(ctx, node->is_external) &&
 589             node->io_poll(node->opaque)) {
 590             /*
 591              * Polling was successful, exit try_poll_mode immediately
 592              * to adjust the next polling time.
 593              */
 594             *timeout = 0;
 595             if (node->opaque != &ctx->notifier) {
 596                 progress = true;
 597             }
 598         }
 599
 600         /* Caller handles freeing deleted nodes.  Don't do it here. */
 601     }
 602
 603     return progress;
 604 }
 605
 606 /* run_poll_handlers:
 607  * @ctx: the AioContext
 608  * @max_ns: maximum time to poll for, in nanoseconds
 609  *
 610  * Polls for a given time.
 611  *
 612  * Note that ctx->notify_me must be non-zero so this function can detect
 613  * aio_notify().
 614  *
 615  * Note that the caller must have incremented ctx->list_lock.
 616  *
 617  * Returns: true if progress was made, false otherwise
 618  */
 619 static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
 620 {
 621     bool progress;
 622     int64_t start_time, elapsed_time;
 623
 624     assert(ctx->notify_me);
 625     assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
 626
 627     trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
 628
 629     /*
 630      * Optimization: ->io_poll() handlers often contain RCU read critical
 631      * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
 632      * -> rcu_read_lock() -> ... sequences with expensive memory
 633      * synchronization primitives.  Make the entire polling loop an RCU
 634      * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
 635      * are cheap.
 636      */
 637     RCU_READ_LOCK_GUARD();
 638
 639     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 640     do {
 641         progress = run_poll_handlers_once(ctx, timeout);
 642         elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
 643         max_ns = qemu_soonest_timeout(*timeout, max_ns);
 644         assert(!(max_ns && progress));
 645     } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));
 646
 647     /* If time has passed with no successful polling, adjust *timeout to
 648      * keep the same ending time.
 649      */
 650     if (*timeout != -1) {
 651         *timeout -= MIN(*timeout, elapsed_time);
 652     }
 653
 654     trace_run_poll_handlers_end(ctx, progress, *timeout);
 655     return progress;
 656 }
 657
 658 /* try_poll_mode:
 659  * @ctx: the AioContext
 660  * @timeout: timeout for blocking wait, computed by the caller and updated if
 661  *    polling succeeds.
 662  *
 663  * ctx->notify_me must be non-zero so this function can detect aio_notify().
 664  *
 665  * Note that the caller must have incremented ctx->list_lock.
 666  *
 667  * Returns: true if progress was made, false otherwise
 668  */
 669 static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 670 {
 671     int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 672
 673     if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
 674         poll_set_started(ctx, true);
 675
 676         if (run_poll_handlers(ctx, max_ns, timeout)) {
 677             return true;
 678         }
 679     }
 680
 681     if (poll_set_started(ctx, false)) {
 682         *timeout = 0;
 683         return true;
 684     }
 685
 686     return false;
 687 }
 688
/*
 * Run one iteration of the event loop for @ctx: optionally busy-poll the
 * handlers, wait for fd events with epoll or qemu_poll_ns(), then run
 * bottom halves, the handlers that became ready, and expired timers.
 *
 * @blocking: when false the wait timeout is 0; when true it is taken from
 *    aio_compute_timeout().
 *
 * Must be called from the context's home thread (asserted below).
 *
 * Returns: true if polling, bottom halves, fd handlers or timers reported
 * progress, false otherwise.
 */
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
    AioHandler *node;
    int i;
    int ret = 0;
    bool progress;
    int64_t timeout;
    int64_t start = 0;

    assert(in_aio_context_home_thread(ctx));

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    qemu_lockcnt_inc(&ctx->list_lock);

    /* The start time is only needed when adaptive polling is configured */
    if (ctx->poll_max_ns) {
        start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;
    progress = try_poll_mode(ctx, &timeout);
    assert(!(timeout && progress));

    /* If polling is allowed, non-blocking aio_poll does not need the
     * system call---a single round of run_poll_handlers_once suffices.
     */
    if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
        assert(npfd == 0);

        /* fill pollfds */

        if (!aio_epoll_enabled(ctx)) {
            QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
                if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
                    && aio_node_check(ctx, node->is_external)) {
                    add_pollfd(node);
                }
            }
        }

        /* wait until next event */
        if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
            npfd = 0; /* pollfds[] is not being used */
            ret = aio_epoll(ctx, &ready_list, timeout);
        } else  {
            ret = qemu_poll_ns(pollfds, npfd, timeout);
        }
    }

    /* Undo the notify_me increment made before waiting */
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
        aio_notify_accept(ctx);
    }

    /* Adjust polling time */
    if (ctx->poll_max_ns) {
        int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;

        if (block_ns <= ctx->poll_ns) {
            /* This is the sweet spot, no adjustment needed */
        } else if (block_ns > ctx->poll_max_ns) {
            /* We'd have to poll for too long, poll less */
            int64_t old = ctx->poll_ns;

            /* A shrink factor of 0 resets the polling time to 0 */
            if (ctx->poll_shrink) {
                ctx->poll_ns /= ctx->poll_shrink;
            } else {
                ctx->poll_ns = 0;
            }

            trace_poll_shrink(ctx, old, ctx->poll_ns);
        } else if (ctx->poll_ns < ctx->poll_max_ns &&
                   block_ns < ctx->poll_max_ns) {
            /* There is room to grow, poll longer */
            int64_t old = ctx->poll_ns;
            int64_t grow = ctx->poll_grow;

            /* A grow factor of 0 means "double" */
            if (grow == 0) {
                grow = 2;
            }

            if (ctx->poll_ns) {
                ctx->poll_ns *= grow;
            } else {
                ctx->poll_ns = 4000; /* start polling at 4 microseconds */
            }

            /* Never exceed the configured maximum */
            if (ctx->poll_ns > ctx->poll_max_ns) {
                ctx->poll_ns = ctx->poll_max_ns;
            }

            trace_poll_grow(ctx, old, ctx->poll_ns);
        }
    }

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        /* npfd is 0 on the epoll path, so this only runs after ppoll */
        for (i = 0; i < npfd; i++) {
            int revents = pollfds[i].revents;

            if (revents) {
                add_ready_handler(&ready_list, nodes[i], revents);
            }
        }
    }

    /* The thread-local arrays are no longer in use past this point */
    npfd = 0;

    progress |= aio_bh_poll(ctx);

    if (ret > 0) {
        progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
    }

    aio_free_deleted_handlers(ctx);

    qemu_lockcnt_dec(&ctx->list_lock);

    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}
 821
 822 void aio_context_setup(AioContext *ctx)
 823 {
 824 #ifdef CONFIG_EPOLL_CREATE1
 825     assert(!ctx->epollfd);
 826     ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
 827     if (ctx->epollfd == -1) {
 828         fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
 829         ctx->epoll_available = false;
 830     } else {
 831         ctx->epoll_available = true;
 832     }
 833 #endif
 834 }
 835
/*
 * Platform-specific teardown of @ctx.  When epoll support is compiled in,
 * this disables epoll for the context.
 */
void aio_context_destroy(AioContext *ctx)
{
#ifdef CONFIG_EPOLL_CREATE1
    aio_epoll_disable(ctx);
#endif
}
 842
/*
 * Set the adaptive polling parameters of @ctx and wake its event loop.
 *
 * @max_ns: stored as ctx->poll_max_ns
 * @grow: stored as ctx->poll_grow
 * @shrink: stored as ctx->poll_shrink
 * @errp: not used by this implementation
 *
 * The current polling time (ctx->poll_ns) is reset to 0.
 */
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                 int64_t grow, int64_t shrink, Error **errp)
{
    /* No thread synchronization here, it doesn't matter if an incorrect value
     * is used once.
     */
    ctx->poll_max_ns = max_ns;
    ctx->poll_ns = 0;
    ctx->poll_grow = grow;
    ctx->poll_shrink = shrink;

    aio_notify(ctx);
}