/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "block/block.h"
#include "qemu/rcu.h"
#include "qemu/rcu_queue.h"
#include "qemu/sockets.h"
#include "qemu/cutils.h"
#include "trace.h"
#ifdef CONFIG_EPOLL_CREATE1
#include <sys/epoll.h>
#endif

struct AioHandler
{
    GPollFD pfd;
    IOHandler *io_read;
    IOHandler *io_write;
    AioPollFn *io_poll;
    IOHandler *io_poll_begin;
    IOHandler *io_poll_end;
    void *opaque;
    bool is_external;
    QLIST_ENTRY(AioHandler) node;
    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
    QLIST_ENTRY(AioHandler) node_deleted;
};

/* Add a handler to a ready list */
static void add_ready_handler(AioHandlerList *ready_list,
                              AioHandler *node,
                              int revents)
{
    QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
    node->pfd.revents = revents;
    QLIST_INSERT_HEAD(ready_list, node, node_ready);
}

#ifdef CONFIG_EPOLL_CREATE1

/* The fd number threshold to switch to epoll */
#define EPOLL_ENABLE_THRESHOLD 64

static void aio_epoll_disable(AioContext *ctx)
{
    ctx->epoll_enabled = false;
    if (!ctx->epoll_available) {
        return;
    }
    ctx->epoll_available = false;
    close(ctx->epollfd);
}

static inline int epoll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
}

static bool aio_epoll_try_enable(AioContext *ctx)
{
    AioHandler *node;
    struct epoll_event event;

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        int r;
        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
            continue;
        }
        event.events = epoll_events_from_pfd(node->pfd.events);
        event.data.ptr = node;
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
        if (r) {
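            /*
             * No need to undo the epoll_ctl() calls made so far: on failure
             * the caller invokes aio_epoll_disable(), which closes the epoll
             * fd altogether.
             */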
            return false;
        }
    }
    ctx->epoll_enabled = true;
    return true;
}

static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    struct epoll_event event;
    int r;
    int ctl;

    if (!ctx->epoll_enabled) {
        return;
    }
    if (!node->pfd.events) {
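        /* event is left uninitialized; epoll_ctl() ignores it for EPOLL_CTL_DEL */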
        ctl = EPOLL_CTL_DEL;
    } else {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
    }

    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
    if (r) {
        aio_epoll_disable(ctx);
    }
}

static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
                     int64_t timeout)
{
    GPollFD pfd = {
        .fd = ctx->epollfd,
        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
    };
    AioHandler *node;
    int i, ret = 0;
    struct epoll_event events[128];

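    /*
     * epoll_wait() only has millisecond resolution, so honour the nanosecond
     * timeout by waiting on the epoll fd itself with qemu_poll_ns() first;
     * once it becomes readable, epoll_wait() runs with a zero timeout just
     * to fetch the ready events.
     */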
    if (timeout > 0) {
        ret = qemu_poll_ns(&pfd, 1, timeout);
        if (ret > 0) {
            timeout = 0;
        }
    }
    if (timeout <= 0 || ret > 0) {
        ret = epoll_wait(ctx->epollfd, events,
                         ARRAY_SIZE(events),
                         timeout);
        if (ret <= 0) {
            goto out;
        }
        for (i = 0; i < ret; i++) {
            int ev = events[i].events;
            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
                          (ev & EPOLLERR ? G_IO_ERR : 0);

            node = events[i].data.ptr;
            add_ready_handler(ready_list, node, revents);
        }
    }
out:
    return ret;
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    /* Fall back to ppoll when external clients are disabled. */
    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
}

static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    if (!ctx->epoll_available) {
        return false;
    }
    if (aio_epoll_enabled(ctx)) {
        return true;
    }
    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
        if (aio_epoll_try_enable(ctx)) {
            return true;
        } else {
            aio_epoll_disable(ctx);
        }
    }
    return false;
}

#else

static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}

static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
                     int64_t timeout)
{
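    /* unreachable: aio_epoll_check_poll() always returns false in this build */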
    assert(false);
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    return false;
}

static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    return false;
}

#endif

static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd) {
            if (!QLIST_IS_INSERTED(node, node_deleted)) {
                return node;
            }
        }
    }

    return NULL;
}

static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
{
    /* If the GSource is in the process of being destroyed then
     * g_source_remove_poll() causes an assertion failure. Skip
     * removal in that case, because glib cleans up its state during
     * destruction anyway.
     */
    if (!g_source_is_destroyed(&ctx->source)) {
        g_source_remove_poll(&ctx->source, &node->pfd);
    }

    /* If a read is in progress, just mark the node as deleted */
    if (qemu_lockcnt_count(&ctx->list_lock)) {
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        node->pfd.revents = 0;
        return false;
    }
    /* Otherwise, delete it for real. We can't just mark it as
     * deleted because deleted nodes are only cleaned up while
     * no one is walking the handlers list.
     */
    QLIST_REMOVE(node, node);
    return true;
}

void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        AioPollFn *io_poll,
                        void *opaque)
{
    AioHandler *node;
    AioHandler *new_node = NULL;
    bool is_new = false;
    bool deleted = false;
    int poll_disable_change;

    qemu_lockcnt_lock(&ctx->list_lock);

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write && !io_poll) {
        if (node == NULL) {
            qemu_lockcnt_unlock(&ctx->list_lock);
            return;
        }
        /* Clean events in order to unregister fd from the ctx epoll. */
        node->pfd.events = 0;

        poll_disable_change = -!node->io_poll;
    } else {
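        /*
         * Track handlers that cannot be polled in userspace: +1 if the new
         * handler has no ->io_poll callback, -1 if the handler being
         * replaced lacked one.
         */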
        poll_disable_change = !io_poll - (node && !node->io_poll);
        if (node == NULL) {
            is_new = true;
        }
        /* Alloc and insert if it's not already there */
        new_node = g_new0(AioHandler, 1);

        /* Update handler with latest information */
        new_node->io_read = io_read;
        new_node->io_write = io_write;
        new_node->io_poll = io_poll;
        new_node->opaque = opaque;
        new_node->is_external = is_external;

        if (is_new) {
            new_node->pfd.fd = fd;
        } else {
            new_node->pfd = node->pfd;
        }
        g_source_add_poll(&ctx->source, &new_node->pfd);

        new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);

        QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
    }
    if (node) {
        deleted = aio_remove_fd_handler(ctx, node);
    }

    /* No need to order poll_disable_cnt writes against other updates;
     * the counter is only used to avoid wasting time and latency on
     * iterated polling when the system call will be ultimately necessary.
     * Changing handlers is a rare event, and a little wasted polling until
     * the aio_notify below is not an issue.
     */
    atomic_set(&ctx->poll_disable_cnt,
               atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);

    if (new_node) {
        aio_epoll_update(ctx, new_node, is_new);
    } else if (node) {
        /* Unregister deleted fd_handler */
        aio_epoll_update(ctx, node, false);
    }
    qemu_lockcnt_unlock(&ctx->list_lock);
    aio_notify(ctx);

    if (deleted) {
        g_free(node);
    }
}

void aio_set_fd_poll(AioContext *ctx, int fd,
                     IOHandler *io_poll_begin,
                     IOHandler *io_poll_end)
{
    AioHandler *node = find_aio_handler(ctx, fd);

    if (!node) {
        return;
    }

    node->io_poll_begin = io_poll_begin;
    node->io_poll_end = io_poll_end;
}

void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read,
                            AioPollFn *io_poll)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
                       (IOHandler *)io_read, NULL, io_poll, notifier);
}

void aio_set_event_notifier_poll(AioContext *ctx,
                                 EventNotifier *notifier,
                                 EventNotifierHandler *io_poll_begin,
                                 EventNotifierHandler *io_poll_end)
{
    aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
                    (IOHandler *)io_poll_begin,
                    (IOHandler *)io_poll_end);
}

static void poll_set_started(AioContext *ctx, bool started)
{
    AioHandler *node;

    if (started == ctx->poll_started) {
        return;
    }

    ctx->poll_started = started;

    qemu_lockcnt_inc(&ctx->list_lock);
    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        IOHandler *fn;

        if (QLIST_IS_INSERTED(node, node_deleted)) {
            continue;
        }

        if (started) {
            fn = node->io_poll_begin;
        } else {
            fn = node->io_poll_end;
        }

        if (fn) {
            fn(node->opaque);
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);
}


bool aio_prepare(AioContext *ctx)
{
    /* Poll mode cannot be used with glib's event loop, disable it. */
    poll_set_started(ctx, false);

    return false;
}

bool aio_pending(AioContext *ctx)
{
    AioHandler *node;
    bool result = false;

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    qemu_lockcnt_inc(&ctx->list_lock);

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
            aio_node_check(ctx, node->is_external)) {
            result = true;
            break;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
            aio_node_check(ctx, node->is_external)) {
            result = true;
            break;
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    return result;
}

static void aio_free_deleted_handlers(AioContext *ctx)
{
    AioHandler *node;

    if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
        return;
    }
    if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
        return; /* we are nested, let the parent do the freeing */
    }

    while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
        QLIST_REMOVE(node, node);
        QLIST_REMOVE(node, node_deleted);
        g_free(node);
    }

    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
}

static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
{
    bool progress = false;
    int revents;

    revents = node->pfd.revents & node->pfd.events;
    node->pfd.revents = 0;

    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
        aio_node_check(ctx, node->is_external) &&
        node->io_read) {
        node->io_read(node->opaque);

        /* aio_notify() does not count as progress */
        if (node->opaque != &ctx->notifier) {
            progress = true;
        }
    }
    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        (revents & (G_IO_OUT | G_IO_ERR)) &&
        aio_node_check(ctx, node->is_external) &&
        node->io_write) {
        node->io_write(node->opaque);
        progress = true;
    }

    return progress;
}

/*
 * If we have a list of ready handlers then this is more efficient than
 * scanning all handlers with aio_dispatch_handlers().
 */
static bool aio_dispatch_ready_handlers(AioContext *ctx,
                                        AioHandlerList *ready_list)
{
    bool progress = false;
    AioHandler *node;

    while ((node = QLIST_FIRST(ready_list))) {
        QLIST_REMOVE(node, node_ready);
        progress = aio_dispatch_handler(ctx, node) || progress;
    }

    return progress;
}

/* Slower than aio_dispatch_ready_handlers() but only used via glib */
static bool aio_dispatch_handlers(AioContext *ctx)
{
    AioHandler *node, *tmp;
    bool progress = false;

    QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
        progress = aio_dispatch_handler(ctx, node) || progress;
    }

    return progress;
}

void aio_dispatch(AioContext *ctx)
{
    qemu_lockcnt_inc(&ctx->list_lock);
    aio_bh_poll(ctx);
    aio_dispatch_handlers(ctx);
    aio_free_deleted_handlers(ctx);
    qemu_lockcnt_dec(&ctx->list_lock);

    timerlistgroup_run_timers(&ctx->tlg);
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call. In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll(). And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext. Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
{
    bool progress = false;
    AioHandler *node;

    /*
     * Optimization: ->io_poll() handlers often contain RCU read critical
     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
     * -> rcu_read_lock() -> ... sequences with expensive memory
     * synchronization primitives. Make the entire polling loop an RCU
     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
     * are cheap.
     */
    RCU_READ_LOCK_GUARD();

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
            aio_node_check(ctx, node->is_external) &&
            node->io_poll(node->opaque)) {
            /*
             * Polling was successful, exit try_poll_mode immediately
             * to adjust the next polling time.
             */
            *timeout = 0;
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }

        /* Caller handles freeing deleted nodes. Don't do it here. */
    }

    return progress;
}

/* run_poll_handlers:
 * @ctx: the AioContext
 * @max_ns: maximum time to poll for, in nanoseconds
 *
 * Polls for a given time.
 *
 * Note that ctx->notify_me must be non-zero so this function can detect
 * aio_notify().
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
{
    bool progress;
    int64_t start_time, elapsed_time;

    assert(ctx->notify_me);
    assert(qemu_lockcnt_count(&ctx->list_lock) > 0);

    trace_run_poll_handlers_begin(ctx, max_ns, *timeout);

    start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    do {
        progress = run_poll_handlers_once(ctx, timeout);
        elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
        max_ns = qemu_soonest_timeout(*timeout, max_ns);
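        /*
         * A successful poll set *timeout (and therefore max_ns) to 0, so
         * progress implies that the loop terminates after this iteration.
         */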
        assert(!(max_ns && progress));
    } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));

    /* If time has passed with no successful polling, adjust *timeout to
     * keep the same ending time.
     */
    if (*timeout != -1) {
        *timeout -= MIN(*timeout, elapsed_time);
    }

    trace_run_poll_handlers_end(ctx, progress, *timeout);
    return progress;
}

/* try_poll_mode:
 * @ctx: the AioContext
 * @timeout: timeout for blocking wait, computed by the caller and updated if
 *           polling succeeds.
 *
 * ctx->notify_me must be non-zero so this function can detect aio_notify().
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
{
    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);

    if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
        poll_set_started(ctx, true);

        if (run_poll_handlers(ctx, max_ns, timeout)) {
            return true;
        }
    }

    poll_set_started(ctx, false);

    /* Even if we don't run busy polling, try polling once in case it can make
     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
     */
    return run_poll_handlers_once(ctx, timeout);
}

bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
    AioHandler *node;
    int i;
    int ret = 0;
    bool progress;
    int64_t timeout;
    int64_t start = 0;

    assert(in_aio_context_home_thread(ctx));

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll(). This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
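        /*
         * Add 2 rather than 1: the low bit of notify_me is reserved for the
         * glib event loop integration (aio_ctx_prepare() in util/async.c).
         */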
        atomic_add(&ctx->notify_me, 2);
    }

    qemu_lockcnt_inc(&ctx->list_lock);

    if (ctx->poll_max_ns) {
        start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;
    progress = try_poll_mode(ctx, &timeout);
    assert(!(timeout && progress));

    /* If polling is allowed, non-blocking aio_poll does not need the
     * system call---a single round of run_poll_handlers_once suffices.
     */
    if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
        assert(npfd == 0);

        /* fill pollfds */

        if (!aio_epoll_enabled(ctx)) {
            QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
                if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
                    && aio_node_check(ctx, node->is_external)) {
                    add_pollfd(node);
                }
            }
        }

        /* wait until next event */
        if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
            npfd = 0; /* pollfds[] is not being used */
            ret = aio_epoll(ctx, &ready_list, timeout);
        } else {
            ret = qemu_poll_ns(pollfds, npfd, timeout);
        }
    }

    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
        aio_notify_accept(ctx);
    }

    /* Adjust polling time */
    if (ctx->poll_max_ns) {
        int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;

        if (block_ns <= ctx->poll_ns) {
            /* This is the sweet spot, no adjustment needed */
        } else if (block_ns > ctx->poll_max_ns) {
            /* We'd have to poll for too long, poll less */
            int64_t old = ctx->poll_ns;

            if (ctx->poll_shrink) {
                ctx->poll_ns /= ctx->poll_shrink;
            } else {
                ctx->poll_ns = 0;
            }

            trace_poll_shrink(ctx, old, ctx->poll_ns);
        } else if (ctx->poll_ns < ctx->poll_max_ns &&
                   block_ns < ctx->poll_max_ns) {
            /* There is room to grow, poll longer */
            int64_t old = ctx->poll_ns;
            int64_t grow = ctx->poll_grow;

            if (grow == 0) {
                grow = 2;
            }

            if (ctx->poll_ns) {
                ctx->poll_ns *= grow;
            } else {
                ctx->poll_ns = 4000; /* start polling at 4 microseconds */
            }

            if (ctx->poll_ns > ctx->poll_max_ns) {
                ctx->poll_ns = ctx->poll_max_ns;
            }

            trace_poll_grow(ctx, old, ctx->poll_ns);
        }
    }

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            int revents = pollfds[i].revents;

            if (revents) {
                add_ready_handler(&ready_list, nodes[i], revents);
            }
        }
    }

    npfd = 0;

    progress |= aio_bh_poll(ctx);

    if (ret > 0) {
        progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
    }

    aio_free_deleted_handlers(ctx);

    qemu_lockcnt_dec(&ctx->list_lock);

    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

void aio_context_setup(AioContext *ctx)
{
#ifdef CONFIG_EPOLL_CREATE1
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
        ctx->epoll_available = false;
    } else {
        ctx->epoll_available = true;
    }
#endif
}

void aio_context_destroy(AioContext *ctx)
{
#ifdef CONFIG_EPOLL_CREATE1
    aio_epoll_disable(ctx);
#endif
}

void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                 int64_t grow, int64_t shrink, Error **errp)
{
    /* No thread synchronization here, it doesn't matter if an incorrect value
     * is used once.
     */
    ctx->poll_max_ns = max_ns;
    ctx->poll_ns = 0;
    ctx->poll_grow = grow;
    ctx->poll_shrink = shrink;

    aio_notify(ctx);
}