aio-posix: remove idle poll handlers to improve scalability

author Stefan Hajnoczi <stefanha@redhat.com>

Thu, 5 Mar 2020 17:08:06 +0000 (17:08 +0000)

committer Stefan Hajnoczi <stefanha@redhat.com>

Mon, 9 Mar 2020 16:45:16 +0000 (16:45 +0000)
author Stefan Hajnoczi <stefanha@redhat.com>
Thu, 5 Mar 2020 17:08:06 +0000 (17:08 +0000)
committer Stefan Hajnoczi <stefanha@redhat.com>
Mon, 9 Mar 2020 16:45:16 +0000 (16:45 +0000)
diff --git a/include/block/aio.h b/include/block/aio.h

index f07ebb76b8182797008582405330a0055438a512..cb1989105a8da860c48c2519828d21f0dff1396d 100644 (file)
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -227,6 +227,14 @@ struct AioContext {
      int64_t poll_grow;      /* polling time growth factor */
      int64_t poll_shrink;    /* polling time shrink factor */
  
+    /*
+     * List of handlers participating in userspace polling.  Protected by
+     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
+     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
+     * only touches the list to delete nodes if ctx->list_lock's count is zero.
+     */
+    AioHandlerList poll_aio_handlers;
+
      /* Are we in polling mode or monitoring file descriptors? */
      bool poll_started;
  
diff --git a/util/aio-posix.c b/util/aio-posix.c

index 759989b45bd037feb2f942ff9c52baffceb92a00..cd6cf0a4a97d97e3e8242f9ab59769034444803f 100644 (file)
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -22,6 +22,9 @@
  #include "trace.h"
  #include "aio-posix.h"
  
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
  bool aio_poll_disabled(AioContext *ctx)
  {
      return atomic_read(&ctx->poll_disable_cnt);
@@ -78,6 +81,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
       * deleted because deleted nodes are only cleaned up while
       * no one is walking the handlers list.
       */
+    QLIST_SAFE_REMOVE(node, node_poll);
      QLIST_REMOVE(node, node);
      return true;
  }
@@ -205,7 +209,7 @@ static bool poll_set_started(AioContext *ctx, bool started)
      ctx->poll_started = started;
  
      qemu_lockcnt_inc(&ctx->list_lock);
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
          IOHandler *fn;
  
          if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -286,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx)
      while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
          QLIST_REMOVE(node, node);
          QLIST_REMOVE(node, node_deleted);
+        QLIST_SAFE_REMOVE(node, node_poll);
          g_free(node);
      }
  
@@ -300,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
      revents = node->pfd.revents & node->pfd.events;
      node->pfd.revents = 0;
  
+    /*
+     * Start polling AioHandlers when they become ready because activity is
+     * likely to continue.  Note that starvation is theoretically possible when
+     * fdmon_supports_polling(), but only until the fd fires for the first
+     * time.
+     */
+    if (!QLIST_IS_INSERTED(node, node_deleted) &&
+        !QLIST_IS_INSERTED(node, node_poll) &&
+        node->io_poll) {
+        trace_poll_add(ctx, node, node->pfd.fd, revents);
+        if (ctx->poll_started && node->io_poll_begin) {
+            node->io_poll_begin(node->opaque);
+        }
+        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+    }
+
      if (!QLIST_IS_INSERTED(node, node_deleted) &&
          (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
          aio_node_check(ctx, node->is_external) &&
@@ -364,15 +385,19 @@ void aio_dispatch(AioContext *ctx)
      timerlistgroup_run_timers(&ctx->tlg);
  }
  
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
+static bool run_poll_handlers_once(AioContext *ctx,
+                                   int64_t now,
+                                   int64_t *timeout)
  {
      bool progress = false;
      AioHandler *node;
+    AioHandler *tmp;
  
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
-            aio_node_check(ctx, node->is_external) &&
+    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+        if (aio_node_check(ctx, node->is_external) &&
              node->io_poll(node->opaque)) {
+            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
              /*
               * Polling was successful, exit try_poll_mode immediately
               * to adjust the next polling time.
@@ -389,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
      return progress;
  }
  
+/* fdmon_supports_polling:
+ * @ctx: the AioContext
+ *
+ * Tell whether the fd monitoring implementation of @ctx is treated as
+ * supporting userspace polling.
+ *
+ * Returns: false when its ->need_wait() callback is aio_poll_disabled(),
+ * true otherwise.
+ */
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+    bool need_wait_is_poll_disabled =
+        ctx->fdmon_ops->need_wait == aio_poll_disabled;
+
+    return !need_wait_is_poll_disabled;
+}
+
+/* remove_idle_poll_handlers:
+ * @ctx: the AioContext whose poll_aio_handlers list is scanned
+ * @now: time to compare against each handler's poll_idle_timeout, in
+ *       nanoseconds
+ *
+ * Stop userspace polling on handlers that have reached their idle deadline.
+ * A handler whose poll_idle_timeout is 0 gets a deadline of
+ * @now + POLL_IDLE_INTERVAL_NS instead of being removed.
+ *
+ * Returns: true if a final ->io_poll() call on a removed handler returned
+ * true, false otherwise.
+ */
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+    AioHandler *handler;
+    AioHandler *next;
+    bool made_progress = false;
+
+    /*
+     * Without userspace polling support in the file descriptor monitoring
+     * implementation, polling only a subset of handlers causes starvation
+     * because fds will not be processed in a timely fashion.  Keep every
+     * handler on the list in that case.
+     */
+    if (!fdmon_supports_polling(ctx)) {
+        return false;
+    }
+
+    QLIST_FOREACH_SAFE(handler, &ctx->poll_aio_handlers, node_poll, next) {
+        if (handler->poll_idle_timeout == 0LL) {
+            /* No deadline set yet: start the idle interval from @now */
+            handler->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+            continue;
+        }
+        if (now < handler->poll_idle_timeout) {
+            continue;
+        }
+
+        /* Deadline reached: take the handler off the polling list */
+        trace_poll_remove(ctx, handler, handler->pfd.fd);
+        handler->poll_idle_timeout = 0LL;
+        QLIST_SAFE_REMOVE(handler, node_poll);
+
+        if (!ctx->poll_started || !handler->io_poll_end) {
+            continue;
+        }
+        handler->io_poll_end(handler->opaque);
+
+        /*
+         * Final poll in case ->io_poll_end() races with an event.
+         * Never mind re-adding the handler in the rare case where this
+         * causes progress.
+         */
+        if (handler->io_poll(handler->opaque)) {
+            made_progress = true;
+        }
+    }
+
+    return made_progress;
+}
+
  /* run_poll_handlers:
   * @ctx: the AioContext
   * @max_ns: maximum time to poll for, in nanoseconds
@@ -424,12 +493,17 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
  
      start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
      do {
-        progress = run_poll_handlers_once(ctx, timeout);
+        progress = run_poll_handlers_once(ctx, start_time, timeout);
          elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
          max_ns = qemu_soonest_timeout(*timeout, max_ns);
          assert(!(max_ns && progress));
      } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
  
+    if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+        *timeout = 0;
+        progress = true;
+    }
+
      /* If time has passed with no successful polling, adjust *timeout to
       * keep the same ending time.
       */
@@ -454,8 +528,13 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
   */
  static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
  {
-    int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+    int64_t max_ns;
+
+    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+        return false;
+    }
  
+    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
      if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
          poll_set_started(ctx, true);
  
diff --git a/util/aio-posix.h b/util/aio-posix.h

index 55fc771327afb114aad721604c0be77c8aae67a6..c80c04506a8590486de9646b839d2389db9bbe55 100644 (file)
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -30,10 +30,12 @@ struct AioHandler {
      QLIST_ENTRY(AioHandler) node;
      QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
      QLIST_ENTRY(AioHandler) node_deleted;
+    QLIST_ENTRY(AioHandler) node_poll;
  #ifdef CONFIG_LINUX_IO_URING
      QSLIST_ENTRY(AioHandler) node_submitted;
      unsigned flags; /* see fdmon-io_uring.c */
  #endif
+    int64_t poll_idle_timeout; /* when to stop userspace polling */
      bool is_external;
  };
  
diff --git a/util/trace-events b/util/trace-events

index 83b6639018a26288d6400a0920802d102ac22987..0ce42822eba9f160302cca3b3bf15a1e6c09e80b 100644 (file)
--- a/util/trace-events
+++ b/util/trace-events
@@ -5,6 +5,8 @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
  run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
  poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
  poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
  
  # async.c
  aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
author	Stefan Hajnoczi <stefanha@redhat.com>
	Thu, 5 Mar 2020 17:08:06 +0000 (17:08 +0000)
committer	Stefan Hajnoczi <stefanha@redhat.com>
	Mon, 9 Mar 2020 16:45:16 +0000 (16:45 +0000)
include/block/aio.h		patch \| blob \| blame \| history
util/aio-posix.c		patch \| blob \| blame \| history
util/aio-posix.h		patch \| blob \| blame \| history
util/trace-events		patch \| blob \| blame \| history