Merge tag 'pull-maintainer-may24-160524-2' of https://gitlab.com/stsquad/qemu into...

[mirror_qemu.git] / block / linux-aio.c
diff --git a/block/linux-aio.c b/block/linux-aio.c

index 91204a25a2b42a9aa0f2e0e116ae144ce2d66a9b..ec05d946f312bb90f13c4fb6b7b8a8159a34a9e3 100644 (file)
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -14,7 +14,12 @@
  #include "block/raw-aio.h"
  #include "qemu/event_notifier.h"
  #include "qemu/coroutine.h"
+#include "qemu/defer-call.h"
  #include "qapi/error.h"
+#include "sysemu/block-backend.h"
+
+/* Only used for assertions.  */
+#include "qemu/coroutine_int.h"
  
  #include <libaio.h>
  
@@ -28,6 +33,9 @@
   */
  #define MAX_EVENTS 1024
  
+/* Maximum number of requests in a batch. (default value) */
+#define DEFAULT_MAX_BATCH 32
+
  struct qemu_laiocb {
      Coroutine *co;
      LinuxAioState *ctx;
@@ -40,7 +48,6 @@ struct qemu_laiocb {
  };
  
  typedef struct {
-    int plugged;
      unsigned int in_queue;
      unsigned int in_flight;
      bool blocked;
@@ -53,10 +60,8 @@ struct LinuxAioState {
      io_context_t ctx;
      EventNotifier e;
  
-    /* io queue for submit at batch.  Protected by AioContext lock. */
+    /* No locking required, only accessed from AioContext home thread */
      LaioQueue io_q;
-
-    /* I/O completion processing.  Only runs in I/O thread.  */
      QEMUBH *completion_bh;
      int event_idx;
      int event_max;
@@ -99,6 +104,7 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
       * later.  Coroutines cannot be entered recursively so avoid doing
       * that!
       */
+    assert(laiocb->co->ctx == laiocb->ctx->aio_context);
      if (!qemu_coroutine_entered(laiocb->co)) {
          aio_co_wake(laiocb->co);
      }
@@ -121,7 +127,7 @@ struct aio_ring {
      unsigned    incompat_features;
      unsigned    header_length;  /* size of aio_ring */
  
-    struct io_event io_events[0];
+    struct io_event io_events[];
  };
  
  /**
@@ -199,6 +205,8 @@ static void qemu_laio_process_completions(LinuxAioState *s)
  {
      struct io_event *events;
  
+    defer_call_begin();
+
      /* Reschedule so nested event loops see currently pending completions */
      qemu_bh_schedule(s->completion_bh);
  
@@ -222,20 +230,20 @@ static void qemu_laio_process_completions(LinuxAioState *s)
  
      /* If we are nested we have to notify the level above that we are done
       * by setting event_max to zero, upper level will then jump out of it's
-     * own `for` loop.  If we are the last all counters droped to zero. */
+     * own `for` loop.  If we are the last, all counters dropped to zero. */
      s->event_max = 0;
      s->event_idx = 0;
+
+    defer_call_end();
  }
  
  static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
  {
-    aio_context_acquire(s->aio_context);
      qemu_laio_process_completions(s);
  
-    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
          ioq_submit(s);
      }
-    aio_context_release(s->aio_context);
  }
  
  static void qemu_laio_completion_bh(void *opaque)
@@ -260,18 +268,20 @@ static bool qemu_laio_poll_cb(void *opaque)
      LinuxAioState *s = container_of(e, LinuxAioState, e);
      struct io_event *events;
  
-    if (!io_getevents_peek(s->ctx, &events)) {
-        return false;
-    }
+    return io_getevents_peek(s->ctx, &events);
+}
+
+static void qemu_laio_poll_ready(EventNotifier *opaque)
+{
+    EventNotifier *e = opaque;
+    LinuxAioState *s = container_of(e, LinuxAioState, e);
  
      qemu_laio_process_completions_and_submit(s);
-    return true;
  }
  
  static void ioq_init(LaioQueue *io_q)
  {
      QSIMPLEQ_INIT(&io_q->pending);
-    io_q->plugged = 0;
      io_q->in_queue = 0;
      io_q->in_flight = 0;
      io_q->blocked = false;
@@ -331,22 +341,34 @@ static void ioq_submit(LinuxAioState *s)
      }
  }
  
-void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
+static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
  {
-    s->io_q.plugged++;
+    uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;
+
+    /*
+     * AIO context can be shared between multiple block devices, so
+     * `dev_max_batch` allows reducing the batch size for latency-sensitive
+     * devices.
+     */
+    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);
+
+    /* limit the batch to the number of available events */
+    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);
+
+    return max_batch;
  }
  
-void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
+static void laio_deferred_fn(void *opaque)
  {
-    assert(s->io_q.plugged);
-    if (--s->io_q.plugged == 0 &&
-        !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    LinuxAioState *s = opaque;
+
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
          ioq_submit(s);
      }
  }
  
  static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
-                          int type)
+                          int type, uint64_t dev_max_batch)
  {
      LinuxAioState *s = laiocb->ctx;
      struct iocb *iocbs = &laiocb->iocb;
@@ -356,6 +378,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
      case QEMU_AIO_WRITE:
          io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
          break;
+    case QEMU_AIO_ZONE_APPEND:
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
      case QEMU_AIO_READ:
          io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
          break;
@@ -369,29 +394,32 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
  
      QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
      s->io_q.in_queue++;
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
-        ioq_submit(s);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+            ioq_submit(s);
+        } else {
+            defer_call(laio_deferred_fn, s);
+        }
      }
  
      return 0;
  }
  
-int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
-                                uint64_t offset, QEMUIOVector *qiov, int type)
+int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
+                                int type, uint64_t dev_max_batch)
  {
      int ret;
+    AioContext *ctx = qemu_get_current_aio_context();
      struct qemu_laiocb laiocb = {
          .co         = qemu_coroutine_self(),
          .nbytes     = qiov->size,
-        .ctx        = s,
+        .ctx        = aio_get_linux_aio(ctx),
          .ret        = -EINPROGRESS,
          .is_read    = (type == QEMU_AIO_READ),
          .qiov       = qiov,
      };
  
-    ret = laio_do_submit(fd, &laiocb, offset, type);
+    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
      if (ret < 0) {
          return ret;
      }
@@ -404,7 +432,7 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
  
  void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
  {
-    aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
+    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
      qemu_bh_delete(s->completion_bh);
      s->aio_context = NULL;
  }
@@ -413,9 +441,10 @@ void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
  {
      s->aio_context = new_context;
      s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
-    aio_set_event_notifier(new_context, &s->e, false,
+    aio_set_event_notifier(new_context, &s->e,
                             qemu_laio_completion_cb,
-                           qemu_laio_poll_cb);
+                           qemu_laio_poll_cb,
+                           qemu_laio_poll_ready);
  }
  
  LinuxAioState *laio_init(Error **errp)
@@ -426,7 +455,7 @@ LinuxAioState *laio_init(Error **errp)
      s = g_malloc0(sizeof(*s));
      rc = event_notifier_init(&s->e, false);
      if (rc < 0) {
-        error_setg_errno(errp, -rc, "failed to to initialize event notifier");
+        error_setg_errno(errp, -rc, "failed to initialize event notifier");
          goto out_free_state;
      }