git.proxmox.com Git - pve-qemu.git/commitdiff
fix vmstate-snapshots with iothread=1
author    Thomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 27 May 2020 12:40:46 +0000 (14:40 +0200)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 27 May 2020 16:54:06 +0000 (18:54 +0200)
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch [new file with mode: 0644]
debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch [new file with mode: 0644]
debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch [new file with mode: 0644]
debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch [new file with mode: 0644]
debian/patches/series

diff --git a/debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch b/debian/patches/pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch
new file mode 100644 (file)
index 0000000..1880717
--- /dev/null
@@ -0,0 +1,188 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Wed, 27 May 2020 11:33:19 +0200
+Subject: [PATCH] savevm-async: move more code to cleanup and rename to
+ finalize
+
+process_savevm_cleanup is renamed to process_savevm_finalize to
+accommodate more code that is not all cleanup-related.
+
+The benefit of this is that it allows us to directly call functions which
+need to run in the main AioContext. It doesn't noticeably affect snapshot
+performance, since the first instruction that is moved is the one stopping
+the VM, so the downtime stays about the same.
+
+The target bdrv is additionally moved to the IOHandler context before
+process_savevm_co to make sure the coroutine can call functions that
+require it to own the bdrv's context. process_savevm_finalize then moves
+it back to the main context to run its part.
+
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+---
+ savevm-async.c | 87 +++++++++++++++++++++++++++++---------------------
+ 1 file changed, 51 insertions(+), 36 deletions(-)
+
+diff --git a/savevm-async.c b/savevm-async.c
+index c3fe741c38..2894c94233 100644
+--- a/savevm-async.c
++++ b/savevm-async.c
+@@ -50,7 +50,7 @@ static struct SnapshotState {
+     int saved_vm_running;
+     QEMUFile *file;
+     int64_t total_time;
+-    QEMUBH *cleanup_bh;
++    QEMUBH *finalize_bh;
+     Coroutine *co;
+ } snap_state;
+@@ -196,12 +196,42 @@ static const QEMUFileOps block_file_ops = {
+     .close =          block_state_close,
+ };
+-static void process_savevm_cleanup(void *opaque)
++static void process_savevm_finalize(void *opaque)
+ {
+     int ret;
+-    qemu_bh_delete(snap_state.cleanup_bh);
+-    snap_state.cleanup_bh = NULL;
++    AioContext *iohandler_ctx = iohandler_get_aio_context();
++    MigrationState *ms = migrate_get_current();
++
++    qemu_bh_delete(snap_state.finalize_bh);
++    snap_state.finalize_bh = NULL;
+     snap_state.co = NULL;
++
++    /* We need to own the target bdrv's context for the following functions,
++ * so move it back. It can stay in the main context and live out its life
++     * there, since we're done with it after this method ends anyway.
++     */
++    aio_context_acquire(iohandler_ctx);
++    blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
++    aio_context_release(iohandler_ctx);
++
++    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
++    if (ret < 0) {
++        save_snapshot_error("vm_stop_force_state error %d", ret);
++    }
++
++    (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
++    ret = qemu_file_get_error(snap_state.file);
++    if (ret < 0) {
++            save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
++    }
++
++    DPRINTF("state saving complete\n");
++
++    /* clear migration state */
++    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
++                      ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
++    ms->to_dst_file = NULL;
++
+     qemu_savevm_state_cleanup();
+     ret = save_snapshot_cleanup();
+@@ -219,16 +249,15 @@ static void process_savevm_cleanup(void *opaque)
+     }
+ }
+-static void process_savevm_coro(void *opaque)
++static void coroutine_fn process_savevm_co(void *opaque)
+ {
+     int ret;
+     int64_t maxlen;
+-    MigrationState *ms = migrate_get_current();
+     ret = qemu_file_get_error(snap_state.file);
+     if (ret < 0) {
+         save_snapshot_error("qemu_savevm_state_setup failed");
+-        goto out;
++        return;
+     }
+     while (snap_state.state == SAVE_STATE_ACTIVE) {
+@@ -245,7 +274,7 @@ static void process_savevm_coro(void *opaque)
+                 save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+                 break;
+             }
+-            DPRINTF("savevm inerate pending size %lu ret %d\n", pending_size, ret);
++            DPRINTF("savevm iterate pending size %lu ret %d\n", pending_size, ret);
+         } else {
+             qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
+             ret = global_state_store();
+@@ -253,40 +282,20 @@ static void process_savevm_coro(void *opaque)
+                 save_snapshot_error("global_state_store error %d", ret);
+                 break;
+             }
+-            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+-            if (ret < 0) {
+-                save_snapshot_error("vm_stop_force_state error %d", ret);
+-                break;
+-            }
+-            DPRINTF("savevm inerate finished\n");
+-            /* upstream made the return value here inconsistent
+-             * (-1 instead of 'ret' in one case and 0 after flush which can
+-             * still set a file error...)
+-             */
+-            (void)qemu_savevm_state_complete_precopy(snap_state.file, false, false);
+-            ret = qemu_file_get_error(snap_state.file);
+-            if (ret < 0) {
+-                    save_snapshot_error("qemu_savevm_state_iterate error %d", ret);
+-                    break;
+-            }
+-            DPRINTF("save complete\n");
++
++            DPRINTF("savevm iterate complete\n");
+             break;
+         }
+     }
+-    qemu_bh_schedule(snap_state.cleanup_bh);
+-
+-out:
+-    /* set migration state accordingly and clear soon-to-be stale file */
+-    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
+-                      ret ? MIGRATION_STATUS_FAILED : MIGRATION_STATUS_COMPLETED);
+-    ms->to_dst_file = NULL;
++    qemu_bh_schedule(snap_state.finalize_bh);
+ }
+ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
+ {
+     Error *local_err = NULL;
+     MigrationState *ms = migrate_get_current();
++    AioContext *iohandler_ctx = iohandler_get_aio_context();
+     int bdrv_oflags = BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH;
+@@ -347,7 +356,7 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
+     /*
+      * qemu_savevm_* paths use migration code and expect a migration state.
+-     * State is cleared in process_savevm_thread, but has to be initialized
++     * State is cleared in process_savevm_co, but has to be initialized
+      * here (blocking main thread, from QMP) to avoid race conditions.
+      */
+     migrate_init(ms);
+@@ -358,13 +367,19 @@ void qmp_savevm_start(bool has_statefile, const char *statefile, Error **errp)
+     blk_op_block_all(snap_state.target, snap_state.blocker);
+     snap_state.state = SAVE_STATE_ACTIVE;
+-    snap_state.cleanup_bh = qemu_bh_new(process_savevm_cleanup, &snap_state);
+-    snap_state.co = qemu_coroutine_create(&process_savevm_coro, NULL);
++    snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
++    snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
+     qemu_mutex_unlock_iothread();
+     qemu_savevm_state_header(snap_state.file);
+     qemu_savevm_state_setup(snap_state.file);
+     qemu_mutex_lock_iothread();
+-    aio_co_schedule(iohandler_get_aio_context(), snap_state.co);
++
++    /* Async processing from here on out happens in iohandler context, so let
++     * the target bdrv have its home there.
++     */
++    blk_set_aio_context(snap_state.target, iohandler_ctx, &local_err);
++
++    aio_co_schedule(iohandler_ctx, snap_state.co);
+     return;
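
A condensed sketch of the control flow after this patch (not part of the commit; the names are taken from the diff above, with error handling and details elided):

/* Coroutine scheduled into the iohandler AioContext: it iterates the
 * migration stream while the guest keeps running, then hands off. */
static void coroutine_fn process_savevm_co(void *opaque)
{
    /* ... qemu_savevm_state_iterate() loop, global_state_store() ... */
    qemu_bh_schedule(snap_state.finalize_bh);   /* continue in the main loop */
}

/* Bottom half running in the main AioContext: the VM is stopped here,
 * so this is where the snapshot downtime begins. */
static void process_savevm_finalize(void *opaque)
{
    /* take the target bdrv back from the iohandler context */
    blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);

    vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
    qemu_savevm_state_complete_precopy(snap_state.file, false, false);
    /* ... clear migration state, clean up, resume the VM if needed ... */
}
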
diff --git a/debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch b/debian/patches/pve/0046-util-async-Add-aio_co_reschedule_self.patch
new file mode 100644 (file)
index 0000000..6791261
--- /dev/null
@@ -0,0 +1,86 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Kevin Wolf <kwolf@redhat.com>
+Date: Wed, 27 May 2020 11:33:20 +0200
+Subject: [PATCH] util/async: Add aio_co_reschedule_self()
+
+From: Kevin Wolf <kwolf@redhat.com>
+
+Add a function that can be used to move the currently running coroutine
+to a different AioContext (and therefore potentially a different
+thread).
+
+Signed-off-by: Kevin Wolf <kwolf@redhat.com>
+---
+ include/block/aio.h | 10 ++++++++++
+ util/async.c        | 30 ++++++++++++++++++++++++++++++
+ 2 files changed, 40 insertions(+)
+
+diff --git a/include/block/aio.h b/include/block/aio.h
+index 62ed954344..d5399c67d6 100644
+--- a/include/block/aio.h
++++ b/include/block/aio.h
+@@ -17,6 +17,7 @@
+ #ifdef CONFIG_LINUX_IO_URING
+ #include <liburing.h>
+ #endif
++#include "qemu/coroutine.h"
+ #include "qemu/queue.h"
+ #include "qemu/event_notifier.h"
+ #include "qemu/thread.h"
+@@ -654,6 +655,15 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
+  */
+ void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
++/**
++ * aio_co_reschedule_self:
++ * @new_ctx: the new context
++ *
++ * Move the currently running coroutine to new_ctx. If the coroutine is already
++ * running in new_ctx, do nothing.
++ */
++void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
++
+ /**
+  * aio_co_wake:
+  * @co: the coroutine
+diff --git a/util/async.c b/util/async.c
+index 3165a28f2f..4eba1e6f1b 100644
+--- a/util/async.c
++++ b/util/async.c
+@@ -558,6 +558,36 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co)
+     aio_context_unref(ctx);
+ }
++typedef struct AioCoRescheduleSelf {
++    Coroutine *co;
++    AioContext *new_ctx;
++} AioCoRescheduleSelf;
++
++static void aio_co_reschedule_self_bh(void *opaque)
++{
++    AioCoRescheduleSelf *data = opaque;
++    aio_co_schedule(data->new_ctx, data->co);
++}
++
++void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
++{
++    AioContext *old_ctx = qemu_get_current_aio_context();
++
++    if (old_ctx != new_ctx) {
++        AioCoRescheduleSelf data = {
++            .co = qemu_coroutine_self(),
++            .new_ctx = new_ctx,
++        };
++        /*
++         * We can't directly schedule the coroutine in the target context
++         * because this would be racy: The other thread could try to enter the
++         * coroutine before it has yielded in this one.
++         */
++        aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
++        qemu_coroutine_yield();
++    }
++}
++
+ void aio_co_wake(struct Coroutine *co)
+ {
+     AioContext *ctx;
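
A minimal usage sketch for the new helper (run_in_ctx is a hypothetical wrapper, not part of the patch; it assumes coroutine context and QEMU's existing AioContext API):

/* Run fn(opaque) in ctx, then return to the context we started in.
 * Each aio_co_reschedule_self() call yields and resumes in the new context. */
static void coroutine_fn run_in_ctx(AioContext *ctx, void (*fn)(void *), void *opaque)
{
    AioContext *old_ctx = qemu_get_current_aio_context();

    aio_co_reschedule_self(ctx);      /* now running in ctx's thread */
    fn(opaque);
    aio_co_reschedule_self(old_ctx);  /* hop back to where we came from */
}

Patch 0047 below applies exactly this pattern to flush drives inside their own IOThread context.
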
diff --git a/debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch b/debian/patches/pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch
new file mode 100644 (file)
index 0000000..b68ec1f
--- /dev/null
@@ -0,0 +1,58 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Wed, 27 May 2020 11:33:21 +0200
+Subject: [PATCH] savevm-async: flush IOThread-drives async before entering
+ blocking part
+
+By flushing all drives where it's possible to do so before entering the
+blocking part (where the VM is stopped), we can reduce the time spent in
+that part for every disk that has an IOThread (other drives cannot be
+flushed asynchronously anyway).
+
+Suggested-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+---
+ savevm-async.c | 23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
+
+diff --git a/savevm-async.c b/savevm-async.c
+index 2894c94233..4ce83a0691 100644
+--- a/savevm-async.c
++++ b/savevm-async.c
+@@ -253,6 +253,8 @@ static void coroutine_fn process_savevm_co(void *opaque)
+ {
+     int ret;
+     int64_t maxlen;
++    BdrvNextIterator it;
++    BlockDriverState *bs = NULL;
+     ret = qemu_file_get_error(snap_state.file);
+     if (ret < 0) {
+@@ -288,6 +290,27 @@ static void coroutine_fn process_savevm_co(void *opaque)
+         }
+     }
++    /* If a drive runs in an IOThread we can flush it async, and only
++     * need to sync-flush whatever IO happens between now and
++     * vm_stop_force_state. bdrv_next can only be called from main AioContext,
++     * so move there now and after every flush.
++     */
++    aio_co_reschedule_self(qemu_get_aio_context());
++    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
++        /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
++        if (bs == blk_bs(snap_state.target)) {
++            continue;
++        }
++
++        AioContext *bs_ctx = bdrv_get_aio_context(bs);
++        if (bs_ctx != qemu_get_aio_context()) {
++            DPRINTF("savevm: async flushing drive %s\n", bs->filename);
++            aio_co_reschedule_self(bs_ctx);
++            bdrv_flush(bs);
++            aio_co_reschedule_self(qemu_get_aio_context());
++        }
++    }
++
+     qemu_bh_schedule(snap_state.finalize_bh);
+ }
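
The condition deciding which drives get the asynchronous pre-flush boils down to the following predicate (hypothetical helper name, condensed from the hunk above):

/* A drive qualifies iff it lives in its own IOThread (its AioContext differs
 * from the main one) and is not the snapshot target, which is opened with
 * BDRV_O_NO_FLUSH and therefore never flushed. */
static bool needs_async_preflush(BlockDriverState *bs)
{
    return bs != blk_bs(snap_state.target) &&
           bdrv_get_aio_context(bs) != qemu_get_aio_context();
}
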
diff --git a/debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch b/debian/patches/pve/0048-savevm-async-add-debug-timing-prints.patch
new file mode 100644 (file)
index 0000000..68b7635
--- /dev/null
@@ -0,0 +1,80 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Wed, 27 May 2020 11:33:22 +0200
+Subject: [PATCH] savevm-async: add debug timing prints
+
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+[ Thomas: guard variable declaration by DEBUG #ifdef ]
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ savevm-async.c | 22 ++++++++++++++++++++++
+ 1 file changed, 22 insertions(+)
+
+diff --git a/savevm-async.c b/savevm-async.c
+index 4ce83a0691..0388cebbe9 100644
+--- a/savevm-async.c
++++ b/savevm-async.c
+@@ -202,6 +202,10 @@ static void process_savevm_finalize(void *opaque)
+     AioContext *iohandler_ctx = iohandler_get_aio_context();
+     MigrationState *ms = migrate_get_current();
++#ifdef DEBUG_SAVEVM_STATE
++    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
++#endif
++
+     qemu_bh_delete(snap_state.finalize_bh);
+     snap_state.finalize_bh = NULL;
+     snap_state.co = NULL;
+@@ -226,6 +230,8 @@ static void process_savevm_finalize(void *opaque)
+     }
+     DPRINTF("state saving complete\n");
++    DPRINTF("timing: process_savevm_finalize (state saving) took %ld ms\n",
++        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+     /* clear migration state */
+     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP,
+@@ -247,6 +253,9 @@ static void process_savevm_finalize(void *opaque)
+         vm_start();
+         snap_state.saved_vm_running = false;
+     }
++
++    DPRINTF("timing: process_savevm_finalize (full) took %ld ms\n",
++        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
+ }
+ static void coroutine_fn process_savevm_co(void *opaque)
+@@ -256,6 +265,10 @@ static void coroutine_fn process_savevm_co(void *opaque)
+     BdrvNextIterator it;
+     BlockDriverState *bs = NULL;
++#ifdef DEBUG_SAVEVM_STATE
++    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
++#endif
++
+     ret = qemu_file_get_error(snap_state.file);
+     if (ret < 0) {
+         save_snapshot_error("qemu_savevm_state_setup failed");
+@@ -290,6 +303,12 @@ static void coroutine_fn process_savevm_co(void *opaque)
+         }
+     }
++    DPRINTF("timing: process_savevm_co took %ld ms\n",
++        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
++
++#ifdef DEBUG_SAVEVM_STATE
++    int64_t start_time_flush = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
++#endif
+     /* If a drive runs in an IOThread we can flush it async, and only
+      * need to sync-flush whatever IO happens between now and
+      * vm_stop_force_state. bdrv_next can only be called from main AioContext,
+@@ -311,6 +330,9 @@ static void coroutine_fn process_savevm_co(void *opaque)
+         }
+     }
++    DPRINTF("timing: async flushing took %ld ms\n",
++        qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time_flush);
++
+     qemu_bh_schedule(snap_state.finalize_bh);
+ }
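
The guard pattern mentioned in the bracketed note above, shown in isolation (a sketch only; it assumes DPRINTF() discards its arguments when DEBUG_SAVEVM_STATE is not defined, as the non-debug stub in savevm-async.c is expected to do):

static void timed_step_sketch(void)
{
#ifdef DEBUG_SAVEVM_STATE
    /* only declared in debug builds, hence the #ifdef around it */
    int64_t start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
#endif

    /* ... the work being timed ... */

    DPRINTF("timing: step took %ld ms\n",
            qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - start_time);
}
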
diff --git a/debian/patches/series b/debian/patches/series
index 670b744be11bc907616913d9de5c3aea5fe3a967..e7345cebfcd117dd9c62caecc82d2a27d4923762 100644 (file)
@@ -42,3 +42,7 @@ pve/0041-PVE-Backup-avoid-use-QemuRecMutex-inside-coroutines.patch
 pve/0042-PVE-Backup-use-QemuMutex-instead-of-QemuRecMutex.patch
 pve/0043-move-savevm-async-back-into-a-coroutine.patch
 pve/0044-add-optional-buffer-size-to-QEMUFile.patch
+pve/0045-savevm-async-move-more-code-to-cleanup-and-rename-to.patch
+pve/0046-util-async-Add-aio_co_reschedule_self.patch
+pve/0047-savevm-async-flush-IOThread-drives-async-before-ente.patch
+pve/0048-savevm-async-add-debug-timing-prints.patch