--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Mon, 28 Sep 2020 13:40:51 +0200
+Subject: [PATCH] PVE-Backup: Don't block on finishing and cleanup
+ create_backup_jobs
+
+proxmox_backup_co_finish is already async, but previously we would wait
+for the coroutine using block_on_coroutine_fn(). Avoid this by
+scheduling pvebackup_co_complete_stream (and thus pvebackup_co_cleanup)
+as a real coroutine when calling from pvebackup_complete_cb. This is ok,
+since complete_stream uses the backup_mutex internally to synchronize,
+and other streams can happily continue writing in the meantime anyway.
+
+To accomodate, backup_mutex is converted to a CoMutex. This means
+converting every user to a coroutine. This is not just useful here, but
+will come in handy once this series[0] is merged, and QMP calls can be
+yield-able coroutines too. Then we can also finally get rid of
+block_on_coroutine_fn.
+
+Cases of aio_context_acquire/release from within what is now a coroutine
+are changed to aio_co_reschedule_self, which works since a running
+coroutine always holds the aio lock for the context it is running in.
+
+job_cancel_sync is called from a BH since it can't be run from a
+coroutine (uses AIO_WAIT_WHILE internally).
+
+Same thing for create_backup_jobs, which is converted to a BH too.
+
+To communicate the finishing state, a new property is introduced to
+query-backup: 'finishing'. A new state is explicitly not used, since
+that would break compatibility with older qemu-server versions.
+
+Also fix create_backup_jobs:
+
+No more weird bool returns, just the standard "errp" format used
+everywhere else too. With this, if backup_job_create fails, the error
+message is actually returned over QMP and can be shown to the user.
+
+To facilitate correct cleanup on such an error, we call
+create_backup_jobs as a bottom half directly from pvebackup_co_prepare.
+This additionally allows us to actually hold the backup_mutex during
+operation.
+
+Also add a job_cancel_sync before job_unref, since a job must be in
+STATUS_NULL to be deleted by unref, which could trigger an assert
+before.
+
+[0] https://lists.gnu.org/archive/html/qemu-devel/2020-09/msg03515.html
+
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+[add new force parameter to job_cancel_sync calls]
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+---
+ pve-backup.c | 217 ++++++++++++++++++++++++++++---------------
+ qapi/block-core.json | 5 +-
+ 2 files changed, 144 insertions(+), 78 deletions(-)
+
+diff --git a/pve-backup.c b/pve-backup.c
+index 63c686463f..6f05796fad 100644
+--- a/pve-backup.c
++++ b/pve-backup.c
+@@ -33,7 +33,9 @@ const char *PBS_BITMAP_NAME = "pbs-incremental-dirty-bitmap";
+
+ static struct PVEBackupState {
+ struct {
+- // Everithing accessed from qmp_backup_query command is protected using lock
++ // Everything accessed from qmp_backup_query command is protected using
++ // this lock. Do NOT hold this lock for long times, as it is sometimes
++ // acquired from coroutines, and thus any wait time may block the guest.
+ QemuMutex lock;
+ Error *error;
+ time_t start_time;
+@@ -47,20 +49,22 @@ static struct PVEBackupState {
+ size_t reused;
+ size_t zero_bytes;
+ GList *bitmap_list;
++ bool finishing;
++ bool starting;
+ } stat;
+ int64_t speed;
+ VmaWriter *vmaw;
+ ProxmoxBackupHandle *pbs;
+ GList *di_list;
+ JobTxn *txn;
+- QemuMutex backup_mutex;
++ CoMutex backup_mutex;
+ CoMutex dump_callback_mutex;
+ } backup_state;
+
+ static void pvebackup_init(void)
+ {
+ qemu_mutex_init(&backup_state.stat.lock);
+- qemu_mutex_init(&backup_state.backup_mutex);
++ qemu_co_mutex_init(&backup_state.backup_mutex);
+ qemu_co_mutex_init(&backup_state.dump_callback_mutex);
+ }
+
+@@ -72,6 +76,7 @@ typedef struct PVEBackupDevInfo {
+ size_t size;
+ uint64_t block_size;
+ uint8_t dev_id;
++ int completed_ret; // INT_MAX if not completed
+ char targetfile[PATH_MAX];
+ BdrvDirtyBitmap *bitmap;
+ BlockDriverState *target;
+@@ -227,12 +232,12 @@ pvebackup_co_dump_vma_cb(
+ }
+
+ // assumes the caller holds backup_mutex
+-static void coroutine_fn pvebackup_co_cleanup(void *unused)
++static void coroutine_fn pvebackup_co_cleanup(void)
+ {
+ assert(qemu_in_coroutine());
+
+ qemu_mutex_lock(&backup_state.stat.lock);
+- backup_state.stat.end_time = time(NULL);
++ backup_state.stat.finishing = true;
+ qemu_mutex_unlock(&backup_state.stat.lock);
+
+ if (backup_state.vmaw) {
+@@ -261,35 +266,29 @@ static void coroutine_fn pvebackup_co_cleanup(void *unused)
+
+ g_list_free(backup_state.di_list);
+ backup_state.di_list = NULL;
++
++ qemu_mutex_lock(&backup_state.stat.lock);
++ backup_state.stat.end_time = time(NULL);
++ backup_state.stat.finishing = false;
++ qemu_mutex_unlock(&backup_state.stat.lock);
+ }
+
+-// assumes the caller holds backup_mutex
+-static void coroutine_fn pvebackup_complete_stream(void *opaque)
++static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
+ {
+ PVEBackupDevInfo *di = opaque;
++ int ret = di->completed_ret;
+
+- bool error_or_canceled = pvebackup_error_or_canceled();
+-
+- if (backup_state.vmaw) {
+- vma_writer_close_stream(backup_state.vmaw, di->dev_id);
++ qemu_mutex_lock(&backup_state.stat.lock);
++ bool starting = backup_state.stat.starting;
++ qemu_mutex_unlock(&backup_state.stat.lock);
++ if (starting) {
++ /* in 'starting' state, no tasks have been run yet, meaning we can (and
++ * must) skip all cleanup, as we don't know what has and hasn't been
++ * initialized yet. */
++ return;
+ }
+
+- if (backup_state.pbs && !error_or_canceled) {
+- Error *local_err = NULL;
+- proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err);
+- if (local_err != NULL) {
+- pvebackup_propagate_error(local_err);
+- }
+- }
+-}
+-
+-static void pvebackup_complete_cb(void *opaque, int ret)
+-{
+- assert(!qemu_in_coroutine());
+-
+- PVEBackupDevInfo *di = opaque;
+-
+- qemu_mutex_lock(&backup_state.backup_mutex);
++ qemu_co_mutex_lock(&backup_state.backup_mutex);
+
+ if (ret < 0) {
+ Error *local_err = NULL;
+@@ -301,7 +300,19 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+
+ assert(di->target == NULL);
+
+- block_on_coroutine_fn(pvebackup_complete_stream, di);
++ bool error_or_canceled = pvebackup_error_or_canceled();
++
++ if (backup_state.vmaw) {
++ vma_writer_close_stream(backup_state.vmaw, di->dev_id);
++ }
++
++ if (backup_state.pbs && !error_or_canceled) {
++ Error *local_err = NULL;
++ proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err);
++ if (local_err != NULL) {
++ pvebackup_propagate_error(local_err);
++ }
++ }
+
+ // remove self from job list
+ backup_state.di_list = g_list_remove(backup_state.di_list, di);
+@@ -310,21 +321,49 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+
+ /* call cleanup if we're the last job */
+ if (!g_list_first(backup_state.di_list)) {
+- block_on_coroutine_fn(pvebackup_co_cleanup, NULL);
++ pvebackup_co_cleanup();
+ }
+
+- qemu_mutex_unlock(&backup_state.backup_mutex);
++ qemu_co_mutex_unlock(&backup_state.backup_mutex);
+ }
+
+-static void pvebackup_cancel(void)
++static void pvebackup_complete_cb(void *opaque, int ret)
+ {
+- assert(!qemu_in_coroutine());
++ PVEBackupDevInfo *di = opaque;
++ di->completed_ret = ret;
++
++ /*
++ * Schedule stream cleanup in async coroutine. close_image and finish might
++ * take a while, so we can't block on them here. This way it also doesn't
++ * matter if we're already running in a coroutine or not.
++ * Note: di is a pointer to an entry in the global backup_state struct, so
++ * it stays valid.
++ */
++ Coroutine *co = qemu_coroutine_create(pvebackup_co_complete_stream, di);
++ aio_co_enter(qemu_get_aio_context(), co);
++}
++
++/*
++ * job_cancel(_sync) does not like to be called from coroutines, so defer to
++ * main loop processing via a bottom half.
++ */
++static void job_cancel_bh(void *opaque) {
++ CoCtxData *data = (CoCtxData*)opaque;
++ Job *job = (Job*)data->data;
++ AioContext *job_ctx = job->aio_context;
++ aio_context_acquire(job_ctx);
++ job_cancel_sync(job, true);
++ aio_context_release(job_ctx);
++ aio_co_enter(data->ctx, data->co);
++}
+
++static void coroutine_fn pvebackup_co_cancel(void *opaque)
++{
+ Error *cancel_err = NULL;
+ error_setg(&cancel_err, "backup canceled");
+ pvebackup_propagate_error(cancel_err);
+
+- qemu_mutex_lock(&backup_state.backup_mutex);
++ qemu_co_mutex_lock(&backup_state.backup_mutex);
+
+ if (backup_state.vmaw) {
+ /* make sure vma writer does not block anymore */
+@@ -342,27 +381,22 @@ static void pvebackup_cancel(void)
+ ((PVEBackupDevInfo *)bdi->data)->job :
+ NULL;
+
+- /* ref the job before releasing the mutex, just to be safe */
+ if (cancel_job) {
+- job_ref(&cancel_job->job);
++ CoCtxData data = {
++ .ctx = qemu_get_current_aio_context(),
++ .co = qemu_coroutine_self(),
++ .data = &cancel_job->job,
++ };
++ aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
++ qemu_coroutine_yield();
+ }
+
+- /* job_cancel_sync may enter the job, so we need to release the
+- * backup_mutex to avoid deadlock */
+- qemu_mutex_unlock(&backup_state.backup_mutex);
+-
+- if (cancel_job) {
+- AioContext *aio_context = cancel_job->job.aio_context;
+- aio_context_acquire(aio_context);
+- job_cancel_sync(&cancel_job->job, true);
+- job_unref(&cancel_job->job);
+- aio_context_release(aio_context);
+- }
++ qemu_co_mutex_unlock(&backup_state.backup_mutex);
+ }
+
+ void qmp_backup_cancel(Error **errp)
+ {
+- pvebackup_cancel();
++ block_on_coroutine_fn(pvebackup_co_cancel, NULL);
+ }
+
+ // assumes the caller holds backup_mutex
+@@ -415,10 +449,18 @@ static int coroutine_fn pvebackup_co_add_config(
+ goto out;
+ }
+
+-static bool create_backup_jobs(void) {
++/*
++ * backup_job_create can *not* be run from a coroutine (and requires an
++ * acquired AioContext), so this can't either.
++ * The caller is responsible that backup_mutex is held nonetheless.
++ */
++static void create_backup_jobs_bh(void *opaque) {
+
+ assert(!qemu_in_coroutine());
+
++ CoCtxData *data = (CoCtxData*)opaque;
++ Error **errp = (Error**)data->data;
++
+ Error *local_err = NULL;
+
+ /* create job transaction to synchronize bitmap commit and cancel all
+@@ -454,24 +496,19 @@ static bool create_backup_jobs(void) {
+
+ aio_context_release(aio_context);
+
+- if (!job || local_err != NULL) {
+- Error *create_job_err = NULL;
+- error_setg(&create_job_err, "backup_job_create failed: %s",
+- local_err ? error_get_pretty(local_err) : "null");
++ di->job = job;
+
+- pvebackup_propagate_error(create_job_err);
++ if (!job || local_err) {
++ error_setg(errp, "backup_job_create failed: %s",
++ local_err ? error_get_pretty(local_err) : "null");
+ break;
+ }
+
+- di->job = job;
+-
+ bdrv_unref(di->target);
+ di->target = NULL;
+ }
+
+- bool errors = pvebackup_error_or_canceled();
+-
+- if (errors) {
++ if (*errp) {
+ l = backup_state.di_list;
+ while (l) {
+ PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
+@@ -483,12 +520,17 @@ static bool create_backup_jobs(void) {
+ }
+
+ if (di->job) {
++ AioContext *ctx = di->job->job.aio_context;
++ aio_context_acquire(ctx);
++ job_cancel_sync(&di->job->job, true);
+ job_unref(&di->job->job);
++ aio_context_release(ctx);
+ }
+ }
+ }
+
+- return errors;
++ /* return */
++ aio_co_enter(data->ctx, data->co);
+ }
+
+ typedef struct QmpBackupTask {
+@@ -525,11 +567,12 @@ typedef struct QmpBackupTask {
+ UuidInfo *result;
+ } QmpBackupTask;
+
+-// assumes the caller holds backup_mutex
+ static void coroutine_fn pvebackup_co_prepare(void *opaque)
+ {
+ assert(qemu_in_coroutine());
+
++ qemu_co_mutex_lock(&backup_state.backup_mutex);
++
+ QmpBackupTask *task = opaque;
+
+ task->result = NULL; // just to be sure
+@@ -550,8 +593,9 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
+ const char *firewall_name = "qemu-server.fw";
+
+ if (backup_state.di_list) {
+- error_set(task->errp, ERROR_CLASS_GENERIC_ERROR,
++ error_set(task->errp, ERROR_CLASS_GENERIC_ERROR,
+ "previous backup not finished");
++ qemu_co_mutex_unlock(&backup_state.backup_mutex);
+ return;
+ }
+
+@@ -618,6 +662,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
+ }
+ di->size = size;
+ total += size;
++
++ di->completed_ret = INT_MAX;
+ }
+
+ uuid_generate(uuid);
+@@ -849,6 +895,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
+ backup_state.stat.dirty = total - backup_state.stat.reused;
+ backup_state.stat.transferred = 0;
+ backup_state.stat.zero_bytes = 0;
++ backup_state.stat.finishing = false;
++ backup_state.stat.starting = true;
+
+ qemu_mutex_unlock(&backup_state.stat.lock);
+
+@@ -863,6 +911,33 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
+ uuid_info->UUID = uuid_str;
+
+ task->result = uuid_info;
++
++ /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep
++ * backup_mutex locked. This is fine, a CoMutex can be held across yield
++ * points, and we'll release it as soon as the BH reschedules us.
++ */
++ CoCtxData waker = {
++ .co = qemu_coroutine_self(),
++ .ctx = qemu_get_current_aio_context(),
++ .data = &local_err,
++ };
++ aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker);
++ qemu_coroutine_yield();
++
++ if (local_err) {
++ error_propagate(task->errp, local_err);
++ goto err;
++ }
++
++ qemu_co_mutex_unlock(&backup_state.backup_mutex);
++
++ qemu_mutex_lock(&backup_state.stat.lock);
++ backup_state.stat.starting = false;
++ qemu_mutex_unlock(&backup_state.stat.lock);
++
++ /* start the first job in the transaction */
++ job_txn_start_seq(backup_state.txn);
++
+ return;
+
+ err_mutex:
+@@ -885,6 +960,7 @@ err:
+ g_free(di);
+ }
+ g_list_free(di_list);
++ backup_state.di_list = NULL;
+
+ if (devs) {
+ g_strfreev(devs);
+@@ -905,6 +981,8 @@ err:
+ }
+
+ task->result = NULL;
++
++ qemu_co_mutex_unlock(&backup_state.backup_mutex);
+ return;
+ }
+
+@@ -958,24 +1036,8 @@ UuidInfo *qmp_backup(
+ .errp = errp,
+ };
+
+- qemu_mutex_lock(&backup_state.backup_mutex);
+-
+ block_on_coroutine_fn(pvebackup_co_prepare, &task);
+
+- if (*errp == NULL) {
+- bool errors = create_backup_jobs();
+- qemu_mutex_unlock(&backup_state.backup_mutex);
+-
+- if (!errors) {
+- /* start the first job in the transaction
+- * note: this might directly enter the job, so we need to do this
+- * after unlocking the backup_mutex */
+- job_txn_start_seq(backup_state.txn);
+- }
+- } else {
+- qemu_mutex_unlock(&backup_state.backup_mutex);
+- }
+-
+ return task.result;
+ }
+
+@@ -1027,6 +1089,7 @@ BackupStatus *qmp_query_backup(Error **errp)
+ info->transferred = backup_state.stat.transferred;
+ info->has_reused = true;
+ info->reused = backup_state.stat.reused;
++ info->finishing = backup_state.stat.finishing;
+
+ qemu_mutex_unlock(&backup_state.stat.lock);
+
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index 16e184dd28..cb17d00fe0 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -770,12 +770,15 @@
+ #
+ # @uuid: uuid for this backup job
+ #
++# @finishing: if status='active' and finishing=true, then the backup process is
++# waiting for the target to finish.
++#
+ ##
+ { 'struct': 'BackupStatus',
+ 'data': {'*status': 'str', '*errmsg': 'str', '*total': 'int', '*dirty': 'int',
+ '*transferred': 'int', '*zero-bytes': 'int', '*reused': 'int',
+ '*start-time': 'int', '*end-time': 'int',
+- '*backup-file': 'str', '*uuid': 'str' } }
++ '*backup-file': 'str', '*uuid': 'str', 'finishing': 'bool' } }
+
+ ##
+ # @BackupFormat: