git.proxmox.com Git - pve-qemu.git/commitdiff
improve qemu backup by reducing lock contention
author Dietmar Maurer <dietmar@proxmox.com>
Tue, 18 Feb 2020 09:47:21 +0000 (10:47 +0100)
committer Dietmar Maurer <dietmar@proxmox.com>
Tue, 18 Feb 2020 09:47:21 +0000 (10:47 +0100)
- reduce lock contention by using CoRwlock
- correctly call aio_wait_kick()
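
The patches below move the fields read by qmp_query_backup into a backup_state.stat sub-struct guarded by a CoRwlock, so the query path only takes a shared read lock and writers hold the write lock just long enough to update counters or set an error. A minimal sketch of that read-mostly pattern, assuming QEMU's coroutine locking API from qemu/coroutine.h (everything except the lock functions is illustrative, not the exact Proxmox code):

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/coroutine.h"

/* Hypothetical, trimmed-down stat block; the real one also holds
 * uuid, timestamps and the backup file name. */
static struct {
    CoRwlock rwlock;     /* initialized once with qemu_co_rwlock_init() */
    Error *error;
    size_t transferred;
} stat;

/* Reader (status query): a shared read lock is enough. */
static size_t coroutine_fn query_transferred(void)
{
    qemu_co_rwlock_rdlock(&stat.rwlock);
    size_t transferred = stat.transferred;
    qemu_co_rwlock_unlock(&stat.rwlock);
    return transferred;
}

/* Writer on the error path: check under the read lock and upgrade to
 * the write lock only in the rare case the error actually gets set. */
static void coroutine_fn report_error_once(const char *msg)
{
    qemu_co_rwlock_rdlock(&stat.rwlock);
    if (!stat.error) {
        qemu_co_rwlock_upgrade(&stat.rwlock);
        error_setg(&stat.error, "%s", msg);
    }
    qemu_co_rwlock_unlock(&stat.rwlock);
}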

debian/patches/pve/0048-PVE-backup-use-separate-CoRwlock-for-data-accessed-b.patch [new file with mode: 0644]
debian/patches/pve/0049-PVE-backup-block_on_coroutine_wrapper-call-aio_wait.patch [new file with mode: 0644]
debian/patches/pve/0050-PVE-backup-move-backup_state.cancel-to-backup_state.patch [new file with mode: 0644]
debian/patches/series

diff --git a/debian/patches/pve/0048-PVE-backup-use-separate-CoRwlock-for-data-accessed-b.patch b/debian/patches/pve/0048-PVE-backup-use-separate-CoRwlock-for-data-accessed-b.patch
new file mode 100644
index 0000000..f8b368e
--- /dev/null
@@ -0,0 +1,365 @@
+From 4c2e4e498716b8a556fccec20fb1348555c2b189 Mon Sep 17 00:00:00 2001
+From: Dietmar Maurer <dietmar@proxmox.com>
+Date: Fri, 14 Feb 2020 10:21:37 +0100
+Subject: [PATCH qemu 1/7] PVE backup: use separate CoRwlock for data accessed
+ by qmp query_backup
+
+This should minimize contention between query_backup and running backup jobs.
+---
+ blockdev.c | 178 +++++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 112 insertions(+), 66 deletions(-)
+
+diff --git a/blockdev.c b/blockdev.c
+index 46e8a2780a..4367eaf2a7 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -3203,22 +3203,26 @@ static void block_on_coroutine_fn(CoroutineEntry *entry, void *entry_arg)
+     AIO_WAIT_WHILE(ctx, !wrapper.finished);
+ }
++
+ static struct PVEBackupState {
+-    Error *error;
++    struct {
++        CoRwlock rwlock;
++        Error *error;
++        time_t start_time;
++        time_t end_time;
++        char *backup_file;
++        uuid_t uuid;
++        char uuid_str[37];
++        size_t total;
++        size_t transferred;
++        size_t zero_bytes;
++    } stat;
+     bool cancel;
+-    uuid_t uuid;
+-    char uuid_str[37];
+     int64_t speed;
+-    time_t start_time;
+-    time_t end_time;
+-    char *backup_file;
+     VmaWriter *vmaw;
+     GList *di_list;
+-    size_t total;
+-    size_t transferred;
+-    size_t zero_bytes;
+     CoMutex backup_mutex;
+-    bool      backup_mutex_initialized;
++    bool mutex_initialized;
+ } backup_state;
+ typedef struct PVEBackupDevInfo {
+@@ -3251,11 +3255,14 @@ static int coroutine_fn pvebackup_co_dump_cb(void *opaque, BlockBackend *target,
+     uint64_t cluster_num = start / VMA_CLUSTER_SIZE;
+     if ((cluster_num * VMA_CLUSTER_SIZE) != start) {
+-        if (!backup_state.error) {
+-            error_setg(&backup_state.error,
++        qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++        if (!backup_state.stat.error) {
++            qemu_co_rwlock_upgrade(&backup_state.stat.rwlock);
++            error_setg(&backup_state.stat.error,
+                        "got unaligned write inside backup dump "
+                        "callback (sector %ld)", start);
+         }
++        qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+         qemu_co_mutex_unlock(&backup_state.backup_mutex);
+         return -1; // not aligned to cluster size
+     }
+@@ -3273,27 +3280,35 @@ static int coroutine_fn pvebackup_co_dump_cb(void *opaque, BlockBackend *target,
+                 buf += VMA_CLUSTER_SIZE;
+             }
+             if (ret < 0) {
+-                if (!backup_state.error) {
+-                    vma_writer_error_propagate(backup_state.vmaw, &backup_state.error);
++                qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++                if (!backup_state.stat.error) {
++                    qemu_co_rwlock_upgrade(&backup_state.stat.rwlock);
++                    vma_writer_error_propagate(backup_state.vmaw, &backup_state.stat.error);
+                 }
++                qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
++
+                 qemu_co_mutex_unlock(&backup_state.backup_mutex);
+                 return ret;
+             } else {
+-                backup_state.zero_bytes += zero_bytes;
++                qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
++                backup_state.stat.zero_bytes += zero_bytes;
+                 if (remaining >= VMA_CLUSTER_SIZE) {
+-                    backup_state.transferred += VMA_CLUSTER_SIZE;
++                    backup_state.stat.transferred += VMA_CLUSTER_SIZE;
+                     remaining -= VMA_CLUSTER_SIZE;
+                 } else {
+-                    backup_state.transferred += remaining;
++                    backup_state.stat.transferred += remaining;
+                     remaining = 0;
+                 }
++                qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+             }
+         }
+     } else {
++        qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
+         if (!buf) {
+-            backup_state.zero_bytes += size;
++            backup_state.stat.zero_bytes += size;
+         }
+-        backup_state.transferred += size;
++        backup_state.stat.transferred += size;
++        qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+     }
+     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+@@ -3313,12 +3328,20 @@ static void coroutine_fn pvebackup_co_cleanup(void)
+         return;
+     }
+-    backup_state.end_time = time(NULL);
++    qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
++    backup_state.stat.end_time = time(NULL);
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+     if (backup_state.vmaw) {
+         Error *local_err = NULL;
+         vma_writer_close(backup_state.vmaw, &local_err);
+-        error_propagate(&backup_state.error, local_err);
++
++        if (local_err != NULL) {
++            qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
++            error_propagate(&backup_state.stat.error, local_err);
++            qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
++        }
++
+         backup_state.vmaw = NULL;
+     }
+@@ -3345,10 +3368,14 @@ static void coroutine_fn pvebackup_co_complete_cb(void *opaque)
+     di->completed = true;
+-    if (ret < 0 && !backup_state.error) {
+-        error_setg(&backup_state.error, "job failed with err %d - %s",
++    qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++
++    if (ret < 0 && !backup_state.stat.error) {
++        qemu_co_rwlock_upgrade(&backup_state.stat.rwlock);
++        error_setg(&backup_state.stat.error, "job failed with err %d - %s",
+                    ret, strerror(-ret));
+     }
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+     di->bs = NULL;
+     di->target = NULL;
+@@ -3401,9 +3428,12 @@ static void coroutine_fn pvebackup_co_cancel(void *opaque)
+         return;
+     }
+-    if (!backup_state.error) {
+-        error_setg(&backup_state.error, "backup cancelled");
++    qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++    if (!backup_state.stat.error) {
++        qemu_co_rwlock_upgrade(&backup_state.stat.rwlock);
++        error_setg(&backup_state.stat.error, "backup cancelled");
+     }
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+     if (backup_state.vmaw) {
+         /* make sure vma writer does not block anymore */
+@@ -3438,7 +3468,7 @@ static void coroutine_fn pvebackup_co_cancel(void *opaque)
+ void qmp_backup_cancel(Error **errp)
+ {
+-    if (!backup_state.backup_mutex_initialized)
++    if (!backup_state.mutex_initialized)
+         return;
+     block_on_coroutine_fn(pvebackup_co_cancel, NULL);
+@@ -3508,9 +3538,13 @@ static void coroutine_fn pvebackup_co_run_next_job(void)
+                     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+                     aio_context_acquire(aio_context);
+-                    
++
+                     if (job_should_pause(&job->job)) {
+-                        if (backup_state.error || backup_state.cancel) {
++                        qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++                        bool error_or_canceled = backup_state.stat.error || backup_state.cancel;
++                        qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
++
++                        if (error_or_canceled) {
+                             job_cancel(&job->job, false);
+                         } else {
+                             job_resume(&job->job);
+@@ -3565,9 +3599,10 @@ static void coroutine_fn pvebackup_co_start(void *opaque)
+     const char *config_name = "qemu-server.conf";
+     const char *firewall_name = "qemu-server.fw";
+-    if (!backup_state.backup_mutex_initialized) {
++    if (!backup_state.mutex_initialized) {
++        qemu_co_rwlock_init(&backup_state.stat.rwlock);
+         qemu_co_mutex_init(&backup_state.backup_mutex);
+-        backup_state.backup_mutex_initialized = true;
++        backup_state.mutex_initialized = true;
+     }
+     qemu_co_mutex_lock(&backup_state.backup_mutex);
+@@ -3727,31 +3762,36 @@ static void coroutine_fn pvebackup_co_start(void *opaque)
+     backup_state.cancel = false;
+-    if (backup_state.error) {
+-        error_free(backup_state.error);
+-        backup_state.error = NULL;
+-    }
++    qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
+-    backup_state.speed = (task->has_speed && task->speed > 0) ? task->speed : 0;
++    if (backup_state.stat.error) {
++        error_free(backup_state.stat.error);
++        backup_state.stat.error = NULL;
++    }
+-    backup_state.start_time = time(NULL);
+-    backup_state.end_time = 0;
++    backup_state.stat.start_time = time(NULL);
++    backup_state.stat.end_time = 0;
+-    if (backup_state.backup_file) {
+-        g_free(backup_state.backup_file);
++    if (backup_state.stat.backup_file) {
++        g_free(backup_state.stat.backup_file);
+     }
+-    backup_state.backup_file = g_strdup(task->backup_file);
++    backup_state.stat.backup_file = g_strdup(task->backup_file);
+-    backup_state.vmaw = vmaw;
++    uuid_copy(backup_state.stat.uuid, uuid);
++    uuid_unparse_lower(uuid, backup_state.stat.uuid_str);
++    char *uuid_str = g_strdup(backup_state.stat.uuid_str);
+-    uuid_copy(backup_state.uuid, uuid);
+-    uuid_unparse_lower(uuid, backup_state.uuid_str);
++    backup_state.stat.total = total;
++    backup_state.stat.transferred = 0;
++    backup_state.stat.zero_bytes = 0;
+-    backup_state.di_list = di_list;
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
++
++    backup_state.speed = (task->has_speed && task->speed > 0) ? task->speed : 0;
++
++    backup_state.vmaw = vmaw;
+-    backup_state.total = total;
+-    backup_state.transferred = 0;
+-    backup_state.zero_bytes = 0;
++    backup_state.di_list = di_list;
+     /* start all jobs (paused state) */
+     l = di_list;
+@@ -3763,7 +3803,9 @@ static void coroutine_fn pvebackup_co_start(void *opaque)
+                                 JOB_DEFAULT, pvebackup_co_dump_cb, dump_cb_block_size,
+                                 pvebackup_complete_cb, di, 1, NULL, &local_err);
+         if (!job || local_err != NULL) {
+-            error_setg(&backup_state.error, "backup_job_create failed");
++            qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
++            error_setg(&backup_state.stat.error, "backup_job_create failed");
++            qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+             break;
+         }
+         job_start(&job->job);
+@@ -3775,14 +3817,18 @@ static void coroutine_fn pvebackup_co_start(void *opaque)
+     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+-    if (!backup_state.error) {
++    qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++    bool no_errors = !backup_state.stat.error;
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
++
++    if (no_errors) {
+         pvebackup_co_run_next_job(); // run one job
+     } else {
+         pvebackup_co_cancel(NULL);
+     }
+     uuid_info = g_malloc0(sizeof(*uuid_info));
+-    uuid_info->UUID = g_strdup(backup_state.uuid_str);
++    uuid_info->UUID = uuid_str;
+     task->result = uuid_info;
+     return;
+@@ -3866,54 +3912,54 @@ static void coroutine_fn pvebackup_co_query(void *opaque)
+     BackupStatus *info = g_malloc0(sizeof(*info));
+-    if (!backup_state.backup_mutex_initialized)
++    if (!backup_state.mutex_initialized)
+         return;
+-    qemu_co_mutex_lock(&backup_state.backup_mutex);
++    qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
+-    if (!backup_state.start_time) {
++    if (!backup_state.stat.start_time) {
+         /* not started, return {} */
+         task->result = info;
+-        qemu_co_mutex_unlock(&backup_state.backup_mutex);
++        qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+         return;
+     }
+     info->has_status = true;
+     info->has_start_time = true;
+-    info->start_time = backup_state.start_time;
++    info->start_time = backup_state.stat.start_time;
+-    if (backup_state.backup_file) {
++    if (backup_state.stat.backup_file) {
+         info->has_backup_file = true;
+-        info->backup_file = g_strdup(backup_state.backup_file);
++        info->backup_file = g_strdup(backup_state.stat.backup_file);
+     }
+     info->has_uuid = true;
+-    info->uuid = g_strdup(backup_state.uuid_str);
++    info->uuid = g_strdup(backup_state.stat.uuid_str);
+-    if (backup_state.end_time) {
+-        if (backup_state.error) {
++    if (backup_state.stat.end_time) {
++        if (backup_state.stat.error) {
+             info->status = g_strdup("error");
+             info->has_errmsg = true;
+-            info->errmsg = g_strdup(error_get_pretty(backup_state.error));
++            info->errmsg = g_strdup(error_get_pretty(backup_state.stat.error));
+         } else {
+             info->status = g_strdup("done");
+         }
+         info->has_end_time = true;
+-        info->end_time = backup_state.end_time;
++        info->end_time = backup_state.stat.end_time;
+     } else {
+         info->status = g_strdup("active");
+     }
+     info->has_total = true;
+-    info->total = backup_state.total;
++    info->total = backup_state.stat.total;
+     info->has_zero_bytes = true;
+-    info->zero_bytes = backup_state.zero_bytes;
++    info->zero_bytes = backup_state.stat.zero_bytes;
+     info->has_transferred = true;
+-    info->transferred = backup_state.transferred;
++    info->transferred = backup_state.stat.transferred;
+     task->result = info;
+-    qemu_co_mutex_unlock(&backup_state.backup_mutex);
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+ }
+ BackupStatus *qmp_query_backup(Error **errp)
+-- 
+2.20.1
+
diff --git a/debian/patches/pve/0049-PVE-backup-block_on_coroutine_wrapper-call-aio_wait.patch b/debian/patches/pve/0049-PVE-backup-block_on_coroutine_wrapper-call-aio_wait.patch
new file mode 100644
index 0000000..c1c7b24
--- /dev/null
@@ -0,0 +1,25 @@
+From ca384caae9a0dbbbbab2174ec09935702ac7a067 Mon Sep 17 00:00:00 2001
+From: Dietmar Maurer <dietmar@proxmox.com>
+Date: Fri, 14 Feb 2020 11:51:02 +0100
+Subject: [PATCH qemu 2/7] PVE backup - block_on_coroutine_wrapper: call
+ aio_wait_kick() as described in block/aio-wait.h
+
+---
+ blockdev.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/blockdev.c b/blockdev.c
+index 4367eaf2a7..2f84f8832d 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -3185,6 +3185,7 @@ static void coroutine_fn block_on_coroutine_wrapper(void *opaque)
+     BlockOnCoroutineWrapper *wrapper = opaque;
+     wrapper->entry(wrapper->entry_arg);
+     wrapper->finished = true;
++    aio_wait_kick();
+ }
+ static void block_on_coroutine_fn(CoroutineEntry *entry, void *entry_arg)
+-- 
+2.20.1
+
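
For context on the aio_wait_kick() fix above: block_on_coroutine_fn() parks the caller in AIO_WAIT_WHILE() until the wrapped coroutine sets wrapper->finished, and block/aio-wait.h requires code that changes such a condition to call aio_wait_kick() so the waiting thread is woken instead of sleeping in the event loop. A hedged sketch of both halves of that handshake, simplified from the code touched by the patch (names are illustrative; assumes QEMU's AioContext/coroutine APIs):

#include "qemu/osdep.h"
#include "qemu/coroutine.h"
#include "block/aio-wait.h"

typedef struct {
    CoroutineEntry *entry;
    void *entry_arg;
    bool finished;
} Wrapper;

/* Coroutine side: do the work, flip the flag, then kick the waiter. */
static void coroutine_fn wrapper_co(void *opaque)
{
    Wrapper *w = opaque;
    w->entry(w->entry_arg);
    w->finished = true;
    aio_wait_kick();    /* wake up threads blocked in AIO_WAIT_WHILE() */
}

/* Caller side: enter the coroutine and poll until it reports completion. */
static void run_and_wait(CoroutineEntry *entry, void *entry_arg)
{
    AioContext *ctx = qemu_get_current_aio_context();
    Wrapper wrapper = { .entry = entry, .entry_arg = entry_arg };
    Coroutine *co = qemu_coroutine_create(wrapper_co, &wrapper);

    aio_co_enter(ctx, co);
    /* Without the aio_wait_kick() above, this wait could miss the
     * finished transition and stay blocked in the event loop. */
    AIO_WAIT_WHILE(ctx, !wrapper.finished);
}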
diff --git a/debian/patches/pve/0050-PVE-backup-move-backup_state.cancel-to-backup_state.patch b/debian/patches/pve/0050-PVE-backup-move-backup_state.cancel-to-backup_state.patch
new file mode 100644
index 0000000..7841169
--- /dev/null
@@ -0,0 +1,95 @@
+From c8856d5cbcf3d427cd02c9556f845486ab5362bc Mon Sep 17 00:00:00 2001
+From: Dietmar Maurer <dietmar@proxmox.com>
+Date: Fri, 14 Feb 2020 12:37:27 +0100
+Subject: [PATCH qemu 3/7] PVE backup: move backup_state.cancel to
+ backup_state.stat.cancel
+
+In order to avoid lock contention with qmp_backup_cancel.
+---
+ blockdev.c | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+diff --git a/blockdev.c b/blockdev.c
+index 2f84f8832d..a4b5b98224 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -3204,9 +3204,9 @@ static void block_on_coroutine_fn(CoroutineEntry *entry, void *entry_arg)
+     AIO_WAIT_WHILE(ctx, !wrapper.finished);
+ }
+-
+ static struct PVEBackupState {
+     struct {
++        // Everything accessed from qmp command, protected using rwlock
+         CoRwlock rwlock;
+         Error *error;
+         time_t start_time;
+@@ -3217,8 +3217,8 @@ static struct PVEBackupState {
+         size_t total;
+         size_t transferred;
+         size_t zero_bytes;
++        bool cancel;
+     } stat;
+-    bool cancel;
+     int64_t speed;
+     VmaWriter *vmaw;
+     GList *di_list;
+@@ -3247,13 +3247,16 @@ static int coroutine_fn pvebackup_co_dump_cb(void *opaque, BlockBackend *target,
+     const unsigned char *buf = pbuf;
+     PVEBackupDevInfo *di = opaque;
+-    qemu_co_mutex_lock(&backup_state.backup_mutex);
++    qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
++    bool cancel = backup_state.stat.cancel;
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+-    if (backup_state.cancel) {
+-        qemu_co_mutex_unlock(&backup_state.backup_mutex);
++    if (cancel) {
+         return size; // return success
+     }
++    qemu_co_mutex_lock(&backup_state.backup_mutex);
++
+     uint64_t cluster_num = start / VMA_CLUSTER_SIZE;
+     if ((cluster_num * VMA_CLUSTER_SIZE) != start) {
+         qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
+@@ -3419,9 +3422,11 @@ static void coroutine_fn pvebackup_co_cancel(void *opaque)
+ {
+     assert(qemu_in_coroutine());
+-    qemu_co_mutex_lock(&backup_state.backup_mutex);
++    qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
++    backup_state.stat.cancel = true;
++    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+-    backup_state.cancel = true;
++    qemu_co_mutex_lock(&backup_state.backup_mutex);
+     // Avoid race between block jobs and backup-cancel command:
+     if (!backup_state.vmaw) {
+@@ -3542,7 +3547,7 @@ static void coroutine_fn pvebackup_co_run_next_job(void)
+                     if (job_should_pause(&job->job)) {
+                         qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
+-                        bool error_or_canceled = backup_state.stat.error || backup_state.cancel;
++                        bool error_or_canceled = backup_state.stat.error || backup_state.stat.cancel;
+                         qemu_co_rwlock_unlock(&backup_state.stat.rwlock);
+                         if (error_or_canceled) {
+@@ -3761,10 +3766,10 @@ static void coroutine_fn pvebackup_co_start(void *opaque)
+     }
+     /* initialize global backup_state now */
+-    backup_state.cancel = false;
+-
+     qemu_co_rwlock_wrlock(&backup_state.stat.rwlock);
++    backup_state.stat.cancel = false;
++
+     if (backup_state.stat.error) {
+         error_free(backup_state.stat.error);
+         backup_state.stat.error = NULL;
+-- 
+2.20.1
+
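
Taken together with the first patch, the dump callback now checks the cancel flag under the stat rwlock before it ever touches backup_mutex, which is what removes the contention with qmp_backup_cancel. Roughly, the resulting fast path looks like this (condensed and simplified from the hunks above, not the verbatim function):

static int coroutine_fn dump_cb_fast_path(int size)
{
    /* Cheap cancel check: only the stat rwlock, never backup_mutex,
     * so qmp_backup_cancel is not blocked behind running dump work. */
    qemu_co_rwlock_rdlock(&backup_state.stat.rwlock);
    bool cancel = backup_state.stat.cancel;
    qemu_co_rwlock_unlock(&backup_state.stat.rwlock);

    if (cancel) {
        return size; /* report success; the job is being cancelled anyway */
    }

    /* Only the actual dump work still serializes on backup_mutex. */
    qemu_co_mutex_lock(&backup_state.backup_mutex);
    /* ... write cluster data via the VMA writer, update stat counters ... */
    qemu_co_mutex_unlock(&backup_state.backup_mutex);

    return size;
}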
diff --git a/debian/patches/series b/debian/patches/series
index c37c4e12a4e4f1565c6cc0dbba620b01ac548d70..a736b06fd5284f1369f90d48f71b8df018ce9ceb 100644
@@ -47,3 +47,6 @@ pve/0044-Acquire-aio_context-before-calling-block_job_add_bdr.patch
 pve/0045-PVE-Compat-4.0-used-balloon-qemu-4-0-config-size-fal.patch
 pve/0046-PVE-Allow-version-code-in-machine-type.patch
 pve/0047-PVE-fix-hmp-info-backup-cmd-for-not-initialized-back.patch
+pve/0048-PVE-backup-use-separate-CoRwlock-for-data-accessed-b.patch
+pve/0049-PVE-backup-block_on_coroutine_wrapper-call-aio_wait.patch
+pve/0050-PVE-backup-move-backup_state.cancel-to-backup_state.patch