5 files changed, 142 insertions(+), 28 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index abbddb39e4..ed14c8b498 100644
+index 1bdce3b657..0c5c72df2e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -51,7 +51,7 @@ typedef struct MirrorBlockJob {
BdrvDirtyBitmap *dirty_bitmap;
BdrvDirtyBitmapIter *dbi;
uint8_t *buf;
-@@ -724,7 +726,8 @@ static int mirror_exit_common(Job *job)
+@@ -722,7 +724,8 @@ static int mirror_exit_common(Job *job)
&error_abort);
if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
if (bdrv_cow_bs(unfiltered_target) != backing) {
-@@ -831,6 +834,16 @@ static void mirror_abort(Job *job)
+@@ -819,6 +822,16 @@ static void mirror_abort(Job *job)
assert(ret == 0);
}
static void coroutine_fn mirror_throttle(MirrorBlockJob *s)
{
int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-@@ -1027,7 +1040,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
+@@ -1015,7 +1028,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
mirror_free_init(s);
s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
ret = mirror_dirty_init(s);
if (ret < 0 || job_is_cancelled(&s->common.job)) {
goto immediate_exit;
-@@ -1323,6 +1337,7 @@ static const BlockJobDriver mirror_job_driver = {
+@@ -1304,6 +1318,7 @@ static const BlockJobDriver mirror_job_driver = {
.run = mirror_run,
.prepare = mirror_prepare,
.abort = mirror_abort,
.pause = mirror_pause,
.complete = mirror_complete,
.cancel = mirror_cancel,
-@@ -1341,6 +1356,7 @@ static const BlockJobDriver commit_active_job_driver = {
+@@ -1322,6 +1337,7 @@ static const BlockJobDriver commit_active_job_driver = {
.run = mirror_run,
.prepare = mirror_prepare,
.abort = mirror_abort,
.pause = mirror_pause,
.complete = mirror_complete,
.cancel = commit_active_cancel,
-@@ -1733,7 +1749,10 @@ static BlockJob *mirror_start_job(
+@@ -1714,7 +1730,10 @@ static BlockJob *mirror_start_job(
BlockCompletionFunc *cb,
void *opaque,
const BlockJobDriver *driver,
bool auto_complete, const char *filter_node_name,
bool is_mirror, MirrorCopyMode copy_mode,
Error **errp)
-@@ -1747,10 +1766,39 @@ static BlockJob *mirror_start_job(
+@@ -1728,10 +1747,39 @@ static BlockJob *mirror_start_job(
GLOBAL_STATE_CODE();
assert(is_power_of_2(granularity));
if (buf_size < 0) {
-@@ -1890,7 +1938,9 @@ static BlockJob *mirror_start_job(
+@@ -1871,7 +1919,9 @@ static BlockJob *mirror_start_job(
s->replaces = g_strdup(replaces);
s->on_source_error = on_source_error;
s->on_target_error = on_target_error;
s->backing_mode = backing_mode;
s->zero_target = zero_target;
qatomic_set(&s->copy_mode, copy_mode);
-@@ -1916,6 +1966,18 @@ static BlockJob *mirror_start_job(
+@@ -1897,6 +1947,18 @@ static BlockJob *mirror_start_job(
*/
bdrv_disable_dirty_bitmap(s->dirty_bitmap);
+ }
+ }
+
- bdrv_graph_wrlock(bs);
+ bdrv_graph_wrlock();
ret = block_job_add_bdrv(&s->common, "source", bs, 0,
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
-@@ -1998,6 +2060,9 @@ fail:
+@@ -1979,6 +2041,9 @@ fail:
if (s->dirty_bitmap) {
bdrv_release_dirty_bitmap(s->dirty_bitmap);
}
job_early_fail(&s->common.job);
}
-@@ -2020,35 +2085,28 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
+@@ -2001,35 +2066,28 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *target, const char *replaces,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
}
BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
-@@ -2075,7 +2133,8 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
+@@ -2056,7 +2114,8 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
MIRROR_LEAVE_BACKING_CHAIN, false,
on_error, on_error, true, cb, opaque,
errp);
if (!job) {
diff --git a/blockdev.c b/blockdev.c
-index c91f49e7b6..8c8e8b604a 100644
+index 057601dcf0..8682814a7a 100644
--- a/blockdev.c
+++ b/blockdev.c
-@@ -2903,6 +2903,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2776,6 +2776,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
BlockDriverState *target,
const char *replaces,
enum MirrorSyncMode sync,
BlockMirrorBackingMode backing_mode,
bool zero_target,
bool has_speed, int64_t speed,
-@@ -2921,6 +2924,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2794,6 +2797,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
{
BlockDriverState *unfiltered_bs;
int job_flags = JOB_DEFAULT;
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
-@@ -2975,6 +2979,29 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2848,6 +2852,29 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
sync = MIRROR_SYNC_MODE_FULL;
}
if (!replaces) {
/* We want to mirror from @bs, but keep implicit filters on top */
unfiltered_bs = bdrv_skip_implicit_filters(bs);
-@@ -3030,8 +3057,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2889,8 +2916,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
* and will allow to check whether the node still exist at mirror completion
*/
mirror_start(job_id, bs, target,
on_source_error, on_target_error, unmap, filter_node_name,
copy_mode, errp);
}
-@@ -3189,6 +3216,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
+@@ -3034,6 +3061,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
blockdev_mirror_common(arg->job_id, bs, target_bs,
arg->replaces, arg->sync,
backing_mode, zero_target,
arg->has_speed, arg->speed,
arg->has_granularity, arg->granularity,
-@@ -3210,6 +3239,8 @@ void qmp_blockdev_mirror(const char *job_id,
+@@ -3053,6 +3082,8 @@ void qmp_blockdev_mirror(const char *job_id,
const char *device, const char *target,
const char *replaces,
MirrorSyncMode sync,
bool has_speed, int64_t speed,
bool has_granularity, uint32_t granularity,
bool has_buf_size, int64_t buf_size,
-@@ -3258,7 +3289,8 @@ void qmp_blockdev_mirror(const char *job_id,
+@@ -3093,7 +3124,8 @@ void qmp_blockdev_mirror(const char *job_id,
}
blockdev_mirror_common(job_id, bs, target_bs,
has_granularity, granularity,
has_buf_size, buf_size,
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
-index ef31c58bb3..57265a617a 100644
+index d2201e27f4..cc1387ae02 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
-@@ -152,7 +152,9 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
+@@ -158,7 +158,9 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *target, const char *replaces,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index ca390c5700..8db0986e9e 100644
+index 746d1694c2..45ab548dfe 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -2163,6 +2163,15 @@
+@@ -2174,6 +2174,15 @@
# destination (all the disk, only the sectors allocated in the
# topmost image, or only new I/O).
#
# @granularity: granularity of the dirty bitmap, default is 64K if the
# image format doesn't have clusters, 4K if the clusters are
# smaller than that, else the cluster size. Must be a power of 2
-@@ -2205,7 +2214,9 @@
+@@ -2216,7 +2225,9 @@
{ 'struct': 'DriveMirror',
'data': { '*job-id': 'str', 'device': 'str', 'target': 'str',
'*format': 'str', '*node-name': 'str', '*replaces': 'str',
'*speed': 'int', '*granularity': 'uint32',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError',
-@@ -2489,6 +2500,15 @@
+@@ -2496,6 +2507,15 @@
# destination (all the disk, only the sectors allocated in the
# topmost image, or only new I/O).
#
# @granularity: granularity of the dirty bitmap, default is 64K if the
# image format doesn't have clusters, 4K if the clusters are
# smaller than that, else the cluster size. Must be a power of 2
-@@ -2539,7 +2559,8 @@
+@@ -2544,7 +2564,8 @@
{ 'command': 'blockdev-mirror',
'data': { '*job-id': 'str', 'device': 'str', 'target': 'str',
'*replaces': 'str',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError',
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
-index 9b15d2768c..54acd47188 100644
+index 3766d5de6b..afa44cbd34 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
-@@ -766,8 +766,8 @@ static void test_propagate_mirror(void)
+@@ -755,8 +755,8 @@ static void test_propagate_mirror(void)
+
/* Start a mirror job */
- aio_context_acquire(main_ctx);
mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false,
- BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
+ false, BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
&error_abort);
- aio_context_release(main_ctx);
+
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index ed14c8b498..beb98f2e07 100644
+index 0c5c72df2e..37fee3fa25 100644
--- a/block/mirror.c
+++ b/block/mirror.c
-@@ -695,8 +695,6 @@ static int mirror_exit_common(Job *job)
+@@ -693,8 +693,6 @@ static int mirror_exit_common(Job *job)
bdrv_unfreeze_backing_chain(mirror_top_bs, target_bs);
}
/* Make sure that the source BDS doesn't go away during bdrv_replace_node,
* before we can call bdrv_drained_end */
bdrv_ref(src);
-@@ -810,6 +808,18 @@ static int mirror_exit_common(Job *job)
+@@ -800,6 +798,18 @@ static int mirror_exit_common(Job *job)
bdrv_drained_end(target_bs);
bdrv_unref(target_bs);
bs_opaque->job = NULL;
bdrv_drained_end(src);
-@@ -1776,10 +1786,6 @@ static BlockJob *mirror_start_job(
+@@ -1757,10 +1767,6 @@ static BlockJob *mirror_start_job(
" sync mode",
MirrorSyncMode_str(sync_mode));
return NULL;
}
} else if (bitmap) {
error_setg(errp,
-@@ -1796,6 +1802,12 @@ static BlockJob *mirror_start_job(
+@@ -1777,6 +1783,12 @@ static BlockJob *mirror_start_job(
return NULL;
}
granularity = bdrv_dirty_bitmap_granularity(bitmap);
1 file changed, 3 insertions(+)
diff --git a/blockdev.c b/blockdev.c
-index 8c8e8b604a..cdfe7b33b1 100644
+index 8682814a7a..5b75a085ee 100644
--- a/blockdev.c
+++ b/blockdev.c
-@@ -3000,6 +3000,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2873,6 +2873,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) {
return;
}
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index beb98f2e07..1aa75e7384 100644
+index 37fee3fa25..6b3cce1007 100644
--- a/block/mirror.c
+++ b/block/mirror.c
-@@ -814,8 +814,8 @@ static int mirror_exit_common(Job *job)
+@@ -804,8 +804,8 @@ static int mirror_exit_common(Job *job)
job->ret == 0 && ret == 0)) {
/* Success; synchronize copy back to sync. */
bdrv_clear_dirty_bitmap(s->sync_bitmap, NULL);
}
}
bdrv_release_dirty_bitmap(s->dirty_bitmap);
-@@ -1983,11 +1983,8 @@ static BlockJob *mirror_start_job(
+@@ -1964,11 +1964,8 @@ static BlockJob *mirror_start_job(
}
if (s->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
+ NULL, true);
}
- bdrv_graph_wrlock(bs);
+ bdrv_graph_wrlock();
3 files changed, 70 insertions(+), 59 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index 1aa75e7384..a02efbfbbd 100644
+index 6b3cce1007..2f1223852b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
-@@ -1776,31 +1776,13 @@ static BlockJob *mirror_start_job(
+@@ -1757,31 +1757,13 @@ static BlockJob *mirror_start_job(
GLOBAL_STATE_CODE();
if (bitmap_mode != BITMAP_SYNC_MODE_NEVER) {
diff --git a/blockdev.c b/blockdev.c
-index cdfe7b33b1..38a40e3e32 100644
+index 5b75a085ee..d27d8c38ec 100644
--- a/blockdev.c
+++ b/blockdev.c
-@@ -2979,7 +2979,36 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2852,7 +2852,36 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
sync = MIRROR_SYNC_MODE_FULL;
}
monitor_qmp_caps_reset(mon);
data = qmp_greeting(mon);
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
-index 176b549473..790bb7d1da 100644
+index f3488afeef..2624eb3470 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -117,16 +117,28 @@ typedef struct QmpDispatchBH {
aio_co_wake(data->co);
}
-@@ -253,6 +265,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
+@@ -250,6 +262,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
.ret = &ret,
.errp = &err,
.co = qemu_coroutine_self(),
1 file changed, 2 insertions(+), 12 deletions(-)
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
-index 32c70c9e99..984b6a3145 100644
+index 2d0c607177..97e51733af 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -1781,7 +1781,7 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd)
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/hw/ide/core.c b/hw/ide/core.c
-index 8a0579bff4..254255f8dc 100644
+index e8cb2dac92..3b21acf651 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -456,7 +456,7 @@ static void ide_trim_bh_cb(void *opaque)
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Fri, 17 Nov 2023 11:18:06 +0100
+Subject: [PATCH] Revert "x86: acpi: workaround Windows not handling name
+ references in Package properly"
+
+This reverts commit 44d975ef340e2f21f236f9520c53e1b30d2213a4.
+
+As reported in the community forum [0] and reproduced locally this
+breaks VirtIO network adapters in (at least) the German ISO of Windows
+Server 2022. The fix itself was for
+
+> Issue is not fatal but as result acpi-index/"PCI Label ID" property
+> is either not shown in device details page or shows incorrect value.
+
+so revert and tolerate that as a stop-gap, rather than have the
+devices not working at all.
+
+[0]: https://forum.proxmox.com/threads/92094/post-605684
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ hw/i386/acpi-build.c | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
+index 53f804ac16..9b1b9f0412 100644
+--- a/hw/i386/acpi-build.c
++++ b/hw/i386/acpi-build.c
+@@ -347,13 +347,9 @@ Aml *aml_pci_device_dsm(void)
+ {
+ Aml *params = aml_local(0);
+ Aml *pkg = aml_package(2);
+- aml_append(pkg, aml_int(0));
+- aml_append(pkg, aml_int(0));
++ aml_append(pkg, aml_name("BSEL"));
++ aml_append(pkg, aml_name("ASUN"));
+ aml_append(method, aml_store(pkg, params));
+- aml_append(method,
+- aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
+- aml_append(method,
+- aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
+ aml_append(method,
+ aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
+ aml_arg(2), aml_arg(3), params))
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 28 Jul 2023 10:47:48 +0200
-Subject: [PATCH] migration/block-dirty-bitmap: fix loading bitmap when there
- is an iothread
-
-The bdrv_create_dirty_bitmap() function (which is also called by
-bdrv_dirty_bitmap_create_successor()) uses bdrv_getlength(bs). This is
-a wrapper around a coroutine, and thus uses bdrv_poll_co(). Polling
-tries to release the AioContext which will trigger an assert() if it
-hasn't been acquired before.
-
-The issue does not happen for migration, because there we are in a
-coroutine already, so the wrapper will just call bdrv_co_getlength()
-directly without polling.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- migration/block-dirty-bitmap.c | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index 24347ab0f7..0070b13b6f 100644
---- a/migration/block-dirty-bitmap.c
-+++ b/migration/block-dirty-bitmap.c
-@@ -809,8 +809,11 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s)
- "destination", bdrv_dirty_bitmap_name(s->bitmap));
- return -EINVAL;
- } else {
-+ AioContext *ctx = bdrv_get_aio_context(s->bs);
-+ aio_context_acquire(ctx);
- s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
- s->bitmap_name, &local_err);
-+ aio_context_release(ctx);
- if (!s->bitmap) {
- error_report_err(local_err);
- return -EINVAL;
-@@ -837,7 +840,10 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s)
-
- bdrv_disable_dirty_bitmap(s->bitmap);
- if (flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED) {
-+ AioContext *ctx = bdrv_get_aio_context(s->bs);
-+ aio_context_acquire(ctx);
- bdrv_dirty_bitmap_create_successor(s->bitmap, &local_err);
-+ aio_context_release(ctx);
- if (local_err) {
- error_report_err(local_err);
- return -EINVAL;
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 17 Nov 2023 11:18:06 +0100
-Subject: [PATCH] Revert "x86: acpi: workaround Windows not handling name
- references in Package properly"
-
-This reverts commit 44d975ef340e2f21f236f9520c53e1b30d2213a4.
-
-As reported in the community forum [0] and reproduced locally this
-breaks VirtIO network adapters in (at least) the German ISO of Windows
-Server 2022. The fix itself was for
-
-> Issue is not fatal but as result acpi-index/"PCI Label ID" property
-> is either not shown in device details page or shows incorrect value.
-
-so revert and tolerate that as a stop-gap, rather than have the
-devices not working at all.
-
-[0]: https://forum.proxmox.com/threads/92094/post-605684
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/i386/acpi-build.c | 8 ++------
- 1 file changed, 2 insertions(+), 6 deletions(-)
-
-diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
-index 1e178341de..d8694b338e 100644
---- a/hw/i386/acpi-build.c
-+++ b/hw/i386/acpi-build.c
-@@ -357,13 +357,9 @@ Aml *aml_pci_device_dsm(void)
- {
- Aml *params = aml_local(0);
- Aml *pkg = aml_package(2);
-- aml_append(pkg, aml_int(0));
-- aml_append(pkg, aml_int(0));
-+ aml_append(pkg, aml_name("BSEL"));
-+ aml_append(pkg, aml_name("ASUN"));
- aml_append(method, aml_store(pkg, params));
-- aml_append(method,
-- aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
-- aml_append(method,
-- aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
- aml_append(method,
- aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
- aml_arg(2), aml_arg(3), params))
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Mon, 18 Dec 2023 11:13:40 +0100
-Subject: [PATCH] qemu_init: increase NOFILE soft limit on POSIX
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-In many configurations, e.g. multiple vNICs with multiple queues or
-with many Ceph OSDs, the default soft limit of 1024 is not enough.
-QEMU is supposed to work fine with file descriptors >= 1024 and does
-not use select() on POSIX. Bump the soft limit to the allowed hard
-limit to avoid issues with the aforementioned configurations.
-
-Of course the limit could be raised from the outside, but the man page
-of systemd.exec states about 'LimitNOFILE=':
-
-> Don't use.
-> [...]
-> Typically applications should increase their soft limit to the hard
-> limit on their own, if they are OK with working with file
-> descriptors above 1023,
-
-If the soft limit is already the same as the hard limit, avoid the
-superfluous setrlimit call. This can avoid a warning with a strict
-seccomp filter blocking setrlimit if NOFILE was already raised before
-executing QEMU.
-
-Buglink: https://bugzilla.proxmox.com/show_bug.cgi?id=4507
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
----
- include/sysemu/os-posix.h | 1 +
- include/sysemu/os-win32.h | 5 +++++
- os-posix.c | 22 ++++++++++++++++++++++
- system/vl.c | 2 ++
- 4 files changed, 30 insertions(+)
-
-diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
-index dff32ae185..b881ac6c6f 100644
---- a/include/sysemu/os-posix.h
-+++ b/include/sysemu/os-posix.h
-@@ -51,6 +51,7 @@ bool is_daemonized(void);
- void os_daemonize(void);
- bool os_set_runas(const char *user_id);
- void os_set_chroot(const char *path);
-+void os_setup_limits(void);
- void os_setup_post(void);
- int os_mlock(void);
-
-diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
-index 1047d260cb..106f155037 100644
---- a/include/sysemu/os-win32.h
-+++ b/include/sysemu/os-win32.h
-@@ -128,6 +128,11 @@ static inline int os_mlock(void)
- return -ENOSYS;
- }
-
-+void os_setup_limits(void)
-+{
-+ return;
-+}
-+
- #define fsync _commit
-
- #if !defined(lseek)
-diff --git a/os-posix.c b/os-posix.c
-index 52ef6990ff..a4284e2c07 100644
---- a/os-posix.c
-+++ b/os-posix.c
-@@ -24,6 +24,7 @@
- */
-
- #include "qemu/osdep.h"
-+#include <sys/resource.h>
- #include <sys/wait.h>
- #include <pwd.h>
- #include <grp.h>
-@@ -256,6 +257,27 @@ void os_daemonize(void)
- }
- }
-
-+void os_setup_limits(void)
-+{
-+ struct rlimit nofile;
-+
-+ if (getrlimit(RLIMIT_NOFILE, &nofile) < 0) {
-+ warn_report("unable to query NOFILE limit: %s", strerror(errno));
-+ return;
-+ }
-+
-+ if (nofile.rlim_cur == nofile.rlim_max) {
-+ return;
-+ }
-+
-+ nofile.rlim_cur = nofile.rlim_max;
-+
-+ if (setrlimit(RLIMIT_NOFILE, &nofile) < 0) {
-+ warn_report("unable to set NOFILE limit: %s", strerror(errno));
-+ return;
-+ }
-+}
-+
- void os_setup_post(void)
- {
- int fd = 0;
-diff --git a/system/vl.c b/system/vl.c
-index e18fa3ce46..d2a3b3f457 100644
---- a/system/vl.c
-+++ b/system/vl.c
-@@ -2782,6 +2782,8 @@ void qemu_init(int argc, char **argv)
- error_init(argv[0]);
- qemu_init_exec_dir(argv[0]);
-
-+ os_setup_limits();
-+
- qemu_init_arch_modules();
-
- qemu_init_subsystems();
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Kevin Wolf <kwolf@redhat.com>
-Date: Wed, 13 Mar 2024 16:30:00 +0100
-Subject: [PATCH] mirror: Don't call job_pause_point() under graph lock
-
-Calling job_pause_point() while holding the graph reader lock
-potentially results in a deadlock: bdrv_graph_wrlock() first drains
-everything, including the mirror job, which pauses it. The job is only
-unpaused at the end of the drain section, which is when the graph writer
-lock has been successfully taken. However, if the job happens to be
-paused at a pause point where it still holds the reader lock, the writer
-lock can't be taken as long as the job is still paused.
-
-Mark job_pause_point() as GRAPH_UNLOCKED and fix mirror accordingly.
-
-Cc: qemu-stable@nongnu.org
-Buglink: https://issues.redhat.com/browse/RHEL-28125
-Fixes: 004915a96a7a ("block: Protect bs->backing with graph_lock")
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-Message-ID: <20240313153000.33121-1-kwolf@redhat.com>
-Reviewed-by: Eric Blake <eblake@redhat.com>
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-(cherry picked from commit ae5a40e8581185654a667fbbf7e4adbc2a2a3e45)
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/mirror.c | 10 ++++++----
- include/qemu/job.h | 2 +-
- 2 files changed, 7 insertions(+), 5 deletions(-)
-
-diff --git a/block/mirror.c b/block/mirror.c
-index cd9d3ad4a8..abbddb39e4 100644
---- a/block/mirror.c
-+++ b/block/mirror.c
-@@ -479,9 +479,9 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
- return bytes_handled;
- }
-
--static void coroutine_fn GRAPH_RDLOCK mirror_iteration(MirrorBlockJob *s)
-+static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
- {
-- BlockDriverState *source = s->mirror_top_bs->backing->bs;
-+ BlockDriverState *source;
- MirrorOp *pseudo_op;
- int64_t offset;
- /* At least the first dirty chunk is mirrored in one iteration. */
-@@ -489,6 +489,10 @@ static void coroutine_fn GRAPH_RDLOCK mirror_iteration(MirrorBlockJob *s)
- bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
- int max_io_bytes = MAX(s->buf_size / MAX_IN_FLIGHT, MAX_IO_BYTES);
-
-+ bdrv_graph_co_rdlock();
-+ source = s->mirror_top_bs->backing->bs;
-+ bdrv_graph_co_rdunlock();
-+
- bdrv_dirty_bitmap_lock(s->dirty_bitmap);
- offset = bdrv_dirty_iter_next(s->dbi);
- if (offset < 0) {
-@@ -1078,9 +1082,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
- mirror_wait_for_free_in_flight_slot(s);
- continue;
- } else if (cnt != 0) {
-- bdrv_graph_co_rdlock();
- mirror_iteration(s);
-- bdrv_graph_co_rdunlock();
- }
- }
-
-diff --git a/include/qemu/job.h b/include/qemu/job.h
-index e502787dd8..b4bc2e174b 100644
---- a/include/qemu/job.h
-+++ b/include/qemu/job.h
-@@ -503,7 +503,7 @@ void job_enter(Job *job);
- *
- * Called with job_mutex *not* held.
- */
--void coroutine_fn job_pause_point(Job *job);
-+void coroutine_fn GRAPH_UNLOCKED job_pause_point(Job *job);
-
- /**
- * @job: The job that calls the function.
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
-index b862406c71..bc09aefe3b 100644
+index 35684f7e21..43bc0bd520 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -563,7 +563,7 @@ static QemuOptsList raw_runtime_opts = {
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/net/net.h b/include/net/net.h
-index ffbd2c8d56..e857c75f4c 100644
+index b1f9b35fcc..096c0d52e4 100644
--- a/include/net/net.h
+++ b/include/net/net.h
-@@ -263,8 +263,8 @@ void netdev_add(QemuOpts *opts, Error **errp);
+@@ -317,8 +317,8 @@ void netdev_add(QemuOpts *opts, Error **errp);
int net_hub_id_for_client(NetClientState *nc, int *id);
NetClientState *net_hub_port_find(int hub_id);
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
-index 705d925e6c..ba269e5f1e 100644
+index 6b05738079..d82869900a 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
-@@ -2281,9 +2281,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
+@@ -2291,9 +2291,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
#define CPU_RESOLVING_TYPE TYPE_X86_CPU
#ifdef TARGET_X86_64
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/ui/spice-core.c b/ui/spice-core.c
-index db21db2c94..0eb138cf43 100644
+index 15be640286..ea20e6153c 100644
--- a/ui/spice-core.c
+++ b/ui/spice-core.c
-@@ -691,32 +691,35 @@ static void qemu_spice_init(void)
+@@ -690,32 +690,35 @@ static void qemu_spice_init(void)
if (tls_port) {
x509_dir = qemu_opt_get(opts, "x509-dir");
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/qemu-img.c b/qemu-img.c
-index 5a77f67719..de51233825 100644
+index 7668f86769..2575e97b43 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -3079,7 +3079,8 @@ static int img_info(int argc, char **argv)
+@@ -3075,7 +3075,8 @@ static int img_info(int argc, char **argv)
list = collect_image_info_list(image_opts, filename, fmt, chain,
force_share);
if (!list) {
2 files changed, 133 insertions(+), 73 deletions(-)
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index 068692d13e..73e0bb1d2c 100644
+index c9dd70a892..048788b23d 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
ERST
DEF("dd", img_dd,
DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
-index de51233825..ad770f6570 100644
+index 2575e97b43..8ec68b346f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -4997,10 +4997,12 @@ static int img_bitmap(int argc, char **argv)
+@@ -4993,10 +4993,12 @@ static int img_bitmap(int argc, char **argv)
#define C_IF 04
#define C_OF 010
#define C_SKIP 020
};
struct DdIo {
-@@ -5076,6 +5078,19 @@ static int img_dd_skip(const char *arg,
+@@ -5072,6 +5074,19 @@ static int img_dd_skip(const char *arg,
return 0;
}
static int img_dd(int argc, char **argv)
{
int ret = 0;
-@@ -5116,6 +5131,7 @@ static int img_dd(int argc, char **argv)
+@@ -5112,6 +5127,7 @@ static int img_dd(int argc, char **argv)
{ "if", img_dd_if, C_IF },
{ "of", img_dd_of, C_OF },
{ "skip", img_dd_skip, C_SKIP },
{ NULL, NULL, 0 }
};
const struct option long_options[] = {
-@@ -5191,91 +5207,112 @@ static int img_dd(int argc, char **argv)
+@@ -5187,91 +5203,112 @@ static int img_dd(int argc, char **argv)
arg = NULL;
}
}
if (dd.flags & C_SKIP && (in.offset > INT64_MAX / in.bsz ||
-@@ -5292,20 +5329,43 @@ static int img_dd(int argc, char **argv)
+@@ -5288,20 +5325,43 @@ static int img_dd(int argc, char **argv)
in.buf = g_new(uint8_t, in.bsz);
for (out_pos = 0; in_pos < size; ) {
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/qemu-img.c b/qemu-img.c
-index ad770f6570..b0839a5d18 100644
+index 8ec68b346f..b98184bba1 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -4998,11 +4998,13 @@ static int img_bitmap(int argc, char **argv)
+@@ -4994,11 +4994,13 @@ static int img_bitmap(int argc, char **argv)
#define C_OF 010
#define C_SKIP 020
#define C_OSIZE 040
};
struct DdIo {
-@@ -5091,6 +5093,19 @@ static int img_dd_osize(const char *arg,
+@@ -5087,6 +5089,19 @@ static int img_dd_osize(const char *arg,
return 0;
}
static int img_dd(int argc, char **argv)
{
int ret = 0;
-@@ -5105,12 +5120,14 @@ static int img_dd(int argc, char **argv)
+@@ -5101,12 +5116,14 @@ static int img_dd(int argc, char **argv)
int c, i;
const char *out_fmt = "raw";
const char *fmt = NULL;
};
struct DdIo in = {
.bsz = 512, /* Block size is by default 512 bytes */
-@@ -5132,6 +5149,7 @@ static int img_dd(int argc, char **argv)
+@@ -5128,6 +5145,7 @@ static int img_dd(int argc, char **argv)
{ "of", img_dd_of, C_OF },
{ "skip", img_dd_skip, C_SKIP },
{ "osize", img_dd_osize, C_OSIZE },
{ NULL, NULL, 0 }
};
const struct option long_options[] = {
-@@ -5328,9 +5346,10 @@ static int img_dd(int argc, char **argv)
+@@ -5324,9 +5342,10 @@ static int img_dd(int argc, char **argv)
in.buf = g_new(uint8_t, in.bsz);
if (blk1) {
in_ret = blk_pread(blk1, in_pos, bytes, in.buf, 0);
if (in_ret == 0) {
-@@ -5339,6 +5358,9 @@ static int img_dd(int argc, char **argv)
+@@ -5335,6 +5354,9 @@ static int img_dd(int argc, char **argv)
} else {
in_ret = read(STDIN_FILENO, in.buf, bytes);
if (in_ret == 0) {
3 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
-index 4459c065f1..c6bc902182 100644
+index 3653adb963..d83e8fb3c0 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -212,6 +212,10 @@ Parameters to convert subcommand:
Give information about the disk image *FILENAME*. Use it in
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index 73e0bb1d2c..32749b416e 100644
+index 048788b23d..0b29a67a06 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
ERST
DEF("dd", img_dd,
DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
-index b0839a5d18..4acdf75879 100644
+index b98184bba1..6fc8384f64 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -5122,7 +5122,7 @@ static int img_dd(int argc, char **argv)
+@@ -5118,7 +5118,7 @@ static int img_dd(int argc, char **argv)
const char *fmt = NULL;
int64_t size = 0, readsize = 0;
int64_t out_pos, in_pos;
struct DdInfo dd = {
.flags = 0,
.count = 0,
-@@ -5160,7 +5160,7 @@ static int img_dd(int argc, char **argv)
+@@ -5156,7 +5156,7 @@ static int img_dd(int argc, char **argv)
{ 0, 0, 0, 0 }
};
if (c == EOF) {
break;
}
-@@ -5180,6 +5180,9 @@ static int img_dd(int argc, char **argv)
+@@ -5176,6 +5176,9 @@ static int img_dd(int argc, char **argv)
case 'h':
help();
break;
case 'U':
force_share = true;
break;
-@@ -5310,13 +5313,15 @@ static int img_dd(int argc, char **argv)
+@@ -5306,13 +5309,15 @@ static int img_dd(int argc, char **argv)
size - in.bsz * in.offset, &error_abort);
}
3 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
-index c6bc902182..fe2241856e 100644
+index d83e8fb3c0..61c6b21859 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -496,10 +496,10 @@ Command description:
The data is by default read and written using blocks of 512 bytes but can be
modified by specifying *BLOCK_SIZE*. If count=\ *BLOCKS* is specified
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index 32749b416e..40ee9c0ed8 100644
+index 0b29a67a06..758f397232 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
ERST
DEF("dd", img_dd,
DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
-index 4acdf75879..a9632b5951 100644
+index 6fc8384f64..a6c88e0860 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -5114,6 +5114,7 @@ static int img_dd(int argc, char **argv)
+@@ -5110,6 +5110,7 @@ static int img_dd(int argc, char **argv)
BlockDriver *drv = NULL, *proto_drv = NULL;
BlockBackend *blk1 = NULL, *blk2 = NULL;
QemuOpts *opts = NULL;
QemuOptsList *create_opts = NULL;
Error *local_err = NULL;
bool image_opts = false;
-@@ -5123,6 +5124,7 @@ static int img_dd(int argc, char **argv)
+@@ -5119,6 +5120,7 @@ static int img_dd(int argc, char **argv)
int64_t size = 0, readsize = 0;
int64_t out_pos, in_pos;
bool force_share = false, skip_create = false;
struct DdInfo dd = {
.flags = 0,
.count = 0,
-@@ -5160,7 +5162,7 @@ static int img_dd(int argc, char **argv)
+@@ -5156,7 +5158,7 @@ static int img_dd(int argc, char **argv)
{ 0, 0, 0, 0 }
};
if (c == EOF) {
break;
}
-@@ -5183,6 +5185,19 @@ static int img_dd(int argc, char **argv)
+@@ -5179,6 +5181,19 @@ static int img_dd(int argc, char **argv)
case 'n':
skip_create = true;
break;
case 'U':
force_share = true;
break;
-@@ -5242,11 +5257,24 @@ static int img_dd(int argc, char **argv)
+@@ -5238,11 +5253,24 @@ static int img_dd(int argc, char **argv)
if (dd.flags & C_IF) {
blk1 = img_open(image_opts, in.filename, fmt, 0, false, false,
force_share);
}
if (dd.flags & C_OSIZE) {
-@@ -5401,6 +5429,7 @@ static int img_dd(int argc, char **argv)
+@@ -5397,6 +5425,7 @@ static int img_dd(int argc, char **argv)
out:
g_free(arg);
qemu_opts_del(opts);
qapi_free_BalloonInfo(info);
}
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
-index d004cf29d2..2660ed520b 100644
+index 609e39a821..8cb6dfcac3 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
-@@ -782,8 +782,37 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
+@@ -781,8 +781,37 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
{
VirtIOBalloon *dev = opaque;
static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
diff --git a/qapi/machine.json b/qapi/machine.json
-index b6d634b30d..03a72efc11 100644
+index e8b60641f2..2054cdc70d 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
-@@ -1087,9 +1087,29 @@
+@@ -1079,9 +1079,29 @@
# @actual: the logical size of the VM in bytes Formula used:
# logical_vm_size = vm_ram_size - balloon_size
#
##
# @query-balloon:
diff --git a/qapi/pragma.json b/qapi/pragma.json
-index 0aa4eeddd3..eae9f54700 100644
+index 59fbe74b8c..be8fa304c5 100644
--- a/qapi/pragma.json
+++ b/qapi/pragma.json
-@@ -35,6 +35,7 @@
+@@ -90,6 +90,7 @@
'member-name-exceptions': [ # visible in:
'ACPISlotType', # query-acpi-ospm-status
'AcpiTableOptions', # -acpitable
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
-index 3860a50c3b..40821e2317 100644
+index 4b72009cd3..314351cdff 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
-@@ -91,6 +91,12 @@ MachineInfoList *qmp_query_machines(Error **errp)
+@@ -90,6 +90,12 @@ MachineInfoList *qmp_query_machines(Error **errp)
info->numa_mem_supported = mc->numa_mem_supported;
info->deprecated = !!mc->deprecation_reason;
info->acpi = !!object_class_property_find(OBJECT_CLASS(mc), "acpi");
info->default_cpu_type = g_strdup(mc->default_cpu_type);
}
diff --git a/qapi/machine.json b/qapi/machine.json
-index 03a72efc11..297ad0e0e5 100644
+index 2054cdc70d..a024d5b05d 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -146,6 +146,8 @@
2 files changed, 7 insertions(+)
diff --git a/qapi/ui.json b/qapi/ui.json
-index a0158baf23..20d8994f49 100644
+index f610bce118..6ea26a9acb 100644
--- a/qapi/ui.json
+++ b/qapi/ui.json
-@@ -318,11 +318,14 @@
+@@ -314,11 +314,14 @@
#
# @channels: a list of @SpiceChannel for each active spice channel
#
'if': 'CONFIG_SPICE' }
diff --git a/ui/spice-core.c b/ui/spice-core.c
-index 0eb138cf43..56d677efe1 100644
+index ea20e6153c..55a15fba8b 100644
--- a/ui/spice-core.c
+++ b/ui/spice-core.c
@@ -548,6 +548,10 @@ static SpiceInfo *qmp_query_spice_real(Error **errp)
+
+#endif /* QIO_CHANNEL_SAVEVM_ASYNC_H */
diff --git a/migration/meson.build b/migration/meson.build
-index 92b1cc4297..0e689eac09 100644
+index 1eeb915ff6..95d1cf2250 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -13,6 +13,7 @@ system_ss.add(files(
include/migration/snapshot.h | 2 +
include/monitor/hmp.h | 3 +
migration/meson.build | 1 +
- migration/savevm-async.c | 534 +++++++++++++++++++++++++++++++++++
+ migration/savevm-async.c | 531 +++++++++++++++++++++++++++++++++++
monitor/hmp-cmds.c | 38 +++
qapi/migration.json | 34 +++
- qapi/misc.json | 16 ++
+ qapi/misc.json | 18 ++
qemu-options.hx | 12 +
system/vl.c | 10 +
- 11 files changed, 680 insertions(+)
+ 11 files changed, 679 insertions(+)
create mode 100644 migration/savevm-async.c
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
-index f5b37eb74a..10fdd822e0 100644
+index ad1b1306e3..d5ab880492 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -525,6 +525,19 @@ SRST
.name = "balloon",
.args_type = "",
diff --git a/hmp-commands.hx b/hmp-commands.hx
-index 765349ed14..893c3bd240 100644
+index 2e2a3bcf98..7506de251c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
-@@ -1875,3 +1875,20 @@ SRST
+@@ -1862,3 +1862,20 @@ SRST
List event channels in the guest
ERST
#endif
+ .coroutine = true,
+ },
diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h
-index e72083b117..c846d37806 100644
+index 9e4dcaaa75..2581730d74 100644
--- a/include/migration/snapshot.h
+++ b/include/migration/snapshot.h
-@@ -61,4 +61,6 @@ bool delete_snapshot(const char *name,
- bool has_devices, strList *devices,
- Error **errp);
+@@ -68,4 +68,6 @@ bool delete_snapshot(const char *name,
+ */
+ void load_snapshot_resume(RunState state);
+int load_snapshot_from_blockdev(const char *filename, Error **errp);
+
void coroutine_fn hmp_screendump(Monitor *mon, const QDict *qdict);
void hmp_chardev_add(Monitor *mon, const QDict *qdict);
diff --git a/migration/meson.build b/migration/meson.build
-index 0e689eac09..8f9d122187 100644
+index 95d1cf2250..800f12a60d 100644
--- a/migration/meson.build
+++ b/migration/meson.build
-@@ -27,6 +27,7 @@ system_ss.add(files(
+@@ -28,6 +28,7 @@ system_ss.add(files(
'options.c',
'postcopy-ram.c',
'savevm.c',
'threadinfo.c',
diff --git a/migration/savevm-async.c b/migration/savevm-async.c
new file mode 100644
-index 0000000000..8f63c4c637
+index 0000000000..779e4e2a78
--- /dev/null
+++ b/migration/savevm-async.c
-@@ -0,0 +1,534 @@
+@@ -0,0 +1,531 @@
+#include "qemu/osdep.h"
+#include "migration/channel-savevm-async.h"
+#include "migration/migration.h"
+static void process_savevm_finalize(void *opaque)
+{
+ int ret;
-+ AioContext *iohandler_ctx = iohandler_get_aio_context();
+ MigrationState *ms = migrate_get_current();
+
+ bool aborted = savevm_aborted();
+ * so move it back. It can stay in the main context and live out its life
+ * there, since we're done with it after this method ends anyway.
+ */
-+ aio_context_acquire(iohandler_ctx);
+ blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
-+ aio_context_release(iohandler_ctx);
+
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret < 0) {
+ * lock. Similar to what is done in migration.c, call the exact variant
+ * only once pend_precopy in the estimate is below the threshold.
+ */
-+ qemu_mutex_unlock_iothread();
++ bql_unlock();
+ qemu_savevm_state_pending_estimate(&pend_precopy, &pend_postcopy);
+ if (pend_precopy <= threshold) {
+ qemu_savevm_state_pending_exact(&pend_precopy, &pend_postcopy);
+ }
-+ qemu_mutex_lock_iothread();
++ bql_lock();
+ pending_size = pend_precopy + pend_postcopy;
+
+ /*
+ return;
+ }
+
-+ if (migration_is_running(ms->state)) {
++ if (migration_is_running()) {
+ error_set(errp, ERROR_CLASS_GENERIC_ERROR, QERR_MIGRATION_ACTIVE);
+ return;
+ }
+ }
+}
diff --git a/qapi/migration.json b/qapi/migration.json
-index 197d3faa43..b41465fbe9 100644
+index 8c65b90328..ed20d066cd 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
-@@ -298,6 +298,40 @@
+@@ -297,6 +297,40 @@
'*dirty-limit-throttle-time-per-round': 'uint64',
'*dirty-limit-ring-full-time': 'uint64'} }
# @query-migrate:
#
diff --git a/qapi/misc.json b/qapi/misc.json
-index cda2effa81..94a58bb0bf 100644
+index ec30e5c570..7147199a12 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
-@@ -456,6 +456,22 @@
+@@ -454,6 +454,24 @@
##
{ 'command': 'query-fdsets', 'returns': ['FdsetInfo'] }
+#
+# Prepare for snapshot and halt VM. Save VM state to statefile.
+#
++# @statefile: target file that state should be written to.
++#
+##
+{ 'command': 'savevm-start', 'data': { '*statefile': 'str' } }
+
# @CommandLineParameterType:
#
diff --git a/qemu-options.hx b/qemu-options.hx
-index b6b4ad9e67..881b0b3c43 100644
+index 8ce85d4559..511ab9415e 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
-@@ -4590,6 +4590,18 @@ SRST
+@@ -4610,6 +4610,18 @@ SRST
Start right away with a saved state (``loadvm`` in monitor)
ERST
DEF("daemonize", 0, QEMU_OPTION_daemonize, \
"-daemonize daemonize QEMU after initializing\n", QEMU_ARCH_ALL)
diff --git a/system/vl.c b/system/vl.c
-index d2a3b3f457..57f7ba0525 100644
+index c644222982..2738ab7c91 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -163,6 +163,7 @@ static const char *accelerators;
static QTAILQ_HEAD(, ObjectOption) object_opts = QTAILQ_HEAD_INITIALIZER(object_opts);
static QTAILQ_HEAD(, DeviceOption) device_opts = QTAILQ_HEAD_INITIALIZER(device_opts);
static int display_remote;
-@@ -2715,6 +2716,12 @@ void qmp_x_exit_preconfig(Error **errp)
-
- if (loadvm) {
+@@ -2712,6 +2713,12 @@ void qmp_x_exit_preconfig(Error **errp)
+ RunState state = autostart ? RUN_STATE_RUNNING : runstate_get();
load_snapshot(loadvm, NULL, false, NULL, &error_fatal);
+ load_snapshot_resume(state);
+ } else if (loadstate) {
+ Error *local_err = NULL;
+ if (load_snapshot_from_blockdev(loadstate, &local_err) < 0) {
}
if (replay_mode != REPLAY_MODE_NONE) {
replay_vmstate_init();
-@@ -3265,6 +3272,9 @@ void qemu_init(int argc, char **argv)
+@@ -3259,6 +3266,9 @@ void qemu_init(int argc, char **argv)
case QEMU_OPTION_loadvm:
loadvm = optarg;
break;
3 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
-index 94231ff295..afda98292f 100644
+index a10882d47f..19c1de0472 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
-@@ -34,8 +34,8 @@
- #include "qapi/error.h"
+@@ -35,8 +35,8 @@
#include "rdma.h"
+ #include "io/channel-file.h"
-#define IO_BUF_SIZE 32768
-#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64)
struct QEMUFile {
QIOChannel *ioc;
-@@ -43,7 +43,8 @@ struct QEMUFile {
+@@ -44,7 +44,8 @@ struct QEMUFile {
int buf_index;
int buf_size; /* 0 when writing */
DECLARE_BITMAP(may_free, MAX_IOV_SIZE);
struct iovec iov[MAX_IOV_SIZE];
-@@ -97,7 +98,9 @@ int qemu_file_shutdown(QEMUFile *f)
+@@ -101,7 +102,9 @@ int qemu_file_shutdown(QEMUFile *f)
return 0;
}
{
QEMUFile *f;
-@@ -106,6 +109,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
+@@ -110,6 +113,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
object_ref(ioc);
f->ioc = ioc;
f->is_writable = is_writable;
return f;
}
-@@ -116,17 +121,27 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
+@@ -120,17 +125,27 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
*/
QEMUFile *qemu_file_get_return_path(QEMUFile *f)
{
}
/*
-@@ -320,7 +335,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
+@@ -328,7 +343,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
do {
len = qio_channel_read(f->ioc,
(char *)f->buf + pending,
&local_error);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
-@@ -360,6 +375,9 @@ int qemu_fclose(QEMUFile *f)
+@@ -368,6 +383,9 @@ int qemu_fclose(QEMUFile *f)
ret = ret2;
}
g_clear_pointer(&f->ioc, object_unref);
error_free(f->last_error_obj);
g_free(f);
trace_qemu_file_fclose();
-@@ -408,7 +426,7 @@ static void add_buf_to_iovec(QEMUFile *f, size_t len)
+@@ -416,7 +434,7 @@ static void add_buf_to_iovec(QEMUFile *f, size_t len)
{
if (!add_to_iovec(f, f->buf + f->buf_index, len, false)) {
f->buf_index += len;
qemu_fflush(f);
}
}
-@@ -433,7 +451,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
+@@ -441,7 +459,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
}
while (size > 0) {
if (l > size) {
l = size;
}
-@@ -478,8 +496,8 @@ size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t si
+@@ -587,8 +605,8 @@ size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t si
size_t index;
assert(!qemu_file_is_writable(f));
/* The 1st byte to read from */
index = f->buf_index + offset;
-@@ -529,7 +547,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
+@@ -638,7 +656,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
size_t res;
uint8_t *src;
if (res == 0) {
return done;
}
-@@ -563,7 +581,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
+@@ -672,7 +690,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
*/
size_t coroutine_mixed_fn qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
{
size_t res;
uint8_t *src = NULL;
-@@ -588,7 +606,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset)
+@@ -697,7 +715,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset)
int index = f->buf_index + offset;
assert(!qemu_file_is_writable(f));
if (index >= f->buf_size) {
qemu_fill_buffer(f);
-@@ -702,7 +720,7 @@ static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
+@@ -811,7 +829,7 @@ static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
const uint8_t *p, size_t size)
{
if (blen < compressBound(size)) {
return -1;
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
-index 8aec9fabf7..5d0b18c51c 100644
+index 32fd4a34fd..36a0cd8cc8 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -30,7 +30,9 @@
/*
diff --git a/migration/savevm-async.c b/migration/savevm-async.c
-index 8f63c4c637..f8d1c2f2b1 100644
+index 779e4e2a78..bf36fc06d2 100644
--- a/migration/savevm-async.c
+++ b/migration/savevm-async.c
-@@ -382,7 +382,7 @@ void qmp_savevm_start(const char *statefile, Error **errp)
+@@ -379,7 +379,7 @@ void qmp_savevm_start(const char *statefile, Error **errp)
QIOChannel *ioc = QIO_CHANNEL(qio_channel_savevm_async_new(snap_state.target,
&snap_state.bs_pos));
if (!snap_state.file) {
error_set(errp, ERROR_CLASS_GENERIC_ERROR, "failed to open '%s'", statefile);
-@@ -499,7 +499,8 @@ int load_snapshot_from_blockdev(const char *filename, Error **errp)
+@@ -496,7 +496,8 @@ int load_snapshot_from_blockdev(const char *filename, Error **errp)
blk_op_block_all(be, blocker);
/* restore the VM state */
create mode 100644 block/zeroinit.c
diff --git a/block/meson.build b/block/meson.build
-index 59ff6d380c..8ded0dc18b 100644
+index e1f03fd773..b530e117b5 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -39,6 +39,7 @@ block_ss.add(files(
system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
diff --git a/block/zeroinit.c b/block/zeroinit.c
new file mode 100644
-index 0000000000..1f2032bf99
+index 0000000000..696558d8d6
--- /dev/null
+++ b/block/zeroinit.c
@@ -0,0 +1,214 @@
+ &child_of_bds,
+ BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false,
+ &local_err);
-+ bdrv_graph_wrlock(bs);
++ bdrv_graph_wrlock();
+ bs->file = file;
-+ bdrv_graph_wrunlock(bs);
++ bdrv_graph_wrunlock();
+ if (local_err) {
+ ret = -EINVAL;
+ error_propagate(errp, local_err);
+ ret = 0;
+fail:
+ if (ret < 0) {
-+ bdrv_graph_wrlock(bs);
++ bdrv_graph_wrlock();
+ bdrv_unref_child(bs, bs->file);
-+ bdrv_graph_wrunlock(bs);
++ bdrv_graph_wrunlock();
+ }
+ qemu_opts_del(opts);
+ return ret;
2 files changed, 11 insertions(+)
diff --git a/qemu-options.hx b/qemu-options.hx
-index 881b0b3c43..c17374916c 100644
+index 511ab9415e..92e301d545 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
-@@ -1218,6 +1218,9 @@ legacy PC, they are not recommended for modern configurations.
+@@ -1237,6 +1237,9 @@ legacy PC, they are not recommended for modern configurations.
ERST
"-fda/-fdb file use 'file' as floppy disk 0/1 image\n", QEMU_ARCH_ALL)
DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "", QEMU_ARCH_ALL)
diff --git a/system/vl.c b/system/vl.c
-index 57f7ba0525..56d715c818 100644
+index 2738ab7c91..20ebf2c920 100644
--- a/system/vl.c
+++ b/system/vl.c
-@@ -2751,6 +2751,7 @@ void qemu_init(int argc, char **argv)
+@@ -2748,6 +2748,7 @@ void qemu_init(int argc, char **argv)
MachineClass *machine_class;
bool userconfig = true;
FILE *vmstate_dump_file = NULL;
qemu_add_opts(&qemu_drive_opts);
qemu_add_drive_opts(&qemu_legacy_drive_opts);
-@@ -3377,6 +3378,13 @@ void qemu_init(int argc, char **argv)
+@@ -3371,6 +3372,13 @@ void qemu_init(int argc, char **argv)
machine_parse_property_opt(qemu_find_opts("smp-opts"),
"smp", optarg);
break;
1 file changed, 9 insertions(+)
diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
-index bccb4241c2..bd0db7567a 100644
+index d8fc1e2815..789694b8b3 100644
--- a/hw/intc/apic_common.c
+++ b/hw/intc/apic_common.c
-@@ -251,6 +251,15 @@ static void apic_reset_common(DeviceState *dev)
+@@ -263,6 +263,15 @@ static void apic_reset_common(DeviceState *dev)
info->vapic_base_update(s);
apic_init_reset(dev);
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
block/file-posix.c | 59 ++++++++++++++++++++++++++++++--------------
- qapi/block-core.json | 3 ++-
- 2 files changed, 42 insertions(+), 20 deletions(-)
+ qapi/block-core.json | 7 +++++-
+ 2 files changed, 46 insertions(+), 20 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
-index bc09aefe3b..36a53d8682 100644
+index 43bc0bd520..60e98c87f1 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
-@@ -2873,6 +2873,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2876,6 +2876,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
int fd;
uint64_t perm, shared;
int result = 0;
/* Validate options and set default values */
assert(options->driver == BLOCKDEV_DRIVER_FILE);
-@@ -2913,19 +2914,22 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2916,19 +2917,22 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
}
/* Clear the file by truncating it to 0 */
-@@ -2979,13 +2983,15 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2982,13 +2986,15 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
}
out_unlock:
}
out_close:
-@@ -3009,6 +3015,7 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3012,6 +3018,7 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
PreallocMode prealloc;
char *buf = NULL;
Error *local_err = NULL;
/* Skip file: protocol prefix */
strstart(filename, "file:", &filename);
-@@ -3031,6 +3038,18 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3034,6 +3041,18 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
return -EINVAL;
}
options = (BlockdevCreateOptions) {
.driver = BLOCKDEV_DRIVER_FILE,
.u.file = {
-@@ -3042,6 +3061,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3045,6 +3064,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
.nocow = nocow,
.has_extent_size_hint = has_extent_size_hint,
.extent_size_hint = extent_size_hint,
};
return raw_co_create(&options, errp);
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 8db0986e9e..299e3fc350 100644
+index 45ab548dfe..f7c2b63c5d 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -4935,7 +4935,8 @@
+@@ -4956,6 +4956,10 @@
+ # @extent-size-hint: Extent size hint to add to the image file; 0 for
+ # not adding an extent size hint (default: 1 MB, since 5.1)
+ #
++# @locking: whether to enable file locking. If set to 'auto', only
++# enable when Open File Descriptor (OFD) locking API is available
++# (default: auto).
++#
+ # Since: 2.12
+ ##
+ { 'struct': 'BlockdevCreateOptionsFile',
+@@ -4963,7 +4967,8 @@
'size': 'size',
'*preallocation': 'PreallocMode',
'*nocow': 'bool',
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/hw/core/machine.c b/hw/core/machine.c
-index 0c17398141..36621d58a7 100644
+index 37ede0e7d4..513e49bab1 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
-@@ -152,7 +152,8 @@ GlobalProperty hw_compat_4_0[] = {
+@@ -161,7 +161,8 @@ GlobalProperty hw_compat_4_0[] = {
{ "virtio-vga", "edid", "false" },
{ "virtio-gpu-device", "edid", "false" },
{ "virtio-device", "use-started", "false" },
4 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
-index 40821e2317..ee93ddd69a 100644
+index 314351cdff..628a3537c5 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
-@@ -95,6 +95,11 @@ MachineInfoList *qmp_query_machines(Error **errp)
+@@ -94,6 +94,11 @@ MachineInfoList *qmp_query_machines(Error **errp)
if (strcmp(mc->name, MACHINE_GET_CLASS(current_machine)->name) == 0) {
info->has_is_current = true;
info->is_current = true;
if (mc->default_cpu_type) {
diff --git a/include/hw/boards.h b/include/hw/boards.h
-index da85f86efb..1aa0987020 100644
+index 8b8f6d5c00..dd6d0a1447 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
-@@ -240,6 +240,8 @@ struct MachineClass {
+@@ -246,6 +246,8 @@ struct MachineClass {
const char *desc;
const char *deprecation_reason;
void (*reset)(MachineState *state, ShutdownCause reason);
void (*wakeup)(MachineState *state);
diff --git a/qapi/machine.json b/qapi/machine.json
-index 297ad0e0e5..a9fd40d844 100644
+index a024d5b05d..1d69bffaa0 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -168,6 +168,8 @@
##
# @query-machines:
diff --git a/system/vl.c b/system/vl.c
-index 56d715c818..87f03e61a1 100644
+index 20ebf2c920..4d39e32097 100644
--- a/system/vl.c
+++ b/system/vl.c
-@@ -1660,6 +1660,7 @@ static const QEMUOption *lookup_opt(int argc, char **argv,
+@@ -1659,6 +1659,7 @@ static const QEMUOption *lookup_opt(int argc, char **argv,
static MachineClass *select_machine(QDict *qdict, Error **errp)
{
const char *machine_type = qdict_get_try_str(qdict, "type");
GSList *machines = object_class_get_list(TYPE_MACHINE, false);
MachineClass *machine_class;
Error *local_err = NULL;
-@@ -1677,6 +1678,11 @@ static MachineClass *select_machine(QDict *qdict, Error **errp)
+@@ -1676,6 +1677,11 @@ static MachineClass *select_machine(QDict *qdict, Error **errp)
}
}
g_slist_free(machines);
if (local_err) {
error_append_hint(&local_err, "Use -machine help to list supported machines\n");
-@@ -3319,12 +3325,31 @@ void qemu_init(int argc, char **argv)
+@@ -3313,12 +3319,31 @@ void qemu_init(int argc, char **argv)
case QEMU_OPTION_machine:
{
bool help;
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/block/backup.c b/block/backup.c
-index 8aae5836d7..2516eac5a7 100644
+index ec29d6b810..270957c0cd 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -237,8 +237,8 @@ static void backup_init_bcs_bitmap(BackupBlockJob *job)
int64_t count;
@@ -501,6 +499,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
&error_abort);
- bdrv_graph_wrunlock(target);
+ bdrv_graph_wrunlock();
+ backup_init_bcs_bitmap(job);
+
create mode 100644 vma.h
diff --git a/block/meson.build b/block/meson.build
-index 8ded0dc18b..e709b67d37 100644
+index b530e117b5..b245daa98e 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -42,6 +42,8 @@ block_ss.add(files(
system_ss.add(files('block-ram-registrar.c'))
diff --git a/meson.build b/meson.build
-index 6c77d9687d..8cb1ccd5e1 100644
+index 91a0aa64c6..620cc594b2 100644
--- a/meson.build
+++ b/meson.build
-@@ -1802,6 +1802,8 @@ endif
+@@ -1922,6 +1922,8 @@ endif
has_gettid = cc.has_function('gettid')
# libselinux
selinux = dependency('libselinux',
required: get_option('selinux'),
-@@ -3975,6 +3977,9 @@ if have_tools
+@@ -4023,6 +4025,9 @@ if have_tools
dependencies: [blockdev, qemuutil, gnutls, selinux],
install: true)
+ dependencies: [authz, block, crypto, io, qom], install: true)
+
subdir('storage-daemon')
- subdir('contrib/rdmacm-mux')
- subdir('contrib/elf2dmp')
+
+ foreach exe: [ 'qemu-img', 'qemu-io', 'qemu-nbd', 'qemu-storage-daemon']
diff --git a/vma-reader.c b/vma-reader.c
new file mode 100644
index 0000000000..d0b6721812
+ return bs;
+}
diff --git a/block/backup.c b/block/backup.c
-index 2516eac5a7..aec140e0c8 100644
+index 270957c0cd..16d611c4ca 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -29,28 +29,6 @@
if (perf->max_chunk && perf->max_chunk < cluster_size) {
error_setg(errp, "Required max-chunk (%" PRIi64 ") is less than backup "
diff --git a/block/meson.build b/block/meson.build
-index e709b67d37..f7d1b7ac42 100644
+index b245daa98e..e99914eaa4 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -4,6 +4,7 @@ block_ss.add(files(
'blklogwrites.c',
'blkverify.c',
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
-index 4e31d161c5..2fc6c37bc9 100644
+index 761276127e..b3e6697613 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -26,6 +26,7 @@
BDRV_TRACKED_READ,
BDRV_TRACKED_WRITE,
diff --git a/job.c b/job.c
-index 99a2e54b54..d7a18b8e6a 100644
+index 660ce22c56..baf54c8d60 100644
--- a/job.c
+++ b/job.c
@@ -331,7 +331,8 @@ static bool job_started_locked(Job *job)
2 files changed, 46 insertions(+)
diff --git a/include/qemu/job.h b/include/qemu/job.h
-index b4bc2e174b..4586dc2d3c 100644
+index 2b873f2576..528cd6acb9 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
-@@ -381,6 +381,18 @@ void job_unlock(void);
+@@ -362,6 +362,18 @@ void job_unlock(void);
*/
JobTxn *job_txn_new(void);
* Release a reference that was previously acquired with job_txn_add_job or
* job_txn_new. If it's the last reference to the object, it will be freed.
diff --git a/job.c b/job.c
-index d7a18b8e6a..868df1b4ce 100644
+index baf54c8d60..3ac5e5cde2 100644
--- a/job.c
+++ b/job.c
@@ -94,6 +94,8 @@ struct JobTxn {
/* Called with job_mutex held. */
static void job_txn_ref_locked(JobTxn *txn)
{
-@@ -1058,6 +1079,12 @@ static void job_completed_txn_success_locked(Job *job)
+@@ -1042,6 +1063,12 @@ static void job_completed_txn_success_locked(Job *job)
*/
QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
if (!job_is_completed_locked(other_job)) {
return;
}
assert(other_job->ret == 0);
-@@ -1269,6 +1296,13 @@ int job_finish_sync_locked(Job *job,
+@@ -1253,6 +1280,13 @@ int job_finish_sync_locked(Job *job,
return -EBUSY;
}
monitor/hmp-cmds.c | 72 +++
proxmox-backup-client.c | 146 +++++
proxmox-backup-client.h | 60 ++
- pve-backup.c | 1103 ++++++++++++++++++++++++++++++++
- qapi/block-core.json | 229 +++++++
+ pve-backup.c | 1098 ++++++++++++++++++++++++++++++++
+ qapi/block-core.json | 233 +++++++
qapi/common.json | 14 +
qapi/machine.json | 16 +-
- 14 files changed, 1718 insertions(+), 14 deletions(-)
+ 14 files changed, 1717 insertions(+), 14 deletions(-)
create mode 100644 proxmox-backup-client.c
create mode 100644 proxmox-backup-client.h
create mode 100644 pve-backup.c
diff --git a/block/meson.build b/block/meson.build
-index f7d1b7ac42..9df99aceb5 100644
+index e99914eaa4..6bba803f94 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -44,6 +44,11 @@ block_ss.add(files(
system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
system_ss.add(files('block-ram-registrar.c'))
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
-index c729cbf1eb..1656859e03 100644
+index d954bec6f1..5000c084c5 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
-@@ -1037,3 +1037,42 @@ void hmp_change_medium(Monitor *mon, const char *device, const char *target,
+@@ -1008,3 +1008,42 @@ void hmp_change_medium(Monitor *mon, const char *device, const char *target,
qmp_blockdev_change_medium(device, NULL, target, arg, true, force,
!!read_only, read_only_mode, errp);
}
+ hmp_handle_error(mon, error);
+}
diff --git a/blockdev.c b/blockdev.c
-index 38a40e3e32..3049811be8 100644
+index d27d8c38ec..5e5dbc1da9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -37,6 +37,7 @@
#include "monitor/monitor.h"
#include "qemu/error-report.h"
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
-index 10fdd822e0..15937793c1 100644
+index d5ab880492..6c97248d1b 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -471,6 +471,20 @@ SRST
{
.name = "usernet",
diff --git a/hmp-commands.hx b/hmp-commands.hx
-index 893c3bd240..5c1ffbc602 100644
+index 7506de251c..d5f9c28194 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -101,6 +101,35 @@ ERST
void hmp_device_add(Monitor *mon, const QDict *qdict);
void hmp_device_del(Monitor *mon, const QDict *qdict);
diff --git a/meson.build b/meson.build
-index 8cb1ccd5e1..955f579308 100644
+index 620cc594b2..d16b97cf3c 100644
--- a/meson.build
+++ b/meson.build
-@@ -1803,6 +1803,7 @@ endif
+@@ -1923,6 +1923,7 @@ endif
has_gettid = cc.has_function('gettid')
libuuid = cc.find_library('uuid', required: true)
+#endif /* PROXMOX_BACKUP_CLIENT_H */
diff --git a/pve-backup.c b/pve-backup.c
new file mode 100644
-index 0000000000..903afcd7e9
+index 0000000000..9c13a92623
--- /dev/null
+++ b/pve-backup.c
-@@ -0,0 +1,1103 @@
+@@ -0,0 +1,1098 @@
+#include "proxmox-backup-client.h"
+#include "vma.h"
+
+ sync_mode = MIRROR_SYNC_MODE_BITMAP;
+ bitmap_mode = BITMAP_SYNC_MODE_ON_SUCCESS;
+ }
-+ AioContext *aio_context = bdrv_get_aio_context(di->bs);
-+ aio_context_acquire(aio_context);
-+
+ bdrv_drained_begin(di->bs);
+
+ BlockJob *job = backup_job_create(
+
+ bdrv_drained_end(di->bs);
+
-+ aio_context_release(aio_context);
-+
+ di->job = job;
+ if (job) {
+ WITH_JOB_LOCK_GUARD() {
+ return ret;
+}
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 299e3fc350..c155d74230 100644
+index f7c2b63c5d..e49c7b5bc9 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -841,6 +841,235 @@
+@@ -851,6 +851,239 @@
{ 'command': 'query-block', 'returns': ['BlockInfo'],
'allow-preconfig': true }
+# @config-file: a configuration file to include into
+# the backup archive.
+#
++# @firewall-file: a firewall configuration file to include into the backup
++# archive.
++#
+# @speed: the maximum speed, in bytes per second
+#
+# @devlist: list of block device names (separated by ',', ';'
+#
+# Cancel the currently executing backup process.
+#
-+# Returns: nothing on success
-+#
+# Notes: This command succeeds even if there is no backup process running.
+#
+##
+#
+# @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
+#
++# @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
++# supported or not.
++#
+##
+{ 'struct': 'ProxmoxSupportStatus',
+ 'data': { 'pbs-dirty-bitmap': 'bool',
# @BlockDeviceTimedStats:
#
diff --git a/qapi/common.json b/qapi/common.json
-index 6fed9cde1a..630a2a8f9a 100644
+index 7558ce5430..6e3d800373 100644
--- a/qapi/common.json
+++ b/qapi/common.json
-@@ -207,3 +207,17 @@
+@@ -200,3 +200,17 @@
##
{ 'struct': 'HumanReadableText',
'data': { 'human-readable-text': 'str' } }
+##
+{ 'struct': 'UuidInfo', 'data': {'UUID': 'str'} }
diff --git a/qapi/machine.json b/qapi/machine.json
-index a9fd40d844..d97f024173 100644
+index 1d69bffaa0..731d8d2f60 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -4,6 +4,8 @@
create mode 100644 pbs-restore.c
diff --git a/meson.build b/meson.build
-index 955f579308..c70f2a4937 100644
+index d16b97cf3c..6de51c34cb 100644
--- a/meson.build
+++ b/meson.build
-@@ -3981,6 +3981,10 @@ if have_tools
+@@ -4029,6 +4029,10 @@ if have_tools
vma = executable('vma', files('vma.c', 'vma-reader.c') + genh,
dependencies: [authz, block, crypto, io, qom], install: true)
+ libproxmox_backup_qemu], install: true)
+
subdir('storage-daemon')
- subdir('contrib/rdmacm-mux')
- subdir('contrib/elf2dmp')
+
+ foreach exe: [ 'qemu-img', 'qemu-io', 'qemu-nbd', 'qemu-storage-daemon']
diff --git a/pbs-restore.c b/pbs-restore.c
new file mode 100644
index 0000000000..f03d9bab8d
block/meson.build | 2 +
block/pbs.c | 307 +++++++++++++++++++++++++++++++++++++++++++
meson.build | 2 +-
- qapi/block-core.json | 13 ++
+ qapi/block-core.json | 29 ++++
qapi/pragma.json | 1 +
- 5 files changed, 324 insertions(+), 1 deletion(-)
+ 5 files changed, 340 insertions(+), 1 deletion(-)
create mode 100644 block/pbs.c
diff --git a/block/meson.build b/block/meson.build
-index 9df99aceb5..549c0c7103 100644
+index 6bba803f94..1945e04eeb 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -49,6 +49,8 @@ block_ss.add(files(
+
+block_init(bdrv_pbs_init);
diff --git a/meson.build b/meson.build
-index c70f2a4937..18c3a312eb 100644
+index 6de51c34cb..3bc039f60f 100644
--- a/meson.build
+++ b/meson.build
-@@ -4390,7 +4390,7 @@ summary_info += {'bzip2 support': libbzip2}
+@@ -4477,7 +4477,7 @@ summary_info += {'bzip2 support': libbzip2}
summary_info += {'lzfse support': liblzfse}
summary_info += {'zstd support': zstd}
summary_info += {'NUMA host support': numa}
summary_info += {'libdaxctl support': libdaxctl}
summary_info += {'libudev': libudev}
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index c155d74230..a4050268ca 100644
+index e49c7b5bc9..fc32ff9957 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -3451,6 +3451,7 @@
+@@ -3457,6 +3457,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'ssh', 'throttle', 'vdi', 'vhdx',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
-@@ -3537,6 +3538,17 @@
+@@ -3543,6 +3544,33 @@
{ 'struct': 'BlockdevOptionsNull',
'data': { '*size': 'int', '*latency-ns': 'uint64', '*read-zeroes': 'bool' } }
+#
+# Driver specific block device options for the PBS backend.
+#
++# @repository: Proxmox Backup Server repository.
++#
++# @snapshot: backup snapshots ID.
++#
++# @archive: archive name.
++#
++# @keyfile: keyfile to use for encryption.
++#
++# @password: password to use for connection.
++#
++# @fingerprint: backup server fingerprint.
++#
++# @key_password: password to unlock key.
++#
++# @namespace: namespace where backup snapshot lives.
++#
+##
+{ 'struct': 'BlockdevOptionsPbs',
+ 'data': { 'repository': 'str', 'snapshot': 'str', 'archive': 'str',
##
# @BlockdevOptionsNVMe:
#
-@@ -4945,6 +4957,7 @@
+@@ -4977,6 +5005,7 @@
'nfs': 'BlockdevOptionsNfs',
'null-aio': 'BlockdevOptionsNull',
'null-co': 'BlockdevOptionsNull',
'nvme-io_uring': { 'type': 'BlockdevOptionsNvmeIoUring',
'if': 'CONFIG_BLKIO' },
diff --git a/qapi/pragma.json b/qapi/pragma.json
-index eae9f54700..03467983b3 100644
+index be8fa304c5..7ff46bd128 100644
--- a/qapi/pragma.json
+++ b/qapi/pragma.json
-@@ -45,6 +45,7 @@
+@@ -100,6 +100,7 @@
'BlockInfo', # query-block
'BlockdevAioOptions', # blockdev-add, -blockdev
'BlockdevDriver', # blockdev-add, query-blockstats, ...
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
- meson.build | 2 ++
+ meson.build | 3 ++-
os-posix.c | 7 +++++--
- 2 files changed, 7 insertions(+), 2 deletions(-)
+ 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/meson.build b/meson.build
-index 18c3a312eb..b598a0450b 100644
+index 3bc039f60f..067e8956a7 100644
--- a/meson.build
+++ b/meson.build
-@@ -1803,6 +1803,7 @@ endif
+@@ -1923,6 +1923,7 @@ endif
has_gettid = cc.has_function('gettid')
libuuid = cc.find_library('uuid', required: true)
libproxmox_backup_qemu = cc.find_library('proxmox_backup_qemu', required: true)
# libselinux
-@@ -3468,6 +3469,7 @@ if have_block
- # os-posix.c contains POSIX-specific functions used by qemu-storage-daemon,
- # os-win32.c does not
- blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c'))
-+ blockdev_ss.add(when: 'CONFIG_POSIX', if_true: libsystemd)
- system_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')])
+@@ -3530,7 +3531,7 @@ if have_block
+ if host_os == 'windows'
+ system_ss.add(files('os-win32.c'))
+ else
+- blockdev_ss.add(files('os-posix.c'))
++ blockdev_ss.add(files('os-posix.c'), libsystemd)
+ endif
endif
diff --git a/os-posix.c b/os-posix.c
create mode 100644 migration/pbs-state.c
diff --git a/include/migration/misc.h b/include/migration/misc.h
-index 1bc8902e6d..43eb0e46d6 100644
+index c9e200f4eb..12c99ebc69 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
-@@ -80,4 +80,7 @@ bool migration_in_bg_snapshot(void);
+@@ -117,4 +117,7 @@ bool migration_in_bg_snapshot(void);
/* migration/block-dirty-bitmap.c */
void dirty_bitmap_mig_init(void);
+
#endif
diff --git a/migration/meson.build b/migration/meson.build
-index 8f9d122187..2b9d2cc881 100644
+index 800f12a60d..35a4306183 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -7,7 +7,9 @@ migration_files = files(
system_ss.add(files(
'block-dirty-bitmap.c',
diff --git a/migration/migration.c b/migration/migration.c
-index 982ab85f04..b84924442d 100644
+index 86bf76e925..b8d7e471a4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
-@@ -202,6 +202,7 @@ void migration_object_init(void)
+@@ -239,6 +239,7 @@ void migration_object_init(void)
blk_mig_init();
ram_mig_init();
dirty_bitmap_mig_init();
+ pbs_state_mig_init();
}
- void migration_cancel(const Error *error)
+ typedef struct {
diff --git a/migration/pbs-state.c b/migration/pbs-state.c
new file mode 100644
index 0000000000..887e998b9e
+ NULL);
+}
diff --git a/pve-backup.c b/pve-backup.c
-index 903afcd7e9..777db7938e 100644
+index 9c13a92623..9d480a8eec 100644
--- a/pve-backup.c
+++ b/pve-backup.c
-@@ -1096,6 +1096,7 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+@@ -1091,6 +1091,7 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
ret->pbs_library_version = g_strdup(proxmox_backup_qemu_version());
ret->pbs_dirty_bitmap = true;
ret->pbs_dirty_bitmap_savevm = true;
ret->pbs_masterkey = true;
ret->backup_max_workers = true;
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index a4050268ca..7b977459fa 100644
+index fc32ff9957..f516d8e95a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -993,6 +993,11 @@
+@@ -1004,6 +1004,11 @@
# @pbs-dirty-bitmap-savevm: True if 'dirty-bitmaps' migration capability can
# safely be set for savevm-async.
#
# @pbs-masterkey: True if the QMP backup call supports the 'master_keyfile'
# parameter.
#
-@@ -1003,6 +1008,7 @@
+@@ -1017,6 +1022,7 @@
'data': { 'pbs-dirty-bitmap': 'bool',
'query-bitmap-info': 'bool',
'pbs-dirty-bitmap-savevm': 'bool',
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index 0070b13b6f..954be621ba 100644
+index 2708abf3d7..fb17c01308 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -540,7 +540,7 @@ static int add_bitmaps_to_list(DBMSaveState *s, BlockDriverState *bs,
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/stream.c b/block/stream.c
-index 01fe7c0f16..87a97cb8a2 100644
+index 7031eef12b..d2da83ae7c 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -27,7 +27,7 @@ enum {
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Mon, 7 Dec 2020 15:21:03 +0100
+Subject: [PATCH] block: add alloc-track driver
+
+Add a new filter node 'alloc-track', which separates reads and writes to
+different children, thus allowing to put a backing image behind any
+blockdev (regardless of driver support). Since we can't detect any
+pre-allocated blocks, we can only track new writes, hence the write
+target ('file') for this node must always be empty.
+
+Intended use case is for live restoring, i.e. add a backup image as a
+block device into a VM, then put an alloc-track on the restore target
+and set the backup as backing. With this, one can use a regular
+'block-stream' to restore the image, while the VM can already run in the
+background. Copy-on-read will help make progress as the VM reads as
+well.
+
+Previously this only worked if the target supports backing images, i.e.
+only for qcow2; with alloc-track, any driver for the target can be used.
+
+Replacing the node cannot be done in the
+track_co_change_backing_file() callback, because replacing a node
+cannot happen in a coroutine and requires the block graph lock
+exclusively. Could either become a special option for the stream job,
+or maybe the upcoming blockdev-replace QMP command can be used in the
+future.
+
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+[FE: adapt to changed function signatures
+ make error return value consistent with QEMU
+ avoid premature break during read
+ adhere to block graph lock requirements]
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 366 ++++++++++++++++++++++++++++++++++++++++++++
+ block/meson.build | 1 +
+ block/stream.c | 34 ++++
+ 3 files changed, 401 insertions(+)
+ create mode 100644 block/alloc-track.c
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+new file mode 100644
+index 0000000000..b9f8ea9137
+--- /dev/null
++++ b/block/alloc-track.c
+@@ -0,0 +1,366 @@
++/*
++ * Node to allow backing images to be applied to any node. Assumes a blank
++ * image to begin with, only new writes are tracked as allocated, thus this
++ * must never be put on a node that already contains data.
++ *
++ * Copyright (c) 2020 Proxmox Server Solutions GmbH
++ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#include "qemu/osdep.h"
++#include "qapi/error.h"
++#include "block/block_int.h"
++#include "block/dirty-bitmap.h"
++#include "block/graph-lock.h"
++#include "qapi/qmp/qdict.h"
++#include "qapi/qmp/qstring.h"
++#include "qemu/cutils.h"
++#include "qemu/error-report.h"
++#include "qemu/option.h"
++#include "qemu/module.h"
++#include "sysemu/block-backend.h"
++
++#define TRACK_OPT_AUTO_REMOVE "auto-remove"
++
++typedef enum DropState {
++ DropNone,
++ DropInProgress,
++} DropState;
++
++typedef struct {
++ BdrvDirtyBitmap *bitmap;
++ uint64_t granularity;
++ DropState drop_state;
++ bool auto_remove;
++} BDRVAllocTrackState;
++
++static QemuOptsList runtime_opts = {
++ .name = "alloc-track",
++ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
++ .desc = { /* options absorbed from the QDict in track_open() */
++ {
++ .name = TRACK_OPT_AUTO_REMOVE,
++ .type = QEMU_OPT_BOOL,
++ .help = "automatically replace this node with 'file' when 'backing'"
++ " is detached",
++ },
++ { /* end of list */ }
++ },
++};
++
++static void GRAPH_RDLOCK
++track_refresh_limits(BlockDriverState *bs, Error **errp)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++
++ if (!bs->file) {
++ return;
++ }
++
++ /*
++ * Always use alignment from underlying write device so RMW cycle for
++ * bdrv_pwritev reads data from our backing via track_co_preadv. Also use at
++ * least the bitmap granularity.
++ */
++ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
++ s->granularity);
++}
++
++static int track_open(BlockDriverState *bs, QDict *options, int flags,
++ Error **errp)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++ BdrvChild *file = NULL;
++ QemuOpts *opts;
++ Error *local_err = NULL;
++ int ret = 0;
++ /* Absorb driver options, open the 'file' child, then create the bitmap. */
++ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
++ qemu_opts_absorb_qdict(opts, options, &local_err);
++ if (local_err) {
++ error_propagate(errp, local_err);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
++
++ /* open the target (write) node, backing will be attached by block layer */
++ file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
++ BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
++ &local_err);
++ bdrv_graph_wrlock();
++ bs->file = file;
++ bdrv_graph_wrunlock();
++ if (local_err) {
++ ret = -EINVAL;
++ error_propagate(errp, local_err);
++ goto fail;
++ }
++
++ bdrv_graph_rdlock_main_loop();
++ BlockDriverInfo bdi = {0};
++ ret = bdrv_get_info(bs->file->bs, &bdi);
++ if (ret < 0) {
++ /*
++ * Not a hard failure. Worst that can happen is partial cluster
++ * allocation in the write target. However, the driver here returns its
++ * allocation status based on the dirty bitmap, so any other data that
++ * maps to such a cluster will still be copied later by a stream job (or
++ * during writes to that cluster).
++ */
++ warn_report("alloc-track: unable to query cluster size for write target: %s",
++ strerror(-ret)); /* ret is negative here, strerror() wants errno */
++ }
++ ret = 0;
++ /*
++ * Always consider alignment from underlying write device so RMW cycle for
++ * bdrv_pwritev reads data from our backing via track_co_preadv. Also try to
++ * avoid partial cluster allocation in the write target by considering the
++ * cluster size.
++ */
++ s->granularity = MAX(bs->file->bs->bl.request_alignment,
++ MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
++ track_refresh_limits(bs, errp);
++ s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, s->granularity, NULL,
++ &local_err);
++ bdrv_graph_rdunlock_main_loop();
++ if (local_err) {
++ ret = -EIO;
++ error_propagate(errp, local_err);
++ goto fail;
++ }
++
++ s->drop_state = DropNone;
++
++fail:
++ if (ret < 0) {
++ bdrv_graph_wrlock();
++ bdrv_unref_child(bs, bs->file);
++ bdrv_graph_wrunlock();
++ if (s->bitmap) {
++ bdrv_release_dirty_bitmap(s->bitmap);
++ }
++ }
++ qemu_opts_del(opts);
++ return ret;
++}
++
++static void track_close(BlockDriverState *bs)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++ if (s->bitmap) {
++ bdrv_release_dirty_bitmap(s->bitmap);
++ }
++}
++
++static coroutine_fn int64_t GRAPH_RDLOCK
++track_co_getlength(BlockDriverState *bs)
++{
++ return bdrv_co_getlength(bs->file->bs);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++ QEMUIOVector local_qiov;
++ int ret = 0; /* returned as-is when 'bytes' is 0 and the loop never runs */
++ /* Dirty runs are read from 'file', clean ones from 'backing' (or zeroed).
++ * 'cur_offset' is relative to 'offset', 'local_offset' to image start */
++ uint64_t cur_offset, local_offset;
++ int64_t local_bytes;
++ bool alloc;
++
++ if (offset < 0 || bytes < 0) {
++ fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
++ return -EIO;
++ }
++
++ /* a read request can span multiple granularity-sized chunks, and can thus
++ * contain blocks with different allocation status - we could just iterate
++ * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
++ * to find the next flip and consider everything up to that in one go */
++ for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
++ local_offset = offset + cur_offset;
++ alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
++ if (alloc) {
++ local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
++ bytes - cur_offset);
++ } else {
++ local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
++ bytes - cur_offset);
++ }
++
++ /* _bitmap_next_X return is -1 if no end found within limit, otherwise
++ * offset of next flip (to start of image) */
++ local_bytes = local_bytes < 0 ?
++ bytes - cur_offset :
++ local_bytes - local_offset;
++ /* local_qiov covers exactly this run of qiov: offsets into it start at 0 */
++ qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
++
++ if (alloc) {
++ ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
++ &local_qiov, flags);
++ } else if (bs->backing) {
++ ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
++ &local_qiov, flags);
++ } else {
++ qemu_iovec_memset(&local_qiov, 0, 0, local_bytes);
++ ret = 0;
++ }
++ qemu_iovec_destroy(&local_qiov); /* release the per-run slice */
++ if (ret != 0) {
++ break;
++ }
++ }
++
++ return ret;
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
++{
++ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ BdrvRequestFlags flags)
++{
++ return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
++{
++ return bdrv_co_pdiscard(bs->file, offset, bytes);
++}
++
++static coroutine_fn int GRAPH_RDLOCK
++track_co_flush(BlockDriverState *bs)
++{
++ return bdrv_co_flush(bs->file->bs);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_block_status(BlockDriverState *bs, bool want_zero,
++ int64_t offset,
++ int64_t bytes,
++ int64_t *pnum,
++ int64_t *map,
++ BlockDriverState **file)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++
++ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
++ int64_t next_flipped; /* offset where the bitmap state changes, -1 if none */
++ if (alloc) {
++ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
++ } else {
++ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
++ }
++
++ /* in case not the entire region has the same state, we need to set pnum to
++ * indicate for how many bytes our result is valid */
++ *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
++ *map = offset;
++
++ if (alloc) {
++ *file = bs->file->bs;
++ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
++ } else if (bs->backing) {
++ *file = bs->backing->bs;
++ }
++ return 0; /* bitmap bit clear: no status flags are reported */
++}
++
++static void GRAPH_RDLOCK
++track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
++ BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
++ uint64_t *nperm, uint64_t *nshared)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++
++ *nshared = BLK_PERM_ALL;
++
++ /* in case we're currently dropping ourselves, claim to not use any
++ * permissions at all - which is fine, since from this point on we will
++ * never issue a read or write anymore */
++ if (s->drop_state == DropInProgress) {
++ *nperm = 0;
++ return;
++ }
++
++ if (role & BDRV_CHILD_DATA) {
++ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
++ } else {
++ /* 'backing' is also a child of our BDS, but we don't expect it to be
++ * writeable, so we only forward 'consistent read' */
++ *nperm = perm & BLK_PERM_CONSISTENT_READ;
++ }
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
++ const char *backing_fmt)
++{
++ /*
++ * Note that the actual backing file graph change is already done in the
++ * stream job itself with bdrv_set_backing_hd_drained(), so no need to
++ * actually do anything here. But still needs to be implemented, to make
++ * our caller (i.e. bdrv_co_change_backing_file()) do the right thing.
++ *
++ * FIXME
++ * We'd like to auto-remove ourselves from the block graph, but it cannot
++ * be done from a coroutine. Currently done in the stream job, where it
++ * kinda fits better, but in the long-term, a special parameter would be
++ * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
++ */
++ if (backing_file == NULL) {
++ BDRVAllocTrackState *s = bs->opaque;
++ bdrv_drained_begin(bs);
++ s->drop_state = DropInProgress; /* track_child_perm() now requests no perms */
++ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
++ bdrv_drained_end(bs);
++ }
++
++ return 0;
++}
++
++static BlockDriver bdrv_alloc_track = {
++ .format_name = "alloc-track",
++ .instance_size = sizeof(BDRVAllocTrackState),
++
++ .bdrv_file_open = track_open,
++ .bdrv_close = track_close,
++ .bdrv_co_getlength = track_co_getlength,
++ .bdrv_child_perm = track_child_perm,
++ .bdrv_refresh_limits = track_refresh_limits,
++
++ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes,
++ .bdrv_co_pwritev = track_co_pwritev,
++ .bdrv_co_preadv = track_co_preadv,
++ .bdrv_co_pdiscard = track_co_pdiscard,
++
++ .bdrv_co_flush = track_co_flush,
++ .bdrv_co_flush_to_disk = track_co_flush,
++
++ .supports_backing = true,
++
++ .bdrv_co_block_status = track_co_block_status,
++ .bdrv_co_change_backing_file = track_co_change_backing_file,
++};
++
++static void bdrv_alloc_track_init(void)
++{
++ bdrv_register(&bdrv_alloc_track);
++}
++
++block_init(bdrv_alloc_track_init);
+diff --git a/block/meson.build b/block/meson.build
+index 1945e04eeb..2873f3a25a 100644
+--- a/block/meson.build
++++ b/block/meson.build
+@@ -2,6 +2,7 @@ block_ss.add(genh)
+ block_ss.add(files(
+ 'accounting.c',
+ 'aio_task.c',
++ 'alloc-track.c',
+ 'amend.c',
+ 'backup.c',
+ 'backup-dump.c',
+diff --git a/block/stream.c b/block/stream.c
+index d2da83ae7c..f941cba14e 100644
+--- a/block/stream.c
++++ b/block/stream.c
+@@ -120,6 +120,40 @@ static int stream_prepare(Job *job)
+ ret = -EPERM;
+ goto out;
+ }
++
++ /*
++ * This cannot be done in the co_change_backing_file callback, because
++ * bdrv_replace_node() cannot be done in a coroutine. The latter also
++ * requires the graph lock exclusively. Only required for the
++ * alloc-track driver.
++ *
++ * The long-term plan is to either have an explicit parameter for the
++ * stream job or use the upcoming blockdev-replace QMP command.
++ */
++ if (base_id == NULL && strcmp(unfiltered_bs->drv->format_name, "alloc-track") == 0) {
++ BlockDriverState *file_bs;
++
++ bdrv_graph_rdlock_main_loop();
++ file_bs = unfiltered_bs->file->bs;
++ bdrv_graph_rdunlock_main_loop();
++
++ bdrv_ref(unfiltered_bs); // unrefed by bdrv_replace_node()
++ bdrv_drained_begin(file_bs);
++ bdrv_graph_wrlock();
++
++ bdrv_replace_node(unfiltered_bs, file_bs, &local_err);
++
++ bdrv_graph_wrunlock();
++ bdrv_drained_end(file_bs);
++ bdrv_unref(unfiltered_bs);
++
++ if (local_err) {
++ error_prepend(&local_err, "failed to replace alloc-track node: ");
++ error_report_err(local_err);
++ ret = -EPERM;
++ goto out;
++ }
++ }
+ }
+
+ out:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Reiter <s.reiter@proxmox.com>
-Date: Tue, 2 Mar 2021 16:11:54 +0100
-Subject: [PATCH] block/io: accept NULL qiov in bdrv_pad_request
-
-Some operations, e.g. block-stream, perform reads while discarding the
-results (only copy-on-read matters). In this case they will pass NULL as
-the target QEMUIOVector, which will however trip bdrv_pad_request, since
-it wants to extend its passed vector.
-
-If there is no qiov, no operation can be done with it, but the bytes
-and offset still need to be updated, so the subsequent aligned read
-will actually be aligned and not run into an assertion failure.
-
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: do update bytes and offset in any case]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/io.c | 29 ++++++++++++++++-------------
- 1 file changed, 16 insertions(+), 13 deletions(-)
-
-diff --git a/block/io.c b/block/io.c
-index d202987770..d42c811bd7 100644
---- a/block/io.c
-+++ b/block/io.c
-@@ -1756,22 +1756,25 @@ static int bdrv_pad_request(BlockDriverState *bs,
- return 0;
- }
-
-- sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
-- &sliced_head, &sliced_tail,
-- &sliced_niov);
--
-- /* Guaranteed by bdrv_check_request32() */
-- assert(*bytes <= SIZE_MAX);
-- ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
-- sliced_head, *bytes);
-- if (ret < 0) {
-- bdrv_padding_finalize(pad);
-- return ret;
-+ if (qiov && *qiov) {
-+ sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
-+ &sliced_head, &sliced_tail,
-+ &sliced_niov);
-+
-+ /* Guaranteed by bdrv_check_request32() */
-+ assert(*bytes <= SIZE_MAX);
-+ ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
-+ sliced_head, *bytes);
-+ if (ret < 0) {
-+ bdrv_padding_finalize(pad);
-+ return ret;
-+ }
-+ *qiov = &pad->local_qiov;
-+ *qiov_offset = 0;
- }
-+
- *bytes += pad->head + pad->tail;
- *offset -= pad->head;
-- *qiov = &pad->local_qiov;
-- *qiov_offset = 0;
- if (padded) {
- *padded = true;
- }
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Thu, 23 Jun 2022 14:00:05 +0200
+Subject: [PATCH] Revert "block/rbd: workaround for ceph issue #53784"
+
+This reverts commit fc176116cdea816ceb8dd969080b2b95f58edbc0 in
+preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 42 ++----------------------------------------
+ 1 file changed, 2 insertions(+), 40 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 63f60d41be..367db42dce 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -1515,7 +1515,6 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ int status, r;
+ RBDDiffIterateReq req = { .offs = offset };
+ uint64_t features, flags;
+- uint64_t head = 0;
+
+ assert(offset + bytes <= s->image_size);
+
+@@ -1543,43 +1542,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ return status;
+ }
+
+-#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
+- /*
+- * librbd had a bug until early 2022 that affected all versions of ceph that
+- * supported fast-diff. This bug results in reporting of incorrect offsets
+- * if the offset parameter to rbd_diff_iterate2 is not object aligned.
+- * Work around this bug by rounding down the offset to object boundaries.
+- * This is OK because we call rbd_diff_iterate2 with whole_object = true.
+- * However, this workaround only works for non cloned images with default
+- * striping.
+- *
+- * See: https://tracker.ceph.com/issues/53784
+- */
+-
+- /* check if RBD image has non-default striping enabled */
+- if (features & RBD_FEATURE_STRIPINGV2) {
+- return status;
+- }
+-
+-#pragma GCC diagnostic push
+-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+- /*
+- * check if RBD image is a clone (= has a parent).
+- *
+- * rbd_get_parent_info is deprecated from Nautilus onwards, but the
+- * replacement rbd_get_parent is not present in Luminous and Mimic.
+- */
+- if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
+- return status;
+- }
+-#pragma GCC diagnostic pop
+-
+- head = req.offs & (s->object_size - 1);
+- req.offs -= head;
+- bytes += head;
+-#endif
+-
+- r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
++ r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+ qemu_rbd_diff_iterate_cb, &req);
+ if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+ return status;
+@@ -1598,8 +1561,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+ }
+
+- assert(req.bytes > head);
+- *pnum = req.bytes - head;
++ *pnum = req.bytes;
+ return status;
+ }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Reiter <s.reiter@proxmox.com>
-Date: Mon, 7 Dec 2020 15:21:03 +0100
-Subject: [PATCH] block: add alloc-track driver
-
-Add a new filter node 'alloc-track', which seperates reads and writes to
-different children, thus allowing to put a backing image behind any
-blockdev (regardless of driver support). Since we can't detect any
-pre-allocated blocks, we can only track new writes, hence the write
-target ('file') for this node must always be empty.
-
-Intended use case is for live restoring, i.e. add a backup image as a
-block device into a VM, then put an alloc-track on the restore target
-and set the backup as backing. With this, one can use a regular
-'block-stream' to restore the image, while the VM can already run in the
-background. Copy-on-read will help make progress as the VM reads as
-well.
-
-This only worked if the target supports backing images, so up until now
-only for qcow2, with alloc-track any driver for the target can be used.
-
-Replacing the node cannot be done in the
-track_co_change_backing_file() callback, because replacing a node
-cannot happen in a coroutine and requires the block graph lock
-exclusively. Could either become a special option for the stream job,
-or maybe the upcoming blockdev-replace QMP command can be used in the
-future.
-
-Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: adapt to changed function signatures
- make error return value consistent with QEMU
- avoid premature break during read
- adhere to block graph lock requirements]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/alloc-track.c | 366 ++++++++++++++++++++++++++++++++++++++++++++
- block/meson.build | 1 +
- block/stream.c | 34 ++++
- 3 files changed, 401 insertions(+)
- create mode 100644 block/alloc-track.c
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-new file mode 100644
-index 0000000000..14698c362e
---- /dev/null
-+++ b/block/alloc-track.c
-@@ -0,0 +1,366 @@
-+/*
-+ * Node to allow backing images to be applied to any node. Assumes a blank
-+ * image to begin with, only new writes are tracked as allocated, thus this
-+ * must never be put on a node that already contains data.
-+ *
-+ * Copyright (c) 2020 Proxmox Server Solutions GmbH
-+ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-+ */
-+
-+#include "qemu/osdep.h"
-+#include "qapi/error.h"
-+#include "block/block_int.h"
-+#include "block/dirty-bitmap.h"
-+#include "block/graph-lock.h"
-+#include "qapi/qmp/qdict.h"
-+#include "qapi/qmp/qstring.h"
-+#include "qemu/cutils.h"
-+#include "qemu/error-report.h"
-+#include "qemu/option.h"
-+#include "qemu/module.h"
-+#include "sysemu/block-backend.h"
-+
-+#define TRACK_OPT_AUTO_REMOVE "auto-remove"
-+
-+typedef enum DropState {
-+ DropNone,
-+ DropInProgress,
-+} DropState;
-+
-+typedef struct {
-+ BdrvDirtyBitmap *bitmap;
-+ uint64_t granularity;
-+ DropState drop_state;
-+ bool auto_remove;
-+} BDRVAllocTrackState;
-+
-+static QemuOptsList runtime_opts = {
-+ .name = "alloc-track",
-+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-+ .desc = {
-+ {
-+ .name = TRACK_OPT_AUTO_REMOVE,
-+ .type = QEMU_OPT_BOOL,
-+ .help = "automatically replace this node with 'file' when 'backing'"
-+ "is detached",
-+ },
-+ { /* end of list */ }
-+ },
-+};
-+
-+static void GRAPH_RDLOCK
-+track_refresh_limits(BlockDriverState *bs, Error **errp)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+
-+ if (!bs->file) {
-+ return;
-+ }
-+
-+ /*
-+ * Always use alignment from underlying write device so RMW cycle for
-+ * bdrv_pwritev reads data from our backing via track_co_preadv. Also use at
-+ * least the bitmap granularity.
-+ */
-+ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
-+ s->granularity);
-+}
-+
-+static int track_open(BlockDriverState *bs, QDict *options, int flags,
-+ Error **errp)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ BdrvChild *file = NULL;
-+ QemuOpts *opts;
-+ Error *local_err = NULL;
-+ int ret = 0;
-+
-+ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-+ qemu_opts_absorb_qdict(opts, options, &local_err);
-+ if (local_err) {
-+ error_propagate(errp, local_err);
-+ ret = -EINVAL;
-+ goto fail;
-+ }
-+
-+ s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
-+
-+ /* open the target (write) node, backing will be attached by block layer */
-+ file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
-+ BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
-+ &local_err);
-+ bdrv_graph_wrlock(bs);
-+ bs->file = file;
-+ bdrv_graph_wrunlock(bs);
-+ if (local_err) {
-+ ret = -EINVAL;
-+ error_propagate(errp, local_err);
-+ goto fail;
-+ }
-+
-+ bdrv_graph_rdlock_main_loop();
-+ BlockDriverInfo bdi = {0};
-+ ret = bdrv_get_info(bs->file->bs, &bdi);
-+ if (ret < 0) {
-+ /*
-+ * Not a hard failure. Worst that can happen is partial cluster
-+ * allocation in the write target. However, the driver here returns its
-+ * allocation status based on the dirty bitmap, so any other data that
-+ * maps to such a cluster will still be copied later by a stream job (or
-+ * during writes to that cluster).
-+ */
-+ warn_report("alloc-track: unable to query cluster size for write target: %s",
-+ strerror(ret));
-+ }
-+ ret = 0;
-+ /*
-+ * Always consider alignment from underlying write device so RMW cycle for
-+ * bdrv_pwritev reads data from our backing via track_co_preadv. Also try to
-+ * avoid partial cluster allocation in the write target by considering the
-+ * cluster size.
-+ */
-+ s->granularity = MAX(bs->file->bs->bl.request_alignment,
-+ MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
-+ track_refresh_limits(bs, errp);
-+ s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, s->granularity, NULL,
-+ &local_err);
-+ bdrv_graph_rdunlock_main_loop();
-+ if (local_err) {
-+ ret = -EIO;
-+ error_propagate(errp, local_err);
-+ goto fail;
-+ }
-+
-+ s->drop_state = DropNone;
-+
-+fail:
-+ if (ret < 0) {
-+ bdrv_graph_wrlock(bs);
-+ bdrv_unref_child(bs, bs->file);
-+ bdrv_graph_wrunlock(bs);
-+ if (s->bitmap) {
-+ bdrv_release_dirty_bitmap(s->bitmap);
-+ }
-+ }
-+ qemu_opts_del(opts);
-+ return ret;
-+}
-+
-+static void track_close(BlockDriverState *bs)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ if (s->bitmap) {
-+ bdrv_release_dirty_bitmap(s->bitmap);
-+ }
-+}
-+
-+static coroutine_fn int64_t GRAPH_RDLOCK
-+track_co_getlength(BlockDriverState *bs)
-+{
-+ return bdrv_co_getlength(bs->file->bs);
-+}
-+
-+static int coroutine_fn GRAPH_RDLOCK
-+track_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
-+ QEMUIOVector *qiov, BdrvRequestFlags flags)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ QEMUIOVector local_qiov;
-+ int ret;
-+
-+ /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
-+ uint64_t cur_offset, local_offset;
-+ int64_t local_bytes;
-+ bool alloc;
-+
-+ if (offset < 0 || bytes < 0) {
-+ fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
-+ return -EIO;
-+ }
-+
-+ /* a read request can span multiple granularity-sized chunks, and can thus
-+ * contain blocks with different allocation status - we could just iterate
-+ * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
-+ * to find the next flip and consider everything up to that in one go */
-+ for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
-+ local_offset = offset + cur_offset;
-+ alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
-+ if (alloc) {
-+ local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
-+ bytes - cur_offset);
-+ } else {
-+ local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
-+ bytes - cur_offset);
-+ }
-+
-+ /* _bitmap_next_X return is -1 if no end found within limit, otherwise
-+ * offset of next flip (to start of image) */
-+ local_bytes = local_bytes < 0 ?
-+ bytes - cur_offset :
-+ local_bytes - local_offset;
-+
-+ qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
-+
-+ if (alloc) {
-+ ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
-+ &local_qiov, flags);
-+ } else if (bs->backing) {
-+ ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
-+ &local_qiov, flags);
-+ } else {
-+ qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes);
-+ ret = 0;
-+ }
-+
-+ if (ret != 0) {
-+ break;
-+ }
-+ }
-+
-+ return ret;
-+}
-+
-+static int coroutine_fn GRAPH_RDLOCK
-+track_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
-+ QEMUIOVector *qiov, BdrvRequestFlags flags)
-+{
-+ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
-+}
-+
-+static int coroutine_fn GRAPH_RDLOCK
-+track_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
-+ BdrvRequestFlags flags)
-+{
-+ return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
-+}
-+
-+static int coroutine_fn GRAPH_RDLOCK
-+track_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
-+{
-+ return bdrv_co_pdiscard(bs->file, offset, bytes);
-+}
-+
-+static coroutine_fn int GRAPH_RDLOCK
-+track_co_flush(BlockDriverState *bs)
-+{
-+ return bdrv_co_flush(bs->file->bs);
-+}
-+
-+static int coroutine_fn GRAPH_RDLOCK
-+track_co_block_status(BlockDriverState *bs, bool want_zero,
-+ int64_t offset,
-+ int64_t bytes,
-+ int64_t *pnum,
-+ int64_t *map,
-+ BlockDriverState **file)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+
-+ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
-+ int64_t next_flipped;
-+ if (alloc) {
-+ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
-+ } else {
-+ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
-+ }
-+
-+ /* in case not the entire region has the same state, we need to set pnum to
-+ * indicate for how many bytes our result is valid */
-+ *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
-+ *map = offset;
-+
-+ if (alloc) {
-+ *file = bs->file->bs;
-+ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
-+ } else if (bs->backing) {
-+ *file = bs->backing->bs;
-+ }
-+ return 0;
-+}
-+
-+static void GRAPH_RDLOCK
-+track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
-+ BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
-+ uint64_t *nperm, uint64_t *nshared)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+
-+ *nshared = BLK_PERM_ALL;
-+
-+ /* in case we're currently dropping ourselves, claim to not use any
-+ * permissions at all - which is fine, since from this point on we will
-+ * never issue a read or write anymore */
-+ if (s->drop_state == DropInProgress) {
-+ *nperm = 0;
-+ return;
-+ }
-+
-+ if (role & BDRV_CHILD_DATA) {
-+ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
-+ } else {
-+ /* 'backing' is also a child of our BDS, but we don't expect it to be
-+ * writeable, so we only forward 'consistent read' */
-+ *nperm = perm & BLK_PERM_CONSISTENT_READ;
-+ }
-+}
-+
-+static int coroutine_fn GRAPH_RDLOCK
-+track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
-+ const char *backing_fmt)
-+{
-+ /*
-+ * Note that the actual backing file graph change is already done in the
-+ * stream job itself with bdrv_set_backing_hd_drained(), so no need to
-+ * actually do anything here. But still needs to be implemented, to make
-+ * our caller (i.e. bdrv_co_change_backing_file() do the right thing).
-+ *
-+ * FIXME
-+ * We'd like to auto-remove ourselves from the block graph, but it cannot
-+ * be done from a coroutine. Currently done in the stream job, where it
-+ * kinda fits better, but in the long-term, a special parameter would be
-+ * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
-+ */
-+ if (backing_file == NULL) {
-+ BDRVAllocTrackState *s = bs->opaque;
-+ bdrv_drained_begin(bs);
-+ s->drop_state = DropInProgress;
-+ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
-+ bdrv_drained_end(bs);
-+ }
-+
-+ return 0;
-+}
-+
-+static BlockDriver bdrv_alloc_track = {
-+ .format_name = "alloc-track",
-+ .instance_size = sizeof(BDRVAllocTrackState),
-+
-+ .bdrv_file_open = track_open,
-+ .bdrv_close = track_close,
-+ .bdrv_co_getlength = track_co_getlength,
-+ .bdrv_child_perm = track_child_perm,
-+ .bdrv_refresh_limits = track_refresh_limits,
-+
-+ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes,
-+ .bdrv_co_pwritev = track_co_pwritev,
-+ .bdrv_co_preadv = track_co_preadv,
-+ .bdrv_co_pdiscard = track_co_pdiscard,
-+
-+ .bdrv_co_flush = track_co_flush,
-+ .bdrv_co_flush_to_disk = track_co_flush,
-+
-+ .supports_backing = true,
-+
-+ .bdrv_co_block_status = track_co_block_status,
-+ .bdrv_co_change_backing_file = track_co_change_backing_file,
-+};
-+
-+static void bdrv_alloc_track_init(void)
-+{
-+ bdrv_register(&bdrv_alloc_track);
-+}
-+
-+block_init(bdrv_alloc_track_init);
-diff --git a/block/meson.build b/block/meson.build
-index 549c0c7103..73777a1620 100644
---- a/block/meson.build
-+++ b/block/meson.build
-@@ -2,6 +2,7 @@ block_ss.add(genh)
- block_ss.add(files(
- 'accounting.c',
- 'aio_task.c',
-+ 'alloc-track.c',
- 'amend.c',
- 'backup.c',
- 'backup-dump.c',
-diff --git a/block/stream.c b/block/stream.c
-index 87a97cb8a2..3a04e95ee2 100644
---- a/block/stream.c
-+++ b/block/stream.c
-@@ -114,6 +114,40 @@ static int stream_prepare(Job *job)
- ret = -EPERM;
- goto out;
- }
-+
-+ /*
-+ * This cannot be done in the co_change_backing_file callback, because
-+ * bdrv_replace_node() cannot be done in a coroutine. The latter also
-+ * requires the graph lock exclusively. Only required for the
-+ * alloc-track driver.
-+ *
-+ * The long-term plan is to either have an explicit parameter for the
-+ * stream job or use the upcoming blockdev-replace QMP command.
-+ */
-+ if (base_id == NULL && strcmp(unfiltered_bs->drv->format_name, "alloc-track") == 0) {
-+ BlockDriverState *file_bs;
-+
-+ bdrv_graph_rdlock_main_loop();
-+ file_bs = unfiltered_bs->file->bs;
-+ bdrv_graph_rdunlock_main_loop();
-+
-+ bdrv_ref(unfiltered_bs); // unrefed by bdrv_replace_node()
-+ bdrv_drained_begin(file_bs);
-+ bdrv_graph_wrlock(s->target_bs);
-+
-+ bdrv_replace_node(unfiltered_bs, file_bs, &local_err);
-+
-+ bdrv_graph_wrunlock(s->target_bs);
-+ bdrv_drained_end(file_bs);
-+ bdrv_unref(unfiltered_bs);
-+
-+ if (local_err) {
-+ error_prepend(&local_err, "failed to replace alloc-track node: ");
-+ error_report_err(local_err);
-+ ret = -EPERM;
-+ goto out;
-+ }
-+ }
- }
-
- out:
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Thu, 23 Jun 2022 14:00:07 +0200
+Subject: [PATCH] Revert "block/rbd: fix handling of holes in
+ .bdrv_co_block_status"
+
+This reverts commit 9e302f64bb407a9bb097b626da97228c2654cfee in
+preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 367db42dce..347b121626 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -1474,11 +1474,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+ RBDDiffIterateReq *req = opaque;
+
+ assert(req->offs + req->bytes <= offs);
+-
+- /* treat a hole like an unallocated area and bail out */
+- if (!exists) {
+- return 0;
+- }
++ /*
++ * we do not diff against a snapshot so we should never receive a callback
++ * for a hole.
++ */
++ assert(exists);
+
+ if (!req->exists && offs > req->offs) {
+ /*
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Thu, 23 Jun 2022 14:00:05 +0200
-Subject: [PATCH] Revert "block/rbd: workaround for ceph issue #53784"
-
-This reverts commit fc176116cdea816ceb8dd969080b2b95f58edbc0 in
-preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 42 ++----------------------------------------
- 1 file changed, 2 insertions(+), 40 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 63f60d41be..367db42dce 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -1515,7 +1515,6 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- int status, r;
- RBDDiffIterateReq req = { .offs = offset };
- uint64_t features, flags;
-- uint64_t head = 0;
-
- assert(offset + bytes <= s->image_size);
-
-@@ -1543,43 +1542,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- return status;
- }
-
--#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
-- /*
-- * librbd had a bug until early 2022 that affected all versions of ceph that
-- * supported fast-diff. This bug results in reporting of incorrect offsets
-- * if the offset parameter to rbd_diff_iterate2 is not object aligned.
-- * Work around this bug by rounding down the offset to object boundaries.
-- * This is OK because we call rbd_diff_iterate2 with whole_object = true.
-- * However, this workaround only works for non cloned images with default
-- * striping.
-- *
-- * See: https://tracker.ceph.com/issues/53784
-- */
--
-- /* check if RBD image has non-default striping enabled */
-- if (features & RBD_FEATURE_STRIPINGV2) {
-- return status;
-- }
--
--#pragma GCC diagnostic push
--#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-- /*
-- * check if RBD image is a clone (= has a parent).
-- *
-- * rbd_get_parent_info is deprecated from Nautilus onwards, but the
-- * replacement rbd_get_parent is not present in Luminous and Mimic.
-- */
-- if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
-- return status;
-- }
--#pragma GCC diagnostic pop
--
-- head = req.offs & (s->object_size - 1);
-- req.offs -= head;
-- bytes += head;
--#endif
--
-- r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
-+ r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
- qemu_rbd_diff_iterate_cb, &req);
- if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
- return status;
-@@ -1598,8 +1561,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
- }
-
-- assert(req.bytes > head);
-- *pnum = req.bytes - head;
-+ *pnum = req.bytes;
- return status;
- }
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Thu, 23 Jun 2022 14:00:07 +0200
-Subject: [PATCH] Revert "block/rbd: fix handling of holes in
- .bdrv_co_block_status"
-
-This reverts commit 9e302f64bb407a9bb097b626da97228c2654cfee in
-preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 10 +++++-----
- 1 file changed, 5 insertions(+), 5 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 367db42dce..347b121626 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -1474,11 +1474,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
- RBDDiffIterateReq *req = opaque;
-
- assert(req->offs + req->bytes <= offs);
--
-- /* treat a hole like an unallocated area and bail out */
-- if (!exists) {
-- return 0;
-- }
-+ /*
-+ * we do not diff against a snapshot so we should never receive a callback
-+ * for a hole.
-+ */
-+ assert(exists);
-
- if (!req->exists && offs > req->offs) {
- /*
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Tue, 17 May 2022 09:46:02 +0200
+Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
+
+During backup, bdrv_co_block_status is called for each block copy
+chunk. When RBD is used, the current implementation with
+rbd_diff_iterate2() using whole_object=true takes about linearly more
+time, depending on the image size. Since there are linearly more
+chunks, the slowdown is quadratic, becoming unacceptable for large
+images (starting somewhere between 500-1000 GiB in my testing).
+
+This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
+stop-gap measure, until it's clear how to make the implementation
+more efficient.
+
+Upstream bug report:
+https://gitlab.com/qemu-project/qemu/-/issues/1026
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 112 ----------------------------------------------------
+ 1 file changed, 112 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 347b121626..e61b359b97 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -108,12 +108,6 @@ typedef struct RBDTask {
+ int64_t ret;
+ } RBDTask;
+
+-typedef struct RBDDiffIterateReq {
+- uint64_t offs;
+- uint64_t bytes;
+- bool exists;
+-} RBDDiffIterateReq;
+-
+ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
+ BlockdevOptionsRbd *opts, bool cache,
+ const char *keypairs, const char *secretid,
+@@ -1460,111 +1454,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
+ return spec_info;
+ }
+
+-/*
+- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
+- * value in the callback routine. Choose a value that does not conflict with
+- * an existing exitcode and return it if we want to prematurely stop the
+- * execution because we detected a change in the allocation status.
+- */
+-#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+-
+-static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+- int exists, void *opaque)
+-{
+- RBDDiffIterateReq *req = opaque;
+-
+- assert(req->offs + req->bytes <= offs);
+- /*
+- * we do not diff against a snapshot so we should never receive a callback
+- * for a hole.
+- */
+- assert(exists);
+-
+- if (!req->exists && offs > req->offs) {
+- /*
+- * we started in an unallocated area and hit the first allocated
+- * block. req->bytes must be set to the length of the unallocated area
+- * before the allocated area. stop further processing.
+- */
+- req->bytes = offs - req->offs;
+- return QEMU_RBD_EXIT_DIFF_ITERATE2;
+- }
+-
+- if (req->exists && offs > req->offs + req->bytes) {
+- /*
+- * we started in an allocated area and jumped over an unallocated area,
+- * req->bytes contains the length of the allocated area before the
+- * unallocated area. stop further processing.
+- */
+- return QEMU_RBD_EXIT_DIFF_ITERATE2;
+- }
+-
+- req->bytes += len;
+- req->exists = true;
+-
+- return 0;
+-}
+-
+-static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+- bool want_zero, int64_t offset,
+- int64_t bytes, int64_t *pnum,
+- int64_t *map,
+- BlockDriverState **file)
+-{
+- BDRVRBDState *s = bs->opaque;
+- int status, r;
+- RBDDiffIterateReq req = { .offs = offset };
+- uint64_t features, flags;
+-
+- assert(offset + bytes <= s->image_size);
+-
+- /* default to all sectors allocated */
+- status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+- *map = offset;
+- *file = bs;
+- *pnum = bytes;
+-
+- /* check if RBD image supports fast-diff */
+- r = rbd_get_features(s->image, &features);
+- if (r < 0) {
+- return status;
+- }
+- if (!(features & RBD_FEATURE_FAST_DIFF)) {
+- return status;
+- }
+-
+- /* check if RBD fast-diff result is valid */
+- r = rbd_get_flags(s->image, &flags);
+- if (r < 0) {
+- return status;
+- }
+- if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+- return status;
+- }
+-
+- r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+- qemu_rbd_diff_iterate_cb, &req);
+- if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+- return status;
+- }
+- assert(req.bytes <= bytes);
+- if (!req.exists) {
+- if (r == 0) {
+- /*
+- * rbd_diff_iterate2 does not invoke callbacks for unallocated
+- * areas. This here catches the case where no callback was
+- * invoked at all (req.bytes == 0).
+- */
+- assert(req.bytes == 0);
+- req.bytes = bytes;
+- }
+- status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+- }
+-
+- *pnum = req.bytes;
+- return status;
+-}
+-
+ static int64_t coroutine_fn qemu_rbd_co_getlength(BlockDriverState *bs)
+ {
+ BDRVRBDState *s = bs->opaque;
+@@ -1800,7 +1689,6 @@ static BlockDriver bdrv_rbd = {
+ #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+ .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes,
+ #endif
+- .bdrv_co_block_status = qemu_rbd_co_block_status,
+
+ .bdrv_snapshot_create = qemu_rbd_snap_create,
+ .bdrv_snapshot_delete = qemu_rbd_snap_remove,
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Tue, 17 May 2022 09:46:02 +0200
-Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
-
-During backup, bdrv_co_block_status is called for each block copy
-chunk. When RBD is used, the current implementation with
-rbd_diff_iterate2() using whole_object=true takes about linearly more
-time, depending on the image size. Since there are linearly more
-chunks, the slowdown is quadratic, becoming unacceptable for large
-images (starting somewhere between 500-1000 GiB in my testing).
-
-This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
-stop-gap measure, until it's clear how to make the implemenation
-more efficient.
-
-Upstream bug report:
-https://gitlab.com/qemu-project/qemu/-/issues/1026
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 112 ----------------------------------------------------
- 1 file changed, 112 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 347b121626..e61b359b97 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -108,12 +108,6 @@ typedef struct RBDTask {
- int64_t ret;
- } RBDTask;
-
--typedef struct RBDDiffIterateReq {
-- uint64_t offs;
-- uint64_t bytes;
-- bool exists;
--} RBDDiffIterateReq;
--
- static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
- BlockdevOptionsRbd *opts, bool cache,
- const char *keypairs, const char *secretid,
-@@ -1460,111 +1454,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
- return spec_info;
- }
-
--/*
-- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
-- * value in the callback routine. Choose a value that does not conflict with
-- * an existing exitcode and return it if we want to prematurely stop the
-- * execution because we detected a change in the allocation status.
-- */
--#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
--
--static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
-- int exists, void *opaque)
--{
-- RBDDiffIterateReq *req = opaque;
--
-- assert(req->offs + req->bytes <= offs);
-- /*
-- * we do not diff against a snapshot so we should never receive a callback
-- * for a hole.
-- */
-- assert(exists);
--
-- if (!req->exists && offs > req->offs) {
-- /*
-- * we started in an unallocated area and hit the first allocated
-- * block. req->bytes must be set to the length of the unallocated area
-- * before the allocated area. stop further processing.
-- */
-- req->bytes = offs - req->offs;
-- return QEMU_RBD_EXIT_DIFF_ITERATE2;
-- }
--
-- if (req->exists && offs > req->offs + req->bytes) {
-- /*
-- * we started in an allocated area and jumped over an unallocated area,
-- * req->bytes contains the length of the allocated area before the
-- * unallocated area. stop further processing.
-- */
-- return QEMU_RBD_EXIT_DIFF_ITERATE2;
-- }
--
-- req->bytes += len;
-- req->exists = true;
--
-- return 0;
--}
--
--static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
-- bool want_zero, int64_t offset,
-- int64_t bytes, int64_t *pnum,
-- int64_t *map,
-- BlockDriverState **file)
--{
-- BDRVRBDState *s = bs->opaque;
-- int status, r;
-- RBDDiffIterateReq req = { .offs = offset };
-- uint64_t features, flags;
--
-- assert(offset + bytes <= s->image_size);
--
-- /* default to all sectors allocated */
-- status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
-- *map = offset;
-- *file = bs;
-- *pnum = bytes;
--
-- /* check if RBD image supports fast-diff */
-- r = rbd_get_features(s->image, &features);
-- if (r < 0) {
-- return status;
-- }
-- if (!(features & RBD_FEATURE_FAST_DIFF)) {
-- return status;
-- }
--
-- /* check if RBD fast-diff result is valid */
-- r = rbd_get_flags(s->image, &flags);
-- if (r < 0) {
-- return status;
-- }
-- if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
-- return status;
-- }
--
-- r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
-- qemu_rbd_diff_iterate_cb, &req);
-- if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
-- return status;
-- }
-- assert(req.bytes <= bytes);
-- if (!req.exists) {
-- if (r == 0) {
-- /*
-- * rbd_diff_iterate2 does not invoke callbacks for unallocated
-- * areas. This here catches the case where no callback was
-- * invoked at all (req.bytes == 0).
-- */
-- assert(req.bytes == 0);
-- req.bytes = bytes;
-- }
-- status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
-- }
--
-- *pnum = req.bytes;
-- return status;
--}
--
- static int64_t coroutine_fn qemu_rbd_co_getlength(BlockDriverState *bs)
- {
- BDRVRBDState *s = bs->opaque;
-@@ -1800,7 +1689,6 @@ static BlockDriver bdrv_rbd = {
- #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
- .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes,
- #endif
-- .bdrv_co_block_status = qemu_rbd_co_block_status,
-
- .bdrv_snapshot_create = qemu_rbd_snap_create,
- .bdrv_snapshot_delete = qemu_rbd_snap_remove,
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Tue, 26 Mar 2024 14:57:51 +0100
+Subject: [PATCH] alloc-track: error out when auto-remove is not set
+
+Since replacing the node now happens in the stream job, where the
+option cannot be read from (it's internal to the driver), it will
+always be treated as on.
+
+qemu-server will always set it; erroring out makes sure other users
+notice the change (should they even exist). The option can be fully
+dropped in the future while adding a version guard in qemu-server.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+index b9f8ea9137..f3ed2935c4 100644
+--- a/block/alloc-track.c
++++ b/block/alloc-track.c
+@@ -34,7 +34,6 @@ typedef struct {
+ BdrvDirtyBitmap *bitmap;
+ uint64_t granularity;
+ DropState drop_state;
+- bool auto_remove;
+ } BDRVAllocTrackState;
+
+ static QemuOptsList runtime_opts = {
+@@ -86,7 +85,11 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
+ goto fail;
+ }
+
+- s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
++ if (!qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false)) {
++ error_setg(errp, "alloc-track: requires auto-remove option to be set to on");
++ ret = -EINVAL;
++ goto fail;
++ }
+
+ /* open the target (write) node, backing will be attached by block layer */
+ file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Wed, 27 Mar 2024 11:15:39 +0100
+Subject: [PATCH] alloc-track: avoid seemingly superfluous child permission
+ update
+
+The update doesn't seem necessary nowadays (maybe since commit
+"alloc-track: fix deadlock during drop", where the dropping is no longer
+rescheduled and delayed, or since some upstream change). Should there
+really be some issue, this could also be based on whether there is still
+a backing child, instead of having a drop state.
+
+Dumping the cumulative (shared) permissions for the BDS with a debug
+print yields the same values after this patch and with QEMU 8.1,
+namely 3 and 5.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 26 --------------------------
+ 1 file changed, 26 deletions(-)
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+index f3ed2935c4..29138dcc49 100644
+--- a/block/alloc-track.c
++++ b/block/alloc-track.c
+@@ -25,15 +25,9 @@
+
+ #define TRACK_OPT_AUTO_REMOVE "auto-remove"
+
+-typedef enum DropState {
+- DropNone,
+- DropInProgress,
+-} DropState;
+-
+ typedef struct {
+ BdrvDirtyBitmap *bitmap;
+ uint64_t granularity;
+- DropState drop_state;
+ } BDRVAllocTrackState;
+
+ static QemuOptsList runtime_opts = {
+@@ -137,8 +131,6 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
+ goto fail;
+ }
+
+- s->drop_state = DropNone;
+-
+ fail:
+ if (ret < 0) {
+ bdrv_graph_wrlock();
+@@ -289,18 +281,8 @@ track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared)
+ {
+- BDRVAllocTrackState *s = bs->opaque;
+-
+ *nshared = BLK_PERM_ALL;
+
+- /* in case we're currently dropping ourselves, claim to not use any
+- * permissions at all - which is fine, since from this point on we will
+- * never issue a read or write anymore */
+- if (s->drop_state == DropInProgress) {
+- *nperm = 0;
+- return;
+- }
+-
+ if (role & BDRV_CHILD_DATA) {
+ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
+ } else {
+@@ -326,14 +308,6 @@ track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
+ * kinda fits better, but in the long-term, a special parameter would be
+ * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
+ */
+- if (backing_file == NULL) {
+- BDRVAllocTrackState *s = bs->opaque;
+- bdrv_drained_begin(bs);
+- s->drop_state = DropInProgress;
+- bdrv_child_refresh_perms(bs, bs->file, &error_abort);
+- bdrv_drained_end(bs);
+- }
+-
+ return 0;
+ }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Tue, 26 Mar 2024 14:57:51 +0100
-Subject: [PATCH] alloc-track: error out when auto-remove is not set
-
-Since replacing the node now happens in the stream job, where the
-option cannot be read from (it's internal to the driver), it will
-always be treated as on.
-
-qemu-server will always set it, make sure to have other users notice
-the change (should they even exist). The option can be fully dropped
-in the future while adding a version guard in qemu-server.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/alloc-track.c | 7 +++++--
- 1 file changed, 5 insertions(+), 2 deletions(-)
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-index 14698c362e..dad8fe6375 100644
---- a/block/alloc-track.c
-+++ b/block/alloc-track.c
-@@ -34,7 +34,6 @@ typedef struct {
- BdrvDirtyBitmap *bitmap;
- uint64_t granularity;
- DropState drop_state;
-- bool auto_remove;
- } BDRVAllocTrackState;
-
- static QemuOptsList runtime_opts = {
-@@ -86,7 +85,11 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
- goto fail;
- }
-
-- s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
-+ if (!qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false)) {
-+ error_setg(errp, "alloc-track: requires auto-remove option to be set to on");
-+ ret = -EINVAL;
-+ goto fail;
-+ }
-
- /* open the target (write) node, backing will be attached by block layer */
- file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Wed, 27 Mar 2024 11:15:39 +0100
-Subject: [PATCH] alloc-track: avoid seemingly superfluous child permission
- update
-
-Doesn't seem necessary nowadays (maybe after commit "alloc-track: fix
-deadlock during drop" where the dropping is not rescheduled and delayed
-anymore or some upstream change). Should there really be some issue,
-instead of having a drop state, this could also be just based off the
-fact whether there is still a backing child.
-
-Dumping the cumulative (shared) permissions for the BDS with a debug
-print yields the same values after this patch and with QEMU 8.1,
-namely 3 and 5.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/alloc-track.c | 26 --------------------------
- 1 file changed, 26 deletions(-)
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-index dad8fe6375..7aff9763ad 100644
---- a/block/alloc-track.c
-+++ b/block/alloc-track.c
-@@ -25,15 +25,9 @@
-
- #define TRACK_OPT_AUTO_REMOVE "auto-remove"
-
--typedef enum DropState {
-- DropNone,
-- DropInProgress,
--} DropState;
--
- typedef struct {
- BdrvDirtyBitmap *bitmap;
- uint64_t granularity;
-- DropState drop_state;
- } BDRVAllocTrackState;
-
- static QemuOptsList runtime_opts = {
-@@ -137,8 +131,6 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
- goto fail;
- }
-
-- s->drop_state = DropNone;
--
- fail:
- if (ret < 0) {
- bdrv_graph_wrlock(bs);
-@@ -289,18 +281,8 @@ track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
- BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
- uint64_t *nperm, uint64_t *nshared)
- {
-- BDRVAllocTrackState *s = bs->opaque;
--
- *nshared = BLK_PERM_ALL;
-
-- /* in case we're currently dropping ourselves, claim to not use any
-- * permissions at all - which is fine, since from this point on we will
-- * never issue a read or write anymore */
-- if (s->drop_state == DropInProgress) {
-- *nperm = 0;
-- return;
-- }
--
- if (role & BDRV_CHILD_DATA) {
- *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
- } else {
-@@ -326,14 +308,6 @@ track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
- * kinda fits better, but in the long-term, a special parameter would be
- * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
- */
-- if (backing_file == NULL) {
-- BDRVAllocTrackState *s = bs->opaque;
-- bdrv_drained_begin(bs);
-- s->drop_state = DropInProgress;
-- bdrv_child_refresh_perms(bs, bs->file, &error_abort);
-- bdrv_drained_end(bs);
-- }
--
- return 0;
- }
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:22 +0200
+Subject: [PATCH] block/copy-before-write: fix permission
+
+In case when source node does not have any parents, the condition still
+works as required: the backup job does create the parent by
+
+ block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child
+
+Still, in this case checking @perm variable doesn't work, as backup job
+creates the root blk with empty permissions (as it relies on CBW filter
+to require correct permissions and doesn't want to create extra
+conflicts).
+
+So, we should not check @perm.
+
+The hack may be dropped entirely when transactional insertion of
+filter (when we don't try to recalculate permissions in intermediate
+state, when filter does conflict with original parent of the source
+node) is merged (old big series
+"[PATCH v5 00/45] Transactional block-graph modifying API"[1] and its
+current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2])
+
+[1] https://patchew.org/QEMU/20220330212902.590099-1-vsementsov@openvz.org/
+[2] https://patchew.org/QEMU/20231017184444.932733-1-vsementsov@yandex-team.ru/
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/copy-before-write.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 8aba27a71d..3e3af30c08 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -364,9 +364,13 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ perm, shared, nperm, nshared);
+
+ if (!QLIST_EMPTY(&bs->parents)) {
+- if (perm & BLK_PERM_WRITE) {
+- *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+- }
++ /*
++ * Note, that source child may be shared with backup job. Backup job
++ * does create own blk parent on copy-before-write node, so this
++ * works even if source node does not have any parents before backup
++ * start
++ */
++ *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+ }
+ }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:22 +0200
-Subject: [PATCH] block/copy-before-write: fix permission
-
-In case when source node does not have any parents, the condition still
-works as required: backup job do create the parent by
-
- block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child
-
-Still, in this case checking @perm variable doesn't work, as backup job
-creates the root blk with empty permissions (as it rely on CBW filter
-to require correct permissions and don't want to create extra
-conflicts).
-
-So, we should not check @perm.
-
-The hack may be dropped entirely when transactional insertion of
-filter (when we don't try to recalculate permissions in intermediate
-state, when filter does conflict with original parent of the source
-node) merged (old big series
-"[PATCH v5 00/45] Transactional block-graph modifying API"[1] and it's
-current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2])
-
-[1] https://patchew.org/QEMU/20220330212902.590099-1-vsementsov@openvz.org/
-[2] https://patchew.org/QEMU/20231017184444.932733-1-vsementsov@yandex-team.ru/
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/copy-before-write.c | 10 +++++++---
- 1 file changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 13972879b1..dbdbbca44e 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -364,9 +364,13 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
- perm, shared, nperm, nshared);
-
- if (!QLIST_EMPTY(&bs->parents)) {
-- if (perm & BLK_PERM_WRITE) {
-- *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
-- }
-+ /*
-+ * Note, that source child may be shared with backup job. Backup job
-+ * does create own blk parent on copy-before-write node, so this
-+ * works even if source node does not have any parents before backup
-+ * start
-+ */
-+ *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
- *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
- }
- }
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:23 +0200
+Subject: [PATCH] block/copy-before-write: support unaligned snapshot-discard
+
+The first thing that crashes on unaligned access here is
+bdrv_reset_dirty_bitmap(). The correct way is to align-down the
+snapshot-discard request.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/copy-before-write.c | 16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 3e3af30c08..6d89af0b29 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK
+ cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
+ {
+ BDRVCopyBeforeWriteState *s = bs->opaque;
++ uint32_t cluster_size = block_copy_cluster_size(s->bcs);
++ int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size);
++ int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size);
++ int64_t aligned_bytes;
++
++ if (aligned_end <= aligned_offset) {
++ return 0;
++ }
++ aligned_bytes = aligned_end - aligned_offset;
+
+ WITH_QEMU_LOCK_GUARD(&s->lock) {
+- bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
++ bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset,
++ aligned_bytes);
+ }
+
+- block_copy_reset(s->bcs, offset, bytes);
++ block_copy_reset(s->bcs, aligned_offset, aligned_bytes);
+
+- return bdrv_co_pdiscard(s->target, offset, bytes);
++ return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes);
+ }
+
+ static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs)
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:24 +0200
+Subject: [PATCH] block/copy-before-write: create block_copy bitmap in filter
+ node
+
+Currently block_copy creates copy_bitmap in source node. But that is in
+bad relation with .independent_close=true of copy-before-write filter:
+source node may be detached and removed before .bdrv_close() handler
+is called, which should call block_copy_state_free(), which in turn should
+remove copy_bitmap.
+
+That's all not ideal: it would be better if internal bitmap of
+block-copy object is not attached to any node. But that is not possible
+now.
+
+The simplest solution is just create copy_bitmap in filter node, where
+anyway two other bitmaps are created.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/block-copy.c | 3 +-
+ block/copy-before-write.c | 2 +-
+ include/block/block-copy.h | 1 +
+ tests/qemu-iotests/257.out | 112 ++++++++++++++++++-------------------
+ 4 files changed, 60 insertions(+), 58 deletions(-)
+
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 9ee3dd7ef5..8fca2c3698 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ }
+
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
++ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ Error **errp)
+ {
+@@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ return NULL;
+ }
+
+- copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
++ copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
+ errp);
+ if (!copy_bitmap) {
+ return NULL;
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 6d89af0b29..ed2c228da7 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -468,7 +468,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+ ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+ bs->file->bs->supported_zero_flags);
+
+- s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
++ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
+ if (!s->bcs) {
+ error_prepend(errp, "Cannot create block-copy-state: ");
+ return -EINVAL;
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index 0700953ab8..8b41643bfa 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
+ typedef struct BlockCopyCallState BlockCopyCallState;
+
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
++ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ Error **errp);
+
+diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out
+index aa76131ca9..c33dd7f3a9 100644
+--- a/tests/qemu-iotests/257.out
++++ b/tests/qemu-iotests/257.out
+@@ -120,16 +120,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -596,16 +596,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -865,16 +865,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -1341,16 +1341,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -1610,16 +1610,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -2086,16 +2086,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -2355,16 +2355,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -2831,16 +2831,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -3100,16 +3100,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -3576,16 +3576,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -3845,16 +3845,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -4321,16 +4321,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -4590,16 +4590,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -5066,16 +5066,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:23 +0200
-Subject: [PATCH] block/copy-before-write: support unligned snapshot-discard
-
-First thing that crashes on unligned access here is
-bdrv_reset_dirty_bitmap(). Correct way is to align-down the
-snapshot-discard request.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/copy-before-write.c | 16 +++++++++++++---
- 1 file changed, 13 insertions(+), 3 deletions(-)
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index dbdbbca44e..2cbf6f9346 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK
- cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
- {
- BDRVCopyBeforeWriteState *s = bs->opaque;
-+ uint32_t cluster_size = block_copy_cluster_size(s->bcs);
-+ int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size);
-+ int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size);
-+ int64_t aligned_bytes;
-+
-+ if (aligned_end <= aligned_offset) {
-+ return 0;
-+ }
-+ aligned_bytes = aligned_end - aligned_offset;
-
- WITH_QEMU_LOCK_GUARD(&s->lock) {
-- bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
-+ bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset,
-+ aligned_bytes);
- }
-
-- block_copy_reset(s->bcs, offset, bytes);
-+ block_copy_reset(s->bcs, aligned_offset, aligned_bytes);
-
-- return bdrv_co_pdiscard(s->target, offset, bytes);
-+ return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes);
- }
-
- static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs)
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:24 +0200
-Subject: [PATCH] block/copy-before-write: create block_copy bitmap in filter
- node
-
-Currently block_copy creates copy_bitmap in source node. But that is in
-bad relation with .independent_close=true of copy-before-write filter:
-source node may be detached and removed before .bdrv_close() handler
-called, which should call block_copy_state_free(), which in turn should
-remove copy_bitmap.
-
-That's all not ideal: it would be better if internal bitmap of
-block-copy object is not attached to any node. But that is not possible
-now.
-
-The simplest solution is just create copy_bitmap in filter node, where
-anyway two other bitmaps are created.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/block-copy.c | 3 +-
- block/copy-before-write.c | 2 +-
- include/block/block-copy.h | 1 +
- tests/qemu-iotests/257.out | 112 ++++++++++++++++++-------------------
- 4 files changed, 60 insertions(+), 58 deletions(-)
-
-diff --git a/block/block-copy.c b/block/block-copy.c
-index 9ee3dd7ef5..8fca2c3698 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- }
-
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-+ BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- Error **errp)
- {
-@@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- return NULL;
- }
-
-- copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
-+ copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
- errp);
- if (!copy_bitmap) {
- return NULL;
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 2cbf6f9346..afa5f473d2 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -472,7 +472,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
- ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
- bs->file->bs->supported_zero_flags);
-
-- s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
-+ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
- if (!s->bcs) {
- error_prepend(errp, "Cannot create block-copy-state: ");
- ret = -EINVAL;
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index 0700953ab8..8b41643bfa 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
- typedef struct BlockCopyCallState BlockCopyCallState;
-
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-+ BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- Error **errp);
-
-diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out
-index aa76131ca9..c33dd7f3a9 100644
---- a/tests/qemu-iotests/257.out
-+++ b/tests/qemu-iotests/257.out
-@@ -120,16 +120,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -596,16 +596,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -865,16 +865,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -1341,16 +1341,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -1610,16 +1610,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -2086,16 +2086,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -2355,16 +2355,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -2831,16 +2831,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -3100,16 +3100,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -3576,16 +3576,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -3845,16 +3845,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -4321,16 +4321,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -4590,16 +4590,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -5066,16 +5066,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:25 +0200
+Subject: [PATCH] qapi: blockdev-backup: add discard-source parameter
+
+Add a parameter that enables discard-after-copy. That is mostly useful
+in "push backup with fleecing" scheme, when source is snapshot-access
+format driver node, based on copy-before-write filter snapshot-access
+API:
+
+[guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target]
+ | |
+ | root | file
+ v v
+[copy-before-write]
+ | |
+ | file | target
+ v v
+[active disk] [temp.img]
+
+In this case discard-after-copy does two things:
+
+ - discard data in temp.img to save disk space
+ - avoid further copy-before-write operation in discarded area
+
+Note that we have to declare WRITE permission on source in
+copy-before-write filter, for discard to work. Still we can't take it
+unconditionally, as it will break normal backup from RO source. So, we
+have to add a parameter and pass it through bdrv_open flags.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/backup.c | 5 +++--
+ block/block-copy.c | 9 +++++++++
+ block/copy-before-write.c | 15 +++++++++++++--
+ block/copy-before-write.h | 1 +
+ block/replication.c | 4 ++--
+ blockdev.c | 2 +-
+ include/block/block-common.h | 2 ++
+ include/block/block-copy.h | 1 +
+ include/block/block_int-global-state.h | 2 +-
+ qapi/block-core.json | 4 ++++
+ 10 files changed, 37 insertions(+), 8 deletions(-)
+
+diff --git a/block/backup.c b/block/backup.c
+index 16d611c4ca..1963e47ab9 100644
+--- a/block/backup.c
++++ b/block/backup.c
+@@ -332,7 +332,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, int64_t speed,
+ MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+- bool compress,
++ bool compress, bool discard_source,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+@@ -433,7 +433,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ goto error;
+ }
+
+- cbw = bdrv_cbw_append(bs, target, filter_node_name, &bcs, errp);
++ cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
++ &bcs, errp);
+ if (!cbw) {
+ goto error;
+ }
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 8fca2c3698..7e3b378528 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -137,6 +137,7 @@ typedef struct BlockCopyState {
+ CoMutex lock;
+ int64_t in_flight_bytes;
+ BlockCopyMethod method;
++ bool discard_source;
+ BlockReqList reqs;
+ QLIST_HEAD(, BlockCopyCallState) calls;
+ /*
+@@ -353,6 +354,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
++ bool discard_source,
+ Error **errp)
+ {
+ ERRP_GUARD();
+@@ -418,6 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ cluster_size),
+ };
+
++ s->discard_source = discard_source;
+ block_copy_set_copy_opts(s, false, false);
+
+ ratelimit_init(&s->rate_limit);
+@@ -589,6 +592,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
+ co_put_to_shres(s->mem, t->req.bytes);
+ block_copy_task_end(t, ret);
+
++ if (s->discard_source && ret == 0) {
++ int64_t nbytes =
++ MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
++ bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
++ }
++
+ return ret;
+ }
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index ed2c228da7..cd65524e26 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState {
+ BdrvChild *target;
+ OnCbwError on_cbw_error;
+ uint32_t cbw_timeout_ns;
++ bool discard_source;
+
+ /*
+ * @lock: protects access to @access_bitmap, @done_bitmap and
+@@ -357,6 +358,8 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared)
+ {
++ BDRVCopyBeforeWriteState *s = bs->opaque;
++
+ if (!(role & BDRV_CHILD_FILTERED)) {
+ /*
+ * Target child
+@@ -381,6 +384,10 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ * start
+ */
+ *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
++ if (s->discard_source) {
++ *nperm = *nperm | BLK_PERM_WRITE;
++ }
++
+ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+ }
+ }
+@@ -468,7 +475,9 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+ ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+ bs->file->bs->supported_zero_flags);
+
+- s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
++ s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
++ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
++ flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
+ if (!s->bcs) {
+ error_prepend(errp, "Cannot create block-copy-state: ");
+ return -EINVAL;
+@@ -535,12 +544,14 @@ static BlockDriver bdrv_cbw_filter = {
+ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
++ bool discard_source,
+ BlockCopyState **bcs,
+ Error **errp)
+ {
+ BDRVCopyBeforeWriteState *state;
+ BlockDriverState *top;
+ QDict *opts;
++ int flags = BDRV_O_RDWR | (discard_source ? BDRV_O_CBW_DISCARD_SOURCE : 0);
+
+ assert(source->total_sectors == target->total_sectors);
+ GLOBAL_STATE_CODE();
+@@ -553,7 +564,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ qdict_put_str(opts, "file", bdrv_get_node_name(source));
+ qdict_put_str(opts, "target", bdrv_get_node_name(target));
+
+- top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
++ top = bdrv_insert_node(source, opts, flags, errp);
+ if (!top) {
+ return NULL;
+ }
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index 6e72bb25e9..01af0cd3c4 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -39,6 +39,7 @@
+ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
++ bool discard_source,
+ BlockCopyState **bcs,
+ Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
+diff --git a/block/replication.c b/block/replication.c
+index ca6bd0a720..0415a5e8b7 100644
+--- a/block/replication.c
++++ b/block/replication.c
+@@ -582,8 +582,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
+
+ s->backup_job = backup_job_create(
+ NULL, s->secondary_disk->bs, s->hidden_disk->bs,
+- 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
+- &perf,
++ 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
++ NULL, &perf,
+ BLOCKDEV_ON_ERROR_REPORT,
+ BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
+ backup_job_completed, bs, NULL, &local_err);
+diff --git a/blockdev.c b/blockdev.c
+index 5e5dbc1da9..1054a69279 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -2727,7 +2727,7 @@ static BlockJob *do_backup_common(BackupCommon *backup,
+
+ job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
+ backup->sync, bmap, backup->bitmap_mode,
+- backup->compress,
++ backup->compress, backup->discard_source,
+ backup->filter_node_name,
+ &perf,
+ backup->on_source_error,
+diff --git a/include/block/block-common.h b/include/block/block-common.h
+index a846023a09..338fe5ff7a 100644
+--- a/include/block/block-common.h
++++ b/include/block/block-common.h
+@@ -243,6 +243,8 @@ typedef enum {
+ read-write fails */
+ #define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */
+
++#define BDRV_O_CBW_DISCARD_SOURCE 0x80000 /* for copy-before-write filter */
++
+ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
+
+
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index 8b41643bfa..bdc703bacd 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -27,6 +27,7 @@ typedef struct BlockCopyCallState BlockCopyCallState;
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
++ bool discard_source,
+ Error **errp);
+
+ /* Function should be called prior any actual copy request */
+diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
+index cc1387ae02..f0c642b194 100644
+--- a/include/block/block_int-global-state.h
++++ b/include/block/block_int-global-state.h
+@@ -195,7 +195,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+- bool compress,
++ bool compress, bool discard_source,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index f516d8e95a..d796d49abb 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -1849,6 +1849,9 @@
+ # node specified by @drive. If this option is not given, a node
+ # name is autogenerated. (Since: 4.2)
+ #
++# @discard-source: Discard blocks on source which are already copied
++# to the target. (Since 9.0)
++#
+ # @x-perf: Performance options. (Since 6.0)
+ #
+ # Features:
+@@ -1870,6 +1873,7 @@
+ '*on-target-error': 'BlockdevOnError',
+ '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
+ '*filter-node-name': 'str',
++ '*discard-source': 'bool',
+ '*x-perf': { 'type': 'BackupPerf',
+ 'features': [ 'unstable' ] } } }
+
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:26 +0200
+Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
+
+Useful to make discard-source work in the context of backup fleecing
+when the fleecing image has a larger granularity than the backup
+target.
+
+Copy-before-write operations will use at least this granularity and in
+particular, discard requests to the source node will too. If the
+granularity is too small, they will just be aligned down in
+cbw_co_pdiscard_snapshot() and thus effectively ignored.
+
+The QAPI uses uint32 so the value will be non-negative, but still fit
+into an int64_t.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/block-copy.c | 17 +++++++++++++----
+ block/copy-before-write.c | 3 ++-
+ include/block/block-copy.h | 1 +
+ qapi/block-core.json | 8 +++++++-
+ 4 files changed, 23 insertions(+), 6 deletions(-)
+
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 7e3b378528..adb1cbb440 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+ }
+
+ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
++ int64_t min_cluster_size,
+ Error **errp)
+ {
+ int ret;
+@@ -335,7 +336,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ "used. If the actual block size of the target exceeds "
+ "this default, the backup may be unusable",
+ BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
++ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+ } else if (ret < 0 && !target_does_cow) {
+ error_setg_errno(errp, -ret,
+ "Couldn't determine the cluster size of the target image, "
+@@ -345,16 +346,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ return ret;
+ } else if (ret < 0 && target_does_cow) {
+ /* Not fatal; just trudge on ahead. */
+- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
++ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+ }
+
+- return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
++ return MAX(min_cluster_size,
++ MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
+ }
+
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ bool discard_source,
++ int64_t min_cluster_size,
+ Error **errp)
+ {
+ ERRP_GUARD();
+@@ -365,7 +368,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+
+ GLOBAL_STATE_CODE();
+
+- cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
++ if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
++ error_setg(errp, "min-cluster-size needs to be a power of 2");
++ return NULL;
++ }
++
++ cluster_size = block_copy_calculate_cluster_size(target->bs,
++ min_cluster_size, errp);
+ if (cluster_size < 0) {
+ return NULL;
+ }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index cd65524e26..ac05a4993f 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -477,7 +477,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+
+ s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
+ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
+- flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
++ flags & BDRV_O_CBW_DISCARD_SOURCE,
++ opts->min_cluster_size, errp);
+ if (!s->bcs) {
+ error_prepend(errp, "Cannot create block-copy-state: ");
+ return -EINVAL;
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index bdc703bacd..77857c6c68 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ bool discard_source,
++ int64_t min_cluster_size,
+ Error **errp);
+
+ /* Function should be called prior any actual copy request */
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index d796d49abb..edbf6e78b9 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -4930,12 +4930,18 @@
+ # @on-cbw-error parameter will decide how this failure is handled.
+ # Default 0. (Since 7.1)
+ #
++# @min-cluster-size: Minimum size of blocks used by copy-before-write
++# operations. Has to be a power of 2. No effect if smaller than
++# the maximum of the target's cluster size and 64 KiB. Default 0.
++# (Since 8.1)
++#
+ # Since: 6.2
+ ##
+ { 'struct': 'BlockdevOptionsCbw',
+ 'base': 'BlockdevOptionsGenericFormat',
+ 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
+- '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
++ '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
++ '*min-cluster-size': 'uint32' } }
+
+ ##
+ # @BlockdevOptions:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:25 +0200
-Subject: [PATCH] qapi: blockdev-backup: add discard-source parameter
-
-Add a parameter that enables discard-after-copy. That is mostly useful
-in "push backup with fleecing" scheme, when source is snapshot-access
-format driver node, based on copy-before-write filter snapshot-access
-API:
-
-[guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target]
- | |
- | root | file
- v v
-[copy-before-write]
- | |
- | file | target
- v v
-[active disk] [temp.img]
-
-In this case discard-after-copy does two things:
-
- - discard data in temp.img to save disk space
- - avoid further copy-before-write operation in discarded area
-
-Note that we have to declare WRITE permission on source in
-copy-before-write filter, for discard to work. Still we can't take it
-unconditionally, as it will break normal backup from RO source. So, we
-have to add a parameter and pass it thorough bdrv_open flags.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/backup.c | 5 +++--
- block/block-copy.c | 9 +++++++++
- block/copy-before-write.c | 15 +++++++++++++--
- block/copy-before-write.h | 1 +
- block/replication.c | 4 ++--
- blockdev.c | 2 +-
- include/block/block-common.h | 2 ++
- include/block/block-copy.h | 1 +
- include/block/block_int-global-state.h | 2 +-
- qapi/block-core.json | 4 ++++
- 10 files changed, 37 insertions(+), 8 deletions(-)
-
-diff --git a/block/backup.c b/block/backup.c
-index aec140e0c8..f19b751fe6 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -332,7 +332,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- BlockDriverState *target, int64_t speed,
- MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
- BitmapSyncMode bitmap_mode,
-- bool compress,
-+ bool compress, bool discard_source,
- const char *filter_node_name,
- BackupPerf *perf,
- BlockdevOnError on_source_error,
-@@ -433,7 +433,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- goto error;
- }
-
-- cbw = bdrv_cbw_append(bs, target, filter_node_name, &bcs, errp);
-+ cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
-+ &bcs, errp);
- if (!cbw) {
- goto error;
- }
-diff --git a/block/block-copy.c b/block/block-copy.c
-index 8fca2c3698..7e3b378528 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -137,6 +137,7 @@ typedef struct BlockCopyState {
- CoMutex lock;
- int64_t in_flight_bytes;
- BlockCopyMethod method;
-+ bool discard_source;
- BlockReqList reqs;
- QLIST_HEAD(, BlockCopyCallState) calls;
- /*
-@@ -353,6 +354,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
-+ bool discard_source,
- Error **errp)
- {
- ERRP_GUARD();
-@@ -418,6 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- cluster_size),
- };
-
-+ s->discard_source = discard_source;
- block_copy_set_copy_opts(s, false, false);
-
- ratelimit_init(&s->rate_limit);
-@@ -589,6 +592,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
- co_put_to_shres(s->mem, t->req.bytes);
- block_copy_task_end(t, ret);
-
-+ if (s->discard_source && ret == 0) {
-+ int64_t nbytes =
-+ MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
-+ bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
-+ }
-+
- return ret;
- }
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index afa5f473d2..5506f66857 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState {
- BdrvChild *target;
- OnCbwError on_cbw_error;
- uint32_t cbw_timeout_ns;
-+ bool discard_source;
-
- /*
- * @lock: protects access to @access_bitmap, @done_bitmap and
-@@ -357,6 +358,8 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
- uint64_t perm, uint64_t shared,
- uint64_t *nperm, uint64_t *nshared)
- {
-+ BDRVCopyBeforeWriteState *s = bs->opaque;
-+
- if (!(role & BDRV_CHILD_FILTERED)) {
- /*
- * Target child
-@@ -381,6 +384,10 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
- * start
- */
- *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
-+ if (s->discard_source) {
-+ *nperm = *nperm | BLK_PERM_WRITE;
-+ }
-+
- *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
- }
- }
-@@ -472,7 +479,9 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
- ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
- bs->file->bs->supported_zero_flags);
-
-- s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
-+ s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
-+ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
-+ flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
- if (!s->bcs) {
- error_prepend(errp, "Cannot create block-copy-state: ");
- ret = -EINVAL;
-@@ -546,12 +555,14 @@ static BlockDriver bdrv_cbw_filter = {
- BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
-+ bool discard_source,
- BlockCopyState **bcs,
- Error **errp)
- {
- BDRVCopyBeforeWriteState *state;
- BlockDriverState *top;
- QDict *opts;
-+ int flags = BDRV_O_RDWR | (discard_source ? BDRV_O_CBW_DISCARD_SOURCE : 0);
-
- assert(source->total_sectors == target->total_sectors);
- GLOBAL_STATE_CODE();
-@@ -564,7 +575,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- qdict_put_str(opts, "file", bdrv_get_node_name(source));
- qdict_put_str(opts, "target", bdrv_get_node_name(target));
-
-- top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
-+ top = bdrv_insert_node(source, opts, flags, errp);
- if (!top) {
- return NULL;
- }
-diff --git a/block/copy-before-write.h b/block/copy-before-write.h
-index 6e72bb25e9..01af0cd3c4 100644
---- a/block/copy-before-write.h
-+++ b/block/copy-before-write.h
-@@ -39,6 +39,7 @@
- BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
-+ bool discard_source,
- BlockCopyState **bcs,
- Error **errp);
- void bdrv_cbw_drop(BlockDriverState *bs);
-diff --git a/block/replication.c b/block/replication.c
-index 5ded5f1ca9..bd75a6aee3 100644
---- a/block/replication.c
-+++ b/block/replication.c
-@@ -604,8 +604,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
-
- s->backup_job = backup_job_create(
- NULL, s->secondary_disk->bs, s->hidden_disk->bs,
-- 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
-- &perf,
-+ 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
-+ NULL, &perf,
- BLOCKDEV_ON_ERROR_REPORT,
- BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
- backup_job_completed, bs, NULL, &local_err);
-diff --git a/blockdev.c b/blockdev.c
-index 3049811be8..e167ad1e54 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -2854,7 +2854,7 @@ static BlockJob *do_backup_common(BackupCommon *backup,
-
- job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
- backup->sync, bmap, backup->bitmap_mode,
-- backup->compress,
-+ backup->compress, backup->discard_source,
- backup->filter_node_name,
- &perf,
- backup->on_source_error,
-diff --git a/include/block/block-common.h b/include/block/block-common.h
-index d7599564db..7f56364e73 100644
---- a/include/block/block-common.h
-+++ b/include/block/block-common.h
-@@ -246,6 +246,8 @@ typedef enum {
- read-write fails */
- #define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */
-
-+#define BDRV_O_CBW_DISCARD_SOURCE 0x80000 /* for copy-before-write filter */
-+
- #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
-
-
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index 8b41643bfa..bdc703bacd 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -27,6 +27,7 @@ typedef struct BlockCopyCallState BlockCopyCallState;
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
-+ bool discard_source,
- Error **errp);
-
- /* Function should be called prior any actual copy request */
-diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
-index 57265a617a..df731688b4 100644
---- a/include/block/block_int-global-state.h
-+++ b/include/block/block_int-global-state.h
-@@ -189,7 +189,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- MirrorSyncMode sync_mode,
- BdrvDirtyBitmap *sync_bitmap,
- BitmapSyncMode bitmap_mode,
-- bool compress,
-+ bool compress, bool discard_source,
- const char *filter_node_name,
- BackupPerf *perf,
- BlockdevOnError on_source_error,
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 7b977459fa..82960797dc 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -1834,6 +1834,9 @@
- # node specified by @drive. If this option is not given, a node
- # name is autogenerated. (Since: 4.2)
- #
-+# @discard-source: Discard blocks on source which are already copied
-+# to the target. (Since 9.0)
-+#
- # @x-perf: Performance options. (Since 6.0)
- #
- # Features:
-@@ -1855,6 +1858,7 @@
- '*on-target-error': 'BlockdevOnError',
- '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
- '*filter-node-name': 'str',
-+ '*discard-source': 'bool',
- '*x-perf': { 'type': 'BackupPerf',
- 'features': [ 'unstable' ] } } }
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:27 +0200
+Subject: [PATCH] backup: add minimum cluster size to performance options
+
+Useful to make discard-source work in the context of backup fleecing
+when the fleecing image has a larger granularity than the backup
+target.
+
+Backup/block-copy will use at least this granularity for copy operations
+and in particular, discard requests to the backup source will too. If
+the granularity is too small, they will just be aligned down in
+cbw_co_pdiscard_snapshot() and thus effectively ignored.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/backup.c | 2 +-
+ block/copy-before-write.c | 2 ++
+ block/copy-before-write.h | 1 +
+ blockdev.c | 3 +++
+ qapi/block-core.json | 9 +++++++--
+ 5 files changed, 14 insertions(+), 3 deletions(-)
+
+diff --git a/block/backup.c b/block/backup.c
+index 1963e47ab9..fe69723ada 100644
+--- a/block/backup.c
++++ b/block/backup.c
+@@ -434,7 +434,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ }
+
+ cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
+- &bcs, errp);
++ perf->min_cluster_size, &bcs, errp);
+ if (!cbw) {
+ goto error;
+ }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index ac05a4993f..d1e87f8cf4 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -546,6 +546,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
+ bool discard_source,
++ int64_t min_cluster_size,
+ BlockCopyState **bcs,
+ Error **errp)
+ {
+@@ -564,6 +565,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ }
+ qdict_put_str(opts, "file", bdrv_get_node_name(source));
+ qdict_put_str(opts, "target", bdrv_get_node_name(target));
++ qdict_put_int(opts, "min-cluster-size", min_cluster_size);
+
+ top = bdrv_insert_node(source, opts, flags, errp);
+ if (!top) {
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index 01af0cd3c4..dc6cafe7fa 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -40,6 +40,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
+ bool discard_source,
++ int64_t min_cluster_size,
+ BlockCopyState **bcs,
+ Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
+diff --git a/blockdev.c b/blockdev.c
+index 1054a69279..cbe224387b 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -2654,6 +2654,9 @@ static BlockJob *do_backup_common(BackupCommon *backup,
+ if (backup->x_perf->has_max_chunk) {
+ perf.max_chunk = backup->x_perf->max_chunk;
+ }
++ if (backup->x_perf->has_min_cluster_size) {
++ perf.min_cluster_size = backup->x_perf->min_cluster_size;
++ }
+ }
+
+ if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) ||
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index edbf6e78b9..6e7ee87633 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -1790,11 +1790,16 @@
+ # it should not be less than job cluster size which is calculated
+ # as maximum of target image cluster size and 64k. Default 0.
+ #
++# @min-cluster-size: Minimum size of blocks used by copy-before-write
++# and background copy operations. Has to be a power of 2. No
++# effect if smaller than the maximum of the target's cluster size
++# and 64 KiB. Default 0. (Since 8.1)
++#
+ # Since: 6.0
+ ##
+ { 'struct': 'BackupPerf',
+- 'data': { '*use-copy-range': 'bool',
+- '*max-workers': 'int', '*max-chunk': 'int64' } }
++ 'data': { '*use-copy-range': 'bool', '*max-workers': 'int',
++ '*max-chunk': 'int64', '*min-cluster-size': 'uint32' } }
+
+ ##
+ # @BackupCommon:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:26 +0200
-Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
-
-Useful to make discard-source work in the context of backup fleecing
-when the fleecing image has a larger granularity than the backup
-target.
-
-Copy-before-write operations will use at least this granularity and in
-particular, discard requests to the source node will too. If the
-granularity is too small, they will just be aligned down in
-cbw_co_pdiscard_snapshot() and thus effectively ignored.
-
-The QAPI uses uint32 so the value will be non-negative, but still fit
-into a uint64_t.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/block-copy.c | 17 +++++++++++++----
- block/copy-before-write.c | 3 ++-
- include/block/block-copy.h | 1 +
- qapi/block-core.json | 8 +++++++-
- 4 files changed, 23 insertions(+), 6 deletions(-)
-
-diff --git a/block/block-copy.c b/block/block-copy.c
-index 7e3b378528..adb1cbb440 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
- }
-
- static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
-+ int64_t min_cluster_size,
- Error **errp)
- {
- int ret;
-@@ -335,7 +336,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- "used. If the actual block size of the target exceeds "
- "this default, the backup may be unusable",
- BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
-- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
-+ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
- } else if (ret < 0 && !target_does_cow) {
- error_setg_errno(errp, -ret,
- "Couldn't determine the cluster size of the target image, "
-@@ -345,16 +346,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- return ret;
- } else if (ret < 0 && target_does_cow) {
- /* Not fatal; just trudge on ahead. */
-- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
-+ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
- }
-
-- return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
-+ return MAX(min_cluster_size,
-+ MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
- }
-
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- bool discard_source,
-+ int64_t min_cluster_size,
- Error **errp)
- {
- ERRP_GUARD();
-@@ -365,7 +368,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-
- GLOBAL_STATE_CODE();
-
-- cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
-+ if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
-+ error_setg(errp, "min-cluster-size needs to be a power of 2");
-+ return NULL;
-+ }
-+
-+ cluster_size = block_copy_calculate_cluster_size(target->bs,
-+ min_cluster_size, errp);
- if (cluster_size < 0) {
- return NULL;
- }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 5506f66857..fbc5aeb9eb 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -481,7 +481,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
-
- s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
- s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
-- flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
-+ flags & BDRV_O_CBW_DISCARD_SOURCE,
-+ opts->min_cluster_size, errp);
- if (!s->bcs) {
- error_prepend(errp, "Cannot create block-copy-state: ");
- ret = -EINVAL;
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index bdc703bacd..77857c6c68 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- bool discard_source,
-+ int64_t min_cluster_size,
- Error **errp);
-
- /* Function should be called prior any actual copy request */
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 82960797dc..f2fec625cc 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -4880,12 +4880,18 @@
- # @on-cbw-error parameter will decide how this failure is handled.
- # Default 0. (Since 7.1)
- #
-+# @min-cluster-size: Minimum size of blocks used by copy-before-write
-+# operations. Has to be a power of 2. No effect if smaller than
-+# the maximum of the target's cluster size and 64 KiB. Default 0.
-+# (Since 8.1)
-+#
- # Since: 6.2
- ##
- { 'struct': 'BlockdevOptionsCbw',
- 'base': 'BlockdevOptionsGenericFormat',
- 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
-- '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
-+ '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
-+ '*min-cluster-size': 'uint32' } }
-
- ##
- # @BlockdevOptions:
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:28 +0200
+Subject: [PATCH] PVE backup: add fleecing option
+
+When a fleecing option is given, it is expected that each device has
+a corresponding "-fleecing" block device already attached, except for
+EFI disk and TPM state, where fleecing is never used.
+
+The following graph was adapted from [0] which also contains more
+details about fleecing.
+
+[guest]
+ |
+ | root
+ v file
+[copy-before-write]<------[snapshot-access]
+ | |
+ | file | target
+ v v
+[source] [fleecing]
+
+For fleecing, a copy-before-write filter is inserted on top of the
+source node, as well as a snapshot-access node pointing to the filter
+node which allows reading the consistent state of the image at the
+time it was inserted. New guest writes are passed through the
+copy-before-write filter which will first copy over old data to the
+fleecing image in case that old data is still needed by the
+snapshot-access node.
+
+The backup process will sequentially read from the snapshot access,
+which has a bitmap and knows whether to read from the original image
+or the fleecing image to get the "snapshot" state, i.e. data from the
+source image at the time when the copy-before-write filter was
+inserted. After reading, the copied sections are discarded from the
+fleecing image to reduce space usage.
+
+All of this can be restricted by an initial dirty bitmap to parts of
+the source image that are required for an incremental backup.
+
+For discard to work, it is necessary that the fleecing image does not
+have a larger cluster size than the backup job granularity. Querying
+that size does not always work (e.g. for RBD with krbd, the cluster
+size will not be reported), so a minimum of 4 MiB is used. A job with
+a PBS target already has at least this granularity, so the minimum is
+only relevant for other targets, i.e. edge cases where this minimum
+is not enough should be very rare in practice. If it ever becomes
+necessary in the future, a passed-in value for the backup QMP command
+can still be added to override it.
+
+Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
+are set when installing the copy-before-write filter and
+snapshot-access. When an error or timeout occurs, the problematic (and
+each further) snapshot operation will fail and thus cancel the backup
+instead of breaking the guest write.
+
+Note that job_id cannot be inferred from the snapshot-access bs because
+it has no parent, so just pass the one from the original bs.
+
+[0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/monitor/block-hmp-cmds.c | 1 +
+ pve-backup.c | 143 ++++++++++++++++++++++++++++++++-
+ qapi/block-core.json | 10 ++-
+ 3 files changed, 150 insertions(+), 4 deletions(-)
+
+diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
+index 5000c084c5..70b3de4c7e 100644
+--- a/block/monitor/block-hmp-cmds.c
++++ b/block/monitor/block-hmp-cmds.c
+@@ -1043,6 +1043,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
+ NULL, NULL,
+ devlist, qdict_haskey(qdict, "speed"), speed,
+ false, 0, // BackupPerf max-workers
++ false, false, // fleecing
+ &error);
+
+ hmp_handle_error(mon, error);
+diff --git a/pve-backup.c b/pve-backup.c
+index 9d480a8eec..7cc1dd3724 100644
+--- a/pve-backup.c
++++ b/pve-backup.c
+@@ -7,9 +7,11 @@
+ #include "sysemu/blockdev.h"
+ #include "block/block_int-global-state.h"
+ #include "block/blockjob.h"
++#include "block/copy-before-write.h"
+ #include "block/dirty-bitmap.h"
+ #include "block/graph-lock.h"
+ #include "qapi/qapi-commands-block.h"
++#include "qapi/qmp/qdict.h"
+ #include "qapi/qmp/qerror.h"
+ #include "qemu/cutils.h"
+
+@@ -81,8 +83,15 @@ static void pvebackup_init(void)
+ // initialize PVEBackupState at startup
+ opts_init(pvebackup_init);
+
++typedef struct PVEBackupFleecingInfo {
++ BlockDriverState *bs;
++ BlockDriverState *cbw;
++ BlockDriverState *snapshot_access;
++} PVEBackupFleecingInfo;
++
+ typedef struct PVEBackupDevInfo {
+ BlockDriverState *bs;
++ PVEBackupFleecingInfo fleecing;
+ size_t size;
+ uint64_t block_size;
+ uint8_t dev_id;
+@@ -355,6 +364,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+ PVEBackupDevInfo *di = opaque;
+ di->completed_ret = ret;
+
++ /*
++ * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
++ * won't be done as a coroutine anyways:
++ * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
++ * just spawn a BH calling bdrv_unref().
++ * - For cbw, draining would need to spawn a BH.
++ *
++ * Note that the AioContext lock is already acquired by our caller, i.e.
++ * job_finalize_single_locked()
++ */
++ if (di->fleecing.snapshot_access) {
++ bdrv_unref(di->fleecing.snapshot_access);
++ di->fleecing.snapshot_access = NULL;
++ }
++ if (di->fleecing.cbw) {
++ bdrv_cbw_drop(di->fleecing.cbw);
++ di->fleecing.cbw = NULL;
++ }
++
+ /*
+ * Needs to happen outside of coroutine, because it takes the graph write lock.
+ */
+@@ -522,9 +550,82 @@ static void create_backup_jobs_bh(void *opaque) {
+ }
+ bdrv_drained_begin(di->bs);
+
++ BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
++
++ BlockDriverState *source_bs = di->bs;
++ bool discard_source = false;
++ bdrv_graph_co_rdlock();
++ const char *job_id = bdrv_get_device_name(di->bs);
++ bdrv_graph_co_rdunlock();
++ if (di->fleecing.bs) {
++ QDict *cbw_opts = qdict_new();
++ qdict_put_str(cbw_opts, "driver", "copy-before-write");
++ qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
++ qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
++
++ if (di->bitmap) {
++ /*
++ * Only guest writes to parts relevant for the backup need to be intercepted with
++ * old data being copied to the fleecing image.
++ */
++ qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
++ qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
++ }
++ /*
++ * Fleecing storage is supposed to be fast and it's better to break backup than guest
++ * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
++ * abort a bit before that.
++ */
++ qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
++ qdict_put_int(cbw_opts, "cbw-timeout", 45);
++
++ di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
++
++ if (!di->fleecing.cbw) {
++ error_setg(errp, "appending cbw node for fleecing failed: %s",
++ local_err ? error_get_pretty(local_err) : "unknown error");
++ break;
++ }
++
++ QDict *snapshot_access_opts = qdict_new();
++ qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
++ qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
++
++ /*
++ * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
++ * will acquire it a second time. But it's allowed to be held exactly once when polling
++ * and that happens when the bdrv_refresh_total_sectors() call is made there.
++ */
++ di->fleecing.snapshot_access =
++ bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
++ if (!di->fleecing.snapshot_access) {
++ error_setg(errp, "setting up snapshot access for fleecing failed: %s",
++ local_err ? error_get_pretty(local_err) : "unknown error");
++ break;
++ }
++ source_bs = di->fleecing.snapshot_access;
++ discard_source = true;
++
++ /*
++ * bdrv_get_info() just returns 0 (= doesn't matter) for RBD when using krbd. But discard
++ * on the fleecing image won't work if the backup job's granularity is less than the RBD
++ * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
++ * target, the backup job granularity would already be at least this much.
++ */
++ perf.min_cluster_size = 4 * 1024 * 1024;
++ /*
++ * For discard to work, cluster size for the backup job must be at least the same as for
++ * the fleecing image.
++ */
++ BlockDriverInfo bdi;
++ if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
++ perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
++ }
++ }
++
+ BlockJob *job = backup_job_create(
+- NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
+- bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
++ job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
++ bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
+ BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
+ &local_err);
+
+@@ -580,6 +681,14 @@ static void create_backup_jobs_bh(void *opaque) {
+ aio_co_enter(data->ctx, data->co);
+ }
+
++/*
++ * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
++ */
++static bool device_uses_fleecing(const char *device_id)
++{
++ return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
++}
++
+ /*
+ * Returns a list of device infos, which needs to be freed by the caller. In
+ * case of an error, errp will be set, but the returned value might still be a
+@@ -587,6 +696,7 @@ static void create_backup_jobs_bh(void *opaque) {
+ */
+ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+ const char *devlist,
++ bool fleecing,
+ Error **errp)
+ {
+ gchar **devs = NULL;
+@@ -610,6 +720,31 @@ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+ }
+ PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
+ di->bs = bs;
++
++ if (fleecing && device_uses_fleecing(*d)) {
++ g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
++ BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
++ if (!fleecing_blk) {
++ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
++ "Device '%s' not found", fleecing_devid);
++ goto err;
++ }
++ BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
++ if (!bdrv_co_is_inserted(fleecing_bs)) {
++ error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
++ goto err;
++ }
++ /*
++ * Fleecing image needs to be the same size to act as a cbw target.
++ */
++ if (bs->total_sectors != fleecing_bs->total_sectors) {
++ error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
++ fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
++ goto err;
++ }
++ di->fleecing.bs = fleecing_bs;
++ }
++
+ di_list = g_list_append(di_list, di);
+ d++;
+ }
+@@ -659,6 +794,7 @@ UuidInfo coroutine_fn *qmp_backup(
+ const char *devlist,
+ bool has_speed, int64_t speed,
+ bool has_max_workers, int64_t max_workers,
++ bool has_fleecing, bool fleecing,
+ Error **errp)
+ {
+ assert(qemu_in_coroutine());
+@@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup(
+ format = has_format ? format : BACKUP_FORMAT_VMA;
+
+ bdrv_graph_co_rdlock();
+- di_list = get_device_info(devlist, &local_err);
++ di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
+ bdrv_graph_co_rdunlock();
+ if (local_err) {
+ error_propagate(errp, local_err);
+@@ -1095,5 +1231,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+ ret->query_bitmap_info = true;
+ ret->pbs_masterkey = true;
+ ret->backup_max_workers = true;
++ ret->backup_fleecing = true;
+ return ret;
+ }
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index 6e7ee87633..dc5f75cd39 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -948,6 +948,10 @@
+ #
+ # @max-workers: see @BackupPerf for details. Default 16.
+ #
++# @fleecing: perform a backup with fleecing. For each device in @devlist, a
++# corresponding '-fleecing' device with the same size already needs to
++# be present.
++#
+ # Returns: the uuid of the backup job
+ #
+ ##
+@@ -968,7 +972,8 @@
+ '*firewall-file': 'str',
+ '*devlist': 'str',
+ '*speed': 'int',
+- '*max-workers': 'int' },
++ '*max-workers': 'int',
++ '*fleecing': 'bool' },
+ 'returns': 'UuidInfo', 'coroutine': true }
+
+ ##
+@@ -1014,6 +1019,8 @@
+ #
+ # @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
+ #
++# @backup-fleecing: Whether backup fleecing is supported or not.
++#
+ # @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
+ # supported or not.
+ #
+@@ -1025,6 +1032,7 @@
+ 'pbs-dirty-bitmap-migration': 'bool',
+ 'pbs-masterkey': 'bool',
+ 'pbs-library-version': 'str',
++ 'backup-fleecing': 'bool',
+ 'backup-max-workers': 'bool' } }
+
+ ##
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:27 +0200
-Subject: [PATCH] backup: add minimum cluster size to performance options
-
-Useful to make discard-source work in the context of backup fleecing
-when the fleecing image has a larger granularity than the backup
-target.
-
-Backup/block-copy will use at least this granularity for copy operations
-and in particular, discard requests to the backup source will too. If
-the granularity is too small, they will just be aligned down in
-cbw_co_pdiscard_snapshot() and thus effectively ignored.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/backup.c | 2 +-
- block/copy-before-write.c | 2 ++
- block/copy-before-write.h | 1 +
- blockdev.c | 3 +++
- qapi/block-core.json | 9 +++++++--
- 5 files changed, 14 insertions(+), 3 deletions(-)
-
-diff --git a/block/backup.c b/block/backup.c
-index f19b751fe6..4367278d68 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -434,7 +434,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- }
-
- cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
-- &bcs, errp);
-+ perf->min_cluster_size, &bcs, errp);
- if (!cbw) {
- goto error;
- }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index fbc5aeb9eb..3d5523992c 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -557,6 +557,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
- bool discard_source,
-+ int64_t min_cluster_size,
- BlockCopyState **bcs,
- Error **errp)
- {
-@@ -575,6 +576,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- }
- qdict_put_str(opts, "file", bdrv_get_node_name(source));
- qdict_put_str(opts, "target", bdrv_get_node_name(target));
-+ qdict_put_int(opts, "min-cluster-size", min_cluster_size);
-
- top = bdrv_insert_node(source, opts, flags, errp);
- if (!top) {
-diff --git a/block/copy-before-write.h b/block/copy-before-write.h
-index 01af0cd3c4..dc6cafe7fa 100644
---- a/block/copy-before-write.h
-+++ b/block/copy-before-write.h
-@@ -40,6 +40,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
- bool discard_source,
-+ int64_t min_cluster_size,
- BlockCopyState **bcs,
- Error **errp);
- void bdrv_cbw_drop(BlockDriverState *bs);
-diff --git a/blockdev.c b/blockdev.c
-index e167ad1e54..35ec5d8f0b 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -2781,6 +2781,9 @@ static BlockJob *do_backup_common(BackupCommon *backup,
- if (backup->x_perf->has_max_chunk) {
- perf.max_chunk = backup->x_perf->max_chunk;
- }
-+ if (backup->x_perf->has_min_cluster_size) {
-+ perf.min_cluster_size = backup->x_perf->min_cluster_size;
-+ }
- }
-
- if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) ||
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index f2fec625cc..48eec4ef29 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -1775,11 +1775,16 @@
- # it should not be less than job cluster size which is calculated
- # as maximum of target image cluster size and 64k. Default 0.
- #
-+# @min-cluster-size: Minimum size of blocks used by copy-before-write
-+# and background copy operations. Has to be a power of 2. No
-+# effect if smaller than the maximum of the target's cluster size
-+# and 64 KiB. Default 0. (Since 8.1)
-+#
- # Since: 6.0
- ##
- { 'struct': 'BackupPerf',
-- 'data': { '*use-copy-range': 'bool',
-- '*max-workers': 'int', '*max-chunk': 'int64' } }
-+ 'data': { '*use-copy-range': 'bool', '*max-workers': 'int',
-+ '*max-chunk': 'int64', '*min-cluster-size': 'uint32' } }
-
- ##
- # @BackupCommon:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:28 +0200
-Subject: [PATCH] PVE backup: add fleecing option
-
-When a fleecing option is given, it is expected that each device has
-a corresponding "-fleecing" block device already attached, except for
-EFI disk and TPM state, where fleecing is never used.
-
-The following graph was adapted from [0] which also contains more
-details about fleecing.
-
-[guest]
- |
- | root
- v file
-[copy-before-write]<------[snapshot-access]
- | |
- | file | target
- v v
-[source] [fleecing]
-
-For fleecing, a copy-before-write filter is inserted on top of the
-source node, as well as a snapshot-access node pointing to the filter
-node which allows to read the consistent state of the image at the
-time it was inserted. New guest writes are passed through the
-copy-before-write filter which will first copy over old data to the
-fleecing image in case that old data is still needed by the
-snapshot-access node.
-
-The backup process will sequentially read from the snapshot access,
-which has a bitmap and knows whether to read from the original image
-or the fleecing image to get the "snapshot" state, i.e. data from the
-source image at the time when the copy-before-write filter was
-inserted. After reading, the copied sections are discarded from the
-fleecing image to reduce space usage.
-
-All of this can be restricted by an initial dirty bitmap to parts of
-the source image that are required for an incremental backup.
-
-For discard to work, it is necessary that the fleecing image does not
-have a larger cluster size than the backup job granularity. Since
-querying that size does not always work, e.g. for RBD with krbd, the
-cluster size will not be reported, a minimum of 4 MiB is used. A job
-with PBS target already has at least this granularity, so it's just
-relevant for other targets. I.e. edge cases where this minimum is not
-enough should be very rare in practice. If ever necessary in the
-future, can still add a passed-in value for the backup QMP command to
-override.
-
-Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
-are set when installing the copy-before-write filter and
-snapshot-access. When an error or timeout occurs, the problematic (and
-each further) snapshot operation will fail and thus cancel the backup
-instead of breaking the guest write.
-
-Note that job_id cannot be inferred from the snapshot-access bs because
-it has no parent, so just pass the one from the original bs.
-
-[0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/monitor/block-hmp-cmds.c | 1 +
- pve-backup.c | 145 ++++++++++++++++++++++++++++++++-
- qapi/block-core.json | 8 +-
- 3 files changed, 150 insertions(+), 4 deletions(-)
-
-diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
-index 1656859e03..f6cc9e5cf7 100644
---- a/block/monitor/block-hmp-cmds.c
-+++ b/block/monitor/block-hmp-cmds.c
-@@ -1072,6 +1072,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
- NULL, NULL,
- devlist, qdict_haskey(qdict, "speed"), speed,
- false, 0, // BackupPerf max-workers
-+ false, false, // fleecing
- &error);
-
- hmp_handle_error(mon, error);
-diff --git a/pve-backup.c b/pve-backup.c
-index 777db7938e..4c728951ac 100644
---- a/pve-backup.c
-+++ b/pve-backup.c
-@@ -7,9 +7,11 @@
- #include "sysemu/blockdev.h"
- #include "block/block_int-global-state.h"
- #include "block/blockjob.h"
-+#include "block/copy-before-write.h"
- #include "block/dirty-bitmap.h"
- #include "block/graph-lock.h"
- #include "qapi/qapi-commands-block.h"
-+#include "qapi/qmp/qdict.h"
- #include "qapi/qmp/qerror.h"
- #include "qemu/cutils.h"
-
-@@ -81,8 +83,15 @@ static void pvebackup_init(void)
- // initialize PVEBackupState at startup
- opts_init(pvebackup_init);
-
-+typedef struct PVEBackupFleecingInfo {
-+ BlockDriverState *bs;
-+ BlockDriverState *cbw;
-+ BlockDriverState *snapshot_access;
-+} PVEBackupFleecingInfo;
-+
- typedef struct PVEBackupDevInfo {
- BlockDriverState *bs;
-+ PVEBackupFleecingInfo fleecing;
- size_t size;
- uint64_t block_size;
- uint8_t dev_id;
-@@ -355,6 +364,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
- PVEBackupDevInfo *di = opaque;
- di->completed_ret = ret;
-
-+ /*
-+ * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
-+ * won't be done as a coroutine anyways:
-+ * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
-+ * just spawn a BH calling bdrv_unref().
-+ * - For cbw, draining would need to spawn a BH.
-+ *
-+ * Note that the AioContext lock is already acquired by our caller, i.e.
-+ * job_finalize_single_locked()
-+ */
-+ if (di->fleecing.snapshot_access) {
-+ bdrv_unref(di->fleecing.snapshot_access);
-+ di->fleecing.snapshot_access = NULL;
-+ }
-+ if (di->fleecing.cbw) {
-+ bdrv_cbw_drop(di->fleecing.cbw);
-+ di->fleecing.cbw = NULL;
-+ }
-+
- /*
- * Needs to happen outside of coroutine, because it takes the graph write lock.
- */
-@@ -525,9 +553,84 @@ static void create_backup_jobs_bh(void *opaque) {
-
- bdrv_drained_begin(di->bs);
-
-+ BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
-+
-+ BlockDriverState *source_bs = di->bs;
-+ bool discard_source = false;
-+ bdrv_graph_co_rdlock();
-+ const char *job_id = bdrv_get_device_name(di->bs);
-+ bdrv_graph_co_rdunlock();
-+ if (di->fleecing.bs) {
-+ QDict *cbw_opts = qdict_new();
-+ qdict_put_str(cbw_opts, "driver", "copy-before-write");
-+ qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
-+ qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
-+
-+ if (di->bitmap) {
-+ /*
-+ * Only guest writes to parts relevant for the backup need to be intercepted with
-+ * old data being copied to the fleecing image.
-+ */
-+ qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
-+ qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
-+ }
-+ /*
-+ * Fleecing storage is supposed to be fast and it's better to break backup than guest
-+ * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
-+ * abort a bit before that.
-+ */
-+ qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
-+ qdict_put_int(cbw_opts, "cbw-timeout", 45);
-+
-+ di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
-+
-+ if (!di->fleecing.cbw) {
-+ error_setg(errp, "appending cbw node for fleecing failed: %s",
-+ local_err ? error_get_pretty(local_err) : "unknown error");
-+ break;
-+ }
-+
-+ QDict *snapshot_access_opts = qdict_new();
-+ qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
-+ qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
-+
-+ /*
-+ * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
-+ * will aquire it a second time. But it's allowed to be held exactly once when polling
-+ * and that happens when the bdrv_refresh_total_sectors() call is made there.
-+ */
-+ aio_context_release(aio_context);
-+ di->fleecing.snapshot_access =
-+ bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
-+ aio_context_acquire(aio_context);
-+ if (!di->fleecing.snapshot_access) {
-+ error_setg(errp, "setting up snapshot access for fleecing failed: %s",
-+ local_err ? error_get_pretty(local_err) : "unknown error");
-+ break;
-+ }
-+ source_bs = di->fleecing.snapshot_access;
-+ discard_source = true;
-+
-+ /*
-+ * bdrv_get_info() just retuns 0 (= doesn't matter) for RBD when using krbd. But discard
-+ * on the fleecing image won't work if the backup job's granularity is less than the RBD
-+ * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
-+ * target, the backup job granularity would already be at least this much.
-+ */
-+ perf.min_cluster_size = 4 * 1024 * 1024;
-+ /*
-+ * For discard to work, cluster size for the backup job must be at least the same as for
-+ * the fleecing image.
-+ */
-+ BlockDriverInfo bdi;
-+ if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
-+ perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
-+ }
-+ }
-+
- BlockJob *job = backup_job_create(
-- NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
-- bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
-+ job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
-+ bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
- BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
- &local_err);
-
-@@ -585,6 +688,14 @@ static void create_backup_jobs_bh(void *opaque) {
- aio_co_enter(data->ctx, data->co);
- }
-
-+/*
-+ * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
-+ */
-+static bool device_uses_fleecing(const char *device_id)
-+{
-+ return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
-+}
-+
- /*
- * Returns a list of device infos, which needs to be freed by the caller. In
- * case of an error, errp will be set, but the returned value might still be a
-@@ -592,6 +703,7 @@ static void create_backup_jobs_bh(void *opaque) {
- */
- static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
- const char *devlist,
-+ bool fleecing,
- Error **errp)
- {
- gchar **devs = NULL;
-@@ -615,6 +727,31 @@ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
- }
- PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
- di->bs = bs;
-+
-+ if (fleecing && device_uses_fleecing(*d)) {
-+ g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
-+ BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
-+ if (!fleecing_blk) {
-+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
-+ "Device '%s' not found", fleecing_devid);
-+ goto err;
-+ }
-+ BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
-+ if (!bdrv_co_is_inserted(fleecing_bs)) {
-+ error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
-+ goto err;
-+ }
-+ /*
-+ * Fleecing image needs to be the same size to act as a cbw target.
-+ */
-+ if (bs->total_sectors != fleecing_bs->total_sectors) {
-+ error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
-+ fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
-+ goto err;
-+ }
-+ di->fleecing.bs = fleecing_bs;
-+ }
-+
- di_list = g_list_append(di_list, di);
- d++;
- }
-@@ -664,6 +801,7 @@ UuidInfo coroutine_fn *qmp_backup(
- const char *devlist,
- bool has_speed, int64_t speed,
- bool has_max_workers, int64_t max_workers,
-+ bool has_fleecing, bool fleecing,
- Error **errp)
- {
- assert(qemu_in_coroutine());
-@@ -692,7 +830,7 @@ UuidInfo coroutine_fn *qmp_backup(
- format = has_format ? format : BACKUP_FORMAT_VMA;
-
- bdrv_graph_co_rdlock();
-- di_list = get_device_info(devlist, &local_err);
-+ di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
- bdrv_graph_co_rdunlock();
- if (local_err) {
- error_propagate(errp, local_err);
-@@ -1100,5 +1238,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
- ret->query_bitmap_info = true;
- ret->pbs_masterkey = true;
- ret->backup_max_workers = true;
-+ ret->backup_fleecing = true;
- return ret;
- }
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 48eec4ef29..1c036e488e 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -935,6 +935,10 @@
- #
- # @max-workers: see @BackupPerf for details. Default 16.
- #
-+# @fleecing: perform a backup with fleecing. For each device in @devlist, a
-+# corresponing '-fleecing' device with the same size already needs to
-+# be present.
-+#
- # Returns: the uuid of the backup job
- #
- ##
-@@ -955,7 +959,8 @@
- '*firewall-file': 'str',
- '*devlist': 'str',
- '*speed': 'int',
-- '*max-workers': 'int' },
-+ '*max-workers': 'int',
-+ '*fleecing': 'bool' },
- 'returns': 'UuidInfo', 'coroutine': true }
-
- ##
-@@ -1011,6 +1016,7 @@
- 'pbs-dirty-bitmap-migration': 'bool',
- 'pbs-masterkey': 'bool',
- 'pbs-library-version': 'str',
-+ 'backup-fleecing': 'bool',
- 'backup-max-workers': 'bool' } }
-
- ##
extra/0001-monitor-qmp-fix-race-with-clients-disconnecting-earl.patch
extra/0002-scsi-megasas-Internal-cdbs-have-16-byte-length.patch
extra/0003-ide-avoid-potential-deadlock-when-draining-during-tr.patch
-extra/0004-migration-block-dirty-bitmap-fix-loading-bitmap-when.patch
-extra/0005-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
-extra/0006-qemu_init-increase-NOFILE-soft-limit-on-POSIX.patch
-extra/0007-mirror-Don-t-call-job_pause_point-under-graph-lock.patch
+extra/0004-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
bitmap-mirror/0001-drive-mirror-add-support-for-sync-bitmap-mode-never.patch
bitmap-mirror/0002-drive-mirror-add-support-for-conditional-and-always-.patch
bitmap-mirror/0003-mirror-add-check-for-bitmap-mode-without-bitmap.patch
pve/0035-migration-block-dirty-bitmap-migrate-other-bitmaps-e.patch
pve/0036-PVE-fall-back-to-open-iscsi-initiatorname.patch
pve/0037-PVE-block-stream-increase-chunk-size.patch
-pve/0038-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch
-pve/0039-block-add-alloc-track-driver.patch
-pve/0040-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
-pve/0041-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
-pve/0042-Revert-block-rbd-implement-bdrv_co_block_status.patch
-pve/0043-alloc-track-error-out-when-auto-remove-is-not-set.patch
-pve/0044-alloc-track-avoid-seemingly-superfluous-child-permis.patch
-pve/0045-block-copy-before-write-fix-permission.patch
-pve/0046-block-copy-before-write-support-unligned-snapshot-di.patch
-pve/0047-block-copy-before-write-create-block_copy-bitmap-in-.patch
-pve/0048-qapi-blockdev-backup-add-discard-source-parameter.patch
-pve/0049-copy-before-write-allow-specifying-minimum-cluster-s.patch
-pve/0050-backup-add-minimum-cluster-size-to-performance-optio.patch
-pve/0051-PVE-backup-add-fleecing-option.patch
+pve/0038-block-add-alloc-track-driver.patch
+pve/0039-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
+pve/0040-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
+pve/0041-Revert-block-rbd-implement-bdrv_co_block_status.patch
+pve/0042-alloc-track-error-out-when-auto-remove-is-not-set.patch
+pve/0043-alloc-track-avoid-seemingly-superfluous-child-permis.patch
+pve/0044-block-copy-before-write-fix-permission.patch
+pve/0045-block-copy-before-write-support-unligned-snapshot-di.patch
+pve/0046-block-copy-before-write-create-block_copy-bitmap-in-.patch
+pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch
+pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch
+pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch
+pve/0050-PVE-backup-add-fleecing-option.patch
-Subproject commit 11aa0b1ff115b86160c4d37e7c37e6a6b13b77ea
+Subproject commit c25df57ae8f9fe1c72eee2dab37d76d904ac382e