PC_BIOS_FW_PURGE_LIST_IN = \
hppa-firmware.img \
+ hppa-firmware64.img \
openbios-ppc \
openbios-sparc32 \
openbios-sparc64 \
s390-ccw.img \
s390-netboot.img \
u-boot.e500 \
- .*\.dtb \
+ .*[a-zA-Z0-9]\.dtb \
+ .*[a-zA-Z0-9]\.dts \
qemu_vga.ndrv \
slof.bin \
opensbi-riscv.*-generic-fw_dynamic.bin \
deb kvm: $(DEBS)
$(DEB_DBG): $(DEB)
$(DEB): $(BUILDDIR)
- cd $(BUILDDIR); dpkg-buildpackage -b -us -uc -j
+ cd $(BUILDDIR); dpkg-buildpackage -b -us -uc
lintian $(DEBS)
sbuild: $(DSC)
+pve-qemu-kvm (9.0.0-1) bookworm; urgency=medium
+
+ * update submodule and patches to QEMU 9.0.0
+
+ -- Proxmox Support Team <support@proxmox.com> Mon, 29 Apr 2024 10:51:37 +0200
+
+pve-qemu-kvm (8.2.2-1) bookworm; urgency=medium
+
+ * update submodule and patches to QEMU 8.2.2
+
+ -- Proxmox Support Team <support@proxmox.com> Sat, 27 Apr 2024 12:44:30 +0200
+
pve-qemu-kvm (8.1.5-5) bookworm; urgency=medium
* implement support for backup fleecing
Signed-off-by: John Snow <jsnow@redhat.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: rebased for 8.1.1]
+[FE: rebased for 8.2.2]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
- block/mirror.c | 98 +++++++++++++++++++++-----
+ block/mirror.c | 99 ++++++++++++++++++++------
blockdev.c | 38 +++++++++-
include/block/block_int-global-state.h | 4 +-
qapi/block-core.json | 25 ++++++-
tests/unit/test-block-iothread.c | 4 +-
- 5 files changed, 142 insertions(+), 27 deletions(-)
+ 5 files changed, 142 insertions(+), 28 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index d3cacd1708..1ff42c8af1 100644
+index 1bdce3b657..0c5c72df2e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -51,7 +51,7 @@ typedef struct MirrorBlockJob {
BlockMirrorBackingMode backing_mode;
/* Whether the target image requires explicit zero-initialization */
bool zero_target;
-@@ -65,6 +65,8 @@ typedef struct MirrorBlockJob {
+@@ -73,6 +73,8 @@ typedef struct MirrorBlockJob {
size_t buf_size;
int64_t bdev_length;
unsigned long *cow_bitmap;
BdrvDirtyBitmap *dirty_bitmap;
BdrvDirtyBitmapIter *dbi;
uint8_t *buf;
-@@ -705,7 +707,8 @@ static int mirror_exit_common(Job *job)
- bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
+@@ -722,7 +724,8 @@ static int mirror_exit_common(Job *job)
&error_abort);
+
if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
- BlockDriverState *backing = s->is_none_mode ? src : s->base;
+ BlockDriverState *backing;
BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
if (bdrv_cow_bs(unfiltered_target) != backing) {
-@@ -809,6 +812,16 @@ static void mirror_abort(Job *job)
+@@ -819,6 +822,16 @@ static void mirror_abort(Job *job)
assert(ret == 0);
}
static void coroutine_fn mirror_throttle(MirrorBlockJob *s)
{
int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-@@ -997,7 +1010,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
+@@ -1015,7 +1028,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
mirror_free_init(s);
s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
ret = mirror_dirty_init(s);
if (ret < 0 || job_is_cancelled(&s->common.job)) {
goto immediate_exit;
-@@ -1251,6 +1265,7 @@ static const BlockJobDriver mirror_job_driver = {
+@@ -1304,6 +1318,7 @@ static const BlockJobDriver mirror_job_driver = {
.run = mirror_run,
.prepare = mirror_prepare,
.abort = mirror_abort,
.pause = mirror_pause,
.complete = mirror_complete,
.cancel = mirror_cancel,
-@@ -1267,6 +1282,7 @@ static const BlockJobDriver commit_active_job_driver = {
+@@ -1322,6 +1337,7 @@ static const BlockJobDriver commit_active_job_driver = {
.run = mirror_run,
.prepare = mirror_prepare,
.abort = mirror_abort,
.pause = mirror_pause,
.complete = mirror_complete,
.cancel = commit_active_cancel,
-@@ -1658,7 +1674,10 @@ static BlockJob *mirror_start_job(
+@@ -1714,7 +1730,10 @@ static BlockJob *mirror_start_job(
BlockCompletionFunc *cb,
void *opaque,
const BlockJobDriver *driver,
bool auto_complete, const char *filter_node_name,
bool is_mirror, MirrorCopyMode copy_mode,
Error **errp)
-@@ -1670,10 +1689,39 @@ static BlockJob *mirror_start_job(
- uint64_t target_perms, target_shared_perms;
- int ret;
+@@ -1728,10 +1747,39 @@ static BlockJob *mirror_start_job(
+
+ GLOBAL_STATE_CODE();
- if (granularity == 0) {
- granularity = bdrv_get_default_bitmap_granularity(target);
assert(is_power_of_2(granularity));
if (buf_size < 0) {
-@@ -1804,7 +1852,9 @@ static BlockJob *mirror_start_job(
+@@ -1871,7 +1919,9 @@ static BlockJob *mirror_start_job(
s->replaces = g_strdup(replaces);
s->on_source_error = on_source_error;
s->on_target_error = on_target_error;
+ s->bitmap_mode = bitmap_mode;
s->backing_mode = backing_mode;
s->zero_target = zero_target;
- s->copy_mode = copy_mode;
-@@ -1825,6 +1875,18 @@ static BlockJob *mirror_start_job(
- bdrv_disable_dirty_bitmap(s->dirty_bitmap);
- }
+ qatomic_set(&s->copy_mode, copy_mode);
+@@ -1897,6 +1947,18 @@ static BlockJob *mirror_start_job(
+ */
+ bdrv_disable_dirty_bitmap(s->dirty_bitmap);
+ if (s->sync_bitmap) {
+ bdrv_dirty_bitmap_set_busy(s->sync_bitmap, true);
+ }
+ }
+
+ bdrv_graph_wrlock();
ret = block_job_add_bdrv(&s->common, "source", bs, 0,
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
- BLK_PERM_CONSISTENT_READ,
-@@ -1902,6 +1964,9 @@ fail:
+@@ -1979,6 +2041,9 @@ fail:
if (s->dirty_bitmap) {
bdrv_release_dirty_bitmap(s->dirty_bitmap);
}
job_early_fail(&s->common.job);
}
-@@ -1919,31 +1984,25 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
+@@ -2001,35 +2066,28 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *target, const char *replaces,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
- MirrorSyncMode_str(mode));
- return;
- }
+-
+ bdrv_graph_rdlock_main_loop();
- is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL;
+ bdrv_graph_rdunlock_main_loop();
+
mirror_start_job(job_id, bs, creation_flags, target, replaces,
speed, granularity, buf_size, backing_mode, zero_target,
on_source_error, on_target_error, unmap, NULL, NULL,
}
BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
-@@ -1970,7 +2029,8 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
+@@ -2056,7 +2114,8 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
MIRROR_LEAVE_BACKING_CHAIN, false,
on_error, on_error, true, cb, opaque,
errp);
if (!job) {
diff --git a/blockdev.c b/blockdev.c
-index c28462a633..a402fa4bf7 100644
+index 057601dcf0..8682814a7a 100644
--- a/blockdev.c
+++ b/blockdev.c
-@@ -2849,6 +2849,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2776,6 +2776,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
BlockDriverState *target,
const char *replaces,
enum MirrorSyncMode sync,
BlockMirrorBackingMode backing_mode,
bool zero_target,
bool has_speed, int64_t speed,
-@@ -2867,6 +2870,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2794,6 +2797,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
{
BlockDriverState *unfiltered_bs;
int job_flags = JOB_DEFAULT;
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
-@@ -2921,6 +2925,29 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2848,6 +2852,29 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
sync = MIRROR_SYNC_MODE_FULL;
}
if (!replaces) {
/* We want to mirror from @bs, but keep implicit filters on top */
unfiltered_bs = bdrv_skip_implicit_filters(bs);
-@@ -2966,8 +2993,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2889,8 +2916,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
* and will allow to check whether the node still exist at mirror completion
*/
mirror_start(job_id, bs, target,
on_source_error, on_target_error, unmap, filter_node_name,
copy_mode, errp);
}
-@@ -3115,6 +3142,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
+@@ -3034,6 +3061,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
blockdev_mirror_common(arg->job_id, bs, target_bs,
arg->replaces, arg->sync,
backing_mode, zero_target,
arg->has_speed, arg->speed,
arg->has_granularity, arg->granularity,
-@@ -3136,6 +3165,8 @@ void qmp_blockdev_mirror(const char *job_id,
+@@ -3053,6 +3082,8 @@ void qmp_blockdev_mirror(const char *job_id,
const char *device, const char *target,
const char *replaces,
MirrorSyncMode sync,
bool has_speed, int64_t speed,
bool has_granularity, uint32_t granularity,
bool has_buf_size, int64_t buf_size,
-@@ -3184,7 +3215,8 @@ void qmp_blockdev_mirror(const char *job_id,
+@@ -3093,7 +3124,8 @@ void qmp_blockdev_mirror(const char *job_id,
}
blockdev_mirror_common(job_id, bs, target_bs,
has_granularity, granularity,
has_buf_size, buf_size,
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
-index da5fb31089..32f0f9858a 100644
+index d2201e27f4..cc1387ae02 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
-@@ -152,7 +152,9 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
+@@ -158,7 +158,9 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
BlockDriverState *target, const char *replaces,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index bca1a0c372..a5cea82139 100644
+index 746d1694c2..45ab548dfe 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -2145,6 +2145,15 @@
+@@ -2174,6 +2174,15 @@
# destination (all the disk, only the sectors allocated in the
# topmost image, or only new I/O).
#
# @granularity: granularity of the dirty bitmap, default is 64K if the
# image format doesn't have clusters, 4K if the clusters are
# smaller than that, else the cluster size. Must be a power of 2
-@@ -2187,7 +2196,9 @@
+@@ -2216,7 +2225,9 @@
{ 'struct': 'DriveMirror',
'data': { '*job-id': 'str', 'device': 'str', 'target': 'str',
'*format': 'str', '*node-name': 'str', '*replaces': 'str',
'*speed': 'int', '*granularity': 'uint32',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError',
-@@ -2471,6 +2482,15 @@
+@@ -2496,6 +2507,15 @@
# destination (all the disk, only the sectors allocated in the
# topmost image, or only new I/O).
#
# @granularity: granularity of the dirty bitmap, default is 64K if the
# image format doesn't have clusters, 4K if the clusters are
# smaller than that, else the cluster size. Must be a power of 2
-@@ -2521,7 +2541,8 @@
+@@ -2544,7 +2564,8 @@
{ 'command': 'blockdev-mirror',
'data': { '*job-id': 'str', 'device': 'str', 'target': 'str',
'*replaces': 'str',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError',
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
-index d727a5fee8..8a34aa2328 100644
+index 3766d5de6b..afa44cbd34 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
-@@ -757,8 +757,8 @@ static void test_propagate_mirror(void)
+@@ -755,8 +755,8 @@ static void test_propagate_mirror(void)
/* Start a mirror job */
mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
+ false, BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
&error_abort);
- WITH_JOB_LOCK_GUARD() {
+
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index 1ff42c8af1..11b8a8e959 100644
+index 0c5c72df2e..37fee3fa25 100644
--- a/block/mirror.c
+++ b/block/mirror.c
-@@ -682,8 +682,6 @@ static int mirror_exit_common(Job *job)
+@@ -693,8 +693,6 @@ static int mirror_exit_common(Job *job)
bdrv_unfreeze_backing_chain(mirror_top_bs, target_bs);
}
/* Make sure that the source BDS doesn't go away during bdrv_replace_node,
* before we can call bdrv_drained_end */
bdrv_ref(src);
-@@ -788,6 +786,18 @@ static int mirror_exit_common(Job *job)
- block_job_remove_all_bdrv(bjob);
- bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
+@@ -800,6 +798,18 @@ static int mirror_exit_common(Job *job)
+ bdrv_drained_end(target_bs);
+ bdrv_unref(target_bs);
+ if (s->sync_bitmap) {
+ if (s->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS ||
bs_opaque->job = NULL;
bdrv_drained_end(src);
-@@ -1699,10 +1709,6 @@ static BlockJob *mirror_start_job(
+@@ -1757,10 +1767,6 @@ static BlockJob *mirror_start_job(
" sync mode",
MirrorSyncMode_str(sync_mode));
return NULL;
}
} else if (bitmap) {
error_setg(errp,
-@@ -1719,6 +1725,12 @@ static BlockJob *mirror_start_job(
+@@ -1777,6 +1783,12 @@ static BlockJob *mirror_start_job(
return NULL;
}
granularity = bdrv_dirty_bitmap_granularity(bitmap);
1 file changed, 3 insertions(+)
diff --git a/blockdev.c b/blockdev.c
-index a402fa4bf7..01b0ab0549 100644
+index 8682814a7a..5b75a085ee 100644
--- a/blockdev.c
+++ b/blockdev.c
-@@ -2946,6 +2946,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2873,6 +2873,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) {
return;
}
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index 11b8a8e959..00f2665ca4 100644
+index 37fee3fa25..6b3cce1007 100644
--- a/block/mirror.c
+++ b/block/mirror.c
-@@ -792,8 +792,8 @@ static int mirror_exit_common(Job *job)
+@@ -804,8 +804,8 @@ static int mirror_exit_common(Job *job)
job->ret == 0 && ret == 0)) {
/* Success; synchronize copy back to sync. */
bdrv_clear_dirty_bitmap(s->sync_bitmap, NULL);
}
}
bdrv_release_dirty_bitmap(s->dirty_bitmap);
-@@ -1892,11 +1892,8 @@ static BlockJob *mirror_start_job(
+@@ -1964,11 +1964,8 @@ static BlockJob *mirror_start_job(
}
if (s->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
+ NULL, true);
}
- ret = block_job_add_bdrv(&s->common, "source", bs, 0,
+ bdrv_graph_wrlock();
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: rebase for 8.0]
+[FE: rebase for 8.2.2]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
block/mirror.c | 28 +++------------
3 files changed, 70 insertions(+), 59 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
-index 00f2665ca4..60cf574de5 100644
+index 6b3cce1007..2f1223852b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
-@@ -1699,31 +1699,13 @@ static BlockJob *mirror_start_job(
- uint64_t target_perms, target_shared_perms;
- int ret;
+@@ -1757,31 +1757,13 @@ static BlockJob *mirror_start_job(
+
+ GLOBAL_STATE_CODE();
- if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
- error_setg(errp, "Sync mode '%s' not supported",
if (bitmap_mode != BITMAP_SYNC_MODE_NEVER) {
diff --git a/blockdev.c b/blockdev.c
-index 01b0ab0549..cd5f205ad1 100644
+index 5b75a085ee..d27d8c38ec 100644
--- a/blockdev.c
+++ b/blockdev.c
-@@ -2925,7 +2925,36 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2852,7 +2852,36 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
sync = MIRROR_SYNC_MODE_FULL;
}
/**
diff --git a/monitor/monitor.c b/monitor/monitor.c
-index dc352f9e9d..56e1307014 100644
+index 01ede1babd..5681bca346 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -117,6 +117,21 @@ bool monitor_cur_is_qmp(void)
monitor_qmp_caps_reset(mon);
data = qmp_greeting(mon);
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
-index 176b549473..790bb7d1da 100644
+index f3488afeef..2624eb3470 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -117,16 +117,28 @@ typedef struct QmpDispatchBH {
aio_co_wake(data->co);
}
-@@ -253,6 +265,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
+@@ -250,6 +262,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
.ret = &ret,
.errp = &err,
.co = qemu_coroutine_self(),
1 file changed, 2 insertions(+), 12 deletions(-)
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
-index 32c70c9e99..984b6a3145 100644
+index 2d0c607177..97e51733af 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -1781,7 +1781,7 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd)
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/hw/ide/core.c b/hw/ide/core.c
-index c3508acbb1..289347af58 100644
+index e8cb2dac92..3b21acf651 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
-@@ -444,7 +444,7 @@ static void ide_trim_bh_cb(void *opaque)
+@@ -456,7 +456,7 @@ static void ide_trim_bh_cb(void *opaque)
iocb->bh = NULL;
qemu_aio_unref(iocb);
blk_dec_in_flight(blk);
}
-@@ -504,6 +504,8 @@ static void ide_issue_trim_cb(void *opaque, int ret)
+@@ -516,6 +516,8 @@ static void ide_issue_trim_cb(void *opaque, int ret)
done:
iocb->aiocb = NULL;
if (iocb->bh) {
replay_bh_schedule_event(iocb->bh);
}
}
-@@ -516,9 +518,6 @@ BlockAIOCB *ide_issue_trim(
+@@ -528,9 +530,6 @@ BlockAIOCB *ide_issue_trim(
IDEDevice *dev = s->unit ? s->bus->slave : s->bus->master;
TrimAIOCB *iocb;
iocb = blk_aio_get(&trim_aiocb_info, s->blk, cb, cb_opaque);
iocb->s = s;
iocb->bh = qemu_bh_new_guarded(ide_trim_bh_cb, iocb,
-@@ -742,8 +741,9 @@ void ide_cancel_dma_sync(IDEState *s)
+@@ -754,8 +753,9 @@ void ide_cancel_dma_sync(IDEState *s)
*/
if (s->bus->dma->aiocb) {
trace_ide_cancel_dma_sync_remaining();
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Fri, 17 Nov 2023 11:18:06 +0100
+Subject: [PATCH] Revert "x86: acpi: workaround Windows not handling name
+ references in Package properly"
+
+This reverts commit 44d975ef340e2f21f236f9520c53e1b30d2213a4.
+
+As reported in the community forum [0] and reproduced locally this
+breaks VirtIO network adapters in (at least) the German ISO of Windows
+Server 2022. The fix itself was for
+
+> Issue is not fatal but as result acpi-index/"PCI Label ID" property
+> is either not shown in device details page or shows incorrect value.
+
+so revert and tolerate that as a stop-gap, rather than have the
+devices not working at all.
+
+[0]: https://forum.proxmox.com/threads/92094/post-605684
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ hw/i386/acpi-build.c | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
+index 53f804ac16..9b1b9f0412 100644
+--- a/hw/i386/acpi-build.c
++++ b/hw/i386/acpi-build.c
+@@ -347,13 +347,9 @@ Aml *aml_pci_device_dsm(void)
+ {
+ Aml *params = aml_local(0);
+ Aml *pkg = aml_package(2);
+- aml_append(pkg, aml_int(0));
+- aml_append(pkg, aml_int(0));
++ aml_append(pkg, aml_name("BSEL"));
++ aml_append(pkg, aml_name("ASUN"));
+ aml_append(method, aml_store(pkg, params));
+- aml_append(method,
+- aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
+- aml_append(method,
+- aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
+ aml_append(method,
+ aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
+ aml_arg(2), aml_arg(3), params))
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 28 Jul 2023 10:47:48 +0200
-Subject: [PATCH] migration/block-dirty-bitmap: fix loading bitmap when there
- is an iothread
-
-The bdrv_create_dirty_bitmap() function (which is also called by
-bdrv_dirty_bitmap_create_successor()) uses bdrv_getlength(bs). This is
-a wrapper around a coroutine, and thus uses bdrv_poll_co(). Polling
-tries to release the AioContext which will trigger an assert() if it
-hasn't been acquired before.
-
-The issue does not happen for migration, because there we are in a
-coroutine already, so the wrapper will just call bdrv_co_getlength()
-directly without polling.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- migration/block-dirty-bitmap.c | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index 032fc5f405..e1ae3b7316 100644
---- a/migration/block-dirty-bitmap.c
-+++ b/migration/block-dirty-bitmap.c
-@@ -805,8 +805,11 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s)
- "destination", bdrv_dirty_bitmap_name(s->bitmap));
- return -EINVAL;
- } else {
-+ AioContext *ctx = bdrv_get_aio_context(s->bs);
-+ aio_context_acquire(ctx);
- s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
- s->bitmap_name, &local_err);
-+ aio_context_release(ctx);
- if (!s->bitmap) {
- error_report_err(local_err);
- return -EINVAL;
-@@ -833,7 +836,10 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s)
-
- bdrv_disable_dirty_bitmap(s->bitmap);
- if (flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED) {
-+ AioContext *ctx = bdrv_get_aio_context(s->bs);
-+ aio_context_acquire(ctx);
- bdrv_dirty_bitmap_create_successor(s->bitmap, &local_err);
-+ aio_context_release(ctx);
- if (local_err) {
- error_report_err(local_err);
- return -EINVAL;
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 28 Sep 2023 10:07:03 +0200
-Subject: [PATCH] Revert "Revert "graph-lock: Disable locking for now""
-
-This reverts commit 3cce22defb4b0e47cf135444e30cc673cff5ebad.
-
-There are still some issues with graph locking, e.g. deadlocks during
-backup canceling [0]. Because the AioContext locks still exist, it
-should be safe to disable locking again.
-
-From the original 80fc5d2600 ("graph-lock: Disable locking for now"):
-
-> We don't currently rely on graph locking yet. It is supposed to replace
-> the AioContext lock eventually to enable multiqueue support, but as long
-> as we still have the AioContext lock, it is sufficient without the graph
-> lock. Once the AioContext lock goes away, the deadlock doesn't exist any
-> more either and this commit can be reverted. (Of course, it can also be
-> reverted while the AioContext lock still exists if the callers have been
-> fixed.)
-
-[0]: https://lists.nongnu.org/archive/html/qemu-devel/2023-09/msg00729.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/graph-lock.c | 24 ++++++++++++++++++++++++
- 1 file changed, 24 insertions(+)
-
-diff --git a/block/graph-lock.c b/block/graph-lock.c
-index 5e66f01ae8..5c2873262a 100644
---- a/block/graph-lock.c
-+++ b/block/graph-lock.c
-@@ -30,8 +30,10 @@ BdrvGraphLock graph_lock;
- /* Protects the list of aiocontext and orphaned_reader_count */
- static QemuMutex aio_context_list_lock;
-
-+#if 0
- /* Written and read with atomic operations. */
- static int has_writer;
-+#endif
-
- /*
- * A reader coroutine could move from an AioContext to another.
-@@ -88,6 +90,7 @@ void unregister_aiocontext(AioContext *ctx)
- g_free(ctx->bdrv_graph);
- }
-
-+#if 0
- static uint32_t reader_count(void)
- {
- BdrvGraphRWlock *brdv_graph;
-@@ -105,12 +108,19 @@ static uint32_t reader_count(void)
- assert((int32_t)rd >= 0);
- return rd;
- }
-+#endif
-
- void bdrv_graph_wrlock(BlockDriverState *bs)
- {
-+#if 0
- AioContext *ctx = NULL;
-
- GLOBAL_STATE_CODE();
-+ /*
-+ * TODO Some callers hold an AioContext lock when this is called, which
-+ * causes deadlocks. Reenable once the AioContext locking is cleaned up (or
-+ * AioContext locks are gone).
-+ */
- assert(!qatomic_read(&has_writer));
-
- /*
-@@ -158,11 +168,13 @@ void bdrv_graph_wrlock(BlockDriverState *bs)
- if (ctx) {
- aio_context_acquire(bdrv_get_aio_context(bs));
- }
-+#endif
- }
-
- void bdrv_graph_wrunlock(void)
- {
- GLOBAL_STATE_CODE();
-+#if 0
- QEMU_LOCK_GUARD(&aio_context_list_lock);
- assert(qatomic_read(&has_writer));
-
-@@ -174,10 +186,13 @@ void bdrv_graph_wrunlock(void)
-
- /* Wake up all coroutine that are waiting to read the graph */
- qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
-+#endif
- }
-
- void coroutine_fn bdrv_graph_co_rdlock(void)
- {
-+ /* TODO Reenable when wrlock is reenabled */
-+#if 0
- BdrvGraphRWlock *bdrv_graph;
- bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
-
-@@ -237,10 +252,12 @@ void coroutine_fn bdrv_graph_co_rdlock(void)
- qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
- }
- }
-+#endif
- }
-
- void coroutine_fn bdrv_graph_co_rdunlock(void)
- {
-+#if 0
- BdrvGraphRWlock *bdrv_graph;
- bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
-
-@@ -258,6 +275,7 @@ void coroutine_fn bdrv_graph_co_rdunlock(void)
- if (qatomic_read(&has_writer)) {
- aio_wait_kick();
- }
-+#endif
- }
-
- void bdrv_graph_rdlock_main_loop(void)
-@@ -275,13 +293,19 @@ void bdrv_graph_rdunlock_main_loop(void)
- void assert_bdrv_graph_readable(void)
- {
- /* reader_count() is slow due to aio_context_list_lock lock contention */
-+ /* TODO Reenable when wrlock is reenabled */
-+#if 0
- #ifdef CONFIG_DEBUG_GRAPH_LOCK
- assert(qemu_in_main_thread() || reader_count());
- #endif
-+#endif
- }
-
- void assert_bdrv_graph_writable(void)
- {
- assert(qemu_in_main_thread());
-+ /* TODO Reenable when wrlock is reenabled */
-+#if 0
- assert(qatomic_read(&has_writer));
-+#endif
- }
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Mon, 29 Apr 2024 15:41:11 +0200
+Subject: [PATCH] block/copy-before-write: use uint64_t for timeout in
+ nanoseconds
+
+rather than the uint32_t for which the maximum is slightly more than 4
+seconds and larger values would overflow. The QAPI interface allows
+specifying the number of seconds, so only values 0 to 4 are safe right
+now, other values lead to a much lower timeout than a user expects.
+
+The block_copy() call where this is used already takes a uint64_t for
+the timeout, so no change required there.
+
+Fixes: 6db7fd1ca9 ("block/copy-before-write: implement cbw-timeout option")
+Reported-by: Friedrich Weber <f.weber@proxmox.com>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Tested-by: Friedrich Weber <f.weber@proxmox.com>
+---
+ block/copy-before-write.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 8aba27a71d..026fa9840f 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -43,7 +43,7 @@ typedef struct BDRVCopyBeforeWriteState {
+ BlockCopyState *bcs;
+ BdrvChild *target;
+ OnCbwError on_cbw_error;
+- uint32_t cbw_timeout_ns;
++ uint64_t cbw_timeout_ns;
+
+ /*
+ * @lock: protects access to @access_bitmap, @done_bitmap and
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 28 Sep 2023 11:19:14 +0200
-Subject: [PATCH] migration states: workaround snapshot performance regression
-
-Commit 813cd616 ("migration: Use migration_transferred_bytes() to
-calculate rate_limit") introduced a prohibitive performance regression
-when taking a snapshot [0]. The reason turns out to be the flushing
-done by migration_transferred_bytes().
-
-Just use a _noflush version of the relevant function as a workaround
-until upstream fixes the issue. This is inspired by a not-applied
-upstream series [1], but doing the very minimum to avoid the
-regression.
-
-[0]: https://gitlab.com/qemu-project/qemu/-/issues/1821
-[1]: https://lists.nongnu.org/archive/html/qemu-devel/2023-05/msg07708.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- migration/migration-stats.c | 16 +++++++++++++++-
- 1 file changed, 15 insertions(+), 1 deletion(-)
-
-diff --git a/migration/migration-stats.c b/migration/migration-stats.c
-index 095d6d75bb..8073c8ebaa 100644
---- a/migration/migration-stats.c
-+++ b/migration/migration-stats.c
-@@ -18,6 +18,20 @@
-
- MigrationAtomicStats mig_stats;
-
-+/*
-+ * Same as migration_transferred_bytes below, but using the _noflush
-+ * variant of qemu_file_transferred() to avoid a performance
-+ * regression in migration_rate_exceeded().
-+ */
-+static uint64_t migration_transferred_bytes_noflush(QEMUFile *f)
-+{
-+ uint64_t multifd = stat64_get(&mig_stats.multifd_bytes);
-+ uint64_t qemu_file = qemu_file_transferred_noflush(f);
-+
-+ trace_migration_transferred_bytes(qemu_file, multifd);
-+ return qemu_file + multifd;
-+}
-+
- bool migration_rate_exceeded(QEMUFile *f)
- {
- if (qemu_file_get_error(f)) {
-@@ -25,7 +39,7 @@ bool migration_rate_exceeded(QEMUFile *f)
- }
-
- uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start);
-- uint64_t rate_limit_current = migration_transferred_bytes(f);
-+ uint64_t rate_limit_current = migration_transferred_bytes_noflush(f);
- uint64_t rate_limit_used = rate_limit_current - rate_limit_start;
- uint64_t rate_limit_max = stat64_get(&mig_stats.rate_limit_max);
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 17 Nov 2023 11:18:06 +0100
-Subject: [PATCH] Revert "x86: acpi: workaround Windows not handling name
- references in Package properly"
-
-This reverts commit 44d975ef340e2f21f236f9520c53e1b30d2213a4.
-
-As reported in the community forum [0] and reproduced locally this
-breaks VirtIO network adapters in (at least) the German ISO of Windows
-Server 2022. The fix itself was for
-
-> Issue is not fatal but as result acpi-index/"PCI Label ID" property
-> is either not shown in device details page or shows incorrect value.
-
-so revert and tolerate that as a stop-gap, rather than have the
-devices not working at all.
-
-[0]: https://forum.proxmox.com/threads/92094/post-605684
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/i386/acpi-build.c | 8 ++------
- 1 file changed, 2 insertions(+), 6 deletions(-)
-
-diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
-index bb12b0ad43..de14d3c3da 100644
---- a/hw/i386/acpi-build.c
-+++ b/hw/i386/acpi-build.c
-@@ -362,13 +362,9 @@ Aml *aml_pci_device_dsm(void)
- {
- Aml *params = aml_local(0);
- Aml *pkg = aml_package(2);
-- aml_append(pkg, aml_int(0));
-- aml_append(pkg, aml_int(0));
-+ aml_append(pkg, aml_name("BSEL"));
-+ aml_append(pkg, aml_name("ASUN"));
- aml_append(method, aml_store(pkg, params));
-- aml_append(method,
-- aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
-- aml_append(method,
-- aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
- aml_append(method,
- aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
- aml_arg(2), aml_arg(3), params))
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <pbonzini@redhat.com>
-Date: Tue, 1 Feb 2022 20:09:41 +0100
-Subject: [PATCH] target/i386: the sgx_epc_get_section stub is reachable
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The sgx_epc_get_section stub is reachable from cpu_x86_cpuid. It
-should not assert, instead it should just return true just like
-the "real" sgx_epc_get_section does when SGX is disabled.
-
-Reported-by: Vladimír Beneš <vbenes@redhat.com>
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Message-ID: <20220201190941.106001-1-pbonzini@redhat.com>
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-(cherry-picked from commit 219615740425d9683588207b40a365e6741691a6)
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/i386/sgx-stub.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c
-index 26833eb233..16b1dfd90b 100644
---- a/hw/i386/sgx-stub.c
-+++ b/hw/i386/sgx-stub.c
-@@ -34,5 +34,5 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms)
-
- bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
- {
-- g_assert_not_reached();
-+ return true;
- }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Wed, 24 Jan 2024 11:57:48 +0100
-Subject: [PATCH] ui/clipboard: mark type as not available when there is no
- data
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-With VNC, a client can send a non-extended VNC_MSG_CLIENT_CUT_TEXT
-message with len=0. In qemu_clipboard_set_data(), the clipboard info
-will be updated setting data to NULL (because g_memdup(data, size)
-returns NULL when size is 0). If the client does not set the
-VNC_ENCODING_CLIPBOARD_EXT feature when setting up the encodings, then
-the 'request' callback for the clipboard peer is not initialized.
-Later, because data is NULL, qemu_clipboard_request() can be reached
-via vdagent_chr_write() and vdagent_clipboard_recv_request() and
-there, the clipboard owner's 'request' callback will be attempted to
-be called, but that is a NULL pointer.
-
-In particular, this can happen when using the KRDC (22.12.3) VNC
-client.
-
-Another scenario leading to the same issue is with two clients (say
-noVNC and KRDC):
-
-The noVNC client sets the extension VNC_FEATURE_CLIPBOARD_EXT and
-initializes its cbpeer.
-
-The KRDC client does not, but triggers a vnc_client_cut_text() (note
-it's not the _ext variant). There, a new clipboard info with it as
-the 'owner' is created and qemu_clipboard_set_data() is called,
-which in turn calls qemu_clipboard_update() with that info.
-
-In qemu_clipboard_update(), the notifier for the noVNC client will be
-called, i.e. vnc_clipboard_notify() and also set vs->cbinfo for the
-noVNC client. The 'owner' in that clipboard info is the clipboard peer
-for the KRDC client, which did not initialize the 'request' function.
-That sounds correct to me, it is the owner of that clipboard info.
-
-Then when noVNC sends a VNC_MSG_CLIENT_CUT_TEXT message (it did set
-the VNC_FEATURE_CLIPBOARD_EXT feature correctly, so a check for it
-passes), that clipboard info is passed to qemu_clipboard_request() and
-the original segfault still happens.
-
-Fix the issue by handling updates with size 0 differently. In
-particular, mark in the clipboard info that the type is not available.
-
-While at it, switch to g_memdup2(), because g_memdup() is deprecated.
-
-Cc: qemu-stable@nongnu.org
-Fixes: CVE-2023-6683
-Reported-by: Markus Frank <m.frank@proxmox.com>
-Suggested-by: Marc-André Lureau <marcandre.lureau@redhat.com>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
-Tested-by: Markus Frank <m.frank@proxmox.com>
-(picked from https://lists.nongnu.org/archive/html/qemu-stable/2024-01/msg00228.html)
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- ui/clipboard.c | 12 +++++++++---
- 1 file changed, 9 insertions(+), 3 deletions(-)
-
-diff --git a/ui/clipboard.c b/ui/clipboard.c
-index 3d14bffaf8..b3f6fa3c9e 100644
---- a/ui/clipboard.c
-+++ b/ui/clipboard.c
-@@ -163,9 +163,15 @@ void qemu_clipboard_set_data(QemuClipboardPeer *peer,
- }
-
- g_free(info->types[type].data);
-- info->types[type].data = g_memdup(data, size);
-- info->types[type].size = size;
-- info->types[type].available = true;
-+ if (size) {
-+ info->types[type].data = g_memdup2(data, size);
-+ info->types[type].size = size;
-+ info->types[type].available = true;
-+ } else {
-+ info->types[type].data = NULL;
-+ info->types[type].size = 0;
-+ info->types[type].available = false;
-+ }
-
- if (update) {
- qemu_clipboard_update(info);
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Hanna Czenczek <hreitz@redhat.com>
-Date: Fri, 2 Feb 2024 16:31:56 +0100
-Subject: [PATCH] virtio-scsi: Attach event vq notifier with no_poll
-
-As of commit 38738f7dbbda90fbc161757b7f4be35b52205552 ("virtio-scsi:
-don't waste CPU polling the event virtqueue"), we only attach an io_read
-notifier for the virtio-scsi event virtqueue instead, and no polling
-notifiers. During operation, the event virtqueue is typically
-non-empty, but none of the buffers are intended to be used immediately.
-Instead, they only get used when certain events occur. Therefore, it
-makes no sense to continuously poll it when non-empty, because it is
-supposed to be and stay non-empty.
-
-We do this by using virtio_queue_aio_attach_host_notifier_no_poll()
-instead of virtio_queue_aio_attach_host_notifier() for the event
-virtqueue.
-
-Commit 766aa2de0f29b657148e04599320d771c36fd126 ("virtio-scsi: implement
-BlockDevOps->drained_begin()") however has virtio_scsi_drained_end() use
-virtio_queue_aio_attach_host_notifier() for all virtqueues, including
-the event virtqueue. This can lead to it being polled again, undoing
-the benefit of commit 38738f7dbbda90fbc161757b7f4be35b52205552.
-
-Fix it by using virtio_queue_aio_attach_host_notifier_no_poll() for the
-event virtqueue.
-
- ("virtio-scsi: implement BlockDevOps->drained_begin()")
-
-Reported-by: Fiona Ebner <f.ebner@proxmox.com>
-Fixes: 766aa2de0f29b657148e04599320d771c36fd126
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- hw/scsi/virtio-scsi.c | 7 ++++++-
- 1 file changed, 6 insertions(+), 1 deletion(-)
-
-diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
-index 45b95ea070..ad24a882fd 100644
---- a/hw/scsi/virtio-scsi.c
-+++ b/hw/scsi/virtio-scsi.c
-@@ -1148,6 +1148,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
- static void virtio_scsi_drained_end(SCSIBus *bus)
- {
- VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus);
-+ VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
- VirtIODevice *vdev = VIRTIO_DEVICE(s);
- uint32_t total_queues = VIRTIO_SCSI_VQ_NUM_FIXED +
- s->parent_obj.conf.num_queues;
-@@ -1165,7 +1166,11 @@ static void virtio_scsi_drained_end(SCSIBus *bus)
-
- for (uint32_t i = 0; i < total_queues; i++) {
- VirtQueue *vq = virtio_get_queue(vdev, i);
-- virtio_queue_aio_attach_host_notifier(vq, s->ctx);
-+ if (vq == vs->event_vq) {
-+ virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
-+ } else {
-+ virtio_queue_aio_attach_host_notifier(vq, s->ctx);
-+ }
- }
- }
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Hanna Czenczek <hreitz@redhat.com>
-Date: Fri, 2 Feb 2024 16:31:57 +0100
-Subject: [PATCH] virtio: Re-enable notifications after drain
-
-During drain, we do not care about virtqueue notifications, which is why
-we remove the handlers on it. When removing those handlers, whether vq
-notifications are enabled or not depends on whether we were in polling
-mode or not; if not, they are enabled (by default); if so, they have
-been disabled by the io_poll_start callback.
-
-Because we do not care about those notifications after removing the
-handlers, this is fine. However, we have to explicitly ensure they are
-enabled when re-attaching the handlers, so we will resume receiving
-notifications. We do this in virtio_queue_aio_attach_host_notifier*().
-If such a function is called while we are in a polling section,
-attaching the notifiers will then invoke the io_poll_start callback,
-re-disabling notifications.
-
-Because we will always miss virtqueue updates in the drained section, we
-also need to poll the virtqueue once after attaching the notifiers.
-
-Buglink: https://issues.redhat.com/browse/RHEL-3934
-Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- hw/virtio/virtio.c | 42 ++++++++++++++++++++++++++++++++++++++++++
- include/block/aio.h | 7 ++++++-
- 2 files changed, 48 insertions(+), 1 deletion(-)
-
-diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
-index 969c25f4cf..02cce83111 100644
---- a/hw/virtio/virtio.c
-+++ b/hw/virtio/virtio.c
-@@ -3526,6 +3526,17 @@ static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
-
- void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
- {
-+ /*
-+ * virtio_queue_aio_detach_host_notifier() can leave notifications disabled.
-+ * Re-enable them. (And if detach has not been used before, notifications
-+ * being enabled is still the default state while a notifier is attached;
-+ * see virtio_queue_host_notifier_aio_poll_end(), which will always leave
-+ * notifications enabled once the polling section is left.)
-+ */
-+ if (!virtio_queue_get_notification(vq)) {
-+ virtio_queue_set_notification(vq, 1);
-+ }
-+
- aio_set_event_notifier(ctx, &vq->host_notifier,
- virtio_queue_host_notifier_read,
- virtio_queue_host_notifier_aio_poll,
-@@ -3533,6 +3544,13 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
- aio_set_event_notifier_poll(ctx, &vq->host_notifier,
- virtio_queue_host_notifier_aio_poll_begin,
- virtio_queue_host_notifier_aio_poll_end);
-+
-+ /*
-+ * We will have ignored notifications about new requests from the guest
-+ * while no notifiers were attached, so "kick" the virt queue to process
-+ * those requests now.
-+ */
-+ event_notifier_set(&vq->host_notifier);
- }
-
- /*
-@@ -3543,14 +3561,38 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
- */
- void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx)
- {
-+ /* See virtio_queue_aio_attach_host_notifier() */
-+ if (!virtio_queue_get_notification(vq)) {
-+ virtio_queue_set_notification(vq, 1);
-+ }
-+
- aio_set_event_notifier(ctx, &vq->host_notifier,
- virtio_queue_host_notifier_read,
- NULL, NULL);
-+
-+ /*
-+ * See virtio_queue_aio_attach_host_notifier().
-+ * Note that this may be unnecessary for the type of virtqueues this
-+ * function is used for. Still, it will not hurt to have a quick look into
-+ * whether we can/should process any of the virtqueue elements.
-+ */
-+ event_notifier_set(&vq->host_notifier);
- }
-
- void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx)
- {
- aio_set_event_notifier(ctx, &vq->host_notifier, NULL, NULL, NULL);
-+
-+ /*
-+ * aio_set_event_notifier_poll() does not guarantee whether io_poll_end()
-+ * will run after io_poll_begin(), so by removing the notifier, we do not
-+ * know whether virtio_queue_host_notifier_aio_poll_end() has run after a
-+ * previous virtio_queue_host_notifier_aio_poll_begin(), i.e. whether
-+ * notifications are enabled or disabled. It does not really matter anyway;
-+ * we just removed the notifier, so we do not care about notifications until
-+ * we potentially re-attach it. The attach_host_notifier functions will
-+ * ensure that notifications are enabled again when they are needed.
-+ */
- }
-
- void virtio_queue_host_notifier_read(EventNotifier *n)
-diff --git a/include/block/aio.h b/include/block/aio.h
-index 32042e8905..79efadfa48 100644
---- a/include/block/aio.h
-+++ b/include/block/aio.h
-@@ -498,9 +498,14 @@ void aio_set_event_notifier(AioContext *ctx,
- AioPollFn *io_poll,
- EventNotifierHandler *io_poll_ready);
-
--/* Set polling begin/end callbacks for an event notifier that has already been
-+/*
-+ * Set polling begin/end callbacks for an event notifier that has already been
- * registered with aio_set_event_notifier. Do nothing if the event notifier is
- * not registered.
-+ *
-+ * Note that if the io_poll_end() callback (or the entire notifier) is removed
-+ * during polling, it will not be called, so an io_poll_begin() is not
-+ * necessarily always followed by an io_poll_end().
- */
- void aio_set_event_notifier_poll(AioContext *ctx,
- EventNotifier *notifier,
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Mon, 18 Dec 2023 11:13:40 +0100
-Subject: [PATCH] qemu_init: increase NOFILE soft limit on POSIX
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-In many configurations, e.g. multiple vNICs with multiple queues or
-with many Ceph OSDs, the default soft limit of 1024 is not enough.
-QEMU is supposed to work fine with file descriptors >= 1024 and does
-not use select() on POSIX. Bump the soft limit to the allowed hard
-limit to avoid issues with the aforementioned configurations.
-
-Of course the limit could be raised from the outside, but the man page
-of systemd.exec states about 'LimitNOFILE=':
-
-> Don't use.
-> [...]
-> Typically applications should increase their soft limit to the hard
-> limit on their own, if they are OK with working with file
-> descriptors above 1023,
-
-If the soft limit is already the same as the hard limit, avoid the
-superfluous setrlimit call. This can avoid a warning with a strict
-seccomp filter blocking setrlimit if NOFILE was already raised before
-executing QEMU.
-
-Buglink: https://bugzilla.proxmox.com/show_bug.cgi?id=4507
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
----
- include/sysemu/os-posix.h | 1 +
- include/sysemu/os-win32.h | 5 +++++
- os-posix.c | 22 ++++++++++++++++++++++
- softmmu/vl.c | 2 ++
- 4 files changed, 30 insertions(+)
-
-diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
-index 1030d39904..edc415aff5 100644
---- a/include/sysemu/os-posix.h
-+++ b/include/sysemu/os-posix.h
-@@ -48,6 +48,7 @@ void os_setup_early_signal_handling(void);
- void os_set_proc_name(const char *s);
- void os_setup_signal_handling(void);
- void os_daemonize(void);
-+void os_setup_limits(void);
- void os_setup_post(void);
- int os_mlock(void);
-
-diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
-index 91aa0d7ec0..f6e23fe01e 100644
---- a/include/sysemu/os-win32.h
-+++ b/include/sysemu/os-win32.h
-@@ -129,6 +129,11 @@ static inline int os_mlock(void)
- return -ENOSYS;
- }
-
-+void os_setup_limits(void)
-+{
-+ return;
-+}
-+
- #define fsync _commit
-
- #if !defined(lseek)
-diff --git a/os-posix.c b/os-posix.c
-index cfcb96533c..0cc1d991b1 100644
---- a/os-posix.c
-+++ b/os-posix.c
-@@ -24,6 +24,7 @@
- */
-
- #include "qemu/osdep.h"
-+#include <sys/resource.h>
- #include <sys/wait.h>
- #include <pwd.h>
- #include <grp.h>
-@@ -286,6 +287,27 @@ void os_daemonize(void)
- }
- }
-
-+void os_setup_limits(void)
-+{
-+ struct rlimit nofile;
-+
-+ if (getrlimit(RLIMIT_NOFILE, &nofile) < 0) {
-+ warn_report("unable to query NOFILE limit: %s", strerror(errno));
-+ return;
-+ }
-+
-+ if (nofile.rlim_cur == nofile.rlim_max) {
-+ return;
-+ }
-+
-+ nofile.rlim_cur = nofile.rlim_max;
-+
-+ if (setrlimit(RLIMIT_NOFILE, &nofile) < 0) {
-+ warn_report("unable to set NOFILE limit: %s", strerror(errno));
-+ return;
-+ }
-+}
-+
- void os_setup_post(void)
- {
- int fd = 0;
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index c9e9ede237..ba6ad8a8df 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -2713,6 +2713,8 @@ void qemu_init(int argc, char **argv)
- error_init(argv[0]);
- qemu_init_exec_dir(argv[0]);
-
-+ os_setup_limits();
-+
- qemu_init_arch_modules();
-
- qemu_init_subsystems();
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Hajnoczi <stefanha@redhat.com>
-Date: Mon, 22 Jan 2024 12:26:25 -0500
-Subject: [PATCH] virtio-blk: avoid using ioeventfd state in irqfd conditional
-
-Requests that complete in an IOThread use irqfd to notify the guest
-while requests that complete in the main loop thread use the traditional
-qdev irq code path. The reason for this conditional is that the irq code
-path requires the BQL:
-
- if (s->ioeventfd_started && !s->ioeventfd_disabled) {
- virtio_notify_irqfd(vdev, req->vq);
- } else {
- virtio_notify(vdev, req->vq);
- }
-
-There is a corner case where the conditional invokes the irq code path
-instead of the irqfd code path:
-
- static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev)
- {
- ...
- /*
- * Set ->ioeventfd_started to false before draining so that host notifiers
- * are not detached/attached anymore.
- */
- s->ioeventfd_started = false;
-
- /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
- blk_drain(s->conf.conf.blk);
-
-During blk_drain() the conditional produces the wrong result because
-ioeventfd_started is false.
-
-Use qemu_in_iothread() instead of checking the ioeventfd state.
-
-Cc: qemu-stable@nongnu.org
-Buglink: https://issues.redhat.com/browse/RHEL-15394
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-ID: <20240122172625.415386-1-stefanha@redhat.com>
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-[FE: backport: dataplane -> ioeventfd rework didn't happen yet]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/block/virtio-blk.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
-index 39e7f23fab..61bd1f6859 100644
---- a/hw/block/virtio-blk.c
-+++ b/hw/block/virtio-blk.c
-@@ -64,7 +64,7 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
- iov_discard_undo(&req->inhdr_undo);
- iov_discard_undo(&req->outhdr_undo);
- virtqueue_push(req->vq, &req->elem, req->in_len);
-- if (s->dataplane_started && !s->dataplane_disabled) {
-+ if (qemu_in_iothread()) {
- virtio_blk_data_plane_notify(s->dataplane, req->vq);
- } else {
- virtio_notify(vdev, req->vq);
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
-index 7f540b03ed..ca551baa42 100644
+index 35684f7e21..43bc0bd520 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -563,7 +563,7 @@ static QemuOptsList raw_runtime_opts = {
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/net/net.h b/include/net/net.h
-index 685ec58318..22edf4ee96 100644
+index b1f9b35fcc..096c0d52e4 100644
--- a/include/net/net.h
+++ b/include/net/net.h
-@@ -260,8 +260,8 @@ void netdev_add(QemuOpts *opts, Error **errp);
+@@ -317,8 +317,8 @@ void netdev_add(QemuOpts *opts, Error **errp);
int net_hub_id_for_client(NetClientState *nc, int *id);
NetClientState *net_hub_port_find(int hub_id);
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
-index 0893b794e9..6d650a58b9 100644
+index 6b05738079..d82869900a 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
-@@ -2243,9 +2243,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
+@@ -2291,9 +2291,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
#define CPU_RESOLVING_TYPE TYPE_X86_CPU
#ifdef TARGET_X86_64
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/ui/spice-core.c b/ui/spice-core.c
-index 52a59386d7..b20c25aee0 100644
+index 15be640286..ea20e6153c 100644
--- a/ui/spice-core.c
+++ b/ui/spice-core.c
-@@ -691,32 +691,35 @@ static void qemu_spice_init(void)
+@@ -690,32 +690,35 @@ static void qemu_spice_init(void)
if (tls_port) {
x509_dir = qemu_opt_get(opts, "x509-dir");
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/block/gluster.c b/block/gluster.c
-index ad5fadbe79..d0011085c4 100644
+index cc74af06dc..3ba9bbfa5e 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -43,7 +43,7 @@
1 file changed, 2 insertions(+)
diff --git a/block/rbd.c b/block/rbd.c
-index 978671411e..a4749f3b1b 100644
+index 84bb2fa5d7..63f60d41be 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -963,6 +963,8 @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/block/gluster.c b/block/gluster.c
-index d0011085c4..2df3d6e35d 100644
+index 3ba9bbfa5e..34936eb855 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -58,6 +58,7 @@ typedef struct GlusterAIOCB {
}
aio_co_schedule(acb->aio_context, acb->coroutine);
-@@ -1021,6 +1024,7 @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
+@@ -1023,6 +1026,7 @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
acb.ret = 0;
acb.coroutine = qemu_coroutine_self();
acb.aio_context = bdrv_get_aio_context(bs);
ret = glfs_zerofill_async(s->fd, offset, bytes, gluster_finish_aiocb, &acb);
if (ret < 0) {
-@@ -1201,9 +1205,11 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
+@@ -1203,9 +1207,11 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
acb.aio_context = bdrv_get_aio_context(bs);
if (write) {
ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
gluster_finish_aiocb, &acb);
}
-@@ -1266,6 +1272,7 @@ static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
+@@ -1268,6 +1274,7 @@ static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
acb.ret = 0;
acb.coroutine = qemu_coroutine_self();
acb.aio_context = bdrv_get_aio_context(bs);
ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
if (ret < 0) {
-@@ -1314,6 +1321,7 @@ static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
+@@ -1316,6 +1323,7 @@ static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
acb.ret = 0;
acb.coroutine = qemu_coroutine_self();
acb.aio_context = bdrv_get_aio_context(bs);
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/qemu-img.c b/qemu-img.c
-index 78433f3746..25d427edd1 100644
+index 7668f86769..2575e97b43 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -3062,7 +3062,8 @@ static int img_info(int argc, char **argv)
+@@ -3075,7 +3075,8 @@ static int img_info(int argc, char **argv)
list = collect_image_info_list(image_opts, filename, fmt, chain,
force_share);
if (!list) {
2 files changed, 133 insertions(+), 73 deletions(-)
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index 1b1dab5b17..d1616c045a 100644
+index c9dd70a892..048788b23d 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
ERST
DEF("dd", img_dd,
DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
-index 25d427edd1..220e6ec577 100644
+index 2575e97b43..8ec68b346f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -4899,10 +4899,12 @@ static int img_bitmap(int argc, char **argv)
+@@ -4993,10 +4993,12 @@ static int img_bitmap(int argc, char **argv)
#define C_IF 04
#define C_OF 010
#define C_SKIP 020
};
struct DdIo {
-@@ -4978,6 +4980,19 @@ static int img_dd_skip(const char *arg,
+@@ -5072,6 +5074,19 @@ static int img_dd_skip(const char *arg,
return 0;
}
static int img_dd(int argc, char **argv)
{
int ret = 0;
-@@ -5018,6 +5033,7 @@ static int img_dd(int argc, char **argv)
+@@ -5112,6 +5127,7 @@ static int img_dd(int argc, char **argv)
{ "if", img_dd_if, C_IF },
{ "of", img_dd_of, C_OF },
{ "skip", img_dd_skip, C_SKIP },
{ NULL, NULL, 0 }
};
const struct option long_options[] = {
-@@ -5093,91 +5109,112 @@ static int img_dd(int argc, char **argv)
+@@ -5187,91 +5203,112 @@ static int img_dd(int argc, char **argv)
arg = NULL;
}
}
if (dd.flags & C_SKIP && (in.offset > INT64_MAX / in.bsz ||
-@@ -5194,20 +5231,43 @@ static int img_dd(int argc, char **argv)
+@@ -5288,20 +5325,43 @@ static int img_dd(int argc, char **argv)
in.buf = g_new(uint8_t, in.bsz);
for (out_pos = 0; in_pos < size; ) {
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/qemu-img.c b/qemu-img.c
-index 220e6ec577..58bf9b43d1 100644
+index 8ec68b346f..b98184bba1 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -4900,11 +4900,13 @@ static int img_bitmap(int argc, char **argv)
+@@ -4994,11 +4994,13 @@ static int img_bitmap(int argc, char **argv)
#define C_OF 010
#define C_SKIP 020
#define C_OSIZE 040
};
struct DdIo {
-@@ -4993,6 +4995,19 @@ static int img_dd_osize(const char *arg,
+@@ -5087,6 +5089,19 @@ static int img_dd_osize(const char *arg,
return 0;
}
static int img_dd(int argc, char **argv)
{
int ret = 0;
-@@ -5007,12 +5022,14 @@ static int img_dd(int argc, char **argv)
+@@ -5101,12 +5116,14 @@ static int img_dd(int argc, char **argv)
int c, i;
const char *out_fmt = "raw";
const char *fmt = NULL;
};
struct DdIo in = {
.bsz = 512, /* Block size is by default 512 bytes */
-@@ -5034,6 +5051,7 @@ static int img_dd(int argc, char **argv)
+@@ -5128,6 +5145,7 @@ static int img_dd(int argc, char **argv)
{ "of", img_dd_of, C_OF },
{ "skip", img_dd_skip, C_SKIP },
{ "osize", img_dd_osize, C_OSIZE },
{ NULL, NULL, 0 }
};
const struct option long_options[] = {
-@@ -5230,9 +5248,10 @@ static int img_dd(int argc, char **argv)
+@@ -5324,9 +5342,10 @@ static int img_dd(int argc, char **argv)
in.buf = g_new(uint8_t, in.bsz);
if (blk1) {
in_ret = blk_pread(blk1, in_pos, bytes, in.buf, 0);
if (in_ret == 0) {
-@@ -5241,6 +5260,9 @@ static int img_dd(int argc, char **argv)
+@@ -5335,6 +5354,9 @@ static int img_dd(int argc, char **argv)
} else {
in_ret = read(STDIN_FILENO, in.buf, bytes);
if (in_ret == 0) {
3 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
-index 15aeddc6d8..5e713e231d 100644
+index 3653adb963..d83e8fb3c0 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
-@@ -208,6 +208,10 @@ Parameters to convert subcommand:
+@@ -212,6 +212,10 @@ Parameters to convert subcommand:
Parameters to dd subcommand:
.. program:: qemu-img-dd
.. option:: bs=BLOCK_SIZE
-@@ -488,7 +492,7 @@ Command description:
+@@ -492,7 +496,7 @@ Command description:
it doesn't need to be specified separately in this case.
dd copies from *INPUT* file to *OUTPUT* file converting it from
*FMT* format to *OUTPUT_FMT* format.
-@@ -499,6 +503,11 @@ Command description:
+@@ -503,6 +507,11 @@ Command description:
The size syntax is similar to :manpage:`dd(1)`'s size syntax.
Give information about the disk image *FILENAME*. Use it in
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index d1616c045a..b5b0bb4467 100644
+index 048788b23d..0b29a67a06 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
ERST
DEF("dd", img_dd,
DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
-index 58bf9b43d1..9d414d639b 100644
+index b98184bba1..6fc8384f64 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -5024,7 +5024,7 @@ static int img_dd(int argc, char **argv)
+@@ -5118,7 +5118,7 @@ static int img_dd(int argc, char **argv)
const char *fmt = NULL;
int64_t size = 0, readsize = 0;
int64_t out_pos, in_pos;
struct DdInfo dd = {
.flags = 0,
.count = 0,
-@@ -5062,7 +5062,7 @@ static int img_dd(int argc, char **argv)
+@@ -5156,7 +5156,7 @@ static int img_dd(int argc, char **argv)
{ 0, 0, 0, 0 }
};
if (c == EOF) {
break;
}
-@@ -5082,6 +5082,9 @@ static int img_dd(int argc, char **argv)
+@@ -5176,6 +5176,9 @@ static int img_dd(int argc, char **argv)
case 'h':
help();
break;
case 'U':
force_share = true;
break;
-@@ -5212,13 +5215,15 @@ static int img_dd(int argc, char **argv)
+@@ -5306,13 +5309,15 @@ static int img_dd(int argc, char **argv)
size - in.bsz * in.offset, &error_abort);
}
3 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
-index 5e713e231d..9390d5e5cf 100644
+index d83e8fb3c0..61c6b21859 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
-@@ -492,10 +492,10 @@ Command description:
+@@ -496,10 +496,10 @@ Command description:
it doesn't need to be specified separately in this case.
The data is by default read and written using blocks of 512 bytes but can be
modified by specifying *BLOCK_SIZE*. If count=\ *BLOCKS* is specified
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index b5b0bb4467..36f97e1f19 100644
+index 0b29a67a06..758f397232 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
ERST
DEF("dd", img_dd,
DEF("info", img_info,
diff --git a/qemu-img.c b/qemu-img.c
-index 9d414d639b..e13a12137b 100644
+index 6fc8384f64..a6c88e0860 100644
--- a/qemu-img.c
+++ b/qemu-img.c
-@@ -5016,6 +5016,7 @@ static int img_dd(int argc, char **argv)
+@@ -5110,6 +5110,7 @@ static int img_dd(int argc, char **argv)
BlockDriver *drv = NULL, *proto_drv = NULL;
BlockBackend *blk1 = NULL, *blk2 = NULL;
QemuOpts *opts = NULL;
QemuOptsList *create_opts = NULL;
Error *local_err = NULL;
bool image_opts = false;
-@@ -5025,6 +5026,7 @@ static int img_dd(int argc, char **argv)
+@@ -5119,6 +5120,7 @@ static int img_dd(int argc, char **argv)
int64_t size = 0, readsize = 0;
int64_t out_pos, in_pos;
bool force_share = false, skip_create = false;
struct DdInfo dd = {
.flags = 0,
.count = 0,
-@@ -5062,7 +5064,7 @@ static int img_dd(int argc, char **argv)
+@@ -5156,7 +5158,7 @@ static int img_dd(int argc, char **argv)
{ 0, 0, 0, 0 }
};
if (c == EOF) {
break;
}
-@@ -5085,6 +5087,19 @@ static int img_dd(int argc, char **argv)
+@@ -5179,6 +5181,19 @@ static int img_dd(int argc, char **argv)
case 'n':
skip_create = true;
break;
case 'U':
force_share = true;
break;
-@@ -5144,11 +5159,24 @@ static int img_dd(int argc, char **argv)
+@@ -5238,11 +5253,24 @@ static int img_dd(int argc, char **argv)
if (dd.flags & C_IF) {
blk1 = img_open(image_opts, in.filename, fmt, 0, false, false,
force_share);
}
if (dd.flags & C_OSIZE) {
-@@ -5303,6 +5331,7 @@ static int img_dd(int argc, char **argv)
+@@ -5397,6 +5425,7 @@ static int img_dd(int argc, char **argv)
out:
g_free(arg);
qemu_opts_del(opts);
4 files changed, 82 insertions(+), 4 deletions(-)
diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c
-index c3e55ef9e9..0e32e6201f 100644
+index a6ff6a4875..e7f74d1c63 100644
--- a/hw/core/machine-hmp-cmds.c
+++ b/hw/core/machine-hmp-cmds.c
-@@ -169,7 +169,35 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict)
+@@ -175,7 +175,35 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict)
return;
}
qapi_free_BalloonInfo(info);
}
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
-index d004cf29d2..2660ed520b 100644
+index 609e39a821..8cb6dfcac3 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
-@@ -782,8 +782,37 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
+@@ -781,8 +781,37 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
{
VirtIOBalloon *dev = opaque;
static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
diff --git a/qapi/machine.json b/qapi/machine.json
-index a08b6576ca..5c9a4d55f4 100644
+index e8b60641f2..2054cdc70d 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
-@@ -1063,9 +1063,29 @@
+@@ -1079,9 +1079,29 @@
# @actual: the logical size of the VM in bytes Formula used:
# logical_vm_size = vm_ram_size - balloon_size
#
##
# @query-balloon:
diff --git a/qapi/pragma.json b/qapi/pragma.json
-index 7f810b0e97..325e684411 100644
+index 59fbe74b8c..be8fa304c5 100644
--- a/qapi/pragma.json
+++ b/qapi/pragma.json
-@@ -35,6 +35,7 @@
+@@ -90,6 +90,7 @@
'member-name-exceptions': [ # visible in:
'ACPISlotType', # query-acpi-ospm-status
'AcpiTableOptions', # -acpitable
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
-index 3860a50c3b..40821e2317 100644
+index 4b72009cd3..314351cdff 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
-@@ -91,6 +91,12 @@ MachineInfoList *qmp_query_machines(Error **errp)
+@@ -90,6 +90,12 @@ MachineInfoList *qmp_query_machines(Error **errp)
info->numa_mem_supported = mc->numa_mem_supported;
info->deprecated = !!mc->deprecation_reason;
info->acpi = !!object_class_property_find(OBJECT_CLASS(mc), "acpi");
info->default_cpu_type = g_strdup(mc->default_cpu_type);
}
diff --git a/qapi/machine.json b/qapi/machine.json
-index 5c9a4d55f4..fbb61f18e4 100644
+index 2054cdc70d..a024d5b05d 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
-@@ -139,6 +139,8 @@
+@@ -146,6 +146,8 @@
#
# @is-default: whether the machine is default
#
# @cpu-max: maximum number of CPUs supported by the machine type
# (since 1.5)
#
-@@ -163,7 +165,7 @@
+@@ -170,7 +172,7 @@
##
{ 'struct': 'MachineInfo',
'data': { 'name': 'str', '*alias': 'str',
2 files changed, 7 insertions(+)
diff --git a/qapi/ui.json b/qapi/ui.json
-index 006616aa77..dfd1d3e36b 100644
+index f610bce118..6ea26a9acb 100644
--- a/qapi/ui.json
+++ b/qapi/ui.json
-@@ -317,11 +317,14 @@
+@@ -314,11 +314,14 @@
#
# @channels: a list of @SpiceChannel for each active spice channel
#
'if': 'CONFIG_SPICE' }
diff --git a/ui/spice-core.c b/ui/spice-core.c
-index b20c25aee0..26baeb7846 100644
+index ea20e6153c..55a15fba8b 100644
--- a/ui/spice-core.c
+++ b/ui/spice-core.c
@@ -548,6 +548,10 @@ static SpiceInfo *qmp_query_spice_real(Error **errp)
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
- migration/channel-savevm-async.c | 183 +++++++++++++++++++++++++++++++
+ migration/channel-savevm-async.c | 184 +++++++++++++++++++++++++++++++
migration/channel-savevm-async.h | 51 +++++++++
migration/meson.build | 1 +
- 3 files changed, 235 insertions(+)
+ 3 files changed, 236 insertions(+)
create mode 100644 migration/channel-savevm-async.c
create mode 100644 migration/channel-savevm-async.h
diff --git a/migration/channel-savevm-async.c b/migration/channel-savevm-async.c
new file mode 100644
-index 0000000000..aab081ce07
+index 0000000000..081a192f49
--- /dev/null
+++ b/migration/channel-savevm-async.c
-@@ -0,0 +1,183 @@
+@@ -0,0 +1,184 @@
+/*
+ * QIO Channel implementation to be used by savevm-async QMP calls
+ */
+
+static void
+qio_channel_savevm_async_set_aio_fd_handler(QIOChannel *ioc,
-+ AioContext *ctx,
++ AioContext *read_ctx,
+ IOHandler *io_read,
++ AioContext *write_ctx,
+ IOHandler *io_write,
+ void *opaque)
+{
+
+#endif /* QIO_CHANNEL_SAVEVM_ASYNC_H */
diff --git a/migration/meson.build b/migration/meson.build
-index 1ae28523a1..37ddcb5d60 100644
+index 1eeb915ff6..95d1cf2250 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -13,6 +13,7 @@ system_ss.add(files(
[FE: further improve aborting
adapt to removal of QEMUFileOps
improve condition for entering final stage
- adapt to QAPI and other changes for 8.0]
+ adapt to QAPI and other changes for 8.2]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
hmp-commands-info.hx | 13 +
migration/savevm-async.c | 531 +++++++++++++++++++++++++++++++++++
monitor/hmp-cmds.c | 38 +++
qapi/migration.json | 34 +++
- qapi/misc.json | 16 ++
+ qapi/misc.json | 18 ++
qemu-options.hx | 12 +
- softmmu/vl.c | 10 +
- 11 files changed, 677 insertions(+)
+ system/vl.c | 10 +
+ 11 files changed, 679 insertions(+)
create mode 100644 migration/savevm-async.c
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
-index f5b37eb74a..10fdd822e0 100644
+index ad1b1306e3..d5ab880492 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -525,6 +525,19 @@ SRST
.name = "balloon",
.args_type = "",
diff --git a/hmp-commands.hx b/hmp-commands.hx
-index 2cbd0f77a0..e352f86872 100644
+index 2e2a3bcf98..7506de251c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
-@@ -1865,3 +1865,20 @@ SRST
+@@ -1862,3 +1862,20 @@ SRST
List event channels in the guest
ERST
#endif
+ .coroutine = true,
+ },
diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h
-index e72083b117..c846d37806 100644
+index 9e4dcaaa75..2581730d74 100644
--- a/include/migration/snapshot.h
+++ b/include/migration/snapshot.h
-@@ -61,4 +61,6 @@ bool delete_snapshot(const char *name,
- bool has_devices, strList *devices,
- Error **errp);
+@@ -68,4 +68,6 @@ bool delete_snapshot(const char *name,
+ */
+ void load_snapshot_resume(RunState state);
+int load_snapshot_from_blockdev(const char *filename, Error **errp);
+
void coroutine_fn hmp_screendump(Monitor *mon, const QDict *qdict);
void hmp_chardev_add(Monitor *mon, const QDict *qdict);
diff --git a/migration/meson.build b/migration/meson.build
-index 37ddcb5d60..07f6057acc 100644
+index 95d1cf2250..800f12a60d 100644
--- a/migration/meson.build
+++ b/migration/meson.build
-@@ -26,6 +26,7 @@ system_ss.add(files(
+@@ -28,6 +28,7 @@ system_ss.add(files(
'options.c',
'postcopy-ram.c',
'savevm.c',
'threadinfo.c',
diff --git a/migration/savevm-async.c b/migration/savevm-async.c
new file mode 100644
-index 0000000000..e9fc18fb10
+index 0000000000..779e4e2a78
--- /dev/null
+++ b/migration/savevm-async.c
@@ -0,0 +1,531 @@
+static void process_savevm_finalize(void *opaque)
+{
+ int ret;
-+ AioContext *iohandler_ctx = iohandler_get_aio_context();
+ MigrationState *ms = migrate_get_current();
+
+ bool aborted = savevm_aborted();
+ * so move it back. It can stay in the main context and live out its live
+ * there, since we're done with it after this method ends anyway.
+ */
-+ aio_context_acquire(iohandler_ctx);
+ blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
-+ aio_context_release(iohandler_ctx);
+
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret < 0) {
+ * lock. Similar to what is done in migration.c, call the exact variant
+ * only once pend_precopy in the estimate is below the threshold.
+ */
-+ qemu_mutex_unlock_iothread();
++ bql_unlock();
+ qemu_savevm_state_pending_estimate(&pend_precopy, &pend_postcopy);
+ if (pend_precopy <= threshold) {
+ qemu_savevm_state_pending_exact(&pend_precopy, &pend_postcopy);
+ }
-+ qemu_mutex_lock_iothread();
++ bql_lock();
+ pending_size = pend_precopy + pend_postcopy;
+
+ /*
+ * so move there now and after every flush.
+ */
+ aio_co_reschedule_self(qemu_get_aio_context());
-+ for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
++ bdrv_graph_co_rdlock();
++ bs = bdrv_first(&it);
++ bdrv_graph_co_rdunlock();
++ while (bs) {
+ /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
-+ if (bs == blk_bs(snap_state.target)) {
-+ continue;
-+ }
-+
-+ AioContext *bs_ctx = bdrv_get_aio_context(bs);
-+ if (bs_ctx != qemu_get_aio_context()) {
-+ DPRINTF("savevm: async flushing drive %s\n", bs->filename);
-+ aio_co_reschedule_self(bs_ctx);
-+ bdrv_graph_co_rdlock();
-+ bdrv_flush(bs);
-+ bdrv_graph_co_rdunlock();
-+ aio_co_reschedule_self(qemu_get_aio_context());
++ if (bs != blk_bs(snap_state.target)) {
++ AioContext *bs_ctx = bdrv_get_aio_context(bs);
++ if (bs_ctx != qemu_get_aio_context()) {
++ DPRINTF("savevm: async flushing drive %s\n", bs->filename);
++ aio_co_reschedule_self(bs_ctx);
++ bdrv_graph_co_rdlock();
++ bdrv_flush(bs);
++ bdrv_graph_co_rdunlock();
++ aio_co_reschedule_self(qemu_get_aio_context());
++ }
+ }
++ bdrv_graph_co_rdlock();
++ bs = bdrv_next(&it);
++ bdrv_graph_co_rdunlock();
+ }
+
+ DPRINTF("timing: async flushing took %ld ms\n",
+ return;
+ }
+
-+ if (migration_is_running(ms->state)) {
++ if (migration_is_running()) {
+ error_set(errp, ERROR_CLASS_GENERIC_ERROR, QERR_MIGRATION_ACTIVE);
+ return;
+ }
+ * State is cleared in process_savevm_co, but has to be initialized
+ * here (blocking main thread, from QMP) to avoid race conditions.
+ */
-+ migrate_init(ms);
++ if (migrate_init(ms, errp)) {
++ return;
++ }
+ memset(&mig_stats, 0, sizeof(mig_stats));
-+ memset(&compression_counters, 0, sizeof(compression_counters));
+ ms->to_dst_file = snap_state.file;
+
+ error_setg(&snap_state.blocker, "block device is in use by savevm");
+ snap_state.state = SAVE_STATE_ACTIVE;
+ snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
+ snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
-+ qemu_mutex_unlock_iothread();
+ qemu_savevm_state_header(snap_state.file);
+ qemu_savevm_state_setup(snap_state.file);
-+ qemu_mutex_lock_iothread();
+
+ /* Async processing from here on out happens in iohandler context, so let
+ * the target bdrv have its home there.
+ return ret;
+}
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
-index 6c559b48c8..91be698308 100644
+index 871898ac46..ef4634e5c1 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -22,6 +22,7 @@
+#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-commands-misc.h"
#include "qapi/qmp/qdict.h"
- #include "qapi/qmp/qerror.h"
+ #include "qemu/cutils.h"
@@ -443,3 +444,40 @@ void hmp_info_mtree(Monitor *mon, const QDict *qdict)
mtree_info(flatview, dispatch_tree, owner, disabled);
+ }
+}
diff --git a/qapi/migration.json b/qapi/migration.json
-index 8843e74b59..aca0ca1ac1 100644
+index 8c65b90328..ed20d066cd 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
-@@ -291,6 +291,40 @@
+@@ -297,6 +297,40 @@
'*dirty-limit-throttle-time-per-round': 'uint64',
'*dirty-limit-ring-full-time': 'uint64'} }
# @query-migrate:
#
diff --git a/qapi/misc.json b/qapi/misc.json
-index cda2effa81..94a58bb0bf 100644
+index ec30e5c570..7147199a12 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
-@@ -456,6 +456,22 @@
+@@ -454,6 +454,24 @@
##
{ 'command': 'query-fdsets', 'returns': ['FdsetInfo'] }
+#
+# Prepare for snapshot and halt VM. Save VM state to statefile.
+#
++# @statefile: target file that state should be written to.
++#
+##
+{ 'command': 'savevm-start', 'data': { '*statefile': 'str' } }
+
# @CommandLineParameterType:
#
diff --git a/qemu-options.hx b/qemu-options.hx
-index 8073f5edf5..dc1ececc9c 100644
+index 8ce85d4559..511ab9415e 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
-@@ -4483,6 +4483,18 @@ SRST
+@@ -4610,6 +4610,18 @@ SRST
Start right away with a saved state (``loadvm`` in monitor)
ERST
#ifndef _WIN32
DEF("daemonize", 0, QEMU_OPTION_daemonize, \
"-daemonize daemonize QEMU after initializing\n", QEMU_ARCH_ALL)
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index ba6ad8a8df..ddeace306e 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -164,6 +164,7 @@ static const char *accelerators;
+diff --git a/system/vl.c b/system/vl.c
+index c644222982..2738ab7c91 100644
+--- a/system/vl.c
++++ b/system/vl.c
+@@ -163,6 +163,7 @@ static const char *accelerators;
static bool have_custom_ram_size;
static const char *ram_memdev_id;
static QDict *machine_opts_dict;
static QTAILQ_HEAD(, ObjectOption) object_opts = QTAILQ_HEAD_INITIALIZER(object_opts);
static QTAILQ_HEAD(, DeviceOption) device_opts = QTAILQ_HEAD_INITIALIZER(device_opts);
static int display_remote;
-@@ -2647,6 +2648,12 @@ void qmp_x_exit_preconfig(Error **errp)
-
- if (loadvm) {
+@@ -2712,6 +2713,12 @@ void qmp_x_exit_preconfig(Error **errp)
+ RunState state = autostart ? RUN_STATE_RUNNING : runstate_get();
load_snapshot(loadvm, NULL, false, NULL, &error_fatal);
+ load_snapshot_resume(state);
+ } else if (loadstate) {
+ Error *local_err = NULL;
+ if (load_snapshot_from_blockdev(loadstate, &local_err) < 0) {
}
if (replay_mode != REPLAY_MODE_NONE) {
replay_vmstate_init();
-@@ -3196,6 +3203,9 @@ void qemu_init(int argc, char **argv)
+@@ -3259,6 +3266,9 @@ void qemu_init(int argc, char **argv)
case QEMU_OPTION_loadvm:
loadvm = optarg;
break;
[FE: adapt to removal of QEMUFileOps]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
- migration/qemu-file.c | 49 +++++++++++++++++++++++++++-------------
+ migration/qemu-file.c | 50 +++++++++++++++++++++++++++-------------
migration/qemu-file.h | 2 ++
migration/savevm-async.c | 5 ++--
- 3 files changed, 38 insertions(+), 18 deletions(-)
+ 3 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
-index 19c33c9985..e9ffff0f0a 100644
+index a10882d47f..19c1de0472 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
-@@ -33,8 +33,8 @@
- #include "options.h"
- #include "qapi/error.h"
+@@ -35,8 +35,8 @@
+ #include "rdma.h"
+ #include "io/channel-file.h"
-#define IO_BUF_SIZE 32768
-#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64)
+#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 256)
struct QEMUFile {
- const QEMUFileHooks *hooks;
-@@ -46,7 +46,8 @@ struct QEMUFile {
+ QIOChannel *ioc;
+@@ -44,7 +44,8 @@ struct QEMUFile {
int buf_index;
int buf_size; /* 0 when writing */
DECLARE_BITMAP(may_free, MAX_IOV_SIZE);
struct iovec iov[MAX_IOV_SIZE];
-@@ -100,7 +101,9 @@ int qemu_file_shutdown(QEMUFile *f)
+@@ -101,7 +102,9 @@ int qemu_file_shutdown(QEMUFile *f)
return 0;
}
{
QEMUFile *f;
-@@ -109,6 +112,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
+@@ -110,6 +113,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
object_ref(ioc);
f->ioc = ioc;
f->is_writable = is_writable;
return f;
}
-@@ -119,17 +124,27 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
+@@ -120,17 +125,27 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
*/
QEMUFile *qemu_file_get_return_path(QEMUFile *f)
{
+ return qemu_file_new_impl(ioc, false, buffer_size);
}
- void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks)
-@@ -375,7 +390,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
+ /*
+@@ -328,7 +343,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
do {
len = qio_channel_read(f->ioc,
(char *)f->buf + pending,
&local_error);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
-@@ -425,6 +440,8 @@ int qemu_fclose(QEMUFile *f)
+@@ -368,6 +383,9 @@ int qemu_fclose(QEMUFile *f)
+ ret = ret2;
}
g_clear_pointer(&f->ioc, object_unref);
-
++
+ free(f->buf);
+
- /* If any error was spotted before closing, we should report it
- * instead of the close() return value.
- */
-@@ -479,7 +496,7 @@ static void add_buf_to_iovec(QEMUFile *f, size_t len)
+ error_free(f->last_error_obj);
+ g_free(f);
+ trace_qemu_file_fclose();
+@@ -416,7 +434,7 @@ static void add_buf_to_iovec(QEMUFile *f, size_t len)
{
if (!add_to_iovec(f, f->buf + f->buf_index, len, false)) {
f->buf_index += len;
qemu_fflush(f);
}
}
-@@ -504,7 +521,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
+@@ -441,7 +459,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
}
while (size > 0) {
if (l > size) {
l = size;
}
-@@ -549,8 +566,8 @@ size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t si
+@@ -587,8 +605,8 @@ size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t si
size_t index;
assert(!qemu_file_is_writable(f));
/* The 1st byte to read from */
index = f->buf_index + offset;
-@@ -600,7 +617,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
+@@ -638,7 +656,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
size_t res;
uint8_t *src;
if (res == 0) {
return done;
}
-@@ -634,7 +651,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
+@@ -672,7 +690,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
*/
size_t coroutine_mixed_fn qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
{
size_t res;
uint8_t *src = NULL;
-@@ -659,7 +676,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset)
+@@ -697,7 +715,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset)
int index = f->buf_index + offset;
assert(!qemu_file_is_writable(f));
if (index >= f->buf_size) {
qemu_fill_buffer(f);
-@@ -777,7 +794,7 @@ static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
+@@ -811,7 +829,7 @@ static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
const uint8_t *p, size_t size)
{
if (blen < compressBound(size)) {
return -1;
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
-index 47015f5201..1312b7c903 100644
+index 32fd4a34fd..36a0cd8cc8 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
-@@ -63,7 +63,9 @@ typedef struct QEMUFileHooks {
- } QEMUFileHooks;
+@@ -30,7 +30,9 @@
+ #include "io/channel.h"
QEMUFile *qemu_file_new_input(QIOChannel *ioc);
+QEMUFile *qemu_file_new_input_sized(QIOChannel *ioc, size_t buffer_size);
QEMUFile *qemu_file_new_output(QIOChannel *ioc);
+QEMUFile *qemu_file_new_output_sized(QIOChannel *ioc, size_t buffer_size);
- void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks);
int qemu_fclose(QEMUFile *f);
+ /*
diff --git a/migration/savevm-async.c b/migration/savevm-async.c
-index e9fc18fb10..80624fada8 100644
+index 779e4e2a78..bf36fc06d2 100644
--- a/migration/savevm-async.c
+++ b/migration/savevm-async.c
-@@ -378,7 +378,7 @@ void qmp_savevm_start(const char *statefile, Error **errp)
+@@ -379,7 +379,7 @@ void qmp_savevm_start(const char *statefile, Error **errp)
QIOChannel *ioc = QIO_CHANNEL(qio_channel_savevm_async_new(snap_state.target,
&snap_state.bs_pos));
Subject: [PATCH] PVE: block: add the zeroinit block driver filter
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: adapt to changed function signatures]
+[FE: adapt to changed function signatures
+ adhere to block graph lock requirements]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
block/meson.build | 1 +
- block/zeroinit.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++
- 2 files changed, 201 insertions(+)
+ block/zeroinit.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 215 insertions(+)
create mode 100644 block/zeroinit.c
diff --git a/block/meson.build b/block/meson.build
-index 529fc172c6..1833c71ce9 100644
+index e1f03fd773..b530e117b5 100644
--- a/block/meson.build
+++ b/block/meson.build
-@@ -40,6 +40,7 @@ block_ss.add(files(
- 'throttle-groups.c',
+@@ -39,6 +39,7 @@ block_ss.add(files(
'throttle.c',
+ 'throttle-groups.c',
'write-threshold.c',
+ 'zeroinit.c',
), zstd, zlib, gnutls)
system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
diff --git a/block/zeroinit.c b/block/zeroinit.c
new file mode 100644
-index 0000000000..1257342724
+index 0000000000..696558d8d6
--- /dev/null
+++ b/block/zeroinit.c
-@@ -0,0 +1,200 @@
+@@ -0,0 +1,214 @@
+/*
+ * Filter to fake a zero-initialized block device.
+ *
+#include "qapi/error.h"
+#include "block/block_int.h"
+#include "block/block-io.h"
++#include "block/graph-lock.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+ Error **errp)
+{
+ BDRVZeroinitState *s = bs->opaque;
++ BdrvChild *file = NULL;
+ QemuOpts *opts;
+ Error *local_err = NULL;
+ int ret;
+ }
+
+ /* Open the raw file */
-+ bs->file = bdrv_open_child(qemu_opt_get(opts, "x-next"), options, "next",
-+ bs, &child_of_bds,
-+ BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
-+ false, &local_err);
++ file = bdrv_open_child(qemu_opt_get(opts, "x-next"), options, "next", bs,
++ &child_of_bds,
++ BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false,
++ &local_err);
++ bdrv_graph_wrlock();
++ bs->file = file;
++ bdrv_graph_wrunlock();
+ if (local_err) {
+ ret = -EINVAL;
+ error_propagate(errp, local_err);
+ ret = 0;
+fail:
+ if (ret < 0) {
++ bdrv_graph_wrlock();
+ bdrv_unref_child(bs, bs->file);
++ bdrv_graph_wrunlock();
+ }
+ qemu_opts_del(opts);
+ return ret;
+ (void)s;
+}
+
-+static coroutine_fn int64_t zeroinit_co_getlength(BlockDriverState *bs)
++static coroutine_fn int64_t GRAPH_RDLOCK
++zeroinit_co_getlength(BlockDriverState *bs)
+{
+ return bdrv_co_getlength(bs->file->bs);
+}
+
-+static int coroutine_fn zeroinit_co_preadv(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+ return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+}
+
-+static int coroutine_fn zeroinit_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
-+ int64_t bytes, BdrvRequestFlags flags)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ BdrvRequestFlags flags)
+{
+ BDRVZeroinitState *s = bs->opaque;
+ if (offset >= s->extents)
+ return bdrv_pwrite_zeroes(bs->file, offset, bytes, flags);
+}
+
-+static int coroutine_fn zeroinit_co_pwritev(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+ BDRVZeroinitState *s = bs->opaque;
+ int64_t extents = offset + bytes;
+ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+}
+
-+static coroutine_fn int zeroinit_co_flush(BlockDriverState *bs)
++static coroutine_fn int GRAPH_RDLOCK
++zeroinit_co_flush(BlockDriverState *bs)
+{
+ return bdrv_co_flush(bs->file->bs);
+}
+
-+static int zeroinit_has_zero_init(BlockDriverState *bs)
++static int GRAPH_RDLOCK
++zeroinit_has_zero_init(BlockDriverState *bs)
+{
+ BDRVZeroinitState *s = bs->opaque;
+ return s->has_zero_init;
+}
+
-+static int coroutine_fn zeroinit_co_pdiscard(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+ return bdrv_co_pdiscard(bs->file, offset, bytes);
+}
+
-+static int zeroinit_co_truncate(BlockDriverState *bs, int64_t offset,
-+ _Bool exact, PreallocMode prealloc,
-+ BdrvRequestFlags req_flags, Error **errp)
++static int GRAPH_RDLOCK
++zeroinit_co_truncate(BlockDriverState *bs, int64_t offset, _Bool exact,
++ PreallocMode prealloc, BdrvRequestFlags req_flags,
++ Error **errp)
+{
+ return bdrv_co_truncate(bs->file, offset, exact, prealloc, req_flags, errp);
+}
+
-+static coroutine_fn int zeroinit_co_get_info(BlockDriverState *bs,
-+ BlockDriverInfo *bdi)
++static coroutine_fn int GRAPH_RDLOCK
++zeroinit_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+ return bdrv_co_get_info(bs->file->bs, bdi);
+}
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
qemu-options.hx | 3 +++
- softmmu/vl.c | 8 ++++++++
+ system/vl.c | 8 ++++++++
2 files changed, 11 insertions(+)
diff --git a/qemu-options.hx b/qemu-options.hx
-index dc1ececc9c..848d2dfdd1 100644
+index 511ab9415e..92e301d545 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
-@@ -1197,6 +1197,9 @@ legacy PC, they are not recommended for modern configurations.
+@@ -1237,6 +1237,9 @@ legacy PC, they are not recommended for modern configurations.
ERST
DEF("fda", HAS_ARG, QEMU_OPTION_fda,
"-fda/-fdb file use 'file' as floppy disk 0/1 image\n", QEMU_ARCH_ALL)
DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "", QEMU_ARCH_ALL)
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index ddeace306e..3ee90b3b94 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -2683,6 +2683,7 @@ void qemu_init(int argc, char **argv)
+diff --git a/system/vl.c b/system/vl.c
+index 2738ab7c91..20ebf2c920 100644
+--- a/system/vl.c
++++ b/system/vl.c
+@@ -2748,6 +2748,7 @@ void qemu_init(int argc, char **argv)
MachineClass *machine_class;
bool userconfig = true;
FILE *vmstate_dump_file = NULL;
qemu_add_opts(&qemu_drive_opts);
qemu_add_drive_opts(&qemu_legacy_drive_opts);
-@@ -3308,6 +3309,13 @@ void qemu_init(int argc, char **argv)
+@@ -3371,6 +3372,13 @@ void qemu_init(int argc, char **argv)
machine_parse_property_opt(qemu_find_opts("smp-opts"),
"smp", optarg);
break;
+ exit(1);
+ }
+ break;
+ #ifdef CONFIG_VNC
case QEMU_OPTION_vnc:
vnc_parse(optarg);
- break;
1 file changed, 9 insertions(+)
diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
-index 4a34f03047..59b917e50c 100644
+index d8fc1e2815..789694b8b3 100644
--- a/hw/intc/apic_common.c
+++ b/hw/intc/apic_common.c
-@@ -252,6 +252,15 @@ static void apic_reset_common(DeviceState *dev)
+@@ -263,6 +263,15 @@ static void apic_reset_common(DeviceState *dev)
info->vapic_base_update(s);
apic_init_reset(dev);
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
block/file-posix.c | 59 ++++++++++++++++++++++++++++++--------------
- qapi/block-core.json | 3 ++-
- 2 files changed, 42 insertions(+), 20 deletions(-)
+ qapi/block-core.json | 7 +++++-
+ 2 files changed, 46 insertions(+), 20 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
-index ca551baa42..8b3b83e9d4 100644
+index 43bc0bd520..60e98c87f1 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
-@@ -2873,6 +2873,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2876,6 +2876,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
int fd;
uint64_t perm, shared;
int result = 0;
/* Validate options and set default values */
assert(options->driver == BLOCKDEV_DRIVER_FILE);
-@@ -2913,19 +2914,22 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2916,19 +2917,22 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
}
/* Clear the file by truncating it to 0 */
-@@ -2979,13 +2983,15 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2982,13 +2986,15 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
}
out_unlock:
}
out_close:
-@@ -3009,6 +3015,7 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3012,6 +3018,7 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
PreallocMode prealloc;
char *buf = NULL;
Error *local_err = NULL;
/* Skip file: protocol prefix */
strstart(filename, "file:", &filename);
-@@ -3031,6 +3038,18 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3034,6 +3041,18 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
return -EINVAL;
}
options = (BlockdevCreateOptions) {
.driver = BLOCKDEV_DRIVER_FILE,
.u.file = {
-@@ -3042,6 +3061,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3045,6 +3064,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
.nocow = nocow,
.has_extent_size_hint = has_extent_size_hint,
.extent_size_hint = extent_size_hint,
};
return raw_co_create(&options, errp);
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index a5cea82139..bb471c078d 100644
+index 45ab548dfe..f7c2b63c5d 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -4880,7 +4880,8 @@
+@@ -4956,6 +4956,10 @@
+ # @extent-size-hint: Extent size hint to add to the image file; 0 for
+ # not adding an extent size hint (default: 1 MB, since 5.1)
+ #
++# @locking: whether to enable file locking. If set to 'auto', only
++# enable when Open File Descriptor (OFD) locking API is available
++# (default: auto).
++#
+ # Since: 2.12
+ ##
+ { 'struct': 'BlockdevCreateOptionsFile',
+@@ -4963,7 +4967,8 @@
'size': 'size',
'*preallocation': 'PreallocMode',
'*nocow': 'bool',
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/hw/core/machine.c b/hw/core/machine.c
-index f0d35c6401..1427983543 100644
+index 37ede0e7d4..513e49bab1 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
-@@ -148,7 +148,8 @@ GlobalProperty hw_compat_4_0[] = {
+@@ -161,7 +161,8 @@ GlobalProperty hw_compat_4_0[] = {
{ "virtio-vga", "edid", "false" },
{ "virtio-gpu-device", "edid", "false" },
{ "virtio-device", "use-started", "false" },
hw/core/machine-qmp-cmds.c | 5 +++++
include/hw/boards.h | 2 ++
qapi/machine.json | 4 +++-
- softmmu/vl.c | 25 +++++++++++++++++++++++++
+ system/vl.c | 25 +++++++++++++++++++++++++
4 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
-index 40821e2317..ee93ddd69a 100644
+index 314351cdff..628a3537c5 100644
--- a/hw/core/machine-qmp-cmds.c
+++ b/hw/core/machine-qmp-cmds.c
-@@ -95,6 +95,11 @@ MachineInfoList *qmp_query_machines(Error **errp)
+@@ -94,6 +94,11 @@ MachineInfoList *qmp_query_machines(Error **errp)
if (strcmp(mc->name, MACHINE_GET_CLASS(current_machine)->name) == 0) {
info->has_is_current = true;
info->is_current = true;
if (mc->default_cpu_type) {
diff --git a/include/hw/boards.h b/include/hw/boards.h
-index ed83360198..f8b88cd86a 100644
+index 8b8f6d5c00..dd6d0a1447 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
-@@ -235,6 +235,8 @@ struct MachineClass {
+@@ -246,6 +246,8 @@ struct MachineClass {
const char *desc;
const char *deprecation_reason;
void (*reset)(MachineState *state, ShutdownCause reason);
void (*wakeup)(MachineState *state);
diff --git a/qapi/machine.json b/qapi/machine.json
-index fbb61f18e4..7da3c519ba 100644
+index a024d5b05d..1d69bffaa0 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
-@@ -161,6 +161,8 @@
+@@ -168,6 +168,8 @@
#
# @acpi: machine type supports ACPI (since 8.0)
#
# Since: 1.2
##
{ 'struct': 'MachineInfo',
-@@ -168,7 +170,7 @@
+@@ -175,7 +177,7 @@
'*is-default': 'bool', '*is-current': 'bool', 'cpu-max': 'int',
'hotpluggable-cpus': 'bool', 'numa-mem-supported': 'bool',
'deprecated': 'bool', '*default-cpu-type': 'str',
##
# @query-machines:
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index 3ee90b3b94..4b6d0b82fd 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -1597,6 +1597,7 @@ static const QEMUOption *lookup_opt(int argc, char **argv,
+diff --git a/system/vl.c b/system/vl.c
+index 20ebf2c920..4d39e32097 100644
+--- a/system/vl.c
++++ b/system/vl.c
+@@ -1659,6 +1659,7 @@ static const QEMUOption *lookup_opt(int argc, char **argv,
static MachineClass *select_machine(QDict *qdict, Error **errp)
{
- const char *optarg = qdict_get_try_str(qdict, "type");
+ const char *machine_type = qdict_get_try_str(qdict, "type");
+ const char *pvever = qdict_get_try_str(qdict, "pvever");
GSList *machines = object_class_get_list(TYPE_MACHINE, false);
MachineClass *machine_class;
Error *local_err = NULL;
-@@ -1614,6 +1615,11 @@ static MachineClass *select_machine(QDict *qdict, Error **errp)
+@@ -1676,6 +1677,11 @@ static MachineClass *select_machine(QDict *qdict, Error **errp)
}
}
g_slist_free(machines);
if (local_err) {
error_append_hint(&local_err, "Use -machine help to list supported machines\n");
-@@ -3250,12 +3256,31 @@ void qemu_init(int argc, char **argv)
+@@ -3313,12 +3319,31 @@ void qemu_init(int argc, char **argv)
case QEMU_OPTION_machine:
{
bool help;
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/block/backup.c b/block/backup.c
-index db3791f4d1..39410dcf8d 100644
+index ec29d6b810..270957c0cd 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -237,8 +237,8 @@ static void backup_init_bcs_bitmap(BackupBlockJob *job)
if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
int64_t offset = 0;
int64_t count;
-@@ -495,6 +493,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
+@@ -501,6 +499,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
&error_abort);
+ bdrv_graph_wrunlock();
+ backup_init_bcs_bitmap(job);
+
---
block/meson.build | 2 +
meson.build | 5 +
- vma-reader.c | 867 ++++++++++++++++++++++++++++++++++++++++++++
+ vma-reader.c | 870 ++++++++++++++++++++++++++++++++++++++++++++
vma-writer.c | 818 +++++++++++++++++++++++++++++++++++++++++
- vma.c | 900 ++++++++++++++++++++++++++++++++++++++++++++++
+ vma.c | 901 ++++++++++++++++++++++++++++++++++++++++++++++
vma.h | 150 ++++++++
- 6 files changed, 2742 insertions(+)
+ 6 files changed, 2746 insertions(+)
create mode 100644 vma-reader.c
create mode 100644 vma-writer.c
create mode 100644 vma.c
create mode 100644 vma.h
diff --git a/block/meson.build b/block/meson.build
-index 1833c71ce9..59b71ba9f3 100644
+index b530e117b5..b245daa98e 100644
--- a/block/meson.build
+++ b/block/meson.build
-@@ -43,6 +43,8 @@ block_ss.add(files(
+@@ -42,6 +42,8 @@ block_ss.add(files(
'zeroinit.c',
), zstd, zlib, gnutls)
system_ss.add(files('block-ram-registrar.c'))
diff --git a/meson.build b/meson.build
-index a9c4f28247..cd95530d3b 100644
+index 91a0aa64c6..620cc594b2 100644
--- a/meson.build
+++ b/meson.build
-@@ -1778,6 +1778,8 @@ endif
+@@ -1922,6 +1922,8 @@ endif
has_gettid = cc.has_function('gettid')
# libselinux
selinux = dependency('libselinux',
required: get_option('selinux'),
-@@ -3908,6 +3910,9 @@ if have_tools
+@@ -4023,6 +4025,9 @@ if have_tools
dependencies: [blockdev, qemuutil, gnutls, selinux],
install: true)
+ dependencies: [authz, block, crypto, io, qom], install: true)
+
subdir('storage-daemon')
- subdir('contrib/rdmacm-mux')
- subdir('contrib/elf2dmp')
+
+ foreach exe: [ 'qemu-img', 'qemu-io', 'qemu-nbd', 'qemu-storage-daemon']
diff --git a/vma-reader.c b/vma-reader.c
new file mode 100644
-index 0000000000..81a891c6b1
+index 0000000000..d0b6721812
--- /dev/null
+++ b/vma-reader.c
-@@ -0,0 +1,867 @@
+@@ -0,0 +1,870 @@
+/*
+ * VMA: Virtual Machine Archive
+ *
+#include "qemu/ratelimit.h"
+#include "vma.h"
+#include "block/block.h"
++#include "block/graph-lock.h"
+#include "sysemu/block-backend.h"
+
+static unsigned char zero_vma_block[VMA_BLOCK_SIZE];
+ } else {
+ int res = blk_pwrite(target, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, buf, 0);
+ if (res < 0) {
++ bdrv_graph_rdlock_main_loop();
+ error_setg(errp, "blk_pwrite to %s failed (%d)",
+ bdrv_get_device_name(blk_bs(target)), res);
++ bdrv_graph_rdunlock_main_loop();
+ return -1;
+ }
+ }
+}
diff --git a/vma.c b/vma.c
new file mode 100644
-index 0000000000..347f6283ca
+index 0000000000..bb715e9061
--- /dev/null
+++ b/vma.c
-@@ -0,0 +1,900 @@
+@@ -0,0 +1,901 @@
+/*
+ * VMA: Virtual Machine Archive
+ *
+ inbuf);
+ }
+
-+ RestoreMap *map = g_new0(RestoreMap, 1);
-+ map->devname = g_strdup(devname);
-+ map->path = g_strdup(path);
-+ map->format = format;
-+ map->throttling_bps = bps_value;
-+ map->throttling_group = group;
-+ map->cache = cache;
-+ map->write_zero = write_zero;
-+ map->skip = skip;
++ RestoreMap *restore_map = g_new0(RestoreMap, 1);
++ restore_map->devname = g_strdup(devname);
++ restore_map->path = g_strdup(path);
++ restore_map->format = format;
++ restore_map->throttling_bps = bps_value;
++ restore_map->throttling_group = group;
++ restore_map->cache = cache;
++ restore_map->write_zero = write_zero;
++ restore_map->skip = skip;
+
-+ g_hash_table_insert(devmap, map->devname, map);
++ g_hash_table_insert(devmap, restore_map->devname, restore_map);
+
+ };
+ }
+
+static int create_archive(int argc, char **argv)
+{
-+ int i, c;
++ int c;
+ int verbose = 0;
+ const char *archivename;
+ GList *backup_coroutines = NULL;
+ vma_writer_get_status(vmaw, &vmastat);
+
+ if (verbose) {
++ int i;
+ for (i = 0; i < 256; i++) {
+ VmaStreamInfo *si = &vmastat.stream_info[i];
+ if (si->size) {
+ return bs;
+}
diff --git a/block/backup.c b/block/backup.c
-index 39410dcf8d..af87fa6aa9 100644
+index 270957c0cd..16d611c4ca 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -29,28 +29,6 @@
static const BlockJobDriver backup_job_driver;
static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
-@@ -457,6 +435,14 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+@@ -461,6 +439,14 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
}
cluster_size = block_copy_cluster_size(bcs);
if (perf->max_chunk && perf->max_chunk < cluster_size) {
error_setg(errp, "Required max-chunk (%" PRIi64 ") is less than backup "
diff --git a/block/meson.build b/block/meson.build
-index 59b71ba9f3..6fde9f7dcd 100644
+index b245daa98e..e99914eaa4 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -4,6 +4,7 @@ block_ss.add(files(
'amend.c',
'backup.c',
+ 'backup-dump.c',
- 'copy-before-write.c',
'blkdebug.c',
'blklogwrites.c',
+ 'blkverify.c',
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
-index 74195c3004..0a0339eee4 100644
+index 761276127e..b3e6697613 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -26,6 +26,7 @@
BDRV_TRACKED_READ,
BDRV_TRACKED_WRITE,
diff --git a/job.c b/job.c
-index 72d57f0934..93e22d180b 100644
+index 660ce22c56..baf54c8d60 100644
--- a/job.c
+++ b/job.c
-@@ -330,7 +330,8 @@ static bool job_started_locked(Job *job)
+@@ -331,7 +331,8 @@ static bool job_started_locked(Job *job)
}
/* Called with job_mutex held. */
2 files changed, 46 insertions(+)
diff --git a/include/qemu/job.h b/include/qemu/job.h
-index e502787dd8..963cf2bef5 100644
+index 2b873f2576..528cd6acb9 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
-@@ -381,6 +381,18 @@ void job_unlock(void);
+@@ -362,6 +362,18 @@ void job_unlock(void);
*/
JobTxn *job_txn_new(void);
* Release a reference that was previously acquired with job_txn_add_job or
* job_txn_new. If it's the last reference to the object, it will be freed.
diff --git a/job.c b/job.c
-index 93e22d180b..2b31f1e14f 100644
+index baf54c8d60..3ac5e5cde2 100644
--- a/job.c
+++ b/job.c
-@@ -93,6 +93,8 @@ struct JobTxn {
+@@ -94,6 +94,8 @@ struct JobTxn {
/* Reference count */
int refcnt;
};
void job_lock(void)
-@@ -118,6 +120,25 @@ JobTxn *job_txn_new(void)
+@@ -119,6 +121,25 @@ JobTxn *job_txn_new(void)
return txn;
}
/* Called with job_mutex held. */
static void job_txn_ref_locked(JobTxn *txn)
{
-@@ -1057,6 +1078,12 @@ static void job_completed_txn_success_locked(Job *job)
+@@ -1042,6 +1063,12 @@ static void job_completed_txn_success_locked(Job *job)
*/
QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
if (!job_is_completed_locked(other_job)) {
return;
}
assert(other_job->ret == 0);
-@@ -1268,6 +1295,13 @@ int job_finish_sync_locked(Job *job,
+@@ -1253,6 +1280,13 @@ int job_finish_sync_locked(Job *job,
return -EBUSY;
}
monitor/hmp-cmds.c | 72 +++
proxmox-backup-client.c | 146 +++++
proxmox-backup-client.h | 60 ++
- pve-backup.c | 1089 ++++++++++++++++++++++++++++++++
- qapi/block-core.json | 229 +++++++
+ pve-backup.c | 1098 ++++++++++++++++++++++++++++++++
+ qapi/block-core.json | 233 +++++++
qapi/common.json | 14 +
qapi/machine.json | 16 +-
- 14 files changed, 1704 insertions(+), 14 deletions(-)
+ 14 files changed, 1717 insertions(+), 14 deletions(-)
create mode 100644 proxmox-backup-client.c
create mode 100644 proxmox-backup-client.h
create mode 100644 pve-backup.c
diff --git a/block/meson.build b/block/meson.build
-index 6fde9f7dcd..6d468f89e5 100644
+index e99914eaa4..6bba803f94 100644
--- a/block/meson.build
+++ b/block/meson.build
-@@ -45,6 +45,11 @@ block_ss.add(files(
+@@ -44,6 +44,11 @@ block_ss.add(files(
), zstd, zlib, gnutls)
block_ss.add(files('../vma-writer.c'), libuuid)
system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
system_ss.add(files('block-ram-registrar.c'))
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
-index ca2599de44..6efe28cef5 100644
+index d954bec6f1..5000c084c5 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
-@@ -1029,3 +1029,42 @@ void hmp_change_medium(Monitor *mon, const char *device, const char *target,
+@@ -1008,3 +1008,42 @@ void hmp_change_medium(Monitor *mon, const char *device, const char *target,
qmp_blockdev_change_medium(device, NULL, target, arg, true, force,
!!read_only, read_only_mode, errp);
}
+ hmp_handle_error(mon, error);
+}
diff --git a/blockdev.c b/blockdev.c
-index cd5f205ad1..7793143d76 100644
+index d27d8c38ec..5e5dbc1da9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -37,6 +37,7 @@
#include "monitor/monitor.h"
#include "qemu/error-report.h"
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
-index 10fdd822e0..15937793c1 100644
+index d5ab880492..6c97248d1b 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -471,6 +471,20 @@ SRST
{
.name = "usernet",
diff --git a/hmp-commands.hx b/hmp-commands.hx
-index e352f86872..0c8b6725fb 100644
+index 7506de251c..d5f9c28194 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -101,6 +101,35 @@ ERST
void hmp_device_add(Monitor *mon, const QDict *qdict);
void hmp_device_del(Monitor *mon, const QDict *qdict);
diff --git a/meson.build b/meson.build
-index cd95530d3b..d53976d621 100644
+index 620cc594b2..d16b97cf3c 100644
--- a/meson.build
+++ b/meson.build
-@@ -1779,6 +1779,7 @@ endif
+@@ -1923,6 +1923,7 @@ endif
has_gettid = cc.has_function('gettid')
libuuid = cc.find_library('uuid', required: true)
# libselinux
selinux = dependency('libselinux',
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
-index 91be698308..5b9c231a4c 100644
+index ef4634e5c1..6e25279f42 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -21,6 +21,7 @@
+#endif /* PROXMOX_BACKUP_CLIENT_H */
diff --git a/pve-backup.c b/pve-backup.c
new file mode 100644
-index 0000000000..ae3d137e12
+index 0000000000..9c13a92623
--- /dev/null
+++ b/pve-backup.c
-@@ -0,0 +1,1089 @@
+@@ -0,0 +1,1098 @@
+#include "proxmox-backup-client.h"
+#include "vma.h"
+
+#include "block/block_int-global-state.h"
+#include "block/blockjob.h"
+#include "block/dirty-bitmap.h"
++#include "block/graph-lock.h"
+#include "qapi/qapi-commands-block.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/cutils.h"
+ }
+ }
+
-+ if (di->job) {
-+ WITH_JOB_LOCK_GUARD() {
-+ job_unref_locked(&di->job->job);
-+ di->job = NULL;
-+ }
-+ }
-+
+ // remove self from job list
+ backup_state.di_list = g_list_remove(backup_state.di_list, di);
+
+ di->completed_ret = ret;
+
+ /*
++ * Needs to happen outside of coroutine, because it takes the graph write lock.
++ */
++ if (di->job) {
++ WITH_JOB_LOCK_GUARD() {
++ job_unref_locked(&di->job->job);
++ di->job = NULL;
++ }
++ }
++
++ /*
+ * Schedule stream cleanup in async coroutine. close_image and finish might
+ * take a while, so we can't block on them here. This way it also doesn't
+ * matter if we're already running in a coroutine or not.
+ sync_mode = MIRROR_SYNC_MODE_BITMAP;
+ bitmap_mode = BITMAP_SYNC_MODE_ON_SUCCESS;
+ }
-+ AioContext *aio_context = bdrv_get_aio_context(di->bs);
-+ aio_context_acquire(aio_context);
-+
+ bdrv_drained_begin(di->bs);
+
+ BlockJob *job = backup_job_create(
+
+ bdrv_drained_end(di->bs);
+
-+ aio_context_release(aio_context);
-+
+ di->job = job;
+ if (job) {
+ WITH_JOB_LOCK_GUARD() {
+ * case of an error, errp will be set, but the returned value might still be a
+ * list.
+ */
-+static GList coroutine_fn *get_device_info(
++static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+ const char *devlist,
+ Error **errp)
+{
+ /* Todo: try to auto-detect format based on file name */
+ format = has_format ? format : BACKUP_FORMAT_VMA;
+
++ bdrv_graph_co_rdlock();
+ di_list = get_device_info(devlist, &local_err);
++ bdrv_graph_co_rdunlock();
+ if (local_err) {
+ error_propagate(errp, local_err);
+ goto err;
+ while (l) {
+ PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
+ l = g_list_next(l);
-+ if (bdrv_op_is_blocked(di->bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
++
++ bdrv_graph_co_rdlock();
++ bool blocked = bdrv_op_is_blocked(di->bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp);
++ bdrv_graph_co_rdunlock();
++ if (blocked) {
+ goto err;
+ }
+
+
+ di->block_size = dump_cb_block_size;
+
++ bdrv_graph_co_rdlock();
+ const char *devname = bdrv_get_device_name(di->bs);
++ bdrv_graph_co_rdunlock();
+ PBSBitmapAction action = PBS_BITMAP_ACTION_NOT_USED;
+ size_t dirty = di->size;
+
+ goto err_mutex;
+ }
+
++ bdrv_graph_co_rdlock();
+ const char *devname = bdrv_get_device_name(di->bs);
++ bdrv_graph_co_rdunlock();
+ di->dev_id = vma_writer_register_stream(vmaw, devname, di->size);
+ if (di->dev_id <= 0) {
+ error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+ return ret;
+}
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index bb471c078d..1b8462a51b 100644
+index f7c2b63c5d..e49c7b5bc9 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -839,6 +839,235 @@
+@@ -851,6 +851,239 @@
{ 'command': 'query-block', 'returns': ['BlockInfo'],
'allow-preconfig': true }
+# @config-file: a configuration file to include into
+# the backup archive.
+#
++# @firewall-file: a firewall configuration file to include into the backup
++# archive.
++#
+# @speed: the maximum speed, in bytes per second
+#
+# @devlist: list of block device names (separated by ',', ';'
+#
+# Cancel the current executing backup process.
+#
-+# Returns: nothing on success
-+#
+# Notes: This command succeeds even if there is no backup process running.
+#
+##
+#
+# @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
+#
++# @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
++# supported or not.
++#
+##
+{ 'struct': 'ProxmoxSupportStatus',
+ 'data': { 'pbs-dirty-bitmap': 'bool',
# @BlockDeviceTimedStats:
#
diff --git a/qapi/common.json b/qapi/common.json
-index 6fed9cde1a..630a2a8f9a 100644
+index 7558ce5430..6e3d800373 100644
--- a/qapi/common.json
+++ b/qapi/common.json
-@@ -207,3 +207,17 @@
+@@ -200,3 +200,17 @@
##
{ 'struct': 'HumanReadableText',
'data': { 'human-readable-text': 'str' } }
+##
+{ 'struct': 'UuidInfo', 'data': {'UUID': 'str'} }
diff --git a/qapi/machine.json b/qapi/machine.json
-index 7da3c519ba..888457f810 100644
+index 1d69bffaa0..731d8d2f60 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -4,6 +4,8 @@
##
# = Machines
##
-@@ -230,20 +232,6 @@
+@@ -237,20 +239,6 @@
##
{ 'command': 'query-target', 'returns': 'TargetInfo' }
create mode 100644 pbs-restore.c
diff --git a/meson.build b/meson.build
-index d53976d621..c3330310d9 100644
+index d16b97cf3c..6de51c34cb 100644
--- a/meson.build
+++ b/meson.build
-@@ -3914,6 +3914,10 @@ if have_tools
+@@ -4029,6 +4029,10 @@ if have_tools
vma = executable('vma', files('vma.c', 'vma-reader.c') + genh,
dependencies: [authz, block, crypto, io, qom], install: true)
+ libproxmox_backup_qemu], install: true)
+
subdir('storage-daemon')
- subdir('contrib/rdmacm-mux')
- subdir('contrib/elf2dmp')
+
+ foreach exe: [ 'qemu-img', 'qemu-io', 'qemu-nbd', 'qemu-storage-daemon']
diff --git a/pbs-restore.c b/pbs-restore.c
new file mode 100644
index 0000000000..f03d9bab8d
getlength is now a coroutine function]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
- block/meson.build | 3 +
- block/pbs.c | 305 +++++++++++++++++++++++++++++++++++++++++++
- configure | 9 ++
+ block/meson.build | 2 +
+ block/pbs.c | 307 +++++++++++++++++++++++++++++++++++++++++++
meson.build | 2 +-
- qapi/block-core.json | 13 ++
+ qapi/block-core.json | 29 ++++
qapi/pragma.json | 1 +
- 6 files changed, 332 insertions(+), 1 deletion(-)
+ 5 files changed, 340 insertions(+), 1 deletion(-)
create mode 100644 block/pbs.c
diff --git a/block/meson.build b/block/meson.build
-index 6d468f89e5..becc99ac4e 100644
+index 6bba803f94..1945e04eeb 100644
--- a/block/meson.build
+++ b/block/meson.build
-@@ -50,6 +50,9 @@ block_ss.add(files(
+@@ -49,6 +49,8 @@ block_ss.add(files(
'../pve-backup.c',
), libproxmox_backup_qemu)
-+block_ss.add(when: 'CONFIG_PBS_BDRV', if_true: files('pbs.c'))
-+block_ss.add(when: 'CONFIG_PBS_BDRV', if_true: libproxmox_backup_qemu)
++block_ss.add(files('pbs.c'), libproxmox_backup_qemu)
+
system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
system_ss.add(files('block-ram-registrar.c'))
diff --git a/block/pbs.c b/block/pbs.c
new file mode 100644
-index 0000000000..a2211e0f3b
+index 0000000000..dd72356bd3
--- /dev/null
+++ b/block/pbs.c
-@@ -0,0 +1,305 @@
+@@ -0,0 +1,307 @@
+/*
+ * Proxmox Backup Server read-only block driver
+ */
+ proxmox_restore_disconnect(s->conn);
+}
+
-+static coroutine_fn int64_t pbs_co_getlength(BlockDriverState *bs)
++static coroutine_fn int64_t GRAPH_RDLOCK
++pbs_co_getlength(BlockDriverState *bs)
+{
+ BDRVPBSState *s = bs->opaque;
+ return s->length;
+ aio_co_schedule(rcb->ctx, rcb->co);
+}
+
-+static coroutine_fn int pbs_co_preadv(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes,
-+ QEMUIOVector *qiov, BdrvRequestFlags flags)
++static coroutine_fn int GRAPH_RDLOCK
++pbs_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+ BDRVPBSState *s = bs->opaque;
+ int ret;
+ return 0;
+}
+
-+static coroutine_fn int pbs_co_pwritev(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes,
-+ QEMUIOVector *qiov, BdrvRequestFlags flags)
++static coroutine_fn int GRAPH_RDLOCK
++pbs_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+ fprintf(stderr, "pbs-bdrv: cannot write to backup file, make sure "
+ "any attached disk devices are set to read-only!\n");
+ return -EPERM;
+}
+
-+static void pbs_refresh_filename(BlockDriverState *bs)
++static void GRAPH_RDLOCK
++pbs_refresh_filename(BlockDriverState *bs)
+{
+ BDRVPBSState *s = bs->opaque;
+ if (s->namespace) {
+}
+
+block_init(bdrv_pbs_init);
-diff --git a/configure b/configure
-index 133f4e3235..f5a830c1f3 100755
---- a/configure
-+++ b/configure
-@@ -256,6 +256,7 @@ qemu_suffix="qemu"
- softmmu="yes"
- linux_user=""
- bsd_user=""
-+pbs_bdrv="yes"
- plugins="$default_feature"
- ninja=""
- python=
-@@ -809,6 +810,10 @@ for opt do
- ;;
- --enable-download) download="enabled"; git_submodules_action=update;
- ;;
-+ --disable-pbs-bdrv) pbs_bdrv="no"
-+ ;;
-+ --enable-pbs-bdrv) pbs_bdrv="yes"
-+ ;;
- --enable-plugins) if test "$mingw32" = "yes"; then
- error_exit "TCG plugins not currently supported on Windows platforms"
- else
-@@ -959,6 +964,7 @@ cat << EOF
- bsd-user all BSD usermode emulation targets
- pie Position Independent Executables
- debug-tcg TCG debugging (default is disabled)
-+ pbs-bdrv Proxmox backup server read-only block driver support
-
- NOTE: The object files are built at the place where configure is launched
- EOF
-@@ -1744,6 +1750,9 @@ if test "$solaris" = "yes" ; then
- fi
- echo "SRC_PATH=$source_path" >> $config_host_mak
- echo "TARGET_DIRS=$target_list" >> $config_host_mak
-+if test "$pbs_bdrv" = "yes" ; then
-+ echo "CONFIG_PBS_BDRV=y" >> $config_host_mak
-+fi
-
- # XXX: suppress that
- if [ "$bsd" = "yes" ] ; then
diff --git a/meson.build b/meson.build
-index c3330310d9..cbfc9a43fb 100644
+index 6de51c34cb..3bc039f60f 100644
--- a/meson.build
+++ b/meson.build
-@@ -4319,7 +4319,7 @@ summary_info += {'bzip2 support': libbzip2}
+@@ -4477,7 +4477,7 @@ summary_info += {'bzip2 support': libbzip2}
summary_info += {'lzfse support': liblzfse}
summary_info += {'zstd support': zstd}
summary_info += {'NUMA host support': numa}
summary_info += {'libdaxctl support': libdaxctl}
summary_info += {'libudev': libudev}
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 1b8462a51b..d67a6d448a 100644
+index e49c7b5bc9..fc32ff9957 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -3396,6 +3396,7 @@
+@@ -3457,6 +3457,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'ssh', 'throttle', 'vdi', 'vhdx',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
-@@ -3482,6 +3483,17 @@
+@@ -3543,6 +3544,33 @@
{ 'struct': 'BlockdevOptionsNull',
'data': { '*size': 'int', '*latency-ns': 'uint64', '*read-zeroes': 'bool' } }
+#
+# Driver specific block device options for the PBS backend.
+#
++# @repository: Proxmox Backup Server repository.
++#
++# @snapshot: backup snapshots ID.
++#
++# @archive: archive name.
++#
++# @keyfile: keyfile to use for encryption.
++#
++# @password: password to use for connection.
++#
++# @fingerprint: backup server fingerprint.
++#
++# @key_password: password to unlock key.
++#
++# @namespace: namespace where backup snapshot lives.
++#
+##
+{ 'struct': 'BlockdevOptionsPbs',
+ 'data': { 'repository': 'str', 'snapshot': 'str', 'archive': 'str',
##
# @BlockdevOptionsNVMe:
#
-@@ -4890,6 +4902,7 @@
+@@ -4977,6 +5005,7 @@
'nfs': 'BlockdevOptionsNfs',
'null-aio': 'BlockdevOptionsNull',
'null-co': 'BlockdevOptionsNull',
'nvme-io_uring': { 'type': 'BlockdevOptionsNvmeIoUring',
'if': 'CONFIG_BLKIO' },
diff --git a/qapi/pragma.json b/qapi/pragma.json
-index 325e684411..b6079f6a0e 100644
+index be8fa304c5..7ff46bd128 100644
--- a/qapi/pragma.json
+++ b/qapi/pragma.json
-@@ -45,6 +45,7 @@
+@@ -100,6 +100,7 @@
'BlockInfo', # query-block
'BlockdevAioOptions', # blockdev-add, -blockdev
'BlockdevDriver', # blockdev-add, query-blockstats, ...
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
- meson.build | 2 ++
+ meson.build | 3 ++-
os-posix.c | 7 +++++--
- 2 files changed, 7 insertions(+), 2 deletions(-)
+ 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/meson.build b/meson.build
-index cbfc9a43fb..8206270272 100644
+index 3bc039f60f..067e8956a7 100644
--- a/meson.build
+++ b/meson.build
-@@ -1779,6 +1779,7 @@ endif
+@@ -1923,6 +1923,7 @@ endif
has_gettid = cc.has_function('gettid')
libuuid = cc.find_library('uuid', required: true)
libproxmox_backup_qemu = cc.find_library('proxmox_backup_qemu', required: true)
# libselinux
-@@ -3406,6 +3407,7 @@ if have_block
- # os-posix.c contains POSIX-specific functions used by qemu-storage-daemon,
- # os-win32.c does not
- blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c'))
-+ blockdev_ss.add(when: 'CONFIG_POSIX', if_true: libsystemd)
- system_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')])
+@@ -3530,7 +3531,7 @@ if have_block
+ if host_os == 'windows'
+ system_ss.add(files('os-win32.c'))
+ else
+- blockdev_ss.add(files('os-posix.c'))
++ blockdev_ss.add(files('os-posix.c'), libsystemd)
+ endif
endif
diff --git a/os-posix.c b/os-posix.c
-index 0cc1d991b1..f33d9901cf 100644
+index a4284e2c07..197a2120fd 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -29,6 +29,8 @@
+#include <systemd/sd-journal.h>
+#include <syslog.h>
- /* Needed early for CONFIG_BSD etc. */
- #include "net/slirp.h"
-@@ -332,9 +334,10 @@ void os_setup_post(void)
+ #include "qemu/error-report.h"
+ #include "qemu/log.h"
+@@ -302,9 +304,10 @@ void os_setup_post(void)
dup2(fd, 0);
dup2(fd, 1);
create mode 100644 migration/pbs-state.c
diff --git a/include/migration/misc.h b/include/migration/misc.h
-index 7dcc0b5c2c..4c940b2475 100644
+index c9e200f4eb..12c99ebc69 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
-@@ -77,4 +77,7 @@ bool migration_in_bg_snapshot(void);
+@@ -117,4 +117,7 @@ bool migration_in_bg_snapshot(void);
/* migration/block-dirty-bitmap.c */
void dirty_bitmap_mig_init(void);
+
#endif
diff --git a/migration/meson.build b/migration/meson.build
-index 07f6057acc..343994d891 100644
+index 800f12a60d..35a4306183 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -7,7 +7,9 @@ migration_files = files(
system_ss.add(files(
'block-dirty-bitmap.c',
diff --git a/migration/migration.c b/migration/migration.c
-index 7a4c8beb5d..0a955a2a18 100644
+index 86bf76e925..b8d7e471a4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
-@@ -162,6 +162,7 @@ void migration_object_init(void)
+@@ -239,6 +239,7 @@ void migration_object_init(void)
blk_mig_init();
ram_mig_init();
dirty_bitmap_mig_init();
+ pbs_state_mig_init();
}
- void migration_cancel(const Error *error)
+ typedef struct {
diff --git a/migration/pbs-state.c b/migration/pbs-state.c
new file mode 100644
index 0000000000..887e998b9e
+ NULL);
+}
diff --git a/pve-backup.c b/pve-backup.c
-index ae3d137e12..e6b17b797e 100644
+index 9c13a92623..9d480a8eec 100644
--- a/pve-backup.c
+++ b/pve-backup.c
-@@ -1082,6 +1082,7 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+@@ -1091,6 +1091,7 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
ret->pbs_library_version = g_strdup(proxmox_backup_qemu_version());
ret->pbs_dirty_bitmap = true;
ret->pbs_dirty_bitmap_savevm = true;
ret->pbs_masterkey = true;
ret->backup_max_workers = true;
diff --git a/qapi/block-core.json b/qapi/block-core.json
-index d67a6d448a..09de550c95 100644
+index fc32ff9957..f516d8e95a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
-@@ -991,6 +991,11 @@
+@@ -1004,6 +1004,11 @@
# @pbs-dirty-bitmap-savevm: True if 'dirty-bitmaps' migration capability can
# safely be set for savevm-async.
#
# @pbs-masterkey: True if the QMP backup call supports the 'master_keyfile'
# parameter.
#
-@@ -1001,6 +1006,7 @@
+@@ -1017,6 +1022,7 @@
'data': { 'pbs-dirty-bitmap': 'bool',
'query-bitmap-info': 'bool',
'pbs-dirty-bitmap-savevm': 'bool',
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index e1ae3b7316..285dd1d148 100644
+index 2708abf3d7..fb17c01308 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -540,7 +540,7 @@ static int add_bitmaps_to_list(DBMSaveState *s, BlockDriverState *bs,
1 file changed, 30 insertions(+)
diff --git a/block/iscsi.c b/block/iscsi.c
-index 34f97ab646..398782963d 100644
+index 2ff14b7472..46f275fbf7 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
-@@ -1391,12 +1391,42 @@ static char *get_initiator_name(QemuOpts *opts)
+@@ -1392,12 +1392,42 @@ static char *get_initiator_name(QemuOpts *opts)
const char *name;
char *iscsi_name;
UuidInfo *uuid_info;
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/stream.c b/block/stream.c
-index e522bbdec5..afed72db55 100644
+index 7031eef12b..d2da83ae7c 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -27,7 +27,7 @@ enum {
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Mon, 7 Dec 2020 15:21:03 +0100
+Subject: [PATCH] block: add alloc-track driver
+
+Add a new filter node 'alloc-track', which separates reads and writes to
+different children, thus allowing to put a backing image behind any
+blockdev (regardless of driver support). Since we can't detect any
+pre-allocated blocks, we can only track new writes, hence the write
+target ('file') for this node must always be empty.
+
+Intended use case is for live restoring, i.e. add a backup image as a
+block device into a VM, then put an alloc-track on the restore target
+and set the backup as backing. With this, one can use a regular
+'block-stream' to restore the image, while the VM can already run in the
+background. Copy-on-read will help make progress as the VM reads as
+well.
+
+This only worked if the target supports backing images, so up until now
+only for qcow2, with alloc-track any driver for the target can be used.
+
+Replacing the node cannot be done in the
+track_co_change_backing_file() callback, because replacing a node
+cannot happen in a coroutine and requires the block graph lock
+exclusively. Could either become a special option for the stream job,
+or maybe the upcoming blockdev-replace QMP command can be used in the
+future.
+
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+[FE: adapt to changed function signatures
+ make error return value consistent with QEMU
+ avoid premature break during read
+ adhere to block graph lock requirements]
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 366 ++++++++++++++++++++++++++++++++++++++++++++
+ block/meson.build | 1 +
+ block/stream.c | 34 ++++
+ 3 files changed, 401 insertions(+)
+ create mode 100644 block/alloc-track.c
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+new file mode 100644
+index 0000000000..b9f8ea9137
+--- /dev/null
++++ b/block/alloc-track.c
+@@ -0,0 +1,366 @@
++/*
++ * Node to allow backing images to be applied to any node. Assumes a blank
++ * image to begin with, only new writes are tracked as allocated, thus this
++ * must never be put on a node that already contains data.
++ *
++ * Copyright (c) 2020 Proxmox Server Solutions GmbH
++ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#include "qemu/osdep.h"
++#include "qapi/error.h"
++#include "block/block_int.h"
++#include "block/dirty-bitmap.h"
++#include "block/graph-lock.h"
++#include "qapi/qmp/qdict.h"
++#include "qapi/qmp/qstring.h"
++#include "qemu/cutils.h"
++#include "qemu/error-report.h"
++#include "qemu/option.h"
++#include "qemu/module.h"
++#include "sysemu/block-backend.h"
++
++#define TRACK_OPT_AUTO_REMOVE "auto-remove"
++
++typedef enum DropState {
++ DropNone,
++ DropInProgress,
++} DropState;
++
++typedef struct {
++ BdrvDirtyBitmap *bitmap;
++ uint64_t granularity;
++ DropState drop_state;
++ bool auto_remove;
++} BDRVAllocTrackState;
++
++static QemuOptsList runtime_opts = {
++    .name = "alloc-track",
++    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
++    .desc = {
++        {
++            .name = TRACK_OPT_AUTO_REMOVE,
++            .type = QEMU_OPT_BOOL,
++            .help = "automatically replace this node with 'file' when 'backing'"
++                    " is detached",
++        },
++        { /* end of list */ }
++    },
++};
++
++static void GRAPH_RDLOCK
++track_refresh_limits(BlockDriverState *bs, Error **errp)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++
++ if (!bs->file) {
++ return;
++ }
++
++ /*
++ * Always use alignment from underlying write device so RMW cycle for
++ * bdrv_pwritev reads data from our backing via track_co_preadv. Also use at
++ * least the bitmap granularity.
++ */
++ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
++ s->granularity);
++}
++
++/* Parse options, open the 'file' child and create its dirty bitmap. */
++static int track_open(BlockDriverState *bs, QDict *options, int flags,
++                      Error **errp)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++    BdrvChild *file = NULL;
++    QemuOpts *opts;
++    Error *local_err = NULL;
++    int ret = 0;
++
++    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
++    qemu_opts_absorb_qdict(opts, options, &local_err);
++    if (local_err) {
++        error_propagate(errp, local_err);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
++
++    /* open the target (write) node, backing will be attached by block layer */
++    file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
++                           BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
++                           &local_err);
++    bdrv_graph_wrlock();
++    bs->file = file;
++    bdrv_graph_wrunlock();
++    if (local_err) {
++        ret = -EINVAL;
++        error_propagate(errp, local_err);
++        goto fail;
++    }
++
++    bdrv_graph_rdlock_main_loop();
++    BlockDriverInfo bdi = {0};
++    ret = bdrv_get_info(bs->file->bs, &bdi);
++    if (ret < 0) {
++        /*
++         * Not a hard failure. Worst that can happen is partial cluster
++         * allocation in the write target. However, the driver here returns its
++         * allocation status based on the dirty bitmap, so any other data that
++         * maps to such a cluster will still be copied later by a stream job (or
++         * during writes to that cluster).
++         */
++        warn_report("alloc-track: unable to query cluster size for write target: %s",
++                    strerror(-ret)); /* strerror() takes a positive errno */
++    }
++    ret = 0;
++    /*
++     * Always consider alignment from underlying write device so RMW cycle for
++     * bdrv_pwritev reads data from our backing via track_co_preadv. Also try to
++     * avoid partial cluster allocation in the write target by considering the
++     * cluster size.
++     */
++    s->granularity = MAX(bs->file->bs->bl.request_alignment,
++                         MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
++    track_refresh_limits(bs, errp);
++    s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, s->granularity, NULL,
++                                         &local_err);
++    bdrv_graph_rdunlock_main_loop();
++    if (local_err) {
++        ret = -EIO;
++        error_propagate(errp, local_err);
++        goto fail;
++    }
++    s->drop_state = DropNone;
++
++fail:
++    if (ret < 0) {
++        bdrv_graph_wrlock();
++        bdrv_unref_child(bs, bs->file);
++        bdrv_graph_wrunlock();
++        if (s->bitmap) {
++            bdrv_release_dirty_bitmap(s->bitmap);
++        }
++    }
++    qemu_opts_del(opts);
++    return ret;
++}
++
++static void track_close(BlockDriverState *bs)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++ if (s->bitmap) {
++ bdrv_release_dirty_bitmap(s->bitmap);
++ }
++}
++
++static coroutine_fn int64_t GRAPH_RDLOCK
++track_co_getlength(BlockDriverState *bs)
++{
++ return bdrv_co_getlength(bs->file->bs);
++}
++
++/* Read allocated ranges from 'file', others from 'backing' or zeroes. */
++static int coroutine_fn GRAPH_RDLOCK
++track_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                QEMUIOVector *qiov, BdrvRequestFlags flags)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++    QEMUIOVector local_qiov;
++    int ret = 0; /* a zero-length request never enters the loop below */
++
++    /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
++    uint64_t cur_offset, local_offset;
++    int64_t local_bytes;
++    bool alloc;
++
++    if (offset < 0 || bytes < 0) {
++        fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
++        return -EIO;
++    }
++
++    /* a read request can span multiple granularity-sized chunks, and can thus
++     * contain blocks with different allocation status - we could just iterate
++     * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
++     * to find the next flip and consider everything up to that in one go */
++    for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
++        local_offset = offset + cur_offset;
++        alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
++        if (alloc) {
++            local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
++                                                      bytes - cur_offset);
++        } else {
++            local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
++                                                       bytes - cur_offset);
++        }
++
++        /* _bitmap_next_X return is -1 if no end found within limit, otherwise
++         * offset of next flip (to start of image) */
++        local_bytes = local_bytes < 0 ?
++            bytes - cur_offset : local_bytes - local_offset;
++
++        qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
++
++        if (alloc) {
++            ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
++                                 &local_qiov, flags);
++        } else if (bs->backing) {
++            ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
++                                 &local_qiov, flags);
++        } else {
++            /* local_qiov already starts at cur_offset, so zero from 0 */
++            qemu_iovec_memset(&local_qiov, 0, 0, local_bytes);
++            ret = 0;
++        }
++        if (ret != 0) {
++            break;
++        }
++    }
++
++    return ret;
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ QEMUIOVector *qiov, BdrvRequestFlags flags)
++{
++ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
++ BdrvRequestFlags flags)
++{
++ return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
++{
++ return bdrv_co_pdiscard(bs->file, offset, bytes);
++}
++
++static coroutine_fn int GRAPH_RDLOCK
++track_co_flush(BlockDriverState *bs)
++{
++ return bdrv_co_flush(bs->file->bs);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_block_status(BlockDriverState *bs, bool want_zero,
++ int64_t offset,
++ int64_t bytes,
++ int64_t *pnum,
++ int64_t *map,
++ BlockDriverState **file)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++
++ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
++ int64_t next_flipped;
++ if (alloc) {
++ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
++ } else {
++ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
++ }
++
++ /* in case not the entire region has the same state, we need to set pnum to
++ * indicate for how many bytes our result is valid */
++ *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
++ *map = offset;
++
++ if (alloc) {
++ *file = bs->file->bs;
++ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
++ } else if (bs->backing) {
++ *file = bs->backing->bs;
++ }
++ return 0;
++}
++
++static void GRAPH_RDLOCK
++track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
++ BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
++ uint64_t *nperm, uint64_t *nshared)
++{
++ BDRVAllocTrackState *s = bs->opaque;
++
++ *nshared = BLK_PERM_ALL;
++
++ /* in case we're currently dropping ourselves, claim to not use any
++ * permissions at all - which is fine, since from this point on we will
++ * never issue a read or write anymore */
++ if (s->drop_state == DropInProgress) {
++ *nperm = 0;
++ return;
++ }
++
++ if (role & BDRV_CHILD_DATA) {
++ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
++ } else {
++ /* 'backing' is also a child of our BDS, but we don't expect it to be
++ * writeable, so we only forward 'consistent read' */
++ *nperm = perm & BLK_PERM_CONSISTENT_READ;
++ }
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
++ const char *backing_fmt)
++{
++ /*
++ * Note that the actual backing file graph change is already done in the
++ * stream job itself with bdrv_set_backing_hd_drained(), so no need to
++ * actually do anything here. But still needs to be implemented, to make
++ * our caller (i.e. bdrv_co_change_backing_file()) do the right thing.
++ *
++ * FIXME
++ * We'd like to auto-remove ourselves from the block graph, but it cannot
++ * be done from a coroutine. Currently done in the stream job, where it
++ * kinda fits better, but in the long-term, a special parameter would be
++ * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
++ */
++ if (backing_file == NULL) {
++ BDRVAllocTrackState *s = bs->opaque;
++ bdrv_drained_begin(bs);
++ s->drop_state = DropInProgress;
++ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
++ bdrv_drained_end(bs);
++ }
++
++ return 0;
++}
++
++static BlockDriver bdrv_alloc_track = {
++ .format_name = "alloc-track",
++ .instance_size = sizeof(BDRVAllocTrackState),
++
++ .bdrv_file_open = track_open,
++ .bdrv_close = track_close,
++ .bdrv_co_getlength = track_co_getlength,
++ .bdrv_child_perm = track_child_perm,
++ .bdrv_refresh_limits = track_refresh_limits,
++
++ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes,
++ .bdrv_co_pwritev = track_co_pwritev,
++ .bdrv_co_preadv = track_co_preadv,
++ .bdrv_co_pdiscard = track_co_pdiscard,
++
++ .bdrv_co_flush = track_co_flush,
++ .bdrv_co_flush_to_disk = track_co_flush,
++
++ .supports_backing = true,
++
++ .bdrv_co_block_status = track_co_block_status,
++ .bdrv_co_change_backing_file = track_co_change_backing_file,
++};
++
++static void bdrv_alloc_track_init(void)
++{
++ bdrv_register(&bdrv_alloc_track);
++}
++
++block_init(bdrv_alloc_track_init);
+diff --git a/block/meson.build b/block/meson.build
+index 1945e04eeb..2873f3a25a 100644
+--- a/block/meson.build
++++ b/block/meson.build
+@@ -2,6 +2,7 @@ block_ss.add(genh)
+ block_ss.add(files(
+ 'accounting.c',
+ 'aio_task.c',
++ 'alloc-track.c',
+ 'amend.c',
+ 'backup.c',
+ 'backup-dump.c',
+diff --git a/block/stream.c b/block/stream.c
+index d2da83ae7c..f941cba14e 100644
+--- a/block/stream.c
++++ b/block/stream.c
+@@ -120,6 +120,40 @@ static int stream_prepare(Job *job)
+ ret = -EPERM;
+ goto out;
+ }
++
++ /*
++ * This cannot be done in the co_change_backing_file callback, because
++ * bdrv_replace_node() cannot be done in a coroutine. The latter also
++ * requires the graph lock exclusively. Only required for the
++ * alloc-track driver.
++ *
++ * The long-term plan is to either have an explicit parameter for the
++ * stream job or use the upcoming blockdev-replace QMP command.
++ */
++ if (base_id == NULL && strcmp(unfiltered_bs->drv->format_name, "alloc-track") == 0) {
++ BlockDriverState *file_bs;
++
++ bdrv_graph_rdlock_main_loop();
++ file_bs = unfiltered_bs->file->bs;
++ bdrv_graph_rdunlock_main_loop();
++
++ bdrv_ref(unfiltered_bs); // unrefed by bdrv_replace_node()
++ bdrv_drained_begin(file_bs);
++ bdrv_graph_wrlock();
++
++ bdrv_replace_node(unfiltered_bs, file_bs, &local_err);
++
++ bdrv_graph_wrunlock();
++ bdrv_drained_end(file_bs);
++ bdrv_unref(unfiltered_bs);
++
++ if (local_err) {
++ error_prepend(&local_err, "failed to replace alloc-track node: ");
++ error_report_err(local_err);
++ ret = -EPERM;
++ goto out;
++ }
++ }
+ }
+
+ out:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Reiter <s.reiter@proxmox.com>
-Date: Tue, 2 Mar 2021 16:11:54 +0100
-Subject: [PATCH] block/io: accept NULL qiov in bdrv_pad_request
-
-Some operations, e.g. block-stream, perform reads while discarding the
-results (only copy-on-read matters). In this case they will pass NULL as
-the target QEMUIOVector, which will however trip bdrv_pad_request, since
-it wants to extend its passed vector.
-
-If there is no qiov, no operation can be done with it, but the bytes
-and offset still need to be updated, so the subsequent aligned read
-will actually be aligned and not run into an assertion failure.
-
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: do update bytes and offset in any case]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/io.c | 29 ++++++++++++++++-------------
- 1 file changed, 16 insertions(+), 13 deletions(-)
-
-diff --git a/block/io.c b/block/io.c
-index 83d1b1dfdc..e927881e40 100644
---- a/block/io.c
-+++ b/block/io.c
-@@ -1723,22 +1723,25 @@ static int bdrv_pad_request(BlockDriverState *bs,
- return 0;
- }
-
-- sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
-- &sliced_head, &sliced_tail,
-- &sliced_niov);
--
-- /* Guaranteed by bdrv_check_request32() */
-- assert(*bytes <= SIZE_MAX);
-- ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
-- sliced_head, *bytes);
-- if (ret < 0) {
-- bdrv_padding_finalize(pad);
-- return ret;
-+ if (qiov && *qiov) {
-+ sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
-+ &sliced_head, &sliced_tail,
-+ &sliced_niov);
-+
-+ /* Guaranteed by bdrv_check_request32() */
-+ assert(*bytes <= SIZE_MAX);
-+ ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
-+ sliced_head, *bytes);
-+ if (ret < 0) {
-+ bdrv_padding_finalize(pad);
-+ return ret;
-+ }
-+ *qiov = &pad->local_qiov;
-+ *qiov_offset = 0;
- }
-+
- *bytes += pad->head + pad->tail;
- *offset -= pad->head;
-- *qiov = &pad->local_qiov;
-- *qiov_offset = 0;
- if (padded) {
- *padded = true;
- }
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Thu, 23 Jun 2022 14:00:05 +0200
+Subject: [PATCH] Revert "block/rbd: workaround for ceph issue #53784"
+
+This reverts commit fc176116cdea816ceb8dd969080b2b95f58edbc0 in
+preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 42 ++----------------------------------------
+ 1 file changed, 2 insertions(+), 40 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 63f60d41be..367db42dce 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -1515,7 +1515,6 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ int status, r;
+ RBDDiffIterateReq req = { .offs = offset };
+ uint64_t features, flags;
+- uint64_t head = 0;
+
+ assert(offset + bytes <= s->image_size);
+
+@@ -1543,43 +1542,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ return status;
+ }
+
+-#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
+- /*
+- * librbd had a bug until early 2022 that affected all versions of ceph that
+- * supported fast-diff. This bug results in reporting of incorrect offsets
+- * if the offset parameter to rbd_diff_iterate2 is not object aligned.
+- * Work around this bug by rounding down the offset to object boundaries.
+- * This is OK because we call rbd_diff_iterate2 with whole_object = true.
+- * However, this workaround only works for non cloned images with default
+- * striping.
+- *
+- * See: https://tracker.ceph.com/issues/53784
+- */
+-
+- /* check if RBD image has non-default striping enabled */
+- if (features & RBD_FEATURE_STRIPINGV2) {
+- return status;
+- }
+-
+-#pragma GCC diagnostic push
+-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+- /*
+- * check if RBD image is a clone (= has a parent).
+- *
+- * rbd_get_parent_info is deprecated from Nautilus onwards, but the
+- * replacement rbd_get_parent is not present in Luminous and Mimic.
+- */
+- if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
+- return status;
+- }
+-#pragma GCC diagnostic pop
+-
+- head = req.offs & (s->object_size - 1);
+- req.offs -= head;
+- bytes += head;
+-#endif
+-
+- r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
++ r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+ qemu_rbd_diff_iterate_cb, &req);
+ if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+ return status;
+@@ -1598,8 +1561,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+ status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+ }
+
+- assert(req.bytes > head);
+- *pnum = req.bytes - head;
++ *pnum = req.bytes;
+ return status;
+ }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Reiter <s.reiter@proxmox.com>
-Date: Mon, 7 Dec 2020 15:21:03 +0100
-Subject: [PATCH] block: add alloc-track driver
-
-Add a new filter node 'alloc-track', which seperates reads and writes to
-different children, thus allowing to put a backing image behind any
-blockdev (regardless of driver support). Since we can't detect any
-pre-allocated blocks, we can only track new writes, hence the write
-target ('file') for this node must always be empty.
-
-Intended use case is for live restoring, i.e. add a backup image as a
-block device into a VM, then put an alloc-track on the restore target
-and set the backup as backing. With this, one can use a regular
-'block-stream' to restore the image, while the VM can already run in the
-background. Copy-on-read will help make progress as the VM reads as
-well.
-
-This only worked if the target supports backing images, so up until now
-only for qcow2, with alloc-track any driver for the target can be used.
-
-If 'auto-remove' is set, alloc-track will automatically detach itself
-once the backing image is removed. It will be replaced by 'file'.
-
-Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: adapt to changed function signatures
- make error return value consistent with QEMU
- avoid premature break during read]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/alloc-track.c | 352 ++++++++++++++++++++++++++++++++++++++++++++
- block/meson.build | 1 +
- 2 files changed, 353 insertions(+)
- create mode 100644 block/alloc-track.c
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-new file mode 100644
-index 0000000000..b75d7c6460
---- /dev/null
-+++ b/block/alloc-track.c
-@@ -0,0 +1,352 @@
-+/*
-+ * Node to allow backing images to be applied to any node. Assumes a blank
-+ * image to begin with, only new writes are tracked as allocated, thus this
-+ * must never be put on a node that already contains data.
-+ *
-+ * Copyright (c) 2020 Proxmox Server Solutions GmbH
-+ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-+ */
-+
-+#include "qemu/osdep.h"
-+#include "qapi/error.h"
-+#include "block/block_int.h"
-+#include "block/dirty-bitmap.h"
-+#include "qapi/qmp/qdict.h"
-+#include "qapi/qmp/qstring.h"
-+#include "qemu/cutils.h"
-+#include "qemu/option.h"
-+#include "qemu/module.h"
-+#include "sysemu/block-backend.h"
-+
-+#define TRACK_OPT_AUTO_REMOVE "auto-remove"
-+
-+typedef enum DropState {
-+ DropNone,
-+ DropRequested,
-+ DropInProgress,
-+} DropState;
-+
-+typedef struct {
-+ BdrvDirtyBitmap *bitmap;
-+ DropState drop_state;
-+ bool auto_remove;
-+} BDRVAllocTrackState;
-+
-+static QemuOptsList runtime_opts = {
-+ .name = "alloc-track",
-+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-+ .desc = {
-+ {
-+ .name = TRACK_OPT_AUTO_REMOVE,
-+ .type = QEMU_OPT_BOOL,
-+ .help = "automatically replace this node with 'file' when 'backing'"
-+ "is detached",
-+ },
-+ { /* end of list */ }
-+ },
-+};
-+
-+static void track_refresh_limits(BlockDriverState *bs, Error **errp)
-+{
-+ BlockDriverInfo bdi;
-+
-+ if (!bs->file) {
-+ return;
-+ }
-+
-+ /* always use alignment from underlying write device so RMW cycle for
-+ * bdrv_pwritev reads data from our backing via track_co_preadv (no partial
-+ * cluster allocation in 'file') */
-+ bdrv_get_info(bs->file->bs, &bdi);
-+ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
-+ MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
-+}
-+
-+static int track_open(BlockDriverState *bs, QDict *options, int flags,
-+ Error **errp)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ QemuOpts *opts;
-+ Error *local_err = NULL;
-+ int ret = 0;
-+
-+ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-+ qemu_opts_absorb_qdict(opts, options, &local_err);
-+ if (local_err) {
-+ error_propagate(errp, local_err);
-+ ret = -EINVAL;
-+ goto fail;
-+ }
-+
-+ s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
-+
-+ /* open the target (write) node, backing will be attached by block layer */
-+ bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
-+ BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
-+ &local_err);
-+ if (local_err) {
-+ ret = -EINVAL;
-+ error_propagate(errp, local_err);
-+ goto fail;
-+ }
-+
-+ track_refresh_limits(bs, errp);
-+ uint64_t gran = bs->bl.request_alignment;
-+ s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, gran, NULL, &local_err);
-+ if (local_err) {
-+ ret = -EIO;
-+ error_propagate(errp, local_err);
-+ goto fail;
-+ }
-+
-+ s->drop_state = DropNone;
-+
-+fail:
-+ if (ret < 0) {
-+ bdrv_unref_child(bs, bs->file);
-+ if (s->bitmap) {
-+ bdrv_release_dirty_bitmap(s->bitmap);
-+ }
-+ }
-+ qemu_opts_del(opts);
-+ return ret;
-+}
-+
-+static void track_close(BlockDriverState *bs)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ if (s->bitmap) {
-+ bdrv_release_dirty_bitmap(s->bitmap);
-+ }
-+}
-+
-+static coroutine_fn int64_t track_co_getlength(BlockDriverState *bs)
-+{
-+ return bdrv_co_getlength(bs->file->bs);
-+}
-+
-+static int coroutine_fn track_co_preadv(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ QEMUIOVector local_qiov;
-+ int ret;
-+
-+ /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
-+ uint64_t cur_offset, local_offset;
-+ int64_t local_bytes;
-+ bool alloc;
-+
-+ if (offset < 0 || bytes < 0) {
-+ fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
-+ return -EIO;
-+ }
-+
-+ /* a read request can span multiple granularity-sized chunks, and can thus
-+ * contain blocks with different allocation status - we could just iterate
-+ * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
-+ * to find the next flip and consider everything up to that in one go */
-+ for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
-+ local_offset = offset + cur_offset;
-+ alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
-+ if (alloc) {
-+ local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
-+ bytes - cur_offset);
-+ } else {
-+ local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
-+ bytes - cur_offset);
-+ }
-+
-+ /* _bitmap_next_X return is -1 if no end found within limit, otherwise
-+ * offset of next flip (to start of image) */
-+ local_bytes = local_bytes < 0 ?
-+ bytes - cur_offset :
-+ local_bytes - local_offset;
-+
-+ qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
-+
-+ if (alloc) {
-+ ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
-+ &local_qiov, flags);
-+ } else if (bs->backing) {
-+ ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
-+ &local_qiov, flags);
-+ } else {
-+ qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes);
-+ ret = 0;
-+ }
-+
-+ if (ret != 0) {
-+ break;
-+ }
-+ }
-+
-+ return ret;
-+}
-+
-+static int coroutine_fn track_co_pwritev(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
-+{
-+ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
-+}
-+
-+static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes, BdrvRequestFlags flags)
-+{
-+ return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
-+}
-+
-+static int coroutine_fn track_co_pdiscard(BlockDriverState *bs,
-+ int64_t offset, int64_t bytes)
-+{
-+ return bdrv_co_pdiscard(bs->file, offset, bytes);
-+}
-+
-+static coroutine_fn int track_co_flush(BlockDriverState *bs)
-+{
-+ return bdrv_co_flush(bs->file->bs);
-+}
-+
-+static int coroutine_fn track_co_block_status(BlockDriverState *bs,
-+ bool want_zero,
-+ int64_t offset,
-+ int64_t bytes,
-+ int64_t *pnum,
-+ int64_t *map,
-+ BlockDriverState **file)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+
-+ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
-+ int64_t next_flipped;
-+ if (alloc) {
-+ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
-+ } else {
-+ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
-+ }
-+
-+ /* in case not the entire region has the same state, we need to set pnum to
-+ * indicate for how many bytes our result is valid */
-+ *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
-+ *map = offset;
-+
-+ if (alloc) {
-+ *file = bs->file->bs;
-+ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
-+ } else if (bs->backing) {
-+ *file = bs->backing->bs;
-+ }
-+ return 0;
-+}
-+
-+static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
-+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
-+ uint64_t perm, uint64_t shared,
-+ uint64_t *nperm, uint64_t *nshared)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+
-+ *nshared = BLK_PERM_ALL;
-+
-+ /* in case we're currently dropping ourselves, claim to not use any
-+ * permissions at all - which is fine, since from this point on we will
-+ * never issue a read or write anymore */
-+ if (s->drop_state == DropInProgress) {
-+ *nperm = 0;
-+ return;
-+ }
-+
-+ if (role & BDRV_CHILD_DATA) {
-+ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
-+ } else {
-+ /* 'backing' is also a child of our BDS, but we don't expect it to be
-+ * writeable, so we only forward 'consistent read' */
-+ *nperm = perm & BLK_PERM_CONSISTENT_READ;
-+ }
-+}
-+
-+static void track_drop(void *opaque)
-+{
-+ BlockDriverState *bs = (BlockDriverState*)opaque;
-+ BlockDriverState *file = bs->file->bs;
-+ BDRVAllocTrackState *s = bs->opaque;
-+
-+ assert(file);
-+
-+ /* we rely on the fact that we're not used anywhere else, so let's wait
-+ * until we're only used once - in the drive connected to the guest (and one
-+ * ref is held by bdrv_ref in track_change_backing_file) */
-+ if (bs->refcnt > 2) {
-+ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
-+ return;
-+ }
-+ AioContext *aio_context = bdrv_get_aio_context(bs);
-+ aio_context_acquire(aio_context);
-+
-+ bdrv_drained_begin(bs);
-+
-+ /* now that we're drained, we can safely set 'DropInProgress' */
-+ s->drop_state = DropInProgress;
-+ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
-+
-+ bdrv_replace_node(bs, file, &error_abort);
-+ bdrv_set_backing_hd(bs, NULL, &error_abort);
-+ bdrv_drained_end(bs);
-+ bdrv_unref(bs);
-+ aio_context_release(aio_context);
-+}
-+
-+static int track_change_backing_file(BlockDriverState *bs,
-+ const char *backing_file,
-+ const char *backing_fmt)
-+{
-+ BDRVAllocTrackState *s = bs->opaque;
-+ if (s->auto_remove && s->drop_state == DropNone &&
-+ backing_file == NULL && backing_fmt == NULL)
-+ {
-+ /* backing file has been disconnected, there's no longer any use for
-+ * this node, so let's remove ourselves from the block graph - we need
-+ * to schedule this for later however, since when this function is
-+ * called, the blockjob modifying us is probably not done yet and has a
-+ * blocker on 'bs' */
-+ s->drop_state = DropRequested;
-+ bdrv_ref(bs);
-+ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
-+ }
-+
-+ return 0;
-+}
-+
-+static BlockDriver bdrv_alloc_track = {
-+ .format_name = "alloc-track",
-+ .instance_size = sizeof(BDRVAllocTrackState),
-+
-+ .bdrv_file_open = track_open,
-+ .bdrv_close = track_close,
-+ .bdrv_co_getlength = track_co_getlength,
-+ .bdrv_child_perm = track_child_perm,
-+ .bdrv_refresh_limits = track_refresh_limits,
-+
-+ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes,
-+ .bdrv_co_pwritev = track_co_pwritev,
-+ .bdrv_co_preadv = track_co_preadv,
-+ .bdrv_co_pdiscard = track_co_pdiscard,
-+
-+ .bdrv_co_flush = track_co_flush,
-+ .bdrv_co_flush_to_disk = track_co_flush,
-+
-+ .supports_backing = true,
-+
-+ .bdrv_co_block_status = track_co_block_status,
-+ .bdrv_change_backing_file = track_change_backing_file,
-+};
-+
-+static void bdrv_alloc_track_init(void)
-+{
-+ bdrv_register(&bdrv_alloc_track);
-+}
-+
-+block_init(bdrv_alloc_track_init);
-diff --git a/block/meson.build b/block/meson.build
-index becc99ac4e..0a69836593 100644
---- a/block/meson.build
-+++ b/block/meson.build
-@@ -2,6 +2,7 @@ block_ss.add(genh)
- block_ss.add(files(
- 'accounting.c',
- 'aio_task.c',
-+ 'alloc-track.c',
- 'amend.c',
- 'backup.c',
- 'backup-dump.c',
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Thu, 23 Jun 2022 14:00:07 +0200
+Subject: [PATCH] Revert "block/rbd: fix handling of holes in
+ .bdrv_co_block_status"
+
+This reverts commit 9e302f64bb407a9bb097b626da97228c2654cfee in
+preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 367db42dce..347b121626 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -1474,11 +1474,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+ RBDDiffIterateReq *req = opaque;
+
+ assert(req->offs + req->bytes <= offs);
+-
+- /* treat a hole like an unallocated area and bail out */
+- if (!exists) {
+- return 0;
+- }
++ /*
++ * we do not diff against a snapshot so we should never receive a callback
++ * for a hole.
++ */
++ assert(exists);
+
+ if (!req->exists && offs > req->offs) {
+ /*
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Thu, 23 Jun 2022 14:00:05 +0200
-Subject: [PATCH] Revert "block/rbd: workaround for ceph issue #53784"
-
-This reverts commit fc176116cdea816ceb8dd969080b2b95f58edbc0 in
-preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 42 ++----------------------------------------
- 1 file changed, 2 insertions(+), 40 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index a4749f3b1b..53e0396b51 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -1511,7 +1511,6 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- int status, r;
- RBDDiffIterateReq req = { .offs = offset };
- uint64_t features, flags;
-- uint64_t head = 0;
-
- assert(offset + bytes <= s->image_size);
-
-@@ -1539,43 +1538,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- return status;
- }
-
--#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
-- /*
-- * librbd had a bug until early 2022 that affected all versions of ceph that
-- * supported fast-diff. This bug results in reporting of incorrect offsets
-- * if the offset parameter to rbd_diff_iterate2 is not object aligned.
-- * Work around this bug by rounding down the offset to object boundaries.
-- * This is OK because we call rbd_diff_iterate2 with whole_object = true.
-- * However, this workaround only works for non cloned images with default
-- * striping.
-- *
-- * See: https://tracker.ceph.com/issues/53784
-- */
--
-- /* check if RBD image has non-default striping enabled */
-- if (features & RBD_FEATURE_STRIPINGV2) {
-- return status;
-- }
--
--#pragma GCC diagnostic push
--#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-- /*
-- * check if RBD image is a clone (= has a parent).
-- *
-- * rbd_get_parent_info is deprecated from Nautilus onwards, but the
-- * replacement rbd_get_parent is not present in Luminous and Mimic.
-- */
-- if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
-- return status;
-- }
--#pragma GCC diagnostic pop
--
-- head = req.offs & (s->object_size - 1);
-- req.offs -= head;
-- bytes += head;
--#endif
--
-- r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
-+ r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
- qemu_rbd_diff_iterate_cb, &req);
- if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
- return status;
-@@ -1594,8 +1557,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
- }
-
-- assert(req.bytes > head);
-- *pnum = req.bytes - head;
-+ *pnum = req.bytes;
- return status;
- }
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Thu, 23 Jun 2022 14:00:07 +0200
-Subject: [PATCH] Revert "block/rbd: fix handling of holes in
- .bdrv_co_block_status"
-
-This reverts commit 9e302f64bb407a9bb097b626da97228c2654cfee in
-preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 10 +++++-----
- 1 file changed, 5 insertions(+), 5 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 53e0396b51..0913a0af39 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -1470,11 +1470,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
- RBDDiffIterateReq *req = opaque;
-
- assert(req->offs + req->bytes <= offs);
--
-- /* treat a hole like an unallocated area and bail out */
-- if (!exists) {
-- return 0;
-- }
-+ /*
-+ * we do not diff against a snapshot so we should never receive a callback
-+ * for a hole.
-+ */
-+ assert(exists);
-
- if (!req->exists && offs > req->offs) {
- /*
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Tue, 17 May 2022 09:46:02 +0200
+Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
+
+During backup, bdrv_co_block_status is called for each block copy
+chunk. When RBD is used, the current implementation with
+rbd_diff_iterate2() using whole_object=true takes about linearly more
+time, depending on the image size. Since there are linearly more
+chunks, the slowdown is quadratic, becoming unacceptable for large
+images (starting somewhere between 500-1000 GiB in my testing).
+
+This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
+stop-gap measure, until it's clear how to make the implementation
+more efficient.
+
+Upstream bug report:
+https://gitlab.com/qemu-project/qemu/-/issues/1026
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 112 ----------------------------------------------------
+ 1 file changed, 112 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 347b121626..e61b359b97 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -108,12 +108,6 @@ typedef struct RBDTask {
+ int64_t ret;
+ } RBDTask;
+
+-typedef struct RBDDiffIterateReq {
+- uint64_t offs;
+- uint64_t bytes;
+- bool exists;
+-} RBDDiffIterateReq;
+-
+ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
+ BlockdevOptionsRbd *opts, bool cache,
+ const char *keypairs, const char *secretid,
+@@ -1460,111 +1454,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
+ return spec_info;
+ }
+
+-/*
+- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
+- * value in the callback routine. Choose a value that does not conflict with
+- * an existing exitcode and return it if we want to prematurely stop the
+- * execution because we detected a change in the allocation status.
+- */
+-#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+-
+-static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+- int exists, void *opaque)
+-{
+- RBDDiffIterateReq *req = opaque;
+-
+- assert(req->offs + req->bytes <= offs);
+- /*
+- * we do not diff against a snapshot so we should never receive a callback
+- * for a hole.
+- */
+- assert(exists);
+-
+- if (!req->exists && offs > req->offs) {
+- /*
+- * we started in an unallocated area and hit the first allocated
+- * block. req->bytes must be set to the length of the unallocated area
+- * before the allocated area. stop further processing.
+- */
+- req->bytes = offs - req->offs;
+- return QEMU_RBD_EXIT_DIFF_ITERATE2;
+- }
+-
+- if (req->exists && offs > req->offs + req->bytes) {
+- /*
+- * we started in an allocated area and jumped over an unallocated area,
+- * req->bytes contains the length of the allocated area before the
+- * unallocated area. stop further processing.
+- */
+- return QEMU_RBD_EXIT_DIFF_ITERATE2;
+- }
+-
+- req->bytes += len;
+- req->exists = true;
+-
+- return 0;
+-}
+-
+-static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+- bool want_zero, int64_t offset,
+- int64_t bytes, int64_t *pnum,
+- int64_t *map,
+- BlockDriverState **file)
+-{
+- BDRVRBDState *s = bs->opaque;
+- int status, r;
+- RBDDiffIterateReq req = { .offs = offset };
+- uint64_t features, flags;
+-
+- assert(offset + bytes <= s->image_size);
+-
+- /* default to all sectors allocated */
+- status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+- *map = offset;
+- *file = bs;
+- *pnum = bytes;
+-
+- /* check if RBD image supports fast-diff */
+- r = rbd_get_features(s->image, &features);
+- if (r < 0) {
+- return status;
+- }
+- if (!(features & RBD_FEATURE_FAST_DIFF)) {
+- return status;
+- }
+-
+- /* check if RBD fast-diff result is valid */
+- r = rbd_get_flags(s->image, &flags);
+- if (r < 0) {
+- return status;
+- }
+- if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+- return status;
+- }
+-
+- r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+- qemu_rbd_diff_iterate_cb, &req);
+- if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+- return status;
+- }
+- assert(req.bytes <= bytes);
+- if (!req.exists) {
+- if (r == 0) {
+- /*
+- * rbd_diff_iterate2 does not invoke callbacks for unallocated
+- * areas. This here catches the case where no callback was
+- * invoked at all (req.bytes == 0).
+- */
+- assert(req.bytes == 0);
+- req.bytes = bytes;
+- }
+- status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+- }
+-
+- *pnum = req.bytes;
+- return status;
+-}
+-
+ static int64_t coroutine_fn qemu_rbd_co_getlength(BlockDriverState *bs)
+ {
+ BDRVRBDState *s = bs->opaque;
+@@ -1800,7 +1689,6 @@ static BlockDriver bdrv_rbd = {
+ #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+ .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes,
+ #endif
+- .bdrv_co_block_status = qemu_rbd_co_block_status,
+
+ .bdrv_snapshot_create = qemu_rbd_snap_create,
+ .bdrv_snapshot_delete = qemu_rbd_snap_remove,
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Tue, 17 May 2022 09:46:02 +0200
-Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
-
-During backup, bdrv_co_block_status is called for each block copy
-chunk. When RBD is used, the current implementation with
-rbd_diff_iterate2() using whole_object=true takes about linearly more
-time, depending on the image size. Since there are linearly more
-chunks, the slowdown is quadratic, becoming unacceptable for large
-images (starting somewhere between 500-1000 GiB in my testing).
-
-This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
-stop-gap measure, until it's clear how to make the implemenation
-more efficient.
-
-Upstream bug report:
-https://gitlab.com/qemu-project/qemu/-/issues/1026
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 112 ----------------------------------------------------
- 1 file changed, 112 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 0913a0af39..1dab254517 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -108,12 +108,6 @@ typedef struct RBDTask {
- int64_t ret;
- } RBDTask;
-
--typedef struct RBDDiffIterateReq {
-- uint64_t offs;
-- uint64_t bytes;
-- bool exists;
--} RBDDiffIterateReq;
--
- static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
- BlockdevOptionsRbd *opts, bool cache,
- const char *keypairs, const char *secretid,
-@@ -1456,111 +1450,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
- return spec_info;
- }
-
--/*
-- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
-- * value in the callback routine. Choose a value that does not conflict with
-- * an existing exitcode and return it if we want to prematurely stop the
-- * execution because we detected a change in the allocation status.
-- */
--#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
--
--static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
-- int exists, void *opaque)
--{
-- RBDDiffIterateReq *req = opaque;
--
-- assert(req->offs + req->bytes <= offs);
-- /*
-- * we do not diff against a snapshot so we should never receive a callback
-- * for a hole.
-- */
-- assert(exists);
--
-- if (!req->exists && offs > req->offs) {
-- /*
-- * we started in an unallocated area and hit the first allocated
-- * block. req->bytes must be set to the length of the unallocated area
-- * before the allocated area. stop further processing.
-- */
-- req->bytes = offs - req->offs;
-- return QEMU_RBD_EXIT_DIFF_ITERATE2;
-- }
--
-- if (req->exists && offs > req->offs + req->bytes) {
-- /*
-- * we started in an allocated area and jumped over an unallocated area,
-- * req->bytes contains the length of the allocated area before the
-- * unallocated area. stop further processing.
-- */
-- return QEMU_RBD_EXIT_DIFF_ITERATE2;
-- }
--
-- req->bytes += len;
-- req->exists = true;
--
-- return 0;
--}
--
--static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
-- bool want_zero, int64_t offset,
-- int64_t bytes, int64_t *pnum,
-- int64_t *map,
-- BlockDriverState **file)
--{
-- BDRVRBDState *s = bs->opaque;
-- int status, r;
-- RBDDiffIterateReq req = { .offs = offset };
-- uint64_t features, flags;
--
-- assert(offset + bytes <= s->image_size);
--
-- /* default to all sectors allocated */
-- status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
-- *map = offset;
-- *file = bs;
-- *pnum = bytes;
--
-- /* check if RBD image supports fast-diff */
-- r = rbd_get_features(s->image, &features);
-- if (r < 0) {
-- return status;
-- }
-- if (!(features & RBD_FEATURE_FAST_DIFF)) {
-- return status;
-- }
--
-- /* check if RBD fast-diff result is valid */
-- r = rbd_get_flags(s->image, &flags);
-- if (r < 0) {
-- return status;
-- }
-- if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
-- return status;
-- }
--
-- r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
-- qemu_rbd_diff_iterate_cb, &req);
-- if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
-- return status;
-- }
-- assert(req.bytes <= bytes);
-- if (!req.exists) {
-- if (r == 0) {
-- /*
-- * rbd_diff_iterate2 does not invoke callbacks for unallocated
-- * areas. This here catches the case where no callback was
-- * invoked at all (req.bytes == 0).
-- */
-- assert(req.bytes == 0);
-- req.bytes = bytes;
-- }
-- status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
-- }
--
-- *pnum = req.bytes;
-- return status;
--}
--
- static int64_t coroutine_fn qemu_rbd_co_getlength(BlockDriverState *bs)
- {
- BDRVRBDState *s = bs->opaque;
-@@ -1796,7 +1685,6 @@ static BlockDriver bdrv_rbd = {
- #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
- .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes,
- #endif
-- .bdrv_co_block_status = qemu_rbd_co_block_status,
-
- .bdrv_snapshot_create = qemu_rbd_snap_create,
- .bdrv_snapshot_delete = qemu_rbd_snap_remove,
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Tue, 26 Mar 2024 14:57:51 +0100
+Subject: [PATCH] alloc-track: error out when auto-remove is not set
+
+Since replacing the node now happens in the stream job, where the
+option cannot be read from (it's internal to the driver), it will
+always be treated as on.
+
+qemu-server will always set it, make sure to have other users notice
+the change (should they even exist). The option can be fully dropped
+in the future while adding a version guard in qemu-server.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+index b9f8ea9137..f3ed2935c4 100644
+--- a/block/alloc-track.c
++++ b/block/alloc-track.c
+@@ -34,7 +34,6 @@ typedef struct {
+ BdrvDirtyBitmap *bitmap;
+ uint64_t granularity;
+ DropState drop_state;
+- bool auto_remove;
+ } BDRVAllocTrackState;
+
+ static QemuOptsList runtime_opts = {
+@@ -86,7 +85,11 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
+ goto fail;
+ }
+
+- s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
++ if (!qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false)) {
++ error_setg(errp, "alloc-track: requires auto-remove option to be set to on");
++ ret = -EINVAL;
++ goto fail;
++ }
+
+ /* open the target (write) node, backing will be attached by block layer */
+ file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Wed, 27 Mar 2024 11:15:39 +0100
+Subject: [PATCH] alloc-track: avoid seemingly superfluous child permission
+ update
+
+Doesn't seem necessary nowadays (maybe after commit "alloc-track: fix
+deadlock during drop" where the dropping is not rescheduled and delayed
+anymore or some upstream change). Should there really be some issue,
+instead of having a drop state, this could also be just based off the
+fact whether there is still a backing child.
+
+Dumping the cumulative (shared) permissions for the BDS with a debug
+print yields the same values after this patch and with QEMU 8.1,
+namely 3 and 5.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 26 --------------------------
+ 1 file changed, 26 deletions(-)
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+index f3ed2935c4..29138dcc49 100644
+--- a/block/alloc-track.c
++++ b/block/alloc-track.c
+@@ -25,15 +25,9 @@
+
+ #define TRACK_OPT_AUTO_REMOVE "auto-remove"
+
+-typedef enum DropState {
+- DropNone,
+- DropInProgress,
+-} DropState;
+-
+ typedef struct {
+ BdrvDirtyBitmap *bitmap;
+ uint64_t granularity;
+- DropState drop_state;
+ } BDRVAllocTrackState;
+
+ static QemuOptsList runtime_opts = {
+@@ -137,8 +131,6 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
+ goto fail;
+ }
+
+- s->drop_state = DropNone;
+-
+ fail:
+ if (ret < 0) {
+ bdrv_graph_wrlock();
+@@ -289,18 +281,8 @@ track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared)
+ {
+- BDRVAllocTrackState *s = bs->opaque;
+-
+ *nshared = BLK_PERM_ALL;
+
+- /* in case we're currently dropping ourselves, claim to not use any
+- * permissions at all - which is fine, since from this point on we will
+- * never issue a read or write anymore */
+- if (s->drop_state == DropInProgress) {
+- *nperm = 0;
+- return;
+- }
+-
+ if (role & BDRV_CHILD_DATA) {
+ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
+ } else {
+@@ -326,14 +308,6 @@ track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
+ * kinda fits better, but in the long-term, a special parameter would be
+ * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
+ */
+- if (backing_file == NULL) {
+- BDRVAllocTrackState *s = bs->opaque;
+- bdrv_drained_begin(bs);
+- s->drop_state = DropInProgress;
+- bdrv_child_refresh_perms(bs, bs->file, &error_abort);
+- bdrv_drained_end(bs);
+- }
+-
+ return 0;
+ }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 6 Apr 2023 14:59:31 +0200
-Subject: [PATCH] alloc-track: fix deadlock during drop
-
-by replacing the block node directly after changing the backing file
-instead of rescheduling it.
-
-With changes in QEMU 8.0, calling bdrv_get_info (and bdrv_unref)
-during drop can lead to a deadlock when using iothread (only triggered
-with multiple disks, except during debugging where it also triggered
-with one disk sometimes):
-1. job_unref_locked acquires the AioContext and calls job->driver->free
-2. track_drop gets scheduled
-3. bdrv_graph_wrlock is called and polls which leads to track_drop being
- called
-4. track_drop acquires the AioContext recursively
-5. bdrv_get_info is a wrapped coroutine (since 8.0) and thus polls for
- bdrv_co_get_info. This releases the AioContext, but only once! The
- documentation for the AIO_WAIT_WHILE macro states that the
- AioContext lock needs to be acquired exactly once, but there does
- not seem to be a way for track_drop to know if it acquired the lock
- recursively or not (without adding further hacks).
-6. Because the AioContext is still held by the main thread once, it can't
- be acquired before entering bdrv_co_get_info in co_schedule_bh_cb
- which happens in the iothread
-
-When doing the operation in change_backing_file, the AioContext has
-already been acquired by the caller, so the issue with the recursive
-lock goes away.
-
-The comment explaining why delaying the replace is necessary is
-> we need to schedule this for later however, since when this function
-> is called, the blockjob modifying us is probably not done yet and
-> has a blocker on 'bs'
-
-However, there is no check for blockers in bdrv_replace_node. It would
-need to be done by us, the caller, with check_to_replace_node.
-Furthermore, the mirror job also does its call to bdrv_replace_node
-while there is an active blocker (inserted by mirror itself) and they
-use a specialized version to check for blockers instead of
-check_to_replace_node there. Alloc-track could also do something
-similar to check for other blockers, but it should be fine to rely on
-Proxmox VE that no other operation with the blockdev is going on.
-
-Mirror also drains the target before replacing the node, but the
-target can have other users. In case of alloc-track the file child
-should not be accessible by anybody else and so there can't be an
-in-flight operation for the file child when alloc-track is drained.
-
-The rescheduling based on refcounting is a hack and it doesn't seem to
-be necessary anymore. It's not clear what the original issue from the
-comment was. Testing with older builds with track_drop done directly
-without rescheduling also didn't lead to any noticable issue for me.
-
-One issue it might have been is the one fixed by b1e1af394d
-("block/stream: Drain subtree around graph change"), where
-block-stream had a use-after-free if the base node changed at an
-inconvenient time (which alloc-track's auto-drop does).
-
-It's also not possible to just not auto-replace the alloc-track. Not
-replacing it at all leads to other operations like block resize
-hanging, and there is no good way to replace it manually via QMP
-(there is x-blockdev-change, but it is experimental and doesn't
-implement the required operation yet). Also, it's just cleaner in
-general to not leave unnecessary block nodes lying around.
-
-Suggested-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/alloc-track.c | 54 ++++++++++++++-------------------------------
- 1 file changed, 16 insertions(+), 38 deletions(-)
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-index b75d7c6460..76da140a68 100644
---- a/block/alloc-track.c
-+++ b/block/alloc-track.c
-@@ -25,7 +25,6 @@
-
- typedef enum DropState {
- DropNone,
-- DropRequested,
- DropInProgress,
- } DropState;
-
-@@ -268,37 +267,6 @@ static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
- }
- }
-
--static void track_drop(void *opaque)
--{
-- BlockDriverState *bs = (BlockDriverState*)opaque;
-- BlockDriverState *file = bs->file->bs;
-- BDRVAllocTrackState *s = bs->opaque;
--
-- assert(file);
--
-- /* we rely on the fact that we're not used anywhere else, so let's wait
-- * until we're only used once - in the drive connected to the guest (and one
-- * ref is held by bdrv_ref in track_change_backing_file) */
-- if (bs->refcnt > 2) {
-- aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
-- return;
-- }
-- AioContext *aio_context = bdrv_get_aio_context(bs);
-- aio_context_acquire(aio_context);
--
-- bdrv_drained_begin(bs);
--
-- /* now that we're drained, we can safely set 'DropInProgress' */
-- s->drop_state = DropInProgress;
-- bdrv_child_refresh_perms(bs, bs->file, &error_abort);
--
-- bdrv_replace_node(bs, file, &error_abort);
-- bdrv_set_backing_hd(bs, NULL, &error_abort);
-- bdrv_drained_end(bs);
-- bdrv_unref(bs);
-- aio_context_release(aio_context);
--}
--
- static int track_change_backing_file(BlockDriverState *bs,
- const char *backing_file,
- const char *backing_fmt)
-@@ -308,13 +276,23 @@ static int track_change_backing_file(BlockDriverState *bs,
- backing_file == NULL && backing_fmt == NULL)
- {
- /* backing file has been disconnected, there's no longer any use for
-- * this node, so let's remove ourselves from the block graph - we need
-- * to schedule this for later however, since when this function is
-- * called, the blockjob modifying us is probably not done yet and has a
-- * blocker on 'bs' */
-- s->drop_state = DropRequested;
-+ * this node, so let's remove ourselves from the block graph */
-+ BlockDriverState *file = bs->file->bs;
-+
-+ /* Just to be sure, because bdrv_replace_node unrefs it */
- bdrv_ref(bs);
-- aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
-+ bdrv_drained_begin(bs);
-+
-+ /* now that we're drained, we can safely set 'DropInProgress' */
-+ s->drop_state = DropInProgress;
-+
-+ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
-+
-+ bdrv_replace_node(bs, file, &error_abort);
-+ bdrv_set_backing_hd(bs, NULL, &error_abort);
-+
-+ bdrv_drained_end(bs);
-+ bdrv_unref(bs);
- }
-
- return 0;
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:22 +0200
+Subject: [PATCH] block/copy-before-write: fix permission
+
+When the source node does not have any parents, the condition still
+works as required: the backup job does create the parent via
+
+ block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child
+
+Still, in this case checking the @perm variable doesn't work, as the
+backup job creates the root blk with empty permissions (as it relies on
+the CBW filter to require correct permissions and doesn't want to create
+extra conflicts).
+
+So, we should not check @perm.
+
+The hack may be dropped entirely once transactional insertion of the
+filter (where we don't try to recalculate permissions in an intermediate
+state, in which the filter conflicts with the original parent of the
+source node) is merged (the old big series
+"[PATCH v5 00/45] Transactional block-graph modifying API"[1]; its
+current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2]).
+
+[1] https://patchew.org/QEMU/20220330212902.590099-1-vsementsov@openvz.org/
+[2] https://patchew.org/QEMU/20231017184444.932733-1-vsementsov@yandex-team.ru/
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/copy-before-write.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 026fa9840f..5a9456d426 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -364,9 +364,13 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ perm, shared, nperm, nshared);
+
+ if (!QLIST_EMPTY(&bs->parents)) {
+- if (perm & BLK_PERM_WRITE) {
+- *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+- }
++ /*
++ * Note, that source child may be shared with backup job. Backup job
++ * does create own blk parent on copy-before-write node, so this
++ * works even if source node does not have any parents before backup
++ * start
++ */
++ *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+ }
+ }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 5 May 2023 13:39:53 +0200
-Subject: [PATCH] migration: for snapshots, hold the BQL during setup callbacks
-
-In spirit, this is a partial revert of commit 9b09503752 ("migration:
-run setup callbacks out of big lock"), but only for the snapshot case.
-
-For snapshots, the bdrv_writev_vmstate() function is used during setup
-(in QIOChannelBlock backing the QEMUFile), but not holding the BQL
-while calling it could lead to an assertion failure. To understand
-how, first note the following:
-
-1. Generated coroutine wrappers for block layer functions spawn the
-coroutine and use AIO_WAIT_WHILE()/aio_poll() to wait for it.
-2. If the host OS switches threads at an inconvenient time, it can
-happen that a bottom half scheduled for the main thread's AioContext
-is executed as part of a vCPU thread's aio_poll().
-
-An example leading to the assertion failure is as follows:
-
-main thread:
-1. A snapshot-save QMP command gets issued.
-2. snapshot_save_job_bh() is scheduled.
-
-vCPU thread:
-3. aio_poll() for the main thread's AioContext is called (e.g. when
-the guest writes to a pflash device, as part of blk_pwrite which is a
-generated coroutine wrapper).
-4. snapshot_save_job_bh() is executed as part of aio_poll().
-3. qemu_savevm_state() is called.
-4. qemu_mutex_unlock_iothread() is called. Now
-qemu_get_current_aio_context() returns 0x0.
-5. bdrv_writev_vmstate() is executed during the usual savevm setup.
-But this function is a generated coroutine wrapper, so it uses
-AIO_WAIT_WHILE. There, the assertion
-assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-will fail.
-
-To fix it, ensure that the BQL is held during setup. To avoid changing
-the behavior for migration too, introduce conditionals for the setup
-callbacks that need the BQL and only take the lock if it's not already
-held.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- include/migration/register.h | 2 +-
- migration/block-dirty-bitmap.c | 15 ++++++++++++---
- migration/block.c | 15 ++++++++++++---
- migration/ram.c | 16 +++++++++++++---
- migration/savevm.c | 2 --
- 5 files changed, 38 insertions(+), 12 deletions(-)
-
-diff --git a/include/migration/register.h b/include/migration/register.h
-index 90914f32f5..c728fd9120 100644
---- a/include/migration/register.h
-+++ b/include/migration/register.h
-@@ -43,9 +43,9 @@ typedef struct SaveVMHandlers {
- * by other locks.
- */
- int (*save_live_iterate)(QEMUFile *f, void *opaque);
-+ int (*save_setup)(QEMUFile *f, void *opaque);
-
- /* This runs outside the iothread lock! */
-- int (*save_setup)(QEMUFile *f, void *opaque);
- /* Note for save_live_pending:
- * must_precopy:
- * - must be migrated in precopy or in stopped state
-diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index 285dd1d148..f7ee5a74d9 100644
---- a/migration/block-dirty-bitmap.c
-+++ b/migration/block-dirty-bitmap.c
-@@ -1219,10 +1219,17 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
- {
- DBMSaveState *s = &((DBMState *)opaque)->save;
- SaveBitmapState *dbms = NULL;
-+ bool release_lock = false;
-
-- qemu_mutex_lock_iothread();
-+ /* For snapshots, the BQL is held during setup. */
-+ if (!qemu_mutex_iothread_locked()) {
-+ qemu_mutex_lock_iothread();
-+ release_lock = true;
-+ }
- if (init_dirty_bitmap_migration(s) < 0) {
-- qemu_mutex_unlock_iothread();
-+ if (release_lock) {
-+ qemu_mutex_unlock_iothread();
-+ }
- return -1;
- }
-
-@@ -1230,7 +1237,9 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
- send_bitmap_start(f, s, dbms);
- }
- qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
-- qemu_mutex_unlock_iothread();
-+ if (release_lock) {
-+ qemu_mutex_unlock_iothread();
-+ }
- return 0;
- }
-
-diff --git a/migration/block.c b/migration/block.c
-index 86c2256a2b..8423e0c9f9 100644
---- a/migration/block.c
-+++ b/migration/block.c
-@@ -725,21 +725,30 @@ static void block_migration_cleanup(void *opaque)
- static int block_save_setup(QEMUFile *f, void *opaque)
- {
- int ret;
-+ bool release_lock = false;
-
- trace_migration_block_save("setup", block_mig_state.submitted,
- block_mig_state.transferred);
-
-- qemu_mutex_lock_iothread();
-+ /* For snapshots, the BQL is held during setup. */
-+ if (!qemu_mutex_iothread_locked()) {
-+ qemu_mutex_lock_iothread();
-+ release_lock = true;
-+ }
- ret = init_blk_migration(f);
- if (ret < 0) {
-- qemu_mutex_unlock_iothread();
-+ if (release_lock) {
-+ qemu_mutex_unlock_iothread();
-+ }
- return ret;
- }
-
- /* start track dirty blocks */
- ret = set_dirty_tracking();
-
-- qemu_mutex_unlock_iothread();
-+ if (release_lock) {
-+ qemu_mutex_unlock_iothread();
-+ }
-
- if (ret) {
- return ret;
-diff --git a/migration/ram.c b/migration/ram.c
-index 6e1514f69f..6a1aec7031 100644
---- a/migration/ram.c
-+++ b/migration/ram.c
-@@ -2896,8 +2896,16 @@ static void migration_bitmap_clear_discarded_pages(RAMState *rs)
-
- static void ram_init_bitmaps(RAMState *rs)
- {
-- /* For memory_global_dirty_log_start below. */
-- qemu_mutex_lock_iothread();
-+ bool release_lock = false;
-+
-+ /*
-+ * For memory_global_dirty_log_start below.
-+ * For snapshots, the BQL is held during setup.
-+ */
-+ if (!qemu_mutex_iothread_locked()) {
-+ qemu_mutex_lock_iothread();
-+ release_lock = true;
-+ }
- qemu_mutex_lock_ramlist();
-
- WITH_RCU_READ_LOCK_GUARD() {
-@@ -2909,7 +2917,9 @@ static void ram_init_bitmaps(RAMState *rs)
- }
- }
- qemu_mutex_unlock_ramlist();
-- qemu_mutex_unlock_iothread();
-+ if (release_lock) {
-+ qemu_mutex_unlock_iothread();
-+ }
-
- /*
- * After an eventual first bitmap sync, fixup the initial bitmap
-diff --git a/migration/savevm.c b/migration/savevm.c
-index d60c4f487a..3c015722f7 100644
---- a/migration/savevm.c
-+++ b/migration/savevm.c
-@@ -1625,10 +1625,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
- reset_vfio_bytes_transferred();
- ms->to_dst_file = f;
-
-- qemu_mutex_unlock_iothread();
- qemu_savevm_state_header(f);
- qemu_savevm_state_setup(f);
-- qemu_mutex_lock_iothread();
-
- while (qemu_file_get_error(f) == 0) {
- if (qemu_savevm_state_iterate(f, false) > 0) {
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:23 +0200
+Subject: [PATCH] block/copy-before-write: support unaligned snapshot-discard
+
+The first thing that crashes on unaligned access here is
+bdrv_reset_dirty_bitmap(). The correct way is to shrink the
+snapshot-discard request to the cluster-aligned range inside it.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/copy-before-write.c | 16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 5a9456d426..c0e70669a2 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK
+ cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
+ {
+ BDRVCopyBeforeWriteState *s = bs->opaque;
++ uint32_t cluster_size = block_copy_cluster_size(s->bcs);
++ int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size);
++ int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size);
++ int64_t aligned_bytes;
++
++ if (aligned_end <= aligned_offset) {
++ return 0;
++ }
++ aligned_bytes = aligned_end - aligned_offset;
+
+ WITH_QEMU_LOCK_GUARD(&s->lock) {
+- bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
++ bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset,
++ aligned_bytes);
+ }
+
+- block_copy_reset(s->bcs, offset, bytes);
++ block_copy_reset(s->bcs, aligned_offset, aligned_bytes);
+
+- return bdrv_co_pdiscard(s->target, offset, bytes);
++ return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes);
+ }
+
+ static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs)
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 5 May 2023 15:30:16 +0200
-Subject: [PATCH] savevm-async: don't hold BQL during setup
-
-See commit "migration: for snapshots, hold the BQL during setup
-callbacks" for why. This is separate, because a version of that one
-will hopefully land upstream.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- migration/savevm-async.c | 2 --
- 1 file changed, 2 deletions(-)
-
-diff --git a/migration/savevm-async.c b/migration/savevm-async.c
-index 80624fada8..b1d85a4b41 100644
---- a/migration/savevm-async.c
-+++ b/migration/savevm-async.c
-@@ -401,10 +401,8 @@ void qmp_savevm_start(const char *statefile, Error **errp)
- snap_state.state = SAVE_STATE_ACTIVE;
- snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
- snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
-- qemu_mutex_unlock_iothread();
- qemu_savevm_state_header(snap_state.file);
- qemu_savevm_state_setup(snap_state.file);
-- qemu_mutex_lock_iothread();
-
- /* Async processing from here on out happens in iohandler context, so let
- * the target bdrv have its home there.
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:24 +0200
+Subject: [PATCH] block/copy-before-write: create block_copy bitmap in filter
+ node
+
+Currently block_copy creates copy_bitmap in the source node. But that
+interacts badly with .independent_close=true of the copy-before-write
+filter: the source node may be detached and removed before the
+.bdrv_close() handler is called, which should call
+block_copy_state_free(), which in turn should remove copy_bitmap.
+
+That's all not ideal: it would be better if internal bitmap of
+block-copy object is not attached to any node. But that is not possible
+now.
+
+The simplest solution is to just create copy_bitmap in the filter node,
+where two other bitmaps are created anyway.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/block-copy.c | 3 +-
+ block/copy-before-write.c | 2 +-
+ include/block/block-copy.h | 1 +
+ tests/qemu-iotests/257.out | 112 ++++++++++++++++++-------------------
+ 4 files changed, 60 insertions(+), 58 deletions(-)
+
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 9ee3dd7ef5..8fca2c3698 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ }
+
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
++ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ Error **errp)
+ {
+@@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ return NULL;
+ }
+
+- copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
++ copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
+ errp);
+ if (!copy_bitmap) {
+ return NULL;
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index c0e70669a2..94db31512d 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -468,7 +468,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+ ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+ bs->file->bs->supported_zero_flags);
+
+- s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
++ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
+ if (!s->bcs) {
+ error_prepend(errp, "Cannot create block-copy-state: ");
+ return -EINVAL;
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index 0700953ab8..8b41643bfa 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
+ typedef struct BlockCopyCallState BlockCopyCallState;
+
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
++ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ Error **errp);
+
+diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out
+index aa76131ca9..c33dd7f3a9 100644
+--- a/tests/qemu-iotests/257.out
++++ b/tests/qemu-iotests/257.out
+@@ -120,16 +120,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -596,16 +596,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -865,16 +865,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -1341,16 +1341,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -1610,16 +1610,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -2086,16 +2086,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -2355,16 +2355,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -2831,16 +2831,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -3100,16 +3100,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -3576,16 +3576,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -3845,16 +3845,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -4321,16 +4321,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -4590,16 +4590,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+@@ -5066,16 +5066,16 @@ write -P0x67 0x3fe0000 0x20000
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- }
+- ],
+- "drive0": [
++ },
+ {
+ "busy": false,
+ "count": 0,
+ "granularity": 65536,
+ "persistent": false,
+ "recording": false
+- },
++ }
++ ],
++ "drive0": [
+ {
+ "busy": false,
+ "count": 458752,
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:22 +0200
-Subject: [PATCH] block/copy-before-write: fix permission
-
-In case when source node does not have any parents, the condition still
-works as required: backup job do create the parent by
-
- block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child
-
-Still, in this case checking @perm variable doesn't work, as backup job
-creates the root blk with empty permissions (as it rely on CBW filter
-to require correct permissions and don't want to create extra
-conflicts).
-
-So, we should not check @perm.
-
-The hack may be dropped entirely when transactional insertion of
-filter (when we don't try to recalculate permissions in intermediate
-state, when filter does conflict with original parent of the source
-node) merged (old big series
-"[PATCH v5 00/45] Transactional block-graph modifying API"[1] and it's
-current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2])
-
-[1] https://patchew.org/QEMU/20220330212902.590099-1-vsementsov@openvz.org/
-[2] https://patchew.org/QEMU/20231017184444.932733-1-vsementsov@yandex-team.ru/
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/copy-before-write.c | 10 +++++++---
- 1 file changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index b866e42271..a2dddf6f57 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -364,9 +364,13 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
- perm, shared, nperm, nshared);
-
- if (!QLIST_EMPTY(&bs->parents)) {
-- if (perm & BLK_PERM_WRITE) {
-- *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
-- }
-+ /*
-+ * Note, that source child may be shared with backup job. Backup job
-+ * does create own blk parent on copy-before-write node, so this
-+ * works even if source node does not have any parents before backup
-+ * start
-+ */
-+ *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
- *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
- }
- }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:23 +0200
-Subject: [PATCH] block/copy-before-write: support unligned snapshot-discard
-
-First thing that crashes on unligned access here is
-bdrv_reset_dirty_bitmap(). Correct way is to align-down the
-snapshot-discard request.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/copy-before-write.c | 16 +++++++++++++---
- 1 file changed, 13 insertions(+), 3 deletions(-)
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index a2dddf6f57..0a219c2b75 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK
- cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
- {
- BDRVCopyBeforeWriteState *s = bs->opaque;
-+ uint32_t cluster_size = block_copy_cluster_size(s->bcs);
-+ int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size);
-+ int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size);
-+ int64_t aligned_bytes;
-+
-+ if (aligned_end <= aligned_offset) {
-+ return 0;
-+ }
-+ aligned_bytes = aligned_end - aligned_offset;
-
- WITH_QEMU_LOCK_GUARD(&s->lock) {
-- bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
-+ bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset,
-+ aligned_bytes);
- }
-
-- block_copy_reset(s->bcs, offset, bytes);
-+ block_copy_reset(s->bcs, aligned_offset, aligned_bytes);
-
-- return bdrv_co_pdiscard(s->target, offset, bytes);
-+ return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes);
- }
-
- static void cbw_refresh_filename(BlockDriverState *bs)
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:25 +0200
+Subject: [PATCH] qapi: blockdev-backup: add discard-source parameter
+
+Add a parameter that enables discard-after-copy. That is mostly useful
+in "push backup with fleecing" scheme, when source is snapshot-access
+format driver node, based on copy-before-write filter snapshot-access
+API:
+
+[guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target]
+ | |
+ | root | file
+ v v
+[copy-before-write]
+ | |
+ | file | target
+ v v
+[active disk] [temp.img]
+
+In this case discard-after-copy does two things:
+
+ - discard data in temp.img to save disk space
+ - avoid further copy-before-write operation in discarded area
+
+Note that we have to declare WRITE permission on source in
+copy-before-write filter, for discard to work. Still we can't take it
+unconditionally, as it will break normal backup from RO source. So, we
+have to add a parameter and pass it through bdrv_open flags.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/backup.c | 5 +++--
+ block/block-copy.c | 9 +++++++++
+ block/copy-before-write.c | 15 +++++++++++++--
+ block/copy-before-write.h | 1 +
+ block/replication.c | 4 ++--
+ blockdev.c | 2 +-
+ include/block/block-common.h | 2 ++
+ include/block/block-copy.h | 1 +
+ include/block/block_int-global-state.h | 2 +-
+ qapi/block-core.json | 4 ++++
+ 10 files changed, 37 insertions(+), 8 deletions(-)
+
+diff --git a/block/backup.c b/block/backup.c
+index 16d611c4ca..1963e47ab9 100644
+--- a/block/backup.c
++++ b/block/backup.c
+@@ -332,7 +332,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, int64_t speed,
+ MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+- bool compress,
++ bool compress, bool discard_source,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+@@ -433,7 +433,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ goto error;
+ }
+
+- cbw = bdrv_cbw_append(bs, target, filter_node_name, &bcs, errp);
++ cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
++ &bcs, errp);
+ if (!cbw) {
+ goto error;
+ }
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 8fca2c3698..7e3b378528 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -137,6 +137,7 @@ typedef struct BlockCopyState {
+ CoMutex lock;
+ int64_t in_flight_bytes;
+ BlockCopyMethod method;
++ bool discard_source;
+ BlockReqList reqs;
+ QLIST_HEAD(, BlockCopyCallState) calls;
+ /*
+@@ -353,6 +354,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
++ bool discard_source,
+ Error **errp)
+ {
+ ERRP_GUARD();
+@@ -418,6 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ cluster_size),
+ };
+
++ s->discard_source = discard_source;
+ block_copy_set_copy_opts(s, false, false);
+
+ ratelimit_init(&s->rate_limit);
+@@ -589,6 +592,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
+ co_put_to_shres(s->mem, t->req.bytes);
+ block_copy_task_end(t, ret);
+
++ if (s->discard_source && ret == 0) {
++ int64_t nbytes =
++ MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
++ bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
++ }
++
+ return ret;
+ }
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 94db31512d..853e01a1eb 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState {
+ BdrvChild *target;
+ OnCbwError on_cbw_error;
+ uint64_t cbw_timeout_ns;
++ bool discard_source;
+
+ /*
+ * @lock: protects access to @access_bitmap, @done_bitmap and
+@@ -357,6 +358,8 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared)
+ {
++ BDRVCopyBeforeWriteState *s = bs->opaque;
++
+ if (!(role & BDRV_CHILD_FILTERED)) {
+ /*
+ * Target child
+@@ -381,6 +384,10 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+ * start
+ */
+ *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
++ if (s->discard_source) {
++ *nperm = *nperm | BLK_PERM_WRITE;
++ }
++
+ *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+ }
+ }
+@@ -468,7 +475,9 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+ ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+ bs->file->bs->supported_zero_flags);
+
+- s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
++ s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
++ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
++ flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
+ if (!s->bcs) {
+ error_prepend(errp, "Cannot create block-copy-state: ");
+ return -EINVAL;
+@@ -535,12 +544,14 @@ static BlockDriver bdrv_cbw_filter = {
+ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
++ bool discard_source,
+ BlockCopyState **bcs,
+ Error **errp)
+ {
+ BDRVCopyBeforeWriteState *state;
+ BlockDriverState *top;
+ QDict *opts;
++ int flags = BDRV_O_RDWR | (discard_source ? BDRV_O_CBW_DISCARD_SOURCE : 0);
+
+ assert(source->total_sectors == target->total_sectors);
+ GLOBAL_STATE_CODE();
+@@ -553,7 +564,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ qdict_put_str(opts, "file", bdrv_get_node_name(source));
+ qdict_put_str(opts, "target", bdrv_get_node_name(target));
+
+- top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
++ top = bdrv_insert_node(source, opts, flags, errp);
+ if (!top) {
+ return NULL;
+ }
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index 6e72bb25e9..01af0cd3c4 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -39,6 +39,7 @@
+ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
++ bool discard_source,
+ BlockCopyState **bcs,
+ Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
+diff --git a/block/replication.c b/block/replication.c
+index ca6bd0a720..0415a5e8b7 100644
+--- a/block/replication.c
++++ b/block/replication.c
+@@ -582,8 +582,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
+
+ s->backup_job = backup_job_create(
+ NULL, s->secondary_disk->bs, s->hidden_disk->bs,
+- 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
+- &perf,
++ 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
++ NULL, &perf,
+ BLOCKDEV_ON_ERROR_REPORT,
+ BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
+ backup_job_completed, bs, NULL, &local_err);
+diff --git a/blockdev.c b/blockdev.c
+index 5e5dbc1da9..1054a69279 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -2727,7 +2727,7 @@ static BlockJob *do_backup_common(BackupCommon *backup,
+
+ job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
+ backup->sync, bmap, backup->bitmap_mode,
+- backup->compress,
++ backup->compress, backup->discard_source,
+ backup->filter_node_name,
+ &perf,
+ backup->on_source_error,
+diff --git a/include/block/block-common.h b/include/block/block-common.h
+index a846023a09..338fe5ff7a 100644
+--- a/include/block/block-common.h
++++ b/include/block/block-common.h
+@@ -243,6 +243,8 @@ typedef enum {
+ read-write fails */
+ #define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */
+
++#define BDRV_O_CBW_DISCARD_SOURCE 0x80000 /* for copy-before-write filter */
++
+ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
+
+
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index 8b41643bfa..bdc703bacd 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -27,6 +27,7 @@ typedef struct BlockCopyCallState BlockCopyCallState;
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
++ bool discard_source,
+ Error **errp);
+
+ /* Function should be called prior any actual copy request */
+diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
+index cc1387ae02..f0c642b194 100644
+--- a/include/block/block_int-global-state.h
++++ b/include/block/block_int-global-state.h
+@@ -195,7 +195,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+- bool compress,
++ bool compress, bool discard_source,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index f516d8e95a..d796d49abb 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -1849,6 +1849,9 @@
+ # node specified by @drive. If this option is not given, a node
+ # name is autogenerated. (Since: 4.2)
+ #
++# @discard-source: Discard blocks on source which are already copied
++# to the target. (Since 9.0)
++#
+ # @x-perf: Performance options. (Since 6.0)
+ #
+ # Features:
+@@ -1870,6 +1873,7 @@
+ '*on-target-error': 'BlockdevOnError',
+ '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
+ '*filter-node-name': 'str',
++ '*discard-source': 'bool',
+ '*x-perf': { 'type': 'BackupPerf',
+ 'features': [ 'unstable' ] } } }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:24 +0200
-Subject: [PATCH] block/copy-before-write: create block_copy bitmap in filter
- node
-
-Currently block_copy creates copy_bitmap in source node. But that is in
-bad relation with .independent_close=true of copy-before-write filter:
-source node may be detached and removed before .bdrv_close() handler
-called, which should call block_copy_state_free(), which in turn should
-remove copy_bitmap.
-
-That's all not ideal: it would be better if internal bitmap of
-block-copy object is not attached to any node. But that is not possible
-now.
-
-The simplest solution is just create copy_bitmap in filter node, where
-anyway two other bitmaps are created.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/block-copy.c | 3 +-
- block/copy-before-write.c | 2 +-
- include/block/block-copy.h | 1 +
- tests/qemu-iotests/257.out | 112 ++++++++++++++++++-------------------
- 4 files changed, 60 insertions(+), 58 deletions(-)
-
-diff --git a/block/block-copy.c b/block/block-copy.c
-index e13d7bc6b6..b61685f1a2 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -346,6 +346,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- }
-
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-+ BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- Error **errp)
- {
-@@ -360,7 +361,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- return NULL;
- }
-
-- copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
-+ copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
- errp);
- if (!copy_bitmap) {
- return NULL;
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 0a219c2b75..d3b95bd600 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -470,7 +470,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
- ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
- bs->file->bs->supported_zero_flags);
-
-- s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
-+ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
- if (!s->bcs) {
- error_prepend(errp, "Cannot create block-copy-state: ");
- ret = -EINVAL;
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index 0700953ab8..8b41643bfa 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
- typedef struct BlockCopyCallState BlockCopyCallState;
-
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-+ BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- Error **errp);
-
-diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out
-index aa76131ca9..c33dd7f3a9 100644
---- a/tests/qemu-iotests/257.out
-+++ b/tests/qemu-iotests/257.out
-@@ -120,16 +120,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -596,16 +596,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -865,16 +865,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -1341,16 +1341,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -1610,16 +1610,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -2086,16 +2086,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -2355,16 +2355,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -2831,16 +2831,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -3100,16 +3100,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -3576,16 +3576,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -3845,16 +3845,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -4321,16 +4321,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -4590,16 +4590,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
-@@ -5066,16 +5066,16 @@ write -P0x67 0x3fe0000 0x20000
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- }
-- ],
-- "drive0": [
-+ },
- {
- "busy": false,
- "count": 0,
- "granularity": 65536,
- "persistent": false,
- "recording": false
-- },
-+ }
-+ ],
-+ "drive0": [
- {
- "busy": false,
- "count": 458752,
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:26 +0200
+Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
+
+Useful to make discard-source work in the context of backup fleecing
+when the fleecing image has a larger granularity than the backup
+target.
+
+Copy-before-write operations will use at least this granularity and in
+particular, discard requests to the source node will too. If the
+granularity is too small, they will just be aligned down in
+cbw_co_pdiscard_snapshot() and thus effectively ignored.
+
+The QAPI uses uint32 so the value will be non-negative, but still fit
+into an int64_t.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/block-copy.c | 17 +++++++++++++----
+ block/copy-before-write.c | 3 ++-
+ include/block/block-copy.h | 1 +
+ qapi/block-core.json | 8 +++++++-
+ 4 files changed, 23 insertions(+), 6 deletions(-)
+
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 7e3b378528..adb1cbb440 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+ }
+
+ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
++ int64_t min_cluster_size,
+ Error **errp)
+ {
+ int ret;
+@@ -335,7 +336,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ "used. If the actual block size of the target exceeds "
+ "this default, the backup may be unusable",
+ BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
++ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+ } else if (ret < 0 && !target_does_cow) {
+ error_setg_errno(errp, -ret,
+ "Couldn't determine the cluster size of the target image, "
+@@ -345,16 +346,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ return ret;
+ } else if (ret < 0 && target_does_cow) {
+ /* Not fatal; just trudge on ahead. */
+- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
++ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+ }
+
+- return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
++ return MAX(min_cluster_size,
++ MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
+ }
+
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ bool discard_source,
++ int64_t min_cluster_size,
+ Error **errp)
+ {
+ ERRP_GUARD();
+@@ -365,7 +368,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+
+ GLOBAL_STATE_CODE();
+
+- cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
++ if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
++ error_setg(errp, "min-cluster-size needs to be a power of 2");
++ return NULL;
++ }
++
++ cluster_size = block_copy_calculate_cluster_size(target->bs,
++ min_cluster_size, errp);
+ if (cluster_size < 0) {
+ return NULL;
+ }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 853e01a1eb..47b3cdd09f 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -477,7 +477,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+
+ s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
+ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
+- flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
++ flags & BDRV_O_CBW_DISCARD_SOURCE,
++ opts->min_cluster_size, errp);
+ if (!s->bcs) {
+ error_prepend(errp, "Cannot create block-copy-state: ");
+ return -EINVAL;
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index bdc703bacd..77857c6c68 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ BlockDriverState *copy_bitmap_bs,
+ const BdrvDirtyBitmap *bitmap,
+ bool discard_source,
++ int64_t min_cluster_size,
+ Error **errp);
+
+ /* Function should be called prior any actual copy request */
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index d796d49abb..edbf6e78b9 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -4930,12 +4930,18 @@
+ # @on-cbw-error parameter will decide how this failure is handled.
+ # Default 0. (Since 7.1)
+ #
++# @min-cluster-size: Minimum size of blocks used by copy-before-write
++# operations. Has to be a power of 2. No effect if smaller than
++# the maximum of the target's cluster size and 64 KiB. Default 0.
++# (Since 8.1)
++#
+ # Since: 6.2
+ ##
+ { 'struct': 'BlockdevOptionsCbw',
+ 'base': 'BlockdevOptionsGenericFormat',
+ 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
+- '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
++ '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
++ '*min-cluster-size': 'uint32' } }
+
+ ##
+ # @BlockdevOptions:
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:27 +0200
+Subject: [PATCH] backup: add minimum cluster size to performance options
+
+Useful to make discard-source work in the context of backup fleecing
+when the fleecing image has a larger granularity than the backup
+target.
+
+Backup/block-copy will use at least this granularity for copy operations
+and in particular, discard requests to the backup source will too. If
+the granularity is too small, they will just be aligned down in
+cbw_co_pdiscard_snapshot() and thus effectively ignored.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/backup.c | 2 +-
+ block/copy-before-write.c | 2 ++
+ block/copy-before-write.h | 1 +
+ blockdev.c | 3 +++
+ qapi/block-core.json | 9 +++++++--
+ 5 files changed, 14 insertions(+), 3 deletions(-)
+
+diff --git a/block/backup.c b/block/backup.c
+index 1963e47ab9..fe69723ada 100644
+--- a/block/backup.c
++++ b/block/backup.c
+@@ -434,7 +434,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ }
+
+ cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
+- &bcs, errp);
++ perf->min_cluster_size, &bcs, errp);
+ if (!cbw) {
+ goto error;
+ }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 47b3cdd09f..bba58326d7 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -546,6 +546,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
+ bool discard_source,
++ int64_t min_cluster_size,
+ BlockCopyState **bcs,
+ Error **errp)
+ {
+@@ -564,6 +565,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ }
+ qdict_put_str(opts, "file", bdrv_get_node_name(source));
+ qdict_put_str(opts, "target", bdrv_get_node_name(target));
++ qdict_put_int(opts, "min-cluster-size", min_cluster_size);
+
+ top = bdrv_insert_node(source, opts, flags, errp);
+ if (!top) {
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index 01af0cd3c4..dc6cafe7fa 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -40,6 +40,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockDriverState *target,
+ const char *filter_node_name,
+ bool discard_source,
++ int64_t min_cluster_size,
+ BlockCopyState **bcs,
+ Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
+diff --git a/blockdev.c b/blockdev.c
+index 1054a69279..cbe224387b 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -2654,6 +2654,9 @@ static BlockJob *do_backup_common(BackupCommon *backup,
+ if (backup->x_perf->has_max_chunk) {
+ perf.max_chunk = backup->x_perf->max_chunk;
+ }
++ if (backup->x_perf->has_min_cluster_size) {
++ perf.min_cluster_size = backup->x_perf->min_cluster_size;
++ }
+ }
+
+ if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) ||
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index edbf6e78b9..6e7ee87633 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -1790,11 +1790,16 @@
+ # it should not be less than job cluster size which is calculated
+ # as maximum of target image cluster size and 64k. Default 0.
+ #
++# @min-cluster-size: Minimum size of blocks used by copy-before-write
++# and background copy operations. Has to be a power of 2. No
++# effect if smaller than the maximum of the target's cluster size
++# and 64 KiB. Default 0. (Since 8.1)
++#
+ # Since: 6.0
+ ##
+ { 'struct': 'BackupPerf',
+- 'data': { '*use-copy-range': 'bool',
+- '*max-workers': 'int', '*max-chunk': 'int64' } }
++ 'data': { '*use-copy-range': 'bool', '*max-workers': 'int',
++ '*max-chunk': 'int64', '*min-cluster-size': 'uint32' } }
+
+ ##
+ # @BackupCommon:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:25 +0200
-Subject: [PATCH] qapi: blockdev-backup: add discard-source parameter
-
-Add a parameter that enables discard-after-copy. That is mostly useful
-in "push backup with fleecing" scheme, when source is snapshot-access
-format driver node, based on copy-before-write filter snapshot-access
-API:
-
-[guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target]
- | |
- | root | file
- v v
-[copy-before-write]
- | |
- | file | target
- v v
-[active disk] [temp.img]
-
-In this case discard-after-copy does two things:
-
- - discard data in temp.img to save disk space
- - avoid further copy-before-write operation in discarded area
-
-Note that we have to declare WRITE permission on source in
-copy-before-write filter, for discard to work. Still we can't take it
-unconditionally, as it will break normal backup from RO source. So, we
-have to add a parameter and pass it thorough bdrv_open flags.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/backup.c | 5 +++--
- block/block-copy.c | 9 +++++++++
- block/copy-before-write.c | 15 +++++++++++++--
- block/copy-before-write.h | 1 +
- block/replication.c | 4 ++--
- blockdev.c | 2 +-
- include/block/block-common.h | 2 ++
- include/block/block-copy.h | 1 +
- include/block/block_int-global-state.h | 2 +-
- qapi/block-core.json | 4 ++++
- 10 files changed, 37 insertions(+), 8 deletions(-)
-
-diff --git a/block/backup.c b/block/backup.c
-index af87fa6aa9..3dc955f625 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -332,7 +332,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- BlockDriverState *target, int64_t speed,
- MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
- BitmapSyncMode bitmap_mode,
-- bool compress,
-+ bool compress, bool discard_source,
- const char *filter_node_name,
- BackupPerf *perf,
- BlockdevOnError on_source_error,
-@@ -429,7 +429,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- goto error;
- }
-
-- cbw = bdrv_cbw_append(bs, target, filter_node_name, &bcs, errp);
-+ cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
-+ &bcs, errp);
- if (!cbw) {
- goto error;
- }
-diff --git a/block/block-copy.c b/block/block-copy.c
-index b61685f1a2..3c61e52bae 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -137,6 +137,7 @@ typedef struct BlockCopyState {
- CoMutex lock;
- int64_t in_flight_bytes;
- BlockCopyMethod method;
-+ bool discard_source;
- BlockReqList reqs;
- QLIST_HEAD(, BlockCopyCallState) calls;
- /*
-@@ -348,6 +349,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
-+ bool discard_source,
- Error **errp)
- {
- ERRP_GUARD();
-@@ -409,6 +411,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- cluster_size),
- };
-
-+ s->discard_source = discard_source;
- block_copy_set_copy_opts(s, false, false);
-
- ratelimit_init(&s->rate_limit);
-@@ -580,6 +583,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
- co_put_to_shres(s->mem, t->req.bytes);
- block_copy_task_end(t, ret);
-
-+ if (s->discard_source && ret == 0) {
-+ int64_t nbytes =
-+ MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
-+ bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
-+ }
-+
- return ret;
- }
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index d3b95bd600..3503702d71 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState {
- BdrvChild *target;
- OnCbwError on_cbw_error;
- uint32_t cbw_timeout_ns;
-+ bool discard_source;
-
- /*
- * @lock: protects access to @access_bitmap, @done_bitmap and
-@@ -357,6 +358,8 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
- uint64_t perm, uint64_t shared,
- uint64_t *nperm, uint64_t *nshared)
- {
-+ BDRVCopyBeforeWriteState *s = bs->opaque;
-+
- if (!(role & BDRV_CHILD_FILTERED)) {
- /*
- * Target child
-@@ -381,6 +384,10 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
- * start
- */
- *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
-+ if (s->discard_source) {
-+ *nperm = *nperm | BLK_PERM_WRITE;
-+ }
-+
- *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
- }
- }
-@@ -470,7 +477,9 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
- ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
- bs->file->bs->supported_zero_flags);
-
-- s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
-+ s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
-+ s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
-+ flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
- if (!s->bcs) {
- error_prepend(errp, "Cannot create block-copy-state: ");
- ret = -EINVAL;
-@@ -544,12 +553,14 @@ BlockDriver bdrv_cbw_filter = {
- BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
-+ bool discard_source,
- BlockCopyState **bcs,
- Error **errp)
- {
- BDRVCopyBeforeWriteState *state;
- BlockDriverState *top;
- QDict *opts;
-+ int flags = BDRV_O_RDWR | (discard_source ? BDRV_O_CBW_DISCARD_SOURCE : 0);
-
- assert(source->total_sectors == target->total_sectors);
- GLOBAL_STATE_CODE();
-@@ -562,7 +573,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- qdict_put_str(opts, "file", bdrv_get_node_name(source));
- qdict_put_str(opts, "target", bdrv_get_node_name(target));
-
-- top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
-+ top = bdrv_insert_node(source, opts, flags, errp);
- if (!top) {
- return NULL;
- }
-diff --git a/block/copy-before-write.h b/block/copy-before-write.h
-index 6e72bb25e9..01af0cd3c4 100644
---- a/block/copy-before-write.h
-+++ b/block/copy-before-write.h
-@@ -39,6 +39,7 @@
- BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
-+ bool discard_source,
- BlockCopyState **bcs,
- Error **errp);
- void bdrv_cbw_drop(BlockDriverState *bs);
-diff --git a/block/replication.c b/block/replication.c
-index ea4bf1aa80..39ad78cf98 100644
---- a/block/replication.c
-+++ b/block/replication.c
-@@ -579,8 +579,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
-
- s->backup_job = backup_job_create(
- NULL, s->secondary_disk->bs, s->hidden_disk->bs,
-- 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
-- &perf,
-+ 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
-+ NULL, &perf,
- BLOCKDEV_ON_ERROR_REPORT,
- BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
- backup_job_completed, bs, NULL, &local_err);
-diff --git a/blockdev.c b/blockdev.c
-index 7793143d76..ce3fef924c 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -2802,7 +2802,7 @@ static BlockJob *do_backup_common(BackupCommon *backup,
-
- job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
- backup->sync, bmap, backup->bitmap_mode,
-- backup->compress,
-+ backup->compress, backup->discard_source,
- backup->filter_node_name,
- &perf,
- backup->on_source_error,
-diff --git a/include/block/block-common.h b/include/block/block-common.h
-index e15395f2cb..913a8b259c 100644
---- a/include/block/block-common.h
-+++ b/include/block/block-common.h
-@@ -234,6 +234,8 @@ typedef enum {
- read-write fails */
- #define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */
-
-+#define BDRV_O_CBW_DISCARD_SOURCE 0x80000 /* for copy-before-write filter */
-+
- #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
-
-
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index 8b41643bfa..bdc703bacd 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -27,6 +27,7 @@ typedef struct BlockCopyCallState BlockCopyCallState;
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
-+ bool discard_source,
- Error **errp);
-
- /* Function should be called prior any actual copy request */
-diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
-index 32f0f9858a..546f2b5532 100644
---- a/include/block/block_int-global-state.h
-+++ b/include/block/block_int-global-state.h
-@@ -189,7 +189,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- MirrorSyncMode sync_mode,
- BdrvDirtyBitmap *sync_bitmap,
- BitmapSyncMode bitmap_mode,
-- bool compress,
-+ bool compress, bool discard_source,
- const char *filter_node_name,
- BackupPerf *perf,
- BlockdevOnError on_source_error,
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 09de550c95..4297e5beda 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -1816,6 +1816,9 @@
- # node specified by @drive. If this option is not given, a node
- # name is autogenerated. (Since: 4.2)
- #
-+# @discard-source: Discard blocks on source which are already copied
-+# to the target. (Since 9.0)
-+#
- # @x-perf: Performance options. (Since 6.0)
- #
- # Features:
-@@ -1837,6 +1840,7 @@
- '*on-target-error': 'BlockdevOnError',
- '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
- '*filter-node-name': 'str',
-+ '*discard-source': 'bool',
- '*x-perf': { 'type': 'BackupPerf',
- 'features': [ 'unstable' ] } } }
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:28 +0200
+Subject: [PATCH] PVE backup: add fleecing option
+
+When a fleecing option is given, it is expected that each device has
+a corresponding "-fleecing" block device already attached, except for
+EFI disk and TPM state, where fleecing is never used.
+
+The following graph was adapted from [0] which also contains more
+details about fleecing.
+
+[guest]
+ |
+ | root
+ v file
+[copy-before-write]<------[snapshot-access]
+ | |
+ | file | target
+ v v
+[source] [fleecing]
+
+For fleecing, a copy-before-write filter is inserted on top of the
+source node, as well as a snapshot-access node pointing to the filter
+node which allows to read the consistent state of the image at the
+time it was inserted. New guest writes are passed through the
+copy-before-write filter which will first copy over old data to the
+fleecing image in case that old data is still needed by the
+snapshot-access node.
+
+The backup process will sequentially read from the snapshot access,
+which has a bitmap and knows whether to read from the original image
+or the fleecing image to get the "snapshot" state, i.e. data from the
+source image at the time when the copy-before-write filter was
+inserted. After reading, the copied sections are discarded from the
+fleecing image to reduce space usage.
+
+All of this can be restricted by an initial dirty bitmap to parts of
+the source image that are required for an incremental backup.
+
+For discard to work, it is necessary that the fleecing image does not
+have a larger cluster size than the backup job granularity. Since
+querying that size does not always work, e.g. for RBD with krbd, the
+cluster size will not be reported, a minimum of 4 MiB is used. A job
+with PBS target already has at least this granularity, so it's just
+relevant for other targets. I.e. edge cases where this minimum is not
+enough should be very rare in practice. If ever necessary in the
+future, can still add a passed-in value for the backup QMP command to
+override.
+
+Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
+are set when installing the copy-before-write filter and
+snapshot-access. When an error or timeout occurs, the problematic (and
+each further) snapshot operation will fail and thus cancel the backup
+instead of breaking the guest write.
+
+Note that job_id cannot be inferred from the snapshot-access bs because
+it has no parent, so just pass the one from the original bs.
+
+[0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/monitor/block-hmp-cmds.c | 1 +
+ pve-backup.c | 143 ++++++++++++++++++++++++++++++++-
+ qapi/block-core.json | 10 ++-
+ 3 files changed, 150 insertions(+), 4 deletions(-)
+
+diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
+index 5000c084c5..70b3de4c7e 100644
+--- a/block/monitor/block-hmp-cmds.c
++++ b/block/monitor/block-hmp-cmds.c
+@@ -1043,6 +1043,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
+ NULL, NULL,
+ devlist, qdict_haskey(qdict, "speed"), speed,
+ false, 0, // BackupPerf max-workers
++ false, false, // fleecing
+ &error);
+
+ hmp_handle_error(mon, error);
+diff --git a/pve-backup.c b/pve-backup.c
+index 9d480a8eec..7cc1dd3724 100644
+--- a/pve-backup.c
++++ b/pve-backup.c
+@@ -7,9 +7,11 @@
+ #include "sysemu/blockdev.h"
+ #include "block/block_int-global-state.h"
+ #include "block/blockjob.h"
++#include "block/copy-before-write.h"
+ #include "block/dirty-bitmap.h"
+ #include "block/graph-lock.h"
+ #include "qapi/qapi-commands-block.h"
++#include "qapi/qmp/qdict.h"
+ #include "qapi/qmp/qerror.h"
+ #include "qemu/cutils.h"
+
+@@ -81,8 +83,15 @@ static void pvebackup_init(void)
+ // initialize PVEBackupState at startup
+ opts_init(pvebackup_init);
+
++typedef struct PVEBackupFleecingInfo {
++ BlockDriverState *bs;
++ BlockDriverState *cbw;
++ BlockDriverState *snapshot_access;
++} PVEBackupFleecingInfo;
++
+ typedef struct PVEBackupDevInfo {
+ BlockDriverState *bs;
++ PVEBackupFleecingInfo fleecing;
+ size_t size;
+ uint64_t block_size;
+ uint8_t dev_id;
+@@ -355,6 +364,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+ PVEBackupDevInfo *di = opaque;
+ di->completed_ret = ret;
+
++ /*
++ * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
++ * won't be done as a coroutine anyways:
++ * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
++ * just spawn a BH calling bdrv_unref().
++ * - For cbw, draining would need to spawn a BH.
++ *
++ * Note that the AioContext lock is already acquired by our caller, i.e.
++ * job_finalize_single_locked()
++ */
++ if (di->fleecing.snapshot_access) {
++ bdrv_unref(di->fleecing.snapshot_access);
++ di->fleecing.snapshot_access = NULL;
++ }
++ if (di->fleecing.cbw) {
++ bdrv_cbw_drop(di->fleecing.cbw);
++ di->fleecing.cbw = NULL;
++ }
++
+ /*
+ * Needs to happen outside of coroutine, because it takes the graph write lock.
+ */
+@@ -522,9 +550,82 @@ static void create_backup_jobs_bh(void *opaque) {
+ }
+ bdrv_drained_begin(di->bs);
+
++ BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
++
++ BlockDriverState *source_bs = di->bs;
++ bool discard_source = false;
++ bdrv_graph_co_rdlock();
++ const char *job_id = bdrv_get_device_name(di->bs);
++ bdrv_graph_co_rdunlock();
++ if (di->fleecing.bs) {
++ QDict *cbw_opts = qdict_new();
++ qdict_put_str(cbw_opts, "driver", "copy-before-write");
++ qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
++ qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
++
++ if (di->bitmap) {
++ /*
++ * Only guest writes to parts relevant for the backup need to be intercepted with
++ * old data being copied to the fleecing image.
++ */
++ qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
++ qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
++ }
++ /*
++ * Fleecing storage is supposed to be fast and it's better to break backup than guest
++ * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
++ * abort a bit before that.
++ */
++ qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
++ qdict_put_int(cbw_opts, "cbw-timeout", 45);
++
++ di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
++
++ if (!di->fleecing.cbw) {
++ error_setg(errp, "appending cbw node for fleecing failed: %s",
++ local_err ? error_get_pretty(local_err) : "unknown error");
++ break;
++ }
++
++ QDict *snapshot_access_opts = qdict_new();
++ qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
++ qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
++
++ /*
++ * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
++ * will acquire it a second time. But it's allowed to be held exactly once when polling
++ * and that happens when the bdrv_refresh_total_sectors() call is made there.
++ */
++ di->fleecing.snapshot_access =
++ bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
++ if (!di->fleecing.snapshot_access) {
++ error_setg(errp, "setting up snapshot access for fleecing failed: %s",
++ local_err ? error_get_pretty(local_err) : "unknown error");
++ break;
++ }
++ source_bs = di->fleecing.snapshot_access;
++ discard_source = true;
++
++ /*
++ * bdrv_get_info() just returns 0 (= doesn't matter) for RBD when using krbd. But discard
++ * on the fleecing image won't work if the backup job's granularity is less than the RBD
++ * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
++ * target, the backup job granularity would already be at least this much.
++ */
++ perf.min_cluster_size = 4 * 1024 * 1024;
++ /*
++ * For discard to work, cluster size for the backup job must be at least the same as for
++ * the fleecing image.
++ */
++ BlockDriverInfo bdi;
++ if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
++ perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
++ }
++ }
++
+ BlockJob *job = backup_job_create(
+- NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
+- bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
++ job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
++ bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
+ BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
+ &local_err);
+
+@@ -580,6 +681,14 @@ static void create_backup_jobs_bh(void *opaque) {
+ aio_co_enter(data->ctx, data->co);
+ }
+
++/*
++ * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
++ */
++static bool device_uses_fleecing(const char *device_id)
++{
++ return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
++}
++
+ /*
+ * Returns a list of device infos, which needs to be freed by the caller. In
+ * case of an error, errp will be set, but the returned value might still be a
+@@ -587,6 +696,7 @@ static void create_backup_jobs_bh(void *opaque) {
+ */
+ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+ const char *devlist,
++ bool fleecing,
+ Error **errp)
+ {
+ gchar **devs = NULL;
+@@ -610,6 +720,31 @@ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+ }
+ PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
+ di->bs = bs;
++
++ if (fleecing && device_uses_fleecing(*d)) {
++ g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
++ BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
++ if (!fleecing_blk) {
++ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
++ "Device '%s' not found", fleecing_devid);
++ goto err;
++ }
++ BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
++ if (!bdrv_co_is_inserted(fleecing_bs)) {
++ error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
++ goto err;
++ }
++ /*
++ * Fleecing image needs to be the same size to act as a cbw target.
++ */
++ if (bs->total_sectors != fleecing_bs->total_sectors) {
++ error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
++ fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
++ goto err;
++ }
++ di->fleecing.bs = fleecing_bs;
++ }
++
+ di_list = g_list_append(di_list, di);
+ d++;
+ }
+@@ -659,6 +794,7 @@ UuidInfo coroutine_fn *qmp_backup(
+ const char *devlist,
+ bool has_speed, int64_t speed,
+ bool has_max_workers, int64_t max_workers,
++ bool has_fleecing, bool fleecing,
+ Error **errp)
+ {
+ assert(qemu_in_coroutine());
+@@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup(
+ format = has_format ? format : BACKUP_FORMAT_VMA;
+
+ bdrv_graph_co_rdlock();
+- di_list = get_device_info(devlist, &local_err);
++ di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
+ bdrv_graph_co_rdunlock();
+ if (local_err) {
+ error_propagate(errp, local_err);
+@@ -1095,5 +1231,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+ ret->query_bitmap_info = true;
+ ret->pbs_masterkey = true;
+ ret->backup_max_workers = true;
++ ret->backup_fleecing = true;
+ return ret;
+ }
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index 6e7ee87633..dc5f75cd39 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -948,6 +948,10 @@
+ #
+ # @max-workers: see @BackupPerf for details. Default 16.
+ #
++# @fleecing: perform a backup with fleecing. For each device in @devlist, a
++# corresponding '-fleecing' device with the same size already needs to
++# be present.
++#
+ # Returns: the uuid of the backup job
+ #
+ ##
+@@ -968,7 +972,8 @@
+ '*firewall-file': 'str',
+ '*devlist': 'str',
+ '*speed': 'int',
+- '*max-workers': 'int' },
++ '*max-workers': 'int',
++ '*fleecing': 'bool' },
+ 'returns': 'UuidInfo', 'coroutine': true }
+
+ ##
+@@ -1014,6 +1019,8 @@
+ #
+ # @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
+ #
++# @backup-fleecing: Whether backup fleecing is supported or not.
++#
+ # @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
+ # supported or not.
+ #
+@@ -1025,6 +1032,7 @@
+ 'pbs-dirty-bitmap-migration': 'bool',
+ 'pbs-masterkey': 'bool',
+ 'pbs-library-version': 'str',
++ 'backup-fleecing': 'bool',
+ 'backup-max-workers': 'bool' } }
+
+ ##
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:26 +0200
-Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
-
-Useful to make discard-source work in the context of backup fleecing
-when the fleecing image has a larger granularity than the backup
-target.
-
-Copy-before-write operations will use at least this granularity and in
-particular, discard requests to the source node will too. If the
-granularity is too small, they will just be aligned down in
-cbw_co_pdiscard_snapshot() and thus effectively ignored.
-
-The QAPI uses uint32 so the value will be non-negative, but still fit
-into a uint64_t.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/block-copy.c | 17 +++++++++++++----
- block/copy-before-write.c | 3 ++-
- include/block/block-copy.h | 1 +
- qapi/block-core.json | 8 +++++++-
- 4 files changed, 23 insertions(+), 6 deletions(-)
-
-diff --git a/block/block-copy.c b/block/block-copy.c
-index 3c61e52bae..c9a722a5a6 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
- }
-
- static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
-+ int64_t min_cluster_size,
- Error **errp)
- {
- int ret;
-@@ -330,7 +331,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- "used. If the actual block size of the target exceeds "
- "this default, the backup may be unusable",
- BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
-- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
-+ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
- } else if (ret < 0 && !target_does_cow) {
- error_setg_errno(errp, -ret,
- "Couldn't determine the cluster size of the target image, "
-@@ -340,16 +341,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- return ret;
- } else if (ret < 0 && target_does_cow) {
- /* Not fatal; just trudge on ahead. */
-- return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
-+ return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
- }
-
-- return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
-+ return MAX(min_cluster_size,
-+ MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
- }
-
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- bool discard_source,
-+ int64_t min_cluster_size,
- Error **errp)
- {
- ERRP_GUARD();
-@@ -358,7 +361,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BdrvDirtyBitmap *copy_bitmap;
- bool is_fleecing;
-
-- cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
-+ if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
-+ error_setg(errp, "min-cluster-size needs to be a power of 2");
-+ return NULL;
-+ }
-+
-+ cluster_size = block_copy_calculate_cluster_size(target->bs,
-+ min_cluster_size, errp);
- if (cluster_size < 0) {
- return NULL;
- }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 3503702d71..4a8c5bdb62 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -479,7 +479,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
-
- s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
- s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
-- flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
-+ flags & BDRV_O_CBW_DISCARD_SOURCE,
-+ opts->min_cluster_size, errp);
- if (!s->bcs) {
- error_prepend(errp, "Cannot create block-copy-state: ");
- ret = -EINVAL;
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index bdc703bacd..77857c6c68 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
- BlockDriverState *copy_bitmap_bs,
- const BdrvDirtyBitmap *bitmap,
- bool discard_source,
-+ int64_t min_cluster_size,
- Error **errp);
-
- /* Function should be called prior any actual copy request */
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 4297e5beda..33e7e3c090 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -4825,12 +4825,18 @@
- # @on-cbw-error parameter will decide how this failure is handled.
- # Default 0. (Since 7.1)
- #
-+# @min-cluster-size: Minimum size of blocks used by copy-before-write
-+# operations. Has to be a power of 2. No effect if smaller than
-+# the maximum of the target's cluster size and 64 KiB. Default 0.
-+# (Since 8.1)
-+#
- # Since: 6.2
- ##
- { 'struct': 'BlockdevOptionsCbw',
- 'base': 'BlockdevOptionsGenericFormat',
- 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
-- '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
-+ '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
-+ '*min-cluster-size': 'uint32' } }
-
- ##
- # @BlockdevOptions:
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Mon, 29 Apr 2024 14:43:58 +0200
+Subject: [PATCH] PVE backup: improve error when copy-before-write fails for
+ fleecing
+
+With fleecing, failure for copy-before-write does not fail the guest
+write, but only sets the snapshot error that is associated to the
+copy-before-write filter, making further requests to the snapshot
+access fail with EACCES, which then also fails the job. But that error
+code is not the root cause of why the backup failed, so bubble up the
+original snapshot error instead.
+
+Reported-by: Friedrich Weber <f.weber@proxmox.com>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Tested-by: Friedrich Weber <f.weber@proxmox.com>
+---
+ block/copy-before-write.c | 18 ++++++++++++------
+ block/copy-before-write.h | 1 +
+ pve-backup.c | 9 +++++++++
+ 3 files changed, 22 insertions(+), 6 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index bba58326d7..50cc4c7aae 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -27,6 +27,7 @@
+ #include "qapi/qmp/qjson.h"
+
+ #include "sysemu/block-backend.h"
++#include "qemu/atomic.h"
+ #include "qemu/cutils.h"
+ #include "qapi/error.h"
+ #include "block/block_int.h"
+@@ -74,7 +75,8 @@ typedef struct BDRVCopyBeforeWriteState {
+ * @snapshot_error is normally zero. But on first copy-before-write failure
+ * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes
+ * value of this error (<0). After that all in-flight and further
+- * snapshot-API requests will fail with that error.
++ * snapshot-API requests will fail with that error. To be accessed with
++ * atomics.
+ */
+ int snapshot_error;
+ } BDRVCopyBeforeWriteState;
+@@ -114,7 +116,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
+ return 0;
+ }
+
+- if (s->snapshot_error) {
++ if (qatomic_read(&s->snapshot_error)) {
+ return 0;
+ }
+
+@@ -138,9 +140,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
+ WITH_QEMU_LOCK_GUARD(&s->lock) {
+ if (ret < 0) {
+ assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT);
+- if (!s->snapshot_error) {
+- s->snapshot_error = ret;
+- }
++ qatomic_cmpxchg(&s->snapshot_error, 0, ret);
+ } else {
+ bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
+ }
+@@ -214,7 +214,7 @@ cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes,
+
+ QEMU_LOCK_GUARD(&s->lock);
+
+- if (s->snapshot_error) {
++ if (qatomic_read(&s->snapshot_error)) {
+ g_free(req);
+ return NULL;
+ }
+@@ -585,6 +585,12 @@ void bdrv_cbw_drop(BlockDriverState *bs)
+ bdrv_unref(bs);
+ }
+
++int bdrv_cbw_snapshot_error(BlockDriverState *bs)
++{
++ BDRVCopyBeforeWriteState *s = bs->opaque;
++ return qatomic_read(&s->snapshot_error);
++}
++
+ static void cbw_init(void)
+ {
+ bdrv_register(&bdrv_cbw_filter);
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index dc6cafe7fa..a27d2d7d9f 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -44,5 +44,6 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+ BlockCopyState **bcs,
+ Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
++int bdrv_cbw_snapshot_error(BlockDriverState *bs);
+
+ #endif /* COPY_BEFORE_WRITE_H */
+diff --git a/pve-backup.c b/pve-backup.c
+index 7cc1dd3724..07709aa350 100644
+--- a/pve-backup.c
++++ b/pve-backup.c
+@@ -379,6 +379,15 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+ di->fleecing.snapshot_access = NULL;
+ }
+ if (di->fleecing.cbw) {
++ /*
++ * With fleecing, failure for cbw does not fail the guest write, but only sets the snapshot
++ * error, making further requests to the snapshot fail with EACCES, which then also fail the
++ * job. But EACCES is not the root cause and just confusing, so report the snapshot error.
++ */
++ int snapshot_error = bdrv_cbw_snapshot_error(di->fleecing.cbw);
++ if (di->completed_ret == -EACCES && snapshot_error) {
++ di->completed_ret = snapshot_error;
++ }
+ bdrv_cbw_drop(di->fleecing.cbw);
+ di->fleecing.cbw = NULL;
+ }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:27 +0200
-Subject: [PATCH] backup: add minimum cluster size to performance options
-
-Useful to make discard-source work in the context of backup fleecing
-when the fleecing image has a larger granularity than the backup
-target.
-
-Backup/block-copy will use at least this granularity for copy operations
-and in particular, discard requests to the backup source will too. If
-the granularity is too small, they will just be aligned down in
-cbw_co_pdiscard_snapshot() and thus effectively ignored.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/backup.c | 2 +-
- block/copy-before-write.c | 2 ++
- block/copy-before-write.h | 1 +
- blockdev.c | 3 +++
- qapi/block-core.json | 9 +++++++--
- 5 files changed, 14 insertions(+), 3 deletions(-)
-
-diff --git a/block/backup.c b/block/backup.c
-index 3dc955f625..ac5bd81338 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -430,7 +430,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
- }
-
- cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
-- &bcs, errp);
-+ perf->min_cluster_size, &bcs, errp);
- if (!cbw) {
- goto error;
- }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 4a8c5bdb62..9ca5ec5e5c 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -555,6 +555,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
- bool discard_source,
-+ int64_t min_cluster_size,
- BlockCopyState **bcs,
- Error **errp)
- {
-@@ -573,6 +574,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- }
- qdict_put_str(opts, "file", bdrv_get_node_name(source));
- qdict_put_str(opts, "target", bdrv_get_node_name(target));
-+ qdict_put_int(opts, "min-cluster-size", min_cluster_size);
-
- top = bdrv_insert_node(source, opts, flags, errp);
- if (!top) {
-diff --git a/block/copy-before-write.h b/block/copy-before-write.h
-index 01af0cd3c4..dc6cafe7fa 100644
---- a/block/copy-before-write.h
-+++ b/block/copy-before-write.h
-@@ -40,6 +40,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
- BlockDriverState *target,
- const char *filter_node_name,
- bool discard_source,
-+ int64_t min_cluster_size,
- BlockCopyState **bcs,
- Error **errp);
- void bdrv_cbw_drop(BlockDriverState *bs);
-diff --git a/blockdev.c b/blockdev.c
-index ce3fef924c..5ae1dde73c 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -2729,6 +2729,9 @@ static BlockJob *do_backup_common(BackupCommon *backup,
- if (backup->x_perf->has_max_chunk) {
- perf.max_chunk = backup->x_perf->max_chunk;
- }
-+ if (backup->x_perf->has_min_cluster_size) {
-+ perf.min_cluster_size = backup->x_perf->min_cluster_size;
-+ }
- }
-
- if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) ||
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 33e7e3c090..58fd637e86 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -1757,11 +1757,16 @@
- # it should not be less than job cluster size which is calculated
- # as maximum of target image cluster size and 64k. Default 0.
- #
-+# @min-cluster-size: Minimum size of blocks used by copy-before-write
-+# and background copy operations. Has to be a power of 2. No
-+# effect if smaller than the maximum of the target's cluster size
-+# and 64 KiB. Default 0. (Since 8.1)
-+#
- # Since: 6.0
- ##
- { 'struct': 'BackupPerf',
-- 'data': { '*use-copy-range': 'bool',
-- '*max-workers': 'int', '*max-chunk': 'int64' } }
-+ 'data': { '*use-copy-range': 'bool', '*max-workers': 'int',
-+ '*max-chunk': 'int64', '*min-cluster-size': 'uint32' } }
-
- ##
- # @BackupCommon:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:28 +0200
-Subject: [PATCH] PVE backup: add fleecing option
-
-When a fleecing option is given, it is expected that each device has
-a corresponding "-fleecing" block device already attached, except for
-EFI disk and TPM state, where fleecing is never used.
-
-The following graph was adapted from [0] which also contains more
-details about fleecing.
-
-[guest]
- |
- | root
- v file
-[copy-before-write]<------[snapshot-access]
- | |
- | file | target
- v v
-[source] [fleecing]
-
-For fleecing, a copy-before-write filter is inserted on top of the
-source node, as well as a snapshot-access node pointing to the filter
-node which allows to read the consistent state of the image at the
-time it was inserted. New guest writes are passed through the
-copy-before-write filter which will first copy over old data to the
-fleecing image in case that old data is still needed by the
-snapshot-access node.
-
-The backup process will sequentially read from the snapshot access,
-which has a bitmap and knows whether to read from the original image
-or the fleecing image to get the "snapshot" state, i.e. data from the
-source image at the time when the copy-before-write filter was
-inserted. After reading, the copied sections are discarded from the
-fleecing image to reduce space usage.
-
-All of this can be restricted by an initial dirty bitmap to parts of
-the source image that are required for an incremental backup.
-
-For discard to work, it is necessary that the fleecing image does not
-have a larger cluster size than the backup job granularity. Since
-querying that size does not always work, e.g. for RBD with krbd, the
-cluster size will not be reported, a minimum of 4 MiB is used. A job
-with PBS target already has at least this granularity, so it's just
-relevant for other targets. I.e. edge cases where this minimum is not
-enough should be very rare in practice. If ever necessary in the
-future, can still add a passed-in value for the backup QMP command to
-override.
-
-Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
-are set when installing the copy-before-write filter and
-snapshot-access. When an error or timeout occurs, the problematic (and
-each further) snapshot operation will fail and thus cancel the backup
-instead of breaking the guest write.
-
-Note that job_id cannot be inferred from the snapshot-access bs because
-it has no parent, so just pass the one from the original bs.
-
-[0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/monitor/block-hmp-cmds.c | 1 +
- pve-backup.c | 143 ++++++++++++++++++++++++++++++++-
- qapi/block-core.json | 8 +-
- 3 files changed, 148 insertions(+), 4 deletions(-)
-
-diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
-index 6efe28cef5..ca29cc4281 100644
---- a/block/monitor/block-hmp-cmds.c
-+++ b/block/monitor/block-hmp-cmds.c
-@@ -1064,6 +1064,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
- NULL, NULL,
- devlist, qdict_haskey(qdict, "speed"), speed,
- false, 0, // BackupPerf max-workers
-+ false, false, // fleecing
- &error);
-
- hmp_handle_error(mon, error);
-diff --git a/pve-backup.c b/pve-backup.c
-index e6b17b797e..00aaff6509 100644
---- a/pve-backup.c
-+++ b/pve-backup.c
-@@ -7,8 +7,10 @@
- #include "sysemu/blockdev.h"
- #include "block/block_int-global-state.h"
- #include "block/blockjob.h"
-+#include "block/copy-before-write.h"
- #include "block/dirty-bitmap.h"
- #include "qapi/qapi-commands-block.h"
-+#include "qapi/qmp/qdict.h"
- #include "qapi/qmp/qerror.h"
- #include "qemu/cutils.h"
-
-@@ -80,8 +82,15 @@ static void pvebackup_init(void)
- // initialize PVEBackupState at startup
- opts_init(pvebackup_init);
-
-+typedef struct PVEBackupFleecingInfo {
-+ BlockDriverState *bs;
-+ BlockDriverState *cbw;
-+ BlockDriverState *snapshot_access;
-+} PVEBackupFleecingInfo;
-+
- typedef struct PVEBackupDevInfo {
- BlockDriverState *bs;
-+ PVEBackupFleecingInfo fleecing;
- size_t size;
- uint64_t block_size;
- uint8_t dev_id;
-@@ -361,6 +370,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
- PVEBackupDevInfo *di = opaque;
- di->completed_ret = ret;
-
-+ /*
-+ * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
-+ * won't be done as a coroutine anyways:
-+ * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
-+ * just spawn a BH calling bdrv_unref().
-+ * - For cbw, draining would need to spawn a BH.
-+ *
-+ * Note that the AioContext lock is already acquired by our caller, i.e.
-+ * job_finalize_single_locked()
-+ */
-+ if (di->fleecing.snapshot_access) {
-+ bdrv_unref(di->fleecing.snapshot_access);
-+ di->fleecing.snapshot_access = NULL;
-+ }
-+ if (di->fleecing.cbw) {
-+ bdrv_cbw_drop(di->fleecing.cbw);
-+ di->fleecing.cbw = NULL;
-+ }
-+
- /*
- * Schedule stream cleanup in async coroutine. close_image and finish might
- * take a while, so we can't block on them here. This way it also doesn't
-@@ -521,9 +549,82 @@ static void create_backup_jobs_bh(void *opaque) {
-
- bdrv_drained_begin(di->bs);
-
-+ BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
-+
-+ BlockDriverState *source_bs = di->bs;
-+ bool discard_source = false;
-+ const char *job_id = bdrv_get_device_name(di->bs);
-+ if (di->fleecing.bs) {
-+ QDict *cbw_opts = qdict_new();
-+ qdict_put_str(cbw_opts, "driver", "copy-before-write");
-+ qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
-+ qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
-+
-+ if (di->bitmap) {
-+ /*
-+ * Only guest writes to parts relevant for the backup need to be intercepted with
-+ * old data being copied to the fleecing image.
-+ */
-+ qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
-+ qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
-+ }
-+ /*
-+ * Fleecing storage is supposed to be fast and it's better to break backup than guest
-+ * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
-+ * abort a bit before that.
-+ */
-+ qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
-+ qdict_put_int(cbw_opts, "cbw-timeout", 45);
-+
-+ di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
-+
-+ if (!di->fleecing.cbw) {
-+ error_setg(errp, "appending cbw node for fleecing failed: %s",
-+ local_err ? error_get_pretty(local_err) : "unknown error");
-+ break;
-+ }
-+
-+ QDict *snapshot_access_opts = qdict_new();
-+ qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
-+ qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
-+
-+ /*
-+ * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
-+ * will aquire it a second time. But it's allowed to be held exactly once when polling
-+ * and that happens when the bdrv_refresh_total_sectors() call is made there.
-+ */
-+ aio_context_release(aio_context);
-+ di->fleecing.snapshot_access =
-+ bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
-+ aio_context_acquire(aio_context);
-+ if (!di->fleecing.snapshot_access) {
-+ error_setg(errp, "setting up snapshot access for fleecing failed: %s",
-+ local_err ? error_get_pretty(local_err) : "unknown error");
-+ break;
-+ }
-+ source_bs = di->fleecing.snapshot_access;
-+ discard_source = true;
-+
-+ /*
-+ * bdrv_get_info() just retuns 0 (= doesn't matter) for RBD when using krbd. But discard
-+ * on the fleecing image won't work if the backup job's granularity is less than the RBD
-+ * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
-+ * target, the backup job granularity would already be at least this much.
-+ */
-+ perf.min_cluster_size = 4 * 1024 * 1024;
-+ /*
-+ * For discard to work, cluster size for the backup job must be at least the same as for
-+ * the fleecing image.
-+ */
-+ BlockDriverInfo bdi;
-+ if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
-+ perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
-+ }
-+ }
-+
- BlockJob *job = backup_job_create(
-- NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
-- bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
-+ job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
-+ bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
- BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
- &local_err);
-
-@@ -581,6 +682,14 @@ static void create_backup_jobs_bh(void *opaque) {
- aio_co_enter(data->ctx, data->co);
- }
-
-+/*
-+ * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
-+ */
-+static bool device_uses_fleecing(const char *device_id)
-+{
-+ return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
-+}
-+
- /*
- * Returns a list of device infos, which needs to be freed by the caller. In
- * case of an error, errp will be set, but the returned value might still be a
-@@ -588,6 +697,7 @@ static void create_backup_jobs_bh(void *opaque) {
- */
- static GList coroutine_fn *get_device_info(
- const char *devlist,
-+ bool fleecing,
- Error **errp)
- {
- gchar **devs = NULL;
-@@ -611,6 +721,31 @@ static GList coroutine_fn *get_device_info(
- }
- PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
- di->bs = bs;
-+
-+ if (fleecing && device_uses_fleecing(*d)) {
-+ g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
-+ BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
-+ if (!fleecing_blk) {
-+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
-+ "Device '%s' not found", fleecing_devid);
-+ goto err;
-+ }
-+ BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
-+ if (!bdrv_co_is_inserted(fleecing_bs)) {
-+ error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
-+ goto err;
-+ }
-+ /*
-+ * Fleecing image needs to be the same size to act as a cbw target.
-+ */
-+ if (bs->total_sectors != fleecing_bs->total_sectors) {
-+ error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
-+ fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
-+ goto err;
-+ }
-+ di->fleecing.bs = fleecing_bs;
-+ }
-+
- di_list = g_list_append(di_list, di);
- d++;
- }
-@@ -660,6 +795,7 @@ UuidInfo coroutine_fn *qmp_backup(
- const char *devlist,
- bool has_speed, int64_t speed,
- bool has_max_workers, int64_t max_workers,
-+ bool has_fleecing, bool fleecing,
- Error **errp)
- {
- assert(qemu_in_coroutine());
-@@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup(
- /* Todo: try to auto-detect format based on file name */
- format = has_format ? format : BACKUP_FORMAT_VMA;
-
-- di_list = get_device_info(devlist, &local_err);
-+ di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- goto err;
-@@ -1086,5 +1222,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
- ret->query_bitmap_info = true;
- ret->pbs_masterkey = true;
- ret->backup_max_workers = true;
-+ ret->backup_fleecing = true;
- return ret;
- }
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 58fd637e86..0bc5f42677 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -933,6 +933,10 @@
- #
- # @max-workers: see @BackupPerf for details. Default 16.
- #
-+# @fleecing: perform a backup with fleecing. For each device in @devlist, a
-+# corresponing '-fleecing' device with the same size already needs to
-+# be present.
-+#
- # Returns: the uuid of the backup job
- #
- ##
-@@ -953,7 +957,8 @@
- '*firewall-file': 'str',
- '*devlist': 'str',
- '*speed': 'int',
-- '*max-workers': 'int' },
-+ '*max-workers': 'int',
-+ '*fleecing': 'bool' },
- 'returns': 'UuidInfo', 'coroutine': true }
-
- ##
-@@ -1009,6 +1014,7 @@
- 'pbs-dirty-bitmap-migration': 'bool',
- 'pbs-masterkey': 'bool',
- 'pbs-library-version': 'str',
-+ 'backup-fleecing': 'bool',
- 'backup-max-workers': 'bool' } }
-
- ##
extra/0001-monitor-qmp-fix-race-with-clients-disconnecting-earl.patch
extra/0002-scsi-megasas-Internal-cdbs-have-16-byte-length.patch
extra/0003-ide-avoid-potential-deadlock-when-draining-during-tr.patch
-extra/0004-migration-block-dirty-bitmap-fix-loading-bitmap-when.patch
-extra/0005-Revert-Revert-graph-lock-Disable-locking-for-now.patch
-extra/0006-migration-states-workaround-snapshot-performance-reg.patch
-extra/0007-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
-extra/0008-target-i386-the-sgx_epc_get_section-stub-is-reachabl.patch
-extra/0009-ui-clipboard-mark-type-as-not-available-when-there-i.patch
-extra/0010-virtio-scsi-Attach-event-vq-notifier-with-no_poll.patch
-extra/0011-virtio-Re-enable-notifications-after-drain.patch
-extra/0012-qemu_init-increase-NOFILE-soft-limit-on-POSIX.patch
-extra/0013-virtio-blk-avoid-using-ioeventfd-state-in-irqfd-cond.patch
+extra/0004-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
+extra/0005-block-copy-before-write-use-uint64_t-for-timeout-in-.patch
bitmap-mirror/0001-drive-mirror-add-support-for-sync-bitmap-mode-never.patch
bitmap-mirror/0002-drive-mirror-add-support-for-conditional-and-always-.patch
bitmap-mirror/0003-mirror-add-check-for-bitmap-mode-without-bitmap.patch
pve/0035-migration-block-dirty-bitmap-migrate-other-bitmaps-e.patch
pve/0036-PVE-fall-back-to-open-iscsi-initiatorname.patch
pve/0037-PVE-block-stream-increase-chunk-size.patch
-pve/0038-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch
-pve/0039-block-add-alloc-track-driver.patch
-pve/0040-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
-pve/0041-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
-pve/0042-Revert-block-rbd-implement-bdrv_co_block_status.patch
-pve/0043-alloc-track-fix-deadlock-during-drop.patch
-pve/0044-migration-for-snapshots-hold-the-BQL-during-setup-ca.patch
-pve/0045-savevm-async-don-t-hold-BQL-during-setup.patch
-pve/0046-block-copy-before-write-fix-permission.patch
-pve/0047-block-copy-before-write-support-unligned-snapshot-di.patch
-pve/0048-block-copy-before-write-create-block_copy-bitmap-in-.patch
-pve/0049-qapi-blockdev-backup-add-discard-source-parameter.patch
-pve/0050-copy-before-write-allow-specifying-minimum-cluster-s.patch
-pve/0051-backup-add-minimum-cluster-size-to-performance-optio.patch
-pve/0052-PVE-backup-add-fleecing-option.patch
+pve/0038-block-add-alloc-track-driver.patch
+pve/0039-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
+pve/0040-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
+pve/0041-Revert-block-rbd-implement-bdrv_co_block_status.patch
+pve/0042-alloc-track-error-out-when-auto-remove-is-not-set.patch
+pve/0043-alloc-track-avoid-seemingly-superfluous-child-permis.patch
+pve/0044-block-copy-before-write-fix-permission.patch
+pve/0045-block-copy-before-write-support-unligned-snapshot-di.patch
+pve/0046-block-copy-before-write-create-block_copy-bitmap-in-.patch
+pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch
+pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch
+pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch
+pve/0050-PVE-backup-add-fleecing-option.patch
+pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch
source-is-missing [roms/SLOF/*.oco]
+source-is-missing [linux-user/*/vdso-*.so]
-Subproject commit 20a1b341a0af1fef84cec9e521d33da0e8d9ecf3
+Subproject commit c25df57ae8f9fe1c72eee2dab37d76d904ac382e