]>
Commit | Line | Data |
---|---|---|
20209d8d TL |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Fiona Ebner <f.ebner@proxmox.com> | |
3 | Date: Thu, 11 Apr 2024 11:29:28 +0200 | |
4 | Subject: [PATCH] PVE backup: add fleecing option | |
5 | ||
6 | When a fleecing option is given, it is expected that each device has | |
7 | a corresponding "-fleecing" block device already attached, except for | |
8 | EFI disk and TPM state, where fleecing is never used. | |
9 | ||
10 | The following graph was adapted from [0] which also contains more | |
11 | details about fleecing. | |
12 | ||
13 | [guest] | |
14 | | | |
15 | | root | |
16 | v file | |
17 | [copy-before-write]<------[snapshot-access] | |
18 | | | | |
19 | | file | target | |
20 | v v | |
21 | [source] [fleecing] | |
22 | ||
23 | For fleecing, a copy-before-write filter is inserted on top of the | |
24 | source node, as well as a snapshot-access node pointing to the filter | |
25 | node which allows reading the consistent state of the image at the | |
26 | time it was inserted. New guest writes are passed through the | |
27 | copy-before-write filter which will first copy over old data to the | |
28 | fleecing image in case that old data is still needed by the | |
29 | snapshot-access node. | |
30 | ||
31 | The backup process will sequentially read from the snapshot access, | |
32 | which has a bitmap and knows whether to read from the original image | |
33 | or the fleecing image to get the "snapshot" state, i.e. data from the | |
34 | source image at the time when the copy-before-write filter was | |
35 | inserted. After reading, the copied sections are discarded from the | |
36 | fleecing image to reduce space usage. | |
37 | ||
38 | All of this can be restricted by an initial dirty bitmap to parts of | |
39 | the source image that are required for an incremental backup. | |
40 | ||
41 | For discard to work, it is necessary that the fleecing image does not | |
42 | have a larger cluster size than the backup job granularity. Since | |
43 | querying that size does not always work, e.g. for RBD with krbd, the | |
44 | cluster size will not be reported, a minimum of 4 MiB is used. A job | |
45 | with PBS target already has at least this granularity, so it's just | |
46 | relevant for other targets. I.e. edge cases where this minimum is not | |
47 | enough should be very rare in practice. If ever necessary in the | |
48 | future, a passed-in value for the backup QMP command can still be added to | |
49 | override. | |
50 | ||
51 | Additionally, the cbw-timeout and on-cbw-error=break-snapshot options | |
52 | are set when installing the copy-before-write filter and | |
53 | snapshot-access. When an error or timeout occurs, the problematic (and | |
54 | each further) snapshot operation will fail and thus cancel the backup | |
55 | instead of breaking the guest write. | |
56 | ||
57 | Note that job_id cannot be inferred from the snapshot-access bs because | |
58 | it has no parent, so just pass the one from the original bs. | |
59 | ||
60 | [0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html | |
61 | ||
62 | Signed-off-by: Fiona Ebner <f.ebner@proxmox.com> | |
63 | Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com> | |
64 | --- | |
65 | block/monitor/block-hmp-cmds.c | 1 + | |
66 | pve-backup.c | 143 ++++++++++++++++++++++++++++++++- | |
67 | qapi/block-core.json | 8 +- | |
68 | 3 files changed, 148 insertions(+), 4 deletions(-) | |
69 | ||
70 | diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c | |
71 | index 6efe28cef5..ca29cc4281 100644 | |
72 | --- a/block/monitor/block-hmp-cmds.c | |
73 | +++ b/block/monitor/block-hmp-cmds.c | |
74 | @@ -1064,6 +1064,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict) | |
75 | NULL, NULL, | |
76 | devlist, qdict_haskey(qdict, "speed"), speed, | |
77 | false, 0, // BackupPerf max-workers | |
78 | + false, false, // fleecing | |
79 | &error); | |
80 | ||
81 | hmp_handle_error(mon, error); | |
82 | diff --git a/pve-backup.c b/pve-backup.c | |
83 | index e6b17b797e..00aaff6509 100644 | |
84 | --- a/pve-backup.c | |
85 | +++ b/pve-backup.c | |
86 | @@ -7,8 +7,10 @@ | |
87 | #include "sysemu/blockdev.h" | |
88 | #include "block/block_int-global-state.h" | |
89 | #include "block/blockjob.h" | |
90 | +#include "block/copy-before-write.h" | |
91 | #include "block/dirty-bitmap.h" | |
92 | #include "qapi/qapi-commands-block.h" | |
93 | +#include "qapi/qmp/qdict.h" | |
94 | #include "qapi/qmp/qerror.h" | |
95 | #include "qemu/cutils.h" | |
96 | ||
97 | @@ -80,8 +82,15 @@ static void pvebackup_init(void) | |
98 | // initialize PVEBackupState at startup | |
99 | opts_init(pvebackup_init); | |
100 | ||
101 | +typedef struct PVEBackupFleecingInfo { | |
102 | + BlockDriverState *bs; | |
103 | + BlockDriverState *cbw; | |
104 | + BlockDriverState *snapshot_access; | |
105 | +} PVEBackupFleecingInfo; | |
106 | + | |
107 | typedef struct PVEBackupDevInfo { | |
108 | BlockDriverState *bs; | |
109 | + PVEBackupFleecingInfo fleecing; | |
110 | size_t size; | |
111 | uint64_t block_size; | |
112 | uint8_t dev_id; | |
113 | @@ -361,6 +370,25 @@ static void pvebackup_complete_cb(void *opaque, int ret) | |
114 | PVEBackupDevInfo *di = opaque; | |
115 | di->completed_ret = ret; | |
116 | ||
117 | + /* | |
118 | + * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work | |
119 | + * won't be done as a coroutine anyways: | |
120 | + * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would | |
121 | + * just spawn a BH calling bdrv_unref(). | |
122 | + * - For cbw, draining would need to spawn a BH. | |
123 | + * | |
124 | + * Note that the AioContext lock is already acquired by our caller, i.e. | |
125 | + * job_finalize_single_locked() | |
126 | + */ | |
127 | + if (di->fleecing.snapshot_access) { | |
128 | + bdrv_unref(di->fleecing.snapshot_access); | |
129 | + di->fleecing.snapshot_access = NULL; | |
130 | + } | |
131 | + if (di->fleecing.cbw) { | |
132 | + bdrv_cbw_drop(di->fleecing.cbw); | |
133 | + di->fleecing.cbw = NULL; | |
134 | + } | |
135 | + | |
136 | /* | |
137 | * Schedule stream cleanup in async coroutine. close_image and finish might | |
138 | * take a while, so we can't block on them here. This way it also doesn't | |
139 | @@ -521,9 +549,82 @@ static void create_backup_jobs_bh(void *opaque) { | |
140 | ||
141 | bdrv_drained_begin(di->bs); | |
142 | ||
143 | + BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers }; | |
144 | + | |
145 | + BlockDriverState *source_bs = di->bs; | |
146 | + bool discard_source = false; | |
147 | + const char *job_id = bdrv_get_device_name(di->bs); | |
148 | + if (di->fleecing.bs) { | |
149 | + QDict *cbw_opts = qdict_new(); | |
150 | + qdict_put_str(cbw_opts, "driver", "copy-before-write"); | |
151 | + qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs)); | |
152 | + qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs)); | |
153 | + | |
154 | + if (di->bitmap) { | |
155 | + /* | |
156 | + * Only guest writes to parts relevant for the backup need to be intercepted with | |
157 | + * old data being copied to the fleecing image. | |
158 | + */ | |
159 | + qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs)); | |
160 | + qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap)); | |
161 | + } | |
162 | + /* | |
163 | + * Fleecing storage is supposed to be fast and it's better to break backup than guest | |
164 | + * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so | |
165 | + * abort a bit before that. | |
166 | + */ | |
167 | + qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot"); | |
168 | + qdict_put_int(cbw_opts, "cbw-timeout", 45); | |
169 | + | |
170 | + di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err); | |
171 | + | |
172 | + if (!di->fleecing.cbw) { | |
173 | + error_setg(errp, "appending cbw node for fleecing failed: %s", | |
174 | + local_err ? error_get_pretty(local_err) : "unknown error"); | |
175 | + break; | |
176 | + } | |
177 | + | |
178 | + QDict *snapshot_access_opts = qdict_new(); | |
179 | + qdict_put_str(snapshot_access_opts, "driver", "snapshot-access"); | |
180 | + qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw)); | |
181 | + | |
182 | + /* | |
183 | + * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver() | |
184 | + * will acquire it a second time. But it's allowed to be held exactly once when polling | |
185 | + * and that happens when the bdrv_refresh_total_sectors() call is made there. | |
186 | + */ | |
187 | + aio_context_release(aio_context); | |
188 | + di->fleecing.snapshot_access = | |
189 | + bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err); | |
190 | + aio_context_acquire(aio_context); | |
191 | + if (!di->fleecing.snapshot_access) { | |
192 | + error_setg(errp, "setting up snapshot access for fleecing failed: %s", | |
193 | + local_err ? error_get_pretty(local_err) : "unknown error"); | |
194 | + break; | |
195 | + } | |
196 | + source_bs = di->fleecing.snapshot_access; | |
197 | + discard_source = true; | |
198 | + | |
199 | + /* | |
200 | + * bdrv_get_info() just returns 0 (= doesn't matter) for RBD when using krbd. But discard | |
201 | + * on the fleecing image won't work if the backup job's granularity is less than the RBD | |
202 | + * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS | |
203 | + * target, the backup job granularity would already be at least this much. | |
204 | + */ | |
205 | + perf.min_cluster_size = 4 * 1024 * 1024; | |
206 | + /* | |
207 | + * For discard to work, cluster size for the backup job must be at least the same as for | |
208 | + * the fleecing image. | |
209 | + */ | |
210 | + BlockDriverInfo bdi; | |
211 | + if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) { | |
212 | + perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size); | |
213 | + } | |
214 | + } | |
215 | + | |
216 | BlockJob *job = backup_job_create( | |
217 | - NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap, | |
218 | - bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT, | |
219 | + job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap, | |
220 | + bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT, | |
221 | BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn, | |
222 | &local_err); | |
223 | ||
224 | @@ -581,6 +682,14 @@ static void create_backup_jobs_bh(void *opaque) { | |
225 | aio_co_enter(data->ctx, data->co); | |
226 | } | |
227 | ||
228 | +/* | |
229 | + * EFI disk and TPM state are small and it's just not worth setting up fleecing for them. | |
230 | + */ | |
231 | +static bool device_uses_fleecing(const char *device_id) | |
232 | +{ | |
233 | + return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14); | |
234 | +} | |
235 | + | |
236 | /* | |
237 | * Returns a list of device infos, which needs to be freed by the caller. In | |
238 | * case of an error, errp will be set, but the returned value might still be a | |
239 | @@ -588,6 +697,7 @@ static void create_backup_jobs_bh(void *opaque) { | |
240 | */ | |
241 | static GList coroutine_fn *get_device_info( | |
242 | const char *devlist, | |
243 | + bool fleecing, | |
244 | Error **errp) | |
245 | { | |
246 | gchar **devs = NULL; | |
247 | @@ -611,6 +721,31 @@ static GList coroutine_fn *get_device_info( | |
248 | } | |
249 | PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1); | |
250 | di->bs = bs; | |
251 | + | |
252 | + if (fleecing && device_uses_fleecing(*d)) { | |
253 | + g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL); | |
254 | + BlockBackend *fleecing_blk = blk_by_name(fleecing_devid); | |
255 | + if (!fleecing_blk) { | |
256 | + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, | |
257 | + "Device '%s' not found", fleecing_devid); | |
258 | + goto err; | |
259 | + } | |
260 | + BlockDriverState *fleecing_bs = blk_bs(fleecing_blk); | |
261 | + if (!bdrv_co_is_inserted(fleecing_bs)) { | |
262 | + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid); | |
263 | + goto err; | |
264 | + } | |
265 | + /* | |
266 | + * Fleecing image needs to be the same size to act as a cbw target. | |
267 | + */ | |
268 | + if (bs->total_sectors != fleecing_bs->total_sectors) { | |
269 | + error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld", | |
270 | + fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors); | |
271 | + goto err; | |
272 | + } | |
273 | + di->fleecing.bs = fleecing_bs; | |
274 | + } | |
275 | + | |
276 | di_list = g_list_append(di_list, di); | |
277 | d++; | |
278 | } | |
279 | @@ -660,6 +795,7 @@ UuidInfo coroutine_fn *qmp_backup( | |
280 | const char *devlist, | |
281 | bool has_speed, int64_t speed, | |
282 | bool has_max_workers, int64_t max_workers, | |
283 | + bool has_fleecing, bool fleecing, | |
284 | Error **errp) | |
285 | { | |
286 | assert(qemu_in_coroutine()); | |
287 | @@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup( | |
288 | /* Todo: try to auto-detect format based on file name */ | |
289 | format = has_format ? format : BACKUP_FORMAT_VMA; | |
290 | ||
291 | - di_list = get_device_info(devlist, &local_err); | |
292 | + di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err); | |
293 | if (local_err) { | |
294 | error_propagate(errp, local_err); | |
295 | goto err; | |
296 | @@ -1086,5 +1222,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp) | |
297 | ret->query_bitmap_info = true; | |
298 | ret->pbs_masterkey = true; | |
299 | ret->backup_max_workers = true; | |
300 | + ret->backup_fleecing = true; | |
301 | return ret; | |
302 | } | |
303 | diff --git a/qapi/block-core.json b/qapi/block-core.json | |
304 | index 58fd637e86..0bc5f42677 100644 | |
305 | --- a/qapi/block-core.json | |
306 | +++ b/qapi/block-core.json | |
307 | @@ -933,6 +933,10 @@ | |
308 | # | |
309 | # @max-workers: see @BackupPerf for details. Default 16. | |
310 | # | |
311 | +# @fleecing: perform a backup with fleecing. For each device in @devlist, a | |
312 | +# corresponding '-fleecing' device with the same size already needs to | |
313 | +# be present. | |
314 | +# | |
315 | # Returns: the uuid of the backup job | |
316 | # | |
317 | ## | |
318 | @@ -953,7 +957,8 @@ | |
319 | '*firewall-file': 'str', | |
320 | '*devlist': 'str', | |
321 | '*speed': 'int', | |
322 | - '*max-workers': 'int' }, | |
323 | + '*max-workers': 'int', | |
324 | + '*fleecing': 'bool' }, | |
325 | 'returns': 'UuidInfo', 'coroutine': true } | |
326 | ||
327 | ## | |
328 | @@ -1009,6 +1014,7 @@ | |
329 | 'pbs-dirty-bitmap-migration': 'bool', | |
330 | 'pbs-masterkey': 'bool', | |
331 | 'pbs-library-version': 'str', | |
332 | + 'backup-fleecing': 'bool', | |
333 | 'backup-max-workers': 'bool' } } | |
334 | ||
335 | ## |