]>
Commit | Line | Data |
---|---|---|
20209d8d TL |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Fiona Ebner <f.ebner@proxmox.com> | |
3 | Date: Thu, 11 Apr 2024 11:29:28 +0200 | |
4 | Subject: [PATCH] PVE backup: add fleecing option | |
5 | ||
6 | When a fleecing option is given, it is expected that each device has | |
7 | a corresponding "-fleecing" block device already attached, except for | |
8 | EFI disk and TPM state, where fleecing is never used. | |
9 | ||
10 | The following graph was adapted from [0] which also contains more | |
11 | details about fleecing. | |
12 | ||
13 | [guest] | |
14 | | | |
15 | | root | |
16 | v file | |
17 | [copy-before-write]<------[snapshot-access] | |
18 | | | | |
19 | | file | target | |
20 | v v | |
21 | [source] [fleecing] | |
22 | ||
23 | For fleecing, a copy-before-write filter is inserted on top of the | |
24 | source node, as well as a snapshot-access node pointing to the filter | |
25 | node which allows reading the consistent state of the image at the | |
26 | time it was inserted. New guest writes are passed through the | |
27 | copy-before-write filter which will first copy over old data to the | |
28 | fleecing image in case that old data is still needed by the | |
29 | snapshot-access node. | |
30 | ||
31 | The backup process will sequentially read from the snapshot access, | |
32 | which has a bitmap and knows whether to read from the original image | |
33 | or the fleecing image to get the "snapshot" state, i.e. data from the | |
34 | source image at the time when the copy-before-write filter was | |
35 | inserted. After reading, the copied sections are discarded from the | |
36 | fleecing image to reduce space usage. | |
37 | ||
38 | All of this can be restricted by an initial dirty bitmap to parts of | |
39 | the source image that are required for an incremental backup. | |
40 | ||
41 | For discard to work, it is necessary that the fleecing image does not | |
42 | have a larger cluster size than the backup job granularity. Since | |
43 | querying that size does not always work, e.g. for RBD with krbd, the | |
44 | cluster size will not be reported, a minimum of 4 MiB is used. A job | |
45 | with PBS target already has at least this granularity, so it's just | |
46 | relevant for other targets. I.e. edge cases where this minimum is not | |
47 | enough should be very rare in practice. If ever necessary in the | |
48 | future, a passed-in value for the backup QMP command can still be added to | |
49 | override. | |
50 | ||
51 | Additionally, the cbw-timeout and on-cbw-error=break-snapshot options | |
52 | are set when installing the copy-before-write filter and | |
53 | snapshot-access. When an error or timeout occurs, the problematic (and | |
54 | each further) snapshot operation will fail and thus cancel the backup | |
55 | instead of breaking the guest write. | |
56 | ||
57 | Note that job_id cannot be inferred from the snapshot-access bs because | |
58 | it has no parent, so just pass the one from the original bs. | |
59 | ||
60 | [0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html | |
61 | ||
62 | Signed-off-by: Fiona Ebner <f.ebner@proxmox.com> | |
63 | Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com> | |
64 | --- | |
65 | block/monitor/block-hmp-cmds.c | 1 + | |
66 | pve-backup.c | 143 ++++++++++++++++++++++++++++++++- | |
67 | qapi/block-core.json | 8 +- | |
68 | 3 files changed, 148 insertions(+), 4 deletions(-) | |
69 | ||
70 | diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c | |
71 | index 6efe28cef5..ca29cc4281 100644 | |
72 | --- a/block/monitor/block-hmp-cmds.c | |
73 | +++ b/block/monitor/block-hmp-cmds.c | |
74 | @@ -1064,6 +1064,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict) | |
75 | NULL, NULL, | |
76 | devlist, qdict_haskey(qdict, "speed"), speed, | |
77 | false, 0, // BackupPerf max-workers | |
78 | + false, false, // fleecing | |
79 | &error); | |
80 | ||
81 | hmp_handle_error(mon, error); | |
82 | diff --git a/pve-backup.c b/pve-backup.c | |
83 | index e6b17b797e..00aaff6509 100644 | |
84 | --- a/pve-backup.c | |
85 | +++ b/pve-backup.c | |
86 | @@ -7,8 +7,10 @@ | |
87 | #include "sysemu/blockdev.h" | |
88 | #include "block/block_int-global-state.h" | |
89 | #include "block/blockjob.h" | |
90 | +#include "block/copy-before-write.h" | |
91 | #include "block/dirty-bitmap.h" | |
92 | #include "qapi/qapi-commands-block.h" | |
93 | +#include "qapi/qmp/qdict.h" | |
94 | #include "qapi/qmp/qerror.h" | |
95 | #include "qemu/cutils.h" | |
96 | ||
97 | @@ -80,8 +82,15 @@ static void pvebackup_init(void) | |
98 | // initialize PVEBackupState at startup | |
99 | opts_init(pvebackup_init); | |
100 | ||
101 | +typedef struct PVEBackupFleecingInfo { | |
102 | + BlockDriverState *bs; | |
103 | + BlockDriverState *cbw; | |
104 | + BlockDriverState *snapshot_access; | |
105 | +} PVEBackupFleecingInfo; | |
106 | + | |
107 | typedef struct PVEBackupDevInfo { | |
108 | BlockDriverState *bs; | |
109 | + PVEBackupFleecingInfo fleecing; | |
110 | size_t size; | |
111 | uint64_t block_size; | |
112 | uint8_t dev_id; | |
113 | @@ -361,6 +370,25 @@ static void pvebackup_complete_cb(void *opaque, int ret) | |
114 | PVEBackupDevInfo *di = opaque; | |
115 | di->completed_ret = ret; | |
116 | ||
117 | + /* | |
118 | + * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work | |
119 | + * won't be done as a coroutine anyways: | |
120 | + * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would | |
121 | + * just spawn a BH calling bdrv_unref(). | |
122 | + * - For cbw, draining would need to spawn a BH. | |
123 | + * | |
124 | + * Note that the AioContext lock is already acquired by our caller, i.e. | |
125 | + * job_finalize_single_locked() | |
126 | + */ | |
127 | + if (di->fleecing.snapshot_access) { | |
128 | + bdrv_unref(di->fleecing.snapshot_access); | |
129 | + di->fleecing.snapshot_access = NULL; | |
130 | + } | |
131 | + if (di->fleecing.cbw) { | |
132 | + bdrv_cbw_drop(di->fleecing.cbw); | |
133 | + di->fleecing.cbw = NULL; | |
134 | + } | |
135 | + | |
136 | /* | |
137 | * Schedule stream cleanup in async coroutine. close_image and finish might | |
138 | * take a while, so we can't block on them here. This way it also doesn't | |
139 | @@ -521,9 +549,82 @@ static void create_backup_jobs_bh(void *opaque) { | |
140 | ||
141 | bdrv_drained_begin(di->bs); | |
142 | ||
143 | + BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers }; | |
144 | + | |
145 | + BlockDriverState *source_bs = di->bs; | |
146 | + bool discard_source = false; | |
147 | + const char *job_id = bdrv_get_device_name(di->bs); | |
148 | + if (di->fleecing.bs) { | |
149 | + QDict *cbw_opts = qdict_new(); | |
150 | + qdict_put_str(cbw_opts, "driver", "copy-before-write"); | |
151 | + qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs)); | |
152 | + qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs)); | |
153 | + | |
154 | + if (di->bitmap) { | |
155 | + /* | |
156 | + * Only guest writes to parts relevant for the backup need to be intercepted with | |
157 | + * old data being copied to the fleecing image. | |
158 | + */ | |
159 | + qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs)); | |
160 | + qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap)); | |
161 | + } | |
162 | + /* | |
163 | + * Fleecing storage is supposed to be fast and it's better to break backup than guest | |
164 | + * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so | |
165 | + * abort a bit before that. | |
166 | + */ | |
167 | + qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot"); | |
168 | + qdict_put_int(cbw_opts, "cbw-timeout", 45); | |
169 | + | |
170 | + di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err); | |
171 | + | |
172 | + if (!di->fleecing.cbw) { | |
173 | + error_setg(errp, "appending cbw node for fleecing failed: %s", | |
174 | + local_err ? error_get_pretty(local_err) : "unknown error"); | |
175 | + break; | |
176 | + } | |
177 | + | |
178 | + QDict *snapshot_access_opts = qdict_new(); | |
179 | + qdict_put_str(snapshot_access_opts, "driver", "snapshot-access"); | |
180 | + qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw)); | |
181 | + | |
182 | + /* | |
183 | + * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver() | |
184 | + * will acquire it a second time. But it's allowed to be held exactly once when polling | |
185 | + * and that happens when the bdrv_refresh_total_sectors() call is made there. | |
186 | + */ | |
187 | + aio_context_release(aio_context); | |
188 | + di->fleecing.snapshot_access = | |
189 | + bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err); | |
190 | + aio_context_acquire(aio_context); | |
191 | + if (!di->fleecing.snapshot_access) { | |
192 | + error_setg(errp, "setting up snapshot access for fleecing failed: %s", | |
193 | + local_err ? error_get_pretty(local_err) : "unknown error"); | |
194 | + break; | |
195 | + } | |
196 | + source_bs = di->fleecing.snapshot_access; | |
197 | + discard_source = true; | |
198 | + | |
199 | + /* | |
200 | + * bdrv_get_info() just returns 0 (= doesn't matter) for RBD when using krbd. But discard | |
201 | + * on the fleecing image won't work if the backup job's granularity is less than the RBD | |
202 | + * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS | |
203 | + * target, the backup job granularity would already be at least this much. | |
204 | + */ | |
205 | + perf.min_cluster_size = 4 * 1024 * 1024; | |
206 | + /* | |
207 | + * For discard to work, cluster size for the backup job must be at least the same as for | |
208 | + * the fleecing image. | |
209 | + */ | |
210 | + BlockDriverInfo bdi; | |
211 | + if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) { | |
212 | + perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size); | |
213 | + } | |
214 | + } | |
215 | + | |
216 | BlockJob *job = backup_job_create( | |
217 | - NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap, | |
218 | - bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT, | |
219 | + job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap, | |
220 | + bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT, | |
221 | BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn, | |
222 | &local_err); | |
223 | ||
224 | @@ -581,6 +682,14 @@ static void create_backup_jobs_bh(void *opaque) { | |
225 | aio_co_enter(data->ctx, data->co); | |
226 | } | |
227 | ||
228 | +/* | |
229 | + * EFI disk and TPM state are small and it's just not worth setting up fleecing for them. | |
230 | + */ | |
231 | +static bool device_uses_fleecing(const char *device_id) | |
232 | +{ | |
233 | + return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14); | |
234 | +} | |
235 | + | |
236 | /* | |
237 | * Returns a list of device infos, which needs to be freed by the caller. In | |
238 | * case of an error, errp will be set, but the returned value might still be a | |
239 | @@ -588,6 +697,7 @@ static void create_backup_jobs_bh(void *opaque) { | |
240 | */ | |
241 | static GList coroutine_fn *get_device_info( | |
242 | const char *devlist, | |
243 | + bool fleecing, | |
244 | Error **errp) | |
245 | { | |
246 | gchar **devs = NULL; | |
247 | @@ -611,6 +721,31 @@ static GList coroutine_fn *get_device_info( | |
248 | } | |
249 | PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1); | |
250 | di->bs = bs; | |
251 | + | |
252 | + if (fleecing && device_uses_fleecing(*d)) { | |
253 | + g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL); | |
254 | + BlockBackend *fleecing_blk = blk_by_name(fleecing_devid); | |
255 | + if (!fleecing_blk) { | |
256 | + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, | |
257 | + "Device '%s' not found", fleecing_devid); | |
258 | + goto err; | |
259 | + } | |
260 | + BlockDriverState *fleecing_bs = blk_bs(fleecing_blk); | |
261 | + if (!bdrv_co_is_inserted(fleecing_bs)) { | |
262 | + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid); | |
263 | + goto err; | |
264 | + } | |
265 | + /* | |
266 | + * Fleecing image needs to be the same size to act as a cbw target. | |
267 | + */ | |
268 | + if (bs->total_sectors != fleecing_bs->total_sectors) { | |
269 | + error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld", | |
270 | + fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors); | |
271 | + goto err; | |
272 | + } | |
273 | + di->fleecing.bs = fleecing_bs; | |
274 | + } | |
275 | + | |
276 | di_list = g_list_append(di_list, di); | |
277 | d++; | |
278 | } | |
279 | @@ -660,6 +795,7 @@ UuidInfo coroutine_fn *qmp_backup( | |
280 | const char *devlist, | |
281 | bool has_speed, int64_t speed, | |
282 | bool has_max_workers, int64_t max_workers, | |
283 | + bool has_fleecing, bool fleecing, | |
284 | Error **errp) | |
285 | { | |
286 | assert(qemu_in_coroutine()); | |
287 | @@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup( | |
288 | /* Todo: try to auto-detect format based on file name */ | |
289 | format = has_format ? format : BACKUP_FORMAT_VMA; | |
290 | ||
291 | - di_list = get_device_info(devlist, &local_err); | |
292 | + di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err); | |
293 | if (local_err) { | |
294 | error_propagate(errp, local_err); | |
295 | goto err; | |
296 | @@ -1086,5 +1222,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp) | |
297 | ret->query_bitmap_info = true; | |
298 | ret->pbs_masterkey = true; | |
299 | ret->backup_max_workers = true; | |
300 | + ret->backup_fleecing = true; | |
301 | return ret; | |
302 | } | |
303 | diff --git a/qapi/block-core.json b/qapi/block-core.json | |
304 | index 58fd637e86..0bc5f42677 100644 | |
305 | --- a/qapi/block-core.json | |
306 | +++ b/qapi/block-core.json | |
307 | @@ -933,6 +933,10 @@ | |
308 | # | |
309 | # @max-workers: see @BackupPerf for details. Default 16. | |
310 | # | |
311 | +# @fleecing: perform a backup with fleecing. For each device in @devlist, a | |
312 | +# corresponding '-fleecing' device with the same size already needs to | |
313 | +# be present. | |
314 | +# | |
315 | # Returns: the uuid of the backup job | |
316 | # | |
317 | ## | |
318 | @@ -953,7 +957,8 @@ | |
319 | '*firewall-file': 'str', | |
320 | '*devlist': 'str', | |
321 | '*speed': 'int', | |
322 | - '*max-workers': 'int' }, | |
323 | + '*max-workers': 'int', | |
324 | + '*fleecing': 'bool' }, | |
325 | 'returns': 'UuidInfo', 'coroutine': true } | |
326 | ||
327 | ## | |
328 | @@ -1009,6 +1014,7 @@ | |
329 | 'pbs-dirty-bitmap-migration': 'bool', | |
330 | 'pbs-masterkey': 'bool', | |
331 | 'pbs-library-version': 'str', | |
332 | + 'backup-fleecing': 'bool', | |
333 | 'backup-max-workers': 'bool' } } | |
334 | ||
335 | ## |