]> git.proxmox.com Git - pve-qemu.git/blob - debian/patches/pve/0038-block-add-alloc-track-driver.patch
update submodule and patches to QEMU 9.0.0
[pve-qemu.git] / debian / patches / pve / 0038-block-add-alloc-track-driver.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Stefan Reiter <s.reiter@proxmox.com>
3 Date: Mon, 7 Dec 2020 15:21:03 +0100
4 Subject: [PATCH] block: add alloc-track driver
5
6 Add a new filter node 'alloc-track', which separates reads and writes to
7 different children, thus allowing to put a backing image behind any
8 blockdev (regardless of driver support). Since we can't detect any
9 pre-allocated blocks, we can only track new writes, hence the write
10 target ('file') for this node must always be empty.
11
12 Intended use case is for live restoring, i.e. add a backup image as a
13 block device into a VM, then put an alloc-track on the restore target
14 and set the backup as backing. With this, one can use a regular
15 'block-stream' to restore the image, while the VM can already run in the
16 background. Copy-on-read will help make progress as the VM reads as
17 well.
18
19 This only worked if the target supports backing images, so up until now
20 only for qcow2, with alloc-track any driver for the target can be used.
21
22 Replacing the node cannot be done in the
23 track_co_change_backing_file() callback, because replacing a node
24 cannot happen in a coroutine and requires the block graph lock
25 exclusively. Could either become a special option for the stream job,
26 or maybe the upcoming blockdev-replace QMP command can be used in the
27 future.
28
29 Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
30 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
31 [FE: adapt to changed function signatures
32 make error return value consistent with QEMU
33 avoid premature break during read
34 adhere to block graph lock requirements]
35 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
36 ---
37 block/alloc-track.c | 366 ++++++++++++++++++++++++++++++++++++++++++++
38 block/meson.build | 1 +
39 block/stream.c | 34 ++++
40 3 files changed, 401 insertions(+)
41 create mode 100644 block/alloc-track.c
42
43 diff --git a/block/alloc-track.c b/block/alloc-track.c
44 new file mode 100644
45 index 0000000000..b9f8ea9137
46 --- /dev/null
47 +++ b/block/alloc-track.c
48 @@ -0,0 +1,366 @@
49 +/*
50 + * Node to allow backing images to be applied to any node. Assumes a blank
51 + * image to begin with, only new writes are tracked as allocated, thus this
52 + * must never be put on a node that already contains data.
53 + *
54 + * Copyright (c) 2020 Proxmox Server Solutions GmbH
55 + * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
56 + *
57 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
58 + * See the COPYING file in the top-level directory.
59 + */
60 +
61 +#include "qemu/osdep.h"
62 +#include "qapi/error.h"
63 +#include "block/block_int.h"
64 +#include "block/dirty-bitmap.h"
65 +#include "block/graph-lock.h"
66 +#include "qapi/qmp/qdict.h"
67 +#include "qapi/qmp/qstring.h"
68 +#include "qemu/cutils.h"
69 +#include "qemu/error-report.h"
70 +#include "qemu/option.h"
71 +#include "qemu/module.h"
72 +#include "sysemu/block-backend.h"
73 +
74 +#define TRACK_OPT_AUTO_REMOVE "auto-remove"
75 +
/*
 * Tracks whether the node is in the process of being dropped from the graph.
 * track_child_perm() stops requesting any permissions on its children once
 * the state is DropInProgress.
 */
typedef enum DropState {
    DropNone,       /* normal operation, initial state set by track_open() */
    DropInProgress, /* set when the backing file is being detached */
} DropState;
80 +
/* Per-node driver state, stored in bs->opaque. */
typedef struct {
    /* dirty bitmap created on the 'file' child's node in track_open() */
    BdrvDirtyBitmap *bitmap;
    /* bitmap granularity in bytes; also used as minimum request alignment */
    uint64_t granularity;
    /* whether this node is currently giving up its child permissions */
    DropState drop_state;
    /*
     * Value of the 'auto-remove' option. NOTE(review): only written in
     * track_open(); no reader is visible in this file.
     */
    bool auto_remove;
} BDRVAllocTrackState;
87 +
88 +static QemuOptsList runtime_opts = {
89 + .name = "alloc-track",
90 + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
91 + .desc = {
92 + {
93 + .name = TRACK_OPT_AUTO_REMOVE,
94 + .type = QEMU_OPT_BOOL,
95 + .help = "automatically replace this node with 'file' when 'backing'"
96 + "is detached",
97 + },
98 + { /* end of list */ }
99 + },
100 +};
101 +
102 +static void GRAPH_RDLOCK
103 +track_refresh_limits(BlockDriverState *bs, Error **errp)
104 +{
105 + BDRVAllocTrackState *s = bs->opaque;
106 +
107 + if (!bs->file) {
108 + return;
109 + }
110 +
111 + /*
112 + * Always use alignment from underlying write device so RMW cycle for
113 + * bdrv_pwritev reads data from our backing via track_co_preadv. Also use at
114 + * least the bitmap granularity.
115 + */
116 + bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
117 + s->granularity);
118 +}
119 +
120 +static int track_open(BlockDriverState *bs, QDict *options, int flags,
121 + Error **errp)
122 +{
123 + BDRVAllocTrackState *s = bs->opaque;
124 + BdrvChild *file = NULL;
125 + QemuOpts *opts;
126 + Error *local_err = NULL;
127 + int ret = 0;
128 +
129 + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
130 + qemu_opts_absorb_qdict(opts, options, &local_err);
131 + if (local_err) {
132 + error_propagate(errp, local_err);
133 + ret = -EINVAL;
134 + goto fail;
135 + }
136 +
137 + s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
138 +
139 + /* open the target (write) node, backing will be attached by block layer */
140 + file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
141 + BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
142 + &local_err);
143 + bdrv_graph_wrlock();
144 + bs->file = file;
145 + bdrv_graph_wrunlock();
146 + if (local_err) {
147 + ret = -EINVAL;
148 + error_propagate(errp, local_err);
149 + goto fail;
150 + }
151 +
152 + bdrv_graph_rdlock_main_loop();
153 + BlockDriverInfo bdi = {0};
154 + ret = bdrv_get_info(bs->file->bs, &bdi);
155 + if (ret < 0) {
156 + /*
157 + * Not a hard failure. Worst that can happen is partial cluster
158 + * allocation in the write target. However, the driver here returns its
159 + * allocation status based on the dirty bitmap, so any other data that
160 + * maps to such a cluster will still be copied later by a stream job (or
161 + * during writes to that cluster).
162 + */
163 + warn_report("alloc-track: unable to query cluster size for write target: %s",
164 + strerror(ret));
165 + }
166 + ret = 0;
167 + /*
168 + * Always consider alignment from underlying write device so RMW cycle for
169 + * bdrv_pwritev reads data from our backing via track_co_preadv. Also try to
170 + * avoid partial cluster allocation in the write target by considering the
171 + * cluster size.
172 + */
173 + s->granularity = MAX(bs->file->bs->bl.request_alignment,
174 + MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
175 + track_refresh_limits(bs, errp);
176 + s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, s->granularity, NULL,
177 + &local_err);
178 + bdrv_graph_rdunlock_main_loop();
179 + if (local_err) {
180 + ret = -EIO;
181 + error_propagate(errp, local_err);
182 + goto fail;
183 + }
184 +
185 + s->drop_state = DropNone;
186 +
187 +fail:
188 + if (ret < 0) {
189 + bdrv_graph_wrlock();
190 + bdrv_unref_child(bs, bs->file);
191 + bdrv_graph_wrunlock();
192 + if (s->bitmap) {
193 + bdrv_release_dirty_bitmap(s->bitmap);
194 + }
195 + }
196 + qemu_opts_del(opts);
197 + return ret;
198 +}
199 +
200 +static void track_close(BlockDriverState *bs)
201 +{
202 + BDRVAllocTrackState *s = bs->opaque;
203 + if (s->bitmap) {
204 + bdrv_release_dirty_bitmap(s->bitmap);
205 + }
206 +}
207 +
208 +static coroutine_fn int64_t GRAPH_RDLOCK
209 +track_co_getlength(BlockDriverState *bs)
210 +{
211 + return bdrv_co_getlength(bs->file->bs);
212 +}
213 +
214 +static int coroutine_fn GRAPH_RDLOCK
215 +track_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
216 + QEMUIOVector *qiov, BdrvRequestFlags flags)
217 +{
218 + BDRVAllocTrackState *s = bs->opaque;
219 + QEMUIOVector local_qiov;
220 + int ret;
221 +
222 + /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
223 + uint64_t cur_offset, local_offset;
224 + int64_t local_bytes;
225 + bool alloc;
226 +
227 + if (offset < 0 || bytes < 0) {
228 + fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
229 + return -EIO;
230 + }
231 +
232 + /* a read request can span multiple granularity-sized chunks, and can thus
233 + * contain blocks with different allocation status - we could just iterate
234 + * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
235 + * to find the next flip and consider everything up to that in one go */
236 + for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
237 + local_offset = offset + cur_offset;
238 + alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
239 + if (alloc) {
240 + local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
241 + bytes - cur_offset);
242 + } else {
243 + local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
244 + bytes - cur_offset);
245 + }
246 +
247 + /* _bitmap_next_X return is -1 if no end found within limit, otherwise
248 + * offset of next flip (to start of image) */
249 + local_bytes = local_bytes < 0 ?
250 + bytes - cur_offset :
251 + local_bytes - local_offset;
252 +
253 + qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
254 +
255 + if (alloc) {
256 + ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
257 + &local_qiov, flags);
258 + } else if (bs->backing) {
259 + ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
260 + &local_qiov, flags);
261 + } else {
262 + qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes);
263 + ret = 0;
264 + }
265 +
266 + if (ret != 0) {
267 + break;
268 + }
269 + }
270 +
271 + return ret;
272 +}
273 +
274 +static int coroutine_fn GRAPH_RDLOCK
275 +track_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
276 + QEMUIOVector *qiov, BdrvRequestFlags flags)
277 +{
278 + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
279 +}
280 +
281 +static int coroutine_fn GRAPH_RDLOCK
282 +track_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
283 + BdrvRequestFlags flags)
284 +{
285 + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
286 +}
287 +
288 +static int coroutine_fn GRAPH_RDLOCK
289 +track_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
290 +{
291 + return bdrv_co_pdiscard(bs->file, offset, bytes);
292 +}
293 +
294 +static coroutine_fn int GRAPH_RDLOCK
295 +track_co_flush(BlockDriverState *bs)
296 +{
297 + return bdrv_co_flush(bs->file->bs);
298 +}
299 +
300 +static int coroutine_fn GRAPH_RDLOCK
301 +track_co_block_status(BlockDriverState *bs, bool want_zero,
302 + int64_t offset,
303 + int64_t bytes,
304 + int64_t *pnum,
305 + int64_t *map,
306 + BlockDriverState **file)
307 +{
308 + BDRVAllocTrackState *s = bs->opaque;
309 +
310 + bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
311 + int64_t next_flipped;
312 + if (alloc) {
313 + next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
314 + } else {
315 + next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
316 + }
317 +
318 + /* in case not the entire region has the same state, we need to set pnum to
319 + * indicate for how many bytes our result is valid */
320 + *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
321 + *map = offset;
322 +
323 + if (alloc) {
324 + *file = bs->file->bs;
325 + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
326 + } else if (bs->backing) {
327 + *file = bs->backing->bs;
328 + }
329 + return 0;
330 +}
331 +
332 +static void GRAPH_RDLOCK
333 +track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
334 + BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
335 + uint64_t *nperm, uint64_t *nshared)
336 +{
337 + BDRVAllocTrackState *s = bs->opaque;
338 +
339 + *nshared = BLK_PERM_ALL;
340 +
341 + /* in case we're currently dropping ourselves, claim to not use any
342 + * permissions at all - which is fine, since from this point on we will
343 + * never issue a read or write anymore */
344 + if (s->drop_state == DropInProgress) {
345 + *nperm = 0;
346 + return;
347 + }
348 +
349 + if (role & BDRV_CHILD_DATA) {
350 + *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
351 + } else {
352 + /* 'backing' is also a child of our BDS, but we don't expect it to be
353 + * writeable, so we only forward 'consistent read' */
354 + *nperm = perm & BLK_PERM_CONSISTENT_READ;
355 + }
356 +}
357 +
/*
 * Callback for a change of the backing file. Always returns 0.
 *
 * When the backing file is removed (backing_file == NULL), the node is marked
 * as being dropped and the permissions on 'file' are refreshed, so that
 * track_child_perm() no longer requests any permissions. 'backing_fmt' is
 * not used.
 */
static int coroutine_fn GRAPH_RDLOCK
track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
                             const char *backing_fmt)
{
    /*
     * Note that the actual backing file graph change is already done in the
     * stream job itself with bdrv_set_backing_hd_drained(), so the graph is
     * not modified here. But this still needs to be implemented, to make
     * our caller (i.e. bdrv_co_change_backing_file()) do the right thing.
     *
     * FIXME
     * We'd like to auto-remove ourselves from the block graph, but it cannot
     * be done from a coroutine. Currently done in the stream job, where it
     * kinda fits better, but in the long-term, a special parameter would be
     * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
     */
    if (backing_file == NULL) {
        BDRVAllocTrackState *s = bs->opaque;
        /* change the drop state and the permissions inside a drained section */
        bdrv_drained_begin(bs);
        s->drop_state = DropInProgress;
        /* failure to update the permissions aborts (error_abort) */
        bdrv_child_refresh_perms(bs, bs->file, &error_abort);
        bdrv_drained_end(bs);
    }

    return 0;
}
384 +
/*
 * Driver description of the 'alloc-track' node: maps the block layer
 * callbacks to the track_* functions in this file.
 */
static BlockDriver bdrv_alloc_track = {
    .format_name                      = "alloc-track",
    .instance_size                    = sizeof(BDRVAllocTrackState),

    .bdrv_file_open                   = track_open,
    .bdrv_close                       = track_close,
    .bdrv_co_getlength                = track_co_getlength,
    .bdrv_child_perm                  = track_child_perm,
    .bdrv_refresh_limits              = track_refresh_limits,

    .bdrv_co_pwrite_zeroes            = track_co_pwrite_zeroes,
    .bdrv_co_pwritev                  = track_co_pwritev,
    .bdrv_co_preadv                   = track_co_preadv,
    .bdrv_co_pdiscard                 = track_co_pdiscard,

    /* both flush callbacks use the same handler, which flushes 'file' */
    .bdrv_co_flush                    = track_co_flush,
    .bdrv_co_flush_to_disk            = track_co_flush,

    .supports_backing                 = true,

    .bdrv_co_block_status             = track_co_block_status,
    .bdrv_co_change_backing_file      = track_co_change_backing_file,
};
408 +
/* Register the alloc-track driver with the block layer. */
static void bdrv_alloc_track_init(void)
{
    bdrv_register(&bdrv_alloc_track);
}

/* hook the registration function into the block module initialization */
block_init(bdrv_alloc_track_init);
415 diff --git a/block/meson.build b/block/meson.build
416 index 1945e04eeb..2873f3a25a 100644
417 --- a/block/meson.build
418 +++ b/block/meson.build
419 @@ -2,6 +2,7 @@ block_ss.add(genh)
420 block_ss.add(files(
421 'accounting.c',
422 'aio_task.c',
423 + 'alloc-track.c',
424 'amend.c',
425 'backup.c',
426 'backup-dump.c',
427 diff --git a/block/stream.c b/block/stream.c
428 index d2da83ae7c..f941cba14e 100644
429 --- a/block/stream.c
430 +++ b/block/stream.c
431 @@ -120,6 +120,40 @@ static int stream_prepare(Job *job)
432 ret = -EPERM;
433 goto out;
434 }
435 +
436 + /*
437 + * This cannot be done in the co_change_backing_file callback, because
438 + * bdrv_replace_node() cannot be done in a coroutine. The latter also
439 + * requires the graph lock exclusively. Only required for the
440 + * alloc-track driver.
441 + *
442 + * The long-term plan is to either have an explicit parameter for the
443 + * stream job or use the upcoming blockdev-replace QMP command.
444 + */
445 + if (base_id == NULL && strcmp(unfiltered_bs->drv->format_name, "alloc-track") == 0) {
446 + BlockDriverState *file_bs;
447 +
448 + bdrv_graph_rdlock_main_loop();
449 + file_bs = unfiltered_bs->file->bs;
450 + bdrv_graph_rdunlock_main_loop();
451 +
452 + bdrv_ref(unfiltered_bs); // unrefed by bdrv_replace_node()
453 + bdrv_drained_begin(file_bs);
454 + bdrv_graph_wrlock();
455 +
456 + bdrv_replace_node(unfiltered_bs, file_bs, &local_err);
457 +
458 + bdrv_graph_wrunlock();
459 + bdrv_drained_end(file_bs);
460 + bdrv_unref(unfiltered_bs);
461 +
462 + if (local_err) {
463 + error_prepend(&local_err, "failed to replace alloc-track node: ");
464 + error_report_err(local_err);
465 + ret = -EPERM;
466 + goto out;
467 + }
468 + }
469 }
470
471 out: