]> git.proxmox.com Git - pve-qemu.git/blame - debian/patches/pve/0039-block-add-alloc-track-driver.patch
update submodule and patches to QEMU 8.1.2
[pve-qemu.git] / debian / patches / pve / 0039-block-add-alloc-track-driver.patch
CommitLineData
677d0d16
SR
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Stefan Reiter <s.reiter@proxmox.com>
3Date: Mon, 7 Dec 2020 15:21:03 +0100
4Subject: [PATCH] block: add alloc-track driver
5
6Add a new filter node 'alloc-track', which separates reads and writes to
7different children, thus allowing to put a backing image behind any
8blockdev (regardless of driver support). Since we can't detect any
9pre-allocated blocks, we can only track new writes, hence the write
10target ('file') for this node must always be empty.
11
12Intended use case is for live restoring, i.e. add a backup image as a
13block device into a VM, then put an alloc-track on the restore target
14and set the backup as backing. With this, one can use a regular
15'block-stream' to restore the image, while the VM can already run in the
16background. Copy-on-read will help make progress as the VM reads as
17well.
18
19Previously, this only worked if the target supported backing images, i.e.
20only for qcow2. With alloc-track, any driver can be used for the target.
21
22If 'auto-remove' is set, alloc-track will automatically detach itself
23once the backing image is removed. It will be replaced by 'file'.
24
25Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
ddbf7a87 26Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
4e1935c2 27[FE: adapt to changed function signatures
db5d2a4b
FE
28 make error return value consistent with QEMU
29 avoid premature break during read]
4e1935c2 30Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
677d0d16 31---
db5d2a4b 32 block/alloc-track.c | 352 ++++++++++++++++++++++++++++++++++++++++++++
677d0d16 33 block/meson.build | 1 +
db5d2a4b 34 2 files changed, 353 insertions(+)
677d0d16
SR
35 create mode 100644 block/alloc-track.c
36
37diff --git a/block/alloc-track.c b/block/alloc-track.c
38new file mode 100644
db5d2a4b 39index 0000000000..b75d7c6460
677d0d16
SR
40--- /dev/null
41+++ b/block/alloc-track.c
db5d2a4b 42@@ -0,0 +1,352 @@
677d0d16
SR
43+/*
44+ * Node to allow backing images to be applied to any node. Assumes a blank
45+ * image to begin with, only new writes are tracked as allocated, thus this
46+ * must never be put on a node that already contains data.
47+ *
48+ * Copyright (c) 2020 Proxmox Server Solutions GmbH
49+ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
50+ *
51+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
52+ * See the COPYING file in the top-level directory.
53+ */
54+
55+#include "qemu/osdep.h"
56+#include "qapi/error.h"
57+#include "block/block_int.h"
bf251437 58+#include "block/dirty-bitmap.h"
677d0d16
SR
59+#include "qapi/qmp/qdict.h"
60+#include "qapi/qmp/qstring.h"
61+#include "qemu/cutils.h"
62+#include "qemu/option.h"
63+#include "qemu/module.h"
64+#include "sysemu/block-backend.h"
65+
/* Name of the runtime option that enables automatic self-removal. */
#define TRACK_OPT_AUTO_REMOVE "auto-remove"

/*
 * Progress of the node removing itself from the block graph:
 *  - DropNone:       no removal pending (set at open time)
 *  - DropRequested:  removal has been scheduled via a bottom half
 *  - DropInProgress: the node is being replaced by its 'file' child and no
 *                    longer claims any permissions on its children
 */
typedef enum DropState {
    DropNone,
    DropRequested,
    DropInProgress,
} DropState;
73+
74+typedef struct {
75+ BdrvDirtyBitmap *bitmap;
76+ DropState drop_state;
77+ bool auto_remove;
78+} BDRVAllocTrackState;
79+
80+static QemuOptsList runtime_opts = {
81+ .name = "alloc-track",
82+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
83+ .desc = {
84+ {
85+ .name = TRACK_OPT_AUTO_REMOVE,
86+ .type = QEMU_OPT_BOOL,
87+ .help = "automatically replace this node with 'file' when 'backing'"
88+ "is detached",
89+ },
90+ { /* end of list */ }
91+ },
92+};
93+
94+static void track_refresh_limits(BlockDriverState *bs, Error **errp)
95+{
96+ BlockDriverInfo bdi;
97+
98+ if (!bs->file) {
99+ return;
100+ }
101+
102+ /* always use alignment from underlying write device so RMW cycle for
103+ * bdrv_pwritev reads data from our backing via track_co_preadv (no partial
104+ * cluster allocation in 'file') */
105+ bdrv_get_info(bs->file->bs, &bdi);
106+ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
107+ MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
108+}
109+
110+static int track_open(BlockDriverState *bs, QDict *options, int flags,
111+ Error **errp)
112+{
113+ BDRVAllocTrackState *s = bs->opaque;
114+ QemuOpts *opts;
115+ Error *local_err = NULL;
116+ int ret = 0;
117+
118+ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
119+ qemu_opts_absorb_qdict(opts, options, &local_err);
120+ if (local_err) {
121+ error_propagate(errp, local_err);
122+ ret = -EINVAL;
123+ goto fail;
124+ }
125+
126+ s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
127+
128+ /* open the target (write) node, backing will be attached by block layer */
129+ bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
130+ BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
131+ &local_err);
132+ if (local_err) {
133+ ret = -EINVAL;
134+ error_propagate(errp, local_err);
135+ goto fail;
136+ }
137+
138+ track_refresh_limits(bs, errp);
139+ uint64_t gran = bs->bl.request_alignment;
140+ s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, gran, NULL, &local_err);
141+ if (local_err) {
142+ ret = -EIO;
143+ error_propagate(errp, local_err);
144+ goto fail;
145+ }
146+
147+ s->drop_state = DropNone;
148+
149+fail:
150+ if (ret < 0) {
151+ bdrv_unref_child(bs, bs->file);
152+ if (s->bitmap) {
153+ bdrv_release_dirty_bitmap(s->bitmap);
154+ }
155+ }
156+ qemu_opts_del(opts);
157+ return ret;
158+}
159+
160+static void track_close(BlockDriverState *bs)
161+{
162+ BDRVAllocTrackState *s = bs->opaque;
163+ if (s->bitmap) {
164+ bdrv_release_dirty_bitmap(s->bitmap);
165+ }
166+}
167+
bf251437 168+static coroutine_fn int64_t track_co_getlength(BlockDriverState *bs)
677d0d16 169+{
bf251437 170+ return bdrv_co_getlength(bs->file->bs);
677d0d16
SR
171+}
172+
173+static int coroutine_fn track_co_preadv(BlockDriverState *bs,
4567474e 174+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
677d0d16
SR
175+{
176+ BDRVAllocTrackState *s = bs->opaque;
177+ QEMUIOVector local_qiov;
178+ int ret;
179+
180+ /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
181+ uint64_t cur_offset, local_offset;
182+ int64_t local_bytes;
183+ bool alloc;
184+
4567474e
FE
185+ if (offset < 0 || bytes < 0) {
186+ fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
4e1935c2 187+ return -EIO;
4567474e
FE
188+ }
189+
677d0d16
SR
190+ /* a read request can span multiple granularity-sized chunks, and can thus
191+ * contain blocks with different allocation status - we could just iterate
192+ * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
193+ * to find the next flip and consider everything up to that in one go */
194+ for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
195+ local_offset = offset + cur_offset;
196+ alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
197+ if (alloc) {
198+ local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
199+ bytes - cur_offset);
200+ } else {
201+ local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
202+ bytes - cur_offset);
203+ }
204+
205+ /* _bitmap_next_X return is -1 if no end found within limit, otherwise
206+ * offset of next flip (to start of image) */
207+ local_bytes = local_bytes < 0 ?
208+ bytes - cur_offset :
209+ local_bytes - local_offset;
210+
211+ qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
212+
213+ if (alloc) {
214+ ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
215+ &local_qiov, flags);
216+ } else if (bs->backing) {
217+ ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
218+ &local_qiov, flags);
219+ } else {
db5d2a4b
FE
220+ qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes);
221+ ret = 0;
677d0d16
SR
222+ }
223+
224+ if (ret != 0) {
225+ break;
226+ }
227+ }
228+
229+ return ret;
230+}
231+
232+static int coroutine_fn track_co_pwritev(BlockDriverState *bs,
4567474e 233+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
677d0d16
SR
234+{
235+ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
236+}
237+
238+static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs,
4567474e 239+ int64_t offset, int64_t bytes, BdrvRequestFlags flags)
677d0d16 240+{
4567474e 241+ return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
677d0d16
SR
242+}
243+
244+static int coroutine_fn track_co_pdiscard(BlockDriverState *bs,
4567474e 245+ int64_t offset, int64_t bytes)
677d0d16 246+{
4567474e 247+ return bdrv_co_pdiscard(bs->file, offset, bytes);
677d0d16
SR
248+}
249+
250+static coroutine_fn int track_co_flush(BlockDriverState *bs)
251+{
252+ return bdrv_co_flush(bs->file->bs);
253+}
254+
255+static int coroutine_fn track_co_block_status(BlockDriverState *bs,
256+ bool want_zero,
257+ int64_t offset,
258+ int64_t bytes,
259+ int64_t *pnum,
260+ int64_t *map,
261+ BlockDriverState **file)
262+{
263+ BDRVAllocTrackState *s = bs->opaque;
264+
265+ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
266+ int64_t next_flipped;
267+ if (alloc) {
268+ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
269+ } else {
270+ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
271+ }
272+
273+ /* in case not the entire region has the same state, we need to set pnum to
274+ * indicate for how many bytes our result is valid */
275+ *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
276+ *map = offset;
277+
278+ if (alloc) {
279+ *file = bs->file->bs;
280+ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
281+ } else if (bs->backing) {
282+ *file = bs->backing->bs;
283+ }
284+ return 0;
285+}
286+
287+static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
288+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
289+ uint64_t perm, uint64_t shared,
290+ uint64_t *nperm, uint64_t *nshared)
291+{
292+ BDRVAllocTrackState *s = bs->opaque;
293+
294+ *nshared = BLK_PERM_ALL;
295+
296+ /* in case we're currently dropping ourselves, claim to not use any
297+ * permissions at all - which is fine, since from this point on we will
298+ * never issue a read or write anymore */
299+ if (s->drop_state == DropInProgress) {
300+ *nperm = 0;
301+ return;
302+ }
303+
304+ if (role & BDRV_CHILD_DATA) {
305+ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
306+ } else {
307+ /* 'backing' is also a child of our BDS, but we don't expect it to be
308+ * writeable, so we only forward 'consistent read' */
309+ *nperm = perm & BLK_PERM_CONSISTENT_READ;
310+ }
311+}
312+
313+static void track_drop(void *opaque)
314+{
315+ BlockDriverState *bs = (BlockDriverState*)opaque;
316+ BlockDriverState *file = bs->file->bs;
317+ BDRVAllocTrackState *s = bs->opaque;
318+
319+ assert(file);
320+
321+ /* we rely on the fact that we're not used anywhere else, so let's wait
322+ * until we're only used once - in the drive connected to the guest (and one
323+ * ref is held by bdrv_ref in track_change_backing_file) */
324+ if (bs->refcnt > 2) {
325+ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
326+ return;
327+ }
b36e8acc
TL
328+ AioContext *aio_context = bdrv_get_aio_context(bs);
329+ aio_context_acquire(aio_context);
677d0d16 330+
677d0d16
SR
331+ bdrv_drained_begin(bs);
332+
333+ /* now that we're drained, we can safely set 'DropInProgress' */
334+ s->drop_state = DropInProgress;
335+ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
336+
337+ bdrv_replace_node(bs, file, &error_abort);
aa42ea26
TL
338+ bdrv_set_backing_hd(bs, NULL, &error_abort);
339+ bdrv_drained_end(bs);
677d0d16 340+ bdrv_unref(bs);
b36e8acc 341+ aio_context_release(aio_context);
677d0d16
SR
342+}
343+
344+static int track_change_backing_file(BlockDriverState *bs,
345+ const char *backing_file,
346+ const char *backing_fmt)
347+{
348+ BDRVAllocTrackState *s = bs->opaque;
349+ if (s->auto_remove && s->drop_state == DropNone &&
350+ backing_file == NULL && backing_fmt == NULL)
351+ {
352+ /* backing file has been disconnected, there's no longer any use for
353+ * this node, so let's remove ourselves from the block graph - we need
354+ * to schedule this for later however, since when this function is
355+ * called, the blockjob modifying us is probably not done yet and has a
356+ * blocker on 'bs' */
357+ s->drop_state = DropRequested;
358+ bdrv_ref(bs);
359+ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
360+ }
361+
362+ return 0;
363+}
364+
365+static BlockDriver bdrv_alloc_track = {
366+ .format_name = "alloc-track",
367+ .instance_size = sizeof(BDRVAllocTrackState),
368+
369+ .bdrv_file_open = track_open,
370+ .bdrv_close = track_close,
bf251437 371+ .bdrv_co_getlength = track_co_getlength,
677d0d16
SR
372+ .bdrv_child_perm = track_child_perm,
373+ .bdrv_refresh_limits = track_refresh_limits,
374+
375+ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes,
376+ .bdrv_co_pwritev = track_co_pwritev,
377+ .bdrv_co_preadv = track_co_preadv,
378+ .bdrv_co_pdiscard = track_co_pdiscard,
379+
380+ .bdrv_co_flush = track_co_flush,
381+ .bdrv_co_flush_to_disk = track_co_flush,
382+
383+ .supports_backing = true,
384+
385+ .bdrv_co_block_status = track_co_block_status,
386+ .bdrv_change_backing_file = track_change_backing_file,
387+};
388+
389+static void bdrv_alloc_track_init(void)
390+{
391+ bdrv_register(&bdrv_alloc_track);
392+}
393+
394+block_init(bdrv_alloc_track_init);
395diff --git a/block/meson.build b/block/meson.build
10e10933 396index becc99ac4e..0a69836593 100644
677d0d16
SR
397--- a/block/meson.build
398+++ b/block/meson.build
399@@ -2,6 +2,7 @@ block_ss.add(genh)
400 block_ss.add(files(
401 'accounting.c',
402 'aio_task.c',
403+ 'alloc-track.c',
404 'amend.c',
405 'backup.c',
406 'backup-dump.c',