]> git.proxmox.com Git - pve-qemu.git/blame - debian/patches/pve/0046-block-add-alloc-track-driver.patch
update submodule and patches to 6.1.1
[pve-qemu.git] / debian / patches / pve / 0046-block-add-alloc-track-driver.patch
CommitLineData
677d0d16
SR
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Stefan Reiter <s.reiter@proxmox.com>
3Date: Mon, 7 Dec 2020 15:21:03 +0100
4Subject: [PATCH] block: add alloc-track driver
5
6Add a new filter node 'alloc-track', which separates reads and writes to
7different children, thus allowing to put a backing image behind any
8blockdev (regardless of driver support). Since we can't detect any
9pre-allocated blocks, we can only track new writes, hence the write
10target ('file') for this node must always be empty.
11
12Intended use case is for live restoring, i.e. add a backup image as a
13block device into a VM, then put an alloc-track on the restore target
14and set the backup as backing. With this, one can use a regular
15'block-stream' to restore the image, while the VM can already run in the
16background. Copy-on-read will help make progress as the VM reads as
17well.
18
19This only worked if the target supports backing images, so up until now
20only for qcow2, with alloc-track any driver for the target can be used.
21
22If 'auto-remove' is set, alloc-track will automatically detach itself
23once the backing image is removed. It will be replaced by 'file'.
24
25Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
ddbf7a87 26Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
677d0d16 27---
8dca018b 28 block/alloc-track.c | 345 ++++++++++++++++++++++++++++++++++++++++++++
677d0d16 29 block/meson.build | 1 +
8dca018b 30 2 files changed, 346 insertions(+)
677d0d16
SR
31 create mode 100644 block/alloc-track.c
32
33diff --git a/block/alloc-track.c b/block/alloc-track.c
34new file mode 100644
8dca018b 35index 0000000000..35f2737c89
677d0d16
SR
36--- /dev/null
37+++ b/block/alloc-track.c
b36e8acc 38@@ -0,0 +1,345 @@
677d0d16
SR
39+/*
40+ * Node to allow backing images to be applied to any node. Assumes a blank
41+ * image to begin with, only new writes are tracked as allocated, thus this
42+ * must never be put on a node that already contains data.
43+ *
44+ * Copyright (c) 2020 Proxmox Server Solutions GmbH
45+ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
46+ *
47+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
48+ * See the COPYING file in the top-level directory.
49+ */
50+
51+#include "qemu/osdep.h"
52+#include "qapi/error.h"
53+#include "block/block_int.h"
54+#include "qapi/qmp/qdict.h"
55+#include "qapi/qmp/qstring.h"
56+#include "qemu/cutils.h"
57+#include "qemu/option.h"
58+#include "qemu/module.h"
59+#include "sysemu/block-backend.h"
60+
61+#define TRACK_OPT_AUTO_REMOVE "auto-remove"
62+
/*
 * Progress of the node's self-removal from the block graph.
 */
typedef enum DropState {
    /* no removal has been requested */
    DropNone = 0,
    /* track_change_backing_file() has scheduled track_drop() */
    DropRequested,
    /* track_drop() is replacing this node; no permissions are requested */
    DropInProgress,
} DropState;
68+
69+typedef struct {
70+ BdrvDirtyBitmap *bitmap;
71+ DropState drop_state;
72+ bool auto_remove;
73+} BDRVAllocTrackState;
74+
75+static QemuOptsList runtime_opts = {
76+ .name = "alloc-track",
77+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
78+ .desc = {
79+ {
80+ .name = TRACK_OPT_AUTO_REMOVE,
81+ .type = QEMU_OPT_BOOL,
82+ .help = "automatically replace this node with 'file' when 'backing'"
83+ "is detached",
84+ },
85+ { /* end of list */ }
86+ },
87+};
88+
89+static void track_refresh_limits(BlockDriverState *bs, Error **errp)
90+{
91+ BlockDriverInfo bdi;
92+
93+ if (!bs->file) {
94+ return;
95+ }
96+
97+ /* always use alignment from underlying write device so RMW cycle for
98+ * bdrv_pwritev reads data from our backing via track_co_preadv (no partial
99+ * cluster allocation in 'file') */
100+ bdrv_get_info(bs->file->bs, &bdi);
101+ bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
102+ MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
103+}
104+
105+static int track_open(BlockDriverState *bs, QDict *options, int flags,
106+ Error **errp)
107+{
108+ BDRVAllocTrackState *s = bs->opaque;
109+ QemuOpts *opts;
110+ Error *local_err = NULL;
111+ int ret = 0;
112+
113+ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
114+ qemu_opts_absorb_qdict(opts, options, &local_err);
115+ if (local_err) {
116+ error_propagate(errp, local_err);
117+ ret = -EINVAL;
118+ goto fail;
119+ }
120+
121+ s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
122+
123+ /* open the target (write) node, backing will be attached by block layer */
124+ bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
125+ BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
126+ &local_err);
127+ if (local_err) {
128+ ret = -EINVAL;
129+ error_propagate(errp, local_err);
130+ goto fail;
131+ }
132+
133+ track_refresh_limits(bs, errp);
134+ uint64_t gran = bs->bl.request_alignment;
135+ s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, gran, NULL, &local_err);
136+ if (local_err) {
137+ ret = -EIO;
138+ error_propagate(errp, local_err);
139+ goto fail;
140+ }
141+
142+ s->drop_state = DropNone;
143+
144+fail:
145+ if (ret < 0) {
146+ bdrv_unref_child(bs, bs->file);
147+ if (s->bitmap) {
148+ bdrv_release_dirty_bitmap(s->bitmap);
149+ }
150+ }
151+ qemu_opts_del(opts);
152+ return ret;
153+}
154+
155+static void track_close(BlockDriverState *bs)
156+{
157+ BDRVAllocTrackState *s = bs->opaque;
158+ if (s->bitmap) {
159+ bdrv_release_dirty_bitmap(s->bitmap);
160+ }
161+}
162+
163+static int64_t track_getlength(BlockDriverState *bs)
164+{
165+ return bdrv_getlength(bs->file->bs);
166+}
167+
168+static int coroutine_fn track_co_preadv(BlockDriverState *bs,
169+ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
170+{
171+ BDRVAllocTrackState *s = bs->opaque;
172+ QEMUIOVector local_qiov;
173+ int ret;
174+
175+ /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
176+ uint64_t cur_offset, local_offset;
177+ int64_t local_bytes;
178+ bool alloc;
179+
180+ /* a read request can span multiple granularity-sized chunks, and can thus
181+ * contain blocks with different allocation status - we could just iterate
182+ * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
183+ * to find the next flip and consider everything up to that in one go */
184+ for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
185+ local_offset = offset + cur_offset;
186+ alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
187+ if (alloc) {
188+ local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
189+ bytes - cur_offset);
190+ } else {
191+ local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
192+ bytes - cur_offset);
193+ }
194+
195+ /* _bitmap_next_X return is -1 if no end found within limit, otherwise
196+ * offset of next flip (to start of image) */
197+ local_bytes = local_bytes < 0 ?
198+ bytes - cur_offset :
199+ local_bytes - local_offset;
200+
201+ qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
202+
203+ if (alloc) {
204+ ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
205+ &local_qiov, flags);
206+ } else if (bs->backing) {
207+ ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
208+ &local_qiov, flags);
209+ } else {
210+ ret = qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes);
211+ }
212+
213+ if (ret != 0) {
214+ break;
215+ }
216+ }
217+
218+ return ret;
219+}
220+
221+static int coroutine_fn track_co_pwritev(BlockDriverState *bs,
222+ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
223+{
224+ return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
225+}
226+
227+static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs,
228+ int64_t offset, int count, BdrvRequestFlags flags)
229+{
0a88214b 230+ return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
677d0d16
SR
231+}
232+
233+static int coroutine_fn track_co_pdiscard(BlockDriverState *bs,
234+ int64_t offset, int count)
235+{
236+ return bdrv_co_pdiscard(bs->file, offset, count);
237+}
238+
239+static coroutine_fn int track_co_flush(BlockDriverState *bs)
240+{
241+ return bdrv_co_flush(bs->file->bs);
242+}
243+
244+static int coroutine_fn track_co_block_status(BlockDriverState *bs,
245+ bool want_zero,
246+ int64_t offset,
247+ int64_t bytes,
248+ int64_t *pnum,
249+ int64_t *map,
250+ BlockDriverState **file)
251+{
252+ BDRVAllocTrackState *s = bs->opaque;
253+
254+ bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
255+ int64_t next_flipped;
256+ if (alloc) {
257+ next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
258+ } else {
259+ next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
260+ }
261+
262+ /* in case not the entire region has the same state, we need to set pnum to
263+ * indicate for how many bytes our result is valid */
264+ *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
265+ *map = offset;
266+
267+ if (alloc) {
268+ *file = bs->file->bs;
269+ return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
270+ } else if (bs->backing) {
271+ *file = bs->backing->bs;
272+ }
273+ return 0;
274+}
275+
276+static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
277+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
278+ uint64_t perm, uint64_t shared,
279+ uint64_t *nperm, uint64_t *nshared)
280+{
281+ BDRVAllocTrackState *s = bs->opaque;
282+
283+ *nshared = BLK_PERM_ALL;
284+
285+ /* in case we're currently dropping ourselves, claim to not use any
286+ * permissions at all - which is fine, since from this point on we will
287+ * never issue a read or write anymore */
288+ if (s->drop_state == DropInProgress) {
289+ *nperm = 0;
290+ return;
291+ }
292+
293+ if (role & BDRV_CHILD_DATA) {
294+ *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
295+ } else {
296+ /* 'backing' is also a child of our BDS, but we don't expect it to be
297+ * writeable, so we only forward 'consistent read' */
298+ *nperm = perm & BLK_PERM_CONSISTENT_READ;
299+ }
300+}
301+
302+static void track_drop(void *opaque)
303+{
304+ BlockDriverState *bs = (BlockDriverState*)opaque;
305+ BlockDriverState *file = bs->file->bs;
306+ BDRVAllocTrackState *s = bs->opaque;
307+
308+ assert(file);
309+
310+ /* we rely on the fact that we're not used anywhere else, so let's wait
311+ * until we're only used once - in the drive connected to the guest (and one
312+ * ref is held by bdrv_ref in track_change_backing_file) */
313+ if (bs->refcnt > 2) {
314+ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
315+ return;
316+ }
b36e8acc
TL
317+ AioContext *aio_context = bdrv_get_aio_context(bs);
318+ aio_context_acquire(aio_context);
677d0d16 319+
677d0d16
SR
320+ bdrv_drained_begin(bs);
321+
322+ /* now that we're drained, we can safely set 'DropInProgress' */
323+ s->drop_state = DropInProgress;
324+ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
325+
326+ bdrv_replace_node(bs, file, &error_abort);
aa42ea26
TL
327+ bdrv_set_backing_hd(bs, NULL, &error_abort);
328+ bdrv_drained_end(bs);
677d0d16 329+ bdrv_unref(bs);
b36e8acc 330+ aio_context_release(aio_context);
677d0d16
SR
331+}
332+
333+static int track_change_backing_file(BlockDriverState *bs,
334+ const char *backing_file,
335+ const char *backing_fmt)
336+{
337+ BDRVAllocTrackState *s = bs->opaque;
338+ if (s->auto_remove && s->drop_state == DropNone &&
339+ backing_file == NULL && backing_fmt == NULL)
340+ {
341+ /* backing file has been disconnected, there's no longer any use for
342+ * this node, so let's remove ourselves from the block graph - we need
343+ * to schedule this for later however, since when this function is
344+ * called, the blockjob modifying us is probably not done yet and has a
345+ * blocker on 'bs' */
346+ s->drop_state = DropRequested;
347+ bdrv_ref(bs);
348+ aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
349+ }
350+
351+ return 0;
352+}
353+
354+static BlockDriver bdrv_alloc_track = {
355+ .format_name = "alloc-track",
356+ .instance_size = sizeof(BDRVAllocTrackState),
357+
358+ .bdrv_file_open = track_open,
359+ .bdrv_close = track_close,
360+ .bdrv_getlength = track_getlength,
361+ .bdrv_child_perm = track_child_perm,
362+ .bdrv_refresh_limits = track_refresh_limits,
363+
364+ .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes,
365+ .bdrv_co_pwritev = track_co_pwritev,
366+ .bdrv_co_preadv = track_co_preadv,
367+ .bdrv_co_pdiscard = track_co_pdiscard,
368+
369+ .bdrv_co_flush = track_co_flush,
370+ .bdrv_co_flush_to_disk = track_co_flush,
371+
372+ .supports_backing = true,
373+
374+ .bdrv_co_block_status = track_co_block_status,
375+ .bdrv_change_backing_file = track_change_backing_file,
376+};
377+
/*
 * Register the alloc-track driver with the block layer.
 */
static void bdrv_alloc_track_init(void)
{
    bdrv_register(&bdrv_alloc_track);
}

/* hook the registration function into QEMU's block module init */
block_init(bdrv_alloc_track_init);
384diff --git a/block/meson.build b/block/meson.build
f376b2b9 385index e3ed5ac97c..d1ee260048 100644
677d0d16
SR
386--- a/block/meson.build
387+++ b/block/meson.build
388@@ -2,6 +2,7 @@ block_ss.add(genh)
389 block_ss.add(files(
390 'accounting.c',
391 'aio_task.c',
392+ 'alloc-track.c',
393 'amend.c',
394 'backup.c',
395 'backup-dump.c',