]> git.proxmox.com Git - pve-qemu.git/blob - debian/patches/pve/0048-block-add-alloc-track-driver.patch
d47c84baff04228c4af7249864ac8cf95fdbb756
[pve-qemu.git] / debian / patches / pve / 0048-block-add-alloc-track-driver.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Stefan Reiter <s.reiter@proxmox.com>
3 Date: Mon, 7 Dec 2020 15:21:03 +0100
4 Subject: [PATCH] block: add alloc-track driver
5
6 Add a new filter node 'alloc-track', which separates reads and writes to
7 different children, thus allowing to put a backing image behind any
8 blockdev (regardless of driver support). Since we can't detect any
9 pre-allocated blocks, we can only track new writes, hence the write
10 target ('file') for this node must always be empty.
11
12 Intended use case is for live restoring, i.e. add a backup image as a
13 block device into a VM, then put an alloc-track on the restore target
14 and set the backup as backing. With this, one can use a regular
15 'block-stream' to restore the image, while the VM can already run in the
16 background. Copy-on-read will help make progress as the VM reads as
17 well.
18
19 Previously this only worked if the target supported backing images, i.e.
20 only for qcow2; with alloc-track, any driver can be used for the target.
21
22 If 'auto-remove' is set, alloc-track will automatically detach itself
23 once the backing image is removed. It will be replaced by 'file'.
24
25 Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
26 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
27 [FE: adapt to changed function signatures
28 make error return value consistent with QEMU]
29 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
30 ---
31 block/alloc-track.c | 350 ++++++++++++++++++++++++++++++++++++++++++++
32 block/meson.build | 1 +
33 2 files changed, 351 insertions(+)
34 create mode 100644 block/alloc-track.c
35
36 diff --git a/block/alloc-track.c b/block/alloc-track.c
37 new file mode 100644
38 index 0000000000..43d40d11af
39 --- /dev/null
40 +++ b/block/alloc-track.c
41 @@ -0,0 +1,350 @@
42 +/*
43 + * Node to allow backing images to be applied to any node. Assumes a blank
44 + * image to begin with, only new writes are tracked as allocated, thus this
45 + * must never be put on a node that already contains data.
46 + *
47 + * Copyright (c) 2020 Proxmox Server Solutions GmbH
48 + * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
49 + *
50 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
51 + * See the COPYING file in the top-level directory.
52 + */
53 +
54 +#include "qemu/osdep.h"
55 +#include "qapi/error.h"
56 +#include "block/block_int.h"
57 +#include "qapi/qmp/qdict.h"
58 +#include "qapi/qmp/qstring.h"
59 +#include "qemu/cutils.h"
60 +#include "qemu/option.h"
61 +#include "qemu/module.h"
62 +#include "sysemu/block-backend.h"
63 +
64 +#define TRACK_OPT_AUTO_REMOVE "auto-remove"
65 +
66 +/*
67 + * Progress of the node removing itself from the block graph. The state is
68 + * set to DropNone in track_open, to DropRequested in
69 + * track_change_backing_file and to DropInProgress in track_drop.
70 + */
71 +typedef enum DropState {
72 +    DropNone = 0,
73 +    DropRequested = 1,
74 +    DropInProgress = 2,
75 +} DropState;
71 +
72 +/* Per-node driver state, accessed through bs->opaque. */
73 +typedef struct BDRVAllocTrackState {
74 +    /* dirty bitmap created on the 'file' child's node in track_open */
75 +    BdrvDirtyBitmap *bitmap;
76 +    /* self-removal progress, see DropState */
77 +    DropState drop_state;
78 +    /* value of the 'auto-remove' runtime option */
79 +    bool auto_remove;
80 +} BDRVAllocTrackState;
77 +
78 +/* Runtime options parsed by track_open. */
79 +static QemuOptsList runtime_opts = {
80 +    .name = "alloc-track",
81 +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
82 +    .desc = {
83 +        {
84 +            .name = TRACK_OPT_AUTO_REMOVE,
85 +            .type = QEMU_OPT_BOOL,
86 +            /* adjacent literals are concatenated, hence the trailing space */
87 +            .help = "automatically replace this node with 'file' when 'backing' "
88 +                    "is detached",
89 +        },
90 +        { /* end of list */ }
91 +    },
92 +};
91 +
92 +/*
93 + * Set this node's request alignment to the largest of the 'file' child's
94 + * request alignment, its reported cluster size and BDRV_SECTOR_SIZE.
95 + * Does nothing while no 'file' child is attached. 'errp' is not used.
96 + */
97 +static void track_refresh_limits(BlockDriverState *bs, Error **errp)
98 +{
99 +    /* zero-initialized: the result of bdrv_get_info is not checked below, so
100 +     * 'cluster_size' must hold a defined value even if nothing was filled in */
101 +    BlockDriverInfo bdi = {0};
102 +
103 +    if (!bs->file) {
104 +        return;
105 +    }
106 +
107 +    /* always use alignment from underlying write device so RMW cycle for
108 +     * bdrv_pwritev reads data from our backing via track_co_preadv (no partial
109 +     * cluster allocation in 'file') */
110 +    bdrv_get_info(bs->file->bs, &bdi);
111 +    bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
112 +                                   MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
113 +}
107 +
108 +/*
109 + * Open callback: parse the runtime options, open the 'file' child (the write
110 + * target) and create the dirty bitmap on it, using this node's request
111 + * alignment as granularity.
112 + *
113 + * Returns 0 on success. On failure the error is propagated to 'errp' and
114 + * -EINVAL (bad options, 'file' could not be opened) or -EIO (bitmap could
115 + * not be created) is returned.
116 + */
117 +static int track_open(BlockDriverState *bs, QDict *options, int flags,
118 +                      Error **errp)
119 +{
120 +    BDRVAllocTrackState *state = bs->opaque;
121 +    QemuOpts *parsed = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
122 +    Error *err = NULL;
123 +    uint64_t granularity;
124 +    int rc = -EINVAL;
125 +
126 +    qemu_opts_absorb_qdict(parsed, options, &err);
127 +    if (err) {
128 +        goto fail;
129 +    }
130 +
131 +    state->auto_remove = qemu_opt_get_bool(parsed, TRACK_OPT_AUTO_REMOVE,
132 +                                           false);
133 +
134 +    /* only the write target is opened here, the block layer attaches
135 +     * 'backing' on its own */
136 +    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
137 +                               BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
138 +                               &err);
139 +    if (err) {
140 +        goto fail;
141 +    }
142 +
143 +    track_refresh_limits(bs, errp);
144 +    granularity = bs->bl.request_alignment;
145 +    state->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, granularity, NULL,
146 +                                             &err);
147 +    if (err) {
148 +        rc = -EIO;
149 +        goto fail;
150 +    }
151 +
152 +    state->drop_state = DropNone;
153 +    qemu_opts_del(parsed);
154 +    return 0;
155 +
156 +fail:
157 +    error_propagate(errp, err);
158 +    bdrv_unref_child(bs, bs->file);
159 +    if (state->bitmap) {
160 +        bdrv_release_dirty_bitmap(state->bitmap);
161 +    }
162 +    qemu_opts_del(parsed);
163 +    return rc;
164 +}
157 +
158 +/* Close callback: release the dirty bitmap, if one was created. */
159 +static void track_close(BlockDriverState *bs)
160 +{
161 +    BDRVAllocTrackState *state = bs->opaque;
162 +
163 +    if (!state->bitmap) {
164 +        return;
165 +    }
166 +    bdrv_release_dirty_bitmap(state->bitmap);
167 +}
165 +
166 +/* The node's length is whatever bdrv_getlength reports for the 'file' node. */
167 +static int64_t track_getlength(BlockDriverState *bs)
168 +{
169 +    BlockDriverState *target = bs->file->bs;
170 +
171 +    return bdrv_getlength(target);
172 +}
170 +
171 +/*
172 + * Read 'bytes' bytes starting at 'offset' into 'qiov'. Ranges marked in the
173 + * bitmap are read from 'file', all other ranges from 'backing', or are
174 + * zero-filled if no backing is attached.
175 + *
176 + * Returns 0 on success (including an empty request), -EIO for a negative
177 + * 'offset' or 'bytes', or the first non-zero result of a child read.
178 + */
179 +static int coroutine_fn track_co_preadv(BlockDriverState *bs,
180 +    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
181 +{
182 +    BDRVAllocTrackState *s = bs->opaque;
183 +    QEMUIOVector local_qiov;
184 +    /* stays 0 for an empty request, where the loop below never runs */
185 +    int ret = 0;
186 +
187 +    /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
188 +    uint64_t cur_offset, local_offset;
189 +    int64_t local_bytes;
190 +    bool alloc;
191 +
192 +    if (offset < 0 || bytes < 0) {
193 +        fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
194 +        return -EIO;
195 +    }
196 +
197 +    /* a read request can span multiple granularity-sized chunks, and can thus
198 +     * contain blocks with different allocation status - we could just iterate
199 +     * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
200 +     * to find the next flip and consider everything up to that in one go */
201 +    for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
202 +        local_offset = offset + cur_offset;
203 +        alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
204 +        if (alloc) {
205 +            local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
206 +                                                      bytes - cur_offset);
207 +        } else {
208 +            local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
209 +                                                       bytes - cur_offset);
210 +        }
211 +
212 +        /* _bitmap_next_X return is -1 if no end found within limit, otherwise
213 +         * offset of next flip (to start of image) */
214 +        local_bytes = local_bytes < 0 ?
215 +            bytes - cur_offset :
216 +            local_bytes - local_offset;
217 +
218 +        qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
219 +
220 +        if (alloc) {
221 +            ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
222 +                                 &local_qiov, flags);
223 +        } else if (bs->backing) {
224 +            ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
225 +                                 &local_qiov, flags);
226 +        } else {
227 +            /* 'local_qiov' already starts at 'cur_offset' of 'qiov', so fill
228 +             * it from its own beginning; the byte count returned by the
229 +             * memset is not an error code and must not end up in 'ret' */
230 +            qemu_iovec_memset(&local_qiov, 0, 0, local_bytes);
231 +            ret = 0;
232 +        }
233 +
234 +        /* a slice spanning several iovec elements owns an allocated array */
235 +        qemu_iovec_destroy(&local_qiov);
236 +
237 +        if (ret != 0) {
238 +            break;
239 +        }
240 +    }
241 +
242 +    return ret;
243 +}
228 +
229 +/* Writes are passed unchanged to the 'file' child. */
230 +static int coroutine_fn track_co_pwritev(BlockDriverState *bs,
231 +    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
232 +{
233 +    BdrvChild *target = bs->file;
234 +
235 +    return bdrv_co_pwritev(target, offset, bytes, qiov, flags);
236 +}
234 +
235 +/* Zero writes are passed unchanged to the 'file' child. */
236 +static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs,
237 +    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
238 +{
239 +    BdrvChild *target = bs->file;
240 +
241 +    return bdrv_co_pwrite_zeroes(target, offset, bytes, flags);
242 +}
240 +
241 +/* Discards are passed unchanged to the 'file' child. */
242 +static int coroutine_fn track_co_pdiscard(BlockDriverState *bs,
243 +    int64_t offset, int64_t bytes)
244 +{
245 +    BdrvChild *target = bs->file;
246 +
247 +    return bdrv_co_pdiscard(target, offset, bytes);
248 +}
246 +
247 +/* Flush the 'file' node; registered for both flush callbacks of the driver. */
248 +static int coroutine_fn track_co_flush(BlockDriverState *bs)
249 +{
250 +    BlockDriverState *target = bs->file->bs;
251 +
252 +    return bdrv_co_flush(target);
253 +}
251 +
252 +/*
253 + * Report the allocation status at 'offset' as recorded in the bitmap.
254 + * '*pnum' receives the number of bytes (at most 'bytes') that share this
255 + * status, '*map' is set to 'offset'.
256 + *
257 + * For a tracked range '*file' is the 'file' node and the result is
258 + * BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID. Otherwise the result is 0 and
259 + * '*file' is set to the backing node if one is attached (left untouched if
260 + * not). 'want_zero' is not used.
261 + */
262 +static int coroutine_fn track_co_block_status(BlockDriverState *bs,
263 +                                              bool want_zero,
264 +                                              int64_t offset,
265 +                                              int64_t bytes,
266 +                                              int64_t *pnum,
267 +                                              int64_t *map,
268 +                                              BlockDriverState **file)
269 +{
270 +    BDRVAllocTrackState *state = bs->opaque;
271 +    bool tracked = bdrv_dirty_bitmap_get(state->bitmap, offset);
272 +    int64_t flip = tracked
273 +        ? bdrv_dirty_bitmap_next_zero(state->bitmap, offset, bytes)
274 +        : bdrv_dirty_bitmap_next_dirty(state->bitmap, offset, bytes);
275 +
276 +    /* without a flip inside the range the status holds for all of 'bytes',
277 +     * otherwise only up to the position of the flip */
278 +    if (flip == -1) {
279 +        *pnum = bytes;
280 +    } else {
281 +        *pnum = flip - offset;
282 +    }
283 +    *map = offset;
284 +
285 +    if (!tracked) {
286 +        if (bs->backing) {
287 +            *file = bs->backing->bs;
288 +        }
289 +        return 0;
290 +    }
291 +
292 +    *file = bs->file->bs;
293 +    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
294 +}
283 +
284 +/*
285 + * Compute the permissions this node requests on child 'c' ('*nperm') and
286 + * shares with others ('*nshared'). Everything is always shared. While the
287 + * node is dropping itself nothing is requested; otherwise children with a
288 + * data role get the default passthrough subset of 'perm', and any other
289 + * child only gets 'consistent read'.
290 + */
291 +static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
292 +                             BdrvChildRole role, BlockReopenQueue *reopen_queue,
293 +                             uint64_t perm, uint64_t shared,
294 +                             uint64_t *nperm, uint64_t *nshared)
295 +{
296 +    BDRVAllocTrackState *state = bs->opaque;
297 +    uint64_t requested;
298 +
299 +    *nshared = BLK_PERM_ALL;
300 +
301 +    if (state->drop_state == DropInProgress) {
302 +        /* no read or write is issued anymore once the drop has started, so
303 +         * no permission is needed either */
304 +        requested = 0;
305 +    } else if (role & BDRV_CHILD_DATA) {
306 +        requested = perm & DEFAULT_PERM_PASSTHROUGH;
307 +    } else {
308 +        /* 'backing' is our child too, but it is not expected to be
309 +         * writeable, hence only 'consistent read' is forwarded */
310 +        requested = perm & BLK_PERM_CONSISTENT_READ;
311 +    }
312 +
313 +    *nperm = requested;
314 +}
309 +
310 +static void track_drop(void *opaque)
311 +{
312 + BlockDriverState *bs = (BlockDriverState*)opaque;
313 + BlockDriverState *file = bs->file->bs;
314 + BDRVAllocTrackState *s = bs->opaque;
315 +
316 + assert(file);
317 +
318 + /* Bottom half scheduled by track_change_backing_file: we may only replace
319 + * ourselves once nobody else uses this node, so re-schedule until at most
320 + * two refs remain (the guest's drive and the bdrv_ref taken when scheduling) */
321 + if (bs->refcnt > 2) {
322 + aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
323 + return;
324 + }
325 + AioContext *aio_context = bdrv_get_aio_context(bs);
326 + aio_context_acquire(aio_context);
327 +
328 + bdrv_drained_begin(bs);
329 +
330 + /* now drained: enter 'DropInProgress' so track_child_perm requests nothing */
331 + s->drop_state = DropInProgress;
332 + bdrv_child_refresh_perms(bs, bs->file, &error_abort);
333 +
334 + bdrv_replace_node(bs, file, &error_abort); /* 'file' takes our place */
335 + bdrv_set_backing_hd(bs, NULL, &error_abort);
336 + bdrv_drained_end(bs);
337 + bdrv_unref(bs); /* pairs with bdrv_ref in track_change_backing_file */
338 + aio_context_release(aio_context);
339 +}
340 +
341 +/*
342 + * Called when the backing file changes. With 'auto-remove' set, no drop
343 + * under way yet, and both 'backing_file' and 'backing_fmt' NULL, schedule
344 + * track_drop to take this node out of the block graph. Always returns 0.
345 + */
346 +static int track_change_backing_file(BlockDriverState *bs,
347 +                                     const char *backing_file,
348 +                                     const char *backing_fmt)
349 +{
350 +    BDRVAllocTrackState *state = bs->opaque;
351 +    bool detached = !backing_file && !backing_fmt;
352 +
353 +    if (!state->auto_remove || state->drop_state != DropNone || !detached) {
354 +        return 0;
355 +    }
356 +
357 +    /* the backing file is gone, so this node has no purpose anymore and is
358 +     * removed from the graph - but only later, from a bottom half: at the time
359 +     * of this call the blockjob modifying us is probably not done yet and
360 +     * still has a blocker on 'bs' */
361 +    state->drop_state = DropRequested;
362 +    bdrv_ref(bs);
363 +    aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
364 +
365 +    return 0;
366 +}
361 +
362 +/* Driver description of the 'alloc-track' node. */
363 +static BlockDriver bdrv_alloc_track = {
364 +    .format_name                = "alloc-track",
365 +    .instance_size              = sizeof(BDRVAllocTrackState),
366 +    .supports_backing           = true,
367 +
368 +    /* node lifecycle and properties */
369 +    .bdrv_file_open             = track_open,
370 +    .bdrv_close                 = track_close,
371 +    .bdrv_getlength             = track_getlength,
372 +    .bdrv_refresh_limits        = track_refresh_limits,
373 +    .bdrv_child_perm            = track_child_perm,
374 +    .bdrv_change_backing_file   = track_change_backing_file,
375 +
376 +    /* I/O path */
377 +    .bdrv_co_preadv             = track_co_preadv,
378 +    .bdrv_co_pwritev            = track_co_pwritev,
379 +    .bdrv_co_pwrite_zeroes      = track_co_pwrite_zeroes,
380 +    .bdrv_co_pdiscard           = track_co_pdiscard,
381 +    .bdrv_co_block_status       = track_co_block_status,
382 +
383 +    /* both flush callbacks share one implementation */
384 +    .bdrv_co_flush              = track_co_flush,
385 +    .bdrv_co_flush_to_disk      = track_co_flush,
386 +};
387 +
388 +/* Register the driver with the block layer; hooked up via block_init. */
389 +static void bdrv_alloc_track_init(void)
390 +{
391 +    bdrv_register(&bdrv_alloc_track);
392 +}
393 +
394 +block_init(bdrv_alloc_track_init);
392 diff --git a/block/meson.build b/block/meson.build
393 index a26a69434e..74e5f49758 100644
394 --- a/block/meson.build
395 +++ b/block/meson.build
396 @@ -2,6 +2,7 @@ block_ss.add(genh)
397 block_ss.add(files(
398 'accounting.c',
399 'aio_task.c',
400 + 'alloc-track.c',
401 'amend.c',
402 'backup.c',
403 'backup-dump.c',