]>
Commit | Line | Data |
---|---|---|
677d0d16 SR |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Stefan Reiter <s.reiter@proxmox.com> | |
3 | Date: Mon, 7 Dec 2020 15:21:03 +0100 | |
4 | Subject: [PATCH] block: add alloc-track driver | |
5 | ||
6 | Add a new filter node 'alloc-track', which separates reads and writes to | |
7 | different children, thus allowing a backing image to be put behind any | |
8 | blockdev (regardless of driver support). Since we can't detect any | |
9 | pre-allocated blocks, we can only track new writes, hence the write | |
10 | target ('file') for this node must always be empty. | |
11 | ||
12 | Intended use case is for live restoring, i.e. add a backup image as a | |
13 | block device into a VM, then put an alloc-track on the restore target | |
14 | and set the backup as backing. With this, one can use a regular | |
15 | 'block-stream' to restore the image, while the VM can already run in the | |
16 | background. Copy-on-read will help make progress as the VM reads as | |
17 | well. | |
18 | ||
19 | Previously this only worked if the target supported backing images, i.e. | |
20 | only for qcow2; with alloc-track, any driver for the target can be used. | |
21 | ||
22 | If 'auto-remove' is set, alloc-track will automatically detach itself | |
23 | once the backing image is removed. It will be replaced by 'file'. | |
24 | ||
25 | Signed-off-by: Stefan Reiter <s.reiter@proxmox.com> | |
ddbf7a87 | 26 | Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com> |
4e1935c2 | 27 | [FE: adapt to changed function signatures |
db5d2a4b FE |
28 | make error return value consistent with QEMU |
29 | avoid premature break during read] | |
4e1935c2 | 30 | Signed-off-by: Fiona Ebner <f.ebner@proxmox.com> |
677d0d16 | 31 | --- |
db5d2a4b | 32 | block/alloc-track.c | 352 ++++++++++++++++++++++++++++++++++++++++++++ |
677d0d16 | 33 | block/meson.build | 1 + |
db5d2a4b | 34 | 2 files changed, 353 insertions(+) |
677d0d16 SR |
35 | create mode 100644 block/alloc-track.c |
36 | ||
37 | diff --git a/block/alloc-track.c b/block/alloc-track.c | |
38 | new file mode 100644 | |
db5d2a4b | 39 | index 0000000000..b75d7c6460 |
677d0d16 SR |
40 | --- /dev/null |
41 | +++ b/block/alloc-track.c | |
db5d2a4b | 42 | @@ -0,0 +1,352 @@ |
677d0d16 SR |
43 | +/* |
44 | + * Node to allow backing images to be applied to any node. Assumes a blank | |
45 | + * image to begin with, only new writes are tracked as allocated, thus this | |
46 | + * must never be put on a node that already contains data. | |
47 | + * | |
48 | + * Copyright (c) 2020 Proxmox Server Solutions GmbH | |
49 | + * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com> | |
50 | + * | |
51 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
52 | + * See the COPYING file in the top-level directory. | |
53 | + */ | |
54 | + | |
55 | +#include "qemu/osdep.h" | |
56 | +#include "qapi/error.h" | |
57 | +#include "block/block_int.h" | |
bf251437 | 58 | +#include "block/dirty-bitmap.h" |
677d0d16 SR |
59 | +#include "qapi/qmp/qdict.h" |
60 | +#include "qapi/qmp/qstring.h" | |
61 | +#include "qemu/cutils.h" | |
62 | +#include "qemu/option.h" | |
63 | +#include "qemu/module.h" | |
64 | +#include "sysemu/block-backend.h" | |
65 | + | |
66 | +#define TRACK_OPT_AUTO_REMOVE "auto-remove" | |
67 | + | |
68 | +typedef enum DropState { | |
69 | + DropNone, | |
70 | + DropRequested, | |
71 | + DropInProgress, | |
72 | +} DropState; | |
73 | + | |
74 | +typedef struct { | |
75 | + BdrvDirtyBitmap *bitmap; | |
76 | + DropState drop_state; | |
77 | + bool auto_remove; | |
78 | +} BDRVAllocTrackState; | |
79 | + | |
80 | +static QemuOptsList runtime_opts = { | |
81 | + .name = "alloc-track", | |
82 | + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), | |
83 | + .desc = { | |
84 | + { | |
85 | + .name = TRACK_OPT_AUTO_REMOVE, | |
86 | + .type = QEMU_OPT_BOOL, | |
87 | + .help = "automatically replace this node with 'file' when 'backing'" | |
88 | + "is detached", | |
89 | + }, | |
90 | + { /* end of list */ } | |
91 | + }, | |
92 | +}; | |
93 | + | |
94 | +static void track_refresh_limits(BlockDriverState *bs, Error **errp) | |
95 | +{ | |
96 | + BlockDriverInfo bdi; | |
97 | + | |
98 | + if (!bs->file) { | |
99 | + return; | |
100 | + } | |
101 | + | |
102 | + /* always use alignment from underlying write device so RMW cycle for | |
103 | + * bdrv_pwritev reads data from our backing via track_co_preadv (no partial | |
104 | + * cluster allocation in 'file') */ | |
105 | + bdrv_get_info(bs->file->bs, &bdi); | |
106 | + bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment, | |
107 | + MAX(bdi.cluster_size, BDRV_SECTOR_SIZE)); | |
108 | +} | |
109 | + | |
110 | +static int track_open(BlockDriverState *bs, QDict *options, int flags, | |
111 | + Error **errp) | |
112 | +{ | |
113 | + BDRVAllocTrackState *s = bs->opaque; | |
114 | + QemuOpts *opts; | |
115 | + Error *local_err = NULL; | |
116 | + int ret = 0; | |
117 | + | |
118 | + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); | |
119 | + qemu_opts_absorb_qdict(opts, options, &local_err); | |
120 | + if (local_err) { | |
121 | + error_propagate(errp, local_err); | |
122 | + ret = -EINVAL; | |
123 | + goto fail; | |
124 | + } | |
125 | + | |
126 | + s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false); | |
127 | + | |
128 | + /* open the target (write) node, backing will be attached by block layer */ | |
129 | + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, | |
130 | + BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false, | |
131 | + &local_err); | |
132 | + if (local_err) { | |
133 | + ret = -EINVAL; | |
134 | + error_propagate(errp, local_err); | |
135 | + goto fail; | |
136 | + } | |
137 | + | |
138 | + track_refresh_limits(bs, errp); | |
139 | + uint64_t gran = bs->bl.request_alignment; | |
140 | + s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, gran, NULL, &local_err); | |
141 | + if (local_err) { | |
142 | + ret = -EIO; | |
143 | + error_propagate(errp, local_err); | |
144 | + goto fail; | |
145 | + } | |
146 | + | |
147 | + s->drop_state = DropNone; | |
148 | + | |
149 | +fail: | |
150 | + if (ret < 0) { | |
151 | + bdrv_unref_child(bs, bs->file); | |
152 | + if (s->bitmap) { | |
153 | + bdrv_release_dirty_bitmap(s->bitmap); | |
154 | + } | |
155 | + } | |
156 | + qemu_opts_del(opts); | |
157 | + return ret; | |
158 | +} | |
159 | + | |
160 | +static void track_close(BlockDriverState *bs) | |
161 | +{ | |
162 | + BDRVAllocTrackState *s = bs->opaque; | |
163 | + if (s->bitmap) { | |
164 | + bdrv_release_dirty_bitmap(s->bitmap); | |
165 | + } | |
166 | +} | |
167 | + | |
bf251437 | 168 | +static coroutine_fn int64_t track_co_getlength(BlockDriverState *bs) |
677d0d16 | 169 | +{ |
bf251437 | 170 | + return bdrv_co_getlength(bs->file->bs); |
677d0d16 SR |
171 | +} |
172 | + | |
173 | +static int coroutine_fn track_co_preadv(BlockDriverState *bs, | |
4567474e | 174 | + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) |
677d0d16 SR |
175 | +{ |
176 | + BDRVAllocTrackState *s = bs->opaque; | |
177 | + QEMUIOVector local_qiov; | |
178 | + int ret; | |
179 | + | |
180 | + /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */ | |
181 | + uint64_t cur_offset, local_offset; | |
182 | + int64_t local_bytes; | |
183 | + bool alloc; | |
184 | + | |
4567474e FE |
185 | + if (offset < 0 || bytes < 0) { |
186 | + fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n"); | |
4e1935c2 | 187 | + return -EIO; |
4567474e FE |
188 | + } |
189 | + | |
677d0d16 SR |
190 | + /* a read request can span multiple granularity-sized chunks, and can thus |
191 | + * contain blocks with different allocation status - we could just iterate | |
192 | + * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X | |
193 | + * to find the next flip and consider everything up to that in one go */ | |
194 | + for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) { | |
195 | + local_offset = offset + cur_offset; | |
196 | + alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset); | |
197 | + if (alloc) { | |
198 | + local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset, | |
199 | + bytes - cur_offset); | |
200 | + } else { | |
201 | + local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset, | |
202 | + bytes - cur_offset); | |
203 | + } | |
204 | + | |
205 | + /* _bitmap_next_X returns -1 if no flip is found within the limit, | |
206 | + * otherwise the offset of the next flip (relative to start of image) */ | |
207 | + local_bytes = local_bytes < 0 ? | |
208 | + bytes - cur_offset : | |
209 | + local_bytes - local_offset; | |
210 | + | |
211 | + qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes); | |
212 | + | |
213 | + if (alloc) { | |
214 | + ret = bdrv_co_preadv(bs->file, local_offset, local_bytes, | |
215 | + &local_qiov, flags); | |
216 | + } else if (bs->backing) { | |
217 | + ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes, | |
218 | + &local_qiov, flags); | |
219 | + } else { | |
db5d2a4b FE |
220 | + qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes); |
221 | + ret = 0; | |
677d0d16 SR |
222 | + } |
223 | + | |
224 | + if (ret != 0) { | |
225 | + break; | |
226 | + } | |
227 | + } | |
228 | + | |
229 | + return ret; | |
230 | +} | |
231 | + | |
232 | +static int coroutine_fn track_co_pwritev(BlockDriverState *bs, | |
4567474e | 233 | + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) |
677d0d16 SR |
234 | +{ |
235 | + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); | |
236 | +} | |
237 | + | |
238 | +static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs, | |
4567474e | 239 | + int64_t offset, int64_t bytes, BdrvRequestFlags flags) |
677d0d16 | 240 | +{ |
4567474e | 241 | + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); |
677d0d16 SR |
242 | +} |
243 | + | |
244 | +static int coroutine_fn track_co_pdiscard(BlockDriverState *bs, | |
4567474e | 245 | + int64_t offset, int64_t bytes) |
677d0d16 | 246 | +{ |
4567474e | 247 | + return bdrv_co_pdiscard(bs->file, offset, bytes); |
677d0d16 SR |
248 | +} |
249 | + | |
250 | +static coroutine_fn int track_co_flush(BlockDriverState *bs) | |
251 | +{ | |
252 | + return bdrv_co_flush(bs->file->bs); | |
253 | +} | |
254 | + | |
255 | +static int coroutine_fn track_co_block_status(BlockDriverState *bs, | |
256 | + bool want_zero, | |
257 | + int64_t offset, | |
258 | + int64_t bytes, | |
259 | + int64_t *pnum, | |
260 | + int64_t *map, | |
261 | + BlockDriverState **file) | |
262 | +{ | |
263 | + BDRVAllocTrackState *s = bs->opaque; | |
264 | + | |
265 | + bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset); | |
266 | + int64_t next_flipped; | |
267 | + if (alloc) { | |
268 | + next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes); | |
269 | + } else { | |
270 | + next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes); | |
271 | + } | |
272 | + | |
273 | + /* in case not the entire region has the same state, we need to set pnum to | |
274 | + * indicate for how many bytes our result is valid */ | |
275 | + *pnum = next_flipped == -1 ? bytes : next_flipped - offset; | |
276 | + *map = offset; | |
277 | + | |
278 | + if (alloc) { | |
279 | + *file = bs->file->bs; | |
280 | + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; | |
281 | + } else if (bs->backing) { | |
282 | + *file = bs->backing->bs; | |
283 | + } | |
284 | + return 0; | |
285 | +} | |
286 | + | |
287 | +static void track_child_perm(BlockDriverState *bs, BdrvChild *c, | |
288 | + BdrvChildRole role, BlockReopenQueue *reopen_queue, | |
289 | + uint64_t perm, uint64_t shared, | |
290 | + uint64_t *nperm, uint64_t *nshared) | |
291 | +{ | |
292 | + BDRVAllocTrackState *s = bs->opaque; | |
293 | + | |
294 | + *nshared = BLK_PERM_ALL; | |
295 | + | |
296 | + /* in case we're currently dropping ourselves, claim to not use any | |
297 | + * permissions at all - which is fine, since from this point on we will | |
298 | + * never issue a read or write anymore */ | |
299 | + if (s->drop_state == DropInProgress) { | |
300 | + *nperm = 0; | |
301 | + return; | |
302 | + } | |
303 | + | |
304 | + if (role & BDRV_CHILD_DATA) { | |
305 | + *nperm = perm & DEFAULT_PERM_PASSTHROUGH; | |
306 | + } else { | |
307 | + /* 'backing' is also a child of our BDS, but we don't expect it to be | |
308 | + * writeable, so we only forward 'consistent read' */ | |
309 | + *nperm = perm & BLK_PERM_CONSISTENT_READ; | |
310 | + } | |
311 | +} | |
312 | + | |
313 | +static void track_drop(void *opaque) | |
314 | +{ | |
315 | + BlockDriverState *bs = (BlockDriverState*)opaque; | |
316 | + BlockDriverState *file = bs->file->bs; | |
317 | + BDRVAllocTrackState *s = bs->opaque; | |
318 | + | |
319 | + assert(file); | |
320 | + | |
321 | + /* we rely on the fact that we're not used anywhere else, so let's wait | |
322 | + * until we're only used once - in the drive connected to the guest (and one | |
323 | + * ref is held by bdrv_ref in track_change_backing_file) */ | |
324 | + if (bs->refcnt > 2) { | |
325 | + aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque); | |
326 | + return; | |
327 | + } | |
b36e8acc TL |
328 | + AioContext *aio_context = bdrv_get_aio_context(bs); |
329 | + aio_context_acquire(aio_context); | |
677d0d16 | 330 | + |
677d0d16 SR |
331 | + bdrv_drained_begin(bs); |
332 | + | |
333 | + /* now that we're drained, we can safely set 'DropInProgress' */ | |
334 | + s->drop_state = DropInProgress; | |
335 | + bdrv_child_refresh_perms(bs, bs->file, &error_abort); | |
336 | + | |
337 | + bdrv_replace_node(bs, file, &error_abort); | |
aa42ea26 TL |
338 | + bdrv_set_backing_hd(bs, NULL, &error_abort); |
339 | + bdrv_drained_end(bs); | |
677d0d16 | 340 | + bdrv_unref(bs); |
b36e8acc | 341 | + aio_context_release(aio_context); |
677d0d16 SR |
342 | +} |
343 | + | |
344 | +static int track_change_backing_file(BlockDriverState *bs, | |
345 | + const char *backing_file, | |
346 | + const char *backing_fmt) | |
347 | +{ | |
348 | + BDRVAllocTrackState *s = bs->opaque; | |
349 | + if (s->auto_remove && s->drop_state == DropNone && | |
350 | + backing_file == NULL && backing_fmt == NULL) | |
351 | + { | |
352 | + /* backing file has been disconnected, there's no longer any use for | |
353 | + * this node, so let's remove ourselves from the block graph - we need | |
354 | + * to schedule this for later however, since when this function is | |
355 | + * called, the blockjob modifying us is probably not done yet and has a | |
356 | + * blocker on 'bs' */ | |
357 | + s->drop_state = DropRequested; | |
358 | + bdrv_ref(bs); | |
359 | + aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs); | |
360 | + } | |
361 | + | |
362 | + return 0; | |
363 | +} | |
364 | + | |
365 | +static BlockDriver bdrv_alloc_track = { | |
366 | + .format_name = "alloc-track", | |
367 | + .instance_size = sizeof(BDRVAllocTrackState), | |
368 | + | |
369 | + .bdrv_file_open = track_open, | |
370 | + .bdrv_close = track_close, | |
bf251437 | 371 | + .bdrv_co_getlength = track_co_getlength, |
677d0d16 SR |
372 | + .bdrv_child_perm = track_child_perm, |
373 | + .bdrv_refresh_limits = track_refresh_limits, | |
374 | + | |
375 | + .bdrv_co_pwrite_zeroes = track_co_pwrite_zeroes, | |
376 | + .bdrv_co_pwritev = track_co_pwritev, | |
377 | + .bdrv_co_preadv = track_co_preadv, | |
378 | + .bdrv_co_pdiscard = track_co_pdiscard, | |
379 | + | |
380 | + .bdrv_co_flush = track_co_flush, | |
381 | + .bdrv_co_flush_to_disk = track_co_flush, | |
382 | + | |
383 | + .supports_backing = true, | |
384 | + | |
385 | + .bdrv_co_block_status = track_co_block_status, | |
386 | + .bdrv_change_backing_file = track_change_backing_file, | |
387 | +}; | |
388 | + | |
389 | +static void bdrv_alloc_track_init(void) | |
390 | +{ | |
391 | + bdrv_register(&bdrv_alloc_track); | |
392 | +} | |
393 | + | |
394 | +block_init(bdrv_alloc_track_init); | |
395 | diff --git a/block/meson.build b/block/meson.build | |
10e10933 | 396 | index becc99ac4e..0a69836593 100644 |
677d0d16 SR |
397 | --- a/block/meson.build |
398 | +++ b/block/meson.build | |
399 | @@ -2,6 +2,7 @@ block_ss.add(genh) | |
400 | block_ss.add(files( | |
401 | 'accounting.c', | |
402 | 'aio_task.c', | |
403 | + 'alloc-track.c', | |
404 | 'amend.c', | |
405 | 'backup.c', | |
406 | 'backup-dump.c', |