]> git.proxmox.com Git - mirror_qemu.git/blame - block/preallocate.c
block: Mark bdrv_filter_bs() and callers GRAPH_RDLOCK
[mirror_qemu.git] / block / preallocate.c
CommitLineData
33fa2222
VSO
1/*
2 * preallocate filter driver
3 *
4 * The driver performs preallocate operation: it is injected above
5 * some node, and before each write over EOF it does additional preallocating
6 * write-zeroes request.
7 *
8 * Copyright (c) 2020 Virtuozzo International GmbH.
9 *
10 * Author:
11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
25 */
26
27#include "qemu/osdep.h"
28
29#include "qapi/error.h"
30#include "qemu/module.h"
31#include "qemu/option.h"
32#include "qemu/units.h"
e2c1c34f 33#include "block/block-io.h"
33fa2222
VSO
34#include "block/block_int.h"
35
36
/* User-tunable parameters of the preallocate filter (see runtime_opts). */
typedef struct PreallocateOpts {
    /* Value of the "prealloc-size" option: how much to preallocate. */
    int64_t prealloc_size;
    /* Value of the "prealloc-align" option: file length alignment. */
    int64_t prealloc_align;
} PreallocateOpts;
41
42typedef struct BDRVPreallocateState {
43 PreallocateOpts opts;
44
45 /*
46 * Track real data end, to crop preallocation on close. If < 0 the status is
47 * unknown.
48 *
49 * @data_end is a maximum of file size on open (or when we get write/resize
50 * permissions) and all write request ends after it. So it's safe to
51 * truncate to data_end if it is valid.
52 */
53 int64_t data_end;
54
55 /*
56 * Start of trailing preallocated area which reads as zero. May be smaller
57 * than data_end, if user does over-EOF write zero operation. If < 0 the
58 * status is unknown.
59 *
60 * If both @zero_start and @file_end are valid, the region
61 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
62 * is not valid, @zero_start doesn't make much sense.
63 */
64 int64_t zero_start;
65
66 /*
67 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
68 * to avoid extra lseek() calls on each write operation. If < 0 the status
69 * is unknown.
70 */
71 int64_t file_end;
72
73 /*
74 * All three states @data_end, @zero_start and @file_end are guaranteed to
75 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
76 * BLK_PERM_WRITE permissions on file child.
77 */
edcce17b
KW
78
79 /* Gives up the resize permission on children when parents don't need it */
80 QEMUBH *drop_resize_bh;
33fa2222
VSO
81} BDRVPreallocateState;
82
edcce17b
KW
83static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
84static void preallocate_drop_resize_bh(void *opaque);
85
33fa2222
VSO
86#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
87#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
88static QemuOptsList runtime_opts = {
89 .name = "preallocate",
90 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
91 .desc = {
92 {
93 .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
94 .type = QEMU_OPT_SIZE,
95 .help = "on preallocation, align file length to this number, "
96 "default 1M",
97 },
98 {
99 .name = PREALLOCATE_OPT_PREALLOC_SIZE,
100 .type = QEMU_OPT_SIZE,
101 .help = "how much to preallocate, default 128M",
102 },
103 { /* end of list */ }
104 },
105};
106
107static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
108 BlockDriverState *child_bs, Error **errp)
109{
110 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
111
112 if (!qemu_opts_absorb_qdict(opts, options, errp)) {
113 return false;
114 }
115
116 dest->prealloc_align =
117 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
118 dest->prealloc_size =
119 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
120
121 qemu_opts_del(opts);
122
123 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
124 error_setg(errp, "prealloc-align parameter of preallocate filter "
125 "is not aligned to %llu", BDRV_SECTOR_SIZE);
126 return false;
127 }
128
129 if (!QEMU_IS_ALIGNED(dest->prealloc_align,
130 child_bs->bl.request_alignment)) {
131 error_setg(errp, "prealloc-align parameter of preallocate filter "
132 "is not aligned to underlying node request alignment "
133 "(%" PRIi32 ")", child_bs->bl.request_alignment);
134 return false;
135 }
136
137 return true;
138}
139
140static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
141 Error **errp)
142{
143 BDRVPreallocateState *s = bs->opaque;
83930780 144 int ret;
33fa2222
VSO
145
146 /*
147 * s->data_end and friends should be initialized on permission update.
148 * For this to work, mark them invalid.
149 */
150 s->file_end = s->zero_start = s->data_end = -EINVAL;
edcce17b 151 s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
33fa2222 152
83930780
VSO
153 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
154 if (ret < 0) {
155 return ret;
33fa2222
VSO
156 }
157
158 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
159 return -EINVAL;
160 }
161
162 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
163 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
164
165 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
166 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
167 bs->file->bs->supported_zero_flags);
168
169 return 0;
170}
171
01e28f60 172static int preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
33fa2222 173{
33fa2222 174 BDRVPreallocateState *s = bs->opaque;
01e28f60 175 int ret;
33fa2222
VSO
176
177 if (s->file_end < 0) {
178 s->file_end = bdrv_getlength(bs->file->bs);
179 if (s->file_end < 0) {
01e28f60
KW
180 error_setg_errno(errp, -s->file_end, "Failed to get file length");
181 return s->file_end;
33fa2222
VSO
182 }
183 }
184
185 if (s->data_end < s->file_end) {
186 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
187 NULL);
01e28f60
KW
188 if (ret < 0) {
189 error_setg_errno(errp, -ret, "Failed to drop preallocation");
190 s->file_end = ret;
191 return ret;
192 }
193 s->file_end = s->data_end;
194 }
195
196 return 0;
197}
198
199static void preallocate_close(BlockDriverState *bs)
200{
201 BDRVPreallocateState *s = bs->opaque;
202
edcce17b
KW
203 qemu_bh_cancel(s->drop_resize_bh);
204 qemu_bh_delete(s->drop_resize_bh);
205
01e28f60
KW
206 if (s->data_end >= 0) {
207 preallocate_truncate_to_real_size(bs, NULL);
33fa2222
VSO
208 }
209}
210
211
212/*
213 * Handle reopen.
214 *
215 * We must implement reopen handlers, otherwise reopen simply doesn't work. Handle
216 * new options and don't care about preallocation state, as it is handled in
217 * set/check permission handlers.
218 */
219
220static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
221 BlockReopenQueue *queue, Error **errp)
222{
223 PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
edcce17b 224 int ret;
33fa2222
VSO
225
226 if (!preallocate_absorb_opts(opts, reopen_state->options,
227 reopen_state->bs->file->bs, errp)) {
228 g_free(opts);
229 return -EINVAL;
230 }
231
edcce17b
KW
232 /*
233 * Drop the preallocation already here if reopening read-only. The child
234 * might also be reopened read-only and then scheduling a BH during the
235 * permission update is too late.
236 */
237 if ((reopen_state->flags & BDRV_O_RDWR) == 0) {
238 ret = preallocate_drop_resize(reopen_state->bs, errp);
239 if (ret < 0) {
240 g_free(opts);
241 return ret;
242 }
243 }
244
33fa2222
VSO
245 reopen_state->opaque = opts;
246
247 return 0;
248}
249
250static void preallocate_reopen_commit(BDRVReopenState *state)
251{
252 BDRVPreallocateState *s = state->bs->opaque;
253
254 s->opts = *(PreallocateOpts *)state->opaque;
255
256 g_free(state->opaque);
257 state->opaque = NULL;
258}
259
260static void preallocate_reopen_abort(BDRVReopenState *state)
261{
262 g_free(state->opaque);
263 state->opaque = NULL;
264}
265
b9b10c35
KW
266static int coroutine_fn GRAPH_RDLOCK
267preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
268 QEMUIOVector *qiov, size_t qiov_offset,
269 BdrvRequestFlags flags)
33fa2222
VSO
270{
271 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
272 flags);
273}
274
9a5a1c62
EGE
275static int coroutine_fn GRAPH_RDLOCK
276preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
33fa2222
VSO
277{
278 return bdrv_co_pdiscard(bs->file, offset, bytes);
279}
280
281static bool can_write_resize(uint64_t perm)
282{
283 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
284}
285
286static bool has_prealloc_perms(BlockDriverState *bs)
287{
288 BDRVPreallocateState *s = bs->opaque;
289
290 if (can_write_resize(bs->file->perm)) {
291 assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
292 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
293 return true;
294 }
295
296 assert(s->data_end < 0);
297 assert(s->zero_start < 0);
298 assert(s->file_end < 0);
299 return false;
300}
301
302/*
303 * Call on each write. Returns true if @want_merge_zero is true and the region
304 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
305 * preallocation).
306 *
307 * want_merge_zero is used to merge write-zero request with preallocation in
308 * one bdrv_co_pwrite_zeroes() call.
309 */
abaf8b75
KW
310static bool coroutine_fn GRAPH_RDLOCK
311handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
312 bool want_merge_zero)
33fa2222
VSO
313{
314 BDRVPreallocateState *s = bs->opaque;
315 int64_t end = offset + bytes;
316 int64_t prealloc_start, prealloc_end;
317 int ret;
45e62b46
VSO
318 uint32_t file_align = bs->file->bs->bl.request_alignment;
319 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
320
321 assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
33fa2222
VSO
322
323 if (!has_prealloc_perms(bs)) {
324 /* We don't have state neither should try to recover it */
325 return false;
326 }
327
328 if (s->data_end < 0) {
bd53086e 329 s->data_end = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
330 if (s->data_end < 0) {
331 return false;
332 }
333
334 if (s->file_end < 0) {
335 s->file_end = s->data_end;
336 }
337 }
338
339 if (end <= s->data_end) {
340 return false;
341 }
342
343 /* We have valid s->data_end, and request writes beyond it. */
344
345 s->data_end = end;
346 if (s->zero_start < 0 || !want_merge_zero) {
347 s->zero_start = end;
348 }
349
350 if (s->file_end < 0) {
bd53086e 351 s->file_end = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
352 if (s->file_end < 0) {
353 return false;
354 }
355 }
356
357 /* Now s->data_end, s->zero_start and s->file_end are valid. */
358
359 if (end <= s->file_end) {
360 /* No preallocation needed. */
361 return want_merge_zero && offset >= s->zero_start;
362 }
363
364 /* Now we want new preallocation, as request writes beyond s->file_end. */
365
45e62b46
VSO
366 prealloc_start = QEMU_ALIGN_UP(
367 want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
368 file_align);
369 prealloc_end = QEMU_ALIGN_UP(
370 MAX(prealloc_start, end) + s->opts.prealloc_size,
371 prealloc_align);
372
373 want_merge_zero = want_merge_zero && (prealloc_start <= offset);
33fa2222
VSO
374
375 ret = bdrv_co_pwrite_zeroes(
376 bs->file, prealloc_start, prealloc_end - prealloc_start,
377 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
378 if (ret < 0) {
379 s->file_end = ret;
380 return false;
381 }
382
383 s->file_end = prealloc_end;
384 return want_merge_zero;
385}
386
abaf8b75
KW
387static int coroutine_fn GRAPH_RDLOCK
388preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
389 int64_t bytes, BdrvRequestFlags flags)
33fa2222
VSO
390{
391 bool want_merge_zero =
392 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
393 if (handle_write(bs, offset, bytes, want_merge_zero)) {
394 return 0;
395 }
396
397 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
398}
399
b9b10c35
KW
400static int coroutine_fn GRAPH_RDLOCK
401preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
402 QEMUIOVector *qiov, size_t qiov_offset,
403 BdrvRequestFlags flags)
33fa2222
VSO
404{
405 handle_write(bs, offset, bytes, false);
406
407 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
408 flags);
409}
410
c2b8e315 411static int coroutine_fn GRAPH_RDLOCK
33fa2222
VSO
412preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
413 bool exact, PreallocMode prealloc,
414 BdrvRequestFlags flags, Error **errp)
415{
416 ERRP_GUARD();
417 BDRVPreallocateState *s = bs->opaque;
418 int ret;
419
420 if (s->data_end >= 0 && offset > s->data_end) {
421 if (s->file_end < 0) {
bd53086e 422 s->file_end = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
423 if (s->file_end < 0) {
424 error_setg(errp, "failed to get file length");
425 return s->file_end;
426 }
427 }
428
429 if (prealloc == PREALLOC_MODE_FALLOC) {
430 /*
431 * If offset <= s->file_end, the task is already done, just
432 * update s->data_end, to move part of "filter preallocation"
433 * to "preallocation requested by user".
434 * Otherwise just proceed to preallocate missing part.
435 */
436 if (offset <= s->file_end) {
437 s->data_end = offset;
438 return 0;
439 }
440 } else {
441 /*
442 * We have to drop our preallocation, to
443 * - avoid "Cannot use preallocation for shrinking files" in
444 * case of offset < file_end
445 * - give PREALLOC_MODE_OFF a chance to keep small disk
446 * usage
447 * - give PREALLOC_MODE_FULL a chance to actually write the
448 * whole region as user expects
449 */
450 if (s->file_end > s->data_end) {
451 ret = bdrv_co_truncate(bs->file, s->data_end, true,
452 PREALLOC_MODE_OFF, 0, errp);
453 if (ret < 0) {
454 s->file_end = ret;
455 error_prepend(errp, "preallocate-filter: failed to drop "
456 "write-zero preallocation: ");
457 return ret;
458 }
459 s->file_end = s->data_end;
460 }
461 }
462
463 s->data_end = offset;
464 }
465
466 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
467 if (ret < 0) {
468 s->file_end = s->zero_start = s->data_end = ret;
469 return ret;
470 }
471
472 if (has_prealloc_perms(bs)) {
473 s->file_end = s->zero_start = s->data_end = offset;
474 }
475 return 0;
476}
477
88095349 478static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
33fa2222
VSO
479{
480 return bdrv_co_flush(bs->file->bs);
481}
482
8ab8140a
KW
483static int64_t coroutine_fn GRAPH_RDLOCK
484preallocate_co_getlength(BlockDriverState *bs)
33fa2222
VSO
485{
486 int64_t ret;
487 BDRVPreallocateState *s = bs->opaque;
488
489 if (s->data_end >= 0) {
490 return s->data_end;
491 }
492
c86422c5 493 ret = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
494
495 if (has_prealloc_perms(bs)) {
496 s->file_end = s->zero_start = s->data_end = ret;
497 }
498
499 return ret;
500}
501
edcce17b 502static int preallocate_drop_resize(BlockDriverState *bs, Error **errp)
33fa2222
VSO
503{
504 BDRVPreallocateState *s = bs->opaque;
edcce17b 505 int ret;
33fa2222 506
edcce17b
KW
507 if (s->data_end < 0) {
508 return 0;
509 }
510
511 /*
512 * Before switching children to be read-only, truncate them to remove
513 * the preallocation and let them have the real size.
514 */
515 ret = preallocate_truncate_to_real_size(bs, errp);
516 if (ret < 0) {
517 return ret;
33fa2222
VSO
518 }
519
edcce17b
KW
520 /*
521 * We'll drop our permissions and will allow other users to take write and
522 * resize permissions (see preallocate_child_perm). Anyone will be able to
523 * change the child, so mark all states invalid. We'll regain control if a
524 * parent requests write access again.
525 */
526 s->data_end = s->file_end = s->zero_start = -EINVAL;
527
528 bdrv_graph_rdlock_main_loop();
529 bdrv_child_refresh_perms(bs, bs->file, NULL);
530 bdrv_graph_rdunlock_main_loop();
531
33fa2222
VSO
532 return 0;
533}
534
edcce17b
KW
535static void preallocate_drop_resize_bh(void *opaque)
536{
537 /*
538 * In case of errors, we'll simply keep the exclusive lock on the image
539 * indefinitely.
540 */
541 preallocate_drop_resize(opaque, NULL);
542}
543
33fa2222
VSO
544static void preallocate_set_perm(BlockDriverState *bs,
545 uint64_t perm, uint64_t shared)
546{
547 BDRVPreallocateState *s = bs->opaque;
548
549 if (can_write_resize(perm)) {
edcce17b 550 qemu_bh_cancel(s->drop_resize_bh);
33fa2222
VSO
551 if (s->data_end < 0) {
552 s->data_end = s->file_end = s->zero_start =
edcce17b 553 bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
33fa2222
VSO
554 }
555 } else {
edcce17b 556 qemu_bh_schedule(s->drop_resize_bh);
33fa2222
VSO
557 }
558}
559
560static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
561 BdrvChildRole role, BlockReopenQueue *reopen_queue,
562 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
563{
edcce17b
KW
564 BDRVPreallocateState *s = bs->opaque;
565
33fa2222
VSO
566 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
567
edcce17b
KW
568 /*
569 * We need exclusive write and resize permissions on the child not only when
570 * the parent can write to it, but also after the parent gave up write
571 * permissions until preallocate_drop_resize() has completed.
572 */
573 if (can_write_resize(perm) || s->data_end != -EINVAL) {
33fa2222
VSO
574 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
575
576 /*
577 * Don't share, to keep our states s->file_end, s->data_end and
578 * s->zero_start valid.
579 */
580 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
581 }
582}
583
9ea473fb 584static BlockDriver bdrv_preallocate_filter = {
33fa2222
VSO
585 .format_name = "preallocate",
586 .instance_size = sizeof(BDRVPreallocateState),
587
c86422c5
EGE
588 .bdrv_co_getlength = preallocate_co_getlength,
589 .bdrv_open = preallocate_open,
590 .bdrv_close = preallocate_close,
33fa2222
VSO
591
592 .bdrv_reopen_prepare = preallocate_reopen_prepare,
593 .bdrv_reopen_commit = preallocate_reopen_commit,
594 .bdrv_reopen_abort = preallocate_reopen_abort,
595
596 .bdrv_co_preadv_part = preallocate_co_preadv_part,
597 .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
598 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
599 .bdrv_co_pdiscard = preallocate_co_pdiscard,
600 .bdrv_co_flush = preallocate_co_flush,
601 .bdrv_co_truncate = preallocate_co_truncate,
602
33fa2222
VSO
603 .bdrv_set_perm = preallocate_set_perm,
604 .bdrv_child_perm = preallocate_child_perm,
605
33fa2222
VSO
606 .is_filter = true,
607};
608
609static void bdrv_preallocate_init(void)
610{
611 bdrv_register(&bdrv_preallocate_filter);
612}
613
614block_init(bdrv_preallocate_init);