]> git.proxmox.com Git - mirror_qemu.git/blame - block/preallocate.c
block: Mark read/write in block/io.c GRAPH_RDLOCK
[mirror_qemu.git] / block / preallocate.c
CommitLineData
33fa2222
VSO
1/*
2 * preallocate filter driver
3 *
4 * The driver performs preallocate operation: it is injected above
5 * some node, and before each write over EOF it does additional preallocating
6 * write-zeroes request.
7 *
8 * Copyright (c) 2020 Virtuozzo International GmbH.
9 *
10 * Author:
11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
25 */
26
27#include "qemu/osdep.h"
28
29#include "qapi/error.h"
30#include "qemu/module.h"
31#include "qemu/option.h"
32#include "qemu/units.h"
e2c1c34f 33#include "block/block-io.h"
33fa2222
VSO
34#include "block/block_int.h"
35
36
/* User-tunable parameters of the preallocate filter. */
typedef struct PreallocateOpts {
    /* Value of the "prealloc-size" option: extra bytes to preallocate. */
    int64_t prealloc_size;
    /* Value of the "prealloc-align" option: alignment of the file length. */
    int64_t prealloc_align;
} PreallocateOpts;
41
42typedef struct BDRVPreallocateState {
43 PreallocateOpts opts;
44
45 /*
46 * Track real data end, to crop preallocation on close. If < 0 the status is
47 * unknown.
48 *
49 * @data_end is a maximum of file size on open (or when we get write/resize
50 * permissions) and all write request ends after it. So it's safe to
51 * truncate to data_end if it is valid.
52 */
53 int64_t data_end;
54
55 /*
56 * Start of trailing preallocated area which reads as zero. May be smaller
57 * than data_end, if user does over-EOF write zero operation. If < 0 the
58 * status is unknown.
59 *
60 * If both @zero_start and @file_end are valid, the region
61 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
62 * is not valid, @zero_start doesn't make much sense.
63 */
64 int64_t zero_start;
65
66 /*
67 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
68 * to avoid extra lseek() calls on each write operation. If < 0 the status
69 * is unknown.
70 */
71 int64_t file_end;
72
73 /*
74 * All three states @data_end, @zero_start and @file_end are guaranteed to
75 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
76 * BLK_PERM_WRITE permissions on file child.
77 */
78} BDRVPreallocateState;
79
80#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
81#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
82static QemuOptsList runtime_opts = {
83 .name = "preallocate",
84 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
85 .desc = {
86 {
87 .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
88 .type = QEMU_OPT_SIZE,
89 .help = "on preallocation, align file length to this number, "
90 "default 1M",
91 },
92 {
93 .name = PREALLOCATE_OPT_PREALLOC_SIZE,
94 .type = QEMU_OPT_SIZE,
95 .help = "how much to preallocate, default 128M",
96 },
97 { /* end of list */ }
98 },
99};
100
101static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
102 BlockDriverState *child_bs, Error **errp)
103{
104 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
105
106 if (!qemu_opts_absorb_qdict(opts, options, errp)) {
107 return false;
108 }
109
110 dest->prealloc_align =
111 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
112 dest->prealloc_size =
113 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
114
115 qemu_opts_del(opts);
116
117 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
118 error_setg(errp, "prealloc-align parameter of preallocate filter "
119 "is not aligned to %llu", BDRV_SECTOR_SIZE);
120 return false;
121 }
122
123 if (!QEMU_IS_ALIGNED(dest->prealloc_align,
124 child_bs->bl.request_alignment)) {
125 error_setg(errp, "prealloc-align parameter of preallocate filter "
126 "is not aligned to underlying node request alignment "
127 "(%" PRIi32 ")", child_bs->bl.request_alignment);
128 return false;
129 }
130
131 return true;
132}
133
134static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
135 Error **errp)
136{
137 BDRVPreallocateState *s = bs->opaque;
83930780 138 int ret;
33fa2222
VSO
139
140 /*
141 * s->data_end and friends should be initialized on permission update.
142 * For this to work, mark them invalid.
143 */
144 s->file_end = s->zero_start = s->data_end = -EINVAL;
145
83930780
VSO
146 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
147 if (ret < 0) {
148 return ret;
33fa2222
VSO
149 }
150
151 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
152 return -EINVAL;
153 }
154
155 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
156 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
157
158 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
159 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
160 bs->file->bs->supported_zero_flags);
161
162 return 0;
163}
164
165static void preallocate_close(BlockDriverState *bs)
166{
167 int ret;
168 BDRVPreallocateState *s = bs->opaque;
169
170 if (s->data_end < 0) {
171 return;
172 }
173
174 if (s->file_end < 0) {
175 s->file_end = bdrv_getlength(bs->file->bs);
176 if (s->file_end < 0) {
177 return;
178 }
179 }
180
181 if (s->data_end < s->file_end) {
182 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
183 NULL);
184 s->file_end = ret < 0 ? ret : s->data_end;
185 }
186}
187
188
/*
 * Handle reopen.
 *
 * We must implement the reopen handlers, otherwise reopen simply doesn't work.
 * Handle the new options and don't care about the preallocation state, as it
 * is handled in the set/check permission handlers.
 */
196
197static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
198 BlockReopenQueue *queue, Error **errp)
199{
200 PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
201
202 if (!preallocate_absorb_opts(opts, reopen_state->options,
203 reopen_state->bs->file->bs, errp)) {
204 g_free(opts);
205 return -EINVAL;
206 }
207
208 reopen_state->opaque = opts;
209
210 return 0;
211}
212
213static void preallocate_reopen_commit(BDRVReopenState *state)
214{
215 BDRVPreallocateState *s = state->bs->opaque;
216
217 s->opts = *(PreallocateOpts *)state->opaque;
218
219 g_free(state->opaque);
220 state->opaque = NULL;
221}
222
223static void preallocate_reopen_abort(BDRVReopenState *state)
224{
225 g_free(state->opaque);
226 state->opaque = NULL;
227}
228
229static coroutine_fn int preallocate_co_preadv_part(
f7ef38dd
VSO
230 BlockDriverState *bs, int64_t offset, int64_t bytes,
231 QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
33fa2222
VSO
232{
233 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
234 flags);
235}
236
9a5a1c62
EGE
237static int coroutine_fn GRAPH_RDLOCK
238preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
33fa2222
VSO
239{
240 return bdrv_co_pdiscard(bs->file, offset, bytes);
241}
242
243static bool can_write_resize(uint64_t perm)
244{
245 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
246}
247
248static bool has_prealloc_perms(BlockDriverState *bs)
249{
250 BDRVPreallocateState *s = bs->opaque;
251
252 if (can_write_resize(bs->file->perm)) {
253 assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
254 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
255 return true;
256 }
257
258 assert(s->data_end < 0);
259 assert(s->zero_start < 0);
260 assert(s->file_end < 0);
261 return false;
262}
263
264/*
265 * Call on each write. Returns true if @want_merge_zero is true and the region
266 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
267 * preallocation).
268 *
269 * want_merge_zero is used to merge write-zero request with preallocation in
270 * one bdrv_co_pwrite_zeroes() call.
271 */
abaf8b75
KW
272static bool coroutine_fn GRAPH_RDLOCK
273handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
274 bool want_merge_zero)
33fa2222
VSO
275{
276 BDRVPreallocateState *s = bs->opaque;
277 int64_t end = offset + bytes;
278 int64_t prealloc_start, prealloc_end;
279 int ret;
45e62b46
VSO
280 uint32_t file_align = bs->file->bs->bl.request_alignment;
281 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
282
283 assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
33fa2222
VSO
284
285 if (!has_prealloc_perms(bs)) {
286 /* We don't have state neither should try to recover it */
287 return false;
288 }
289
290 if (s->data_end < 0) {
bd53086e 291 s->data_end = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
292 if (s->data_end < 0) {
293 return false;
294 }
295
296 if (s->file_end < 0) {
297 s->file_end = s->data_end;
298 }
299 }
300
301 if (end <= s->data_end) {
302 return false;
303 }
304
305 /* We have valid s->data_end, and request writes beyond it. */
306
307 s->data_end = end;
308 if (s->zero_start < 0 || !want_merge_zero) {
309 s->zero_start = end;
310 }
311
312 if (s->file_end < 0) {
bd53086e 313 s->file_end = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
314 if (s->file_end < 0) {
315 return false;
316 }
317 }
318
319 /* Now s->data_end, s->zero_start and s->file_end are valid. */
320
321 if (end <= s->file_end) {
322 /* No preallocation needed. */
323 return want_merge_zero && offset >= s->zero_start;
324 }
325
326 /* Now we want new preallocation, as request writes beyond s->file_end. */
327
45e62b46
VSO
328 prealloc_start = QEMU_ALIGN_UP(
329 want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
330 file_align);
331 prealloc_end = QEMU_ALIGN_UP(
332 MAX(prealloc_start, end) + s->opts.prealloc_size,
333 prealloc_align);
334
335 want_merge_zero = want_merge_zero && (prealloc_start <= offset);
33fa2222
VSO
336
337 ret = bdrv_co_pwrite_zeroes(
338 bs->file, prealloc_start, prealloc_end - prealloc_start,
339 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
340 if (ret < 0) {
341 s->file_end = ret;
342 return false;
343 }
344
345 s->file_end = prealloc_end;
346 return want_merge_zero;
347}
348
abaf8b75
KW
349static int coroutine_fn GRAPH_RDLOCK
350preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
351 int64_t bytes, BdrvRequestFlags flags)
33fa2222
VSO
352{
353 bool want_merge_zero =
354 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
355 if (handle_write(bs, offset, bytes, want_merge_zero)) {
356 return 0;
357 }
358
359 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
360}
361
362static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
e75abeda
VSO
363 int64_t offset,
364 int64_t bytes,
33fa2222
VSO
365 QEMUIOVector *qiov,
366 size_t qiov_offset,
e75abeda 367 BdrvRequestFlags flags)
33fa2222 368{
abaf8b75 369 assume_graph_lock(); /* FIXME */
33fa2222
VSO
370 handle_write(bs, offset, bytes, false);
371
372 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
373 flags);
374}
375
c2b8e315 376static int coroutine_fn GRAPH_RDLOCK
33fa2222
VSO
377preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
378 bool exact, PreallocMode prealloc,
379 BdrvRequestFlags flags, Error **errp)
380{
381 ERRP_GUARD();
382 BDRVPreallocateState *s = bs->opaque;
383 int ret;
384
385 if (s->data_end >= 0 && offset > s->data_end) {
386 if (s->file_end < 0) {
bd53086e 387 s->file_end = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
388 if (s->file_end < 0) {
389 error_setg(errp, "failed to get file length");
390 return s->file_end;
391 }
392 }
393
394 if (prealloc == PREALLOC_MODE_FALLOC) {
395 /*
396 * If offset <= s->file_end, the task is already done, just
397 * update s->data_end, to move part of "filter preallocation"
398 * to "preallocation requested by user".
399 * Otherwise just proceed to preallocate missing part.
400 */
401 if (offset <= s->file_end) {
402 s->data_end = offset;
403 return 0;
404 }
405 } else {
406 /*
407 * We have to drop our preallocation, to
408 * - avoid "Cannot use preallocation for shrinking files" in
409 * case of offset < file_end
410 * - give PREALLOC_MODE_OFF a chance to keep small disk
411 * usage
412 * - give PREALLOC_MODE_FULL a chance to actually write the
413 * whole region as user expects
414 */
415 if (s->file_end > s->data_end) {
416 ret = bdrv_co_truncate(bs->file, s->data_end, true,
417 PREALLOC_MODE_OFF, 0, errp);
418 if (ret < 0) {
419 s->file_end = ret;
420 error_prepend(errp, "preallocate-filter: failed to drop "
421 "write-zero preallocation: ");
422 return ret;
423 }
424 s->file_end = s->data_end;
425 }
426 }
427
428 s->data_end = offset;
429 }
430
431 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
432 if (ret < 0) {
433 s->file_end = s->zero_start = s->data_end = ret;
434 return ret;
435 }
436
437 if (has_prealloc_perms(bs)) {
438 s->file_end = s->zero_start = s->data_end = offset;
439 }
440 return 0;
441}
442
88095349 443static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
33fa2222
VSO
444{
445 return bdrv_co_flush(bs->file->bs);
446}
447
c86422c5 448static int64_t coroutine_fn preallocate_co_getlength(BlockDriverState *bs)
33fa2222
VSO
449{
450 int64_t ret;
451 BDRVPreallocateState *s = bs->opaque;
452
453 if (s->data_end >= 0) {
454 return s->data_end;
455 }
456
c86422c5 457 ret = bdrv_co_getlength(bs->file->bs);
33fa2222
VSO
458
459 if (has_prealloc_perms(bs)) {
460 s->file_end = s->zero_start = s->data_end = ret;
461 }
462
463 return ret;
464}
465
466static int preallocate_check_perm(BlockDriverState *bs,
467 uint64_t perm, uint64_t shared, Error **errp)
468{
469 BDRVPreallocateState *s = bs->opaque;
470
471 if (s->data_end >= 0 && !can_write_resize(perm)) {
472 /*
473 * Lose permissions.
474 * We should truncate in check_perm, as in set_perm bs->file->perm will
475 * be already changed, and we should not violate it.
476 */
477 if (s->file_end < 0) {
478 s->file_end = bdrv_getlength(bs->file->bs);
479 if (s->file_end < 0) {
480 error_setg(errp, "Failed to get file length");
481 return s->file_end;
482 }
483 }
484
485 if (s->data_end < s->file_end) {
486 int ret = bdrv_truncate(bs->file, s->data_end, true,
487 PREALLOC_MODE_OFF, 0, NULL);
488 if (ret < 0) {
489 error_setg(errp, "Failed to drop preallocation");
490 s->file_end = ret;
491 return ret;
492 }
493 s->file_end = s->data_end;
494 }
495 }
496
497 return 0;
498}
499
500static void preallocate_set_perm(BlockDriverState *bs,
501 uint64_t perm, uint64_t shared)
502{
503 BDRVPreallocateState *s = bs->opaque;
504
505 if (can_write_resize(perm)) {
506 if (s->data_end < 0) {
507 s->data_end = s->file_end = s->zero_start =
508 bdrv_getlength(bs->file->bs);
509 }
510 } else {
511 /*
512 * We drop our permissions, as well as allow shared
513 * permissions (see preallocate_child_perm), anyone will be able to
514 * change the child, so mark all states invalid. We'll regain control if
515 * get good permissions back.
516 */
517 s->data_end = s->file_end = s->zero_start = -EINVAL;
518 }
519}
520
521static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
522 BdrvChildRole role, BlockReopenQueue *reopen_queue,
523 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
524{
525 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
526
527 if (can_write_resize(perm)) {
528 /* This should come by default, but let's enforce: */
529 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
530
531 /*
532 * Don't share, to keep our states s->file_end, s->data_end and
533 * s->zero_start valid.
534 */
535 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
536 }
537}
538
539BlockDriver bdrv_preallocate_filter = {
540 .format_name = "preallocate",
541 .instance_size = sizeof(BDRVPreallocateState),
542
c86422c5
EGE
543 .bdrv_co_getlength = preallocate_co_getlength,
544 .bdrv_open = preallocate_open,
545 .bdrv_close = preallocate_close,
33fa2222
VSO
546
547 .bdrv_reopen_prepare = preallocate_reopen_prepare,
548 .bdrv_reopen_commit = preallocate_reopen_commit,
549 .bdrv_reopen_abort = preallocate_reopen_abort,
550
551 .bdrv_co_preadv_part = preallocate_co_preadv_part,
552 .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
553 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
554 .bdrv_co_pdiscard = preallocate_co_pdiscard,
555 .bdrv_co_flush = preallocate_co_flush,
556 .bdrv_co_truncate = preallocate_co_truncate,
557
558 .bdrv_check_perm = preallocate_check_perm,
559 .bdrv_set_perm = preallocate_set_perm,
560 .bdrv_child_perm = preallocate_child_perm,
561
562 .has_variable_length = true,
563 .is_filter = true,
564};
565
566static void bdrv_preallocate_init(void)
567{
568 bdrv_register(&bdrv_preallocate_filter);
569}
570
571block_init(bdrv_preallocate_init);