]> git.proxmox.com Git - mirror_qemu.git/blob - block/preallocate.c
d50ee7f49b5780311db995b619cd2b21a21af8b6
[mirror_qemu.git] / block / preallocate.c
1 /*
2 * preallocate filter driver
3 *
4 * The driver performs preallocate operation: it is injected above
5 * some node, and before each write over EOF it does additional preallocating
6 * write-zeroes request.
7 *
8 * Copyright (c) 2020 Virtuozzo International GmbH.
9 *
10 * Author:
11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
25 */
26
27 #include "qemu/osdep.h"
28
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block_int.h"
34
35
/*
 * Tunables of the preallocate filter, filled in from the runtime options
 * "prealloc-size" and "prealloc-align".
 */
typedef struct PreallocateOpts {
    /* Number of bytes to preallocate beyond the end of a write over EOF. */
    int64_t prealloc_size;

    /* The end of a newly preallocated area is rounded up to this value. */
    int64_t prealloc_align;
} PreallocateOpts;
40
41 typedef struct BDRVPreallocateState {
42 PreallocateOpts opts;
43
44 /*
45 * Track real data end, to crop preallocation on close. If < 0 the status is
46 * unknown.
47 *
48 * @data_end is a maximum of file size on open (or when we get write/resize
49 * permissions) and all write request ends after it. So it's safe to
50 * truncate to data_end if it is valid.
51 */
52 int64_t data_end;
53
54 /*
55 * Start of trailing preallocated area which reads as zero. May be smaller
56 * than data_end, if user does over-EOF write zero operation. If < 0 the
57 * status is unknown.
58 *
59 * If both @zero_start and @file_end are valid, the region
60 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
61 * is not valid, @zero_start doesn't make much sense.
62 */
63 int64_t zero_start;
64
65 /*
66 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
67 * to avoid extra lseek() calls on each write operation. If < 0 the status
68 * is unknown.
69 */
70 int64_t file_end;
71
72 /*
73 * All three states @data_end, @zero_start and @file_end are guaranteed to
74 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
75 * BLK_PERM_WRITE permissions on file child.
76 */
77 } BDRVPreallocateState;
78
79 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
80 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
81 static QemuOptsList runtime_opts = {
82 .name = "preallocate",
83 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
84 .desc = {
85 {
86 .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
87 .type = QEMU_OPT_SIZE,
88 .help = "on preallocation, align file length to this number, "
89 "default 1M",
90 },
91 {
92 .name = PREALLOCATE_OPT_PREALLOC_SIZE,
93 .type = QEMU_OPT_SIZE,
94 .help = "how much to preallocate, default 128M",
95 },
96 { /* end of list */ }
97 },
98 };
99
100 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
101 BlockDriverState *child_bs, Error **errp)
102 {
103 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
104
105 if (!qemu_opts_absorb_qdict(opts, options, errp)) {
106 return false;
107 }
108
109 dest->prealloc_align =
110 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
111 dest->prealloc_size =
112 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
113
114 qemu_opts_del(opts);
115
116 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
117 error_setg(errp, "prealloc-align parameter of preallocate filter "
118 "is not aligned to %llu", BDRV_SECTOR_SIZE);
119 return false;
120 }
121
122 if (!QEMU_IS_ALIGNED(dest->prealloc_align,
123 child_bs->bl.request_alignment)) {
124 error_setg(errp, "prealloc-align parameter of preallocate filter "
125 "is not aligned to underlying node request alignment "
126 "(%" PRIi32 ")", child_bs->bl.request_alignment);
127 return false;
128 }
129
130 return true;
131 }
132
133 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
134 Error **errp)
135 {
136 BDRVPreallocateState *s = bs->opaque;
137 int ret;
138
139 /*
140 * s->data_end and friends should be initialized on permission update.
141 * For this to work, mark them invalid.
142 */
143 s->file_end = s->zero_start = s->data_end = -EINVAL;
144
145 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
146 if (ret < 0) {
147 return ret;
148 }
149
150 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
151 return -EINVAL;
152 }
153
154 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
155 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
156
157 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
158 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
159 bs->file->bs->supported_zero_flags);
160
161 return 0;
162 }
163
164 static void preallocate_close(BlockDriverState *bs)
165 {
166 int ret;
167 BDRVPreallocateState *s = bs->opaque;
168
169 if (s->data_end < 0) {
170 return;
171 }
172
173 if (s->file_end < 0) {
174 s->file_end = bdrv_getlength(bs->file->bs);
175 if (s->file_end < 0) {
176 return;
177 }
178 }
179
180 if (s->data_end < s->file_end) {
181 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
182 NULL);
183 s->file_end = ret < 0 ? ret : s->data_end;
184 }
185 }
186
187
/*
 * Handle reopen.
 *
 * We must implement reopen handlers, otherwise reopen just doesn't work.
 * Handle new options and don't care about preallocation state, as it is
 * handled in set/check permission handlers.
 */
195
196 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
197 BlockReopenQueue *queue, Error **errp)
198 {
199 PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
200
201 if (!preallocate_absorb_opts(opts, reopen_state->options,
202 reopen_state->bs->file->bs, errp)) {
203 g_free(opts);
204 return -EINVAL;
205 }
206
207 reopen_state->opaque = opts;
208
209 return 0;
210 }
211
212 static void preallocate_reopen_commit(BDRVReopenState *state)
213 {
214 BDRVPreallocateState *s = state->bs->opaque;
215
216 s->opts = *(PreallocateOpts *)state->opaque;
217
218 g_free(state->opaque);
219 state->opaque = NULL;
220 }
221
222 static void preallocate_reopen_abort(BDRVReopenState *state)
223 {
224 g_free(state->opaque);
225 state->opaque = NULL;
226 }
227
228 static coroutine_fn int preallocate_co_preadv_part(
229 BlockDriverState *bs, int64_t offset, int64_t bytes,
230 QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
231 {
232 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
233 flags);
234 }
235
236 static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
237 int64_t offset, int64_t bytes)
238 {
239 return bdrv_co_pdiscard(bs->file, offset, bytes);
240 }
241
242 static bool can_write_resize(uint64_t perm)
243 {
244 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
245 }
246
247 static bool has_prealloc_perms(BlockDriverState *bs)
248 {
249 BDRVPreallocateState *s = bs->opaque;
250
251 if (can_write_resize(bs->file->perm)) {
252 assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
253 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
254 return true;
255 }
256
257 assert(s->data_end < 0);
258 assert(s->zero_start < 0);
259 assert(s->file_end < 0);
260 return false;
261 }
262
263 /*
264 * Call on each write. Returns true if @want_merge_zero is true and the region
265 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
266 * preallocation).
267 *
268 * want_merge_zero is used to merge write-zero request with preallocation in
269 * one bdrv_co_pwrite_zeroes() call.
270 */
271 static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset,
272 int64_t bytes, bool want_merge_zero)
273 {
274 BDRVPreallocateState *s = bs->opaque;
275 int64_t end = offset + bytes;
276 int64_t prealloc_start, prealloc_end;
277 int ret;
278 uint32_t file_align = bs->file->bs->bl.request_alignment;
279 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
280
281 assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
282
283 if (!has_prealloc_perms(bs)) {
284 /* We don't have state neither should try to recover it */
285 return false;
286 }
287
288 if (s->data_end < 0) {
289 s->data_end = bdrv_getlength(bs->file->bs);
290 if (s->data_end < 0) {
291 return false;
292 }
293
294 if (s->file_end < 0) {
295 s->file_end = s->data_end;
296 }
297 }
298
299 if (end <= s->data_end) {
300 return false;
301 }
302
303 /* We have valid s->data_end, and request writes beyond it. */
304
305 s->data_end = end;
306 if (s->zero_start < 0 || !want_merge_zero) {
307 s->zero_start = end;
308 }
309
310 if (s->file_end < 0) {
311 s->file_end = bdrv_getlength(bs->file->bs);
312 if (s->file_end < 0) {
313 return false;
314 }
315 }
316
317 /* Now s->data_end, s->zero_start and s->file_end are valid. */
318
319 if (end <= s->file_end) {
320 /* No preallocation needed. */
321 return want_merge_zero && offset >= s->zero_start;
322 }
323
324 /* Now we want new preallocation, as request writes beyond s->file_end. */
325
326 prealloc_start = QEMU_ALIGN_UP(
327 want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
328 file_align);
329 prealloc_end = QEMU_ALIGN_UP(
330 MAX(prealloc_start, end) + s->opts.prealloc_size,
331 prealloc_align);
332
333 want_merge_zero = want_merge_zero && (prealloc_start <= offset);
334
335 ret = bdrv_co_pwrite_zeroes(
336 bs->file, prealloc_start, prealloc_end - prealloc_start,
337 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
338 if (ret < 0) {
339 s->file_end = ret;
340 return false;
341 }
342
343 s->file_end = prealloc_end;
344 return want_merge_zero;
345 }
346
347 static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
348 int64_t offset, int64_t bytes, BdrvRequestFlags flags)
349 {
350 bool want_merge_zero =
351 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
352 if (handle_write(bs, offset, bytes, want_merge_zero)) {
353 return 0;
354 }
355
356 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
357 }
358
359 static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
360 int64_t offset,
361 int64_t bytes,
362 QEMUIOVector *qiov,
363 size_t qiov_offset,
364 BdrvRequestFlags flags)
365 {
366 handle_write(bs, offset, bytes, false);
367
368 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
369 flags);
370 }
371
372 static int coroutine_fn
373 preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
374 bool exact, PreallocMode prealloc,
375 BdrvRequestFlags flags, Error **errp)
376 {
377 ERRP_GUARD();
378 BDRVPreallocateState *s = bs->opaque;
379 int ret;
380
381 if (s->data_end >= 0 && offset > s->data_end) {
382 if (s->file_end < 0) {
383 s->file_end = bdrv_getlength(bs->file->bs);
384 if (s->file_end < 0) {
385 error_setg(errp, "failed to get file length");
386 return s->file_end;
387 }
388 }
389
390 if (prealloc == PREALLOC_MODE_FALLOC) {
391 /*
392 * If offset <= s->file_end, the task is already done, just
393 * update s->data_end, to move part of "filter preallocation"
394 * to "preallocation requested by user".
395 * Otherwise just proceed to preallocate missing part.
396 */
397 if (offset <= s->file_end) {
398 s->data_end = offset;
399 return 0;
400 }
401 } else {
402 /*
403 * We have to drop our preallocation, to
404 * - avoid "Cannot use preallocation for shrinking files" in
405 * case of offset < file_end
406 * - give PREALLOC_MODE_OFF a chance to keep small disk
407 * usage
408 * - give PREALLOC_MODE_FULL a chance to actually write the
409 * whole region as user expects
410 */
411 if (s->file_end > s->data_end) {
412 ret = bdrv_co_truncate(bs->file, s->data_end, true,
413 PREALLOC_MODE_OFF, 0, errp);
414 if (ret < 0) {
415 s->file_end = ret;
416 error_prepend(errp, "preallocate-filter: failed to drop "
417 "write-zero preallocation: ");
418 return ret;
419 }
420 s->file_end = s->data_end;
421 }
422 }
423
424 s->data_end = offset;
425 }
426
427 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
428 if (ret < 0) {
429 s->file_end = s->zero_start = s->data_end = ret;
430 return ret;
431 }
432
433 if (has_prealloc_perms(bs)) {
434 s->file_end = s->zero_start = s->data_end = offset;
435 }
436 return 0;
437 }
438
439 static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
440 {
441 return bdrv_co_flush(bs->file->bs);
442 }
443
444 static int64_t preallocate_getlength(BlockDriverState *bs)
445 {
446 int64_t ret;
447 BDRVPreallocateState *s = bs->opaque;
448
449 if (s->data_end >= 0) {
450 return s->data_end;
451 }
452
453 ret = bdrv_getlength(bs->file->bs);
454
455 if (has_prealloc_perms(bs)) {
456 s->file_end = s->zero_start = s->data_end = ret;
457 }
458
459 return ret;
460 }
461
462 static int preallocate_check_perm(BlockDriverState *bs,
463 uint64_t perm, uint64_t shared, Error **errp)
464 {
465 BDRVPreallocateState *s = bs->opaque;
466
467 if (s->data_end >= 0 && !can_write_resize(perm)) {
468 /*
469 * Lose permissions.
470 * We should truncate in check_perm, as in set_perm bs->file->perm will
471 * be already changed, and we should not violate it.
472 */
473 if (s->file_end < 0) {
474 s->file_end = bdrv_getlength(bs->file->bs);
475 if (s->file_end < 0) {
476 error_setg(errp, "Failed to get file length");
477 return s->file_end;
478 }
479 }
480
481 if (s->data_end < s->file_end) {
482 int ret = bdrv_truncate(bs->file, s->data_end, true,
483 PREALLOC_MODE_OFF, 0, NULL);
484 if (ret < 0) {
485 error_setg(errp, "Failed to drop preallocation");
486 s->file_end = ret;
487 return ret;
488 }
489 s->file_end = s->data_end;
490 }
491 }
492
493 return 0;
494 }
495
496 static void preallocate_set_perm(BlockDriverState *bs,
497 uint64_t perm, uint64_t shared)
498 {
499 BDRVPreallocateState *s = bs->opaque;
500
501 if (can_write_resize(perm)) {
502 if (s->data_end < 0) {
503 s->data_end = s->file_end = s->zero_start =
504 bdrv_getlength(bs->file->bs);
505 }
506 } else {
507 /*
508 * We drop our permissions, as well as allow shared
509 * permissions (see preallocate_child_perm), anyone will be able to
510 * change the child, so mark all states invalid. We'll regain control if
511 * get good permissions back.
512 */
513 s->data_end = s->file_end = s->zero_start = -EINVAL;
514 }
515 }
516
517 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
518 BdrvChildRole role, BlockReopenQueue *reopen_queue,
519 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
520 {
521 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
522
523 if (can_write_resize(perm)) {
524 /* This should come by default, but let's enforce: */
525 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
526
527 /*
528 * Don't share, to keep our states s->file_end, s->data_end and
529 * s->zero_start valid.
530 */
531 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
532 }
533 }
534
535 BlockDriver bdrv_preallocate_filter = {
536 .format_name = "preallocate",
537 .instance_size = sizeof(BDRVPreallocateState),
538
539 .bdrv_getlength = preallocate_getlength,
540 .bdrv_open = preallocate_open,
541 .bdrv_close = preallocate_close,
542
543 .bdrv_reopen_prepare = preallocate_reopen_prepare,
544 .bdrv_reopen_commit = preallocate_reopen_commit,
545 .bdrv_reopen_abort = preallocate_reopen_abort,
546
547 .bdrv_co_preadv_part = preallocate_co_preadv_part,
548 .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
549 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
550 .bdrv_co_pdiscard = preallocate_co_pdiscard,
551 .bdrv_co_flush = preallocate_co_flush,
552 .bdrv_co_truncate = preallocate_co_truncate,
553
554 .bdrv_check_perm = preallocate_check_perm,
555 .bdrv_set_perm = preallocate_set_perm,
556 .bdrv_child_perm = preallocate_child_perm,
557
558 .has_variable_length = true,
559 .is_filter = true,
560 };
561
562 static void bdrv_preallocate_init(void)
563 {
564 bdrv_register(&bdrv_preallocate_filter);
565 }
566
567 block_init(bdrv_preallocate_init);