]> git.proxmox.com Git - mirror_qemu.git/blame - block/qcow2-refcount.c
block/qcow2: Add qcow2_refcount_area()
[mirror_qemu.git] / block / qcow2-refcount.c
CommitLineData
f7d0fe02
KW
1/*
2 * Block driver for the QCOW version 2 format
3 *
4 * Copyright (c) 2004-2006 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
80c71a24 25#include "qemu/osdep.h"
da34e65c 26#include "qapi/error.h"
f7d0fe02 27#include "qemu-common.h"
737e150e 28#include "block/block_int.h"
f7d0fe02 29#include "block/qcow2.h"
a40f1c2a 30#include "qemu/range.h"
58369e22 31#include "qemu/bswap.h"
f7d0fe02 32
/* Forward declarations: cluster allocation and refcount update helpers that
 * are mutually recursive with the refblock allocation code below. */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);
static int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t offset,
                                   uint64_t additional_clusters,
                                   bool exact_size, int new_refblock_index,
                                   uint64_t new_refblock_offset);

/* Refcount array accessors: _roN handles refcount_order == N, i.e. refcount
 * entries that are 2^N bits wide (see the dispatch tables below and the
 * selection in qcow2_refcount_init()). */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);
64
65
/* Dispatch tables indexed by s->refcount_order (0..6); entry N reads/writes
 * refcounts stored as 2^N-bit values (installed in qcow2_refcount_init()). */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
7453c96b 85
3b88e52b 86
f7d0fe02
KW
87/*********************************************************/
88/* refcount handling */
89
7061a078
AG
90static void update_max_refcount_table_index(BDRVQcow2State *s)
91{
92 unsigned i = s->refcount_table_size - 1;
93 while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
94 i--;
95 }
96 /* Set s->max_refcount_table_index to the index of the last used entry */
97 s->max_refcount_table_index = i;
98}
99
ed6ccf0f 100int qcow2_refcount_init(BlockDriverState *bs)
f7d0fe02 101{
ff99129a 102 BDRVQcow2State *s = bs->opaque;
5dab2fad
KW
103 unsigned int refcount_table_size2, i;
104 int ret;
f7d0fe02 105
59c0cb78
HR
106 assert(s->refcount_order >= 0 && s->refcount_order <= 6);
107
108 s->get_refcount = get_refcount_funcs[s->refcount_order];
109 s->set_refcount = set_refcount_funcs[s->refcount_order];
7453c96b 110
5dab2fad 111 assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
f7d0fe02 112 refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
de82815d
KW
113 s->refcount_table = g_try_malloc(refcount_table_size2);
114
f7d0fe02 115 if (s->refcount_table_size > 0) {
de82815d 116 if (s->refcount_table == NULL) {
8fcffa98 117 ret = -ENOMEM;
de82815d
KW
118 goto fail;
119 }
66f82cee 120 BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
cf2ab8fc 121 ret = bdrv_pread(bs->file, s->refcount_table_offset,
f7d0fe02 122 s->refcount_table, refcount_table_size2);
8fcffa98 123 if (ret < 0) {
f7d0fe02 124 goto fail;
8fcffa98 125 }
f7d0fe02
KW
126 for(i = 0; i < s->refcount_table_size; i++)
127 be64_to_cpus(&s->refcount_table[i]);
7061a078 128 update_max_refcount_table_index(s);
f7d0fe02
KW
129 }
130 return 0;
131 fail:
8fcffa98 132 return ret;
f7d0fe02
KW
133}
134
ed6ccf0f 135void qcow2_refcount_close(BlockDriverState *bs)
f7d0fe02 136{
ff99129a 137 BDRVQcow2State *s = bs->opaque;
7267c094 138 g_free(s->refcount_table);
f7d0fe02
KW
139}
140
141
59c0cb78
HR
/* refcount_order == 0: one bit per entry, eight entries per byte. */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return (array[index / 8] >> (index % 8)) & 0x1;
}
146
/* refcount_order == 0: store a 1-bit refcount; @value must fit in one bit. */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;

    assert(!(value >> 1));
    array[index / 8] = (array[index / 8] & ~(0x1 << (index % 8)))
                       | (value << (index % 8));
}
154
/* refcount_order == 1: two bits per entry, four entries per byte. */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned shift = 2 * (index % 4);

    return (array[index / 4] >> shift) & 0x3;
}
160
/* refcount_order == 1: store a 2-bit refcount; @value must fit in two bits. */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = 2 * (index % 4);

    assert(!(value >> 2));
    array[index / 4] = (array[index / 4] & ~(0x3 << shift)) | (value << shift);
}
168
/* refcount_order == 2: four bits per entry, two entries per byte. */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned shift = 4 * (index % 2);

    return (array[index / 2] >> shift) & 0xf;
}
174
/* refcount_order == 2: store a 4-bit refcount; @value must fit in a nibble. */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned shift = 4 * (index % 2);

    assert(!(value >> 4));
    array[index / 2] = (array[index / 2] & ~(0xf << shift)) | (value << shift);
}
182
/* refcount_order == 3: one byte per entry. */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return array[index];
}
187
/* refcount_order == 3: store an 8-bit refcount; @value must fit in a byte. */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;

    assert(!(value >> 8));
    array[index] = value;
}
194
7453c96b
HR
/* refcount_order == 4: 16-bit entries, converted from big-endian. */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    const uint16_t *array = refcount_array;

    return be16_to_cpu(array[index]);
}
199
/* refcount_order == 4: store a 16-bit refcount, big-endian on disk. */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint16_t *array = refcount_array;

    assert(!(value >> 16));
    array[index] = cpu_to_be16(value);
}
206
59c0cb78
HR
/* refcount_order == 5: 32-bit entries, converted from big-endian. */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    const uint32_t *array = refcount_array;

    return be32_to_cpu(array[index]);
}
211
/* refcount_order == 5: store a 32-bit refcount, big-endian on disk. */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint32_t *array = refcount_array;

    assert(!(value >> 32));
    array[index] = cpu_to_be32(value);
}
218
/* refcount_order == 6: 64-bit entries, converted from big-endian. */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    const uint64_t *array = refcount_array;

    return be64_to_cpu(array[index]);
}
223
/* refcount_order == 6: store a 64-bit refcount, big-endian on disk.
 * No range assert needed: any uint64_t value fits. */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint64_t *array = refcount_array;

    array[index] = cpu_to_be64(value);
}
229
7453c96b 230
f7d0fe02 231static int load_refcount_block(BlockDriverState *bs,
29c1a730
KW
232 int64_t refcount_block_offset,
233 void **refcount_block)
f7d0fe02 234{
ff99129a 235 BDRVQcow2State *s = bs->opaque;
3b88e52b 236
66f82cee 237 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
9be38598
EH
238 return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
239 refcount_block);
f7d0fe02
KW
240}
241
018faafd 242/*
7324c10f
HR
243 * Retrieves the refcount of the cluster given by its index and stores it in
244 * *refcount. Returns 0 on success and -errno on failure.
018faafd 245 */
7324c10f 246int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
0e06528e 247 uint64_t *refcount)
f7d0fe02 248{
ff99129a 249 BDRVQcow2State *s = bs->opaque;
db8a31d1 250 uint64_t refcount_table_index, block_index;
f7d0fe02 251 int64_t refcount_block_offset;
018faafd 252 int ret;
7453c96b 253 void *refcount_block;
f7d0fe02 254
17bd5f47 255 refcount_table_index = cluster_index >> s->refcount_block_bits;
7324c10f
HR
256 if (refcount_table_index >= s->refcount_table_size) {
257 *refcount = 0;
f7d0fe02 258 return 0;
7324c10f 259 }
26d49c46
HR
260 refcount_block_offset =
261 s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
7324c10f
HR
262 if (!refcount_block_offset) {
263 *refcount = 0;
f7d0fe02 264 return 0;
7324c10f 265 }
29c1a730 266
a97c67ee
HR
267 if (offset_into_cluster(s, refcount_block_offset)) {
268 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
269 " unaligned (reftable index: %#" PRIx64 ")",
270 refcount_block_offset, refcount_table_index);
271 return -EIO;
272 }
273
29c1a730 274 ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
7453c96b 275 &refcount_block);
29c1a730
KW
276 if (ret < 0) {
277 return ret;
f7d0fe02 278 }
29c1a730 279
17bd5f47 280 block_index = cluster_index & (s->refcount_block_size - 1);
7453c96b 281 *refcount = s->get_refcount(refcount_block, block_index);
29c1a730 282
a3f1afb4 283 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
29c1a730 284
7324c10f 285 return 0;
f7d0fe02
KW
286}
287
92dcb59f 288/* Checks if two offsets are described by the same refcount block */
ff99129a 289static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
92dcb59f
KW
290 uint64_t offset_b)
291{
17bd5f47
HR
292 uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
293 uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
92dcb59f
KW
294
295 return (block_a == block_b);
296}
297
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case.
 * Returns -EAGAIN when new metadata clusters had to be allocated; the caller
 * must then restart its search for free clusters (see comments below).
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail_block;
        }

        /* Flush before touching the refblock cache again below, because
         * update_refcount may have dirtied a different refblock. */
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail_block:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}
92dcb59f 504
12cc30a8
HR
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Layout produced: the newly created refblocks are placed contiguously at
 * @start_offset, immediately followed by the new refcount table.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
static int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                                   uint64_t additional_clusters,
                                   bool exact_size, int new_refblock_index,
                                   uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    /* How many refblocks are needed to cover the image up to @start_offset
     * plus @additional_clusters (including the new metadata itself)? */
    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        /* Grow by 50 % so the reftable does not need to grow again soon */
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* New refblocks go at @start_offset, the new reftable right after them */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk (entries converted to big-endian) */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
                           table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    /* Convert back: new_table is about to become the in-memory reftable */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
722
0b919fae
KW
723void qcow2_process_discards(BlockDriverState *bs, int ret)
724{
ff99129a 725 BDRVQcow2State *s = bs->opaque;
0b919fae
KW
726 Qcow2DiscardRegion *d, *next;
727
728 QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
729 QTAILQ_REMOVE(&s->discards, d, next);
730
731 /* Discard is optional, ignore the return value */
732 if (ret >= 0) {
0c51a893 733 bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
0b919fae
KW
734 }
735
736 g_free(d);
737 }
738}
739
/*
 * Queue the range [@offset, @offset + @length) for discarding, extending an
 * existing queued region if the new range is adjacent to it. The actual
 * bdrv_pdiscard() happens later in qcow2_process_discards().
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    /* First try to extend an already queued region in place */
    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent region: queue a new one */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs = bs,
        .offset = offset,
        .bytes = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
               || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
789
/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * Updates the refcount of every cluster overlapping [@offset, @offset +
 * @length). On failure the already-applied updates are rolled back with the
 * inverted addend (best effort). Clusters whose refcount drops to zero are
 * queued for discard when discards of @type are enabled. */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        /* When freeing clusters, L2 tables referencing them must hit the
         * disk before the refcount update does */
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        /* Refuse underflow on decrease, and overflow past refcount_max (or
         * past the uint64_t range) on increase */
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try do undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
897
018faafd 898/*
44751917 899 * Increases or decreases the refcount of a given cluster.
018faafd 900 *
2aabe7c7
HR
901 * @addend is the absolute value of the addend; if @decrease is set, @addend
902 * will be subtracted from the current refcount, otherwise it will be added.
903 *
c6e9d8ae 904 * On success 0 is returned; on failure -errno is returned.
018faafd 905 */
32b6444d
HR
906int qcow2_update_cluster_refcount(BlockDriverState *bs,
907 int64_t cluster_index,
0e06528e 908 uint64_t addend, bool decrease,
32b6444d 909 enum qcow2_discard_type type)
f7d0fe02 910{
ff99129a 911 BDRVQcow2State *s = bs->opaque;
f7d0fe02
KW
912 int ret;
913
6cfcb9b8 914 ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
2aabe7c7 915 decrease, type);
f7d0fe02
KW
916 if (ret < 0) {
917 return ret;
918 }
919
c6e9d8ae 920 return 0;
f7d0fe02
KW
921}
922
923
924
925/*********************************************************/
926/* cluster allocation functions */
927
928
929
/*
 * Finds a run of contiguous free clusters large enough to hold @size bytes,
 * WITHOUT incrementing their refcounts (hence "noref"); the caller is
 * responsible for referencing the clusters afterwards.
 *
 * Returns the host offset of the first cluster of the run, or < 0 (-errno)
 * on error.
 */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    /* Scan forward from the free-cluster hint for nb_clusters consecutive
     * clusters with refcount 0.  Hitting a referenced cluster restarts the
     * scan just past it (free_cluster_index was already advanced). */
    for(i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    /* Offset of the first cluster of the run just found */
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
970
bb572aef 971int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
f7d0fe02
KW
972{
973 int64_t offset;
db3a964f 974 int ret;
f7d0fe02 975
66f82cee 976 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
b106ad91
KW
977 do {
978 offset = alloc_clusters_noref(bs, size);
979 if (offset < 0) {
980 return offset;
981 }
982
2aabe7c7 983 ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
b106ad91 984 } while (ret == -EAGAIN);
2eaa8f63 985
db3a964f
KW
986 if (ret < 0) {
987 return ret;
988 }
1c4c2814 989
f7d0fe02
KW
990 return offset;
991}
992
/*
 * Allocates clusters at a fixed host offset: references up to @nb_clusters
 * contiguous free clusters starting at @offset.
 *
 * Returns the number of clusters actually allocated (may be fewer than
 * requested if an already-referenced cluster cuts the free run short), or
 * -errno on failure.
 */
int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    /* -EAGAIN from update_refcount() means the refcounts must be re-examined
     * and the whole attempt retried, hence the loop. */
    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for(i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}
1029
f7d0fe02
KW
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset == 0 means "no partially used cluster cached";
     * otherwise it must point into the middle of a cluster. */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* Each sub-cluster allocation bumps the cluster refcount, so a
         * saturated refcount forces us to start a fresh cluster. */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            /* Need a new cluster; if it happens to directly follow the
             * current one, the request may still straddle both. */
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* Discard the cached offset so that the retry (on -EAGAIN)
             * allocates from scratch. */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    /* Remember where the next byte allocation can continue; reset the hint
     * when we ended exactly on a cluster boundary. */
    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
1094
ed6ccf0f 1095void qcow2_free_clusters(BlockDriverState *bs,
6cfcb9b8
KW
1096 int64_t offset, int64_t size,
1097 enum qcow2_discard_type type)
f7d0fe02 1098{
db3a964f
KW
1099 int ret;
1100
66f82cee 1101 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
2aabe7c7 1102 ret = update_refcount(bs, offset, size, 1, true, type);
db3a964f
KW
1103 if (ret < 0) {
1104 fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
003fad6e 1105 /* TODO Remember the clusters to free them later and avoid leaking */
db3a964f 1106 }
f7d0fe02
KW
1107}
1108
/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            /* Compressed clusters store their sector count in the entry
             * itself; free exactly the 512-byte sectors they occupy. */
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        /* An unaligned host offset indicates a corrupted image; report it
         * instead of freeing the wrong clusters. */
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                nb_clusters << s->cluster_bits, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        /* Nothing on the host side to free */
        break;
    default:
        abort();
    }
}
1147
f7d0fe02
KW
1148
1149
1150/*********************************************************/
1151/* snapshots and image creation */
1152
1153
1154
/*
 * update the refcounts of snapshots and the copied flag
 *
 * Applies @addend (-1, 0 or +1) to the refcount of every cluster reachable
 * from the given L1 table (data clusters, compressed clusters and the L2
 * tables themselves), and recomputes QCOW_OFLAG_COPIED in the visited L1/L2
 * entries (set iff the refcount is exactly 1).  With addend == 0 only the
 * COPIED flags are fixed up.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* Batch discards until the walk is complete */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read it from disk into a temporary buffer */
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        /* Active L1 table: operate on the in-memory copy directly */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;
                uint64_t offset;

                entry = be64_to_cpu(l2_table[j]);
                old_entry = entry;
                /* COPIED is recomputed below from the new refcount */
                entry &= ~QCOW_OFLAG_COPIED;
                offset = entry & L2E_OFFSET_MASK;

                switch (qcow2_get_cluster_type(entry)) {
                case QCOW2_CLUSTER_COMPRESSED:
                    nb_csectors = ((entry >> s->csize_shift) &
                                   s->csize_mask) + 1;
                    if (addend != 0) {
                        ret = update_refcount(bs,
                                (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }
                    /* compressed clusters are never modified */
                    refcount = 2;
                    break;

                case QCOW2_CLUSTER_NORMAL:
                case QCOW2_CLUSTER_ZERO_ALLOC:
                    if (offset_into_cluster(s, offset)) {
                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
                                                "allocation offset %#" PRIx64
                                                " unaligned (L2 offset: %#"
                                                PRIx64 ", L2 index: %#x)",
                                                offset, l2_offset, j);
                        ret = -EIO;
                        goto fail;
                    }

                    cluster_index = offset >> s->cluster_bits;
                    assert(cluster_index);
                    if (addend != 0) {
                        ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }

                    /* Re-read the refcount to decide on COPIED below */
                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                    if (ret < 0) {
                        goto fail;
                    }
                    break;

                case QCOW2_CLUSTER_ZERO_PLAIN:
                case QCOW2_CLUSTER_UNALLOCATED:
                    refcount = 0;
                    break;

                default:
                    abort();
                }

                if (refcount == 1) {
                    entry |= QCOW_OFLAG_COPIED;
                }
                if (entry != old_entry) {
                    if (addend > 0) {
                        /* New references: refcounts must hit the disk before
                         * the L2 entry that relies on them */
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(entry);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            /* Now update the refcount of the L2 table cluster itself and
             * recompute COPIED in the L1 entry */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        /* Convert to big endian for the on-disk write, then back so the
         * in-memory table (possibly s->l1_table) stays usable */
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1348
1349
1350
1351
1352/*********************************************************/
1353/* refcount checking functions */
1354
1355
c2551b47 1356static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
5fee192e
HR
1357{
1358 /* This assertion holds because there is no way we can address more than
1359 * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1360 * offsets have to be representable in bytes); due to every cluster
1361 * corresponding to one refcount entry, we are well below that limit */
1362 assert(entries < (UINT64_C(1) << (64 - 9)));
1363
1364 /* Thanks to the assertion this will not overflow, because
1365 * s->refcount_order < 7.
1366 * (note: x << s->refcount_order == x * s->refcount_bits) */
1367 return DIV_ROUND_UP(entries << s->refcount_order, 8);
1368}
1369
/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    /* Same cluster-rounded footprint: no reallocation needed, just record
     * the new logical entry count */
    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    /* Zero the newly added tail when growing */
    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size = new_size;

    return 0;
}
f7d0fe02
KW
1417
/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * The in-memory refcount table (IMRT) is grown on demand to cover the range.
 *
 * Modifies the number of errors in res.
 */
int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
                             void **refcount_table,
                             int64_t *refcount_table_size,
                             int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int ret;

    if (size <= 0) {
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        /* Grow the table if this cluster index is out of range */
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        /* A saturated refcount cannot be incremented further; count it as a
         * corruption and keep going */
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}
1467
801f7044
SH
/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    /* NOTE(review): bit 0x1 is not defined here -- presumably a historical
     * flag value; confirm before reusing it */
    CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */
};
1472
f7d0fe02
KW
/*
 * Increases the refcount in the given refcount table for the all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors, ret;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        goto fail;
    }

    /* Do the actual checks */
    for(i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
                        "copied flag must never be set for compressed "
                        "clusters\n", l2_entry >> s->cluster_bits);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_entry & ~511, nb_csectors * 512);
            if (ret < 0) {
                goto fail;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            /* A gap between this cluster and the previous one counts as
             * fragmentation for the statistics */
            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                        "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_UNALLOCATED:
            /* No host clusters referenced */
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    g_free(l2_table);
    return ret;
}
1588
/*
 * Increases the refcount for the L1 table, its L2 tables and all referenced
 * clusters in the given refcount table. While doing so, performs some checks
 * on L1 and L2 entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l1(BlockDriverState *bs,
                              BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL, l2_offset, l1_size2;
    int i, ret;

    l1_size2 = l1_size * sizeof(uint64_t);

    /* Mark L1 table as used */
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
                                   l1_table_offset, l1_size2);
    if (ret < 0) {
        goto fail;
    }

    /* Read L1 table entries from disk */
    if (l1_size2 > 0) {
        l1_table = g_try_malloc(l1_size2);
        if (l1_table == NULL) {
            ret = -ENOMEM;
            res->check_errors++;
            goto fail;
        }
        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
            res->check_errors++;
            goto fail;
        }
        /* On-disk entries are big endian */
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }

    /* Do the actual checks */
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* Mark L2 table as used */
            l2_offset &= L1E_OFFSET_MASK;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* L2 tables are cluster aligned */
            if (offset_into_cluster(s, l2_offset)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                    "cluster aligned; L1 entry corrupted\n", l2_offset);
                res->corruptions++;
            }

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, res, refcount_table,
                                     refcount_table_size, l2_offset, flags);
            if (ret < 0) {
                goto fail;
            }
        }
    }
    g_free(l1_table);
    return 0;

fail:
    g_free(l1_table);
    return ret;
}
1670
4f6ed88c
HR
/*
 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
 *
 * An entry's COPIED flag must be set exactly when the referenced cluster has
 * refcount 1; with BDRV_FIX_ERRORS the flag is corrected in place, otherwise
 * the mismatch is only counted as a corruption.
 *
 * This function does not print an error message nor does it increment
 * check_errors if qcow2_get_refcount fails (this is because such an error will
 * have been already detected and sufficiently signaled by the calling function
 * (qcow2_check_refcounts) by the time this function is called).
 */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            continue;
        }

        /* First check the L1 entry against the L2 table's refcount */
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" :
                    "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                s->l1_table[i] = refcount == 1
                               ? l1_entry | QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        /* Then check every allocated data cluster in the L2 table */
        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" :
                            "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        /* Fix in memory; written back once per L2 table below */
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry | QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        if (l2_dirty) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
1793
6ca56bf5
HR
/*
 * Checks consistency of refblocks and accounts for each refblock in
 * *refcount_table.
 *
 * Sets *rebuild when a problem is found that requires rebuilding the
 * refcount structures from scratch.  With BDRV_FIX_ERRORS, a refblock that
 * lies beyond the end of the image file is repaired by growing the file.
 */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        if (cluster >= *nb_clusters) {
            /* Refblock beyond the end of the image file */
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                /* Grow the file so the refblock lies inside it again */
                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
                                    PREALLOC_MODE_OFF, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                /* The in-memory refcount table must cover the new size */
                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                res->corruptions++;
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            } else {
                res->corruptions++;
            }
            continue;
        }

        if (offset != 0) {
            /* Account for the refblock itself; it must be referenced
             * exactly once */
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
1901
057a3fe5
HR
1902/*
1903 * Calculates an in-memory refcount table.
1904 */
1905static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
f307b255 1906 BdrvCheckMode fix, bool *rebuild,
7453c96b 1907 void **refcount_table, int64_t *nb_clusters)
057a3fe5 1908{
ff99129a 1909 BDRVQcow2State *s = bs->opaque;
057a3fe5
HR
1910 int64_t i;
1911 QCowSnapshot *sn;
1912 int ret;
1913
9696df21 1914 if (!*refcount_table) {
5fee192e
HR
1915 int64_t old_size = 0;
1916 ret = realloc_refcount_array(s, refcount_table,
1917 &old_size, *nb_clusters);
1918 if (ret < 0) {
9696df21 1919 res->check_errors++;
5fee192e 1920 return ret;
9696df21 1921 }
057a3fe5
HR
1922 }
1923
1924 /* header */
8a5bb1f1
VSO
1925 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1926 0, s->cluster_size);
fef4d3d5
HR
1927 if (ret < 0) {
1928 return ret;
1929 }
057a3fe5
HR
1930
1931 /* current L1 table */
641bb63c 1932 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
057a3fe5
HR
1933 s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1934 if (ret < 0) {
1935 return ret;
1936 }
1937
1938 /* snapshots */
1939 for (i = 0; i < s->nb_snapshots; i++) {
1940 sn = s->snapshots + i;
641bb63c 1941 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
fef4d3d5 1942 sn->l1_table_offset, sn->l1_size, 0);
057a3fe5
HR
1943 if (ret < 0) {
1944 return ret;
1945 }
1946 }
8a5bb1f1
VSO
1947 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1948 s->snapshots_offset, s->snapshots_size);
fef4d3d5
HR
1949 if (ret < 0) {
1950 return ret;
1951 }
057a3fe5
HR
1952
1953 /* refcount data */
8a5bb1f1
VSO
1954 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1955 s->refcount_table_offset,
1956 s->refcount_table_size * sizeof(uint64_t));
fef4d3d5
HR
1957 if (ret < 0) {
1958 return ret;
1959 }
057a3fe5 1960
4652b8f3
DB
1961 /* encryption */
1962 if (s->crypto_header.length) {
8a5bb1f1
VSO
1963 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
1964 s->crypto_header.offset,
1965 s->crypto_header.length);
4652b8f3
DB
1966 if (ret < 0) {
1967 return ret;
1968 }
1969 }
1970
88ddffae
VSO
1971 /* bitmaps */
1972 ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
1973 if (ret < 0) {
1974 return ret;
1975 }
1976
f307b255 1977 return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
057a3fe5
HR
1978}
1979
/*
 * Compares the actual reference count for each cluster in the image against the
 * refcount as reported by the refcount structures on-disk.
 *
 * refcount1 is the refcount from the on-disk structures, refcount2 the one
 * computed in @refcount_table (the IMRT).  Mismatches are either repaired in
 * place via update_refcount() (when @fix allows it) or counted as
 * corruptions/leaks in @res.  *highest_cluster is set to the highest cluster
 * index in use according to either source.
 */
static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix, bool *rebuild,
                              int64_t *highest_cluster,
                              void *refcount_table, int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i;
    uint64_t refcount1, refcount2;
    int ret;

    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
        ret = qcow2_get_refcount(bs, i, &refcount1);
        if (ret < 0) {
            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
                    i, strerror(-ret));
            res->check_errors++;
            continue;
        }

        refcount2 = s->get_refcount(refcount_table, i);

        if (refcount1 > 0 || refcount2 > 0) {
            *highest_cluster = i;
        }

        if (refcount1 != refcount2) {
            /* Check if we're allowed to fix the mismatch */
            int *num_fixed = NULL;
            /* On-disk refcount 0 for an in-use cluster is not fixed here but
             * flagged for a rebuild instead — presumably because repairing it
             * could require allocating new refblocks (TODO confirm) */
            if (refcount1 == 0) {
                *rebuild = true;
            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
                num_fixed = &res->leaks_fixed;
            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
                num_fixed = &res->corruptions_fixed;
            }

            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
                    " reference=%" PRIu64 "\n",
                    num_fixed != NULL ? "Repairing" :
                    refcount1 < refcount2 ? "ERROR" :
                    "Leaked",
                    i, refcount1, refcount2);

            if (num_fixed) {
                /* Adjust the on-disk refcount towards the computed value */
                ret = update_refcount(bs, i << s->cluster_bits, 1,
                                      refcount_diff(refcount1, refcount2),
                                      refcount1 > refcount2,
                                      QCOW2_DISCARD_ALWAYS);
                if (ret >= 0) {
                    (*num_fixed)++;
                    continue;
                }
            }

            /* And if we couldn't, print an error */
            if (refcount1 < refcount2) {
                res->corruptions++;
            } else {
                res->leaks++;
            }
        }
    }
}
2047
/*
 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
 * the on-disk refcount structures.
 *
 * On input, *first_free_cluster tells where to start looking, and need not
 * actually be a free cluster; the returned offset will not be before that
 * cluster. On output, *first_free_cluster points to the first gap found, even
 * if that gap was too small to be used as the returned offset.
 *
 * Note that *first_free_cluster is a cluster index whereas the return value is
 * an offset.
 *
 * If no sufficiently large free range exists, the IMRT is grown so the
 * allocation lands at the end of the image.  The allocated clusters are
 * marked with refcount 1 in the IMRT.  Returns a negative errno on failure.
 */
static int64_t alloc_clusters_imrt(BlockDriverState *bs,
                                   int cluster_count,
                                   void **refcount_table,
                                   int64_t *imrt_nb_clusters,
                                   int64_t *first_free_cluster)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t cluster = *first_free_cluster, i;
    bool first_gap = true;
    int contiguous_free_clusters;
    int ret;

    /* Starting at *first_free_cluster, find a range of at least cluster_count
     * continuously free clusters */
    for (contiguous_free_clusters = 0;
         cluster < *imrt_nb_clusters &&
         contiguous_free_clusters < cluster_count;
         cluster++)
    {
        if (!s->get_refcount(*refcount_table, cluster)) {
            contiguous_free_clusters++;
            if (first_gap) {
                /* If this is the first free cluster found, update
                 * *first_free_cluster accordingly */
                *first_free_cluster = cluster;
                first_gap = false;
            }
        } else if (contiguous_free_clusters) {
            /* Run interrupted by a used cluster: restart the count */
            contiguous_free_clusters = 0;
        }
    }

    /* If contiguous_free_clusters is greater than zero, it contains the number
     * of continuously free clusters until the current cluster; the first free
     * cluster in the current "gap" is therefore
     * cluster - contiguous_free_clusters */

    /* If no such range could be found, grow the in-memory refcount table
     * accordingly to append free clusters at the end of the image */
    if (contiguous_free_clusters < cluster_count) {
        /* contiguous_free_clusters clusters are already empty at the image end;
         * we need cluster_count clusters; therefore, we have to allocate
         * cluster_count - contiguous_free_clusters new clusters at the end of
         * the image (which is the current value of cluster; note that cluster
         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
         * the image end) */
        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
                                     cluster + cluster_count
                                     - contiguous_free_clusters);
        if (ret < 0) {
            return ret;
        }
    }

    /* Go back to the first free cluster */
    cluster -= contiguous_free_clusters;
    for (i = 0; i < cluster_count; i++) {
        s->set_refcount(*refcount_table, cluster + i, 1);
    }

    return cluster << s->cluster_bits;
}
2122
/*
 * Creates a new refcount structure based solely on the in-memory information
 * given through *refcount_table. All necessary allocations will be reflected
 * in that array.
 *
 * On success, the old refcount structure is leaked (it will be covered by the
 * new refcount structure).
 *
 * Operation: walk the IMRT cluster by cluster; for every refblock's worth of
 * used clusters, allocate a refblock (in the IMRT) and write the IMRT data
 * for that range to disk.  Allocating a refblock or growing the reftable may
 * itself consume clusters, so the reftable placement can be invalidated and
 * the loop restarted (write_refblocks label) until everything is consistent.
 * Finally the reftable is written and the image header updated to point to it.
 */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
    /* On-disk layout of the header fields updated at the end */
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

    /* The cache refers to the old (now invalid) refcount structures */
    qcow2_cache_empty(bs, s->refcount_block_cache);

write_refblocks:
    for (; cluster < *nb_clusters; cluster++) {
        /* Only refblocks covering at least one used cluster are needed */
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        /* Grow the in-memory reftable (cluster-granular) if needed */
        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
                                     s->cluster_size) / sizeof(uint64_t);
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 sizeof(uint64_t));
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * sizeof(uint64_t));

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                             sizeof(uint64_t));
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        /* Write the IMRT slice for this refblock's range directly to disk */
        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    if (reftable_offset < 0) {
        /* The reftable could not be squeezed in during the loop; place it
         * after all refblock ranges and write the affected refblocks again */
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters = size_to_clusters(s,
                                             reftable_size * sizeof(uint64_t));
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

        goto write_refblocks;
    }

    assert(on_disk_reftable);

    /* Convert the reftable to big-endian for the on-disk write */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Convert back to native endianness for the in-memory copy */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}
2313
/*
 * Checks an image for refcount consistency.
 *
 * Returns 0 if no errors are found, the number of errors in case the image is
 * detected as corrupted, and -errno when an internal error occurred.
 *
 * Flow: compute an IMRT (calculate_refcounts), compare it to the on-disk
 * refcounts (compare_refcounts); if the structures need a full rebuild and
 * BDRV_FIX_ERRORS is set, rebuild them and recompute/recompare; otherwise
 * optionally fix individual mismatches.  Finally OFLAG_COPIED is verified.
 */
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    BdrvCheckResult pre_compare_res;
    int64_t size, highest_cluster, nb_clusters;
    void *refcount_table = NULL;
    bool rebuild = false;
    int ret;

    size = bdrv_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
    }

    nb_clusters = size_to_clusters(s, size);
    if (nb_clusters > INT_MAX) {
        res->check_errors++;
        return -EFBIG;
    }

    res->bfi.total_clusters =
        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);

    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
                              &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* In case we don't need to rebuild the refcount structure (but want to fix
     * something), this function is immediately called again, in which case the
     * result should be ignored */
    pre_compare_res = *res;
    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
                      nb_clusters);

    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
        BdrvCheckResult old_res = *res;
        int fresh_leaks = 0;

        fprintf(stderr, "Rebuilding refcount structure\n");
        ret = rebuild_refcount_structure(bs, res, &refcount_table,
                                         &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        /* The rebuild replaced the refcount structures wholesale, so the
         * previously counted corruptions/leaks no longer apply */
        res->corruptions = 0;
        res->leaks = 0;

        /* Because the old reftable has been exchanged for a new one the
         * references have to be recalculated */
        rebuild = false;
        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
                                  &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        if (fix & BDRV_FIX_LEAKS) {
            /* The old refcount structures are now leaked, fix it; the result
             * can be ignored, aside from leaks which were introduced by
             * rebuild_refcount_structure() that could not be fixed */
            BdrvCheckResult saved_res = *res;
            *res = (BdrvCheckResult){ 0 };

            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
                              &highest_cluster, refcount_table, nb_clusters);
            if (rebuild) {
                fprintf(stderr, "ERROR rebuilt refcount structure is still "
                        "broken\n");
            }

            /* Any leaks accounted for here were introduced by
             * rebuild_refcount_structure() because that function has created a
             * new refcount structure from scratch */
            fresh_leaks = res->leaks;
            *res = saved_res;
        }

        /* Everything the rebuild got rid of counts as fixed */
        if (res->corruptions < old_res.corruptions) {
            res->corruptions_fixed += old_res.corruptions - res->corruptions;
        }
        if (res->leaks < old_res.leaks) {
            res->leaks_fixed += old_res.leaks - res->leaks;
        }
        res->leaks += fresh_leaks;
    } else if (fix) {
        if (rebuild) {
            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
            res->check_errors++;
            ret = -EIO;
            goto fail;
        }

        if (res->leaks || res->corruptions) {
            /* Re-run the comparison, this time actually fixing mismatches;
             * restore the pre-comparison counters first so nothing is counted
             * twice */
            *res = pre_compare_res;
            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
                              refcount_table, nb_clusters);
        }
    }

    /* check OFLAG_COPIED */
    ret = check_oflag_copied(bs, res, fix);
    if (ret < 0) {
        goto fail;
    }

    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
    ret = 0;

fail:
    g_free(refcount_table);

    return ret;
}
2439
/* Convenience wrapper: tests (ofs, sz) against the (offset, size) range that
 * qcow2_check_metadata_overlap() is currently examining */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)

/*
 * Checks if the given offset into the image file is actually free to use by
 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
 * i.e. a sanity check without relying on the refcount tables.
 *
 * The ign parameter specifies what checks not to perform (being a bitmask of
 * QCow2MetadataOverlap values), i.e., what sections to ignore.
 *
 * Returns:
 * - 0 if writing to this offset will not affect the mentioned metadata
 * - a positive QCow2MetadataOverlap value indicating one overlapping section
 * - a negative value (-errno) indicating an error while performing a check,
 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
 */
int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    /* Only perform the checks the caller did not exclude and that are enabled
     * in the image's overlap-check configuration */
    int chk = s->overlap_check & ~ign;
    int i, j;

    if (!size) {
        return 0;
    }

    /* The header check uses the unaligned offset, hence it happens before the
     * cluster alignment below */
    if (chk & QCOW2_OL_MAIN_HEADER) {
        if (offset < s->cluster_size) {
            return QCOW2_OL_MAIN_HEADER;
        }
    }

    /* align range to test to cluster boundaries */
    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
            return QCOW2_OL_ACTIVE_L1;
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
        if (overlaps_with(s->refcount_table_offset,
                s->refcount_table_size * sizeof(uint64_t))) {
            return QCOW2_OL_REFCOUNT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
            return QCOW2_OL_SNAPSHOT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            if (s->snapshots[i].l1_size &&
                overlaps_with(s->snapshots[i].l1_table_offset,
                s->snapshots[i].l1_size * sizeof(uint64_t))) {
                return QCOW2_OL_INACTIVE_L1;
            }
        }
    }

    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
        for (i = 0; i < s->l1_size; i++) {
            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_ACTIVE_L2;
            }
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
        /* Only entries up to max_refcount_table_index can be non-zero, so the
         * loop does not need to cover the whole table */
        unsigned last_entry = s->max_refcount_table_index;
        assert(last_entry < s->refcount_table_size);
        assert(last_entry + 1 == s->refcount_table_size ||
               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
        for (i = 0; i <= last_entry; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_REFCOUNT_BLOCK;
            }
        }
    }

    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
        /* Inactive L1 tables are not kept in memory, so each snapshot's L1
         * table has to be read from disk for this check */
        for (i = 0; i < s->nb_snapshots; i++) {
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz  = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
            uint64_t *l1 = g_try_malloc(l1_sz2);
            int ret;

            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }

            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
            }

            for (j = 0; j < l1_sz; j++) {
                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
                    g_free(l1);
                    return QCOW2_OL_INACTIVE_L2;
                }
            }

            g_free(l1);
        }
    }

    return 0;
}
2563
/* Human-readable names for the QCow2MetadataOverlap bit numbers; used by
 * qcow2_pre_write_overlap_check() when reporting an impending overlap */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
2574
2575/*
2576 * First performs a check for metadata overlaps (through
2577 * qcow2_check_metadata_overlap); if that fails with a negative value (error
2578 * while performing a check), that value is returned. If an impending overlap
2579 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2580 * and -EIO returned.
2581 *
2582 * Returns 0 if there were neither overlaps nor errors while checking for
2583 * overlaps; or a negative value (-errno) on error.
2584 */
231bb267 2585int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
a40f1c2a
HR
2586 int64_t size)
2587{
231bb267 2588 int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
a40f1c2a
HR
2589
2590 if (ret < 0) {
2591 return ret;
2592 } else if (ret > 0) {
786a4ea8 2593 int metadata_ol_bitnr = ctz32(ret);
a40f1c2a
HR
2594 assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2595
adb43552
HR
2596 qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2597 "write on metadata (overlaps with %s)",
2598 metadata_ol_names[metadata_ol_bitnr]);
a40f1c2a
HR
2599 return -EIO;
2600 }
2601
2602 return 0;
2603}

/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 *
 * Implementations return 0 on success and a negative errno (setting @errp)
 * on failure.
 */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2619
2620/**
2621 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2622 * it is not empty) and inserts its offset into the new reftable. The size of
2623 * this new reftable is increased as required.
2624 */
2625static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2626 uint64_t reftable_index, uint64_t *reftable_size,
2627 void *refblock, bool refblock_empty, bool *allocated,
2628 Error **errp)
2629{
2630 BDRVQcow2State *s = bs->opaque;
2631 int64_t offset;
2632
2633 if (!refblock_empty && reftable_index >= *reftable_size) {
2634 uint64_t *new_reftable;
2635 uint64_t new_reftable_size;
2636
2637 new_reftable_size = ROUND_UP(reftable_index + 1,
2638 s->cluster_size / sizeof(uint64_t));
2639 if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2640 error_setg(errp,
2641 "This operation would make the refcount table grow "
2642 "beyond the maximum size supported by QEMU, aborting");
2643 return -ENOTSUP;
2644 }
2645
2646 new_reftable = g_try_realloc(*reftable, new_reftable_size *
2647 sizeof(uint64_t));
2648 if (!new_reftable) {
2649 error_setg(errp, "Failed to increase reftable buffer size");
2650 return -ENOMEM;
2651 }
2652
2653 memset(new_reftable + *reftable_size, 0,
2654 (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2655
2656 *reftable = new_reftable;
2657 *reftable_size = new_reftable_size;
2658 }
2659
2660 if (!refblock_empty && !(*reftable)[reftable_index]) {
2661 offset = qcow2_alloc_clusters(bs, s->cluster_size);
2662 if (offset < 0) {
2663 error_setg_errno(errp, -offset, "Failed to allocate refblock");
2664 return offset;
2665 }
2666 (*reftable)[reftable_index] = offset;
2667 *allocated = true;
2668 }
2669
2670 return 0;
2671}
2672
2673/**
2674 * This "operation" for walk_over_reftable() writes the refblock to disk at the
2675 * offset specified by the new reftable's entry. It does not modify the new
2676 * reftable or change any refcounts.
2677 */
2678static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2679 uint64_t reftable_index, uint64_t *reftable_size,
2680 void *refblock, bool refblock_empty, bool *allocated,
2681 Error **errp)
2682{
2683 BDRVQcow2State *s = bs->opaque;
2684 int64_t offset;
2685 int ret;
2686
2687 if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2688 offset = (*reftable)[reftable_index];
2689
2690 ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2691 if (ret < 0) {
2692 error_setg_errno(errp, -ret, "Overlap check failed");
2693 return ret;
2694 }
2695
d9ca2ea2 2696 ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
791c9a00
HR
2697 if (ret < 0) {
2698 error_setg_errno(errp, -ret, "Failed to write refblock");
2699 return ret;
2700 }
2701 } else {
2702 assert(refblock_empty);
2703 }
2704
2705 return 0;
2706}
2707
2708/**
2709 * This function walks over the existing reftable and every referenced refblock;
2710 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2711 * create an equal new entry in the passed @new_refblock. Once that
2712 * @new_refblock is completely filled, @operation will be called.
2713 *
2714 * @status_cb and @cb_opaque are used for the amend operation's status callback.
2715 * @index is the index of the walk_over_reftable() calls and @total is the total
2716 * number of walk_over_reftable() calls per amend operation. Both are used for
2717 * calculating the parameters for the status callback.
2718 *
2719 * @allocated is set to true if a new cluster has been allocated.
2720 */
static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                              uint64_t *new_reftable_index,
                              uint64_t *new_reftable_size,
                              void *new_refblock, int new_refblock_size,
                              int new_refcount_bits,
                              RefblockFinishOp *operation, bool *allocated,
                              Qcow2SetRefcountFunc *new_set_refcount,
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque, int index, int total,
                              Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t reftable_index;
    bool new_refblock_empty = true;
    int refblock_index;
    int new_refblock_index = 0;
    int ret;

    /* Iterate over every entry of the current (old) refcount table */
    for (reftable_index = 0; reftable_index < s->refcount_table_size;
         reftable_index++)
    {
        uint64_t refblock_offset = s->refcount_table[reftable_index]
                                 & REFT_OFFSET_MASK;

        /* Progress spans all walk_over_reftable() calls of this amend
         * operation: @index of @total walks have been done before this one */
        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
                  (uint64_t)total * s->refcount_table_size, cb_opaque);

        if (refblock_offset) {
            void *refblock;

            if (offset_into_cluster(s, refblock_offset)) {
                /* On-disk corruption: refblock offsets must be
                 * cluster-aligned */
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: %#"
                                        PRIx64 ")", refblock_offset,
                                        reftable_index);
                error_setg(errp,
                           "Image is corrupt (unaligned refblock offset)");
                return -EIO;
            }

            /* Takes a reference on the cache entry; every exit path below
             * must release it with qcow2_cache_put() */
            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
                                  &refblock);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
                return ret;
            }

            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                uint64_t refcount;

                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete; hand it to @operation
                     * (allocate or flush) and start a fresh one */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        /* Drop the cache reference taken above */
                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                refcount = s->get_refcount(refblock, refblock_index);
                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                    /* The existing refcount does not fit into the narrower
                     * entry width, so the conversion must be aborted */
                    uint64_t offset;

                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);

                    /* Reconstruct the guest-visible cluster offset this
                     * refcount entry describes */
                    offset = ((reftable_index << s->refcount_block_bits)
                              + refblock_index) << s->cluster_bits;

                    error_setg(errp, "Cannot decrease refcount entry width to "
                               "%i bits: Cluster at offset %#" PRIx64 " has a "
                               "refcount of %" PRIu64, new_refcount_bits,
                               offset, refcount);
                    return -EINVAL;
                }

                /* new_set_refcount is NULL on pure allocation passes, where
                 * only the entry positions are counted, nothing is written */
                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++,
                                     refcount);
                } else {
                    new_refblock_index++;
                }
                new_refblock_empty = new_refblock_empty && refcount == 0;
            }

            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
        } else {
            /* No refblock means every refcount is 0 */
            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++, 0);
                } else {
                    new_refblock_index++;
                }
            }
        }
    }

    if (new_refblock_index > 0) {
        /* Complete the potentially existing partially filled final refblock */
        if (new_set_refcount) {
            /* Zero-fill the remainder so the block's tail is defined */
            for (; new_refblock_index < new_refblock_size;
                 new_refblock_index++)
            {
                new_set_refcount(new_refblock, new_refblock_index, 0);
            }
        }

        ret = operation(bs, new_reftable, *new_reftable_index,
                        new_reftable_size, new_refblock, new_refblock_empty,
                        allocated, errp);
        if (ret < 0) {
            return ret;
        }

        (*new_reftable_index)++;
    }

    /* Report this walk as fully completed */
    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
              (uint64_t)total * s->refcount_table_size, cb_opaque);

    return 0;
}
2867
/*
 * Changes the refcount entry width of the image to 2^@refcount_order bits.
 *
 * Proceeds in phases: (1) repeatedly walk the old reftable to allocate the
 * new refcount structures until no further allocation is needed, (2) walk
 * again to fill and flush the new refblocks, (3) write the new reftable,
 * (4) commit by updating the image header, then swap in the new in-memory
 * state.  On failure, the partially created new structures are freed under
 * the "done" label; on success, the same cleanup path frees the *old*
 * structures (the pointers are swapped before falling through).
 *
 * @status_cb and @cb_opaque report progress to the caller.
 * Returns 0 on success, -errno on failure (with @errp set).
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    /* Scratch buffer for one refblock; written via bdrv_* I/O, hence
     * blockalign */
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    /* Refcount widths other than 16 bits require a v3 image */
    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    do {
        int total_walks;

        new_allocation = false;

        /* At least we have to do this walk and the one which writes the
         * refblocks; also, at least we have to do this loop here at least
         * twice (normally), first to do the allocations, and second to
         * determine that everything is correctly allocated, this then makes
         * three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The allocation walk grew the new reftable, so any previously
             * allocated (now too small) reftable area must be replaced */
            if (new_reftable_offset) {
                /* NEVER discard: the clusters may be reused immediately by
                 * the reallocation below */
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
        /* Repeat until a walk completes without triggering new allocations */
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    /* All allocations were settled by the loop above */
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* Convert to big-endian on-disk format in place for the write... */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    /* ...and back to host endianness, regardless of the write's outcome,
     * since the buffer is still used below */
    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back the in-memory fields so the image state stays
         * consistent with the (unchanged) on-disk header */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    /* Maximum representable refcount: 2^refcount_bits - 1, computed in two
     * steps to avoid overflowing for 64-bit entries */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}