/* block/qcow2-refcount.c */
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
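
/*
 * Note (added): the "roN" suffix is the refcount order, i.e. the tables
 * above are indexed by s->refcount_order (0..6) and select accessors for
 * refcount entries that are 2^N bits wide.  The qcow2 default is
 * refcount_order = 4, i.e. 16-bit big-endian entries served by
 * get/set_refcount_ro4().
 */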

/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}
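
/*
 * Note (added): with refcount_order = 0 every entry is a single bit, so
 * entry @index lives in byte index / 8 at bit position index % 8; the
 * read-modify-write above first clears that bit and then ORs in the new
 * value.  The ro1/ro2 variants below apply the same pattern to 2-bit and
 * 4-bit entries.
 */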

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}

static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);

    return 0;
}
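
/*
 * Note (added): the lookup above is two-level: the high bits of
 * cluster_index select a refcount table entry (which points to one refcount
 * block), and the low bits select an entry inside that block.  For example,
 * with the default 64 KiB clusters and 16-bit refcounts, one refcount block
 * holds 32768 entries and therefore covers 2 GiB of host file space.
 */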

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
                                  uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */
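
    /*
     * Note (added): the -EAGAIN contract described above is honoured by
     * qcow2_alloc_clusters() and qcow2_alloc_clusters_at(), which simply
     * retry their update_refcount() call in a loop until no new refcount
     * structures had to be allocated.
     */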

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
            " at %" PRIx64 "\n",
            refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
                           s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}

/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
                           table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
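
/*
 * Note (added): the packed { uint64_t; uint32_t } write near the end of
 * qcow2_refcount_area() works because refcount_table_offset and
 * refcount_table_clusters are adjacent fields of QCowHeader, so both can be
 * updated with a single bdrv_pwrite_sync() call.
 */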

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs = bs,
        .offset = offset,
        .bytes = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
               || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
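
/*
 * Note (added): the first loop in update_refcount_discard() extends an
 * existing region when the freed range touches it; the loop after "found"
 * then coalesces regions that have become adjacent, so the discards issued
 * later by qcow2_process_discards() are as large as possible.
 */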

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
                                   s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
820 for(cluster_offset = start; cluster_offset <= last;
821 cluster_offset += s->cluster_size)
822 {
2aabe7c7 823 int block_index;
0e06528e 824 uint64_t refcount;
f7d0fe02 825 int64_t cluster_index = cluster_offset >> s->cluster_bits;
17bd5f47 826 int64_t table_index = cluster_index >> s->refcount_block_bits;
f7d0fe02 827
29c1a730
KW
828 /* Load the refcount block and allocate it if needed */
829 if (table_index != old_table_index) {
830 if (refcount_block) {
a3f1afb4 831 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
29c1a730 832 }
29c1a730 833 ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
ed0df867 834 if (ret < 0) {
29c1a730 835 goto fail;
f7d0fe02 836 }
f7d0fe02 837 }
29c1a730 838 old_table_index = table_index;
f7d0fe02 839
72e80b89
AG
840 qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
841 refcount_block);
f7d0fe02
KW
842
843 /* we can update the count and save it */
17bd5f47 844 block_index = cluster_index & (s->refcount_block_size - 1);
f7d0fe02 845
7453c96b 846 refcount = s->get_refcount(refcount_block, block_index);
0e06528e
HR
847 if (decrease ? (refcount - addend > refcount)
848 : (refcount + addend < refcount ||
849 refcount + addend > s->refcount_max))
2aabe7c7 850 {
09508d13
KW
851 ret = -EINVAL;
852 goto fail;
853 }
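        /*
         * Note (added): the check above exploits unsigned wraparound: when
         * decreasing, refcount - addend > refcount iff the subtraction
         * underflowed; when increasing, it catches both uint64_t overflow
         * and exceeding the width-dependent s->refcount_max.
         */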
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
                qcow2_cache_discard(bs, s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(bs, s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}


/*********************************************************/
/* cluster allocation functions */


/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
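
/*
 * Note (added): this is a simple first-fit scan.  s->free_cluster_index is
 * a cursor that only moves forward here and is pulled back by
 * update_refcount() when a cluster's refcount drops to zero, so the retry
 * loop terminates once it finds nb_clusters consecutive free clusters.
 */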

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
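
/*
 * Note (added): qcow2_alloc_bytes() is the sub-cluster allocator used only
 * for compressed clusters.  s->free_byte_offset points into the partially
 * filled host cluster, and each allocation from it increments that
 * cluster's refcount once more, so a host cluster that holds parts of
 * several compressed clusters ends up with one reference per user.
 */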

void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                                (l2_entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                nb_clusters << s->cluster_bits, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}


/*********************************************************/
/* snapshots and image creation */


/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                                   int64_t l1_table_offset, int l1_size,
                                   int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);
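
    /*
     * Note (added): addend is +1 when a snapshot is created (the snapshot
     * then shares every cluster reachable from this L1 table), -1 when one
     * is deleted; with 0 no refcounts change and the walk below merely
     * recomputes the QCOW_OFLAG_COPIED flags from the current refcounts.
     */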

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                                  (void **) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;
                uint64_t offset;

                entry = be64_to_cpu(l2_table[j]);
                old_entry = entry;
                entry &= ~QCOW_OFLAG_COPIED;
                offset = entry & L2E_OFFSET_MASK;

                switch (qcow2_get_cluster_type(entry)) {
                case QCOW2_CLUSTER_COMPRESSED:
                    nb_csectors = ((entry >> s->csize_shift) &
                                   s->csize_mask) + 1;
                    if (addend != 0) {
                        ret = update_refcount(bs,
                                (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }
                    /* compressed clusters are never modified */
                    refcount = 2;
                    break;

                case QCOW2_CLUSTER_NORMAL:
                case QCOW2_CLUSTER_ZERO_ALLOC:
                    if (offset_into_cluster(s, offset)) {
                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
                                                "allocation offset %#" PRIx64
                                                " unaligned (L2 offset: %#"
                                                PRIx64 ", L2 index: %#x)",
                                                offset, l2_offset, j);
                        ret = -EIO;
                        goto fail;
                    }

                    cluster_index = offset >> s->cluster_bits;
                    assert(cluster_index);
                    if (addend != 0) {
                        ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }

                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                    if (ret < 0) {
                        goto fail;
                    }
                    break;

                case QCOW2_CLUSTER_ZERO_PLAIN:
                case QCOW2_CLUSTER_UNALLOCATED:
                    refcount = 0;
                    break;

                default:
                    abort();
                }

                if (refcount == 1) {
                    entry |= QCOW_OFLAG_COPIED;
                }
                if (entry != old_entry) {
                    if (addend > 0) {
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                   s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(entry);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}


/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}
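
/*
 * Worked example (added): with refcount_order = 4 (16-bit entries), 1000
 * entries need DIV_ROUND_UP(1000 << 4, 8) = 2000 bytes, i.e. two bytes per
 * entry as expected.
 */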

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size = new_size;

    return 0;
}

/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * Modifies the number of errors in res.
 */
int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
                             void **refcount_table,
                             int64_t *refcount_table_size,
                             int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int ret;

    if (size <= 0) {
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}
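
/*
 * Note (added): "imrt" stands for the in-memory refcount table that the
 * image check builds from scratch while walking all metadata; it is later
 * compared against the refcount table stored in the image to detect leaks
 * and corruptions.
 */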

/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */
};

/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors, ret;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        goto fail;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
                        "copied flag must never be set for compressed "
                        "clusters\n", l2_entry >> s->cluster_bits);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_entry & ~511, nb_csectors * 512);
            if (ret < 0) {
                goto fail;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                        "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_UNALLOCATED:
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    g_free(l2_table);
    return ret;
}
1601/*
1602 * Increases the refcount for the L1 table, its L2 tables and all referenced
1603 * clusters in the given refcount table. While doing so, performs some checks
1604 * on L1 and L2 entries.
1605 *
1606 * Returns the number of errors found by the checks or -errno if an internal
1607 * error occurred.
1608 */
static int check_refcounts_l1(BlockDriverState *bs,
                              BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL, l2_offset, l1_size2;
    int i, ret;

    l1_size2 = l1_size * sizeof(uint64_t);

    /* Mark L1 table as used */
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
                                   l1_table_offset, l1_size2);
    if (ret < 0) {
        goto fail;
    }

    /* Read L1 table entries from disk */
    if (l1_size2 > 0) {
        l1_table = g_try_malloc(l1_size2);
        if (l1_table == NULL) {
            ret = -ENOMEM;
            res->check_errors++;
            goto fail;
        }
        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
            res->check_errors++;
            goto fail;
        }
        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }

    /* Do the actual checks */
    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* Mark L2 table as used */
            l2_offset &= L1E_OFFSET_MASK;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* L2 tables are cluster aligned */
            if (offset_into_cluster(s, l2_offset)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                        "cluster aligned; L1 entry corrupted\n", l2_offset);
                res->corruptions++;
            }

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, res, refcount_table,
                                     refcount_table_size, l2_offset, flags);
            if (ret < 0) {
                goto fail;
            }
        }
    }
    g_free(l1_table);
    return 0;

fail:
    g_free(l1_table);
    return ret;
}

/*
 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
 *
 * This function does not print an error message nor does it increment
 * check_errors if qcow2_get_refcount fails (this is because such an error will
 * have been already detected and sufficiently signaled by the calling function
 * (qcow2_check_refcounts) by the time this function is called).
 */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

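    /* Invariant being checked: QCOW_OFLAG_COPIED must be set in an L1/L2
     * entry if and only if the refcount of the referenced cluster is exactly
     * 1, i.e. only then may the cluster be rewritten in place. */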
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                s->l1_table[i] = refcount == 1
                               ? l1_entry | QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry | QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        if (l2_dirty) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}

/*
 * Checks consistency of refblocks and accounts for each refblock in
 * *refcount_table.
 */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for (i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        if (cluster >= *nb_clusters) {
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

1844
ed3d2ec9 1845 ret = bdrv_truncate(bs->file, offset + s->cluster_size,
7ea37c30 1846 PREALLOC_MODE_OFF, &local_err);
001c158d 1847 if (ret < 0) {
ed3d2ec9 1848 error_report_err(local_err);
001c158d
HR
1849 goto resize_fail;
1850 }
9a4f4c31 1851 size = bdrv_getlength(bs->file->bs);
001c158d
HR
1852 if (size < 0) {
1853 ret = size;
1854 goto resize_fail;
1855 }
1856
5fee192e
HR
1857 new_nb_clusters = size_to_clusters(s, size);
1858 assert(new_nb_clusters >= *nb_clusters);
001c158d 1859
5fee192e
HR
1860 ret = realloc_refcount_array(s, refcount_table,
1861 nb_clusters, new_nb_clusters);
1862 if (ret < 0) {
001c158d 1863 res->check_errors++;
5fee192e 1864 return ret;
001c158d 1865 }
001c158d
HR
1866
1867 if (cluster >= *nb_clusters) {
1868 ret = -EINVAL;
1869 goto resize_fail;
1870 }
1871
1872 res->corruptions_fixed++;
8a5bb1f1
VSO
1873 ret = qcow2_inc_refcounts_imrt(bs, res,
1874 refcount_table, nb_clusters,
1875 offset, s->cluster_size);
001c158d
HR
1876 if (ret < 0) {
1877 return ret;
1878 }
1879 /* No need to check whether the refcount is now greater than 1:
1880 * This area was just allocated and zeroed, so it can only be
8a5bb1f1 1881 * exactly 1 after qcow2_inc_refcounts_imrt() */
001c158d
HR
1882 continue;
1883
1884resize_fail:
1885 res->corruptions++;
f307b255 1886 *rebuild = true;
001c158d
HR
1887 fprintf(stderr, "ERROR could not resize image: %s\n",
1888 strerror(-ret));
1889 } else {
1890 res->corruptions++;
1891 }
6882c8fa 1892 continue;
746c3cb5
KW
1893 }
1894
        if (offset != 0) {
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
1913
057a3fe5
HR
1914/*
1915 * Calculates an in-memory refcount table.
1916 */
1917static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
f307b255 1918 BdrvCheckMode fix, bool *rebuild,
7453c96b 1919 void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i;
    QCowSnapshot *sn;
    int ret;

    if (!*refcount_table) {
        int64_t old_size = 0;
        ret = realloc_refcount_array(s, refcount_table,
                                     &old_size, *nb_clusters);
        if (ret < 0) {
            res->check_errors++;
            return ret;
        }
    }

    /* header */
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                   0, s->cluster_size);
    if (ret < 0) {
        return ret;
    }

    /* current L1 table */
    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
    if (ret < 0) {
        return ret;
    }

    /* snapshots */
    for (i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
                                 sn->l1_table_offset, sn->l1_size, 0);
        if (ret < 0) {
            return ret;
        }
    }
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                   s->snapshots_offset, s->snapshots_size);
    if (ret < 0) {
        return ret;
    }

    /* refcount data */
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                   s->refcount_table_offset,
                                   s->refcount_table_size * sizeof(uint64_t));
    if (ret < 0) {
        return ret;
    }

    /* encryption */
    if (s->crypto_header.length) {
        ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                       s->crypto_header.offset,
                                       s->crypto_header.length);
        if (ret < 0) {
            return ret;
        }
    }

    /* bitmaps */
    ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
    if (ret < 0) {
        return ret;
    }

    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
}

/*
 * Compares the actual reference count for each cluster in the image against the
 * refcount as reported by the refcount structures on-disk.
 */
static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix, bool *rebuild,
                              int64_t *highest_cluster,
                              void *refcount_table, int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i;
    uint64_t refcount1, refcount2;
    int ret;

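    /* refcount1 is the refcount read from the on-disk refcount structures;
     * refcount2 is the reference count recomputed into the in-memory
     * refcount table from actual cluster usage. */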
    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
        ret = qcow2_get_refcount(bs, i, &refcount1);
        if (ret < 0) {
            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
                    i, strerror(-ret));
            res->check_errors++;
            continue;
        }

        refcount2 = s->get_refcount(refcount_table, i);

        if (refcount1 > 0 || refcount2 > 0) {
            *highest_cluster = i;
        }

        if (refcount1 != refcount2) {
            /* Check if we're allowed to fix the mismatch */
            int *num_fixed = NULL;
            if (refcount1 == 0) {
                *rebuild = true;
            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
                num_fixed = &res->leaks_fixed;
            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
                num_fixed = &res->corruptions_fixed;
            }

            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
                    " reference=%" PRIu64 "\n",
                    num_fixed != NULL     ? "Repairing" :
                    refcount1 < refcount2 ? "ERROR" :
                                            "Leaked",
                    i, refcount1, refcount2);

            if (num_fixed) {
                ret = update_refcount(bs, i << s->cluster_bits, 1,
                                      refcount_diff(refcount1, refcount2),
                                      refcount1 > refcount2,
                                      QCOW2_DISCARD_ALWAYS);
                if (ret >= 0) {
                    (*num_fixed)++;
                    continue;
                }
            }

            /* And if we couldn't, print an error */
            if (refcount1 < refcount2) {
                res->corruptions++;
            } else {
                res->leaks++;
            }
        }
    }
}

/*
 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
 * the on-disk refcount structures.
 *
 * On input, *first_free_cluster tells where to start looking, and need not
 * actually be a free cluster; the returned offset will not be before that
 * cluster. On output, *first_free_cluster points to the first gap found, even
 * if that gap was too small to be used as the returned offset.
 *
 * Note that *first_free_cluster is a cluster index whereas the return value is
 * an offset.
 */
static int64_t alloc_clusters_imrt(BlockDriverState *bs,
                                   int cluster_count,
                                   void **refcount_table,
                                   int64_t *imrt_nb_clusters,
                                   int64_t *first_free_cluster)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t cluster = *first_free_cluster, i;
    bool first_gap = true;
    int contiguous_free_clusters;
    int ret;

    /* Starting at *first_free_cluster, find a range of at least cluster_count
     * continuously free clusters */
    for (contiguous_free_clusters = 0;
         cluster < *imrt_nb_clusters &&
         contiguous_free_clusters < cluster_count;
         cluster++)
    {
        if (!s->get_refcount(*refcount_table, cluster)) {
            contiguous_free_clusters++;
            if (first_gap) {
                /* If this is the first free cluster found, update
                 * *first_free_cluster accordingly */
                *first_free_cluster = cluster;
                first_gap = false;
            }
        } else if (contiguous_free_clusters) {
            contiguous_free_clusters = 0;
        }
    }

    /* If contiguous_free_clusters is greater than zero, it contains the number
     * of continuously free clusters until the current cluster; the first free
     * cluster in the current "gap" is therefore
     * cluster - contiguous_free_clusters */

    /* If no such range could be found, grow the in-memory refcount table
     * accordingly to append free clusters at the end of the image */
    if (contiguous_free_clusters < cluster_count) {
        /* contiguous_free_clusters clusters are already empty at the image end;
         * we need cluster_count clusters; therefore, we have to allocate
         * cluster_count - contiguous_free_clusters new clusters at the end of
         * the image (which is the current value of cluster; note that cluster
         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
         * the image end) */
        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
                                     cluster + cluster_count
                                     - contiguous_free_clusters);
        if (ret < 0) {
            return ret;
        }
    }

    /* Go back to the first free cluster */
    cluster -= contiguous_free_clusters;
    for (i = 0; i < cluster_count; i++) {
        s->set_refcount(*refcount_table, cluster + i, 1);
    }

    return cluster << s->cluster_bits;
}

/*
 * Creates a new refcount structure based solely on the in-memory information
 * given through *refcount_table. All necessary allocations will be reflected
 * in that array.
 *
 * On success, the old refcount structure is leaked (it will be covered by the
 * new refcount structure).
 */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
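    /* Mirrors the adjacent refcount_table_offset and refcount_table_clusters
     * fields of QCowHeader, so both can be updated with a single write. */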
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

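    /* Make sure no stale refblocks remain cached: they describe the old
     * refcount structure, which is about to be replaced wholesale. */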
    qcow2_cache_empty(bs, s->refcount_block_cache);

write_refblocks:
    for (; cluster < *nb_clusters; cluster++) {
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
                                     s->cluster_size) / sizeof(uint64_t);
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 sizeof(uint64_t));
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * sizeof(uint64_t));

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        } else {
            assert(on_disk_reftable);
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                          sizeof(uint64_t));
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
                         on_disk_refblock, s->cluster_sectors);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    if (reftable_offset < 0) {
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters = size_to_clusters(s,
                                             reftable_size * sizeof(uint64_t));
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

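        /* The reftable clusters just allocated need refcount coverage
         * themselves; resume the loop so refblocks are written for them too. */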
        goto write_refblocks;
    }

    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / sizeof(uint64_t));
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}

/*
 * Checks an image for refcount consistency.
 *
 * Returns 0 if no internal errors occurred (any corruptions and leaks found
 * are reported through @res), and -errno when an internal error occurred.
 */
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    BdrvCheckResult pre_compare_res;
    int64_t size, highest_cluster, nb_clusters;
    void *refcount_table = NULL;
    bool rebuild = false;
    int ret;

    size = bdrv_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
    }

    nb_clusters = size_to_clusters(s, size);
    if (nb_clusters > INT_MAX) {
        res->check_errors++;
        return -EFBIG;
    }

    res->bfi.total_clusters =
        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);

    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
                              &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* In case we don't need to rebuild the refcount structure (but want to fix
     * something), this function is immediately called again, in which case the
     * result should be ignored */
    pre_compare_res = *res;
    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
                      nb_clusters);

    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
        BdrvCheckResult old_res = *res;
        int fresh_leaks = 0;

        fprintf(stderr, "Rebuilding refcount structure\n");
        ret = rebuild_refcount_structure(bs, res, &refcount_table,
                                         &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        res->corruptions = 0;
        res->leaks = 0;

        /* Because the old reftable has been exchanged for a new one, the
         * references have to be recalculated */
        rebuild = false;
        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
                                  &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        if (fix & BDRV_FIX_LEAKS) {
            /* The old refcount structures are now leaked, fix it; the result
             * can be ignored, aside from leaks which were introduced by
             * rebuild_refcount_structure() that could not be fixed */
            BdrvCheckResult saved_res = *res;
            *res = (BdrvCheckResult){ 0 };

            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
                              &highest_cluster, refcount_table, nb_clusters);
            if (rebuild) {
                fprintf(stderr, "ERROR rebuilt refcount structure is still "
                        "broken\n");
            }

            /* Any leaks accounted for here were introduced by
             * rebuild_refcount_structure() because that function has created a
             * new refcount structure from scratch */
            fresh_leaks = res->leaks;
            *res = saved_res;
        }

        if (res->corruptions < old_res.corruptions) {
            res->corruptions_fixed += old_res.corruptions - res->corruptions;
        }
        if (res->leaks < old_res.leaks) {
            res->leaks_fixed += old_res.leaks - res->leaks;
        }
        res->leaks += fresh_leaks;
    } else if (fix) {
        if (rebuild) {
            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
            res->check_errors++;
            ret = -EIO;
            goto fail;
        }

        if (res->leaks || res->corruptions) {
            *res = pre_compare_res;
            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
                              refcount_table, nb_clusters);
        }
    }

    /* check OFLAG_COPIED */
    ret = check_oflag_copied(bs, res, fix);
    if (ret < 0) {
        goto fail;
    }

    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
    ret = 0;

fail:
    g_free(refcount_table);

    return ret;
}

#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
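/* Note: deliberately unhygienic; "offset" and "size" refer to the local
 * variables of the function the macro is expanded in. */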

/*
 * Checks if the given offset into the image file is actually free to use by
 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
 * i.e. a sanity check without relying on the refcount tables.
 *
 * The ign parameter specifies what checks not to perform (being a bitmask of
 * QCow2MetadataOverlap values), i.e., what sections to ignore.
 *
 * Returns:
 * - 0 if writing to this offset will not affect the mentioned metadata
 * - a positive QCow2MetadataOverlap value indicating one overlapping section
 * - a negative value (-errno) indicating an error while performing a check,
 *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
 */
int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    int chk = s->overlap_check & ~ign;
    int i, j;

    if (!size) {
        return 0;
    }

    if (chk & QCOW2_OL_MAIN_HEADER) {
        if (offset < s->cluster_size) {
            return QCOW2_OL_MAIN_HEADER;
        }
    }

    /* align range to test to cluster boundaries */
    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);
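    /* Widening the tested range to cluster boundaries is conservative: it can
     * only report additional overlaps, never miss one. */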

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
            return QCOW2_OL_ACTIVE_L1;
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
        if (overlaps_with(s->refcount_table_offset,
                          s->refcount_table_size * sizeof(uint64_t))) {
            return QCOW2_OL_REFCOUNT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
            return QCOW2_OL_SNAPSHOT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            if (s->snapshots[i].l1_size &&
                overlaps_with(s->snapshots[i].l1_table_offset,
                              s->snapshots[i].l1_size * sizeof(uint64_t))) {
                return QCOW2_OL_INACTIVE_L1;
            }
        }
    }

    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
        for (i = 0; i < s->l1_size; i++) {
            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
                              s->cluster_size)) {
                return QCOW2_OL_ACTIVE_L2;
            }
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
        unsigned last_entry = s->max_refcount_table_index;
        assert(last_entry < s->refcount_table_size);
        assert(last_entry + 1 == s->refcount_table_size ||
               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
        for (i = 0; i <= last_entry; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                              s->cluster_size)) {
                return QCOW2_OL_REFCOUNT_BLOCK;
            }
        }
    }

    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
            uint64_t *l1 = g_try_malloc(l1_sz2);
            int ret;

            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }

            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
            }

            for (j = 0; j < l1_sz; j++) {
                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
                    g_free(l1);
                    return QCOW2_OL_INACTIVE_L2;
                }
            }

            g_free(l1);
        }
    }

    return 0;
}

static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};

/*
 * First performs a check for metadata overlaps (through
 * qcow2_check_metadata_overlap); if that fails with a negative value (error
 * while performing a check), that value is returned. If an impending overlap
 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
 * and -EIO returned.
 *
 * Returns 0 if there were neither overlaps nor errors while checking for
 * overlaps; or a negative value (-errno) on error.
 */
int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
                                  int64_t size)
{
    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);

    if (ret < 0) {
        return ret;
    } else if (ret > 0) {
        int metadata_ol_bitnr = ctz32(ret);
        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);

        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
                                "write on metadata (overlaps with %s)",
                                metadata_ol_names[metadata_ol_bitnr]);
        return -EIO;
    }

    return 0;
}

/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);

/**
 * This "operation" for walk_over_reftable() allocates the refblock on disk (if
 * it is not empty) and inserts its offset into the new reftable. The size of
 * this new reftable is increased as required.
 */
static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
                          uint64_t reftable_index, uint64_t *reftable_size,
                          void *refblock, bool refblock_empty, bool *allocated,
                          Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;

    if (!refblock_empty && reftable_index >= *reftable_size) {
        uint64_t *new_reftable;
        uint64_t new_reftable_size;

        new_reftable_size = ROUND_UP(reftable_index + 1,
                                     s->cluster_size / sizeof(uint64_t));
        if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
            error_setg(errp,
                       "This operation would make the refcount table grow "
                       "beyond the maximum size supported by QEMU, aborting");
            return -ENOTSUP;
        }

        new_reftable = g_try_realloc(*reftable, new_reftable_size *
                                                sizeof(uint64_t));
        if (!new_reftable) {
            error_setg(errp, "Failed to increase reftable buffer size");
            return -ENOMEM;
        }

        memset(new_reftable + *reftable_size, 0,
               (new_reftable_size - *reftable_size) * sizeof(uint64_t));

        *reftable = new_reftable;
        *reftable_size = new_reftable_size;
    }

    if (!refblock_empty && !(*reftable)[reftable_index]) {
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
        if (offset < 0) {
            error_setg_errno(errp, -offset, "Failed to allocate refblock");
            return offset;
        }
        (*reftable)[reftable_index] = offset;
        *allocated = true;
    }

    return 0;
}

/**
 * This "operation" for walk_over_reftable() writes the refblock to disk at the
 * offset specified by the new reftable's entry. It does not modify the new
 * reftable or change any refcounts.
 */
static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
                          uint64_t reftable_index, uint64_t *reftable_size,
                          void *refblock, bool refblock_empty, bool *allocated,
                          Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    int ret;

    if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
        offset = (*reftable)[reftable_index];

        ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Overlap check failed");
            return ret;
        }

        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to write refblock");
            return ret;
        }
    } else {
        assert(refblock_empty);
    }

    return 0;
}

/**
 * This function walks over the existing reftable and every referenced refblock;
 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
 * create an equal new entry in the passed @new_refblock. Once that
 * @new_refblock is completely filled, @operation will be called.
 *
 * @status_cb and @cb_opaque are used for the amend operation's status callback.
 * @index is the index of the walk_over_reftable() calls and @total is the total
 * number of walk_over_reftable() calls per amend operation. Both are used for
 * calculating the parameters for the status callback.
 *
 * @allocated is set to true if a new cluster has been allocated.
 */
static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                              uint64_t *new_reftable_index,
                              uint64_t *new_reftable_size,
                              void *new_refblock, int new_refblock_size,
                              int new_refcount_bits,
                              RefblockFinishOp *operation, bool *allocated,
                              Qcow2SetRefcountFunc *new_set_refcount,
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque, int index, int total,
                              Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t reftable_index;
    bool new_refblock_empty = true;
    int refblock_index;
    int new_refblock_index = 0;
    int ret;

    for (reftable_index = 0; reftable_index < s->refcount_table_size;
         reftable_index++)
    {
        uint64_t refblock_offset = s->refcount_table[reftable_index]
                                 & REFT_OFFSET_MASK;

        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
                  (uint64_t)total * s->refcount_table_size, cb_opaque);

        if (refblock_offset) {
            void *refblock;

            if (offset_into_cluster(s, refblock_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: %#"
                                        PRIx64 ")", refblock_offset,
                                        reftable_index);
                error_setg(errp,
                           "Image is corrupt (unaligned refblock offset)");
                return -EIO;
            }

            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
                                  &refblock);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
                return ret;
            }

            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                uint64_t refcount;

                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                refcount = s->get_refcount(refblock, refblock_index);
                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                    uint64_t offset;

                    qcow2_cache_put(bs, s->refcount_block_cache, &refblock);

                    offset = ((reftable_index << s->refcount_block_bits)
                              + refblock_index) << s->cluster_bits;

                    error_setg(errp, "Cannot decrease refcount entry width to "
                               "%i bits: Cluster at offset %#" PRIx64 " has a "
                               "refcount of %" PRIu64, new_refcount_bits,
                               offset, refcount);
                    return -EINVAL;
                }

                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++,
                                     refcount);
                } else {
                    new_refblock_index++;
                }
                new_refblock_empty = new_refblock_empty && refcount == 0;
            }

            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
        } else {
            /* No refblock means every refcount is 0 */
            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++, 0);
                } else {
                    new_refblock_index++;
                }
            }
        }
    }

    if (new_refblock_index > 0) {
        /* Complete the potentially existing partially filled final refblock */
        if (new_set_refcount) {
            for (; new_refblock_index < new_refblock_size;
                 new_refblock_index++)
            {
                new_set_refcount(new_refblock, new_refblock_index, 0);
            }
        }

        ret = operation(bs, new_reftable, *new_reftable_index,
                        new_reftable_size, new_refblock, new_refblock_empty,
                        allocated, errp);
        if (ret < 0) {
            return ret;
        }

        (*new_reftable_index)++;
    }

    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
              (uint64_t)total * s->refcount_table_size, cb_opaque);

    return 0;
}

int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
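    /* A refblock covers one cluster and each of its entries takes
     * 2^refcount_order bits, so one cluster holds
     * cluster_size * 8 / 2^refcount_order = 2^(cluster_bits + 3 - refcount_order)
     * entries, which is what this expression computes. */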

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    do {
        int total_walks;

        new_allocation = false;

        /* We have to do at least this walk and the final one that writes the
         * refblocks; in addition, this loop normally runs at least twice
         * (once to perform the allocations and once to verify that everything
         * has been allocated correctly), which makes three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            if (new_reftable_offset) {
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order = s->refcount_order;
    old_reftable_size = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order = refcount_order;
    s->refcount_table_size = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        s->refcount_order = old_refcount_order;
        s->refcount_table_size = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
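    /* Computed in two steps so that refcount_bits == 64 does not shift a
     * 64-bit value by its full width (which would be undefined behavior);
     * the result is 2^refcount_bits - 1. */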
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable = old_reftable;
    new_reftable_size = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}