]> git.proxmox.com Git - mirror_qemu.git/blob - block/qcow2-refcount.c
qcow2: Allow alloc_clusters_noref to return errors
[mirror_qemu.git] / block / qcow2-refcount.c
1 /*
2 * Block driver for the QCOW version 2 format
3 *
4 * Copyright (c) 2004-2006 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu-common.h"
26 #include "block_int.h"
27 #include "block/qcow2.h"
28
29 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
30 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
31 int64_t offset, int64_t length,
32 int addend);
33
34
35 static int cache_refcount_updates = 0;
36
37 static int write_refcount_block(BlockDriverState *bs)
38 {
39 BDRVQcowState *s = bs->opaque;
40 size_t size = s->cluster_size;
41
42 if (s->refcount_block_cache_offset == 0) {
43 return 0;
44 }
45
46 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE);
47 if (bdrv_pwrite(bs->file, s->refcount_block_cache_offset,
48 s->refcount_block_cache, size) != size)
49 {
50 return -EIO;
51 }
52
53 return 0;
54 }
55
56 /*********************************************************/
57 /* refcount handling */
58
59 int qcow2_refcount_init(BlockDriverState *bs)
60 {
61 BDRVQcowState *s = bs->opaque;
62 int ret, refcount_table_size2, i;
63
64 s->refcount_block_cache = qemu_malloc(s->cluster_size);
65 refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
66 s->refcount_table = qemu_malloc(refcount_table_size2);
67 if (s->refcount_table_size > 0) {
68 BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
69 ret = bdrv_pread(bs->file, s->refcount_table_offset,
70 s->refcount_table, refcount_table_size2);
71 if (ret != refcount_table_size2)
72 goto fail;
73 for(i = 0; i < s->refcount_table_size; i++)
74 be64_to_cpus(&s->refcount_table[i]);
75 }
76 return 0;
77 fail:
78 return -ENOMEM;
79 }
80
81 void qcow2_refcount_close(BlockDriverState *bs)
82 {
83 BDRVQcowState *s = bs->opaque;
84 qemu_free(s->refcount_block_cache);
85 qemu_free(s->refcount_table);
86 }
87
88
89 static int load_refcount_block(BlockDriverState *bs,
90 int64_t refcount_block_offset)
91 {
92 BDRVQcowState *s = bs->opaque;
93 int ret;
94
95 if (cache_refcount_updates) {
96 write_refcount_block(bs);
97 }
98
99 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
100 ret = bdrv_pread(bs->file, refcount_block_offset, s->refcount_block_cache,
101 s->cluster_size);
102 if (ret != s->cluster_size)
103 return -EIO;
104 s->refcount_block_cache_offset = refcount_block_offset;
105 return 0;
106 }
107
108 /*
109 * Returns the refcount of the cluster given by its index. Any non-negative
110 * return value is the refcount of the cluster, negative values are -errno
111 * and indicate an error.
112 */
113 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
114 {
115 BDRVQcowState *s = bs->opaque;
116 int refcount_table_index, block_index;
117 int64_t refcount_block_offset;
118 int ret;
119
120 refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
121 if (refcount_table_index >= s->refcount_table_size)
122 return 0;
123 refcount_block_offset = s->refcount_table[refcount_table_index];
124 if (!refcount_block_offset)
125 return 0;
126 if (refcount_block_offset != s->refcount_block_cache_offset) {
127 /* better than nothing: return allocated if read error */
128 ret = load_refcount_block(bs, refcount_block_offset);
129 if (ret < 0) {
130 return ret;
131 }
132 }
133 block_index = cluster_index &
134 ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
135 return be16_to_cpu(s->refcount_block_cache[block_index]);
136 }
137
138 /*
139 * Rounds the refcount table size up to avoid growing the table for each single
140 * refcount block that is allocated.
141 */
142 static unsigned int next_refcount_table_size(BDRVQcowState *s,
143 unsigned int min_size)
144 {
145 unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
146 unsigned int refcount_table_clusters =
147 MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
148
149 while (min_clusters > refcount_table_clusters) {
150 refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
151 }
152
153 return refcount_table_clusters << (s->cluster_bits - 3);
154 }
155
156
157 /* Checks if two offsets are described by the same refcount block */
158 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
159 uint64_t offset_b)
160 {
161 uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
162 uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
163
164 return (block_a == block_b);
165 }
166
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns the offset of the refcount block on success or -errno in error case
 */
static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index];

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (refcount_block_offset != s->refcount_block_cache_offset) {
                ret = load_refcount_block(bs, refcount_block_offset);
                if (ret < 0) {
                    return ret;
                }
            }
            return refcount_block_offset;
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and doing the initial refcount increase. This means that some clusters
     *   have already been allocated by the caller, but their refcount isn't
     *   accurate yet. free_cluster_index tells us where this allocation ends
     *   as long as we don't overwrite it by freeing clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    /* Flush the cached block: the allocation below may evict it */
    if (cache_refcount_updates) {
        ret = write_refcount_block(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        memset(s->refcount_block_cache, 0, s->cluster_size);
        s->refcount_block_cache_offset = new_block;

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
        s->refcount_block_cache[block_index] = cpu_to_be16(1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        memset(s->refcount_block_cache, 0, s->cluster_size);
        s->refcount_block_cache_offset = new_block;
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    ret = bdrv_pwrite(bs->file, new_block, s->refcount_block_cache,
        s->cluster_size);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;
        return new_block;
    }

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far */
    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
    uint64_t blocks_used = (s->free_cluster_index +
        refcount_block_clusters - 1) / refcount_block_clusters;

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    /* Iterate until the sizes converge: growing the table may require more
     * refcount blocks, which in turn may require a bigger table */
    do {
        uint64_t table_clusters = size_to_clusters(s, table_size);
        blocks_clusters = 1 +
            ((table_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            ((meta_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters));

    } while (last_table_size != table_size);

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks */
    /* The new metadata is placed right after the last cluster the existing
     * refcount blocks can describe: blocks first, then the table */
    uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
    uint16_t *new_blocks = qemu_mallocz(blocks_clusters * s->cluster_size);
    uint64_t *new_table = qemu_mallocz(table_size * sizeof(uint64_t));

    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks */
    /* Refcount 1 for every cluster of the new table and the new blocks;
     * these entries are consecutive at the start of the new blocks */
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        new_blocks[block++] = cpu_to_be16(1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite(bs->file, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    qemu_free(new_blocks);
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk */
    for(i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    /* Convert the entries back to CPU order: new_table is about to become
     * the in-memory refcount table (cpu_to_be64s is its own inverse) */
    for(i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    uint8_t data[12];
    cpu_to_be64w((uint64_t*)data, table_offset);
    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, refcount_table_offset),
        data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    qemu_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. Remember, we must not change free_cluster_index */
    uint64_t old_free_cluster_index = s->free_cluster_index;
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
    s->free_cluster_index = old_free_cluster_index;

    ret = load_refcount_block(bs, new_block);
    if (ret < 0) {
        goto fail_block;
    }

    return new_block;

fail_table:
    qemu_free(new_table);
fail_block:
    /* Invalidate the block cache: its contents may not match the disk */
    s->refcount_block_cache_offset = 0;
    return ret;
}
418
419 #define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
420 static int write_refcount_block_entries(BlockDriverState *bs,
421 int64_t refcount_block_offset, int first_index, int last_index)
422 {
423 BDRVQcowState *s = bs->opaque;
424 size_t size;
425 int ret;
426
427 if (cache_refcount_updates) {
428 return 0;
429 }
430
431 if (first_index < 0) {
432 return 0;
433 }
434
435 first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
436 last_index = (last_index + REFCOUNTS_PER_SECTOR)
437 & ~(REFCOUNTS_PER_SECTOR - 1);
438
439 size = (last_index - first_index) << REFCOUNT_SHIFT;
440
441 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
442 ret = bdrv_pwrite(bs->file,
443 refcount_block_offset + (first_index << REFCOUNT_SHIFT),
444 &s->refcount_block_cache[first_index], size);
445 if (ret < 0) {
446 return ret;
447 }
448
449 return 0;
450 }
451
/* XXX: cache several refcount block clusters ? */
/*
 * Adds @addend to the refcount of every cluster touched by the byte range
 * [@offset, @offset + @length). Dirty entries of the cached refcount block
 * are written back whenever the loop moves to a different refcount block,
 * and once more at the end. If an error occurs midway, the updates applied
 * so far are undone on a best-effort basis.
 *
 * Returns 0 on success, -errno on error.
 */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
    int64_t offset, int64_t length, int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t start, last, cluster_offset;
    int64_t refcount_block_offset = 0;
    int64_t table_index = -1, old_table_index;
    int first_index = -1, last_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
        offset, length, addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    /* Iterate cluster-wise over the affected range */
    start = offset & ~(s->cluster_size - 1);
    last = (offset + length - 1) & ~(s->cluster_size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index, refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t new_block;

        /* Only write refcount block to disk when we are done with it */
        old_table_index = table_index;
        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
        if ((old_table_index >= 0) && (table_index != old_table_index)) {

            ret = write_refcount_block_entries(bs, refcount_block_offset,
                first_index, last_index);
            if (ret < 0) {
                return ret;
            }

            /* Start a fresh dirty range for the next block */
            first_index = -1;
            last_index = -1;
        }

        /* Load the refcount block and allocate it if needed */
        new_block = alloc_refcount_block(bs, cluster_index);
        if (new_block < 0) {
            ret = new_block;
            goto fail;
        }
        refcount_block_offset = new_block;

        /* we can update the count and save it */
        block_index = cluster_index &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
        /* Widen the dirty range within the cached block */
        if (first_index == -1 || block_index < first_index) {
            first_index = block_index;
        }
        if (block_index > last_index) {
            last_index = block_index;
        }

        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
        refcount += addend;
        /* On-disk refcounts are unsigned 16 bit: reject under-/overflow */
        if (refcount < 0 || refcount > 0xffff) {
            ret = -EINVAL;
            goto fail;
        }
        /* Freed clusters become candidates for the next allocation */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
    }

    ret = 0;
fail:

    /* Write last changed block to disk */
    if (refcount_block_offset != 0) {
        int wret;
        wret = write_refcount_block_entries(bs, refcount_block_offset,
            first_index, last_index);
        if (wret < 0) {
            return ret < 0 ? ret : wret;
        }
    }

    /*
     * Try do undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
    }

    return ret;
}
551
552 /*
553 * Increases or decreases the refcount of a given cluster by one.
554 * addend must be 1 or -1.
555 *
556 * If the return value is non-negative, it is the new refcount of the cluster.
557 * If it is negative, it is -errno and indicates an error.
558 */
559 static int update_cluster_refcount(BlockDriverState *bs,
560 int64_t cluster_index,
561 int addend)
562 {
563 BDRVQcowState *s = bs->opaque;
564 int ret;
565
566 ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
567 if (ret < 0) {
568 return ret;
569 }
570
571 return get_refcount(bs, cluster_index);
572 }
573
574
575
576 /*********************************************************/
577 /* cluster allocation functions */
578
579
580
581 /* return < 0 if error */
582 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
583 {
584 BDRVQcowState *s = bs->opaque;
585 int i, nb_clusters, refcount;
586
587 nb_clusters = size_to_clusters(s, size);
588 retry:
589 for(i = 0; i < nb_clusters; i++) {
590 int64_t next_cluster_index = s->free_cluster_index++;
591 refcount = get_refcount(bs, next_cluster_index);
592
593 if (refcount < 0) {
594 return refcount;
595 } else if (refcount != 0) {
596 goto retry;
597 }
598 }
599 #ifdef DEBUG_ALLOC2
600 printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
601 size,
602 (s->free_cluster_index - nb_clusters) << s->cluster_bits);
603 #endif
604 return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
605 }
606
607 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
608 {
609 int64_t offset;
610 int ret;
611
612 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
613 offset = alloc_clusters_noref(bs, size);
614 if (offset < 0) {
615 return offset;
616 }
617
618 ret = update_refcount(bs, offset, size, 1);
619 if (ret < 0) {
620 return ret;
621 }
622 return offset;
623 }
624
625 /* only used to allocate compressed sectors. We try to allocate
626 contiguous sectors. size must be <= cluster_size */
627 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
628 {
629 BDRVQcowState *s = bs->opaque;
630 int64_t offset, cluster_offset;
631 int free_in_cluster;
632
633 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
634 assert(size > 0 && size <= s->cluster_size);
635 if (s->free_byte_offset == 0) {
636 s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
637 if (s->free_byte_offset < 0) {
638 return s->free_byte_offset;
639 }
640 }
641 redo:
642 free_in_cluster = s->cluster_size -
643 (s->free_byte_offset & (s->cluster_size - 1));
644 if (size <= free_in_cluster) {
645 /* enough space in current cluster */
646 offset = s->free_byte_offset;
647 s->free_byte_offset += size;
648 free_in_cluster -= size;
649 if (free_in_cluster == 0)
650 s->free_byte_offset = 0;
651 if ((offset & (s->cluster_size - 1)) != 0)
652 update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
653 } else {
654 offset = qcow2_alloc_clusters(bs, s->cluster_size);
655 if (offset < 0) {
656 return offset;
657 }
658 cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
659 if ((cluster_offset + s->cluster_size) == offset) {
660 /* we are lucky: contiguous data */
661 offset = s->free_byte_offset;
662 update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
663 s->free_byte_offset += size;
664 } else {
665 s->free_byte_offset = offset;
666 goto redo;
667 }
668 }
669 return offset;
670 }
671
672 void qcow2_free_clusters(BlockDriverState *bs,
673 int64_t offset, int64_t size)
674 {
675 int ret;
676
677 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
678 ret = update_refcount(bs, offset, size, -1);
679 if (ret < 0) {
680 fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
681 /* TODO Remember the clusters to free them later and avoid leaking */
682 }
683 }
684
685 /*
686 * free_any_clusters
687 *
688 * free clusters according to its type: compressed or not
689 *
690 */
691
692 void qcow2_free_any_clusters(BlockDriverState *bs,
693 uint64_t cluster_offset, int nb_clusters)
694 {
695 BDRVQcowState *s = bs->opaque;
696
697 /* free the cluster */
698
699 if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
700 int nb_csectors;
701 nb_csectors = ((cluster_offset >> s->csize_shift) &
702 s->csize_mask) + 1;
703 qcow2_free_clusters(bs,
704 (cluster_offset & s->cluster_offset_mask) & ~511,
705 nb_csectors * 512);
706 return;
707 }
708
709 qcow2_free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
710
711 return;
712 }
713
714
715
716 /*********************************************************/
717 /* snapshots and image creation */
718
719
720
721 void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
722 int64_t size)
723 {
724 int refcount;
725 int64_t start, last, cluster_offset;
726 uint16_t *p;
727
728 start = offset & ~(s->cluster_size - 1);
729 last = (offset + size - 1) & ~(s->cluster_size - 1);
730 for(cluster_offset = start; cluster_offset <= last;
731 cluster_offset += s->cluster_size) {
732 p = &s->refcount_block[cluster_offset >> s->cluster_bits];
733 refcount = be16_to_cpu(*p);
734 refcount++;
735 *p = cpu_to_be16(refcount);
736 }
737 }
738
/* update the refcounts of snapshots and the copied flag */
/*
 * Walks the L1 table at @l1_table_offset (and all L2 tables it references)
 * and adds @addend to the refcount of every referenced cluster. With
 * addend == 0 only the QCOW_OFLAG_COPIED flags are recomputed (set iff the
 * refcount is exactly 1). Modified L2 tables and the L1 table are written
 * back to disk. Refcount block writes are batched for the duration of the
 * walk via cache_refcount_updates.
 *
 * Returns 0 on success, -EIO on any I/O or refcount error.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;

    /* The L2 cache may become stale while we rewrite L2 entries */
    qcow2_l2_cache_reset(bs);
    /* Batch refcount block writes until the walk is finished */
    cache_refcount_updates = 1;

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    /* Either load the snapshot's L1 table from disk, or reuse the active
     * one in place (l1_allocated tracks ownership for cleanup) */
    if (l1_table_offset != s->l1_table_offset) {
        if (l1_size2 != 0) {
            l1_table = qemu_mallocz(align_offset(l1_size2, 512));
        } else {
            l1_table = NULL;
        }
        l1_allocated = 1;
        if (bdrv_pread(bs->file, l1_table_offset,
            l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    l1_modified = 0;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= ~QCOW_OFLAG_COPIED;
            l2_modified = 0;
            if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    old_offset = offset;
                    offset &= ~QCOW_OFLAG_COPIED;
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* Compressed: size in 512-byte sectors comes from
                         * the L2 entry itself */
                        nb_csectors = ((offset >> s->csize_shift) &
                                s->csize_mask) + 1;
                        if (addend != 0) {
                            int ret;
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, addend);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                    } else {
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
                        } else {
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
                        }

                        if (refcount < 0) {
                            goto fail;
                        }
                    }

                    /* COPIED means "writable in place": only valid while
                     * the cluster has exactly one reference */
                    if (refcount == 1) {
                        offset |= QCOW_OFLAG_COPIED;
                    }
                    if (offset != old_offset) {
                        l2_table[j] = cpu_to_be64(offset);
                        l2_modified = 1;
                    }
                }
            }
            if (l2_modified) {
                if (bdrv_pwrite(bs->file,
                    l2_offset, l2_table, l2_size) != l2_size)
                    goto fail;
            }

            /* Update the refcount / COPIED flag of the L2 table itself */
            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }
    if (l1_modified) {
        /* Write big-endian on disk, then convert back for in-memory use
         * (the table may be the active s->l1_table) */
        for(i = 0; i < l1_size; i++)
            cpu_to_be64s(&l1_table[i]);
        if (bdrv_pwrite(bs->file, l1_table_offset, l1_table,
            l1_size2) != l1_size2)
            goto fail;
        for(i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    cache_refcount_updates = 0;
    /* Flush the batched refcount block updates */
    write_refcount_block(bs);
    return 0;
fail:
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    cache_refcount_updates = 0;
    write_refcount_block(bs);
    return -EIO;
}
868
869
870
871
872 /*********************************************************/
873 /* refcount checking functions */
874
875
876
877 /*
878 * Increases the refcount for a range of clusters in a given refcount table.
879 * This is used to construct a temporary refcount table out of L1 and L2 tables
880 * which can be compared the the refcount table saved in the image.
881 *
882 * Returns the number of errors in the image that were found
883 */
884 static int inc_refcounts(BlockDriverState *bs,
885 uint16_t *refcount_table,
886 int refcount_table_size,
887 int64_t offset, int64_t size)
888 {
889 BDRVQcowState *s = bs->opaque;
890 int64_t start, last, cluster_offset;
891 int k;
892 int errors = 0;
893
894 if (size <= 0)
895 return 0;
896
897 start = offset & ~(s->cluster_size - 1);
898 last = (offset + size - 1) & ~(s->cluster_size - 1);
899 for(cluster_offset = start; cluster_offset <= last;
900 cluster_offset += s->cluster_size) {
901 k = cluster_offset >> s->cluster_bits;
902 if (k < 0 || k >= refcount_table_size) {
903 fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
904 cluster_offset);
905 errors++;
906 } else {
907 if (++refcount_table[k] == 0) {
908 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
909 "\n", cluster_offset);
910 errors++;
911 }
912 }
913 }
914
915 return errors;
916 }
917
918 /*
919 * Increases the refcount in the given refcount table for the all clusters
920 * referenced in the L2 table. While doing so, performs some checks on L2
921 * entries.
922 *
923 * Returns the number of errors found by the checks or -errno if an internal
924 * error occurred.
925 */
926 static int check_refcounts_l2(BlockDriverState *bs,
927 uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
928 int check_copied)
929 {
930 BDRVQcowState *s = bs->opaque;
931 uint64_t *l2_table, offset;
932 int i, l2_size, nb_csectors, refcount;
933 int errors = 0;
934
935 /* Read L2 table from disk */
936 l2_size = s->l2_size * sizeof(uint64_t);
937 l2_table = qemu_malloc(l2_size);
938
939 if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
940 goto fail;
941
942 /* Do the actual checks */
943 for(i = 0; i < s->l2_size; i++) {
944 offset = be64_to_cpu(l2_table[i]);
945 if (offset != 0) {
946 if (offset & QCOW_OFLAG_COMPRESSED) {
947 /* Compressed clusters don't have QCOW_OFLAG_COPIED */
948 if (offset & QCOW_OFLAG_COPIED) {
949 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
950 "copied flag must never be set for compressed "
951 "clusters\n", offset >> s->cluster_bits);
952 offset &= ~QCOW_OFLAG_COPIED;
953 errors++;
954 }
955
956 /* Mark cluster as used */
957 nb_csectors = ((offset >> s->csize_shift) &
958 s->csize_mask) + 1;
959 offset &= s->cluster_offset_mask;
960 errors += inc_refcounts(bs, refcount_table,
961 refcount_table_size,
962 offset & ~511, nb_csectors * 512);
963 } else {
964 /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
965 if (check_copied) {
966 uint64_t entry = offset;
967 offset &= ~QCOW_OFLAG_COPIED;
968 refcount = get_refcount(bs, offset >> s->cluster_bits);
969 if (refcount < 0) {
970 fprintf(stderr, "Can't get refcount for offset %"
971 PRIx64 ": %s\n", entry, strerror(-refcount));
972 }
973 if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
974 fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
975 PRIx64 " refcount=%d\n", entry, refcount);
976 errors++;
977 }
978 }
979
980 /* Mark cluster as used */
981 offset &= ~QCOW_OFLAG_COPIED;
982 errors += inc_refcounts(bs, refcount_table,
983 refcount_table_size,
984 offset, s->cluster_size);
985
986 /* Correct offsets are cluster aligned */
987 if (offset & (s->cluster_size - 1)) {
988 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
989 "properly aligned; L2 entry corrupted.\n", offset);
990 errors++;
991 }
992 }
993 }
994 }
995
996 qemu_free(l2_table);
997 return errors;
998
999 fail:
1000 fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1001 qemu_free(l2_table);
1002 return -EIO;
1003 }
1004
/*
 * Increases the refcount for the L1 table, its L2 tables and all referenced
 * clusters in the given refcount table. While doing so, performs some checks
 * on L1 and L2 entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l1(BlockDriverState *bs,
    uint16_t *refcount_table,
    int refcount_table_size,
    int64_t l1_table_offset, int l1_size,
    int check_copied)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, l2_offset, l1_size2;
    int i, refcount, ret;
    int errors = 0;

    l1_size2 = l1_size * sizeof(uint64_t);

    /* Mark L1 table as used */
    errors += inc_refcounts(bs, refcount_table, refcount_table_size,
        l1_table_offset, l1_size2);

    /* Read L1 table entries from disk */
    /* An empty L1 table is legal; the check loop below then does nothing */
    if (l1_size2 == 0) {
        l1_table = NULL;
    } else {
        l1_table = qemu_malloc(l1_size2);
        if (bdrv_pread(bs->file, l1_table_offset,
            l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }

    /* Do the actual checks */
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
            if (check_copied) {
                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
                    >> s->cluster_bits);
                if (refcount < 0) {
                    fprintf(stderr, "Can't get refcount for l2_offset %"
                        PRIx64 ": %s\n", l2_offset, strerror(-refcount));
                }
                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
                        " refcount=%d\n", l2_offset, refcount);
                    errors++;
                }
            }

            /* Mark L2 table as used */
            l2_offset &= ~QCOW_OFLAG_COPIED;
            errors += inc_refcounts(bs, refcount_table,
                refcount_table_size,
                l2_offset,
                s->cluster_size);

            /* L2 tables are cluster aligned */
            if (l2_offset & (s->cluster_size - 1)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                    "cluster aligned; L1 entry corrupted\n", l2_offset);
                errors++;
            }

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, refcount_table, refcount_table_size,
                l2_offset, check_copied);
            if (ret < 0) {
                goto fail;
            }
            errors += ret;
        }
    }
    qemu_free(l1_table);
    return errors;

fail:
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
    qemu_free(l1_table);
    return -EIO;
}
1092
1093 /*
1094 * Checks an image for refcount consistency.
1095 *
1096 * Returns 0 if no errors are found, the number of errors in case the image is
1097 * detected as corrupted, and -errno when an internal error occured.
1098 */
1099 int qcow2_check_refcounts(BlockDriverState *bs)
1100 {
1101 BDRVQcowState *s = bs->opaque;
1102 int64_t size;
1103 int nb_clusters, refcount1, refcount2, i;
1104 QCowSnapshot *sn;
1105 uint16_t *refcount_table;
1106 int ret, errors = 0;
1107
1108 size = bdrv_getlength(bs->file);
1109 nb_clusters = size_to_clusters(s, size);
1110 refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
1111
1112 /* header */
1113 errors += inc_refcounts(bs, refcount_table, nb_clusters,
1114 0, s->cluster_size);
1115
1116 /* current L1 table */
1117 ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
1118 s->l1_table_offset, s->l1_size, 1);
1119 if (ret < 0) {
1120 return ret;
1121 }
1122 errors += ret;
1123
1124 /* snapshots */
1125 for(i = 0; i < s->nb_snapshots; i++) {
1126 sn = s->snapshots + i;
1127 check_refcounts_l1(bs, refcount_table, nb_clusters,
1128 sn->l1_table_offset, sn->l1_size, 0);
1129 }
1130 errors += inc_refcounts(bs, refcount_table, nb_clusters,
1131 s->snapshots_offset, s->snapshots_size);
1132
1133 /* refcount data */
1134 errors += inc_refcounts(bs, refcount_table, nb_clusters,
1135 s->refcount_table_offset,
1136 s->refcount_table_size * sizeof(uint64_t));
1137 for(i = 0; i < s->refcount_table_size; i++) {
1138 int64_t offset;
1139 offset = s->refcount_table[i];
1140
1141 /* Refcount blocks are cluster aligned */
1142 if (offset & (s->cluster_size - 1)) {
1143 fprintf(stderr, "ERROR refcount block %d is not "
1144 "cluster aligned; refcount table entry corrupted\n", i);
1145 errors++;
1146 }
1147
1148 if (offset != 0) {
1149 errors += inc_refcounts(bs, refcount_table, nb_clusters,
1150 offset, s->cluster_size);
1151 if (refcount_table[offset / s->cluster_size] != 1) {
1152 fprintf(stderr, "ERROR refcount block %d refcount=%d\n",
1153 i, refcount_table[offset / s->cluster_size]);
1154 }
1155 }
1156 }
1157
1158 /* compare ref counts */
1159 for(i = 0; i < nb_clusters; i++) {
1160 refcount1 = get_refcount(bs, i);
1161 if (refcount1 < 0) {
1162 fprintf(stderr, "Can't get refcount for cluster %d: %s\n",
1163 i, strerror(-refcount1));
1164 }
1165
1166 refcount2 = refcount_table[i];
1167 if (refcount1 != refcount2) {
1168 fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
1169 i, refcount1, refcount2);
1170 errors++;
1171 }
1172 }
1173
1174 qemu_free(refcount_table);
1175
1176 return errors;
1177 }
1178