]> git.proxmox.com Git - qemu.git/blob - block/qcow2.c
qcow2: mark image clean after repair succeeds
[qemu.git] / block / qcow2.c
1 /*
2 * Block driver for the QCOW version 2 format
3 *
4 * Copyright (c) 2004-2006 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "qemu-common.h"
25 #include "block_int.h"
26 #include "module.h"
27 #include <zlib.h>
28 #include "aes.h"
29 #include "block/qcow2.h"
30 #include "qemu-error.h"
31 #include "qerror.h"
32 #include "trace.h"
33
34 /*
35 Differences with QCOW:
36
37 - Support for multiple incremental snapshots.
38 - Memory management by reference counts.
39 - Clusters which have a reference count of one have the bit
40 QCOW_OFLAG_COPIED to optimize write performance.
41 - Size of compressed clusters is stored in sectors to reduce bit usage
42 in the cluster offsets.
43 - Support for storing additional data (such as the VM state) in the
44 snapshots.
45 - If a backing store is used, the cluster size is not constrained
46 (could be backported to QCOW).
47 - L2 tables have always a size of one cluster.
48 */
49
50
/*
 * Fixed-size prefix of a qcow2 header extension as stored in the image:
 * a type magic followed by the length of the payload in bytes.
 * qcow2_read_extensions() converts both fields from big endian after reading.
 */
typedef struct {
    uint32_t magic;
    uint32_t len;
} QCowExtension;

/* Header extension magics handled explicitly by this driver */
#define QCOW2_EXT_MAGIC_END             0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT  0xE2792ACA
#define QCOW2_EXT_MAGIC_FEATURE_TABLE   0x6803f857
58
59 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
60 {
61 const QCowHeader *cow_header = (const void *)buf;
62
63 if (buf_size >= sizeof(QCowHeader) &&
64 be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
65 be32_to_cpu(cow_header->version) >= 2)
66 return 100;
67 else
68 return 0;
69 }
70
71
72 /*
73 * read qcow2 extension and fill bs
74 * start reading from start_offset
75 * finish reading upon magic of value 0 or when end_offset reached
76 * unknown magic is skipped (future extension this version knows nothing about)
77 * return 0 upon success, non-0 otherwise
78 */
79 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
80 uint64_t end_offset, void **p_feature_table)
81 {
82 BDRVQcowState *s = bs->opaque;
83 QCowExtension ext;
84 uint64_t offset;
85 int ret;
86
87 #ifdef DEBUG_EXT
88 printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
89 #endif
90 offset = start_offset;
91 while (offset < end_offset) {
92
93 #ifdef DEBUG_EXT
94 /* Sanity check */
95 if (offset > s->cluster_size)
96 printf("qcow2_read_extension: suspicious offset %lu\n", offset);
97
98 printf("attempting to read extended header in offset %lu\n", offset);
99 #endif
100
101 if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
102 fprintf(stderr, "qcow2_read_extension: ERROR: "
103 "pread fail from offset %" PRIu64 "\n",
104 offset);
105 return 1;
106 }
107 be32_to_cpus(&ext.magic);
108 be32_to_cpus(&ext.len);
109 offset += sizeof(ext);
110 #ifdef DEBUG_EXT
111 printf("ext.magic = 0x%x\n", ext.magic);
112 #endif
113 if (ext.len > end_offset - offset) {
114 error_report("Header extension too large");
115 return -EINVAL;
116 }
117
118 switch (ext.magic) {
119 case QCOW2_EXT_MAGIC_END:
120 return 0;
121
122 case QCOW2_EXT_MAGIC_BACKING_FORMAT:
123 if (ext.len >= sizeof(bs->backing_format)) {
124 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
125 " (>=%zu)\n",
126 ext.len, sizeof(bs->backing_format));
127 return 2;
128 }
129 if (bdrv_pread(bs->file, offset , bs->backing_format,
130 ext.len) != ext.len)
131 return 3;
132 bs->backing_format[ext.len] = '\0';
133 #ifdef DEBUG_EXT
134 printf("Qcow2: Got format extension %s\n", bs->backing_format);
135 #endif
136 break;
137
138 case QCOW2_EXT_MAGIC_FEATURE_TABLE:
139 if (p_feature_table != NULL) {
140 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
141 ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
142 if (ret < 0) {
143 return ret;
144 }
145
146 *p_feature_table = feature_table;
147 }
148 break;
149
150 default:
151 /* unknown magic - save it in case we need to rewrite the header */
152 {
153 Qcow2UnknownHeaderExtension *uext;
154
155 uext = g_malloc0(sizeof(*uext) + ext.len);
156 uext->magic = ext.magic;
157 uext->len = ext.len;
158 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
159
160 ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
161 if (ret < 0) {
162 return ret;
163 }
164 }
165 break;
166 }
167
168 offset += ((ext.len + 7) & ~7);
169 }
170
171 return 0;
172 }
173
174 static void cleanup_unknown_header_ext(BlockDriverState *bs)
175 {
176 BDRVQcowState *s = bs->opaque;
177 Qcow2UnknownHeaderExtension *uext, *next;
178
179 QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
180 QLIST_REMOVE(uext, next);
181 g_free(uext);
182 }
183 }
184
185 static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
186 const char *fmt, ...)
187 {
188 char msg[64];
189 va_list ap;
190
191 va_start(ap, fmt);
192 vsnprintf(msg, sizeof(msg), fmt, ap);
193 va_end(ap);
194
195 qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
196 bs->device_name, "qcow2", msg);
197 }
198
199 static void report_unsupported_feature(BlockDriverState *bs,
200 Qcow2Feature *table, uint64_t mask)
201 {
202 while (table && table->name[0] != '\0') {
203 if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
204 if (mask & (1 << table->bit)) {
205 report_unsupported(bs, "%.46s",table->name);
206 mask &= ~(1 << table->bit);
207 }
208 }
209 table++;
210 }
211
212 if (mask) {
213 report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
214 }
215 }
216
217 /*
218 * Sets the dirty bit and flushes afterwards if necessary.
219 *
220 * The incompatible_features bit is only set if the image file header was
221 * updated successfully. Therefore it is not required to check the return
222 * value of this function.
223 */
224 static int qcow2_mark_dirty(BlockDriverState *bs)
225 {
226 BDRVQcowState *s = bs->opaque;
227 uint64_t val;
228 int ret;
229
230 assert(s->qcow_version >= 3);
231
232 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
233 return 0; /* already dirty */
234 }
235
236 val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
237 ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
238 &val, sizeof(val));
239 if (ret < 0) {
240 return ret;
241 }
242 ret = bdrv_flush(bs->file);
243 if (ret < 0) {
244 return ret;
245 }
246
247 /* Only treat image as dirty if the header was updated successfully */
248 s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
249 return 0;
250 }
251
252 /*
253 * Clears the dirty bit and flushes before if necessary. Only call this
254 * function when there are no pending requests, it does not guard against
255 * concurrent requests dirtying the image.
256 */
257 static int qcow2_mark_clean(BlockDriverState *bs)
258 {
259 BDRVQcowState *s = bs->opaque;
260
261 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
262 int ret = bdrv_flush(bs);
263 if (ret < 0) {
264 return ret;
265 }
266
267 s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
268 return qcow2_update_header(bs);
269 }
270 return 0;
271 }
272
273 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
274 BdrvCheckMode fix)
275 {
276 int ret = qcow2_check_refcounts(bs, result, fix);
277 if (ret < 0) {
278 return ret;
279 }
280
281 if (fix && result->check_errors == 0 && result->corruptions == 0) {
282 return qcow2_mark_clean(bs);
283 }
284 return ret;
285 }
286
287 static int qcow2_open(BlockDriverState *bs, int flags)
288 {
289 BDRVQcowState *s = bs->opaque;
290 int len, i, ret = 0;
291 QCowHeader header;
292 uint64_t ext_end;
293
294 ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
295 if (ret < 0) {
296 goto fail;
297 }
298 be32_to_cpus(&header.magic);
299 be32_to_cpus(&header.version);
300 be64_to_cpus(&header.backing_file_offset);
301 be32_to_cpus(&header.backing_file_size);
302 be64_to_cpus(&header.size);
303 be32_to_cpus(&header.cluster_bits);
304 be32_to_cpus(&header.crypt_method);
305 be64_to_cpus(&header.l1_table_offset);
306 be32_to_cpus(&header.l1_size);
307 be64_to_cpus(&header.refcount_table_offset);
308 be32_to_cpus(&header.refcount_table_clusters);
309 be64_to_cpus(&header.snapshots_offset);
310 be32_to_cpus(&header.nb_snapshots);
311
312 if (header.magic != QCOW_MAGIC) {
313 ret = -EINVAL;
314 goto fail;
315 }
316 if (header.version < 2 || header.version > 3) {
317 report_unsupported(bs, "QCOW version %d", header.version);
318 ret = -ENOTSUP;
319 goto fail;
320 }
321
322 s->qcow_version = header.version;
323
324 /* Initialise version 3 header fields */
325 if (header.version == 2) {
326 header.incompatible_features = 0;
327 header.compatible_features = 0;
328 header.autoclear_features = 0;
329 header.refcount_order = 4;
330 header.header_length = 72;
331 } else {
332 be64_to_cpus(&header.incompatible_features);
333 be64_to_cpus(&header.compatible_features);
334 be64_to_cpus(&header.autoclear_features);
335 be32_to_cpus(&header.refcount_order);
336 be32_to_cpus(&header.header_length);
337 }
338
339 if (header.header_length > sizeof(header)) {
340 s->unknown_header_fields_size = header.header_length - sizeof(header);
341 s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
342 ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
343 s->unknown_header_fields_size);
344 if (ret < 0) {
345 goto fail;
346 }
347 }
348
349 if (header.backing_file_offset) {
350 ext_end = header.backing_file_offset;
351 } else {
352 ext_end = 1 << header.cluster_bits;
353 }
354
355 /* Handle feature bits */
356 s->incompatible_features = header.incompatible_features;
357 s->compatible_features = header.compatible_features;
358 s->autoclear_features = header.autoclear_features;
359
360 if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
361 void *feature_table = NULL;
362 qcow2_read_extensions(bs, header.header_length, ext_end,
363 &feature_table);
364 report_unsupported_feature(bs, feature_table,
365 s->incompatible_features &
366 ~QCOW2_INCOMPAT_MASK);
367 ret = -ENOTSUP;
368 goto fail;
369 }
370
371 /* Check support for various header values */
372 if (header.refcount_order != 4) {
373 report_unsupported(bs, "%d bit reference counts",
374 1 << header.refcount_order);
375 ret = -ENOTSUP;
376 goto fail;
377 }
378
379 if (header.cluster_bits < MIN_CLUSTER_BITS ||
380 header.cluster_bits > MAX_CLUSTER_BITS) {
381 ret = -EINVAL;
382 goto fail;
383 }
384 if (header.crypt_method > QCOW_CRYPT_AES) {
385 ret = -EINVAL;
386 goto fail;
387 }
388 s->crypt_method_header = header.crypt_method;
389 if (s->crypt_method_header) {
390 bs->encrypted = 1;
391 }
392 s->cluster_bits = header.cluster_bits;
393 s->cluster_size = 1 << s->cluster_bits;
394 s->cluster_sectors = 1 << (s->cluster_bits - 9);
395 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
396 s->l2_size = 1 << s->l2_bits;
397 bs->total_sectors = header.size / 512;
398 s->csize_shift = (62 - (s->cluster_bits - 8));
399 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
400 s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
401 s->refcount_table_offset = header.refcount_table_offset;
402 s->refcount_table_size =
403 header.refcount_table_clusters << (s->cluster_bits - 3);
404
405 s->snapshots_offset = header.snapshots_offset;
406 s->nb_snapshots = header.nb_snapshots;
407
408 /* read the level 1 table */
409 s->l1_size = header.l1_size;
410 s->l1_vm_state_index = size_to_l1(s, header.size);
411 /* the L1 table must contain at least enough entries to put
412 header.size bytes */
413 if (s->l1_size < s->l1_vm_state_index) {
414 ret = -EINVAL;
415 goto fail;
416 }
417 s->l1_table_offset = header.l1_table_offset;
418 if (s->l1_size > 0) {
419 s->l1_table = g_malloc0(
420 align_offset(s->l1_size * sizeof(uint64_t), 512));
421 ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
422 s->l1_size * sizeof(uint64_t));
423 if (ret < 0) {
424 goto fail;
425 }
426 for(i = 0;i < s->l1_size; i++) {
427 be64_to_cpus(&s->l1_table[i]);
428 }
429 }
430
431 /* alloc L2 table/refcount block cache */
432 s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
433 s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
434
435 s->cluster_cache = g_malloc(s->cluster_size);
436 /* one more sector for decompressed data alignment */
437 s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
438 + 512);
439 s->cluster_cache_offset = -1;
440 s->flags = flags;
441
442 ret = qcow2_refcount_init(bs);
443 if (ret != 0) {
444 goto fail;
445 }
446
447 QLIST_INIT(&s->cluster_allocs);
448
449 /* read qcow2 extensions */
450 if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
451 ret = -EINVAL;
452 goto fail;
453 }
454
455 /* read the backing file name */
456 if (header.backing_file_offset != 0) {
457 len = header.backing_file_size;
458 if (len > 1023) {
459 len = 1023;
460 }
461 ret = bdrv_pread(bs->file, header.backing_file_offset,
462 bs->backing_file, len);
463 if (ret < 0) {
464 goto fail;
465 }
466 bs->backing_file[len] = '\0';
467 }
468
469 ret = qcow2_read_snapshots(bs);
470 if (ret < 0) {
471 goto fail;
472 }
473
474 /* Clear unknown autoclear feature bits */
475 if (!bs->read_only && s->autoclear_features != 0) {
476 s->autoclear_features = 0;
477 ret = qcow2_update_header(bs);
478 if (ret < 0) {
479 goto fail;
480 }
481 }
482
483 /* Initialise locks */
484 qemu_co_mutex_init(&s->lock);
485
486 /* Repair image if dirty */
487 if ((s->incompatible_features & QCOW2_INCOMPAT_DIRTY) &&
488 !bs->read_only) {
489 BdrvCheckResult result = {0};
490
491 ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
492 if (ret < 0) {
493 goto fail;
494 }
495 }
496
497 #ifdef DEBUG_ALLOC
498 {
499 BdrvCheckResult result = {0};
500 qcow2_check_refcounts(bs, &result, 0);
501 }
502 #endif
503 return ret;
504
505 fail:
506 g_free(s->unknown_header_fields);
507 cleanup_unknown_header_ext(bs);
508 qcow2_free_snapshots(bs);
509 qcow2_refcount_close(bs);
510 g_free(s->l1_table);
511 if (s->l2_table_cache) {
512 qcow2_cache_destroy(bs, s->l2_table_cache);
513 }
514 g_free(s->cluster_cache);
515 qemu_vfree(s->cluster_data);
516 return ret;
517 }
518
519 static int qcow2_set_key(BlockDriverState *bs, const char *key)
520 {
521 BDRVQcowState *s = bs->opaque;
522 uint8_t keybuf[16];
523 int len, i;
524
525 memset(keybuf, 0, 16);
526 len = strlen(key);
527 if (len > 16)
528 len = 16;
529 /* XXX: we could compress the chars to 7 bits to increase
530 entropy */
531 for(i = 0;i < len;i++) {
532 keybuf[i] = key[i];
533 }
534 s->crypt_method = s->crypt_method_header;
535
536 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
537 return -1;
538 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
539 return -1;
540 #if 0
541 /* test */
542 {
543 uint8_t in[16];
544 uint8_t out[16];
545 uint8_t tmp[16];
546 for(i=0;i<16;i++)
547 in[i] = i;
548 AES_encrypt(in, tmp, &s->aes_encrypt_key);
549 AES_decrypt(tmp, out, &s->aes_decrypt_key);
550 for(i = 0; i < 16; i++)
551 printf(" %02x", tmp[i]);
552 printf("\n");
553 for(i = 0; i < 16; i++)
554 printf(" %02x", out[i]);
555 printf("\n");
556 }
557 #endif
558 return 0;
559 }
560
561 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
562 int64_t sector_num, int nb_sectors, int *pnum)
563 {
564 BDRVQcowState *s = bs->opaque;
565 uint64_t cluster_offset;
566 int ret;
567
568 *pnum = nb_sectors;
569 /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
570 * can't pass them on today */
571 qemu_co_mutex_lock(&s->lock);
572 ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
573 qemu_co_mutex_unlock(&s->lock);
574 if (ret < 0) {
575 *pnum = 0;
576 }
577
578 return (cluster_offset != 0);
579 }
580
581 /* handle reading after the end of the backing file */
582 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
583 int64_t sector_num, int nb_sectors)
584 {
585 int n1;
586 if ((sector_num + nb_sectors) <= bs->total_sectors)
587 return nb_sectors;
588 if (sector_num >= bs->total_sectors)
589 n1 = 0;
590 else
591 n1 = bs->total_sectors - sector_num;
592
593 qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
594
595 return n1;
596 }
597
598 static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
599 int remaining_sectors, QEMUIOVector *qiov)
600 {
601 BDRVQcowState *s = bs->opaque;
602 int index_in_cluster, n1;
603 int ret;
604 int cur_nr_sectors; /* number of sectors in current iteration */
605 uint64_t cluster_offset = 0;
606 uint64_t bytes_done = 0;
607 QEMUIOVector hd_qiov;
608 uint8_t *cluster_data = NULL;
609
610 qemu_iovec_init(&hd_qiov, qiov->niov);
611
612 qemu_co_mutex_lock(&s->lock);
613
614 while (remaining_sectors != 0) {
615
616 /* prepare next request */
617 cur_nr_sectors = remaining_sectors;
618 if (s->crypt_method) {
619 cur_nr_sectors = MIN(cur_nr_sectors,
620 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
621 }
622
623 ret = qcow2_get_cluster_offset(bs, sector_num << 9,
624 &cur_nr_sectors, &cluster_offset);
625 if (ret < 0) {
626 goto fail;
627 }
628
629 index_in_cluster = sector_num & (s->cluster_sectors - 1);
630
631 qemu_iovec_reset(&hd_qiov);
632 qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
633 cur_nr_sectors * 512);
634
635 switch (ret) {
636 case QCOW2_CLUSTER_UNALLOCATED:
637
638 if (bs->backing_hd) {
639 /* read from the base image */
640 n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
641 sector_num, cur_nr_sectors);
642 if (n1 > 0) {
643 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
644 qemu_co_mutex_unlock(&s->lock);
645 ret = bdrv_co_readv(bs->backing_hd, sector_num,
646 n1, &hd_qiov);
647 qemu_co_mutex_lock(&s->lock);
648 if (ret < 0) {
649 goto fail;
650 }
651 }
652 } else {
653 /* Note: in this case, no need to wait */
654 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
655 }
656 break;
657
658 case QCOW2_CLUSTER_ZERO:
659 if (s->qcow_version < 3) {
660 ret = -EIO;
661 goto fail;
662 }
663 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
664 break;
665
666 case QCOW2_CLUSTER_COMPRESSED:
667 /* add AIO support for compressed blocks ? */
668 ret = qcow2_decompress_cluster(bs, cluster_offset);
669 if (ret < 0) {
670 goto fail;
671 }
672
673 qemu_iovec_from_buf(&hd_qiov, 0,
674 s->cluster_cache + index_in_cluster * 512,
675 512 * cur_nr_sectors);
676 break;
677
678 case QCOW2_CLUSTER_NORMAL:
679 if ((cluster_offset & 511) != 0) {
680 ret = -EIO;
681 goto fail;
682 }
683
684 if (s->crypt_method) {
685 /*
686 * For encrypted images, read everything into a temporary
687 * contiguous buffer on which the AES functions can work.
688 */
689 if (!cluster_data) {
690 cluster_data =
691 qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
692 }
693
694 assert(cur_nr_sectors <=
695 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
696 qemu_iovec_reset(&hd_qiov);
697 qemu_iovec_add(&hd_qiov, cluster_data,
698 512 * cur_nr_sectors);
699 }
700
701 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
702 qemu_co_mutex_unlock(&s->lock);
703 ret = bdrv_co_readv(bs->file,
704 (cluster_offset >> 9) + index_in_cluster,
705 cur_nr_sectors, &hd_qiov);
706 qemu_co_mutex_lock(&s->lock);
707 if (ret < 0) {
708 goto fail;
709 }
710 if (s->crypt_method) {
711 qcow2_encrypt_sectors(s, sector_num, cluster_data,
712 cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
713 qemu_iovec_from_buf(qiov, bytes_done,
714 cluster_data, 512 * cur_nr_sectors);
715 }
716 break;
717
718 default:
719 g_assert_not_reached();
720 ret = -EIO;
721 goto fail;
722 }
723
724 remaining_sectors -= cur_nr_sectors;
725 sector_num += cur_nr_sectors;
726 bytes_done += cur_nr_sectors * 512;
727 }
728 ret = 0;
729
730 fail:
731 qemu_co_mutex_unlock(&s->lock);
732
733 qemu_iovec_destroy(&hd_qiov);
734 qemu_vfree(cluster_data);
735
736 return ret;
737 }
738
739 static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
740 {
741 /* Take the request off the list of running requests */
742 if (m->nb_clusters != 0) {
743 QLIST_REMOVE(m, next_in_flight);
744 }
745
746 /* Restart all dependent requests */
747 if (!qemu_co_queue_empty(&m->dependent_requests)) {
748 qemu_co_mutex_unlock(&s->lock);
749 qemu_co_queue_restart_all(&m->dependent_requests);
750 qemu_co_mutex_lock(&s->lock);
751 }
752 }
753
754 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
755 int64_t sector_num,
756 int remaining_sectors,
757 QEMUIOVector *qiov)
758 {
759 BDRVQcowState *s = bs->opaque;
760 int index_in_cluster;
761 int n_end;
762 int ret;
763 int cur_nr_sectors; /* number of sectors in current iteration */
764 uint64_t cluster_offset;
765 QEMUIOVector hd_qiov;
766 uint64_t bytes_done = 0;
767 uint8_t *cluster_data = NULL;
768 QCowL2Meta l2meta = {
769 .nb_clusters = 0,
770 };
771
772 trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
773 remaining_sectors);
774
775 qemu_co_queue_init(&l2meta.dependent_requests);
776
777 qemu_iovec_init(&hd_qiov, qiov->niov);
778
779 s->cluster_cache_offset = -1; /* disable compressed cache */
780
781 qemu_co_mutex_lock(&s->lock);
782
783 while (remaining_sectors != 0) {
784
785 trace_qcow2_writev_start_part(qemu_coroutine_self());
786 index_in_cluster = sector_num & (s->cluster_sectors - 1);
787 n_end = index_in_cluster + remaining_sectors;
788 if (s->crypt_method &&
789 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
790 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
791 }
792
793 ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
794 index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
795 if (ret < 0) {
796 goto fail;
797 }
798
799 if (l2meta.nb_clusters > 0 &&
800 (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)) {
801 qcow2_mark_dirty(bs);
802 }
803
804 cluster_offset = l2meta.cluster_offset;
805 assert((cluster_offset & 511) == 0);
806
807 qemu_iovec_reset(&hd_qiov);
808 qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
809 cur_nr_sectors * 512);
810
811 if (s->crypt_method) {
812 if (!cluster_data) {
813 cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
814 s->cluster_size);
815 }
816
817 assert(hd_qiov.size <=
818 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
819 qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
820
821 qcow2_encrypt_sectors(s, sector_num, cluster_data,
822 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
823
824 qemu_iovec_reset(&hd_qiov);
825 qemu_iovec_add(&hd_qiov, cluster_data,
826 cur_nr_sectors * 512);
827 }
828
829 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
830 qemu_co_mutex_unlock(&s->lock);
831 trace_qcow2_writev_data(qemu_coroutine_self(),
832 (cluster_offset >> 9) + index_in_cluster);
833 ret = bdrv_co_writev(bs->file,
834 (cluster_offset >> 9) + index_in_cluster,
835 cur_nr_sectors, &hd_qiov);
836 qemu_co_mutex_lock(&s->lock);
837 if (ret < 0) {
838 goto fail;
839 }
840
841 ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
842 if (ret < 0) {
843 goto fail;
844 }
845
846 run_dependent_requests(s, &l2meta);
847
848 remaining_sectors -= cur_nr_sectors;
849 sector_num += cur_nr_sectors;
850 bytes_done += cur_nr_sectors * 512;
851 trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
852 }
853 ret = 0;
854
855 fail:
856 run_dependent_requests(s, &l2meta);
857
858 qemu_co_mutex_unlock(&s->lock);
859
860 qemu_iovec_destroy(&hd_qiov);
861 qemu_vfree(cluster_data);
862 trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
863
864 return ret;
865 }
866
867 static void qcow2_close(BlockDriverState *bs)
868 {
869 BDRVQcowState *s = bs->opaque;
870 g_free(s->l1_table);
871
872 qcow2_cache_flush(bs, s->l2_table_cache);
873 qcow2_cache_flush(bs, s->refcount_block_cache);
874
875 qcow2_mark_clean(bs);
876
877 qcow2_cache_destroy(bs, s->l2_table_cache);
878 qcow2_cache_destroy(bs, s->refcount_block_cache);
879
880 g_free(s->unknown_header_fields);
881 cleanup_unknown_header_ext(bs);
882
883 g_free(s->cluster_cache);
884 qemu_vfree(s->cluster_data);
885 qcow2_refcount_close(bs);
886 qcow2_free_snapshots(bs);
887 }
888
889 static void qcow2_invalidate_cache(BlockDriverState *bs)
890 {
891 BDRVQcowState *s = bs->opaque;
892 int flags = s->flags;
893 AES_KEY aes_encrypt_key;
894 AES_KEY aes_decrypt_key;
895 uint32_t crypt_method = 0;
896
897 /*
898 * Backing files are read-only which makes all of their metadata immutable,
899 * that means we don't have to worry about reopening them here.
900 */
901
902 if (s->crypt_method) {
903 crypt_method = s->crypt_method;
904 memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
905 memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
906 }
907
908 qcow2_close(bs);
909
910 memset(s, 0, sizeof(BDRVQcowState));
911 qcow2_open(bs, flags);
912
913 if (crypt_method) {
914 s->crypt_method = crypt_method;
915 memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
916 memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
917 }
918 }
919
920 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
921 size_t len, size_t buflen)
922 {
923 QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
924 size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
925
926 if (buflen < ext_len) {
927 return -ENOSPC;
928 }
929
930 *ext_backing_fmt = (QCowExtension) {
931 .magic = cpu_to_be32(magic),
932 .len = cpu_to_be32(len),
933 };
934 memcpy(buf + sizeof(QCowExtension), s, len);
935
936 return ext_len;
937 }
938
939 /*
940 * Updates the qcow2 header, including the variable length parts of it, i.e.
941 * the backing file name and all extensions. qcow2 was not designed to allow
942 * such changes, so if we run out of space (we can only use the first cluster)
943 * this function may fail.
944 *
945 * Returns 0 on success, -errno in error cases.
946 */
947 int qcow2_update_header(BlockDriverState *bs)
948 {
949 BDRVQcowState *s = bs->opaque;
950 QCowHeader *header;
951 char *buf;
952 size_t buflen = s->cluster_size;
953 int ret;
954 uint64_t total_size;
955 uint32_t refcount_table_clusters;
956 size_t header_length;
957 Qcow2UnknownHeaderExtension *uext;
958
959 buf = qemu_blockalign(bs, buflen);
960
961 /* Header structure */
962 header = (QCowHeader*) buf;
963
964 if (buflen < sizeof(*header)) {
965 ret = -ENOSPC;
966 goto fail;
967 }
968
969 header_length = sizeof(*header) + s->unknown_header_fields_size;
970 total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
971 refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
972
973 *header = (QCowHeader) {
974 /* Version 2 fields */
975 .magic = cpu_to_be32(QCOW_MAGIC),
976 .version = cpu_to_be32(s->qcow_version),
977 .backing_file_offset = 0,
978 .backing_file_size = 0,
979 .cluster_bits = cpu_to_be32(s->cluster_bits),
980 .size = cpu_to_be64(total_size),
981 .crypt_method = cpu_to_be32(s->crypt_method_header),
982 .l1_size = cpu_to_be32(s->l1_size),
983 .l1_table_offset = cpu_to_be64(s->l1_table_offset),
984 .refcount_table_offset = cpu_to_be64(s->refcount_table_offset),
985 .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
986 .nb_snapshots = cpu_to_be32(s->nb_snapshots),
987 .snapshots_offset = cpu_to_be64(s->snapshots_offset),
988
989 /* Version 3 fields */
990 .incompatible_features = cpu_to_be64(s->incompatible_features),
991 .compatible_features = cpu_to_be64(s->compatible_features),
992 .autoclear_features = cpu_to_be64(s->autoclear_features),
993 .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT),
994 .header_length = cpu_to_be32(header_length),
995 };
996
997 /* For older versions, write a shorter header */
998 switch (s->qcow_version) {
999 case 2:
1000 ret = offsetof(QCowHeader, incompatible_features);
1001 break;
1002 case 3:
1003 ret = sizeof(*header);
1004 break;
1005 default:
1006 ret = -EINVAL;
1007 goto fail;
1008 }
1009
1010 buf += ret;
1011 buflen -= ret;
1012 memset(buf, 0, buflen);
1013
1014 /* Preserve any unknown field in the header */
1015 if (s->unknown_header_fields_size) {
1016 if (buflen < s->unknown_header_fields_size) {
1017 ret = -ENOSPC;
1018 goto fail;
1019 }
1020
1021 memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
1022 buf += s->unknown_header_fields_size;
1023 buflen -= s->unknown_header_fields_size;
1024 }
1025
1026 /* Backing file format header extension */
1027 if (*bs->backing_format) {
1028 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
1029 bs->backing_format, strlen(bs->backing_format),
1030 buflen);
1031 if (ret < 0) {
1032 goto fail;
1033 }
1034
1035 buf += ret;
1036 buflen -= ret;
1037 }
1038
1039 /* Feature table */
1040 Qcow2Feature features[] = {
1041 {
1042 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1043 .bit = QCOW2_INCOMPAT_DIRTY_BITNR,
1044 .name = "dirty bit",
1045 },
1046 {
1047 .type = QCOW2_FEAT_TYPE_COMPATIBLE,
1048 .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
1049 .name = "lazy refcounts",
1050 },
1051 };
1052
1053 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
1054 features, sizeof(features), buflen);
1055 if (ret < 0) {
1056 goto fail;
1057 }
1058 buf += ret;
1059 buflen -= ret;
1060
1061 /* Keep unknown header extensions */
1062 QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
1063 ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
1064 if (ret < 0) {
1065 goto fail;
1066 }
1067
1068 buf += ret;
1069 buflen -= ret;
1070 }
1071
1072 /* End of header extensions */
1073 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
1074 if (ret < 0) {
1075 goto fail;
1076 }
1077
1078 buf += ret;
1079 buflen -= ret;
1080
1081 /* Backing file name */
1082 if (*bs->backing_file) {
1083 size_t backing_file_len = strlen(bs->backing_file);
1084
1085 if (buflen < backing_file_len) {
1086 ret = -ENOSPC;
1087 goto fail;
1088 }
1089
1090 strncpy(buf, bs->backing_file, buflen);
1091
1092 header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
1093 header->backing_file_size = cpu_to_be32(backing_file_len);
1094 }
1095
1096 /* Write the new header */
1097 ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
1098 if (ret < 0) {
1099 goto fail;
1100 }
1101
1102 ret = 0;
1103 fail:
1104 qemu_vfree(header);
1105 return ret;
1106 }
1107
1108 static int qcow2_change_backing_file(BlockDriverState *bs,
1109 const char *backing_file, const char *backing_fmt)
1110 {
1111 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1112 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1113
1114 return qcow2_update_header(bs);
1115 }
1116
1117 static int preallocate(BlockDriverState *bs)
1118 {
1119 uint64_t nb_sectors;
1120 uint64_t offset;
1121 int num;
1122 int ret;
1123 QCowL2Meta meta;
1124
1125 nb_sectors = bdrv_getlength(bs) >> 9;
1126 offset = 0;
1127 qemu_co_queue_init(&meta.dependent_requests);
1128 meta.cluster_offset = 0;
1129
1130 while (nb_sectors) {
1131 num = MIN(nb_sectors, INT_MAX >> 9);
1132 ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
1133 if (ret < 0) {
1134 return ret;
1135 }
1136
1137 ret = qcow2_alloc_cluster_link_l2(bs, &meta);
1138 if (ret < 0) {
1139 qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
1140 return ret;
1141 }
1142
1143 /* There are no dependent requests, but we need to remove our request
1144 * from the list of in-flight requests */
1145 run_dependent_requests(bs->opaque, &meta);
1146
1147 /* TODO Preallocate data if requested */
1148
1149 nb_sectors -= num;
1150 offset += num << 9;
1151 }
1152
1153 /*
1154 * It is expected that the image file is large enough to actually contain
1155 * all of the allocated clusters (otherwise we get failing reads after
1156 * EOF). Extend the image to the last allocated sector.
1157 */
1158 if (meta.cluster_offset != 0) {
1159 uint8_t buf[512];
1160 memset(buf, 0, 512);
1161 ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
1162 if (ret < 0) {
1163 return ret;
1164 }
1165 }
1166
1167 return 0;
1168 }
1169
1170 static int qcow2_create2(const char *filename, int64_t total_size,
1171 const char *backing_file, const char *backing_format,
1172 int flags, size_t cluster_size, int prealloc,
1173 QEMUOptionParameter *options, int version)
1174 {
1175 /* Calculate cluster_bits */
1176 int cluster_bits;
1177 cluster_bits = ffs(cluster_size) - 1;
1178 if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1179 (1 << cluster_bits) != cluster_size)
1180 {
1181 error_report(
1182 "Cluster size must be a power of two between %d and %dk",
1183 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1184 return -EINVAL;
1185 }
1186
1187 /*
1188 * Open the image file and write a minimal qcow2 header.
1189 *
1190 * We keep things simple and start with a zero-sized image. We also
1191 * do without refcount blocks or a L1 table for now. We'll fix the
1192 * inconsistency later.
1193 *
1194 * We do need a refcount table because growing the refcount table means
1195 * allocating two new refcount blocks - the seconds of which would be at
1196 * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1197 * size for any qcow2 image.
1198 */
1199 BlockDriverState* bs;
1200 QCowHeader header;
1201 uint8_t* refcount_table;
1202 int ret;
1203
1204 ret = bdrv_create_file(filename, options);
1205 if (ret < 0) {
1206 return ret;
1207 }
1208
1209 ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1210 if (ret < 0) {
1211 return ret;
1212 }
1213
1214 /* Write the header */
1215 memset(&header, 0, sizeof(header));
1216 header.magic = cpu_to_be32(QCOW_MAGIC);
1217 header.version = cpu_to_be32(version);
1218 header.cluster_bits = cpu_to_be32(cluster_bits);
1219 header.size = cpu_to_be64(0);
1220 header.l1_table_offset = cpu_to_be64(0);
1221 header.l1_size = cpu_to_be32(0);
1222 header.refcount_table_offset = cpu_to_be64(cluster_size);
1223 header.refcount_table_clusters = cpu_to_be32(1);
1224 header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
1225 header.header_length = cpu_to_be32(sizeof(header));
1226
1227 if (flags & BLOCK_FLAG_ENCRYPT) {
1228 header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1229 } else {
1230 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1231 }
1232
1233 if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
1234 header.compatible_features |=
1235 cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
1236 }
1237
1238 ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
1239 if (ret < 0) {
1240 goto out;
1241 }
1242
1243 /* Write an empty refcount table */
1244 refcount_table = g_malloc0(cluster_size);
1245 ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
1246 g_free(refcount_table);
1247
1248 if (ret < 0) {
1249 goto out;
1250 }
1251
1252 bdrv_close(bs);
1253
1254 /*
1255 * And now open the image and make it consistent first (i.e. increase the
1256 * refcount of the cluster that is occupied by the header and the refcount
1257 * table)
1258 */
1259 BlockDriver* drv = bdrv_find_format("qcow2");
1260 assert(drv != NULL);
1261 ret = bdrv_open(bs, filename,
1262 BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
1263 if (ret < 0) {
1264 goto out;
1265 }
1266
1267 ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
1268 if (ret < 0) {
1269 goto out;
1270
1271 } else if (ret != 0) {
1272 error_report("Huh, first cluster in empty image is already in use?");
1273 abort();
1274 }
1275
1276 /* Okay, now that we have a valid image, let's give it the right size */
1277 ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
1278 if (ret < 0) {
1279 goto out;
1280 }
1281
1282 /* Want a backing file? There you go.*/
1283 if (backing_file) {
1284 ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1285 if (ret < 0) {
1286 goto out;
1287 }
1288 }
1289
1290 /* And if we're supposed to preallocate metadata, do that now */
1291 if (prealloc) {
1292 BDRVQcowState *s = bs->opaque;
1293 qemu_co_mutex_lock(&s->lock);
1294 ret = preallocate(bs);
1295 qemu_co_mutex_unlock(&s->lock);
1296 if (ret < 0) {
1297 goto out;
1298 }
1299 }
1300
1301 ret = 0;
1302 out:
1303 bdrv_delete(bs);
1304 return ret;
1305 }
1306
1307 static int qcow2_create(const char *filename, QEMUOptionParameter *options)
1308 {
1309 const char *backing_file = NULL;
1310 const char *backing_fmt = NULL;
1311 uint64_t sectors = 0;
1312 int flags = 0;
1313 size_t cluster_size = DEFAULT_CLUSTER_SIZE;
1314 int prealloc = 0;
1315 int version = 2;
1316
1317 /* Read out options */
1318 while (options && options->name) {
1319 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1320 sectors = options->value.n / 512;
1321 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1322 backing_file = options->value.s;
1323 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1324 backing_fmt = options->value.s;
1325 } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1326 flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1327 } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1328 if (options->value.n) {
1329 cluster_size = options->value.n;
1330 }
1331 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1332 if (!options->value.s || !strcmp(options->value.s, "off")) {
1333 prealloc = 0;
1334 } else if (!strcmp(options->value.s, "metadata")) {
1335 prealloc = 1;
1336 } else {
1337 fprintf(stderr, "Invalid preallocation mode: '%s'\n",
1338 options->value.s);
1339 return -EINVAL;
1340 }
1341 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
1342 if (!options->value.s || !strcmp(options->value.s, "0.10")) {
1343 version = 2;
1344 } else if (!strcmp(options->value.s, "1.1")) {
1345 version = 3;
1346 } else {
1347 fprintf(stderr, "Invalid compatibility level: '%s'\n",
1348 options->value.s);
1349 return -EINVAL;
1350 }
1351 } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
1352 flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
1353 }
1354 options++;
1355 }
1356
1357 if (backing_file && prealloc) {
1358 fprintf(stderr, "Backing file and preallocation cannot be used at "
1359 "the same time\n");
1360 return -EINVAL;
1361 }
1362
1363 if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
1364 fprintf(stderr, "Lazy refcounts only supported with compatibility "
1365 "level 1.1 and above (use compat=1.1 or greater)\n");
1366 return -EINVAL;
1367 }
1368
1369 return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
1370 cluster_size, prealloc, options, version);
1371 }
1372
1373 static int qcow2_make_empty(BlockDriverState *bs)
1374 {
1375 #if 0
1376 /* XXX: not correct */
1377 BDRVQcowState *s = bs->opaque;
1378 uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1379 int ret;
1380
1381 memset(s->l1_table, 0, l1_length);
1382 if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
1383 return -1;
1384 ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
1385 if (ret < 0)
1386 return ret;
1387
1388 l2_cache_reset(bs);
1389 #endif
1390 return 0;
1391 }
1392
1393 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
1394 int64_t sector_num, int nb_sectors)
1395 {
1396 int ret;
1397 BDRVQcowState *s = bs->opaque;
1398
1399 /* Emulate misaligned zero writes */
1400 if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
1401 return -ENOTSUP;
1402 }
1403
1404 /* Whatever is left can use real zero clusters */
1405 qemu_co_mutex_lock(&s->lock);
1406 ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1407 nb_sectors);
1408 qemu_co_mutex_unlock(&s->lock);
1409
1410 return ret;
1411 }
1412
1413 static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
1414 int64_t sector_num, int nb_sectors)
1415 {
1416 int ret;
1417 BDRVQcowState *s = bs->opaque;
1418
1419 qemu_co_mutex_lock(&s->lock);
1420 ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1421 nb_sectors);
1422 qemu_co_mutex_unlock(&s->lock);
1423 return ret;
1424 }
1425
1426 static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
1427 {
1428 BDRVQcowState *s = bs->opaque;
1429 int ret, new_l1_size;
1430
1431 if (offset & 511) {
1432 error_report("The new size must be a multiple of 512");
1433 return -EINVAL;
1434 }
1435
1436 /* cannot proceed if image has snapshots */
1437 if (s->nb_snapshots) {
1438 error_report("Can't resize an image which has snapshots");
1439 return -ENOTSUP;
1440 }
1441
1442 /* shrinking is currently not supported */
1443 if (offset < bs->total_sectors * 512) {
1444 error_report("qcow2 doesn't support shrinking images yet");
1445 return -ENOTSUP;
1446 }
1447
1448 new_l1_size = size_to_l1(s, offset);
1449 ret = qcow2_grow_l1_table(bs, new_l1_size, true);
1450 if (ret < 0) {
1451 return ret;
1452 }
1453
1454 /* write updated header.size */
1455 offset = cpu_to_be64(offset);
1456 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
1457 &offset, sizeof(uint64_t));
1458 if (ret < 0) {
1459 return ret;
1460 }
1461
1462 s->l1_vm_state_index = new_l1_size;
1463 return 0;
1464 }
1465
1466 /* XXX: put compressed sectors first, then all the cluster aligned
1467 tables to avoid losing bytes in alignment */
1468 static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
1469 const uint8_t *buf, int nb_sectors)
1470 {
1471 BDRVQcowState *s = bs->opaque;
1472 z_stream strm;
1473 int ret, out_len;
1474 uint8_t *out_buf;
1475 uint64_t cluster_offset;
1476
1477 if (nb_sectors == 0) {
1478 /* align end of file to a sector boundary to ease reading with
1479 sector based I/Os */
1480 cluster_offset = bdrv_getlength(bs->file);
1481 cluster_offset = (cluster_offset + 511) & ~511;
1482 bdrv_truncate(bs->file, cluster_offset);
1483 return 0;
1484 }
1485
1486 if (nb_sectors != s->cluster_sectors)
1487 return -EINVAL;
1488
1489 out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1490
1491 /* best compression, small window, no zlib header */
1492 memset(&strm, 0, sizeof(strm));
1493 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1494 Z_DEFLATED, -12,
1495 9, Z_DEFAULT_STRATEGY);
1496 if (ret != 0) {
1497 ret = -EINVAL;
1498 goto fail;
1499 }
1500
1501 strm.avail_in = s->cluster_size;
1502 strm.next_in = (uint8_t *)buf;
1503 strm.avail_out = s->cluster_size;
1504 strm.next_out = out_buf;
1505
1506 ret = deflate(&strm, Z_FINISH);
1507 if (ret != Z_STREAM_END && ret != Z_OK) {
1508 deflateEnd(&strm);
1509 ret = -EINVAL;
1510 goto fail;
1511 }
1512 out_len = strm.next_out - out_buf;
1513
1514 deflateEnd(&strm);
1515
1516 if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1517 /* could not compress: write normal cluster */
1518 ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
1519 if (ret < 0) {
1520 goto fail;
1521 }
1522 } else {
1523 cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
1524 sector_num << 9, out_len);
1525 if (!cluster_offset) {
1526 ret = -EIO;
1527 goto fail;
1528 }
1529 cluster_offset &= s->cluster_offset_mask;
1530 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1531 ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
1532 if (ret < 0) {
1533 goto fail;
1534 }
1535 }
1536
1537 ret = 0;
1538 fail:
1539 g_free(out_buf);
1540 return ret;
1541 }
1542
1543 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
1544 {
1545 BDRVQcowState *s = bs->opaque;
1546 int ret;
1547
1548 qemu_co_mutex_lock(&s->lock);
1549 ret = qcow2_cache_flush(bs, s->l2_table_cache);
1550 if (ret < 0) {
1551 qemu_co_mutex_unlock(&s->lock);
1552 return ret;
1553 }
1554
1555 if (qcow2_need_accurate_refcounts(s)) {
1556 ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1557 if (ret < 0) {
1558 qemu_co_mutex_unlock(&s->lock);
1559 return ret;
1560 }
1561 }
1562 qemu_co_mutex_unlock(&s->lock);
1563
1564 return 0;
1565 }
1566
1567 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
1568 {
1569 return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
1570 }
1571
1572 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1573 {
1574 BDRVQcowState *s = bs->opaque;
1575 bdi->cluster_size = s->cluster_size;
1576 bdi->vm_state_offset = qcow2_vm_state_offset(s);
1577 return 0;
1578 }
1579
#if 0
/*
 * Debugging aid (compiled out): walks all clusters of the image file and
 * prints one line per run of consecutive clusters sharing a refcount.
 *
 * NOTE(review): the first printed number is the index just past the run,
 * not the index where the run starts -- confirm that this is intended.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t file_size = bdrv_getlength(bs->file);
    int64_t nb_clusters = size_to_clusters(s, file_size);
    int64_t k = 0;

    while (k < nb_clusters) {
        int64_t run_start = k;
        int refcount = get_refcount(bs, k);

        k++;
        while (k < nb_clusters && get_refcount(bs, k) == refcount) {
            k++;
        }
        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
               k - run_start);
    }
}
#endif
1600
1601 static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1602 int64_t pos, int size)
1603 {
1604 BDRVQcowState *s = bs->opaque;
1605 int growable = bs->growable;
1606 int ret;
1607
1608 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
1609 bs->growable = 1;
1610 ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
1611 bs->growable = growable;
1612
1613 return ret;
1614 }
1615
1616 static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1617 int64_t pos, int size)
1618 {
1619 BDRVQcowState *s = bs->opaque;
1620 int growable = bs->growable;
1621 int ret;
1622
1623 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
1624 bs->growable = 1;
1625 ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
1626 bs->growable = growable;
1627
1628 return ret;
1629 }
1630
1631 static QEMUOptionParameter qcow2_create_options[] = {
1632 {
1633 .name = BLOCK_OPT_SIZE,
1634 .type = OPT_SIZE,
1635 .help = "Virtual disk size"
1636 },
1637 {
1638 .name = BLOCK_OPT_COMPAT_LEVEL,
1639 .type = OPT_STRING,
1640 .help = "Compatibility level (0.10 or 1.1)"
1641 },
1642 {
1643 .name = BLOCK_OPT_BACKING_FILE,
1644 .type = OPT_STRING,
1645 .help = "File name of a base image"
1646 },
1647 {
1648 .name = BLOCK_OPT_BACKING_FMT,
1649 .type = OPT_STRING,
1650 .help = "Image format of the base image"
1651 },
1652 {
1653 .name = BLOCK_OPT_ENCRYPT,
1654 .type = OPT_FLAG,
1655 .help = "Encrypt the image"
1656 },
1657 {
1658 .name = BLOCK_OPT_CLUSTER_SIZE,
1659 .type = OPT_SIZE,
1660 .help = "qcow2 cluster size",
1661 .value = { .n = DEFAULT_CLUSTER_SIZE },
1662 },
1663 {
1664 .name = BLOCK_OPT_PREALLOC,
1665 .type = OPT_STRING,
1666 .help = "Preallocation mode (allowed values: off, metadata)"
1667 },
1668 {
1669 .name = BLOCK_OPT_LAZY_REFCOUNTS,
1670 .type = OPT_FLAG,
1671 .help = "Postpone refcount updates",
1672 },
1673 { NULL }
1674 };
1675
1676 static BlockDriver bdrv_qcow2 = {
1677 .format_name = "qcow2",
1678 .instance_size = sizeof(BDRVQcowState),
1679 .bdrv_probe = qcow2_probe,
1680 .bdrv_open = qcow2_open,
1681 .bdrv_close = qcow2_close,
1682 .bdrv_create = qcow2_create,
1683 .bdrv_co_is_allocated = qcow2_co_is_allocated,
1684 .bdrv_set_key = qcow2_set_key,
1685 .bdrv_make_empty = qcow2_make_empty,
1686
1687 .bdrv_co_readv = qcow2_co_readv,
1688 .bdrv_co_writev = qcow2_co_writev,
1689 .bdrv_co_flush_to_os = qcow2_co_flush_to_os,
1690
1691 .bdrv_co_write_zeroes = qcow2_co_write_zeroes,
1692 .bdrv_co_discard = qcow2_co_discard,
1693 .bdrv_truncate = qcow2_truncate,
1694 .bdrv_write_compressed = qcow2_write_compressed,
1695
1696 .bdrv_snapshot_create = qcow2_snapshot_create,
1697 .bdrv_snapshot_goto = qcow2_snapshot_goto,
1698 .bdrv_snapshot_delete = qcow2_snapshot_delete,
1699 .bdrv_snapshot_list = qcow2_snapshot_list,
1700 .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
1701 .bdrv_get_info = qcow2_get_info,
1702
1703 .bdrv_save_vmstate = qcow2_save_vmstate,
1704 .bdrv_load_vmstate = qcow2_load_vmstate,
1705
1706 .bdrv_change_backing_file = qcow2_change_backing_file,
1707
1708 .bdrv_invalidate_cache = qcow2_invalidate_cache,
1709
1710 .create_options = qcow2_create_options,
1711 .bdrv_check = qcow2_check,
1712 };
1713
1714 static void bdrv_qcow2_init(void)
1715 {
1716 bdrv_register(&bdrv_qcow2);
1717 }
1718
1719 block_init(bdrv_qcow2_init);