]> git.proxmox.com Git - qemu.git/blame - block/qcow2.c
qcow2: Version 3 images
[qemu.git] / block / qcow2.c
CommitLineData
585f8587
FB
1/*
2 * Block driver for the QCOW version 2 format
5fafdf24 3 *
585f8587 4 * Copyright (c) 2004-2006 Fabrice Bellard
5fafdf24 5 *
585f8587
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
faf07963 24#include "qemu-common.h"
585f8587 25#include "block_int.h"
5efa9d5a 26#include "module.h"
585f8587
FB
27#include <zlib.h>
28#include "aes.h"
f7d0fe02 29#include "block/qcow2.h"
a9420734 30#include "qemu-error.h"
e8cdcec1 31#include "qerror.h"
3cce16f4 32#include "trace.h"
585f8587
FB
33
34/*
35 Differences with QCOW:
36
37 - Support for multiple incremental snapshots.
38 - Memory management by reference counts.
39 - Clusters which have a reference count of one have the bit
40 QCOW_OFLAG_COPIED to optimize write performance.
5fafdf24 41 - Size of compressed clusters is stored in sectors to reduce bit usage
585f8587
FB
42 in the cluster offsets.
43 - Support for storing additional data (such as the VM state) in the
3b46e624 44 snapshots.
585f8587
FB
45 - If a backing store is used, the cluster size is not constrained
46 (could be backported to QCOW).
47 - L2 tables always have a size of one cluster.
48*/
49
9b80ddf3
AL
50
/*
 * Descriptor that precedes every header extension in the image file: a
 * (magic, len) pair followed by len bytes of payload.  Both fields are
 * byte-swapped from big-endian after being read, and the payload is padded
 * so that the next descriptor starts on an 8 byte boundary.
 */
typedef struct {
    uint32_t magic;     /* selects the extension type */
    uint32_t len;       /* payload length in bytes, padding excluded */
} QCowExtension;

/* Extension magics this driver interprets itself */
#define QCOW2_EXT_MAGIC_END 0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
9b80ddf3 57
7c80ab3f 58static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
585f8587
FB
59{
60 const QCowHeader *cow_header = (const void *)buf;
3b46e624 61
585f8587
FB
62 if (buf_size >= sizeof(QCowHeader) &&
63 be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
6744cbab 64 be32_to_cpu(cow_header->version) >= 2)
585f8587
FB
65 return 100;
66 else
67 return 0;
68}
69
9b80ddf3
AL
70
71/*
72 * read qcow2 extension and fill bs
73 * start reading from start_offset
74 * finish reading upon magic of value 0 or when end_offset reached
75 * unknown magic is skipped (future extension this version knows nothing about)
76 * return 0 upon success, non-0 otherwise
77 */
7c80ab3f
JS
78static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
79 uint64_t end_offset)
9b80ddf3 80{
75bab85c 81 BDRVQcowState *s = bs->opaque;
9b80ddf3
AL
82 QCowExtension ext;
83 uint64_t offset;
75bab85c 84 int ret;
9b80ddf3
AL
85
86#ifdef DEBUG_EXT
7c80ab3f 87 printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
9b80ddf3
AL
88#endif
89 offset = start_offset;
90 while (offset < end_offset) {
91
92#ifdef DEBUG_EXT
93 /* Sanity check */
94 if (offset > s->cluster_size)
7c80ab3f 95 printf("qcow2_read_extension: suspicious offset %lu\n", offset);
9b80ddf3 96
9b2260cb 97 printf("attempting to read extended header in offset %lu\n", offset);
9b80ddf3
AL
98#endif
99
66f82cee 100 if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
7c80ab3f 101 fprintf(stderr, "qcow2_read_extension: ERROR: "
0bfcd599
BS
102 "pread fail from offset %" PRIu64 "\n",
103 offset);
9b80ddf3
AL
104 return 1;
105 }
106 be32_to_cpus(&ext.magic);
107 be32_to_cpus(&ext.len);
108 offset += sizeof(ext);
109#ifdef DEBUG_EXT
110 printf("ext.magic = 0x%x\n", ext.magic);
111#endif
64ca6aee
KW
112 if (ext.len > end_offset - offset) {
113 error_report("Header extension too large");
114 return -EINVAL;
115 }
116
9b80ddf3 117 switch (ext.magic) {
7c80ab3f 118 case QCOW2_EXT_MAGIC_END:
9b80ddf3 119 return 0;
f965509c 120
7c80ab3f 121 case QCOW2_EXT_MAGIC_BACKING_FORMAT:
f965509c
AL
122 if (ext.len >= sizeof(bs->backing_format)) {
123 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
4c978075 124 " (>=%zu)\n",
f965509c
AL
125 ext.len, sizeof(bs->backing_format));
126 return 2;
127 }
66f82cee 128 if (bdrv_pread(bs->file, offset , bs->backing_format,
f965509c
AL
129 ext.len) != ext.len)
130 return 3;
131 bs->backing_format[ext.len] = '\0';
132#ifdef DEBUG_EXT
133 printf("Qcow2: Got format extension %s\n", bs->backing_format);
134#endif
f965509c
AL
135 break;
136
9b80ddf3 137 default:
75bab85c
KW
138 /* unknown magic - save it in case we need to rewrite the header */
139 {
140 Qcow2UnknownHeaderExtension *uext;
141
142 uext = g_malloc0(sizeof(*uext) + ext.len);
143 uext->magic = ext.magic;
144 uext->len = ext.len;
145 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
146
147 ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
148 if (ret < 0) {
149 return ret;
150 }
75bab85c 151 }
9b80ddf3
AL
152 break;
153 }
fd29b4bb
KW
154
155 offset += ((ext.len + 7) & ~7);
9b80ddf3
AL
156 }
157
158 return 0;
159}
160
75bab85c
KW
161static void cleanup_unknown_header_ext(BlockDriverState *bs)
162{
163 BDRVQcowState *s = bs->opaque;
164 Qcow2UnknownHeaderExtension *uext, *next;
165
166 QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
167 QLIST_REMOVE(uext, next);
168 g_free(uext);
169 }
170}
9b80ddf3 171
6744cbab
KW
172static void report_unsupported(BlockDriverState *bs, const char *fmt, ...)
173{
174 char msg[64];
175 va_list ap;
176
177 va_start(ap, fmt);
178 vsnprintf(msg, sizeof(msg), fmt, ap);
179 va_end(ap);
180
181 qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
182 bs->device_name, "qcow2", msg);
183}
184
7c80ab3f 185static int qcow2_open(BlockDriverState *bs, int flags)
585f8587
FB
186{
187 BDRVQcowState *s = bs->opaque;
6d85a57e 188 int len, i, ret = 0;
585f8587 189 QCowHeader header;
9b80ddf3 190 uint64_t ext_end;
29c1a730 191 bool writethrough;
585f8587 192
6d85a57e
JS
193 ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
194 if (ret < 0) {
585f8587 195 goto fail;
6d85a57e 196 }
585f8587
FB
197 be32_to_cpus(&header.magic);
198 be32_to_cpus(&header.version);
199 be64_to_cpus(&header.backing_file_offset);
200 be32_to_cpus(&header.backing_file_size);
201 be64_to_cpus(&header.size);
202 be32_to_cpus(&header.cluster_bits);
203 be32_to_cpus(&header.crypt_method);
204 be64_to_cpus(&header.l1_table_offset);
205 be32_to_cpus(&header.l1_size);
206 be64_to_cpus(&header.refcount_table_offset);
207 be32_to_cpus(&header.refcount_table_clusters);
208 be64_to_cpus(&header.snapshots_offset);
209 be32_to_cpus(&header.nb_snapshots);
3b46e624 210
e8cdcec1 211 if (header.magic != QCOW_MAGIC) {
6d85a57e 212 ret = -EINVAL;
585f8587 213 goto fail;
6d85a57e 214 }
6744cbab
KW
215 if (header.version < 2 || header.version > 3) {
216 report_unsupported(bs, "QCOW version %d", header.version);
217 ret = -ENOTSUP;
218 goto fail;
219 }
220
221 s->qcow_version = header.version;
222
223 /* Initialise version 3 header fields */
224 if (header.version == 2) {
225 header.incompatible_features = 0;
226 header.compatible_features = 0;
227 header.autoclear_features = 0;
228 header.refcount_order = 4;
229 header.header_length = 72;
230 } else {
231 be64_to_cpus(&header.incompatible_features);
232 be64_to_cpus(&header.compatible_features);
233 be64_to_cpus(&header.autoclear_features);
234 be32_to_cpus(&header.refcount_order);
235 be32_to_cpus(&header.header_length);
236 }
237
238 if (header.header_length > sizeof(header)) {
239 s->unknown_header_fields_size = header.header_length - sizeof(header);
240 s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
241 ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
242 s->unknown_header_fields_size);
243 if (ret < 0) {
244 goto fail;
245 }
246 }
247
248 /* Handle feature bits */
249 s->incompatible_features = header.incompatible_features;
250 s->compatible_features = header.compatible_features;
251 s->autoclear_features = header.autoclear_features;
252
253 if (s->incompatible_features != 0) {
254 report_unsupported(bs, "incompatible features mask %" PRIx64,
255 header.incompatible_features);
256 ret = -ENOTSUP;
257 goto fail;
258 }
259
260 if (!bs->read_only && s->autoclear_features != 0) {
261 s->autoclear_features = 0;
262 qcow2_update_header(bs);
263 }
264
265 /* Check support for various header values */
266 if (header.refcount_order != 4) {
267 report_unsupported(bs, "%d bit reference counts",
268 1 << header.refcount_order);
e8cdcec1
KW
269 ret = -ENOTSUP;
270 goto fail;
271 }
6744cbab 272
d191d12d 273 if (header.cluster_bits < MIN_CLUSTER_BITS ||
6d85a57e
JS
274 header.cluster_bits > MAX_CLUSTER_BITS) {
275 ret = -EINVAL;
585f8587 276 goto fail;
6d85a57e
JS
277 }
278 if (header.crypt_method > QCOW_CRYPT_AES) {
279 ret = -EINVAL;
585f8587 280 goto fail;
6d85a57e 281 }
585f8587 282 s->crypt_method_header = header.crypt_method;
6d85a57e 283 if (s->crypt_method_header) {
585f8587 284 bs->encrypted = 1;
6d85a57e 285 }
585f8587
FB
286 s->cluster_bits = header.cluster_bits;
287 s->cluster_size = 1 << s->cluster_bits;
288 s->cluster_sectors = 1 << (s->cluster_bits - 9);
289 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
290 s->l2_size = 1 << s->l2_bits;
291 bs->total_sectors = header.size / 512;
292 s->csize_shift = (62 - (s->cluster_bits - 8));
293 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
294 s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
295 s->refcount_table_offset = header.refcount_table_offset;
5fafdf24 296 s->refcount_table_size =
585f8587
FB
297 header.refcount_table_clusters << (s->cluster_bits - 3);
298
299 s->snapshots_offset = header.snapshots_offset;
300 s->nb_snapshots = header.nb_snapshots;
301
302 /* read the level 1 table */
303 s->l1_size = header.l1_size;
419b19d9 304 s->l1_vm_state_index = size_to_l1(s, header.size);
585f8587
FB
305 /* the L1 table must contain at least enough entries to put
306 header.size bytes */
6d85a57e
JS
307 if (s->l1_size < s->l1_vm_state_index) {
308 ret = -EINVAL;
585f8587 309 goto fail;
6d85a57e 310 }
585f8587 311 s->l1_table_offset = header.l1_table_offset;
d191d12d 312 if (s->l1_size > 0) {
7267c094 313 s->l1_table = g_malloc0(
d191d12d 314 align_offset(s->l1_size * sizeof(uint64_t), 512));
6d85a57e
JS
315 ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
316 s->l1_size * sizeof(uint64_t));
317 if (ret < 0) {
d191d12d 318 goto fail;
6d85a57e 319 }
d191d12d
SW
320 for(i = 0;i < s->l1_size; i++) {
321 be64_to_cpus(&s->l1_table[i]);
322 }
585f8587 323 }
29c1a730
KW
324
325 /* alloc L2 table/refcount block cache */
a6599793 326 writethrough = ((flags & BDRV_O_CACHE_WB) == 0);
29c1a730
KW
327 s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough);
328 s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE,
329 writethrough);
330
7267c094 331 s->cluster_cache = g_malloc(s->cluster_size);
585f8587 332 /* one more sector for decompressed data alignment */
dea43a65 333 s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
095a9c58 334 + 512);
585f8587 335 s->cluster_cache_offset = -1;
06d9260f 336 s->flags = flags;
3b46e624 337
6d85a57e
JS
338 ret = qcow2_refcount_init(bs);
339 if (ret != 0) {
585f8587 340 goto fail;
6d85a57e 341 }
585f8587 342
72cf2d4f 343 QLIST_INIT(&s->cluster_allocs);
f214978a 344
9b80ddf3 345 /* read qcow2 extensions */
6d85a57e 346 if (header.backing_file_offset) {
9b80ddf3 347 ext_end = header.backing_file_offset;
6d85a57e 348 } else {
9b80ddf3 349 ext_end = s->cluster_size;
6d85a57e 350 }
6744cbab 351 if (qcow2_read_extensions(bs, header.header_length, ext_end)) {
6d85a57e 352 ret = -EINVAL;
9b80ddf3 353 goto fail;
6d85a57e 354 }
9b80ddf3 355
585f8587
FB
356 /* read the backing file name */
357 if (header.backing_file_offset != 0) {
358 len = header.backing_file_size;
6d85a57e 359 if (len > 1023) {
585f8587 360 len = 1023;
6d85a57e
JS
361 }
362 ret = bdrv_pread(bs->file, header.backing_file_offset,
363 bs->backing_file, len);
364 if (ret < 0) {
585f8587 365 goto fail;
6d85a57e 366 }
585f8587
FB
367 bs->backing_file[len] = '\0';
368 }
42deb29f
KW
369
370 ret = qcow2_read_snapshots(bs);
371 if (ret < 0) {
585f8587 372 goto fail;
6d85a57e 373 }
585f8587 374
68d100e9
KW
375 /* Initialise locks */
376 qemu_co_mutex_init(&s->lock);
377
585f8587 378#ifdef DEBUG_ALLOC
6cbc3031
PH
379 {
380 BdrvCheckResult result = {0};
381 qcow2_check_refcounts(bs, &result);
382 }
585f8587 383#endif
6d85a57e 384 return ret;
585f8587
FB
385
386 fail:
6744cbab 387 g_free(s->unknown_header_fields);
75bab85c 388 cleanup_unknown_header_ext(bs);
ed6ccf0f
KW
389 qcow2_free_snapshots(bs);
390 qcow2_refcount_close(bs);
7267c094 391 g_free(s->l1_table);
29c1a730
KW
392 if (s->l2_table_cache) {
393 qcow2_cache_destroy(bs, s->l2_table_cache);
394 }
7267c094 395 g_free(s->cluster_cache);
dea43a65 396 qemu_vfree(s->cluster_data);
6d85a57e 397 return ret;
585f8587
FB
398}
399
7c80ab3f 400static int qcow2_set_key(BlockDriverState *bs, const char *key)
585f8587
FB
401{
402 BDRVQcowState *s = bs->opaque;
403 uint8_t keybuf[16];
404 int len, i;
3b46e624 405
585f8587
FB
406 memset(keybuf, 0, 16);
407 len = strlen(key);
408 if (len > 16)
409 len = 16;
410 /* XXX: we could compress the chars to 7 bits to increase
411 entropy */
412 for(i = 0;i < len;i++) {
413 keybuf[i] = key[i];
414 }
415 s->crypt_method = s->crypt_method_header;
416
417 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
418 return -1;
419 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
420 return -1;
421#if 0
422 /* test */
423 {
424 uint8_t in[16];
425 uint8_t out[16];
426 uint8_t tmp[16];
427 for(i=0;i<16;i++)
428 in[i] = i;
429 AES_encrypt(in, tmp, &s->aes_encrypt_key);
430 AES_decrypt(tmp, out, &s->aes_decrypt_key);
431 for(i = 0; i < 16; i++)
432 printf(" %02x", tmp[i]);
433 printf("\n");
434 for(i = 0; i < 16; i++)
435 printf(" %02x", out[i]);
436 printf("\n");
437 }
438#endif
439 return 0;
440}
441
f8a2e5e3
SH
442static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
443 int64_t sector_num, int nb_sectors, int *pnum)
585f8587 444{
f8a2e5e3 445 BDRVQcowState *s = bs->opaque;
585f8587 446 uint64_t cluster_offset;
1c46efaa 447 int ret;
585f8587 448
095a9c58 449 *pnum = nb_sectors;
f8a2e5e3
SH
450 /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
451 * can't pass them on today */
452 qemu_co_mutex_lock(&s->lock);
1c46efaa 453 ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
f8a2e5e3 454 qemu_co_mutex_unlock(&s->lock);
1c46efaa
KW
455 if (ret < 0) {
456 *pnum = 0;
457 }
095a9c58 458
585f8587
FB
459 return (cluster_offset != 0);
460}
461
a9465922 462/* handle reading after the end of the backing file */
bd28f835
KW
463int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
464 int64_t sector_num, int nb_sectors)
a9465922
FB
465{
466 int n1;
467 if ((sector_num + nb_sectors) <= bs->total_sectors)
468 return nb_sectors;
469 if (sector_num >= bs->total_sectors)
470 n1 = 0;
471 else
472 n1 = bs->total_sectors - sector_num;
bd28f835 473
e0d9c6f9 474 qemu_iovec_memset_skip(qiov, 0, 512 * (nb_sectors - n1), 512 * n1);
bd28f835 475
a9465922
FB
476 return n1;
477}
478
a968168c 479static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
3fc48d09 480 int remaining_sectors, QEMUIOVector *qiov)
585f8587 481{
585f8587 482 BDRVQcowState *s = bs->opaque;
a9465922 483 int index_in_cluster, n1;
68d100e9 484 int ret;
faf575c1 485 int cur_nr_sectors; /* number of sectors in current iteration */
c2bdd990 486 uint64_t cluster_offset = 0;
3fc48d09
FZ
487 uint64_t bytes_done = 0;
488 QEMUIOVector hd_qiov;
489 uint8_t *cluster_data = NULL;
585f8587 490
3fc48d09
FZ
491 qemu_iovec_init(&hd_qiov, qiov->niov);
492
493 qemu_co_mutex_lock(&s->lock);
494
495 while (remaining_sectors != 0) {
bd28f835 496
5ebaa27e 497 /* prepare next request */
3fc48d09 498 cur_nr_sectors = remaining_sectors;
5ebaa27e
FZ
499 if (s->crypt_method) {
500 cur_nr_sectors = MIN(cur_nr_sectors,
501 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
585f8587 502 }
5ebaa27e 503
3fc48d09 504 ret = qcow2_get_cluster_offset(bs, sector_num << 9,
5ebaa27e 505 &cur_nr_sectors, &cluster_offset);
8af36488 506 if (ret < 0) {
3fc48d09 507 goto fail;
8af36488 508 }
bd28f835 509
3fc48d09 510 index_in_cluster = sector_num & (s->cluster_sectors - 1);
c87c0672 511
3fc48d09
FZ
512 qemu_iovec_reset(&hd_qiov);
513 qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
5ebaa27e
FZ
514 cur_nr_sectors * 512);
515
68d000a3
KW
516 switch (ret) {
517 case QCOW2_CLUSTER_UNALLOCATED:
5ebaa27e
FZ
518
519 if (bs->backing_hd) {
520 /* read from the base image */
3fc48d09
FZ
521 n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
522 sector_num, cur_nr_sectors);
5ebaa27e
FZ
523 if (n1 > 0) {
524 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
525 qemu_co_mutex_unlock(&s->lock);
3fc48d09
FZ
526 ret = bdrv_co_readv(bs->backing_hd, sector_num,
527 n1, &hd_qiov);
5ebaa27e
FZ
528 qemu_co_mutex_lock(&s->lock);
529 if (ret < 0) {
3fc48d09 530 goto fail;
5ebaa27e
FZ
531 }
532 }
533 } else {
534 /* Note: in this case, no need to wait */
3fc48d09 535 qemu_iovec_memset(&hd_qiov, 0, 512 * cur_nr_sectors);
5ebaa27e 536 }
68d000a3
KW
537 break;
538
539 case QCOW2_CLUSTER_COMPRESSED:
5ebaa27e
FZ
540 /* add AIO support for compressed blocks ? */
541 ret = qcow2_decompress_cluster(bs, cluster_offset);
542 if (ret < 0) {
3fc48d09 543 goto fail;
bd28f835
KW
544 }
545
3fc48d09 546 qemu_iovec_from_buffer(&hd_qiov,
5ebaa27e 547 s->cluster_cache + index_in_cluster * 512,
faf575c1 548 512 * cur_nr_sectors);
68d000a3
KW
549 break;
550
551 case QCOW2_CLUSTER_NORMAL:
5ebaa27e 552 if ((cluster_offset & 511) != 0) {
3fc48d09
FZ
553 ret = -EIO;
554 goto fail;
5ebaa27e 555 }
bd28f835 556
5ebaa27e
FZ
557 if (s->crypt_method) {
558 /*
559 * For encrypted images, read everything into a temporary
560 * contiguous buffer on which the AES functions can work.
561 */
3fc48d09
FZ
562 if (!cluster_data) {
563 cluster_data =
dea43a65 564 qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
5ebaa27e
FZ
565 }
566
567 assert(cur_nr_sectors <=
568 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
3fc48d09
FZ
569 qemu_iovec_reset(&hd_qiov);
570 qemu_iovec_add(&hd_qiov, cluster_data,
5ebaa27e
FZ
571 512 * cur_nr_sectors);
572 }
573
574 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
575 qemu_co_mutex_unlock(&s->lock);
576 ret = bdrv_co_readv(bs->file,
577 (cluster_offset >> 9) + index_in_cluster,
3fc48d09 578 cur_nr_sectors, &hd_qiov);
5ebaa27e
FZ
579 qemu_co_mutex_lock(&s->lock);
580 if (ret < 0) {
3fc48d09 581 goto fail;
5ebaa27e
FZ
582 }
583 if (s->crypt_method) {
3fc48d09
FZ
584 qcow2_encrypt_sectors(s, sector_num, cluster_data,
585 cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
586 qemu_iovec_reset(&hd_qiov);
587 qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
5ebaa27e 588 cur_nr_sectors * 512);
3fc48d09 589 qemu_iovec_from_buffer(&hd_qiov, cluster_data,
5ebaa27e
FZ
590 512 * cur_nr_sectors);
591 }
68d000a3
KW
592 break;
593
594 default:
595 g_assert_not_reached();
596 ret = -EIO;
597 goto fail;
faf575c1 598 }
f141eafe 599
3fc48d09
FZ
600 remaining_sectors -= cur_nr_sectors;
601 sector_num += cur_nr_sectors;
602 bytes_done += cur_nr_sectors * 512;
5ebaa27e 603 }
3fc48d09 604 ret = 0;
faf575c1 605
3fc48d09 606fail:
68d100e9 607 qemu_co_mutex_unlock(&s->lock);
42496d62 608
3fc48d09 609 qemu_iovec_destroy(&hd_qiov);
dea43a65 610 qemu_vfree(cluster_data);
68d100e9
KW
611
612 return ret;
585f8587
FB
613}
614
68d100e9 615static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
f214978a 616{
f214978a
KW
617 /* Take the request off the list of running requests */
618 if (m->nb_clusters != 0) {
72cf2d4f 619 QLIST_REMOVE(m, next_in_flight);
f214978a
KW
620 }
621
d4c146f0 622 /* Restart all dependent requests */
68d100e9
KW
623 if (!qemu_co_queue_empty(&m->dependent_requests)) {
624 qemu_co_mutex_unlock(&s->lock);
e8ee5e4c 625 qemu_co_queue_restart_all(&m->dependent_requests);
68d100e9 626 qemu_co_mutex_lock(&s->lock);
f214978a 627 }
f214978a
KW
628}
629
a968168c 630static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
3fc48d09
FZ
631 int64_t sector_num,
632 int remaining_sectors,
633 QEMUIOVector *qiov)
585f8587 634{
585f8587 635 BDRVQcowState *s = bs->opaque;
585f8587 636 int index_in_cluster;
095a9c58 637 int n_end;
68d100e9 638 int ret;
faf575c1 639 int cur_nr_sectors; /* number of sectors in current iteration */
c2bdd990 640 uint64_t cluster_offset;
3fc48d09
FZ
641 QEMUIOVector hd_qiov;
642 uint64_t bytes_done = 0;
643 uint8_t *cluster_data = NULL;
8e217d53
KW
644 QCowL2Meta l2meta = {
645 .nb_clusters = 0,
646 };
c2271403 647
3cce16f4
KW
648 trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
649 remaining_sectors);
650
c2271403 651 qemu_co_queue_init(&l2meta.dependent_requests);
585f8587 652
3fc48d09
FZ
653 qemu_iovec_init(&hd_qiov, qiov->niov);
654
655 s->cluster_cache_offset = -1; /* disable compressed cache */
3b46e624 656
3fc48d09
FZ
657 qemu_co_mutex_lock(&s->lock);
658
659 while (remaining_sectors != 0) {
660
3cce16f4 661 trace_qcow2_writev_start_part(qemu_coroutine_self());
3fc48d09
FZ
662 index_in_cluster = sector_num & (s->cluster_sectors - 1);
663 n_end = index_in_cluster + remaining_sectors;
5ebaa27e
FZ
664 if (s->crypt_method &&
665 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
666 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
667 }
095a9c58 668
3fc48d09 669 ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
5ebaa27e
FZ
670 index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
671 if (ret < 0) {
3fc48d09 672 goto fail;
5ebaa27e 673 }
148da7ea 674
5ebaa27e
FZ
675 cluster_offset = l2meta.cluster_offset;
676 assert((cluster_offset & 511) == 0);
148da7ea 677
3fc48d09
FZ
678 qemu_iovec_reset(&hd_qiov);
679 qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
5ebaa27e 680 cur_nr_sectors * 512);
6f5f060b 681
5ebaa27e 682 if (s->crypt_method) {
3fc48d09 683 if (!cluster_data) {
dea43a65 684 cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
5ebaa27e
FZ
685 s->cluster_size);
686 }
6f5f060b 687
3fc48d09 688 assert(hd_qiov.size <=
5ebaa27e 689 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
3fc48d09 690 qemu_iovec_to_buffer(&hd_qiov, cluster_data);
6f5f060b 691
3fc48d09
FZ
692 qcow2_encrypt_sectors(s, sector_num, cluster_data,
693 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
6f5f060b 694
3fc48d09
FZ
695 qemu_iovec_reset(&hd_qiov);
696 qemu_iovec_add(&hd_qiov, cluster_data,
5ebaa27e
FZ
697 cur_nr_sectors * 512);
698 }
6f5f060b 699
5ebaa27e
FZ
700 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
701 qemu_co_mutex_unlock(&s->lock);
3cce16f4
KW
702 trace_qcow2_writev_data(qemu_coroutine_self(),
703 (cluster_offset >> 9) + index_in_cluster);
5ebaa27e
FZ
704 ret = bdrv_co_writev(bs->file,
705 (cluster_offset >> 9) + index_in_cluster,
3fc48d09 706 cur_nr_sectors, &hd_qiov);
5ebaa27e
FZ
707 qemu_co_mutex_lock(&s->lock);
708 if (ret < 0) {
3fc48d09 709 goto fail;
5ebaa27e 710 }
f141eafe 711
5ebaa27e 712 ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
5ebaa27e 713 if (ret < 0) {
3fc48d09 714 goto fail;
5ebaa27e 715 }
faf575c1 716
0fa9131a
KW
717 run_dependent_requests(s, &l2meta);
718
3fc48d09
FZ
719 remaining_sectors -= cur_nr_sectors;
720 sector_num += cur_nr_sectors;
721 bytes_done += cur_nr_sectors * 512;
3cce16f4 722 trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
5ebaa27e 723 }
3fc48d09 724 ret = 0;
faf575c1 725
3fc48d09 726fail:
0fa9131a
KW
727 run_dependent_requests(s, &l2meta);
728
68d100e9 729 qemu_co_mutex_unlock(&s->lock);
3b46e624 730
3fc48d09 731 qemu_iovec_destroy(&hd_qiov);
dea43a65 732 qemu_vfree(cluster_data);
3cce16f4 733 trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
42496d62 734
68d100e9 735 return ret;
585f8587
FB
736}
737
7c80ab3f 738static void qcow2_close(BlockDriverState *bs)
585f8587
FB
739{
740 BDRVQcowState *s = bs->opaque;
7267c094 741 g_free(s->l1_table);
29c1a730
KW
742
743 qcow2_cache_flush(bs, s->l2_table_cache);
744 qcow2_cache_flush(bs, s->refcount_block_cache);
745
746 qcow2_cache_destroy(bs, s->l2_table_cache);
747 qcow2_cache_destroy(bs, s->refcount_block_cache);
748
6744cbab 749 g_free(s->unknown_header_fields);
75bab85c 750 cleanup_unknown_header_ext(bs);
6744cbab 751
7267c094 752 g_free(s->cluster_cache);
dea43a65 753 qemu_vfree(s->cluster_data);
ed6ccf0f 754 qcow2_refcount_close(bs);
28c1202b 755 qcow2_free_snapshots(bs);
585f8587
FB
756}
757
06d9260f
AL
758static void qcow2_invalidate_cache(BlockDriverState *bs)
759{
760 BDRVQcowState *s = bs->opaque;
761 int flags = s->flags;
762 AES_KEY aes_encrypt_key;
763 AES_KEY aes_decrypt_key;
764 uint32_t crypt_method = 0;
765
766 /*
767 * Backing files are read-only which makes all of their metadata immutable,
768 * that means we don't have to worry about reopening them here.
769 */
770
771 if (s->crypt_method) {
772 crypt_method = s->crypt_method;
773 memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
774 memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
775 }
776
777 qcow2_close(bs);
778
779 memset(s, 0, sizeof(BDRVQcowState));
780 qcow2_open(bs, flags);
781
782 if (crypt_method) {
783 s->crypt_method = crypt_method;
784 memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
785 memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
786 }
787}
788
e24e49e6
KW
789static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
790 size_t len, size_t buflen)
791{
792 QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
793 size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
794
795 if (buflen < ext_len) {
796 return -ENOSPC;
797 }
798
799 *ext_backing_fmt = (QCowExtension) {
800 .magic = cpu_to_be32(magic),
801 .len = cpu_to_be32(len),
802 };
803 memcpy(buf + sizeof(QCowExtension), s, len);
804
805 return ext_len;
806}
807
756e6736 808/*
e24e49e6
KW
809 * Updates the qcow2 header, including the variable length parts of it, i.e.
810 * the backing file name and all extensions. qcow2 was not designed to allow
811 * such changes, so if we run out of space (we can only use the first cluster)
812 * this function may fail.
756e6736
KW
813 *
814 * Returns 0 on success, -errno in error cases.
815 */
e24e49e6 816int qcow2_update_header(BlockDriverState *bs)
756e6736 817{
756e6736 818 BDRVQcowState *s = bs->opaque;
e24e49e6
KW
819 QCowHeader *header;
820 char *buf;
821 size_t buflen = s->cluster_size;
756e6736 822 int ret;
e24e49e6
KW
823 uint64_t total_size;
824 uint32_t refcount_table_clusters;
6744cbab 825 size_t header_length;
75bab85c 826 Qcow2UnknownHeaderExtension *uext;
756e6736 827
e24e49e6 828 buf = qemu_blockalign(bs, buflen);
756e6736 829
e24e49e6
KW
830 /* Header structure */
831 header = (QCowHeader*) buf;
756e6736 832
e24e49e6
KW
833 if (buflen < sizeof(*header)) {
834 ret = -ENOSPC;
835 goto fail;
756e6736
KW
836 }
837
6744cbab 838 header_length = sizeof(*header) + s->unknown_header_fields_size;
e24e49e6
KW
839 total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
840 refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
841
842 *header = (QCowHeader) {
6744cbab 843 /* Version 2 fields */
e24e49e6 844 .magic = cpu_to_be32(QCOW_MAGIC),
6744cbab 845 .version = cpu_to_be32(s->qcow_version),
e24e49e6
KW
846 .backing_file_offset = 0,
847 .backing_file_size = 0,
848 .cluster_bits = cpu_to_be32(s->cluster_bits),
849 .size = cpu_to_be64(total_size),
850 .crypt_method = cpu_to_be32(s->crypt_method_header),
851 .l1_size = cpu_to_be32(s->l1_size),
852 .l1_table_offset = cpu_to_be64(s->l1_table_offset),
853 .refcount_table_offset = cpu_to_be64(s->refcount_table_offset),
854 .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
855 .nb_snapshots = cpu_to_be32(s->nb_snapshots),
856 .snapshots_offset = cpu_to_be64(s->snapshots_offset),
6744cbab
KW
857
858 /* Version 3 fields */
859 .incompatible_features = cpu_to_be64(s->incompatible_features),
860 .compatible_features = cpu_to_be64(s->compatible_features),
861 .autoclear_features = cpu_to_be64(s->autoclear_features),
862 .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT),
863 .header_length = cpu_to_be32(header_length),
e24e49e6 864 };
756e6736 865
6744cbab
KW
866 /* For older versions, write a shorter header */
867 switch (s->qcow_version) {
868 case 2:
869 ret = offsetof(QCowHeader, incompatible_features);
870 break;
871 case 3:
872 ret = sizeof(*header);
873 break;
874 default:
875 return -EINVAL;
876 }
877
878 buf += ret;
879 buflen -= ret;
880 memset(buf, 0, buflen);
881
882 /* Preserve any unknown field in the header */
883 if (s->unknown_header_fields_size) {
884 if (buflen < s->unknown_header_fields_size) {
885 ret = -ENOSPC;
886 goto fail;
887 }
888
889 memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
890 buf += s->unknown_header_fields_size;
891 buflen -= s->unknown_header_fields_size;
892 }
756e6736 893
e24e49e6
KW
894 /* Backing file format header extension */
895 if (*bs->backing_format) {
896 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
897 bs->backing_format, strlen(bs->backing_format),
898 buflen);
899 if (ret < 0) {
900 goto fail;
756e6736
KW
901 }
902
e24e49e6
KW
903 buf += ret;
904 buflen -= ret;
756e6736
KW
905 }
906
75bab85c
KW
907 /* Keep unknown header extensions */
908 QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
909 ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
910 if (ret < 0) {
911 goto fail;
912 }
913
914 buf += ret;
915 buflen -= ret;
916 }
917
e24e49e6
KW
918 /* End of header extensions */
919 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
756e6736
KW
920 if (ret < 0) {
921 goto fail;
922 }
923
e24e49e6
KW
924 buf += ret;
925 buflen -= ret;
756e6736 926
e24e49e6
KW
927 /* Backing file name */
928 if (*bs->backing_file) {
929 size_t backing_file_len = strlen(bs->backing_file);
930
931 if (buflen < backing_file_len) {
932 ret = -ENOSPC;
933 goto fail;
934 }
935
936 strncpy(buf, bs->backing_file, buflen);
937
938 header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
939 header->backing_file_size = cpu_to_be32(backing_file_len);
756e6736
KW
940 }
941
e24e49e6
KW
942 /* Write the new header */
943 ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
756e6736
KW
944 if (ret < 0) {
945 goto fail;
946 }
947
948 ret = 0;
949fail:
e24e49e6 950 qemu_vfree(header);
756e6736
KW
951 return ret;
952}
953
954static int qcow2_change_backing_file(BlockDriverState *bs,
955 const char *backing_file, const char *backing_fmt)
956{
e24e49e6
KW
957 /* Backing file format doesn't make sense without a backing file */
958 if (backing_fmt && !backing_file) {
959 return -EINVAL;
960 }
961
962 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
963 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
964
965 return qcow2_update_header(bs);
756e6736
KW
966}
967
a35e1c17
KW
968static int preallocate(BlockDriverState *bs)
969{
a35e1c17
KW
970 uint64_t nb_sectors;
971 uint64_t offset;
972 int num;
148da7ea 973 int ret;
a35e1c17
KW
974 QCowL2Meta meta;
975
976 nb_sectors = bdrv_getlength(bs) >> 9;
977 offset = 0;
68d100e9 978 qemu_co_queue_init(&meta.dependent_requests);
148da7ea 979 meta.cluster_offset = 0;
a35e1c17
KW
980
981 while (nb_sectors) {
982 num = MIN(nb_sectors, INT_MAX >> 9);
148da7ea 983 ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
148da7ea 984 if (ret < 0) {
19dbcbf7 985 return ret;
a35e1c17
KW
986 }
987
19dbcbf7
KW
988 ret = qcow2_alloc_cluster_link_l2(bs, &meta);
989 if (ret < 0) {
148da7ea 990 qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
19dbcbf7 991 return ret;
a35e1c17
KW
992 }
993
f214978a
KW
994 /* There are no dependent requests, but we need to remove our request
995 * from the list of in-flight requests */
68d100e9 996 run_dependent_requests(bs->opaque, &meta);
f214978a 997
a35e1c17
KW
998 /* TODO Preallocate data if requested */
999
1000 nb_sectors -= num;
1001 offset += num << 9;
1002 }
1003
1004 /*
1005 * It is expected that the image file is large enough to actually contain
1006 * all of the allocated clusters (otherwise we get failing reads after
1007 * EOF). Extend the image to the last allocated sector.
1008 */
148da7ea 1009 if (meta.cluster_offset != 0) {
ea80b906
KW
1010 uint8_t buf[512];
1011 memset(buf, 0, 512);
19dbcbf7
KW
1012 ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
1013 if (ret < 0) {
1014 return ret;
1015 }
a35e1c17
KW
1016 }
1017
1018 return 0;
1019}
1020
7c80ab3f
JS
1021static int qcow2_create2(const char *filename, int64_t total_size,
1022 const char *backing_file, const char *backing_format,
1023 int flags, size_t cluster_size, int prealloc,
6744cbab 1024 QEMUOptionParameter *options, int version)
a9420734 1025{
9b2260cb 1026 /* Calculate cluster_bits */
a9420734
KW
1027 int cluster_bits;
1028 cluster_bits = ffs(cluster_size) - 1;
1029 if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1030 (1 << cluster_bits) != cluster_size)
1031 {
1032 error_report(
6daf194d 1033 "Cluster size must be a power of two between %d and %dk",
a9420734
KW
1034 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1035 return -EINVAL;
1036 }
1037
1038 /*
1039 * Open the image file and write a minimal qcow2 header.
1040 *
1041 * We keep things simple and start with a zero-sized image. We also
1042 * do without refcount blocks or a L1 table for now. We'll fix the
1043 * inconsistency later.
1044 *
1045 * We do need a refcount table because growing the refcount table means
1046 * allocating two new refcount blocks - the seconds of which would be at
1047 * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1048 * size for any qcow2 image.
1049 */
1050 BlockDriverState* bs;
1051 QCowHeader header;
1052 uint8_t* refcount_table;
1053 int ret;
1054
1055 ret = bdrv_create_file(filename, options);
1056 if (ret < 0) {
1057 return ret;
1058 }
1059
1060 ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1061 if (ret < 0) {
1062 return ret;
1063 }
1064
1065 /* Write the header */
1066 memset(&header, 0, sizeof(header));
1067 header.magic = cpu_to_be32(QCOW_MAGIC);
6744cbab 1068 header.version = cpu_to_be32(version);
a9420734
KW
1069 header.cluster_bits = cpu_to_be32(cluster_bits);
1070 header.size = cpu_to_be64(0);
1071 header.l1_table_offset = cpu_to_be64(0);
1072 header.l1_size = cpu_to_be32(0);
1073 header.refcount_table_offset = cpu_to_be64(cluster_size);
1074 header.refcount_table_clusters = cpu_to_be32(1);
6744cbab
KW
1075 header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
1076 header.header_length = cpu_to_be32(sizeof(header));
a9420734
KW
1077
1078 if (flags & BLOCK_FLAG_ENCRYPT) {
1079 header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1080 } else {
1081 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1082 }
1083
1084 ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
1085 if (ret < 0) {
1086 goto out;
1087 }
1088
1089 /* Write an empty refcount table */
7267c094 1090 refcount_table = g_malloc0(cluster_size);
a9420734 1091 ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
7267c094 1092 g_free(refcount_table);
a9420734
KW
1093
1094 if (ret < 0) {
1095 goto out;
1096 }
1097
1098 bdrv_close(bs);
1099
1100 /*
1101 * And now open the image and make it consistent first (i.e. increase the
1102 * refcount of the cluster that is occupied by the header and the refcount
1103 * table)
1104 */
1105 BlockDriver* drv = bdrv_find_format("qcow2");
1106 assert(drv != NULL);
e1a7107f
KW
1107 ret = bdrv_open(bs, filename,
1108 BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
a9420734
KW
1109 if (ret < 0) {
1110 goto out;
1111 }
1112
1113 ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
1114 if (ret < 0) {
1115 goto out;
1116
1117 } else if (ret != 0) {
1118 error_report("Huh, first cluster in empty image is already in use?");
1119 abort();
1120 }
1121
1122 /* Okay, now that we have a valid image, let's give it the right size */
1123 ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
1124 if (ret < 0) {
1125 goto out;
1126 }
1127
1128 /* Want a backing file? There you go.*/
1129 if (backing_file) {
1130 ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1131 if (ret < 0) {
1132 goto out;
1133 }
1134 }
1135
1136 /* And if we're supposed to preallocate metadata, do that now */
1137 if (prealloc) {
1138 ret = preallocate(bs);
1139 if (ret < 0) {
1140 goto out;
1141 }
1142 }
1143
1144 ret = 0;
1145out:
1146 bdrv_delete(bs);
1147 return ret;
1148}
de5f3f40 1149
7c80ab3f 1150static int qcow2_create(const char *filename, QEMUOptionParameter *options)
de5f3f40
KW
1151{
1152 const char *backing_file = NULL;
1153 const char *backing_fmt = NULL;
1154 uint64_t sectors = 0;
1155 int flags = 0;
99cce9fa 1156 size_t cluster_size = DEFAULT_CLUSTER_SIZE;
de5f3f40 1157 int prealloc = 0;
6744cbab 1158 int version = 2;
de5f3f40
KW
1159
1160 /* Read out options */
1161 while (options && options->name) {
1162 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1163 sectors = options->value.n / 512;
1164 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1165 backing_file = options->value.s;
1166 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1167 backing_fmt = options->value.s;
1168 } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1169 flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1170 } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1171 if (options->value.n) {
1172 cluster_size = options->value.n;
1173 }
1174 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1175 if (!options->value.s || !strcmp(options->value.s, "off")) {
1176 prealloc = 0;
1177 } else if (!strcmp(options->value.s, "metadata")) {
1178 prealloc = 1;
1179 } else {
1180 fprintf(stderr, "Invalid preallocation mode: '%s'\n",
1181 options->value.s);
1182 return -EINVAL;
1183 }
6744cbab
KW
1184 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
1185 if (!options->value.s || !strcmp(options->value.s, "0.10")) {
1186 version = 2;
1187 } else if (!strcmp(options->value.s, "1.1")) {
1188 version = 3;
1189 } else {
1190 fprintf(stderr, "Invalid compatibility level: '%s'\n",
1191 options->value.s);
1192 return -EINVAL;
1193 }
de5f3f40
KW
1194 }
1195 options++;
1196 }
1197
1198 if (backing_file && prealloc) {
1199 fprintf(stderr, "Backing file and preallocation cannot be used at "
1200 "the same time\n");
1201 return -EINVAL;
1202 }
1203
7c80ab3f 1204 return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
6744cbab 1205 cluster_size, prealloc, options, version);
de5f3f40
KW
1206}
1207
/*
 * bdrv_make_empty callback.
 *
 * The implementation below is compiled out (#if 0) and flagged as incorrect,
 * so this function currently does nothing and always returns 0.
 */
static int qcow2_make_empty(BlockDriverState *bs)
{
#if 0
    /* XXX: not correct */
    BDRVQcowState *s = bs->opaque;
    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    int ret;

    /* Disabled: zero the in-memory L1 table and write it back */
    memset(s->l1_table, 0, l1_length);
    if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
        return -1;
    /* Disabled: cut the image file right after the L1 table */
    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    if (ret < 0)
        return ret;

    l2_cache_reset(bs);
#endif
    return 0;
}
1227
6db39ae2
PB
1228static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
1229 int64_t sector_num, int nb_sectors)
5ea929e3 1230{
6db39ae2
PB
1231 int ret;
1232 BDRVQcowState *s = bs->opaque;
1233
1234 qemu_co_mutex_lock(&s->lock);
1235 ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
5ea929e3 1236 nb_sectors);
6db39ae2
PB
1237 qemu_co_mutex_unlock(&s->lock);
1238 return ret;
5ea929e3
KW
1239}
1240
419b19d9
SH
1241static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
1242{
1243 BDRVQcowState *s = bs->opaque;
1244 int ret, new_l1_size;
1245
1246 if (offset & 511) {
259b2173 1247 error_report("The new size must be a multiple of 512");
419b19d9
SH
1248 return -EINVAL;
1249 }
1250
1251 /* cannot proceed if image has snapshots */
1252 if (s->nb_snapshots) {
259b2173 1253 error_report("Can't resize an image which has snapshots");
419b19d9
SH
1254 return -ENOTSUP;
1255 }
1256
1257 /* shrinking is currently not supported */
1258 if (offset < bs->total_sectors * 512) {
259b2173 1259 error_report("qcow2 doesn't support shrinking images yet");
419b19d9
SH
1260 return -ENOTSUP;
1261 }
1262
1263 new_l1_size = size_to_l1(s, offset);
72893756 1264 ret = qcow2_grow_l1_table(bs, new_l1_size, true);
419b19d9
SH
1265 if (ret < 0) {
1266 return ret;
1267 }
1268
1269 /* write updated header.size */
1270 offset = cpu_to_be64(offset);
8b3b7206
KW
1271 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
1272 &offset, sizeof(uint64_t));
419b19d9
SH
1273 if (ret < 0) {
1274 return ret;
1275 }
1276
1277 s->l1_vm_state_index = new_l1_size;
1278 return 0;
1279}
1280
20d97356
BS
1281/* XXX: put compressed sectors first, then all the cluster aligned
1282 tables to avoid losing bytes in alignment */
7c80ab3f
JS
1283static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
1284 const uint8_t *buf, int nb_sectors)
20d97356
BS
1285{
1286 BDRVQcowState *s = bs->opaque;
1287 z_stream strm;
1288 int ret, out_len;
1289 uint8_t *out_buf;
1290 uint64_t cluster_offset;
1291
1292 if (nb_sectors == 0) {
1293 /* align end of file to a sector boundary to ease reading with
1294 sector based I/Os */
66f82cee 1295 cluster_offset = bdrv_getlength(bs->file);
20d97356 1296 cluster_offset = (cluster_offset + 511) & ~511;
66f82cee 1297 bdrv_truncate(bs->file, cluster_offset);
20d97356
BS
1298 return 0;
1299 }
1300
1301 if (nb_sectors != s->cluster_sectors)
1302 return -EINVAL;
1303
7267c094 1304 out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
20d97356
BS
1305
1306 /* best compression, small window, no zlib header */
1307 memset(&strm, 0, sizeof(strm));
1308 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1309 Z_DEFLATED, -12,
1310 9, Z_DEFAULT_STRATEGY);
1311 if (ret != 0) {
8f1efd00
KW
1312 ret = -EINVAL;
1313 goto fail;
20d97356
BS
1314 }
1315
1316 strm.avail_in = s->cluster_size;
1317 strm.next_in = (uint8_t *)buf;
1318 strm.avail_out = s->cluster_size;
1319 strm.next_out = out_buf;
1320
1321 ret = deflate(&strm, Z_FINISH);
1322 if (ret != Z_STREAM_END && ret != Z_OK) {
20d97356 1323 deflateEnd(&strm);
8f1efd00
KW
1324 ret = -EINVAL;
1325 goto fail;
20d97356
BS
1326 }
1327 out_len = strm.next_out - out_buf;
1328
1329 deflateEnd(&strm);
1330
1331 if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1332 /* could not compress: write normal cluster */
8f1efd00
KW
1333 ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
1334 if (ret < 0) {
1335 goto fail;
1336 }
20d97356
BS
1337 } else {
1338 cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
1339 sector_num << 9, out_len);
8f1efd00
KW
1340 if (!cluster_offset) {
1341 ret = -EIO;
1342 goto fail;
1343 }
20d97356 1344 cluster_offset &= s->cluster_offset_mask;
66f82cee 1345 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
8f1efd00
KW
1346 ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
1347 if (ret < 0) {
1348 goto fail;
20d97356
BS
1349 }
1350 }
1351
8f1efd00
KW
1352 ret = 0;
1353fail:
7267c094 1354 g_free(out_buf);
8f1efd00 1355 return ret;
20d97356
BS
1356}
1357
a968168c 1358static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
20d97356 1359{
29c1a730
KW
1360 BDRVQcowState *s = bs->opaque;
1361 int ret;
1362
8b94ff85 1363 qemu_co_mutex_lock(&s->lock);
29c1a730
KW
1364 ret = qcow2_cache_flush(bs, s->l2_table_cache);
1365 if (ret < 0) {
c95de7e2 1366 qemu_co_mutex_unlock(&s->lock);
8b94ff85 1367 return ret;
29c1a730
KW
1368 }
1369
1370 ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1371 if (ret < 0) {
c95de7e2 1372 qemu_co_mutex_unlock(&s->lock);
8b94ff85 1373 return ret;
29c1a730 1374 }
8b94ff85 1375 qemu_co_mutex_unlock(&s->lock);
29c1a730 1376
eb489bb1
KW
1377 return 0;
1378}
1379
7c80ab3f 1380static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
20d97356
BS
1381{
1382 return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
1383}
1384
7c80ab3f 1385static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
20d97356
BS
1386{
1387 BDRVQcowState *s = bs->opaque;
1388 bdi->cluster_size = s->cluster_size;
7c80ab3f 1389 bdi->vm_state_offset = qcow2_vm_state_offset(s);
20d97356
BS
1390 return 0;
1391}
1392
1393
7c80ab3f 1394static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result)
20d97356 1395{
9ac228e0 1396 return qcow2_check_refcounts(bs, result);
20d97356
BS
1397}
1398
#if 0
/*
 * Debugging helper (compiled out): prints one line for every run of
 * consecutive clusters of bs->file that share the same refcount.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);
    for(k = 0; k < nb_clusters;) {
        /* k1 remembers the first cluster of the current run */
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        /* advance k past all following clusters with the same refcount */
        while (k < nb_clusters && get_refcount(bs, k) == refcount)
            k++;
        /* NOTE(review): the label printed is k (one past the run), not k1
         * (its start) - confirm which one was intended */
        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
               k - k1);
    }
}
#endif
1419
7c80ab3f
JS
1420static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1421 int64_t pos, int size)
20d97356
BS
1422{
1423 BDRVQcowState *s = bs->opaque;
1424 int growable = bs->growable;
1425 int ret;
1426
66f82cee 1427 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
20d97356 1428 bs->growable = 1;
7c80ab3f 1429 ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
20d97356
BS
1430 bs->growable = growable;
1431
1432 return ret;
1433}
1434
7c80ab3f
JS
1435static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1436 int64_t pos, int size)
20d97356
BS
1437{
1438 BDRVQcowState *s = bs->opaque;
1439 int growable = bs->growable;
1440 int ret;
1441
66f82cee 1442 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
20d97356 1443 bs->growable = 1;
7c80ab3f 1444 ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
20d97356
BS
1445 bs->growable = growable;
1446
1447 return ret;
1448}
1449
7c80ab3f 1450static QEMUOptionParameter qcow2_create_options[] = {
20d97356
BS
1451 {
1452 .name = BLOCK_OPT_SIZE,
1453 .type = OPT_SIZE,
1454 .help = "Virtual disk size"
1455 },
6744cbab
KW
1456 {
1457 .name = BLOCK_OPT_COMPAT_LEVEL,
1458 .type = OPT_STRING,
1459 .help = "Compatibility level (0.10 or 1.1)"
1460 },
20d97356
BS
1461 {
1462 .name = BLOCK_OPT_BACKING_FILE,
1463 .type = OPT_STRING,
1464 .help = "File name of a base image"
1465 },
1466 {
1467 .name = BLOCK_OPT_BACKING_FMT,
1468 .type = OPT_STRING,
1469 .help = "Image format of the base image"
1470 },
1471 {
1472 .name = BLOCK_OPT_ENCRYPT,
1473 .type = OPT_FLAG,
1474 .help = "Encrypt the image"
1475 },
1476 {
1477 .name = BLOCK_OPT_CLUSTER_SIZE,
1478 .type = OPT_SIZE,
99cce9fa
KW
1479 .help = "qcow2 cluster size",
1480 .value = { .n = DEFAULT_CLUSTER_SIZE },
20d97356
BS
1481 },
1482 {
1483 .name = BLOCK_OPT_PREALLOC,
1484 .type = OPT_STRING,
1485 .help = "Preallocation mode (allowed values: off, metadata)"
1486 },
1487 { NULL }
1488};
1489
1490static BlockDriver bdrv_qcow2 = {
7c80ab3f
JS
1491 .format_name = "qcow2",
1492 .instance_size = sizeof(BDRVQcowState),
1493 .bdrv_probe = qcow2_probe,
1494 .bdrv_open = qcow2_open,
1495 .bdrv_close = qcow2_close,
1496 .bdrv_create = qcow2_create,
f8a2e5e3 1497 .bdrv_co_is_allocated = qcow2_co_is_allocated,
7c80ab3f
JS
1498 .bdrv_set_key = qcow2_set_key,
1499 .bdrv_make_empty = qcow2_make_empty,
1500
c68b89ac
KW
1501 .bdrv_co_readv = qcow2_co_readv,
1502 .bdrv_co_writev = qcow2_co_writev,
eb489bb1 1503 .bdrv_co_flush_to_os = qcow2_co_flush_to_os,
419b19d9 1504
6db39ae2 1505 .bdrv_co_discard = qcow2_co_discard,
419b19d9 1506 .bdrv_truncate = qcow2_truncate,
7c80ab3f 1507 .bdrv_write_compressed = qcow2_write_compressed,
20d97356
BS
1508
1509 .bdrv_snapshot_create = qcow2_snapshot_create,
1510 .bdrv_snapshot_goto = qcow2_snapshot_goto,
1511 .bdrv_snapshot_delete = qcow2_snapshot_delete,
1512 .bdrv_snapshot_list = qcow2_snapshot_list,
51ef6727 1513 .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
7c80ab3f 1514 .bdrv_get_info = qcow2_get_info,
20d97356 1515
7c80ab3f
JS
1516 .bdrv_save_vmstate = qcow2_save_vmstate,
1517 .bdrv_load_vmstate = qcow2_load_vmstate,
20d97356
BS
1518
1519 .bdrv_change_backing_file = qcow2_change_backing_file,
1520
06d9260f
AL
1521 .bdrv_invalidate_cache = qcow2_invalidate_cache,
1522
7c80ab3f
JS
1523 .create_options = qcow2_create_options,
1524 .bdrv_check = qcow2_check,
20d97356
BS
1525};
1526
5efa9d5a
AL
/* Register the qcow2 driver description with the block layer. */
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

/* Hook bdrv_qcow2_init() into the block_init() initialisation mechanism */
block_init(bdrv_qcow2_init);