]> git.proxmox.com Git - qemu.git/blob - block/qcow2.c
Merge remote-tracking branch 'mdroth/qga-pull-2013-05-13' into staging
[qemu.git] / block / qcow2.c
1 /*
2 * Block driver for the QCOW version 2 format
3 *
4 * Copyright (c) 2004-2006 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "qemu-common.h"
25 #include "block/block_int.h"
26 #include "qemu/module.h"
27 #include <zlib.h>
28 #include "qemu/aes.h"
29 #include "block/qcow2.h"
30 #include "qemu/error-report.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qapi/qmp/qbool.h"
33 #include "trace.h"
34
35 /*
36 Differences with QCOW:
37
38 - Support for multiple incremental snapshots.
39 - Memory management by reference counts.
40 - Clusters which have a reference count of one have the bit
41 QCOW_OFLAG_COPIED to optimize write performance.
42 - Size of compressed clusters is stored in sectors to reduce bit usage
43 in the cluster offsets.
44 - Support for storing additional data (such as the VM state) in the
45 snapshots.
46 - If a backing store is used, the cluster size is not constrained
47 (could be backported to QCOW).
48 - L2 tables always have a size of one cluster.
49 */
50
51
/*
 * Fixed-size header that precedes every qcow2 header extension in the image
 * file.  Both fields are stored big-endian on disk.
 */
typedef struct QCowExtension {
    uint32_t magic;     /* extension type, one of QCOW2_EXT_MAGIC_* */
    uint32_t len;       /* length of the extension payload in bytes */
} QCowExtension;

/* Known header extension types */
#define  QCOW2_EXT_MAGIC_END 0
#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
60
61 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
62 {
63 const QCowHeader *cow_header = (const void *)buf;
64
65 if (buf_size >= sizeof(QCowHeader) &&
66 be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
67 be32_to_cpu(cow_header->version) >= 2)
68 return 100;
69 else
70 return 0;
71 }
72
73
74 /*
75 * read qcow2 extension and fill bs
76 * start reading from start_offset
77 * finish reading upon magic of value 0 or when end_offset reached
78 * unknown magic is skipped (future extension this version knows nothing about)
79 * return 0 upon success, non-0 otherwise
80 */
81 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
82 uint64_t end_offset, void **p_feature_table)
83 {
84 BDRVQcowState *s = bs->opaque;
85 QCowExtension ext;
86 uint64_t offset;
87 int ret;
88
89 #ifdef DEBUG_EXT
90 printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
91 #endif
92 offset = start_offset;
93 while (offset < end_offset) {
94
95 #ifdef DEBUG_EXT
96 /* Sanity check */
97 if (offset > s->cluster_size)
98 printf("qcow2_read_extension: suspicious offset %lu\n", offset);
99
100 printf("attempting to read extended header in offset %lu\n", offset);
101 #endif
102
103 if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
104 fprintf(stderr, "qcow2_read_extension: ERROR: "
105 "pread fail from offset %" PRIu64 "\n",
106 offset);
107 return 1;
108 }
109 be32_to_cpus(&ext.magic);
110 be32_to_cpus(&ext.len);
111 offset += sizeof(ext);
112 #ifdef DEBUG_EXT
113 printf("ext.magic = 0x%x\n", ext.magic);
114 #endif
115 if (ext.len > end_offset - offset) {
116 error_report("Header extension too large");
117 return -EINVAL;
118 }
119
120 switch (ext.magic) {
121 case QCOW2_EXT_MAGIC_END:
122 return 0;
123
124 case QCOW2_EXT_MAGIC_BACKING_FORMAT:
125 if (ext.len >= sizeof(bs->backing_format)) {
126 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
127 " (>=%zu)\n",
128 ext.len, sizeof(bs->backing_format));
129 return 2;
130 }
131 if (bdrv_pread(bs->file, offset , bs->backing_format,
132 ext.len) != ext.len)
133 return 3;
134 bs->backing_format[ext.len] = '\0';
135 #ifdef DEBUG_EXT
136 printf("Qcow2: Got format extension %s\n", bs->backing_format);
137 #endif
138 break;
139
140 case QCOW2_EXT_MAGIC_FEATURE_TABLE:
141 if (p_feature_table != NULL) {
142 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
143 ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
144 if (ret < 0) {
145 return ret;
146 }
147
148 *p_feature_table = feature_table;
149 }
150 break;
151
152 default:
153 /* unknown magic - save it in case we need to rewrite the header */
154 {
155 Qcow2UnknownHeaderExtension *uext;
156
157 uext = g_malloc0(sizeof(*uext) + ext.len);
158 uext->magic = ext.magic;
159 uext->len = ext.len;
160 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
161
162 ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
163 if (ret < 0) {
164 return ret;
165 }
166 }
167 break;
168 }
169
170 offset += ((ext.len + 7) & ~7);
171 }
172
173 return 0;
174 }
175
176 static void cleanup_unknown_header_ext(BlockDriverState *bs)
177 {
178 BDRVQcowState *s = bs->opaque;
179 Qcow2UnknownHeaderExtension *uext, *next;
180
181 QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
182 QLIST_REMOVE(uext, next);
183 g_free(uext);
184 }
185 }
186
187 static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
188 const char *fmt, ...)
189 {
190 char msg[64];
191 va_list ap;
192
193 va_start(ap, fmt);
194 vsnprintf(msg, sizeof(msg), fmt, ap);
195 va_end(ap);
196
197 qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
198 bs->device_name, "qcow2", msg);
199 }
200
201 static void report_unsupported_feature(BlockDriverState *bs,
202 Qcow2Feature *table, uint64_t mask)
203 {
204 while (table && table->name[0] != '\0') {
205 if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
206 if (mask & (1 << table->bit)) {
207 report_unsupported(bs, "%.46s",table->name);
208 mask &= ~(1 << table->bit);
209 }
210 }
211 table++;
212 }
213
214 if (mask) {
215 report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
216 }
217 }
218
219 /*
220 * Sets the dirty bit and flushes afterwards if necessary.
221 *
222 * The incompatible_features bit is only set if the image file header was
223 * updated successfully. Therefore it is not required to check the return
224 * value of this function.
225 */
226 int qcow2_mark_dirty(BlockDriverState *bs)
227 {
228 BDRVQcowState *s = bs->opaque;
229 uint64_t val;
230 int ret;
231
232 assert(s->qcow_version >= 3);
233
234 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
235 return 0; /* already dirty */
236 }
237
238 val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
239 ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
240 &val, sizeof(val));
241 if (ret < 0) {
242 return ret;
243 }
244 ret = bdrv_flush(bs->file);
245 if (ret < 0) {
246 return ret;
247 }
248
249 /* Only treat image as dirty if the header was updated successfully */
250 s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
251 return 0;
252 }
253
254 /*
255 * Clears the dirty bit and flushes before if necessary. Only call this
256 * function when there are no pending requests, it does not guard against
257 * concurrent requests dirtying the image.
258 */
259 static int qcow2_mark_clean(BlockDriverState *bs)
260 {
261 BDRVQcowState *s = bs->opaque;
262
263 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
264 int ret = bdrv_flush(bs);
265 if (ret < 0) {
266 return ret;
267 }
268
269 s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
270 return qcow2_update_header(bs);
271 }
272 return 0;
273 }
274
275 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
276 BdrvCheckMode fix)
277 {
278 int ret = qcow2_check_refcounts(bs, result, fix);
279 if (ret < 0) {
280 return ret;
281 }
282
283 if (fix && result->check_errors == 0 && result->corruptions == 0) {
284 return qcow2_mark_clean(bs);
285 }
286 return ret;
287 }
288
289 static QemuOptsList qcow2_runtime_opts = {
290 .name = "qcow2",
291 .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
292 .desc = {
293 {
294 .name = "lazy_refcounts",
295 .type = QEMU_OPT_BOOL,
296 .help = "Postpone refcount updates",
297 },
298 { /* end of list */ }
299 },
300 };
301
302 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
303 {
304 BDRVQcowState *s = bs->opaque;
305 int len, i, ret = 0;
306 QCowHeader header;
307 QemuOpts *opts;
308 Error *local_err = NULL;
309 uint64_t ext_end;
310 uint64_t l1_vm_state_index;
311
312 ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
313 if (ret < 0) {
314 goto fail;
315 }
316 be32_to_cpus(&header.magic);
317 be32_to_cpus(&header.version);
318 be64_to_cpus(&header.backing_file_offset);
319 be32_to_cpus(&header.backing_file_size);
320 be64_to_cpus(&header.size);
321 be32_to_cpus(&header.cluster_bits);
322 be32_to_cpus(&header.crypt_method);
323 be64_to_cpus(&header.l1_table_offset);
324 be32_to_cpus(&header.l1_size);
325 be64_to_cpus(&header.refcount_table_offset);
326 be32_to_cpus(&header.refcount_table_clusters);
327 be64_to_cpus(&header.snapshots_offset);
328 be32_to_cpus(&header.nb_snapshots);
329
330 if (header.magic != QCOW_MAGIC) {
331 ret = -EMEDIUMTYPE;
332 goto fail;
333 }
334 if (header.version < 2 || header.version > 3) {
335 report_unsupported(bs, "QCOW version %d", header.version);
336 ret = -ENOTSUP;
337 goto fail;
338 }
339
340 s->qcow_version = header.version;
341
342 /* Initialise version 3 header fields */
343 if (header.version == 2) {
344 header.incompatible_features = 0;
345 header.compatible_features = 0;
346 header.autoclear_features = 0;
347 header.refcount_order = 4;
348 header.header_length = 72;
349 } else {
350 be64_to_cpus(&header.incompatible_features);
351 be64_to_cpus(&header.compatible_features);
352 be64_to_cpus(&header.autoclear_features);
353 be32_to_cpus(&header.refcount_order);
354 be32_to_cpus(&header.header_length);
355 }
356
357 if (header.header_length > sizeof(header)) {
358 s->unknown_header_fields_size = header.header_length - sizeof(header);
359 s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
360 ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
361 s->unknown_header_fields_size);
362 if (ret < 0) {
363 goto fail;
364 }
365 }
366
367 if (header.backing_file_offset) {
368 ext_end = header.backing_file_offset;
369 } else {
370 ext_end = 1 << header.cluster_bits;
371 }
372
373 /* Handle feature bits */
374 s->incompatible_features = header.incompatible_features;
375 s->compatible_features = header.compatible_features;
376 s->autoclear_features = header.autoclear_features;
377
378 if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
379 void *feature_table = NULL;
380 qcow2_read_extensions(bs, header.header_length, ext_end,
381 &feature_table);
382 report_unsupported_feature(bs, feature_table,
383 s->incompatible_features &
384 ~QCOW2_INCOMPAT_MASK);
385 ret = -ENOTSUP;
386 goto fail;
387 }
388
389 /* Check support for various header values */
390 if (header.refcount_order != 4) {
391 report_unsupported(bs, "%d bit reference counts",
392 1 << header.refcount_order);
393 ret = -ENOTSUP;
394 goto fail;
395 }
396
397 if (header.cluster_bits < MIN_CLUSTER_BITS ||
398 header.cluster_bits > MAX_CLUSTER_BITS) {
399 ret = -EINVAL;
400 goto fail;
401 }
402 if (header.crypt_method > QCOW_CRYPT_AES) {
403 ret = -EINVAL;
404 goto fail;
405 }
406 s->crypt_method_header = header.crypt_method;
407 if (s->crypt_method_header) {
408 bs->encrypted = 1;
409 }
410 s->cluster_bits = header.cluster_bits;
411 s->cluster_size = 1 << s->cluster_bits;
412 s->cluster_sectors = 1 << (s->cluster_bits - 9);
413 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
414 s->l2_size = 1 << s->l2_bits;
415 bs->total_sectors = header.size / 512;
416 s->csize_shift = (62 - (s->cluster_bits - 8));
417 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
418 s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
419 s->refcount_table_offset = header.refcount_table_offset;
420 s->refcount_table_size =
421 header.refcount_table_clusters << (s->cluster_bits - 3);
422
423 s->snapshots_offset = header.snapshots_offset;
424 s->nb_snapshots = header.nb_snapshots;
425
426 /* read the level 1 table */
427 s->l1_size = header.l1_size;
428
429 l1_vm_state_index = size_to_l1(s, header.size);
430 if (l1_vm_state_index > INT_MAX) {
431 ret = -EFBIG;
432 goto fail;
433 }
434 s->l1_vm_state_index = l1_vm_state_index;
435
436 /* the L1 table must contain at least enough entries to put
437 header.size bytes */
438 if (s->l1_size < s->l1_vm_state_index) {
439 ret = -EINVAL;
440 goto fail;
441 }
442 s->l1_table_offset = header.l1_table_offset;
443 if (s->l1_size > 0) {
444 s->l1_table = g_malloc0(
445 align_offset(s->l1_size * sizeof(uint64_t), 512));
446 ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
447 s->l1_size * sizeof(uint64_t));
448 if (ret < 0) {
449 goto fail;
450 }
451 for(i = 0;i < s->l1_size; i++) {
452 be64_to_cpus(&s->l1_table[i]);
453 }
454 }
455
456 /* alloc L2 table/refcount block cache */
457 s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
458 s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
459
460 s->cluster_cache = g_malloc(s->cluster_size);
461 /* one more sector for decompressed data alignment */
462 s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
463 + 512);
464 s->cluster_cache_offset = -1;
465 s->flags = flags;
466
467 ret = qcow2_refcount_init(bs);
468 if (ret != 0) {
469 goto fail;
470 }
471
472 QLIST_INIT(&s->cluster_allocs);
473
474 /* read qcow2 extensions */
475 if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
476 ret = -EINVAL;
477 goto fail;
478 }
479
480 /* read the backing file name */
481 if (header.backing_file_offset != 0) {
482 len = header.backing_file_size;
483 if (len > 1023) {
484 len = 1023;
485 }
486 ret = bdrv_pread(bs->file, header.backing_file_offset,
487 bs->backing_file, len);
488 if (ret < 0) {
489 goto fail;
490 }
491 bs->backing_file[len] = '\0';
492 }
493
494 ret = qcow2_read_snapshots(bs);
495 if (ret < 0) {
496 goto fail;
497 }
498
499 /* Clear unknown autoclear feature bits */
500 if (!bs->read_only && s->autoclear_features != 0) {
501 s->autoclear_features = 0;
502 ret = qcow2_update_header(bs);
503 if (ret < 0) {
504 goto fail;
505 }
506 }
507
508 /* Initialise locks */
509 qemu_co_mutex_init(&s->lock);
510
511 /* Repair image if dirty */
512 if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
513 (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
514 BdrvCheckResult result = {0};
515
516 ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
517 if (ret < 0) {
518 goto fail;
519 }
520 }
521
522 /* Enable lazy_refcounts according to image and command line options */
523 opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
524 qemu_opts_absorb_qdict(opts, options, &local_err);
525 if (error_is_set(&local_err)) {
526 qerror_report_err(local_err);
527 error_free(local_err);
528 ret = -EINVAL;
529 goto fail;
530 }
531
532 s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
533 (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
534
535 qemu_opts_del(opts);
536
537 if (s->use_lazy_refcounts && s->qcow_version < 3) {
538 qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require "
539 "a qcow2 image with at least qemu 1.1 compatibility level");
540 ret = -EINVAL;
541 goto fail;
542 }
543
544 #ifdef DEBUG_ALLOC
545 {
546 BdrvCheckResult result = {0};
547 qcow2_check_refcounts(bs, &result, 0);
548 }
549 #endif
550 return ret;
551
552 fail:
553 g_free(s->unknown_header_fields);
554 cleanup_unknown_header_ext(bs);
555 qcow2_free_snapshots(bs);
556 qcow2_refcount_close(bs);
557 g_free(s->l1_table);
558 if (s->l2_table_cache) {
559 qcow2_cache_destroy(bs, s->l2_table_cache);
560 }
561 g_free(s->cluster_cache);
562 qemu_vfree(s->cluster_data);
563 return ret;
564 }
565
566 static int qcow2_set_key(BlockDriverState *bs, const char *key)
567 {
568 BDRVQcowState *s = bs->opaque;
569 uint8_t keybuf[16];
570 int len, i;
571
572 memset(keybuf, 0, 16);
573 len = strlen(key);
574 if (len > 16)
575 len = 16;
576 /* XXX: we could compress the chars to 7 bits to increase
577 entropy */
578 for(i = 0;i < len;i++) {
579 keybuf[i] = key[i];
580 }
581 s->crypt_method = s->crypt_method_header;
582
583 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
584 return -1;
585 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
586 return -1;
587 #if 0
588 /* test */
589 {
590 uint8_t in[16];
591 uint8_t out[16];
592 uint8_t tmp[16];
593 for(i=0;i<16;i++)
594 in[i] = i;
595 AES_encrypt(in, tmp, &s->aes_encrypt_key);
596 AES_decrypt(tmp, out, &s->aes_decrypt_key);
597 for(i = 0; i < 16; i++)
598 printf(" %02x", tmp[i]);
599 printf("\n");
600 for(i = 0; i < 16; i++)
601 printf(" %02x", out[i]);
602 printf("\n");
603 }
604 #endif
605 return 0;
606 }
607
608 /* We have nothing to do for QCOW2 reopen, stubs just return
609 * success */
610 static int qcow2_reopen_prepare(BDRVReopenState *state,
611 BlockReopenQueue *queue, Error **errp)
612 {
613 return 0;
614 }
615
616 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
617 int64_t sector_num, int nb_sectors, int *pnum)
618 {
619 BDRVQcowState *s = bs->opaque;
620 uint64_t cluster_offset;
621 int ret;
622
623 *pnum = nb_sectors;
624 /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
625 * can't pass them on today */
626 qemu_co_mutex_lock(&s->lock);
627 ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
628 qemu_co_mutex_unlock(&s->lock);
629 if (ret < 0) {
630 *pnum = 0;
631 }
632
633 return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
634 }
635
636 /* handle reading after the end of the backing file */
637 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
638 int64_t sector_num, int nb_sectors)
639 {
640 int n1;
641 if ((sector_num + nb_sectors) <= bs->total_sectors)
642 return nb_sectors;
643 if (sector_num >= bs->total_sectors)
644 n1 = 0;
645 else
646 n1 = bs->total_sectors - sector_num;
647
648 qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
649
650 return n1;
651 }
652
653 static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
654 int remaining_sectors, QEMUIOVector *qiov)
655 {
656 BDRVQcowState *s = bs->opaque;
657 int index_in_cluster, n1;
658 int ret;
659 int cur_nr_sectors; /* number of sectors in current iteration */
660 uint64_t cluster_offset = 0;
661 uint64_t bytes_done = 0;
662 QEMUIOVector hd_qiov;
663 uint8_t *cluster_data = NULL;
664
665 qemu_iovec_init(&hd_qiov, qiov->niov);
666
667 qemu_co_mutex_lock(&s->lock);
668
669 while (remaining_sectors != 0) {
670
671 /* prepare next request */
672 cur_nr_sectors = remaining_sectors;
673 if (s->crypt_method) {
674 cur_nr_sectors = MIN(cur_nr_sectors,
675 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
676 }
677
678 ret = qcow2_get_cluster_offset(bs, sector_num << 9,
679 &cur_nr_sectors, &cluster_offset);
680 if (ret < 0) {
681 goto fail;
682 }
683
684 index_in_cluster = sector_num & (s->cluster_sectors - 1);
685
686 qemu_iovec_reset(&hd_qiov);
687 qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
688 cur_nr_sectors * 512);
689
690 switch (ret) {
691 case QCOW2_CLUSTER_UNALLOCATED:
692
693 if (bs->backing_hd) {
694 /* read from the base image */
695 n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
696 sector_num, cur_nr_sectors);
697 if (n1 > 0) {
698 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
699 qemu_co_mutex_unlock(&s->lock);
700 ret = bdrv_co_readv(bs->backing_hd, sector_num,
701 n1, &hd_qiov);
702 qemu_co_mutex_lock(&s->lock);
703 if (ret < 0) {
704 goto fail;
705 }
706 }
707 } else {
708 /* Note: in this case, no need to wait */
709 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
710 }
711 break;
712
713 case QCOW2_CLUSTER_ZERO:
714 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
715 break;
716
717 case QCOW2_CLUSTER_COMPRESSED:
718 /* add AIO support for compressed blocks ? */
719 ret = qcow2_decompress_cluster(bs, cluster_offset);
720 if (ret < 0) {
721 goto fail;
722 }
723
724 qemu_iovec_from_buf(&hd_qiov, 0,
725 s->cluster_cache + index_in_cluster * 512,
726 512 * cur_nr_sectors);
727 break;
728
729 case QCOW2_CLUSTER_NORMAL:
730 if ((cluster_offset & 511) != 0) {
731 ret = -EIO;
732 goto fail;
733 }
734
735 if (s->crypt_method) {
736 /*
737 * For encrypted images, read everything into a temporary
738 * contiguous buffer on which the AES functions can work.
739 */
740 if (!cluster_data) {
741 cluster_data =
742 qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
743 }
744
745 assert(cur_nr_sectors <=
746 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
747 qemu_iovec_reset(&hd_qiov);
748 qemu_iovec_add(&hd_qiov, cluster_data,
749 512 * cur_nr_sectors);
750 }
751
752 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
753 qemu_co_mutex_unlock(&s->lock);
754 ret = bdrv_co_readv(bs->file,
755 (cluster_offset >> 9) + index_in_cluster,
756 cur_nr_sectors, &hd_qiov);
757 qemu_co_mutex_lock(&s->lock);
758 if (ret < 0) {
759 goto fail;
760 }
761 if (s->crypt_method) {
762 qcow2_encrypt_sectors(s, sector_num, cluster_data,
763 cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
764 qemu_iovec_from_buf(qiov, bytes_done,
765 cluster_data, 512 * cur_nr_sectors);
766 }
767 break;
768
769 default:
770 g_assert_not_reached();
771 ret = -EIO;
772 goto fail;
773 }
774
775 remaining_sectors -= cur_nr_sectors;
776 sector_num += cur_nr_sectors;
777 bytes_done += cur_nr_sectors * 512;
778 }
779 ret = 0;
780
781 fail:
782 qemu_co_mutex_unlock(&s->lock);
783
784 qemu_iovec_destroy(&hd_qiov);
785 qemu_vfree(cluster_data);
786
787 return ret;
788 }
789
790 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
791 int64_t sector_num,
792 int remaining_sectors,
793 QEMUIOVector *qiov)
794 {
795 BDRVQcowState *s = bs->opaque;
796 int index_in_cluster;
797 int n_end;
798 int ret;
799 int cur_nr_sectors; /* number of sectors in current iteration */
800 uint64_t cluster_offset;
801 QEMUIOVector hd_qiov;
802 uint64_t bytes_done = 0;
803 uint8_t *cluster_data = NULL;
804 QCowL2Meta *l2meta = NULL;
805
806 trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
807 remaining_sectors);
808
809 qemu_iovec_init(&hd_qiov, qiov->niov);
810
811 s->cluster_cache_offset = -1; /* disable compressed cache */
812
813 qemu_co_mutex_lock(&s->lock);
814
815 while (remaining_sectors != 0) {
816
817 l2meta = NULL;
818
819 trace_qcow2_writev_start_part(qemu_coroutine_self());
820 index_in_cluster = sector_num & (s->cluster_sectors - 1);
821 n_end = index_in_cluster + remaining_sectors;
822 if (s->crypt_method &&
823 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
824 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
825 }
826
827 ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
828 index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
829 if (ret < 0) {
830 goto fail;
831 }
832
833 assert((cluster_offset & 511) == 0);
834
835 qemu_iovec_reset(&hd_qiov);
836 qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
837 cur_nr_sectors * 512);
838
839 if (s->crypt_method) {
840 if (!cluster_data) {
841 cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
842 s->cluster_size);
843 }
844
845 assert(hd_qiov.size <=
846 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
847 qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
848
849 qcow2_encrypt_sectors(s, sector_num, cluster_data,
850 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
851
852 qemu_iovec_reset(&hd_qiov);
853 qemu_iovec_add(&hd_qiov, cluster_data,
854 cur_nr_sectors * 512);
855 }
856
857 qemu_co_mutex_unlock(&s->lock);
858 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
859 trace_qcow2_writev_data(qemu_coroutine_self(),
860 (cluster_offset >> 9) + index_in_cluster);
861 ret = bdrv_co_writev(bs->file,
862 (cluster_offset >> 9) + index_in_cluster,
863 cur_nr_sectors, &hd_qiov);
864 qemu_co_mutex_lock(&s->lock);
865 if (ret < 0) {
866 goto fail;
867 }
868
869 while (l2meta != NULL) {
870 QCowL2Meta *next;
871
872 ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
873 if (ret < 0) {
874 goto fail;
875 }
876
877 /* Take the request off the list of running requests */
878 if (l2meta->nb_clusters != 0) {
879 QLIST_REMOVE(l2meta, next_in_flight);
880 }
881
882 qemu_co_queue_restart_all(&l2meta->dependent_requests);
883
884 next = l2meta->next;
885 g_free(l2meta);
886 l2meta = next;
887 }
888
889 remaining_sectors -= cur_nr_sectors;
890 sector_num += cur_nr_sectors;
891 bytes_done += cur_nr_sectors * 512;
892 trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
893 }
894 ret = 0;
895
896 fail:
897 qemu_co_mutex_unlock(&s->lock);
898
899 while (l2meta != NULL) {
900 QCowL2Meta *next;
901
902 if (l2meta->nb_clusters != 0) {
903 QLIST_REMOVE(l2meta, next_in_flight);
904 }
905 qemu_co_queue_restart_all(&l2meta->dependent_requests);
906
907 next = l2meta->next;
908 g_free(l2meta);
909 l2meta = next;
910 }
911
912 qemu_iovec_destroy(&hd_qiov);
913 qemu_vfree(cluster_data);
914 trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
915
916 return ret;
917 }
918
919 static void qcow2_close(BlockDriverState *bs)
920 {
921 BDRVQcowState *s = bs->opaque;
922 g_free(s->l1_table);
923
924 qcow2_cache_flush(bs, s->l2_table_cache);
925 qcow2_cache_flush(bs, s->refcount_block_cache);
926
927 qcow2_mark_clean(bs);
928
929 qcow2_cache_destroy(bs, s->l2_table_cache);
930 qcow2_cache_destroy(bs, s->refcount_block_cache);
931
932 g_free(s->unknown_header_fields);
933 cleanup_unknown_header_ext(bs);
934
935 g_free(s->cluster_cache);
936 qemu_vfree(s->cluster_data);
937 qcow2_refcount_close(bs);
938 qcow2_free_snapshots(bs);
939 }
940
941 static void qcow2_invalidate_cache(BlockDriverState *bs)
942 {
943 BDRVQcowState *s = bs->opaque;
944 int flags = s->flags;
945 AES_KEY aes_encrypt_key;
946 AES_KEY aes_decrypt_key;
947 uint32_t crypt_method = 0;
948 QDict *options;
949
950 /*
951 * Backing files are read-only which makes all of their metadata immutable,
952 * that means we don't have to worry about reopening them here.
953 */
954
955 if (s->crypt_method) {
956 crypt_method = s->crypt_method;
957 memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
958 memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
959 }
960
961 qcow2_close(bs);
962
963 options = qdict_new();
964 qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
965 qbool_from_int(s->use_lazy_refcounts));
966
967 memset(s, 0, sizeof(BDRVQcowState));
968 qcow2_open(bs, options, flags);
969
970 QDECREF(options);
971
972 if (crypt_method) {
973 s->crypt_method = crypt_method;
974 memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
975 memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
976 }
977 }
978
979 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
980 size_t len, size_t buflen)
981 {
982 QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
983 size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
984
985 if (buflen < ext_len) {
986 return -ENOSPC;
987 }
988
989 *ext_backing_fmt = (QCowExtension) {
990 .magic = cpu_to_be32(magic),
991 .len = cpu_to_be32(len),
992 };
993 memcpy(buf + sizeof(QCowExtension), s, len);
994
995 return ext_len;
996 }
997
998 /*
999 * Updates the qcow2 header, including the variable length parts of it, i.e.
1000 * the backing file name and all extensions. qcow2 was not designed to allow
1001 * such changes, so if we run out of space (we can only use the first cluster)
1002 * this function may fail.
1003 *
1004 * Returns 0 on success, -errno in error cases.
1005 */
1006 int qcow2_update_header(BlockDriverState *bs)
1007 {
1008 BDRVQcowState *s = bs->opaque;
1009 QCowHeader *header;
1010 char *buf;
1011 size_t buflen = s->cluster_size;
1012 int ret;
1013 uint64_t total_size;
1014 uint32_t refcount_table_clusters;
1015 size_t header_length;
1016 Qcow2UnknownHeaderExtension *uext;
1017
1018 buf = qemu_blockalign(bs, buflen);
1019
1020 /* Header structure */
1021 header = (QCowHeader*) buf;
1022
1023 if (buflen < sizeof(*header)) {
1024 ret = -ENOSPC;
1025 goto fail;
1026 }
1027
1028 header_length = sizeof(*header) + s->unknown_header_fields_size;
1029 total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
1030 refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
1031
1032 *header = (QCowHeader) {
1033 /* Version 2 fields */
1034 .magic = cpu_to_be32(QCOW_MAGIC),
1035 .version = cpu_to_be32(s->qcow_version),
1036 .backing_file_offset = 0,
1037 .backing_file_size = 0,
1038 .cluster_bits = cpu_to_be32(s->cluster_bits),
1039 .size = cpu_to_be64(total_size),
1040 .crypt_method = cpu_to_be32(s->crypt_method_header),
1041 .l1_size = cpu_to_be32(s->l1_size),
1042 .l1_table_offset = cpu_to_be64(s->l1_table_offset),
1043 .refcount_table_offset = cpu_to_be64(s->refcount_table_offset),
1044 .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
1045 .nb_snapshots = cpu_to_be32(s->nb_snapshots),
1046 .snapshots_offset = cpu_to_be64(s->snapshots_offset),
1047
1048 /* Version 3 fields */
1049 .incompatible_features = cpu_to_be64(s->incompatible_features),
1050 .compatible_features = cpu_to_be64(s->compatible_features),
1051 .autoclear_features = cpu_to_be64(s->autoclear_features),
1052 .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT),
1053 .header_length = cpu_to_be32(header_length),
1054 };
1055
1056 /* For older versions, write a shorter header */
1057 switch (s->qcow_version) {
1058 case 2:
1059 ret = offsetof(QCowHeader, incompatible_features);
1060 break;
1061 case 3:
1062 ret = sizeof(*header);
1063 break;
1064 default:
1065 ret = -EINVAL;
1066 goto fail;
1067 }
1068
1069 buf += ret;
1070 buflen -= ret;
1071 memset(buf, 0, buflen);
1072
1073 /* Preserve any unknown field in the header */
1074 if (s->unknown_header_fields_size) {
1075 if (buflen < s->unknown_header_fields_size) {
1076 ret = -ENOSPC;
1077 goto fail;
1078 }
1079
1080 memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
1081 buf += s->unknown_header_fields_size;
1082 buflen -= s->unknown_header_fields_size;
1083 }
1084
1085 /* Backing file format header extension */
1086 if (*bs->backing_format) {
1087 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
1088 bs->backing_format, strlen(bs->backing_format),
1089 buflen);
1090 if (ret < 0) {
1091 goto fail;
1092 }
1093
1094 buf += ret;
1095 buflen -= ret;
1096 }
1097
1098 /* Feature table */
1099 Qcow2Feature features[] = {
1100 {
1101 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1102 .bit = QCOW2_INCOMPAT_DIRTY_BITNR,
1103 .name = "dirty bit",
1104 },
1105 {
1106 .type = QCOW2_FEAT_TYPE_COMPATIBLE,
1107 .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
1108 .name = "lazy refcounts",
1109 },
1110 };
1111
1112 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
1113 features, sizeof(features), buflen);
1114 if (ret < 0) {
1115 goto fail;
1116 }
1117 buf += ret;
1118 buflen -= ret;
1119
1120 /* Keep unknown header extensions */
1121 QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
1122 ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
1123 if (ret < 0) {
1124 goto fail;
1125 }
1126
1127 buf += ret;
1128 buflen -= ret;
1129 }
1130
1131 /* End of header extensions */
1132 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
1133 if (ret < 0) {
1134 goto fail;
1135 }
1136
1137 buf += ret;
1138 buflen -= ret;
1139
1140 /* Backing file name */
1141 if (*bs->backing_file) {
1142 size_t backing_file_len = strlen(bs->backing_file);
1143
1144 if (buflen < backing_file_len) {
1145 ret = -ENOSPC;
1146 goto fail;
1147 }
1148
1149 /* Using strncpy is ok here, since buf is not NUL-terminated. */
1150 strncpy(buf, bs->backing_file, buflen);
1151
1152 header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
1153 header->backing_file_size = cpu_to_be32(backing_file_len);
1154 }
1155
1156 /* Write the new header */
1157 ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
1158 if (ret < 0) {
1159 goto fail;
1160 }
1161
1162 ret = 0;
1163 fail:
1164 qemu_vfree(header);
1165 return ret;
1166 }
1167
1168 static int qcow2_change_backing_file(BlockDriverState *bs,
1169 const char *backing_file, const char *backing_fmt)
1170 {
1171 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1172 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1173
1174 return qcow2_update_header(bs);
1175 }
1176
1177 static int preallocate(BlockDriverState *bs)
1178 {
1179 uint64_t nb_sectors;
1180 uint64_t offset;
1181 uint64_t host_offset = 0;
1182 int num;
1183 int ret;
1184 QCowL2Meta *meta;
1185
1186 nb_sectors = bdrv_getlength(bs) >> 9;
1187 offset = 0;
1188
1189 while (nb_sectors) {
1190 num = MIN(nb_sectors, INT_MAX >> 9);
1191 ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
1192 &host_offset, &meta);
1193 if (ret < 0) {
1194 return ret;
1195 }
1196
1197 ret = qcow2_alloc_cluster_link_l2(bs, meta);
1198 if (ret < 0) {
1199 qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters);
1200 return ret;
1201 }
1202
1203 /* There are no dependent requests, but we need to remove our request
1204 * from the list of in-flight requests */
1205 if (meta != NULL) {
1206 QLIST_REMOVE(meta, next_in_flight);
1207 }
1208
1209 /* TODO Preallocate data if requested */
1210
1211 nb_sectors -= num;
1212 offset += num << 9;
1213 }
1214
1215 /*
1216 * It is expected that the image file is large enough to actually contain
1217 * all of the allocated clusters (otherwise we get failing reads after
1218 * EOF). Extend the image to the last allocated sector.
1219 */
1220 if (host_offset != 0) {
1221 uint8_t buf[512];
1222 memset(buf, 0, 512);
1223 ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
1224 if (ret < 0) {
1225 return ret;
1226 }
1227 }
1228
1229 return 0;
1230 }
1231
1232 static int qcow2_create2(const char *filename, int64_t total_size,
1233 const char *backing_file, const char *backing_format,
1234 int flags, size_t cluster_size, int prealloc,
1235 QEMUOptionParameter *options, int version)
1236 {
1237 /* Calculate cluster_bits */
1238 int cluster_bits;
1239 cluster_bits = ffs(cluster_size) - 1;
1240 if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1241 (1 << cluster_bits) != cluster_size)
1242 {
1243 error_report(
1244 "Cluster size must be a power of two between %d and %dk",
1245 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1246 return -EINVAL;
1247 }
1248
1249 /*
1250 * Open the image file and write a minimal qcow2 header.
1251 *
1252 * We keep things simple and start with a zero-sized image. We also
1253 * do without refcount blocks or a L1 table for now. We'll fix the
1254 * inconsistency later.
1255 *
1256 * We do need a refcount table because growing the refcount table means
1257 * allocating two new refcount blocks - the seconds of which would be at
1258 * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1259 * size for any qcow2 image.
1260 */
1261 BlockDriverState* bs;
1262 QCowHeader header;
1263 uint8_t* refcount_table;
1264 int ret;
1265
1266 ret = bdrv_create_file(filename, options);
1267 if (ret < 0) {
1268 return ret;
1269 }
1270
1271 ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
1272 if (ret < 0) {
1273 return ret;
1274 }
1275
1276 /* Write the header */
1277 memset(&header, 0, sizeof(header));
1278 header.magic = cpu_to_be32(QCOW_MAGIC);
1279 header.version = cpu_to_be32(version);
1280 header.cluster_bits = cpu_to_be32(cluster_bits);
1281 header.size = cpu_to_be64(0);
1282 header.l1_table_offset = cpu_to_be64(0);
1283 header.l1_size = cpu_to_be32(0);
1284 header.refcount_table_offset = cpu_to_be64(cluster_size);
1285 header.refcount_table_clusters = cpu_to_be32(1);
1286 header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
1287 header.header_length = cpu_to_be32(sizeof(header));
1288
1289 if (flags & BLOCK_FLAG_ENCRYPT) {
1290 header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1291 } else {
1292 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1293 }
1294
1295 if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
1296 header.compatible_features |=
1297 cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
1298 }
1299
1300 ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
1301 if (ret < 0) {
1302 goto out;
1303 }
1304
1305 /* Write an empty refcount table */
1306 refcount_table = g_malloc0(cluster_size);
1307 ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
1308 g_free(refcount_table);
1309
1310 if (ret < 0) {
1311 goto out;
1312 }
1313
1314 bdrv_close(bs);
1315
1316 /*
1317 * And now open the image and make it consistent first (i.e. increase the
1318 * refcount of the cluster that is occupied by the header and the refcount
1319 * table)
1320 */
1321 BlockDriver* drv = bdrv_find_format("qcow2");
1322 assert(drv != NULL);
1323 ret = bdrv_open(bs, filename, NULL,
1324 BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
1325 if (ret < 0) {
1326 goto out;
1327 }
1328
1329 ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
1330 if (ret < 0) {
1331 goto out;
1332
1333 } else if (ret != 0) {
1334 error_report("Huh, first cluster in empty image is already in use?");
1335 abort();
1336 }
1337
1338 /* Okay, now that we have a valid image, let's give it the right size */
1339 ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
1340 if (ret < 0) {
1341 goto out;
1342 }
1343
1344 /* Want a backing file? There you go.*/
1345 if (backing_file) {
1346 ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1347 if (ret < 0) {
1348 goto out;
1349 }
1350 }
1351
1352 /* And if we're supposed to preallocate metadata, do that now */
1353 if (prealloc) {
1354 BDRVQcowState *s = bs->opaque;
1355 qemu_co_mutex_lock(&s->lock);
1356 ret = preallocate(bs);
1357 qemu_co_mutex_unlock(&s->lock);
1358 if (ret < 0) {
1359 goto out;
1360 }
1361 }
1362
1363 ret = 0;
1364 out:
1365 bdrv_delete(bs);
1366 return ret;
1367 }
1368
1369 static int qcow2_create(const char *filename, QEMUOptionParameter *options)
1370 {
1371 const char *backing_file = NULL;
1372 const char *backing_fmt = NULL;
1373 uint64_t sectors = 0;
1374 int flags = 0;
1375 size_t cluster_size = DEFAULT_CLUSTER_SIZE;
1376 int prealloc = 0;
1377 int version = 2;
1378
1379 /* Read out options */
1380 while (options && options->name) {
1381 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1382 sectors = options->value.n / 512;
1383 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1384 backing_file = options->value.s;
1385 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1386 backing_fmt = options->value.s;
1387 } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1388 flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1389 } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1390 if (options->value.n) {
1391 cluster_size = options->value.n;
1392 }
1393 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1394 if (!options->value.s || !strcmp(options->value.s, "off")) {
1395 prealloc = 0;
1396 } else if (!strcmp(options->value.s, "metadata")) {
1397 prealloc = 1;
1398 } else {
1399 fprintf(stderr, "Invalid preallocation mode: '%s'\n",
1400 options->value.s);
1401 return -EINVAL;
1402 }
1403 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
1404 if (!options->value.s || !strcmp(options->value.s, "0.10")) {
1405 version = 2;
1406 } else if (!strcmp(options->value.s, "1.1")) {
1407 version = 3;
1408 } else {
1409 fprintf(stderr, "Invalid compatibility level: '%s'\n",
1410 options->value.s);
1411 return -EINVAL;
1412 }
1413 } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
1414 flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
1415 }
1416 options++;
1417 }
1418
1419 if (backing_file && prealloc) {
1420 fprintf(stderr, "Backing file and preallocation cannot be used at "
1421 "the same time\n");
1422 return -EINVAL;
1423 }
1424
1425 if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
1426 fprintf(stderr, "Lazy refcounts only supported with compatibility "
1427 "level 1.1 and above (use compat=1.1 or greater)\n");
1428 return -EINVAL;
1429 }
1430
1431 return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
1432 cluster_size, prealloc, options, version);
1433 }
1434
1435 static int qcow2_make_empty(BlockDriverState *bs)
1436 {
1437 #if 0
1438 /* XXX: not correct */
1439 BDRVQcowState *s = bs->opaque;
1440 uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1441 int ret;
1442
1443 memset(s->l1_table, 0, l1_length);
1444 if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
1445 return -1;
1446 ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
1447 if (ret < 0)
1448 return ret;
1449
1450 l2_cache_reset(bs);
1451 #endif
1452 return 0;
1453 }
1454
1455 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
1456 int64_t sector_num, int nb_sectors)
1457 {
1458 int ret;
1459 BDRVQcowState *s = bs->opaque;
1460
1461 /* Emulate misaligned zero writes */
1462 if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
1463 return -ENOTSUP;
1464 }
1465
1466 /* Whatever is left can use real zero clusters */
1467 qemu_co_mutex_lock(&s->lock);
1468 ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1469 nb_sectors);
1470 qemu_co_mutex_unlock(&s->lock);
1471
1472 return ret;
1473 }
1474
1475 static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
1476 int64_t sector_num, int nb_sectors)
1477 {
1478 int ret;
1479 BDRVQcowState *s = bs->opaque;
1480
1481 qemu_co_mutex_lock(&s->lock);
1482 ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1483 nb_sectors);
1484 qemu_co_mutex_unlock(&s->lock);
1485 return ret;
1486 }
1487
1488 static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
1489 {
1490 BDRVQcowState *s = bs->opaque;
1491 int64_t new_l1_size;
1492 int ret;
1493
1494 if (offset & 511) {
1495 error_report("The new size must be a multiple of 512");
1496 return -EINVAL;
1497 }
1498
1499 /* cannot proceed if image has snapshots */
1500 if (s->nb_snapshots) {
1501 error_report("Can't resize an image which has snapshots");
1502 return -ENOTSUP;
1503 }
1504
1505 /* shrinking is currently not supported */
1506 if (offset < bs->total_sectors * 512) {
1507 error_report("qcow2 doesn't support shrinking images yet");
1508 return -ENOTSUP;
1509 }
1510
1511 new_l1_size = size_to_l1(s, offset);
1512 ret = qcow2_grow_l1_table(bs, new_l1_size, true);
1513 if (ret < 0) {
1514 return ret;
1515 }
1516
1517 /* write updated header.size */
1518 offset = cpu_to_be64(offset);
1519 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
1520 &offset, sizeof(uint64_t));
1521 if (ret < 0) {
1522 return ret;
1523 }
1524
1525 s->l1_vm_state_index = new_l1_size;
1526 return 0;
1527 }
1528
1529 /* XXX: put compressed sectors first, then all the cluster aligned
1530 tables to avoid losing bytes in alignment */
1531 static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
1532 const uint8_t *buf, int nb_sectors)
1533 {
1534 BDRVQcowState *s = bs->opaque;
1535 z_stream strm;
1536 int ret, out_len;
1537 uint8_t *out_buf;
1538 uint64_t cluster_offset;
1539
1540 if (nb_sectors == 0) {
1541 /* align end of file to a sector boundary to ease reading with
1542 sector based I/Os */
1543 cluster_offset = bdrv_getlength(bs->file);
1544 cluster_offset = (cluster_offset + 511) & ~511;
1545 bdrv_truncate(bs->file, cluster_offset);
1546 return 0;
1547 }
1548
1549 if (nb_sectors != s->cluster_sectors) {
1550 ret = -EINVAL;
1551
1552 /* Zero-pad last write if image size is not cluster aligned */
1553 if (sector_num + nb_sectors == bs->total_sectors &&
1554 nb_sectors < s->cluster_sectors) {
1555 uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
1556 memset(pad_buf, 0, s->cluster_size);
1557 memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
1558 ret = qcow2_write_compressed(bs, sector_num,
1559 pad_buf, s->cluster_sectors);
1560 qemu_vfree(pad_buf);
1561 }
1562 return ret;
1563 }
1564
1565 out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1566
1567 /* best compression, small window, no zlib header */
1568 memset(&strm, 0, sizeof(strm));
1569 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1570 Z_DEFLATED, -12,
1571 9, Z_DEFAULT_STRATEGY);
1572 if (ret != 0) {
1573 ret = -EINVAL;
1574 goto fail;
1575 }
1576
1577 strm.avail_in = s->cluster_size;
1578 strm.next_in = (uint8_t *)buf;
1579 strm.avail_out = s->cluster_size;
1580 strm.next_out = out_buf;
1581
1582 ret = deflate(&strm, Z_FINISH);
1583 if (ret != Z_STREAM_END && ret != Z_OK) {
1584 deflateEnd(&strm);
1585 ret = -EINVAL;
1586 goto fail;
1587 }
1588 out_len = strm.next_out - out_buf;
1589
1590 deflateEnd(&strm);
1591
1592 if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1593 /* could not compress: write normal cluster */
1594 ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
1595 if (ret < 0) {
1596 goto fail;
1597 }
1598 } else {
1599 cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
1600 sector_num << 9, out_len);
1601 if (!cluster_offset) {
1602 ret = -EIO;
1603 goto fail;
1604 }
1605 cluster_offset &= s->cluster_offset_mask;
1606 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1607 ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
1608 if (ret < 0) {
1609 goto fail;
1610 }
1611 }
1612
1613 ret = 0;
1614 fail:
1615 g_free(out_buf);
1616 return ret;
1617 }
1618
1619 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
1620 {
1621 BDRVQcowState *s = bs->opaque;
1622 int ret;
1623
1624 qemu_co_mutex_lock(&s->lock);
1625 ret = qcow2_cache_flush(bs, s->l2_table_cache);
1626 if (ret < 0) {
1627 qemu_co_mutex_unlock(&s->lock);
1628 return ret;
1629 }
1630
1631 if (qcow2_need_accurate_refcounts(s)) {
1632 ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1633 if (ret < 0) {
1634 qemu_co_mutex_unlock(&s->lock);
1635 return ret;
1636 }
1637 }
1638 qemu_co_mutex_unlock(&s->lock);
1639
1640 return 0;
1641 }
1642
1643 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
1644 {
1645 return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
1646 }
1647
1648 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1649 {
1650 BDRVQcowState *s = bs->opaque;
1651 bdi->cluster_size = s->cluster_size;
1652 bdi->vm_state_offset = qcow2_vm_state_offset(s);
1653 return 0;
1654 }
1655
#if 0
/*
 * Debug helper (compiled out): print runs of consecutive clusters of the
 * image file that share the same refcount.
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);

    k = 0;
    while (k < nb_clusters) {
        /* k1 remembers where the current run starts */
        k1 = k;
        refcount = get_refcount(bs, k);

        /* Advance k past every cluster with the same refcount */
        do {
            k++;
        } while (k < nb_clusters && get_refcount(bs, k) == refcount);

        /* NOTE(review): the first value printed is k, the index just past
         * the run, not k1 (its start) - confirm this is intended. */
        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
               k - k1);
    }
}
#endif
1676
1677 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
1678 int64_t pos)
1679 {
1680 BDRVQcowState *s = bs->opaque;
1681 int growable = bs->growable;
1682 int ret;
1683
1684 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
1685 bs->growable = 1;
1686 ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
1687 bs->growable = growable;
1688
1689 return ret;
1690 }
1691
1692 static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1693 int64_t pos, int size)
1694 {
1695 BDRVQcowState *s = bs->opaque;
1696 int growable = bs->growable;
1697 int ret;
1698
1699 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
1700 bs->growable = 1;
1701 ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
1702 bs->growable = growable;
1703
1704 return ret;
1705 }
1706
1707 static QEMUOptionParameter qcow2_create_options[] = {
1708 {
1709 .name = BLOCK_OPT_SIZE,
1710 .type = OPT_SIZE,
1711 .help = "Virtual disk size"
1712 },
1713 {
1714 .name = BLOCK_OPT_COMPAT_LEVEL,
1715 .type = OPT_STRING,
1716 .help = "Compatibility level (0.10 or 1.1)"
1717 },
1718 {
1719 .name = BLOCK_OPT_BACKING_FILE,
1720 .type = OPT_STRING,
1721 .help = "File name of a base image"
1722 },
1723 {
1724 .name = BLOCK_OPT_BACKING_FMT,
1725 .type = OPT_STRING,
1726 .help = "Image format of the base image"
1727 },
1728 {
1729 .name = BLOCK_OPT_ENCRYPT,
1730 .type = OPT_FLAG,
1731 .help = "Encrypt the image"
1732 },
1733 {
1734 .name = BLOCK_OPT_CLUSTER_SIZE,
1735 .type = OPT_SIZE,
1736 .help = "qcow2 cluster size",
1737 .value = { .n = DEFAULT_CLUSTER_SIZE },
1738 },
1739 {
1740 .name = BLOCK_OPT_PREALLOC,
1741 .type = OPT_STRING,
1742 .help = "Preallocation mode (allowed values: off, metadata)"
1743 },
1744 {
1745 .name = BLOCK_OPT_LAZY_REFCOUNTS,
1746 .type = OPT_FLAG,
1747 .help = "Postpone refcount updates",
1748 },
1749 { NULL }
1750 };
1751
1752 static BlockDriver bdrv_qcow2 = {
1753 .format_name = "qcow2",
1754 .instance_size = sizeof(BDRVQcowState),
1755 .bdrv_probe = qcow2_probe,
1756 .bdrv_open = qcow2_open,
1757 .bdrv_close = qcow2_close,
1758 .bdrv_reopen_prepare = qcow2_reopen_prepare,
1759 .bdrv_create = qcow2_create,
1760 .bdrv_co_is_allocated = qcow2_co_is_allocated,
1761 .bdrv_set_key = qcow2_set_key,
1762 .bdrv_make_empty = qcow2_make_empty,
1763
1764 .bdrv_co_readv = qcow2_co_readv,
1765 .bdrv_co_writev = qcow2_co_writev,
1766 .bdrv_co_flush_to_os = qcow2_co_flush_to_os,
1767
1768 .bdrv_co_write_zeroes = qcow2_co_write_zeroes,
1769 .bdrv_co_discard = qcow2_co_discard,
1770 .bdrv_truncate = qcow2_truncate,
1771 .bdrv_write_compressed = qcow2_write_compressed,
1772
1773 .bdrv_snapshot_create = qcow2_snapshot_create,
1774 .bdrv_snapshot_goto = qcow2_snapshot_goto,
1775 .bdrv_snapshot_delete = qcow2_snapshot_delete,
1776 .bdrv_snapshot_list = qcow2_snapshot_list,
1777 .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
1778 .bdrv_get_info = qcow2_get_info,
1779
1780 .bdrv_save_vmstate = qcow2_save_vmstate,
1781 .bdrv_load_vmstate = qcow2_load_vmstate,
1782
1783 .bdrv_change_backing_file = qcow2_change_backing_file,
1784
1785 .bdrv_invalidate_cache = qcow2_invalidate_cache,
1786
1787 .create_options = qcow2_create_options,
1788 .bdrv_check = qcow2_check,
1789 };
1790
1791 static void bdrv_qcow2_init(void)
1792 {
1793 bdrv_register(&bdrv_qcow2);
1794 }
1795
1796 block_init(bdrv_qcow2_init);