]> git.proxmox.com Git - qemu.git/blobdiff - block/qcow2.c
block: move include files to include/block/
[qemu.git] / block / qcow2.c
index eb5ea485d97a71d5eccc03c594cf388a0e829dad..205d910a52d018ce7e502d9dd9cd8ecb62111dd9 100644 (file)
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "module.h"
 #include <zlib.h>
-#include "aes.h"
+#include "block/aes.h"
 #include "block/qcow2.h"
 #include "qemu-error.h"
-#include "qerror.h"
+#include "qapi/qmp/qerror.h"
+#include "trace.h"
 
 /*
   Differences with QCOW:
@@ -51,8 +52,10 @@ typedef struct {
     uint32_t magic;
     uint32_t len;
 } QCowExtension;
+
 #define  QCOW2_EXT_MAGIC_END 0
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
 
 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
@@ -60,7 +63,7 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 
     if (buf_size >= sizeof(QCowHeader) &&
         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
-        be32_to_cpu(cow_header->version) >= QCOW_VERSION)
+        be32_to_cpu(cow_header->version) >= 2)
         return 100;
     else
         return 0;
@@ -75,7 +78,7 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
  * return 0 upon success, non-0 otherwise
  */
 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
-                                 uint64_t end_offset)
+                                 uint64_t end_offset, void **p_feature_table)
 {
     BDRVQcowState *s = bs->opaque;
     QCowExtension ext;
@@ -133,6 +136,18 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
 #endif
             break;
 
+        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
+            if (p_feature_table != NULL) {
+                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
+                ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
+                if (ret < 0) {
+                    return ret;
+                }
+
+                *p_feature_table = feature_table;
+            }
+            break;
+
         default:
             /* unknown magic - save it in case we need to rewrite the header */
             {
@@ -168,13 +183,114 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs)
     }
 }
 
+/*
+ * Formats a printf-style description of an unsupported qcow2 feature and
+ * reports it through qerror_report() as QERR_UNKNOWN_BLOCK_FORMAT_FEATURE
+ * for the device named in bs.
+ *
+ * The formatted text lives in a 64-byte stack buffer; vsnprintf() cuts off
+ * whatever does not fit.
+ */
+static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
+    const char *fmt, ...)
+{
+    va_list args;
+    char description[64];
+
+    va_start(args, fmt);
+    vsnprintf(description, sizeof(description), fmt, args);
+    va_end(args);
+
+    qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2",
+                  description);
+}
+
+/*
+ * Reports every incompatible feature bit set in mask.
+ *
+ * table may be NULL.  Otherwise it is walked until an entry whose name
+ * starts with a NUL byte.  Each entry of type QCOW2_FEAT_TYPE_INCOMPATIBLE
+ * whose bit is set in mask is reported by name (at most 46 characters) and
+ * then removed from mask.  Bits that are still set after the walk have no
+ * name in the table and are reported together as one hexadecimal value.
+ */
+static void report_unsupported_feature(BlockDriverState *bs,
+    Qcow2Feature *table, uint64_t mask)
+{
+    while (table && table->name[0] != '\0') {
+        /*
+         * Bit numbers of 64 and above cannot correspond to anything in the
+         * 64-bit mask; skipping them also keeps the shift below defined.
+         */
+        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE && table->bit < 64) {
+            /* Shift in 64 bits: an int-typed 1 cannot reach bits 31..63 */
+            uint64_t bit_mask = (uint64_t)1 << table->bit;
+
+            if (mask & bit_mask) {
+                report_unsupported(bs, "%.46s", table->name);
+                mask &= ~bit_mask;
+            }
+        }
+        table++;
+    }
+
+    if (mask) {
+        report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
+    }
+}
+
+/*
+ * Sets the dirty bit in the image header and flushes afterwards, unless the
+ * bit is already set in memory.
+ *
+ * The in-memory incompatible_features field only gains the dirty bit after
+ * both the header write and the following flush have succeeded.  Therefore
+ * it is not required to check the return value of this function.
+ *
+ * Returns 0 on success (or if the image was already dirty), otherwise the
+ * negative value returned by bdrv_pwrite() or bdrv_flush().
+ */
+int qcow2_mark_dirty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t be_features;
+    int ret;
+
+    assert(s->qcow_version >= 3);
+
+    if (!(s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
+        be_features = cpu_to_be64(s->incompatible_features |
+                                  QCOW2_INCOMPAT_DIRTY);
+
+        ret = bdrv_pwrite(bs->file,
+                          offsetof(QCowHeader, incompatible_features),
+                          &be_features, sizeof(be_features));
+        if (ret >= 0) {
+            ret = bdrv_flush(bs->file);
+        }
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* The header is on disk now, so the image counts as dirty */
+        s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
+    }
+
+    return 0;
+}
+
+/*
+ * Clears the dirty bit, flushing bs beforehand.  Does nothing and returns 0
+ * when the dirty bit is not set in memory.
+ *
+ * Only call this function when there are no pending requests; it does not
+ * guard against concurrent requests dirtying the image.
+ *
+ * Returns the negative value from bdrv_flush() if the flush fails, otherwise
+ * the result of qcow2_update_header().  The in-memory bit is cleared before
+ * the header is rewritten.
+ */
+static int qcow2_mark_clean(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    if (!(s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
+        return 0;
+    }
+
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
+    return qcow2_update_header(bs);
+}
+
+/*
+ * Runs qcow2_check_refcounts() on the image.  If that succeeds, a fix mode
+ * was requested, and the result shows neither check errors nor corruptions,
+ * the image is additionally marked clean.
+ *
+ * Returns a negative value when the refcount check fails, the result of
+ * qcow2_mark_clean() when that is called, and otherwise the value returned
+ * by the refcount check.
+ */
+static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
+                       BdrvCheckMode fix)
+{
+    int ret = qcow2_check_refcounts(bs, result, fix);
+
+    if (ret >= 0 && fix &&
+        result->check_errors == 0 && result->corruptions == 0) {
+        ret = qcow2_mark_clean(bs);
+    }
+
+    return ret;
+}
+
 static int qcow2_open(BlockDriverState *bs, int flags)
 {
     BDRVQcowState *s = bs->opaque;
     int len, i, ret = 0;
     QCowHeader header;
     uint64_t ext_end;
-    bool writethrough;
 
     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
     if (ret < 0) {
@@ -198,14 +314,69 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         ret = -EINVAL;
         goto fail;
     }
-    if (header.version != QCOW_VERSION) {
-        char version[64];
-        snprintf(version, sizeof(version), "QCOW version %d", header.version);
-        qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
-            bs->device_name, "qcow2", version);
+    if (header.version < 2 || header.version > 3) {
+        report_unsupported(bs, "QCOW version %d", header.version);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    s->qcow_version = header.version;
+
+    /* Initialise version 3 header fields */
+    if (header.version == 2) {
+        header.incompatible_features    = 0;
+        header.compatible_features      = 0;
+        header.autoclear_features       = 0;
+        header.refcount_order           = 4;
+        header.header_length            = 72;
+    } else {
+        be64_to_cpus(&header.incompatible_features);
+        be64_to_cpus(&header.compatible_features);
+        be64_to_cpus(&header.autoclear_features);
+        be32_to_cpus(&header.refcount_order);
+        be32_to_cpus(&header.header_length);
+    }
+
+    if (header.header_length > sizeof(header)) {
+        s->unknown_header_fields_size = header.header_length - sizeof(header);
+        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
+        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
+                         s->unknown_header_fields_size);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    if (header.backing_file_offset) {
+        ext_end = header.backing_file_offset;
+    } else {
+        ext_end = 1 << header.cluster_bits;
+    }
+
+    /* Handle feature bits */
+    s->incompatible_features    = header.incompatible_features;
+    s->compatible_features      = header.compatible_features;
+    s->autoclear_features       = header.autoclear_features;
+
+    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
+        void *feature_table = NULL;
+        qcow2_read_extensions(bs, header.header_length, ext_end,
+                              &feature_table);
+        report_unsupported_feature(bs, feature_table,
+                                   s->incompatible_features &
+                                   ~QCOW2_INCOMPAT_MASK);
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    /* Check support for various header values */
+    if (header.refcount_order != 4) {
+        report_unsupported(bs, "%d bit reference counts",
+                           1 << header.refcount_order);
         ret = -ENOTSUP;
         goto fail;
     }
+
     if (header.cluster_bits < MIN_CLUSTER_BITS ||
         header.cluster_bits > MAX_CLUSTER_BITS) {
         ret = -EINVAL;
@@ -259,10 +430,8 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     }
 
     /* alloc L2 table/refcount block cache */
-    writethrough = ((flags & BDRV_O_CACHE_WB) == 0);
-    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough);
-    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE,
-        writethrough);
+    s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
+    s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
 
     s->cluster_cache = g_malloc(s->cluster_size);
     /* one more sector for decompressed data alignment */
@@ -279,12 +448,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     QLIST_INIT(&s->cluster_allocs);
 
     /* read qcow2 extensions */
-    if (header.backing_file_offset) {
-        ext_end = header.backing_file_offset;
-    } else {
-        ext_end = s->cluster_size;
-    }
-    if (qcow2_read_extensions(bs, sizeof(header), ext_end)) {
+    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
         ret = -EINVAL;
         goto fail;
     }
@@ -308,18 +472,39 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         goto fail;
     }
 
+    /* Clear unknown autoclear feature bits */
+    if (!bs->read_only && s->autoclear_features != 0) {
+        s->autoclear_features = 0;
+        ret = qcow2_update_header(bs);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
     /* Initialise locks */
     qemu_co_mutex_init(&s->lock);
 
+    /* Repair image if dirty */
+    if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
+        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
+        BdrvCheckResult result = {0};
+
+        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
 #ifdef DEBUG_ALLOC
     {
         BdrvCheckResult result = {0};
-        qcow2_check_refcounts(bs, &result);
+        qcow2_check_refcounts(bs, &result, 0);
     }
 #endif
     return ret;
 
  fail:
+    g_free(s->unknown_header_fields);
     cleanup_unknown_header_ext(bs);
     qcow2_free_snapshots(bs);
     qcow2_refcount_close(bs);
@@ -374,6 +559,14 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
     return 0;
 }
 
+/* Reopen-prepare hook for qcow2 (installed as .bdrv_reopen_prepare).
+ * Nothing has to be done for a qcow2 reopen, so this stub ignores all of
+ * its arguments and just returns success (0). */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+                                BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -406,7 +599,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
     else
         n1 = bs->total_sectors - sector_num;
 
-    qemu_iovec_memset_skip(qiov, 0, 512 * (nb_sectors - n1), 512 * n1);
+    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
 
     return n1;
 }
@@ -445,10 +638,11 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
 
         qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
             cur_nr_sectors * 512);
 
-        if (!cluster_offset) {
+        switch (ret) {
+        case QCOW2_CLUSTER_UNALLOCATED:
 
             if (bs->backing_hd) {
                 /* read from the base image */
@@ -466,19 +660,31 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                 }
             } else {
                 /* Note: in this case, no need to wait */
-                qemu_iovec_memset(&hd_qiov, 0, 512 * cur_nr_sectors);
+                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+            }
+            break;
+
+        case QCOW2_CLUSTER_ZERO:
+            if (s->qcow_version < 3) {
+                ret = -EIO;
+                goto fail;
             }
-        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
+            break;
+
+        case QCOW2_CLUSTER_COMPRESSED:
             /* add AIO support for compressed blocks ? */
             ret = qcow2_decompress_cluster(bs, cluster_offset);
             if (ret < 0) {
                 goto fail;
             }
 
-            qemu_iovec_from_buffer(&hd_qiov,
+            qemu_iovec_from_buf(&hd_qiov, 0,
                 s->cluster_cache + index_in_cluster * 512,
                 512 * cur_nr_sectors);
-        } else {
+            break;
+
+        case QCOW2_CLUSTER_NORMAL:
             if ((cluster_offset & 511) != 0) {
                 ret = -EIO;
                 goto fail;
@@ -513,12 +719,15 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
             if (s->crypt_method) {
                 qcow2_encrypt_sectors(s, sector_num,  cluster_data,
                     cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
-                qemu_iovec_reset(&hd_qiov);
-                qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
-                    cur_nr_sectors * 512);
-                qemu_iovec_from_buffer(&hd_qiov, cluster_data,
-                    512 * cur_nr_sectors);
+                qemu_iovec_from_buf(qiov, bytes_done,
+                    cluster_data, 512 * cur_nr_sectors);
             }
+            break;
+
+        default:
+            g_assert_not_reached();
+            ret = -EIO;
+            goto fail;
         }
 
         remaining_sectors -= cur_nr_sectors;
@@ -536,21 +745,6 @@ fail:
     return ret;
 }
 
-static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
-{
-    /* Take the request off the list of running requests */
-    if (m->nb_clusters != 0) {
-        QLIST_REMOVE(m, next_in_flight);
-    }
-
-    /* Restart all dependent requests */
-    if (!qemu_co_queue_empty(&m->dependent_requests)) {
-        qemu_co_mutex_unlock(&s->lock);
-        qemu_co_queue_restart_all(&m->dependent_requests);
-        qemu_co_mutex_lock(&s->lock);
-    }
-}
-
 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                            int64_t sector_num,
                            int remaining_sectors,
@@ -565,11 +759,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
     QEMUIOVector hd_qiov;
     uint64_t bytes_done = 0;
     uint8_t *cluster_data = NULL;
-    QCowL2Meta l2meta = {
-        .nb_clusters = 0,
-    };
+    QCowL2Meta *l2meta;
 
-    qemu_co_queue_init(&l2meta.dependent_requests);
+    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
+                                 remaining_sectors);
 
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
@@ -579,6 +772,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
     while (remaining_sectors != 0) {
 
+        l2meta = NULL;
+
+        trace_qcow2_writev_start_part(qemu_coroutine_self());
         index_in_cluster = sector_num & (s->cluster_sectors - 1);
         n_end = index_in_cluster + remaining_sectors;
         if (s->crypt_method &&
@@ -587,16 +783,15 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
         }
 
         ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-            index_in_cluster, n_end, &cur_nr_sectors, &l2meta);
+            index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
         if (ret < 0) {
             goto fail;
         }
 
-        cluster_offset = l2meta.cluster_offset;
         assert((cluster_offset & 511) == 0);
 
         qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_copy(&hd_qiov, qiov, bytes_done,
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
             cur_nr_sectors * 512);
 
         if (s->crypt_method) {
@@ -607,7 +802,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
 
             assert(hd_qiov.size <=
                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
-            qemu_iovec_to_buffer(&hd_qiov, cluster_data);
+            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
 
             qcow2_encrypt_sectors(s, sector_num, cluster_data,
                 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
@@ -617,8 +812,10 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                 cur_nr_sectors * 512);
         }
 
-        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         qemu_co_mutex_unlock(&s->lock);
+        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+        trace_qcow2_writev_data(qemu_coroutine_self(),
+                                (cluster_offset >> 9) + index_in_cluster);
         ret = bdrv_co_writev(bs->file,
                              (cluster_offset >> 9) + index_in_cluster,
                              cur_nr_sectors, &hd_qiov);
@@ -627,26 +824,46 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             goto fail;
         }
 
-        ret = qcow2_alloc_cluster_link_l2(bs, &l2meta);
-        if (ret < 0) {
-            goto fail;
-        }
+        if (l2meta != NULL) {
+            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+            if (ret < 0) {
+                goto fail;
+            }
+
+            /* Take the request off the list of running requests */
+            if (l2meta->nb_clusters != 0) {
+                QLIST_REMOVE(l2meta, next_in_flight);
+            }
 
-        run_dependent_requests(s, &l2meta);
+            qemu_co_mutex_unlock(&s->lock);
+            qemu_co_queue_restart_all(&l2meta->dependent_requests);
+            qemu_co_mutex_lock(&s->lock);
+
+            g_free(l2meta);
+            l2meta = NULL;
+        }
 
         remaining_sectors -= cur_nr_sectors;
         sector_num += cur_nr_sectors;
         bytes_done += cur_nr_sectors * 512;
+        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
     }
     ret = 0;
 
 fail:
-    run_dependent_requests(s, &l2meta);
-
     qemu_co_mutex_unlock(&s->lock);
 
+    if (l2meta != NULL) {
+        if (l2meta->nb_clusters != 0) {
+            QLIST_REMOVE(l2meta, next_in_flight);
+        }
+        qemu_co_queue_restart_all(&l2meta->dependent_requests);
+        g_free(l2meta);
+    }
+
     qemu_iovec_destroy(&hd_qiov);
     qemu_vfree(cluster_data);
+    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
 
     return ret;
 }
@@ -659,10 +876,14 @@ static void qcow2_close(BlockDriverState *bs)
     qcow2_cache_flush(bs, s->l2_table_cache);
     qcow2_cache_flush(bs, s->refcount_block_cache);
 
+    qcow2_mark_clean(bs);
+
     qcow2_cache_destroy(bs, s->l2_table_cache);
     qcow2_cache_destroy(bs, s->refcount_block_cache);
 
+    g_free(s->unknown_header_fields);
     cleanup_unknown_header_ext(bs);
+
     g_free(s->cluster_cache);
     qemu_vfree(s->cluster_data);
     qcow2_refcount_close(bs);
@@ -736,10 +957,10 @@ int qcow2_update_header(BlockDriverState *bs)
     int ret;
     uint64_t total_size;
     uint32_t refcount_table_clusters;
+    size_t header_length;
     Qcow2UnknownHeaderExtension *uext;
 
     buf = qemu_blockalign(bs, buflen);
-    memset(buf, 0, s->cluster_size);
 
     /* Header structure */
     header = (QCowHeader*) buf;
@@ -749,12 +970,14 @@ int qcow2_update_header(BlockDriverState *bs)
         goto fail;
     }
 
+    header_length = sizeof(*header) + s->unknown_header_fields_size;
     total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
     refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
 
     *header = (QCowHeader) {
+        /* Version 2 fields */
         .magic                  = cpu_to_be32(QCOW_MAGIC),
-        .version                = cpu_to_be32(QCOW_VERSION),
+        .version                = cpu_to_be32(s->qcow_version),
         .backing_file_offset    = 0,
         .backing_file_size      = 0,
         .cluster_bits           = cpu_to_be32(s->cluster_bits),
@@ -766,10 +989,43 @@ int qcow2_update_header(BlockDriverState *bs)
         .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
         .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
         .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
+
+        /* Version 3 fields */
+        .incompatible_features  = cpu_to_be64(s->incompatible_features),
+        .compatible_features    = cpu_to_be64(s->compatible_features),
+        .autoclear_features     = cpu_to_be64(s->autoclear_features),
+        .refcount_order         = cpu_to_be32(3 + REFCOUNT_SHIFT),
+        .header_length          = cpu_to_be32(header_length),
     };
 
-    buf += sizeof(*header);
-    buflen -= sizeof(*header);
+    /* For older versions, write a shorter header */
+    switch (s->qcow_version) {
+    case 2:
+        ret = offsetof(QCowHeader, incompatible_features);
+        break;
+    case 3:
+        ret = sizeof(*header);
+        break;
+    default:
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    buf += ret;
+    buflen -= ret;
+    memset(buf, 0, buflen);
+
+    /* Preserve any unknown field in the header */
+    if (s->unknown_header_fields_size) {
+        if (buflen < s->unknown_header_fields_size) {
+            ret = -ENOSPC;
+            goto fail;
+        }
+
+        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
+        buf += s->unknown_header_fields_size;
+        buflen -= s->unknown_header_fields_size;
+    }
 
     /* Backing file format header extension */
     if (*bs->backing_format) {
@@ -784,6 +1040,28 @@ int qcow2_update_header(BlockDriverState *bs)
         buflen -= ret;
     }
 
+    /* Feature table */
+    Qcow2Feature features[] = {
+        {
+            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
+            .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
+            .name = "dirty bit",
+        },
+        {
+            .type = QCOW2_FEAT_TYPE_COMPATIBLE,
+            .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
+            .name = "lazy refcounts",
+        },
+    };
+
+    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
+                         features, sizeof(features), buflen);
+    if (ret < 0) {
+        goto fail;
+    }
+    buf += ret;
+    buflen -= ret;
+
     /* Keep unknown header extensions */
     QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
         ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
@@ -813,6 +1091,7 @@ int qcow2_update_header(BlockDriverState *bs)
             goto fail;
         }
 
+        /* Using strncpy is ok here, since buf is not NUL-terminated. */
         strncpy(buf, bs->backing_file, buflen);
 
         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
@@ -834,11 +1113,6 @@ fail:
 static int qcow2_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt)
 {
-    /* Backing file format doesn't make sense without a backing file */
-    if (backing_fmt && !backing_file) {
-        return -EINVAL;
-    }
-
     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
     pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
 
@@ -849,31 +1123,33 @@ static int preallocate(BlockDriverState *bs)
 {
     uint64_t nb_sectors;
     uint64_t offset;
+    uint64_t host_offset = 0;
     int num;
     int ret;
-    QCowL2Meta meta;
+    QCowL2Meta *meta;
 
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
-    qemu_co_queue_init(&meta.dependent_requests);
-    meta.cluster_offset = 0;
 
     while (nb_sectors) {
         num = MIN(nb_sectors, INT_MAX >> 9);
-        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
+        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
+                                         &host_offset, &meta);
         if (ret < 0) {
             return ret;
         }
 
-        ret = qcow2_alloc_cluster_link_l2(bs, &meta);
+        ret = qcow2_alloc_cluster_link_l2(bs, meta);
         if (ret < 0) {
-            qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
+            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters);
             return ret;
         }
 
         /* There are no dependent requests, but we need to remove our request
          * from the list of in-flight requests */
-        run_dependent_requests(bs->opaque, &meta);
+        if (meta != NULL) {
+            QLIST_REMOVE(meta, next_in_flight);
+        }
 
         /* TODO Preallocate data if requested */
 
@@ -886,10 +1162,10 @@ static int preallocate(BlockDriverState *bs)
      * all of the allocated clusters (otherwise we get failing reads after
      * EOF). Extend the image to the last allocated sector.
      */
-    if (meta.cluster_offset != 0) {
+    if (host_offset != 0) {
         uint8_t buf[512];
         memset(buf, 0, 512);
-        ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
+        ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
         if (ret < 0) {
             return ret;
         }
@@ -901,7 +1177,7 @@ static int preallocate(BlockDriverState *bs)
 static int qcow2_create2(const char *filename, int64_t total_size,
                          const char *backing_file, const char *backing_format,
                          int flags, size_t cluster_size, int prealloc,
-                         QEMUOptionParameter *options)
+                         QEMUOptionParameter *options, int version)
 {
     /* Calculate cluster_bits */
     int cluster_bits;
@@ -945,13 +1221,15 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     /* Write the header */
     memset(&header, 0, sizeof(header));
     header.magic = cpu_to_be32(QCOW_MAGIC);
-    header.version = cpu_to_be32(QCOW_VERSION);
+    header.version = cpu_to_be32(version);
     header.cluster_bits = cpu_to_be32(cluster_bits);
     header.size = cpu_to_be64(0);
     header.l1_table_offset = cpu_to_be64(0);
     header.l1_size = cpu_to_be32(0);
     header.refcount_table_offset = cpu_to_be64(cluster_size);
     header.refcount_table_clusters = cpu_to_be32(1);
+    header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
+    header.header_length = cpu_to_be32(sizeof(header));
 
     if (flags & BLOCK_FLAG_ENCRYPT) {
         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
@@ -959,6 +1237,11 @@ static int qcow2_create2(const char *filename, int64_t total_size,
         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
     }
 
+    if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
+        header.compatible_features |=
+            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
+    }
+
     ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
     if (ret < 0) {
         goto out;
@@ -1013,7 +1296,10 @@ static int qcow2_create2(const char *filename, int64_t total_size,
 
     /* And if we're supposed to preallocate metadata, do that now */
     if (prealloc) {
+        BDRVQcowState *s = bs->opaque;
+        qemu_co_mutex_lock(&s->lock);
         ret = preallocate(bs);
+        qemu_co_mutex_unlock(&s->lock);
         if (ret < 0) {
             goto out;
         }
@@ -1033,6 +1319,7 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options)
     int flags = 0;
     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
     int prealloc = 0;
+    int version = 2;
 
     /* Read out options */
     while (options && options->name) {
@@ -1058,6 +1345,18 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options)
                     options->value.s);
                 return -EINVAL;
             }
+        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
+            if (!options->value.s || !strcmp(options->value.s, "0.10")) {
+                version = 2;
+            } else if (!strcmp(options->value.s, "1.1")) {
+                version = 3;
+            } else {
+                fprintf(stderr, "Invalid compatibility level: '%s'\n",
+                    options->value.s);
+                return -EINVAL;
+            }
+        } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+            flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
         }
         options++;
     }
@@ -1068,8 +1367,14 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options)
         return -EINVAL;
     }
 
+    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
+        fprintf(stderr, "Lazy refcounts only supported with compatibility "
+                "level 1.1 and above (use compat=1.1 or greater)\n");
+        return -EINVAL;
+    }
+
     return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
-                         cluster_size, prealloc, options);
+                         cluster_size, prealloc, options, version);
 }
 
 static int qcow2_make_empty(BlockDriverState *bs)
@@ -1092,6 +1397,26 @@ static int qcow2_make_empty(BlockDriverState *bs)
     return 0;
 }
 
+/*
+ * Zero-write handler for qcow2.
+ *
+ * Requests whose start sector or length is not a multiple of
+ * s->cluster_sectors are rejected with -ENOTSUP.  Everything else is passed
+ * to qcow2_zero_clusters() while holding s->lock, and its result is
+ * returned.
+ */
+static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    /*
+     * Misaligned zero writes are not handled here; presumably -ENOTSUP makes
+     * the caller emulate them -- confirm against the generic block layer.
+     */
+    if ((sector_num % s->cluster_sectors) != 0 ||
+        (nb_sectors % s->cluster_sectors) != 0) {
+        return -ENOTSUP;
+    }
+
+    /* Whatever is left can use real zero clusters */
+    qemu_co_mutex_lock(&s->lock);
+    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
+    qemu_co_mutex_unlock(&s->lock);
+
+    return ret;
+}
+
 static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors)
 {
@@ -1111,16 +1436,19 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
     int ret, new_l1_size;
 
     if (offset & 511) {
+        error_report("The new size must be a multiple of 512");
         return -EINVAL;
     }
 
     /* cannot proceed if image has snapshots */
     if (s->nb_snapshots) {
+        error_report("Can't resize an image which has snapshots");
         return -ENOTSUP;
     }
 
     /* shrinking is currently not supported */
     if (offset < bs->total_sectors * 512) {
+        error_report("qcow2 doesn't support shrinking images yet");
         return -ENOTSUP;
     }
 
@@ -1231,21 +1559,18 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
         return ret;
     }
 
-    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
-    if (ret < 0) {
-        qemu_co_mutex_unlock(&s->lock);
-        return ret;
+    if (qcow2_need_accurate_refcounts(s)) {
+        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+        if (ret < 0) {
+            qemu_co_mutex_unlock(&s->lock);
+            return ret;
+        }
     }
     qemu_co_mutex_unlock(&s->lock);
 
     return 0;
 }
 
-static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
 {
        return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
@@ -1259,12 +1584,6 @@ static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     return 0;
 }
 
-
-static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result)
-{
-    return qcow2_check_refcounts(bs, result);
-}
-
 #if 0
 static void dump_refcounts(BlockDriverState *bs)
 {
@@ -1322,6 +1641,11 @@ static QEMUOptionParameter qcow2_create_options[] = {
         .type = OPT_SIZE,
         .help = "Virtual disk size"
     },
+    {
+        .name = BLOCK_OPT_COMPAT_LEVEL,
+        .type = OPT_STRING,
+        .help = "Compatibility level (0.10 or 1.1)"
+    },
     {
         .name = BLOCK_OPT_BACKING_FILE,
         .type = OPT_STRING,
@@ -1348,6 +1672,11 @@ static QEMUOptionParameter qcow2_create_options[] = {
         .type = OPT_STRING,
         .help = "Preallocation mode (allowed values: off, metadata)"
     },
+    {
+        .name = BLOCK_OPT_LAZY_REFCOUNTS,
+        .type = OPT_FLAG,
+        .help = "Postpone refcount updates",
+    },
     { NULL }
 };
 
@@ -1357,6 +1686,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_probe         = qcow2_probe,
     .bdrv_open          = qcow2_open,
     .bdrv_close         = qcow2_close,
+    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
     .bdrv_create        = qcow2_create,
     .bdrv_co_is_allocated = qcow2_co_is_allocated,
     .bdrv_set_key       = qcow2_set_key,
@@ -1365,8 +1695,8 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_co_readv          = qcow2_co_readv,
     .bdrv_co_writev         = qcow2_co_writev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
-    .bdrv_co_flush_to_disk  = qcow2_co_flush_to_disk,
 
+    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,
     .bdrv_write_compressed  = qcow2_write_compressed,