]> git.proxmox.com Git - qemu.git/blobdiff - block/qcow2-refcount.c
rng-egd: remove redundant free
[qemu.git] / block / qcow2-refcount.c
index 917fc88d9c888b5265cb8afe4b86fc2607bccadf..1ff43d0906ca78c0b4c0074619a745e20d733f1a 100644 (file)
  */
 
 #include "qemu-common.h"
-#include "block_int.h"
+#include "block/block_int.h"
 #include "block/qcow2.h"
+#include "qemu/range.h"
+#include "qapi/qmp/types.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int64_t offset, int64_t length,
-                            int addend);
+                            int addend, enum qcow2_discard_type type);
 
 
-static int cache_refcount_updates = 0;
-
-static int write_refcount_block(BDRVQcowState *s)
-{
-    size_t size = s->cluster_size;
-
-    if (s->refcount_block_cache_offset == 0) {
-        return 0;
-    }
-
-    if (bdrv_pwrite(s->hd, s->refcount_block_cache_offset,
-            s->refcount_block_cache, size) != size)
-    {
-        return -EIO;
-    }
-
-    return 0;
-}
-
 /*********************************************************/
 /* refcount handling */
 
@@ -59,11 +42,11 @@ int qcow2_refcount_init(BlockDriverState *bs)
     BDRVQcowState *s = bs->opaque;
     int ret, refcount_table_size2, i;
 
-    s->refcount_block_cache = qemu_malloc(s->cluster_size);
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
-    s->refcount_table = qemu_malloc(refcount_table_size2);
+    s->refcount_table = g_malloc(refcount_table_size2);
     if (s->refcount_table_size > 0) {
-        ret = bdrv_pread(s->hd, s->refcount_table_offset,
+        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
+        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                          s->refcount_table, refcount_table_size2);
         if (ret != refcount_table_size2)
             goto fail;
@@ -78,34 +61,37 @@ int qcow2_refcount_init(BlockDriverState *bs)
 void qcow2_refcount_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
-    qemu_free(s->refcount_block_cache);
-    qemu_free(s->refcount_table);
+    g_free(s->refcount_table);
 }
 
 
 static int load_refcount_block(BlockDriverState *bs,
-                               int64_t refcount_block_offset)
+                               int64_t refcount_block_offset,
+                               void **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
 
-    if (cache_refcount_updates) {
-        write_refcount_block(s);
-    }
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        refcount_block);
 
-    ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
-                     s->cluster_size);
-    if (ret != s->cluster_size)
-        return -EIO;
-    s->refcount_block_cache_offset = refcount_block_offset;
-    return 0;
+    return ret;
 }
 
+/*
+ * Returns the refcount of the cluster given by its index. Any non-negative
+ * return value is the refcount of the cluster, negative values are -errno
+ * and indicate an error.
+ */
 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
 {
     BDRVQcowState *s = bs->opaque;
     int refcount_table_index, block_index;
     int64_t refcount_block_offset;
+    int ret;
+    uint16_t *refcount_block;
+    uint16_t refcount;
 
     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
     if (refcount_table_index >= s->refcount_table_size)
@@ -113,14 +99,24 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
     refcount_block_offset = s->refcount_table[refcount_table_index];
     if (!refcount_block_offset)
         return 0;
-    if (refcount_block_offset != s->refcount_block_cache_offset) {
-        /* better than nothing: return allocated if read error */
-        if (load_refcount_block(bs, refcount_block_offset) < 0)
-            return 1;
+
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
+        (void**) &refcount_block);
+    if (ret < 0) {
+        return ret;
     }
+
     block_index = cluster_index &
         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-    return be16_to_cpu(s->refcount_block_cache[block_index]);
+    refcount = be16_to_cpu(refcount_block[block_index]);
+
+    ret = qcow2_cache_put(bs, s->refcount_block_cache,
+        (void**) &refcount_block);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return refcount;
 }
 
 /*
@@ -156,31 +152,29 @@ static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
  * Loads a refcount block. If it doesn't exist yet, it is allocated first
  * (including growing the refcount table if needed).
  *
- * Returns the offset of the refcount block on success or -errno in error case
+ * Returns 0 on success or -errno in error case
  */
-static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
+static int alloc_refcount_block(BlockDriverState *bs,
+    int64_t cluster_index, uint16_t **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     unsigned int refcount_table_index;
     int ret;
 
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
     /* Find the refcount block for the given cluster */
     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
 
     if (refcount_table_index < s->refcount_table_size) {
 
         uint64_t refcount_block_offset =
-            s->refcount_table[refcount_table_index];
+            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
 
         /* If it's already there, we're done */
         if (refcount_block_offset) {
-            if (refcount_block_offset != s->refcount_block_cache_offset) {
-                ret = load_refcount_block(bs, refcount_block_offset);
-                if (ret < 0) {
-                    return ret;
-                }
-            }
-            return refcount_block_offset;
+             return load_refcount_block(bs, refcount_block_offset,
+                 (void**) refcount_block);
         }
     }
 
@@ -206,17 +200,19 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
      *   refcount block into the cache
      */
 
-    if (cache_refcount_updates) {
-        ret = write_refcount_block(s);
-        if (ret < 0) {
-            return ret;
-        }
+    *refcount_block = NULL;
+
+    /* We write to the refcount table, so we might depend on L2 tables */
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        return ret;
     }
 
     /* Allocate the refcount block itself and mark it as used */
-    uint64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
-    memset(s->refcount_block_cache, 0, s->cluster_size);
-    s->refcount_block_cache_offset = new_block;
+    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
+    if (new_block < 0) {
+        return new_block;
+    }
 
 #ifdef DEBUG_ALLOC2
     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
@@ -225,22 +221,48 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
 #endif
 
     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
+        /* Zero the new refcount block before updating it */
+        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+            (void**) refcount_block);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        memset(*refcount_block, 0, s->cluster_size);
+
         /* The block describes itself, need to update the cache */
         int block_index = (new_block >> s->cluster_bits) &
             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-        s->refcount_block_cache[block_index] = cpu_to_be16(1);
+        (*refcount_block)[block_index] = cpu_to_be16(1);
     } else {
         /* Described somewhere else. This can recurse at most twice before we
          * arrive at a block that describes itself. */
-        ret = update_refcount(bs, new_block, s->cluster_size, 1);
+        ret = update_refcount(bs, new_block, s->cluster_size, 1,
+                              QCOW2_DISCARD_NEVER);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
         if (ret < 0) {
             goto fail_block;
         }
+
+        /* Initialize the new refcount block only after updating its refcount,
+         * update_refcount uses the refcount cache itself */
+        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
+            (void**) refcount_block);
+        if (ret < 0) {
+            goto fail_block;
+        }
+
+        memset(*refcount_block, 0, s->cluster_size);
     }
 
     /* Now the new refcount block needs to be written to disk */
-    ret = bdrv_pwrite(s->hd, new_block, s->refcount_block_cache,
-        s->cluster_size);
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
+    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
     if (ret < 0) {
         goto fail_block;
     }
@@ -248,7 +270,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     /* If the refcount table is big enough, just hook the block up there */
     if (refcount_table_index < s->refcount_table_size) {
         uint64_t data64 = cpu_to_be64(new_block);
-        ret = bdrv_pwrite(s->hd,
+        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
+        ret = bdrv_pwrite_sync(bs->file,
             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
             &data64, sizeof(data64));
         if (ret < 0) {
@@ -256,7 +279,12 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
         }
 
         s->refcount_table[refcount_table_index] = new_block;
-        return new_block;
+        return 0;
+    }
+
+    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+    if (ret < 0) {
+        goto fail_block;
     }
 
     /*
@@ -270,6 +298,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
      * refcount table at once without producing an inconsistent state in
      * between.
      */
+    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
+
     /* Calculate the number of refcount blocks needed so far */
     uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
     uint64_t blocks_used = (s->free_cluster_index +
@@ -280,7 +310,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     uint64_t last_table_size;
     uint64_t blocks_clusters;
     do {
-        uint64_t table_clusters = size_to_clusters(s, table_size);
+        uint64_t table_clusters =
+            size_to_clusters(s, table_size * sizeof(uint64_t));
         blocks_clusters = 1 +
             ((table_clusters + refcount_block_clusters - 1)
             / refcount_block_clusters);
@@ -302,8 +333,8 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
         s->cluster_size;
     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
-    uint16_t *new_blocks = qemu_mallocz(blocks_clusters * s->cluster_size);
-    uint64_t *new_table = qemu_mallocz(table_size * sizeof(uint64_t));
+    uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
+    uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
 
     assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
 
@@ -325,9 +356,10 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     }
 
     /* Write refcount blocks to disk */
-    ret = bdrv_pwrite(s->hd, meta_offset, new_blocks,
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
+    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
         blocks_clusters * s->cluster_size);
-    qemu_free(new_blocks);
+    g_free(new_blocks);
     if (ret < 0) {
         goto fail_table;
     }
@@ -337,21 +369,23 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
         cpu_to_be64s(&new_table[i]);
     }
 
-    ret = bdrv_pwrite(s->hd, table_offset, new_table,
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
+    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
         table_size * sizeof(uint64_t));
     if (ret < 0) {
         goto fail_table;
     }
 
     for(i = 0; i < table_size; i++) {
-        cpu_to_be64s(&new_table[i]);
+        be64_to_cpus(&new_table[i]);
     }
 
     /* Hook up the new refcount table in the qcow2 header */
     uint8_t data[12];
     cpu_to_be64w((uint64_t*)data, table_offset);
     cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
-    ret = bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
+    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
         data, sizeof(data));
     if (ret < 0) {
         goto fail_table;
@@ -361,68 +395,113 @@ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
     uint64_t old_table_offset = s->refcount_table_offset;
     uint64_t old_table_size = s->refcount_table_size;
 
-    qemu_free(s->refcount_table);
+    g_free(s->refcount_table);
     s->refcount_table = new_table;
     s->refcount_table_size = table_size;
     s->refcount_table_offset = table_offset;
 
     /* Free old table. Remember, we must not change free_cluster_index */
     uint64_t old_free_cluster_index = s->free_cluster_index;
-    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
+                        QCOW2_DISCARD_OTHER);
     s->free_cluster_index = old_free_cluster_index;
 
-    ret = load_refcount_block(bs, new_block);
+    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
     if (ret < 0) {
-        goto fail_block;
+        return ret;
     }
 
-    return new_block;
+    return 0;
 
 fail_table:
-    qemu_free(new_table);
+    g_free(new_table);
 fail_block:
-    s->refcount_block_cache_offset = 0;
+    if (*refcount_block != NULL) {
+        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+    }
     return ret;
 }
 
-#define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
-static int write_refcount_block_entries(BDRVQcowState *s,
-    int64_t refcount_block_offset, int first_index, int last_index)
+void qcow2_process_discards(BlockDriverState *bs, int ret)
 {
-    size_t size;
+    BDRVQcowState *s = bs->opaque;
+    Qcow2DiscardRegion *d, *next;
 
-    if (cache_refcount_updates) {
-        return 0;
-    }
+    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
+        QTAILQ_REMOVE(&s->discards, d, next);
+
+        /* Discard is optional, ignore the return value */
+        if (ret >= 0) {
+            bdrv_discard(bs->file,
+                         d->offset >> BDRV_SECTOR_BITS,
+                         d->bytes >> BDRV_SECTOR_BITS);
+        }
 
-    first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
-    last_index = (last_index + REFCOUNTS_PER_SECTOR)
-        & ~(REFCOUNTS_PER_SECTOR - 1);
+        g_free(d);
+    }
+}
 
-    size = (last_index - first_index) << REFCOUNT_SHIFT;
-    if (bdrv_pwrite(s->hd,
-        refcount_block_offset + (first_index << REFCOUNT_SHIFT),
-        &s->refcount_block_cache[first_index], size) != size)
-    {
-        return -EIO;
+static void update_refcount_discard(BlockDriverState *bs,
+                                    uint64_t offset, uint64_t length)
+{
+    BDRVQcowState *s = bs->opaque;
+    Qcow2DiscardRegion *d, *p, *next;
+
+    QTAILQ_FOREACH(d, &s->discards, next) {
+        uint64_t new_start = MIN(offset, d->offset);
+        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
+
+        if (new_end - new_start <= length + d->bytes) {
+            /* There can't be any overlap, areas ending up here have no
+             * references any more and therefore shouldn't get freed another
+             * time. */
+            assert(d->bytes + length == new_end - new_start);
+            d->offset = new_start;
+            d->bytes = new_end - new_start;
+            goto found;
+        }
     }
 
-    return 0;
+    d = g_malloc(sizeof(*d));
+    *d = (Qcow2DiscardRegion) {
+        .bs     = bs,
+        .offset = offset,
+        .bytes  = length,
+    };
+    QTAILQ_INSERT_TAIL(&s->discards, d, next);
+
+found:
+    /* Merge discard requests if they are adjacent now */
+    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
+        if (p == d
+            || p->offset > d->offset + d->bytes
+            || d->offset > p->offset + p->bytes)
+        {
+            continue;
+        }
+
+        /* Still no overlap possible */
+        assert(p->offset == d->offset + d->bytes
+            || d->offset == p->offset + p->bytes);
+
+        QTAILQ_REMOVE(&s->discards, p, next);
+        d->offset = MIN(d->offset, p->offset);
+        d->bytes += p->bytes;
+    }
 }
 
 /* XXX: cache several refcount block clusters ? */
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-    int64_t offset, int64_t length, int addend)
+    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
-    int64_t refcount_block_offset = 0;
-    int64_t table_index = -1, old_table_index;
-    int first_index = -1, last_index = -1;
+    uint16_t *refcount_block = NULL;
+    int64_t old_table_index = -1;
     int ret;
 
 #ifdef DEBUG_ALLOC2
-    printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
+    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
            offset, length, addend);
 #endif
     if (length < 0) {
@@ -431,6 +510,11 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         return 0;
     }
 
+    if (addend < 0) {
+        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
+            s->l2_table_cache);
+    }
+
     start = offset & ~(s->cluster_size - 1);
     last = (offset + length - 1) & ~(s->cluster_size - 1);
     for(cluster_offset = start; cluster_offset <= last;
@@ -438,42 +522,33 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     {
         int block_index, refcount;
         int64_t cluster_index = cluster_offset >> s->cluster_bits;
-        int64_t new_block;
-
-        /* Only write refcount block to disk when we are done with it */
-        old_table_index = table_index;
-        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
-        if ((old_table_index >= 0) && (table_index != old_table_index)) {
+        int64_t table_index =
+            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
 
-            if (write_refcount_block_entries(s, refcount_block_offset,
-                first_index, last_index) < 0)
-            {
-                return -EIO;
+        /* Load the refcount block and allocate it if needed */
+        if (table_index != old_table_index) {
+            if (refcount_block) {
+                ret = qcow2_cache_put(bs, s->refcount_block_cache,
+                    (void**) &refcount_block);
+                if (ret < 0) {
+                    goto fail;
+                }
             }
 
-            first_index = -1;
-            last_index = -1;
+            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
+            if (ret < 0) {
+                goto fail;
+            }
         }
+        old_table_index = table_index;
 
-        /* Load the refcount block and allocate it if needed */
-        new_block = alloc_refcount_block(bs, cluster_index);
-        if (new_block < 0) {
-            ret = new_block;
-            goto fail;
-        }
-        refcount_block_offset = new_block;
+        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
 
         /* we can update the count and save it */
         block_index = cluster_index &
             ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
-        if (first_index == -1 || block_index < first_index) {
-            first_index = block_index;
-        }
-        if (block_index > last_index) {
-            last_index = block_index;
-        }
 
-        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
+        refcount = be16_to_cpu(refcount_block[block_index]);
         refcount += addend;
         if (refcount < 0 || refcount > 0xffff) {
             ret = -EINVAL;
@@ -482,18 +557,26 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         if (refcount == 0 && cluster_index < s->free_cluster_index) {
             s->free_cluster_index = cluster_index;
         }
-        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
+        refcount_block[block_index] = cpu_to_be16(refcount);
+
+        if (refcount == 0 && s->discard_passthrough[type]) {
+            update_refcount_discard(bs, cluster_offset, s->cluster_size);
+        }
     }
 
     ret = 0;
 fail:
+    if (!s->cache_discards) {
+        qcow2_process_discards(bs, ret);
+    }
 
     /* Write last changed block to disk */
-    if (refcount_block_offset != 0) {
-        if (write_refcount_block_entries(s, refcount_block_offset,
-            first_index, last_index) < 0)
-        {
-            return ret < 0 ? ret : -EIO;
+    if (refcount_block) {
+        int wret;
+        wret = qcow2_cache_put(bs, s->refcount_block_cache,
+            (void**) &refcount_block);
+        if (wret < 0) {
+            return ret < 0 ? ret : wret;
         }
     }
 
@@ -503,21 +586,31 @@ fail:
      */
     if (ret < 0) {
         int dummy;
-        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
+        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
+                                QCOW2_DISCARD_NEVER);
+        (void)dummy;
     }
 
     return ret;
 }
 
-/* addend must be 1 or -1 */
-static int update_cluster_refcount(BlockDriverState *bs,
-                                   int64_t cluster_index,
-                                   int addend)
+/*
+ * Increases or decreases the refcount of a given cluster by one.
+ * addend must be 1 or -1.
+ *
+ * If the return value is non-negative, it is the new refcount of the cluster.
+ * If it is negative, it is -errno and indicates an error.
+ */
+int qcow2_update_cluster_refcount(BlockDriverState *bs,
+                                  int64_t cluster_index,
+                                  int addend,
+                                  enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
 
-    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
+    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
+                          type);
     if (ret < 0) {
         return ret;
     }
@@ -536,17 +629,22 @@ static int update_cluster_refcount(BlockDriverState *bs,
 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
 {
     BDRVQcowState *s = bs->opaque;
-    int i, nb_clusters;
+    int i, nb_clusters, refcount;
 
     nb_clusters = size_to_clusters(s, size);
 retry:
     for(i = 0; i < nb_clusters; i++) {
-        int64_t i = s->free_cluster_index++;
-        if (get_refcount(bs, i) != 0)
+        int64_t next_cluster_index = s->free_cluster_index++;
+        refcount = get_refcount(bs, next_cluster_index);
+
+        if (refcount < 0) {
+            return refcount;
+        } else if (refcount != 0) {
             goto retry;
+        }
     }
 #ifdef DEBUG_ALLOC2
-    printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
+    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
             size,
             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
 #endif
@@ -558,14 +656,55 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
     int64_t offset;
     int ret;
 
+    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
     offset = alloc_clusters_noref(bs, size);
-    ret = update_refcount(bs, offset, size, 1);
+    if (offset < 0) {
+        return offset;
+    }
+
+    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
     if (ret < 0) {
         return ret;
     }
+
     return offset;
 }
 
+int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+    int nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t cluster_index;
+    uint64_t old_free_cluster_index;
+    int i, refcount, ret;
+
+    /* Check how many clusters there are free */
+    cluster_index = offset >> s->cluster_bits;
+    for(i = 0; i < nb_clusters; i++) {
+        refcount = get_refcount(bs, cluster_index++);
+
+        if (refcount < 0) {
+            return refcount;
+        } else if (refcount != 0) {
+            break;
+        }
+    }
+
+    /* And then allocate them */
+    old_free_cluster_index = s->free_cluster_index;
+    s->free_cluster_index = cluster_index + i;
+
+    ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+                          QCOW2_DISCARD_NEVER);
+    if (ret < 0) {
+        return ret;
+    }
+
+    s->free_cluster_index = old_free_cluster_index;
+
+    return i;
+}
+
 /* only used to allocate compressed sectors. We try to allocate
    contiguous sectors. size must be <= cluster_size */
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
@@ -574,12 +713,14 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
     int64_t offset, cluster_offset;
     int free_in_cluster;
 
+    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
     assert(size > 0 && size <= s->cluster_size);
     if (s->free_byte_offset == 0) {
-        s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
-        if (s->free_byte_offset < 0) {
-            return s->free_byte_offset;
+        offset = qcow2_alloc_clusters(bs, s->cluster_size);
+        if (offset < 0) {
+            return offset;
         }
+        s->free_byte_offset = offset;
     }
  redo:
     free_in_cluster = s->cluster_size -
@@ -592,7 +733,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         if (free_in_cluster == 0)
             s->free_byte_offset = 0;
         if ((offset & (s->cluster_size - 1)) != 0)
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                          QCOW2_DISCARD_NEVER);
     } else {
         offset = qcow2_alloc_clusters(bs, s->cluster_size);
         if (offset < 0) {
@@ -602,55 +744,69 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         if ((cluster_offset + s->cluster_size) == offset) {
             /* we are lucky: contiguous data */
             offset = s->free_byte_offset;
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                          QCOW2_DISCARD_NEVER);
             s->free_byte_offset += size;
         } else {
             s->free_byte_offset = offset;
             goto redo;
         }
     }
+
+    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
+     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
+     * be flushed before the caller's L2 table updates.
+     */
+    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
     return offset;
 }
 
 void qcow2_free_clusters(BlockDriverState *bs,
-                          int64_t offset, int64_t size)
+                          int64_t offset, int64_t size,
+                          enum qcow2_discard_type type)
 {
     int ret;
 
-    ret = update_refcount(bs, offset, size, -1);
+    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
+    ret = update_refcount(bs, offset, size, -1, type);
     if (ret < 0) {
         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
-        abort();
+        /* TODO Remember the clusters to free them later and avoid leaking */
     }
 }
 
 /*
- * free_any_clusters
- *
- * free clusters according to its type: compressed or not
- *
+ * Free a cluster using its L2 entry (handles clusters of all types, e.g.
+ * normal cluster, compressed cluster, etc.)
  */
-
-void qcow2_free_any_clusters(BlockDriverState *bs,
-    uint64_t cluster_offset, int nb_clusters)
+void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
+                             int nb_clusters, enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
 
-    /* free the cluster */
-
-    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
-        int nb_csectors;
-        nb_csectors = ((cluster_offset >> s->csize_shift) &
-                       s->csize_mask) + 1;
-        qcow2_free_clusters(bs,
-            (cluster_offset & s->cluster_offset_mask) & ~511,
-            nb_csectors * 512);
-        return;
+    switch (qcow2_get_cluster_type(l2_entry)) {
+    case QCOW2_CLUSTER_COMPRESSED:
+        {
+            int nb_csectors;
+            nb_csectors = ((l2_entry >> s->csize_shift) &
+                           s->csize_mask) + 1;
+            qcow2_free_clusters(bs,
+                (l2_entry & s->cluster_offset_mask) & ~511,
+                nb_csectors * 512, type);
+        }
+        break;
+    case QCOW2_CLUSTER_NORMAL:
+    case QCOW2_CLUSTER_ZERO:
+        if (l2_entry & L2E_OFFSET_MASK) {
+            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+                                nb_clusters << s->cluster_bits, type);
+        }
+        break;
+    case QCOW2_CLUSTER_UNALLOCATED:
+        break;
+    default:
+        abort();
     }
-
-    qcow2_free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
-
-    return;
 }
 
 
@@ -660,24 +816,6 @@ void qcow2_free_any_clusters(BlockDriverState *bs,
 
 
 
-void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
-    int64_t size)
-{
-    int refcount;
-    int64_t start, last, cluster_offset;
-    uint16_t *p;
-
-    start = offset & ~(s->cluster_size - 1);
-    last = (offset + size - 1)  & ~(s->cluster_size - 1);
-    for(cluster_offset = start; cluster_offset <= last;
-        cluster_offset += s->cluster_size) {
-        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
-        refcount = be16_to_cpu(*p);
-        refcount++;
-        *p = cpu_to_be16(refcount);
-    }
-}
-
 /* update the refcounts of snapshots and the copied flag */
 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend)
@@ -685,24 +823,27 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     BDRVQcowState *s = bs->opaque;
     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
     int64_t old_offset, old_l2_offset;
-    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
-
-    qcow2_l2_cache_reset(bs);
-    cache_refcount_updates = 1;
+    int i, j, l1_modified = 0, nb_csectors, refcount;
+    int ret;
 
     l2_table = NULL;
     l1_table = NULL;
     l1_size2 = l1_size * sizeof(uint64_t);
+
+    s->cache_discards = true;
+
+    /* WARNING: qcow2_snapshot_goto relies on this function not using the
+     * l1_table_offset when it is the current s->l1_table_offset! Be careful
+     * when changing this! */
     if (l1_table_offset != s->l1_table_offset) {
-        if (l1_size2 != 0) {
-            l1_table = qemu_mallocz(align_offset(l1_size2, 512));
-        } else {
-            l1_table = NULL;
-        }
+        l1_table = g_malloc0(align_offset(l1_size2, 512));
         l1_allocated = 1;
-        if (bdrv_pread(s->hd, l1_table_offset,
-                       l1_table, l1_size2) != l1_size2)
+
+        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+        if (ret < 0) {
             goto fail;
+        }
+
         for(i = 0;i < l1_size; i++)
             be64_to_cpus(&l1_table[i]);
     } else {
@@ -711,65 +852,101 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
         l1_allocated = 0;
     }
 
-    l2_size = s->l2_size * sizeof(uint64_t);
-    l2_table = qemu_malloc(l2_size);
-    l1_modified = 0;
     for(i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
         if (l2_offset) {
             old_l2_offset = l2_offset;
-            l2_offset &= ~QCOW_OFLAG_COPIED;
-            l2_modified = 0;
-            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+            l2_offset &= L1E_OFFSET_MASK;
+
+            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                (void**) &l2_table);
+            if (ret < 0) {
                 goto fail;
+            }
+
             for(j = 0; j < s->l2_size; j++) {
+                uint64_t cluster_index;
+
                 offset = be64_to_cpu(l2_table[j]);
-                if (offset != 0) {
-                    old_offset = offset;
-                    offset &= ~QCOW_OFLAG_COPIED;
-                    if (offset & QCOW_OFLAG_COMPRESSED) {
+                old_offset = offset;
+                offset &= ~QCOW_OFLAG_COPIED;
+
+                switch (qcow2_get_cluster_type(offset)) {
+                    case QCOW2_CLUSTER_COMPRESSED:
                         nb_csectors = ((offset >> s->csize_shift) &
                                        s->csize_mask) + 1;
                         if (addend != 0) {
-                            int ret;
                             ret = update_refcount(bs,
                                 (offset & s->cluster_offset_mask) & ~511,
-                                nb_csectors * 512, addend);
+                                nb_csectors * 512, addend,
+                                QCOW2_DISCARD_SNAPSHOT);
                             if (ret < 0) {
                                 goto fail;
                             }
                         }
                         /* compressed clusters are never modified */
                         refcount = 2;
-                    } else {
+                        break;
+
+                    case QCOW2_CLUSTER_NORMAL:
+                    case QCOW2_CLUSTER_ZERO:
+                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+                        if (!cluster_index) {
+                            /* unallocated */
+                            refcount = 0;
+                            break;
+                        }
                         if (addend != 0) {
-                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
+                            refcount = qcow2_update_cluster_refcount(bs,
+                                    cluster_index, addend,
+                                    QCOW2_DISCARD_SNAPSHOT);
                         } else {
-                            refcount = get_refcount(bs, offset >> s->cluster_bits);
+                            refcount = get_refcount(bs, cluster_index);
                         }
-                    }
 
-                    if (refcount == 1) {
-                        offset |= QCOW_OFLAG_COPIED;
-                    }
-                    if (offset != old_offset) {
-                        l2_table[j] = cpu_to_be64(offset);
-                        l2_modified = 1;
+                        if (refcount < 0) {
+                            ret = refcount;
+                            goto fail;
+                        }
+                        break;
+
+                    case QCOW2_CLUSTER_UNALLOCATED:
+                        refcount = 0;
+                        break;
+
+                    default:
+                        abort();
+                }
+
+                if (refcount == 1) {
+                    offset |= QCOW_OFLAG_COPIED;
+                }
+                if (offset != old_offset) {
+                    if (addend > 0) {
+                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
+                            s->refcount_block_cache);
                     }
+                    l2_table[j] = cpu_to_be64(offset);
+                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                 }
             }
-            if (l2_modified) {
-                if (bdrv_pwrite(s->hd,
-                                l2_offset, l2_table, l2_size) != l2_size)
-                    goto fail;
+
+            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+            if (ret < 0) {
+                goto fail;
             }
 
+
             if (addend != 0) {
-                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
+                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
             } else {
                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
             }
-            if (refcount == 1) {
+            if (refcount < 0) {
+                ret = refcount;
+                goto fail;
+            } else if (refcount == 1) {
                 l2_offset |= QCOW_OFLAG_COPIED;
             }
             if (l2_offset != old_l2_offset) {
@@ -778,28 +955,31 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
             }
         }
     }
-    if (l1_modified) {
-        for(i = 0; i < l1_size; i++)
+
+    ret = bdrv_flush(bs);
+fail:
+    if (l2_table) {
+        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
+    }
+
+    s->cache_discards = false;
+    qcow2_process_discards(bs, ret);
+
+    /* Update L1 only if it isn't deleted anyway (addend = -1) */
+    if (ret == 0 && addend >= 0 && l1_modified) {
+        for (i = 0; i < l1_size; i++) {
             cpu_to_be64s(&l1_table[i]);
-        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
-                        l1_size2) != l1_size2)
-            goto fail;
-        for(i = 0; i < l1_size; i++)
+        }
+
+        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);
+
+        for (i = 0; i < l1_size; i++) {
             be64_to_cpus(&l1_table[i]);
+        }
     }
     if (l1_allocated)
-        qemu_free(l1_table);
-    qemu_free(l2_table);
-    cache_refcount_updates = 0;
-    write_refcount_block(s);
-    return 0;
- fail:
-    if (l1_allocated)
-        qemu_free(l1_table);
-    qemu_free(l2_table);
-    cache_refcount_updates = 0;
-    write_refcount_block(s);
-    return -EIO;
+        g_free(l1_table);
+    return ret;
 }
 
 
@@ -815,9 +995,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
  * This is used to construct a temporary refcount table out of L1 and L2 tables
  * which can be compared the the refcount table saved in the image.
  *
- * Returns the number of errors in the image that were found
+ * Modifies the number of errors in res.
  */
-static int inc_refcounts(BlockDriverState *bs,
+static void inc_refcounts(BlockDriverState *bs,
+                          BdrvCheckResult *res,
                           uint16_t *refcount_table,
                           int refcount_table_size,
                           int64_t offset, int64_t size)
@@ -825,32 +1006,39 @@ static int inc_refcounts(BlockDriverState *bs,
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
     int k;
-    int errors = 0;
 
     if (size <= 0)
-        return 0;
+        return;
 
     start = offset & ~(s->cluster_size - 1);
     last = (offset + size - 1) & ~(s->cluster_size - 1);
     for(cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size) {
         k = cluster_offset >> s->cluster_bits;
-        if (k < 0 || k >= refcount_table_size) {
+        if (k < 0) {
             fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
                 cluster_offset);
-            errors++;
+            res->corruptions++;
+        } else if (k >= refcount_table_size) {
+            fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
+                "the end of the image file, can't properly check refcounts.\n",
+                cluster_offset);
+            res->check_errors++;
         } else {
             if (++refcount_table[k] == 0) {
                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                     "\n", cluster_offset);
-                errors++;
+                res->corruptions++;
             }
         }
     }
-
-    return errors;
 }
 
+/* Flags for check_refcounts_l1() and check_refcounts_l2() */
+enum {
+    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
+};
+
 /*
  * Increases the refcount in the given refcount table for the all clusters
  * referenced in the L2 table. While doing so, performs some checks on L2
@@ -859,78 +1047,103 @@ static int inc_refcounts(BlockDriverState *bs,
  * Returns the number of errors found by the checks or -errno if an internal
  * error occurred.
  */
-static int check_refcounts_l2(BlockDriverState *bs,
+static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
-    int check_copied)
+    int flags)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t *l2_table, offset;
-    int i, l2_size, nb_csectors, refcount;
-    int errors = 0;
+    uint64_t *l2_table, l2_entry;
+    uint64_t next_contiguous_offset = 0;
+    int i, l2_size, nb_csectors;
 
     /* Read L2 table from disk */
     l2_size = s->l2_size * sizeof(uint64_t);
-    l2_table = qemu_malloc(l2_size);
+    l2_table = g_malloc(l2_size);
 
-    if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+    if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
         goto fail;
 
     /* Do the actual checks */
     for(i = 0; i < s->l2_size; i++) {
-        offset = be64_to_cpu(l2_table[i]);
-        if (offset != 0) {
-            if (offset & QCOW_OFLAG_COMPRESSED) {
-                /* Compressed clusters don't have QCOW_OFLAG_COPIED */
-                if (offset & QCOW_OFLAG_COPIED) {
-                    fprintf(stderr, "ERROR: cluster %" PRId64 ": "
-                        "copied flag must never be set for compressed "
-                        "clusters\n", offset >> s->cluster_bits);
-                    offset &= ~QCOW_OFLAG_COPIED;
-                    errors++;
-                }
+        l2_entry = be64_to_cpu(l2_table[i]);
+
+        switch (qcow2_get_cluster_type(l2_entry)) {
+        case QCOW2_CLUSTER_COMPRESSED:
+            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+            if (l2_entry & QCOW_OFLAG_COPIED) {
+                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+                    "copied flag must never be set for compressed "
+                    "clusters\n", l2_entry >> s->cluster_bits);
+                l2_entry &= ~QCOW_OFLAG_COPIED;
+                res->corruptions++;
+            }
 
-                /* Mark cluster as used */
-                nb_csectors = ((offset >> s->csize_shift) &
-                               s->csize_mask) + 1;
-                offset &= s->cluster_offset_mask;
-                errors += inc_refcounts(bs, refcount_table,
-                              refcount_table_size,
-                              offset & ~511, nb_csectors * 512);
-            } else {
-                /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
-                if (check_copied) {
-                    uint64_t entry = offset;
-                    offset &= ~QCOW_OFLAG_COPIED;
-                    refcount = get_refcount(bs, offset >> s->cluster_bits);
-                    if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
-                        fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
-                            PRIx64 " refcount=%d\n", entry, refcount);
-                        errors++;
-                    }
-                }
+            /* Mark cluster as used */
+            nb_csectors = ((l2_entry >> s->csize_shift) &
+                           s->csize_mask) + 1;
+            l2_entry &= s->cluster_offset_mask;
+            inc_refcounts(bs, res, refcount_table, refcount_table_size,
+                l2_entry & ~511, nb_csectors * 512);
+
+            if (flags & CHECK_FRAG_INFO) {
+                res->bfi.allocated_clusters++;
+                res->bfi.compressed_clusters++;
+
+                /* Compressed clusters are fragmented by nature.  Since they
+                 * take up sub-sector space but we only have sector granularity
+                 * I/O we need to re-read the same sectors even for adjacent
+                 * compressed clusters.
+                 */
+                res->bfi.fragmented_clusters++;
+            }
+            break;
 
-                /* Mark cluster as used */
-                offset &= ~QCOW_OFLAG_COPIED;
-                errors += inc_refcounts(bs, refcount_table,
-                              refcount_table_size,
-                              offset, s->cluster_size);
-
-                /* Correct offsets are cluster aligned */
-                if (offset & (s->cluster_size - 1)) {
-                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
-                        "properly aligned; L2 entry corrupted.\n", offset);
-                    errors++;
+        case QCOW2_CLUSTER_ZERO:
+            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
+                break;
+            }
+            /* fall through */
+
+        case QCOW2_CLUSTER_NORMAL:
+        {
+            uint64_t offset = l2_entry & L2E_OFFSET_MASK;
+
+            if (flags & CHECK_FRAG_INFO) {
+                res->bfi.allocated_clusters++;
+                if (next_contiguous_offset &&
+                    offset != next_contiguous_offset) {
+                    res->bfi.fragmented_clusters++;
                 }
+                next_contiguous_offset = offset + s->cluster_size;
+            }
+
+            /* Mark cluster as used */
+            inc_refcounts(bs, res, refcount_table,refcount_table_size,
+                offset, s->cluster_size);
+
+            /* Correct offsets are cluster aligned */
+            if (offset & (s->cluster_size - 1)) {
+                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+                    "properly aligned; L2 entry corrupted.\n", offset);
+                res->corruptions++;
             }
+            break;
+        }
+
+        case QCOW2_CLUSTER_UNALLOCATED:
+            break;
+
+        default:
+            abort();
         }
     }
 
-    qemu_free(l2_table);
-    return errors;
+    g_free(l2_table);
+    return 0;
 
 fail:
-    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
-    qemu_free(l2_table);
+    fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
+    g_free(l2_table);
     return -EIO;
 }
 
@@ -943,28 +1156,28 @@ fail:
  * error occurred.
  */
 static int check_refcounts_l1(BlockDriverState *bs,
+                              BdrvCheckResult *res,
                               uint16_t *refcount_table,
                               int refcount_table_size,
                               int64_t l1_table_offset, int l1_size,
-                              int check_copied)
+                              int flags)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t *l1_table, l2_offset, l1_size2;
-    int i, refcount, ret;
-    int errors = 0;
+    int i, ret;
 
     l1_size2 = l1_size * sizeof(uint64_t);
 
     /* Mark L1 table as used */
-    errors += inc_refcounts(bs, refcount_table, refcount_table_size,
-                  l1_table_offset, l1_size2);
+    inc_refcounts(bs, res, refcount_table, refcount_table_size,
+        l1_table_offset, l1_size2);
 
     /* Read L1 table entries from disk */
     if (l1_size2 == 0) {
         l1_table = NULL;
     } else {
-        l1_table = qemu_malloc(l1_size2);
-        if (bdrv_pread(s->hd, l1_table_offset,
+        l1_table = g_malloc(l1_size2);
+        if (bdrv_pread(bs->file, l1_table_offset,
                        l1_table, l1_size2) != l1_size2)
             goto fail;
         for(i = 0;i < l1_size; i++)
@@ -975,127 +1188,613 @@ static int check_refcounts_l1(BlockDriverState *bs,
     for(i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
         if (l2_offset) {
-            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
-            if (check_copied) {
-                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
-                    >> s->cluster_bits);
-                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
-                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
-                        " refcount=%d\n", l2_offset, refcount);
-                    errors++;
-                }
-            }
-
             /* Mark L2 table as used */
-            l2_offset &= ~QCOW_OFLAG_COPIED;
-            errors += inc_refcounts(bs, refcount_table,
-                          refcount_table_size,
-                          l2_offset,
-                          s->cluster_size);
+            l2_offset &= L1E_OFFSET_MASK;
+            inc_refcounts(bs, res, refcount_table, refcount_table_size,
+                l2_offset, s->cluster_size);
 
             /* L2 tables are cluster aligned */
             if (l2_offset & (s->cluster_size - 1)) {
                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                     "cluster aligned; L1 entry corrupted\n", l2_offset);
-                errors++;
+                res->corruptions++;
             }
 
             /* Process and check L2 entries */
-            ret = check_refcounts_l2(bs, refcount_table, refcount_table_size,
-                l2_offset, check_copied);
+            ret = check_refcounts_l2(bs, res, refcount_table,
+                                     refcount_table_size, l2_offset, flags);
             if (ret < 0) {
                 goto fail;
             }
-            errors += ret;
         }
     }
-    qemu_free(l1_table);
-    return errors;
+    g_free(l1_table);
+    return 0;
 
 fail:
     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
-    qemu_free(l1_table);
+    res->check_errors++;
+    g_free(l1_table);
     return -EIO;
 }
 
+/*
+ * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
+ *
+ * This function does not print an error message nor does it increment
+ * check_errors if get_refcount fails (this is because such an error will have
+ * been already detected and sufficiently signaled by the calling function
+ * (qcow2_check_refcounts) by the time this function is called).
+ */
+static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
+                              BdrvCheckMode fix)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
+    int ret;
+    int refcount;
+    int i, j;
+
+    for (i = 0; i < s->l1_size; i++) {
+        uint64_t l1_entry = s->l1_table[i];
+        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
+        bool l2_dirty = false;
+
+        if (!l2_offset) {
+            continue;
+        }
+
+        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+        if (refcount < 0) {
+            /* don't print message nor increment check_errors */
+            continue;
+        }
+        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
+            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
+                    "l1_entry=%" PRIx64 " refcount=%d\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                            "ERROR",
+                    i, l1_entry, refcount);
+            if (fix & BDRV_FIX_ERRORS) {
+                s->l1_table[i] = refcount == 1
+                               ? l1_entry |  QCOW_OFLAG_COPIED
+                               : l1_entry & ~QCOW_OFLAG_COPIED;
+                ret = qcow2_write_l1_entry(bs, i);
+                if (ret < 0) {
+                    res->check_errors++;
+                    goto fail;
+                }
+                res->corruptions_fixed++;
+            } else {
+                res->corruptions++;
+            }
+        }
+
+        ret = bdrv_pread(bs->file, l2_offset, l2_table,
+                         s->l2_size * sizeof(uint64_t));
+        if (ret < 0) {
+            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
+                    strerror(-ret));
+            res->check_errors++;
+            goto fail;
+        }
+
+        for (j = 0; j < s->l2_size; j++) {
+            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
+            int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
+                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
+                refcount = get_refcount(bs, data_offset >> s->cluster_bits);
+                if (refcount < 0) {
+                    /* don't print message nor increment check_errors */
+                    continue;
+                }
+                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
+                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
+                            "l2_entry=%" PRIx64 " refcount=%d\n",
+                            fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                                    "ERROR",
+                            l2_entry, refcount);
+                    if (fix & BDRV_FIX_ERRORS) {
+                        l2_table[j] = cpu_to_be64(refcount == 1
+                                    ? l2_entry |  QCOW_OFLAG_COPIED
+                                    : l2_entry & ~QCOW_OFLAG_COPIED);
+                        l2_dirty = true;
+                        res->corruptions_fixed++;
+                    } else {
+                        res->corruptions++;
+                    }
+                }
+            }
+        }
+
+        if (l2_dirty) {
+            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
+                                                l2_offset, s->cluster_size);
+            if (ret < 0) {
+                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
+                        "overlap check failed: %s\n", strerror(-ret));
+                res->check_errors++;
+                goto fail;
+            }
+
+            ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
+            if (ret < 0) {
+                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
+                        strerror(-ret));
+                res->check_errors++;
+                goto fail;
+            }
+        }
+    }
+
+    ret = 0;
+
+fail:
+    qemu_vfree(l2_table);
+    return ret;
+}
+
+/*
+ * Writes one sector of the refcount table to the disk
+ */
+#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
+static int write_reftable_entry(BlockDriverState *bs, int rt_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t buf[RT_ENTRIES_PER_SECTOR];
+    int rt_start_index;
+    int i, ret;
+
+    rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
+    for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
+        buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
+    }
+
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
+            s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
+            sizeof(buf));
+    if (ret < 0) {
+        return ret;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
+    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
+            rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Allocates a new cluster for the given refcount block (represented by its
+ * offset in the image file) and copies the current content there. This function
+ * does _not_ decrement the reference count for the currently occupied cluster.
+ *
+ * This function prints an informative message to stderr on error (and returns
+ * -errno); on success, 0 is returned.
+ */
+static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
+                                      uint64_t offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t new_offset = 0;
+    void *refcount_block = NULL;
+    int ret;
+
+    /* allocate new refcount block */
+    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
+    if (new_offset < 0) {
+        fprintf(stderr, "Could not allocate new cluster: %s\n",
+                strerror(-new_offset));
+        ret = new_offset;
+        goto fail;
+    }
+
+    /* fetch current refcount block content */
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
+    if (ret < 0) {
+        fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
+        goto fail;
+    }
+
+    /* new block has not yet been entered into refcount table, therefore it is
+     * no refcount block yet (regarding this check) */
+    ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
+    if (ret < 0) {
+        fprintf(stderr, "Could not write refcount block; metadata overlap "
+                "check failed: %s\n", strerror(-ret));
+        /* the image will be marked corrupt, so don't even attempt on freeing
+         * the cluster */
+        new_offset = 0;
+        goto fail;
+    }
+
+    /* write to new block */
+    ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
+            s->cluster_sectors);
+    if (ret < 0) {
+        fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
+        goto fail;
+    }
+
+    /* update refcount table */
+    assert(!(new_offset & (s->cluster_size - 1)));
+    s->refcount_table[reftable_index] = new_offset;
+    ret = write_reftable_entry(bs, reftable_index);
+    if (ret < 0) {
+        fprintf(stderr, "Could not update refcount table: %s\n",
+                strerror(-ret));
+        goto fail;
+    }
+
+fail:
+    if (new_offset && (ret < 0)) {
+        qcow2_free_clusters(bs, new_offset, s->cluster_size,
+                QCOW2_DISCARD_ALWAYS);
+    }
+    if (refcount_block) {
+        if (ret < 0) {
+            qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+        } else {
+            ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+        }
+    }
+    if (ret < 0) {
+        return ret;
+    }
+    return new_offset;
+}
+
 /*
  * Checks an image for refcount consistency.
  *
  * Returns 0 if no errors are found, the number of errors in case the image is
- * detected as corrupted, and -errno when an internal error occured.
+ * detected as corrupted, and -errno when an internal error occurred.
  */
-int qcow2_check_refcounts(BlockDriverState *bs)
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                          BdrvCheckMode fix)
 {
     BDRVQcowState *s = bs->opaque;
-    int64_t size;
-    int nb_clusters, refcount1, refcount2, i;
+    int64_t size, i, highest_cluster;
+    int nb_clusters, refcount1, refcount2;
     QCowSnapshot *sn;
     uint16_t *refcount_table;
-    int ret, errors = 0;
+    int ret;
 
-    size = bdrv_getlength(s->hd);
+    size = bdrv_getlength(bs->file);
     nb_clusters = size_to_clusters(s, size);
-    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
+    refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
+
+    res->bfi.total_clusters =
+        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
 
     /* header */
-    errors += inc_refcounts(bs, refcount_table, nb_clusters,
-                  0, s->cluster_size);
+    inc_refcounts(bs, res, refcount_table, nb_clusters,
+        0, s->cluster_size);
 
     /* current L1 table */
-    ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
-                       s->l1_table_offset, s->l1_size, 1);
+    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
     if (ret < 0) {
-        return ret;
+        goto fail;
     }
-    errors += ret;
 
     /* snapshots */
     for(i = 0; i < s->nb_snapshots; i++) {
         sn = s->snapshots + i;
-        check_refcounts_l1(bs, refcount_table, nb_clusters,
-                           sn->l1_table_offset, sn->l1_size, 0);
+        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
+            sn->l1_table_offset, sn->l1_size, 0);
+        if (ret < 0) {
+            goto fail;
+        }
     }
-    errors += inc_refcounts(bs, refcount_table, nb_clusters,
-                  s->snapshots_offset, s->snapshots_size);
+    inc_refcounts(bs, res, refcount_table, nb_clusters,
+        s->snapshots_offset, s->snapshots_size);
 
     /* refcount data */
-    errors += inc_refcounts(bs, refcount_table, nb_clusters,
-                  s->refcount_table_offset,
-                  s->refcount_table_size * sizeof(uint64_t));
+    inc_refcounts(bs, res, refcount_table, nb_clusters,
+        s->refcount_table_offset,
+        s->refcount_table_size * sizeof(uint64_t));
+
     for(i = 0; i < s->refcount_table_size; i++) {
-        int64_t offset;
+        uint64_t offset, cluster;
         offset = s->refcount_table[i];
+        cluster = offset >> s->cluster_bits;
 
         /* Refcount blocks are cluster aligned */
         if (offset & (s->cluster_size - 1)) {
-            fprintf(stderr, "ERROR refcount block %d is not "
+            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                 "cluster aligned; refcount table entry corrupted\n", i);
-            errors++;
+            res->corruptions++;
+            continue;
+        }
+
+        if (cluster >= nb_clusters) {
+            fprintf(stderr, "ERROR refcount block %" PRId64
+                    " is outside image\n", i);
+            res->corruptions++;
+            continue;
         }
 
         if (offset != 0) {
-            errors += inc_refcounts(bs, refcount_table, nb_clusters,
-                          offset, s->cluster_size);
-            if (refcount_table[offset / s->cluster_size] != 1) {
-                fprintf(stderr, "ERROR refcount block %d refcount=%d\n",
-                    i, refcount_table[offset / s->cluster_size]);
+            inc_refcounts(bs, res, refcount_table, nb_clusters,
+                offset, s->cluster_size);
+            if (refcount_table[cluster] != 1) {
+                fprintf(stderr, "%s refcount block %" PRId64
+                    " refcount=%d\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                            "ERROR",
+                    i, refcount_table[cluster]);
+
+                if (fix & BDRV_FIX_ERRORS) {
+                    int64_t new_offset;
+
+                    new_offset = realloc_refcount_block(bs, i, offset);
+                    if (new_offset < 0) {
+                        res->corruptions++;
+                        continue;
+                    }
+
+                    /* update refcounts */
+                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
+                        /* increase refcount_table size if necessary */
+                        int old_nb_clusters = nb_clusters;
+                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
+                        refcount_table = g_realloc(refcount_table,
+                                nb_clusters * sizeof(uint16_t));
+                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
+                                - old_nb_clusters) * sizeof(uint16_t));
+                    }
+                    refcount_table[cluster]--;
+                    inc_refcounts(bs, res, refcount_table, nb_clusters,
+                            new_offset, s->cluster_size);
+
+                    res->corruptions_fixed++;
+                } else {
+                    res->corruptions++;
+                }
             }
         }
     }
 
     /* compare ref counts */
-    for(i = 0; i < nb_clusters; i++) {
+    for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
         refcount1 = get_refcount(bs, i);
+        if (refcount1 < 0) {
+            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
+                i, strerror(-refcount1));
+            res->check_errors++;
+            continue;
+        }
+
         refcount2 = refcount_table[i];
+
+        if (refcount1 > 0 || refcount2 > 0) {
+            highest_cluster = i;
+        }
+
         if (refcount1 != refcount2) {
-            fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
+
+            /* Check if we're allowed to fix the mismatch */
+            int *num_fixed = NULL;
+            if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
+                num_fixed = &res->leaks_fixed;
+            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
+                num_fixed = &res->corruptions_fixed;
+            }
+
+            fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
+                   num_fixed != NULL     ? "Repairing" :
+                   refcount1 < refcount2 ? "ERROR" :
+                                           "Leaked",
                    i, refcount1, refcount2);
-            errors++;
+
+            if (num_fixed) {
+                ret = update_refcount(bs, i << s->cluster_bits, 1,
+                                      refcount2 - refcount1,
+                                      QCOW2_DISCARD_ALWAYS);
+                if (ret >= 0) {
+                    (*num_fixed)++;
+                    continue;
+                }
+            }
+
+            /* And if we couldn't, print an error */
+            if (refcount1 < refcount2) {
+                res->corruptions++;
+            } else {
+                res->leaks++;
+            }
+        }
+    }
+
+    /* check OFLAG_COPIED */
+    ret = check_oflag_copied(bs, res, fix);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
+    ret = 0;
+
+fail:
+    g_free(refcount_table);
+
+    return ret;
+}
+
+#define overlaps_with(ofs, sz) \
+    ranges_overlap(offset, size, ofs, sz)
+
+/*
+ * Checks if the given offset into the image file is actually free to use by
+ * looking for overlaps with important metadata sections (L1/L2 tables etc.),
+ * i.e. a sanity check without relying on the refcount tables.
+ *
+ * The ign parameter specifies what checks not to perform (being a bitmask of
+ * QCow2MetadataOverlap values), i.e., what sections to ignore.
+ *
+ * Returns:
+ * - 0 if writing to this offset will not affect the mentioned metadata
+ * - a positive QCow2MetadataOverlap value indicating one overlapping section
+ * - a negative value (-errno) indicating an error while performing a check,
+ *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
+ */
+int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
+                                 int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int chk = s->overlap_check & ~ign;
+    int i, j;
+
+    if (!size) {
+        return 0;
+    }
+
+    if (chk & QCOW2_OL_MAIN_HEADER) {
+        if (offset < s->cluster_size) {
+            return QCOW2_OL_MAIN_HEADER;
+        }
+    }
+
+    /* align range to test to cluster boundaries */
+    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
+    offset = start_of_cluster(s, offset);
+
+    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
+        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
+            return QCOW2_OL_ACTIVE_L1;
         }
     }
 
-    qemu_free(refcount_table);
+    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
+        if (overlaps_with(s->refcount_table_offset,
+            s->refcount_table_size * sizeof(uint64_t))) {
+            return QCOW2_OL_REFCOUNT_TABLE;
+        }
+    }
+
+    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
+        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
+            return QCOW2_OL_SNAPSHOT_TABLE;
+        }
+    }
+
+    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
+        for (i = 0; i < s->nb_snapshots; i++) {
+            if (s->snapshots[i].l1_size &&
+                overlaps_with(s->snapshots[i].l1_table_offset,
+                s->snapshots[i].l1_size * sizeof(uint64_t))) {
+                return QCOW2_OL_INACTIVE_L1;
+            }
+        }
+    }
 
-    return errors;
+    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
+        for (i = 0; i < s->l1_size; i++) {
+            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
+                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
+                s->cluster_size)) {
+                return QCOW2_OL_ACTIVE_L2;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
+        for (i = 0; i < s->refcount_table_size; i++) {
+            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
+                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
+                s->cluster_size)) {
+                return QCOW2_OL_REFCOUNT_BLOCK;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
+        for (i = 0; i < s->nb_snapshots; i++) {
+            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
+            uint32_t l1_sz  = s->snapshots[i].l1_size;
+            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
+            uint64_t *l1 = g_malloc(l1_sz2);
+            int ret;
+
+            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
+            if (ret < 0) {
+                g_free(l1);
+                return ret;
+            }
+
+            for (j = 0; j < l1_sz; j++) {
+                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
+                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
+                    g_free(l1);
+                    return QCOW2_OL_INACTIVE_L2;
+                }
+            }
+
+            g_free(l1);
+        }
+    }
+
+    return 0;
 }
 
+static const char *metadata_ol_names[] = {
+    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
+    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
+    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
+    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
+    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
+    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
+    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
+    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
+};
+
+/*
+ * First performs a check for metadata overlaps (through
+ * qcow2_check_metadata_overlap); if that fails with a negative value (error
+ * while performing a check), that value is returned. If an impending overlap
+ * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
+ * and -EIO returned.
+ *
+ * Returns 0 if there were neither overlaps nor errors while checking for
+ * overlaps; or a negative value (-errno) on error.
+ */
+int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
+                                  int64_t size)
+{
+    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
+
+    if (ret < 0) {
+        return ret;
+    } else if (ret > 0) {
+        int metadata_ol_bitnr = ffs(ret) - 1;
+        char *message;
+        QObject *data;
+
+        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
+
+        fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
+                "with %s); image marked as corrupt.\n",
+                metadata_ol_names[metadata_ol_bitnr]);
+        message = g_strdup_printf("Prevented %s overwrite",
+                metadata_ol_names[metadata_ol_bitnr]);
+        data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
+                PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
+                offset, size);
+        monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
+        g_free(message);
+        qobject_decref(data);
+
+        qcow2_mark_corrupt(bs);
+        bs->drv = NULL; /* make BDS unusable */
+        return -EIO;
+    }
+
+    return 0;
+}