]> git.proxmox.com Git - mirror_qemu.git/blobdiff - block/qcow2-refcount.c
remove unnecessary casts from uintptr_t
[mirror_qemu.git] / block / qcow2-refcount.c
index 94033972bedc7ac904f89fd6b436c55b3619c1ad..0266542ceea26e0829c9757f5d7a1ac067a714ae 100644 (file)
@@ -23,6 +23,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "block/block-io.h"
 #include "qapi/error.h"
 #include "qcow2.h"
 #include "qemu/range.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                     uint64_t max);
-static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-                            int64_t offset, int64_t length, uint64_t addend,
-                            bool decrease, enum qcow2_discard_type type);
+
+G_GNUC_WARN_UNUSED_RESULT
+static int update_refcount(BlockDriverState *bs,
+                           int64_t offset, int64_t length, uint64_t addend,
+                           bool decrease, enum qcow2_discard_type type);
 
 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
@@ -95,7 +98,7 @@ static void update_max_refcount_table_index(BDRVQcow2State *s)
     s->max_refcount_table_index = i;
 }
 
-int qcow2_refcount_init(BlockDriverState *bs)
+int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     unsigned int refcount_table_size2, i;
@@ -115,9 +118,9 @@ int qcow2_refcount_init(BlockDriverState *bs)
             ret = -ENOMEM;
             goto fail;
         }
-        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
-        ret = bdrv_pread(bs->file, s->refcount_table_offset,
-                         s->refcount_table, refcount_table_size2);
+        BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
+        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
+                            refcount_table_size2, s->refcount_table, 0);
         if (ret < 0) {
             goto fail;
         }
@@ -226,9 +229,9 @@ static void set_refcount_ro6(void *refcount_array, uint64_t index,
 }
 
 
-static int load_refcount_block(BlockDriverState *bs,
-                               int64_t refcount_block_offset,
-                               void **refcount_block)
+static int GRAPH_RDLOCK
+load_refcount_block(BlockDriverState *bs, int64_t refcount_block_offset,
+                    void **refcount_block)
 {
     BDRVQcow2State *s = bs->opaque;
 
@@ -299,8 +302,9 @@ static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
  *
  * Returns 0 on success or -errno in error case
  */
-static int alloc_refcount_block(BlockDriverState *bs,
-                                int64_t cluster_index, void **refcount_block)
+static int GRAPH_RDLOCK
+alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index,
+                     void **refcount_block)
 {
     BDRVQcow2State *s = bs->opaque;
     unsigned int refcount_table_index;
@@ -437,7 +441,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
         ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                                refcount_table_index * REFTABLE_ENTRY_SIZE,
-            &data64, sizeof(data64));
+            sizeof(data64), &data64, 0);
         if (ret < 0) {
             goto fail;
         }
@@ -682,8 +686,8 @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
-    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
-        table_size * REFTABLE_ENTRY_SIZE);
+    ret = bdrv_pwrite_sync(bs->file, table_offset,
+                           table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
     if (ret < 0) {
         goto fail;
     }
@@ -702,7 +706,7 @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
     ret = bdrv_pwrite_sync(bs->file,
                            offsetof(QCowHeader, refcount_table_offset),
-                           &data, sizeof(data));
+                           sizeof(data), &data, 0);
     if (ret < 0) {
         goto fail;
     }
@@ -803,12 +807,9 @@ found:
 /* XXX: cache several refcount block clusters ? */
 /* @addend is the absolute value of the addend; if @decrease is set, @addend
  * will be subtracted from the current refcount, otherwise it will be added */
-static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-                                                   int64_t offset,
-                                                   int64_t length,
-                                                   uint64_t addend,
-                                                   bool decrease,
-                                                   enum qcow2_discard_type type)
+static int GRAPH_RDLOCK
+update_refcount(BlockDriverState *bs, int64_t offset, int64_t length,
+                uint64_t addend, bool decrease, enum qcow2_discard_type type)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t start, last, cluster_offset;
@@ -964,8 +965,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,
 
 
 /* return < 0 if error */
-static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
-                                    uint64_t max)
+static int64_t GRAPH_RDLOCK
+alloc_clusters_noref(BlockDriverState *bs, uint64_t size, uint64_t max)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t i, nb_clusters, refcount;
@@ -1027,8 +1028,8 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
     return offset;
 }
 
-int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
-                                int64_t nb_clusters)
+int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
+                                             int64_t nb_clusters)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t cluster_index, refcount;
@@ -1066,14 +1067,14 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 
 /* only used to allocate compressed sectors. We try to allocate
    contiguous sectors. size must be <= cluster_size */
-int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
+int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t offset;
     size_t free_in_cluster;
     int ret;
 
-    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
+    BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
     assert(size > 0 && size <= s->cluster_size);
     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
 
@@ -1204,7 +1205,7 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
     }
 }
 
-int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
+int qcow2_write_caches(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     int ret;
@@ -1224,7 +1225,7 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
     return 0;
 }
 
-int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
+int qcow2_flush_caches(BlockDriverState *bs)
 {
     int ret = qcow2_write_caches(bs);
     if (ret < 0) {
@@ -1272,7 +1273,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
         }
         l1_allocated = true;
 
-        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
+        ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
         if (ret < 0) {
             goto fail;
         }
@@ -1433,8 +1434,8 @@ fail:
             cpu_to_be64s(&l1_table[i]);
         }
 
-        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
-                               l1_table, l1_size2);
+        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
+                               0);
 
         for (i = 0; i < l1_size; i++) {
             be64_to_cpus(&l1_table[i]);
@@ -1521,10 +1522,11 @@ static int realloc_refcount_array(BDRVQcow2State *s, void **array,
  *
  * Modifies the number of errors in res.
  */
-int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
-                             void **refcount_table,
-                             int64_t *refcount_table_size,
-                             int64_t offset, int64_t size)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
+                         void **refcount_table,
+                         int64_t *refcount_table_size,
+                         int64_t offset, int64_t size)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t start, last, cluster_offset, k, refcount;
@@ -1535,7 +1537,7 @@ int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
         return 0;
     }
 
-    file_len = bdrv_getlength(bs->file->bs);
+    file_len = bdrv_co_getlength(bs->file->bs);
     if (file_len < 0) {
         return file_len;
     }
@@ -1597,10 +1599,11 @@ enum {
  *
  * On failure in-memory @l2_table may be modified.
  */
-static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
-                                uint64_t l2_offset,
-                                uint64_t *l2_table, int l2_index, bool active,
-                                bool *metadata_overlap)
+static int coroutine_fn GRAPH_RDLOCK
+fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
+                     uint64_t l2_offset, uint64_t *l2_table,
+                     int l2_index, bool active,
+                     bool *metadata_overlap)
 {
     BDRVQcow2State *s = bs->opaque;
     int ret;
@@ -1631,8 +1634,8 @@ static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
         goto fail;
     }
 
-    ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx],
-                           l2_entry_size(s));
+    ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
+                              &l2_table[idx], 0);
     if (ret < 0) {
         fprintf(stderr, "ERROR: Failed to overwrite L2 "
                 "table entry: %s\n", strerror(-ret));
@@ -1656,10 +1659,11 @@ fail:
  * Returns the number of errors found by the checks or -errno if an internal
  * error occurred.
  */
-static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
-                              void **refcount_table,
-                              int64_t *refcount_table_size, int64_t l2_offset,
-                              int flags, BdrvCheckMode fix, bool active)
+static int coroutine_fn GRAPH_RDLOCK
+check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
+                   void **refcount_table,
+                   int64_t *refcount_table_size, int64_t l2_offset,
+                   int flags, BdrvCheckMode fix, bool active)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t l2_entry, l2_bitmap;
@@ -1670,7 +1674,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
     bool metadata_overlap;
 
     /* Read L2 table from disk */
-    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes);
+    ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
     if (ret < 0) {
         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
         res->check_errors++;
@@ -1855,12 +1859,11 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
  * Returns the number of errors found by the checks or -errno if an internal
  * error occurred.
  */
-static int check_refcounts_l1(BlockDriverState *bs,
-                              BdrvCheckResult *res,
-                              void **refcount_table,
-                              int64_t *refcount_table_size,
-                              int64_t l1_table_offset, int l1_size,
-                              int flags, BdrvCheckMode fix, bool active)
+static int coroutine_fn GRAPH_RDLOCK
+check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res,
+                   void **refcount_table, int64_t *refcount_table_size,
+                   int64_t l1_table_offset, int l1_size,
+                   int flags, BdrvCheckMode fix, bool active)
 {
     BDRVQcow2State *s = bs->opaque;
     size_t l1_size_bytes = l1_size * L1E_SIZE;
@@ -1886,7 +1889,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
     }
 
     /* Read L1 table entries from disk */
-    ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes);
+    ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
     if (ret < 0) {
         fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
         res->check_errors++;
@@ -1946,8 +1949,8 @@ static int check_refcounts_l1(BlockDriverState *bs,
  * have been already detected and sufficiently signaled by the calling function
  * (qcow2_check_refcounts) by the time this function is called).
  */
-static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
-                              BdrvCheckMode fix)
+static int coroutine_fn GRAPH_RDLOCK
+check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
@@ -2002,8 +2005,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
             }
         }
 
-        ret = bdrv_pread(bs->file, l2_offset, l2_table,
-                         s->l2_size * l2_entry_size(s));
+        ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
+                            l2_table, 0);
         if (ret < 0) {
             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                     strerror(-ret));
@@ -2056,8 +2059,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                 goto fail;
             }
 
-            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
-                              s->cluster_size);
+            ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0);
             if (ret < 0) {
                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                         strerror(-ret));
@@ -2080,9 +2082,10 @@ fail:
  * Checks consistency of refblocks and accounts for each refblock in
  * *refcount_table.
  */
-static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
-                           BdrvCheckMode fix, bool *rebuild,
-                           void **refcount_table, int64_t *nb_clusters)
+static int coroutine_fn GRAPH_RDLOCK
+check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
+                BdrvCheckMode fix, bool *rebuild,
+                void **refcount_table, int64_t *nb_clusters)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t i, size;
@@ -2124,13 +2127,13 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                     goto resize_fail;
                 }
 
-                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
-                                    PREALLOC_MODE_OFF, 0, &local_err);
+                ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false,
+                                       PREALLOC_MODE_OFF, 0, &local_err);
                 if (ret < 0) {
                     error_report_err(local_err);
                     goto resize_fail;
                 }
-                size = bdrv_getlength(bs->file->bs);
+                size = bdrv_co_getlength(bs->file->bs);
                 if (size < 0) {
                     ret = size;
                     goto resize_fail;
@@ -2194,9 +2197,10 @@ resize_fail:
 /*
  * Calculates an in-memory refcount table.
  */
-static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                               BdrvCheckMode fix, bool *rebuild,
-                               void **refcount_table, int64_t *nb_clusters)
+static int coroutine_fn GRAPH_RDLOCK
+calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                    BdrvCheckMode fix, bool *rebuild,
+                    void **refcount_table, int64_t *nb_clusters)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t i;
@@ -2296,10 +2300,11 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
  * Compares the actual reference count for each cluster in the image against the
  * refcount as reported by the refcount structures on-disk.
  */
-static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                              BdrvCheckMode fix, bool *rebuild,
-                              int64_t *highest_cluster,
-                              void *refcount_table, int64_t nb_clusters)
+static void coroutine_fn GRAPH_RDLOCK
+compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+                  BdrvCheckMode fix, bool *rebuild,
+                  int64_t *highest_cluster,
+                  void *refcount_table, int64_t nb_clusters)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t i;
@@ -2436,188 +2441,329 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
 }
 
 /*
- * Creates a new refcount structure based solely on the in-memory information
- * given through *refcount_table. All necessary allocations will be reflected
- * in that array.
+ * Helper function for rebuild_refcount_structure().
  *
- * On success, the old refcount structure is leaked (it will be covered by the
- * new refcount structure).
+ * Scan the range of clusters [first_cluster, end_cluster) for allocated
+ * clusters and write all corresponding refblocks to disk.  The refblock
+ * and allocation data is taken from the in-memory refcount table
+ * *refcount_table[] (of size *nb_clusters), which is basically one big
+ * (unlimited size) refblock for the whole image.
+ *
+ * For these refblocks, clusters are allocated using said in-memory
+ * refcount table.  Care is taken that these allocations are reflected
+ * in the refblocks written to disk.
+ *
+ * The refblocks' offsets are written into a reftable, which is
+ * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
+ * that reftable is of insufficient size, it will be resized to fit.
+ * This reftable is not written to disk.
+ *
+ * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
+ * to point to existing valid refblocks that do not need to be allocated
+ * again.)
+ *
+ * Return whether the on-disk reftable array was resized (true/false),
+ * or -errno on error.
  */
-static int rebuild_refcount_structure(BlockDriverState *bs,
-                                      BdrvCheckResult *res,
-                                      void **refcount_table,
-                                      int64_t *nb_clusters)
+static int coroutine_fn GRAPH_RDLOCK
+rebuild_refcounts_write_refblocks(
+        BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
+        int64_t first_cluster, int64_t end_cluster,
+        uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
+        Error **errp
+    )
 {
     BDRVQcow2State *s = bs->opaque;
-    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
+    int64_t cluster;
     int64_t refblock_offset, refblock_start, refblock_index;
-    uint32_t reftable_size = 0;
-    uint64_t *on_disk_reftable = NULL;
+    int64_t first_free_cluster = 0;
+    uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
+    uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
     void *on_disk_refblock;
-    int ret = 0;
-    struct {
-        uint64_t reftable_offset;
-        uint32_t reftable_clusters;
-    } QEMU_PACKED reftable_offset_and_clusters;
-
-    qcow2_cache_empty(bs, s->refcount_block_cache);
+    bool reftable_grown = false;
+    int ret;
 
-write_refblocks:
-    for (; cluster < *nb_clusters; cluster++) {
+    for (cluster = first_cluster; cluster < end_cluster; cluster++) {
+        /* Check all clusters to find refblocks that contain non-zero entries */
         if (!s->get_refcount(*refcount_table, cluster)) {
             continue;
         }
 
+        /*
+         * This cluster is allocated, so we need to create a refblock
+         * for it.  The data we will write to disk is just the
+         * respective slice from *refcount_table, so it will contain
+         * accurate refcounts for all clusters belonging to this
+         * refblock.  After we have written it, we will therefore skip
+         * all remaining clusters in this refblock.
+         */
+
         refblock_index = cluster >> s->refcount_block_bits;
         refblock_start = refblock_index << s->refcount_block_bits;
 
-        /* Don't allocate a cluster in a refblock already written to disk */
-        if (first_free_cluster < refblock_start) {
-            first_free_cluster = refblock_start;
-        }
-        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
-                                              nb_clusters, &first_free_cluster);
-        if (refblock_offset < 0) {
-            fprintf(stderr, "ERROR allocating refblock: %s\n",
-                    strerror(-refblock_offset));
-            res->check_errors++;
-            ret = refblock_offset;
-            goto fail;
-        }
+        if (on_disk_reftable_entries > refblock_index &&
+            on_disk_reftable[refblock_index])
+        {
+            /*
+             * We can get here after a `goto write_refblocks`: We have a
+             * reftable from a previous run, and the refblock is already
+             * allocated.  No need to allocate it again.
+             */
+            refblock_offset = on_disk_reftable[refblock_index];
+        } else {
+            int64_t refblock_cluster_index;
 
-        if (reftable_size <= refblock_index) {
-            uint32_t old_reftable_size = reftable_size;
-            uint64_t *new_on_disk_reftable;
+            /* Don't allocate a cluster in a refblock already written to disk */
+            if (first_free_cluster < refblock_start) {
+                first_free_cluster = refblock_start;
+            }
+            refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
+                                                  nb_clusters,
+                                                  &first_free_cluster);
+            if (refblock_offset < 0) {
+                error_setg_errno(errp, -refblock_offset,
+                                 "ERROR allocating refblock");
+                return refblock_offset;
+            }
 
-            reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
-                                     s->cluster_size) / REFTABLE_ENTRY_SIZE;
-            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
-                                                 reftable_size *
-                                                 REFTABLE_ENTRY_SIZE);
-            if (!new_on_disk_reftable) {
-                res->check_errors++;
-                ret = -ENOMEM;
-                goto fail;
+            refblock_cluster_index = refblock_offset / s->cluster_size;
+            if (refblock_cluster_index >= end_cluster) {
+                /*
+                 * We must write the refblock that holds this refblock's
+                 * refcount
+                 */
+                end_cluster = refblock_cluster_index + 1;
             }
-            on_disk_reftable = new_on_disk_reftable;
 
-            memset(on_disk_reftable + old_reftable_size, 0,
-                   (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE);
+            if (on_disk_reftable_entries <= refblock_index) {
+                on_disk_reftable_entries =
+                    ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
+                             s->cluster_size) / REFTABLE_ENTRY_SIZE;
+                on_disk_reftable =
+                    g_try_realloc(on_disk_reftable,
+                                  on_disk_reftable_entries *
+                                  REFTABLE_ENTRY_SIZE);
+                if (!on_disk_reftable) {
+                    error_setg(errp, "ERROR allocating reftable memory");
+                    return -ENOMEM;
+                }
 
-            /* The offset we have for the reftable is now no longer valid;
-             * this will leak that range, but we can easily fix that by running
-             * a leak-fixing check after this rebuild operation */
-            reftable_offset = -1;
-        } else {
-            assert(on_disk_reftable);
-        }
-        on_disk_reftable[refblock_index] = refblock_offset;
+                memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
+                       (on_disk_reftable_entries -
+                        *on_disk_reftable_entries_ptr) *
+                       REFTABLE_ENTRY_SIZE);
 
-        /* If this is apparently the last refblock (for now), try to squeeze the
-         * reftable in */
-        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
-            reftable_offset < 0)
-        {
-            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
-                                                          REFTABLE_ENTRY_SIZE);
-            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
-                                                  refcount_table, nb_clusters,
-                                                  &first_free_cluster);
-            if (reftable_offset < 0) {
-                fprintf(stderr, "ERROR allocating reftable: %s\n",
-                        strerror(-reftable_offset));
-                res->check_errors++;
-                ret = reftable_offset;
-                goto fail;
+                *on_disk_reftable_ptr = on_disk_reftable;
+                *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
+
+                reftable_grown = true;
+            } else {
+                assert(on_disk_reftable);
             }
+            on_disk_reftable[refblock_index] = refblock_offset;
         }
 
+        /* Refblock is allocated, write it to disk */
+
         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                             s->cluster_size, false);
         if (ret < 0) {
-            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
-            goto fail;
+            error_setg_errno(errp, -ret, "ERROR writing refblock");
+            return ret;
         }
 
-        /* The size of *refcount_table is always cluster-aligned, therefore the
-         * write operation will not overflow */
+        /*
+         * The refblock is simply a slice of *refcount_table.
+         * Note that the size of *refcount_table is always aligned to
+         * whole clusters, so the write operation will not result in
+         * out-of-bounds accesses.
+         */
         on_disk_refblock = (void *)((char *) *refcount_table +
                                     refblock_index * s->cluster_size);
 
-        ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock,
-                          s->cluster_size);
+        ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size,
+                             on_disk_refblock, 0);
         if (ret < 0) {
-            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
-            goto fail;
+            error_setg_errno(errp, -ret, "ERROR writing refblock");
+            return ret;
         }
 
-        /* Go to the end of this refblock */
+        /* This refblock is done, skip to its end */
         cluster = refblock_start + s->refcount_block_size - 1;
     }
 
-    if (reftable_offset < 0) {
-        uint64_t post_refblock_start, reftable_clusters;
+    return reftable_grown;
+}
+
+/*
+ * Creates a new refcount structure based solely on the in-memory information
+ * given through *refcount_table (this in-memory information is basically just
+ * the concatenation of all refblocks).  All necessary allocations will be
+ * reflected in that array.
+ *
+ * On success, the old refcount structure is leaked (it will be covered by the
+ * new refcount structure).
+ */
+static int coroutine_fn GRAPH_RDLOCK
+rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
+                           void **refcount_table, int64_t *nb_clusters,
+                           Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t reftable_offset = -1;
+    int64_t reftable_length = 0;
+    int64_t reftable_clusters;
+    int64_t refblock_index;
+    uint32_t on_disk_reftable_entries = 0;
+    uint64_t *on_disk_reftable = NULL;
+    int ret = 0;
+    int reftable_size_changed = 0;
+    struct {
+        uint64_t reftable_offset;
+        uint32_t reftable_clusters;
+    } QEMU_PACKED reftable_offset_and_clusters;
+
+    qcow2_cache_empty(bs, s->refcount_block_cache);
+
+    /*
+     * For each refblock containing entries, we try to allocate a
+     * cluster (in the in-memory refcount table) and write its offset
+     * into on_disk_reftable[].  We then write the whole refblock to
+     * disk (as a slice of the in-memory refcount table).
+     * This is done by rebuild_refcounts_write_refblocks().
+     *
+     * Once we have scanned all clusters, we try to find space for the
+     * reftable.  This will dirty the in-memory refcount table (i.e.
+     * make it differ from the refblocks we have already written), so we
+     * need to run rebuild_refcounts_write_refblocks() again for the
+     * range of clusters where the reftable has been allocated.
+     *
+     * This second run might make the reftable grow again, in which case
+     * we will need to allocate another space for it, which is why we
+     * repeat all this until the reftable stops growing.
+     *
+     * (This loop will terminate, because with every cluster the
+     * reftable grows, it can accommodate a multitude of more refcounts,
+     * so that at some point this must be able to cover the reftable
+     * and all refblocks describing it.)
+     *
+     * We then convert the reftable to big-endian and write it to disk.
+     *
+     * Note that we never free any reftable allocations.  Doing so would
+     * needlessly complicate the algorithm: The eventual second check
+     * run we do will clean up all leaks we have caused.
+     */
+
+    reftable_size_changed =
+        rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
+                                          0, *nb_clusters,
+                                          &on_disk_reftable,
+                                          &on_disk_reftable_entries, errp);
+    if (reftable_size_changed < 0) {
+        res->check_errors++;
+        ret = reftable_size_changed;
+        goto fail;
+    }
+
+    /*
+     * There was no reftable before, so rebuild_refcounts_write_refblocks()
+     * must have increased its size (from 0 to something).
+     */
+    assert(reftable_size_changed);
+
+    do {
+        int64_t reftable_start_cluster, reftable_end_cluster;
+        int64_t first_free_cluster = 0;
+
+        reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
+        reftable_clusters = size_to_clusters(s, reftable_length);
 
-        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
-        reftable_clusters =
-            size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE);
-        /* Not pretty but simple */
-        if (first_free_cluster < post_refblock_start) {
-            first_free_cluster = post_refblock_start;
-        }
         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                               refcount_table, nb_clusters,
                                               &first_free_cluster);
         if (reftable_offset < 0) {
-            fprintf(stderr, "ERROR allocating reftable: %s\n",
-                    strerror(-reftable_offset));
+            error_setg_errno(errp, -reftable_offset,
+                             "ERROR allocating reftable");
             res->check_errors++;
             ret = reftable_offset;
             goto fail;
         }
 
-        goto write_refblocks;
-    }
+        /*
+         * We need to update the affected refblocks, so re-run the
+         * write_refblocks loop for the reftable's range of clusters.
+         */
+        assert(offset_into_cluster(s, reftable_offset) == 0);
+        reftable_start_cluster = reftable_offset / s->cluster_size;
+        reftable_end_cluster = reftable_start_cluster + reftable_clusters;
+        reftable_size_changed =
+            rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
+                                              reftable_start_cluster,
+                                              reftable_end_cluster,
+                                              &on_disk_reftable,
+                                              &on_disk_reftable_entries, errp);
+        if (reftable_size_changed < 0) {
+            res->check_errors++;
+            ret = reftable_size_changed;
+            goto fail;
+        }
+
+        /*
+         * If the reftable size has changed, we will need to find a new
+         * allocation, repeating the loop.
+         */
+    } while (reftable_size_changed);
+
+    /* The above loop must have run at least once */
+    assert(reftable_offset >= 0);
+
+    /*
+     * All allocations are done, all refblocks are written, convert the
+     * reftable to big-endian and write it to disk.
+     */
 
-    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
+    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
+         refblock_index++)
+    {
         cpu_to_be64s(&on_disk_reftable[refblock_index]);
     }
 
-    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
-                                        reftable_size * REFTABLE_ENTRY_SIZE,
+    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
                                         false);
     if (ret < 0) {
-        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
+        error_setg_errno(errp, -ret, "ERROR writing reftable");
         goto fail;
     }
 
-    assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE);
-    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
-                      reftable_size * REFTABLE_ENTRY_SIZE);
+    assert(reftable_length < INT_MAX);
+    ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length,
+                         on_disk_reftable, 0);
     if (ret < 0) {
-        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
+        error_setg_errno(errp, -ret, "ERROR writing reftable");
         goto fail;
     }
 
     /* Enter new reftable into the image header */
     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
     reftable_offset_and_clusters.reftable_clusters =
-        cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE));
-    ret = bdrv_pwrite_sync(bs->file,
-                           offsetof(QCowHeader, refcount_table_offset),
-                           &reftable_offset_and_clusters,
-                           sizeof(reftable_offset_and_clusters));
+        cpu_to_be32(reftable_clusters);
+    ret = bdrv_co_pwrite_sync(bs->file,
+                              offsetof(QCowHeader, refcount_table_offset),
+                              sizeof(reftable_offset_and_clusters),
+                              &reftable_offset_and_clusters, 0);
     if (ret < 0) {
-        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
+        error_setg_errno(errp, -ret, "ERROR setting reftable");
         goto fail;
     }
 
-    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
+    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
+         refblock_index++)
+    {
         be64_to_cpus(&on_disk_reftable[refblock_index]);
     }
     s->refcount_table = on_disk_reftable;
     s->refcount_table_offset = reftable_offset;
-    s->refcount_table_size = reftable_size;
+    s->refcount_table_size = on_disk_reftable_entries;
     update_max_refcount_table_index(s);
 
     return 0;
@@ -2633,8 +2779,8 @@ fail:
  * Returns 0 if no errors are found, the number of errors in case the image is
  * detected as corrupted, and -errno when an internal error occurred.
  */
-int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
-                          BdrvCheckMode fix)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 {
     BDRVQcow2State *s = bs->opaque;
     BdrvCheckResult pre_compare_res;
@@ -2643,7 +2789,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
     bool rebuild = false;
     int ret;
 
-    size = bdrv_getlength(bs->file->bs);
+    size = bdrv_co_getlength(bs->file->bs);
     if (size < 0) {
         res->check_errors++;
         return size;
@@ -2674,11 +2820,13 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
         BdrvCheckResult old_res = *res;
         int fresh_leaks = 0;
+        Error *local_err = NULL;
 
         fprintf(stderr, "Rebuilding refcount structure\n");
         ret = rebuild_refcount_structure(bs, res, &refcount_table,
-                                         &nb_clusters);
+                                         &nb_clusters, &local_err);
         if (ret < 0) {
+            error_report_err(local_err);
             goto fail;
         }
 
@@ -2864,7 +3012,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                 return -ENOMEM;
             }
 
-            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
+            ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
             if (ret < 0) {
                 g_free(l1);
                 return ret;
@@ -2953,20 +3101,22 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
  *
  * @allocated should be set to true if a new cluster has been allocated.
  */
-typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
-                               uint64_t reftable_index, uint64_t *reftable_size,
-                               void *refblock, bool refblock_empty,
-                               bool *allocated, Error **errp);
+typedef int /* GRAPH_RDLOCK_PTR */
+    (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
+                       uint64_t reftable_index, uint64_t *reftable_size,
+                       void *refblock, bool refblock_empty,
+                       bool *allocated, Error **errp);
 
 /**
  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
  * it is not empty) and inserts its offset into the new reftable. The size of
  * this new reftable is increased as required.
  */
-static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
-                          uint64_t reftable_index, uint64_t *reftable_size,
-                          void *refblock, bool refblock_empty, bool *allocated,
-                          Error **errp)
+static int GRAPH_RDLOCK
+alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
+               uint64_t reftable_index, uint64_t *reftable_size,
+               void *refblock, bool refblock_empty, bool *allocated,
+               Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t offset;
@@ -3016,10 +3166,11 @@ static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
  * offset specified by the new reftable's entry. It does not modify the new
  * reftable or change any refcounts.
  */
-static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
-                          uint64_t reftable_index, uint64_t *reftable_size,
-                          void *refblock, bool refblock_empty, bool *allocated,
-                          Error **errp)
+static int GRAPH_RDLOCK
+flush_refblock(BlockDriverState *bs, uint64_t **reftable,
+               uint64_t reftable_index, uint64_t *reftable_size,
+               void *refblock, bool refblock_empty, bool *allocated,
+               Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t offset;
@@ -3035,7 +3186,7 @@ static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
             return ret;
         }
 
-        ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
+        ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
         if (ret < 0) {
             error_setg_errno(errp, -ret, "Failed to write refblock");
             return ret;
@@ -3060,16 +3211,17 @@ static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
  *
  * @allocated is set to true if a new cluster has been allocated.
  */
-static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
-                              uint64_t *new_reftable_index,
-                              uint64_t *new_reftable_size,
-                              void *new_refblock, int new_refblock_size,
-                              int new_refcount_bits,
-                              RefblockFinishOp *operation, bool *allocated,
-                              Qcow2SetRefcountFunc *new_set_refcount,
-                              BlockDriverAmendStatusCB *status_cb,
-                              void *cb_opaque, int index, int total,
-                              Error **errp)
+static int GRAPH_RDLOCK
+walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
+                   uint64_t *new_reftable_index,
+                   uint64_t *new_reftable_size,
+                   void *new_refblock, int new_refblock_size,
+                   int new_refcount_bits,
+                   RefblockFinishOp *operation, bool *allocated,
+                   Qcow2SetRefcountFunc *new_set_refcount,
+                   BlockDriverAmendStatusCB *status_cb,
+                   void *cb_opaque, int index, int total,
+                   Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t reftable_index;
@@ -3307,8 +3459,9 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
         cpu_to_be64s(&new_reftable[i]);
     }
 
-    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
-                      new_reftable_size * REFTABLE_ENTRY_SIZE);
+    ret = bdrv_pwrite(bs->file, new_reftable_offset,
+                      new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
+                      0);
 
     for (i = 0; i < new_reftable_size; i++) {
         be64_to_cpus(&new_reftable[i]);
@@ -3394,7 +3547,8 @@ done:
     return ret;
 }
 
-static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
+static int64_t coroutine_fn GRAPH_RDLOCK
+get_refblock_offset(BlockDriverState *bs, uint64_t offset)
 {
     BDRVQcow2State *s = bs->opaque;
     uint32_t index = offset_to_reftable_index(s, offset);
@@ -3413,8 +3567,8 @@ static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
     return covering_refblock_offset;
 }
 
-static int qcow2_discard_refcount_block(BlockDriverState *bs,
-                                        uint64_t discard_block_offs)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t refblock_offs;
@@ -3470,7 +3624,7 @@ static int qcow2_discard_refcount_block(BlockDriverState *bs,
     return 0;
 }
 
-int qcow2_shrink_reftable(BlockDriverState *bs)
+int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t *reftable_tmp =
@@ -3511,8 +3665,9 @@ int qcow2_shrink_reftable(BlockDriverState *bs)
         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
     }
 
-    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
-                           s->refcount_table_size * REFTABLE_ENTRY_SIZE);
+    ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
+                              s->refcount_table_size * REFTABLE_ENTRY_SIZE,
+                              reftable_tmp, 0);
     /*
      * If the write in the reftable failed the image may contain a partially
      * overwritten reftable. In this case it would be better to clear the
@@ -3537,7 +3692,7 @@ out:
     return ret;
 }
 
-int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
+int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t i;
@@ -3559,7 +3714,8 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
     return -EIO;
 }
 
-int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
+int coroutine_fn GRAPH_RDLOCK
+qcow2_detect_metadata_preallocation(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     int64_t i, end_cluster, cluster_count = 0, threshold;
@@ -3567,12 +3723,12 @@ int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
 
     qemu_co_mutex_assert_locked(&s->lock);
 
-    file_length = bdrv_getlength(bs->file->bs);
+    file_length = bdrv_co_getlength(bs->file->bs);
     if (file_length < 0) {
         return file_length;
     }
 
-    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
+    real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
     if (real_allocation < 0) {
         return real_allocation;
     }