]> git.proxmox.com Git - mirror_qemu.git/blobdiff - block.c
migration/rdma: Fix unwanted integer truncation
[mirror_qemu.git] / block.c
diff --git a/block.c b/block.c
index cfb7e088955801ece5d4ad5d47778637f0f4db1c..af04c8ac6f3efe73591230baf4fa1cd109aff195 100644 (file)
--- a/block.c
+++ b/block.c
@@ -91,9 +91,11 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
 static bool bdrv_recurse_has_child(BlockDriverState *bs,
                                    BlockDriverState *child);
 
-static void bdrv_replace_child_noperm(BdrvChild *child,
-                                      BlockDriverState *new_bs);
-static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
+static void GRAPH_WRLOCK
+bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs);
+
+static void GRAPH_WRLOCK
+bdrv_remove_child(BdrvChild *child, Transaction *tran);
 
 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue,
@@ -415,7 +417,7 @@ BlockDriverState *bdrv_new(void)
     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
         QLIST_INIT(&bs->op_blockers[i]);
     }
-    qemu_co_mutex_init(&bs->reqs_lock);
+    qemu_mutex_init(&bs->reqs_lock);
     qemu_mutex_init(&bs->dirty_bitmap_mutex);
     bs->refcnt = 1;
     bs->aio_context = qemu_get_aio_context();
@@ -661,8 +663,10 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
     blk = blk_co_new_open(filename, NULL, options,
                           BDRV_O_RDWR | BDRV_O_RESIZE, errp);
     if (!blk) {
-        error_prepend(errp, "Protocol driver '%s' does not support image "
-                      "creation, and opening the image failed: ",
+        error_prepend(errp, "Protocol driver '%s' does not support creating "
+                      "new images, so an existing image must be selected as "
+                      "the target; however, opening the given target as an "
+                      "existing image failed: ",
                       drv->format_name);
         return -EINVAL;
     }
@@ -1697,7 +1701,9 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
 open_failed:
     bs->drv = NULL;
     if (bs->file != NULL) {
+        bdrv_graph_wrlock(NULL);
         bdrv_unref_child(bs, bs->file);
+        bdrv_graph_wrunlock();
         assert(!bs->file);
     }
     g_free(bs->opaque);
@@ -2113,7 +2119,6 @@ static int bdrv_fill_options(QDict **options, const char *filename,
 
 typedef struct BlockReopenQueueEntry {
      bool prepared;
-     bool perms_checked;
      BDRVReopenState state;
      QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
 } BlockReopenQueueEntry;
@@ -2199,7 +2204,8 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
     return false;
 }
 
-static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
+static bool GRAPH_RDLOCK
+bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
 {
     BdrvChild *a, *b;
     GLOBAL_STATE_CODE();
@@ -2224,11 +2230,12 @@ static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
     return false;
 }
 
-static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
-                            BdrvChild *c, BdrvChildRole role,
-                            BlockReopenQueue *reopen_queue,
-                            uint64_t parent_perm, uint64_t parent_shared,
-                            uint64_t *nperm, uint64_t *nshared)
+static void GRAPH_RDLOCK
+bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
+                BdrvChild *c, BdrvChildRole role,
+                BlockReopenQueue *reopen_queue,
+                uint64_t parent_perm, uint64_t parent_shared,
+                uint64_t *nperm, uint64_t *nshared)
 {
     assert(bs->drv && bs->drv->bdrv_child_perm);
     GLOBAL_STATE_CODE();
@@ -2252,8 +2259,8 @@ static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
  * simplest way to satisfy this criteria: use only result of
  * bdrv_topological_dfs() or NULL as @list parameter.
  */
-static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
-                                    BlockDriverState *bs)
+static GSList * GRAPH_RDLOCK
+bdrv_topological_dfs(GSList *list, GHashTable *found, BlockDriverState *bs)
 {
     BdrvChild *child;
     g_autoptr(GHashTable) local_found = NULL;
@@ -2316,7 +2323,7 @@ static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
     tran_add(tran, &bdrv_child_set_pem_drv, s);
 }
 
-static void bdrv_drv_set_perm_commit(void *opaque)
+static void GRAPH_RDLOCK bdrv_drv_set_perm_commit(void *opaque)
 {
     BlockDriverState *bs = opaque;
     uint64_t cumulative_perms, cumulative_shared_perms;
@@ -2329,7 +2336,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
     }
 }
 
-static void bdrv_drv_set_perm_abort(void *opaque)
+static void GRAPH_RDLOCK bdrv_drv_set_perm_abort(void *opaque)
 {
     BlockDriverState *bs = opaque;
     GLOBAL_STATE_CODE();
@@ -2344,9 +2351,13 @@ TransactionActionDrv bdrv_drv_set_perm_drv = {
     .commit = bdrv_drv_set_perm_commit,
 };
 
-static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
-                             uint64_t shared_perm, Transaction *tran,
-                             Error **errp)
+/*
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
+ */
+static int GRAPH_RDLOCK
+bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm,
+                  Transaction *tran, Error **errp)
 {
     GLOBAL_STATE_CODE();
     if (!bs->drv) {
@@ -2372,20 +2383,22 @@ typedef struct BdrvReplaceChildState {
     BlockDriverState *old_bs;
 } BdrvReplaceChildState;
 
-static void bdrv_replace_child_commit(void *opaque)
+static void GRAPH_WRLOCK bdrv_replace_child_commit(void *opaque)
 {
     BdrvReplaceChildState *s = opaque;
     GLOBAL_STATE_CODE();
 
-    bdrv_unref(s->old_bs);
+    bdrv_schedule_unref(s->old_bs);
 }
 
-static void bdrv_replace_child_abort(void *opaque)
+static void GRAPH_WRLOCK bdrv_replace_child_abort(void *opaque)
 {
     BdrvReplaceChildState *s = opaque;
     BlockDriverState *new_bs = s->child->bs;
 
     GLOBAL_STATE_CODE();
+    assert_bdrv_graph_writable();
+
     /* old_bs reference is transparently moved from @s to @s->child */
     if (!s->child->bs) {
         /*
@@ -2402,6 +2415,7 @@ static void bdrv_replace_child_abort(void *opaque)
     }
     assert(s->child->quiesced_parent);
     bdrv_replace_child_noperm(s->child, s->old_bs);
+
     bdrv_unref(new_bs);
 }
 
@@ -2419,10 +2433,14 @@ static TransactionActionDrv bdrv_replace_child_drv = {
  * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
  * kept drained until the transaction is completed.
  *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ *
  * The function doesn't update permissions, caller is responsible for this.
  */
-static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
-                                    Transaction *tran)
+static void GRAPH_WRLOCK
+bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
+                        Transaction *tran)
 {
     BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
 
@@ -2438,6 +2456,7 @@ static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
     if (new_bs) {
         bdrv_ref(new_bs);
     }
+
     bdrv_replace_child_noperm(child, new_bs);
     /* old_bs reference is transparently moved from @child to @s */
 }
@@ -2445,9 +2464,13 @@ static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
 /*
  * Refresh permissions in @bs subtree. The function is intended to be called
  * after some graph modification that was done without permission update.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
  */
-static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
-                                  Transaction *tran, Error **errp)
+static int GRAPH_RDLOCK
+bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
+                       Transaction *tran, Error **errp)
 {
     BlockDriver *drv = bs->drv;
     BdrvChild *c;
@@ -2520,9 +2543,13 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
 /*
  * @list is a product of bdrv_topological_dfs() (may be called several times) -
  * a topologically sorted subgraph.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
  */
-static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
-                                 Transaction *tran, Error **errp)
+static int GRAPH_RDLOCK
+bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
+                      Error **errp)
 {
     int ret;
     BlockDriverState *bs;
@@ -2548,9 +2575,13 @@ static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
  * @list is any list of nodes. List is completed by all subtrees and
  * topologically sorted. It's not a problem if some node occurs in the @list
  * several times.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
  */
-static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
-                                   Transaction *tran, Error **errp)
+static int GRAPH_RDLOCK
+bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
+                        Error **errp)
 {
     g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
     g_autoptr(GSList) refresh_list = NULL;
@@ -2609,9 +2640,14 @@ char *bdrv_perm_names(uint64_t perm)
 }
 
 
-/* @tran is allowed to be NULL. In this case no rollback is possible */
-static int bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran,
-                              Error **errp)
+/*
+ * @tran is allowed to be NULL. In this case no rollback is possible.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
+ */
+static int GRAPH_RDLOCK
+bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran, Error **errp)
 {
     int ret;
     Transaction *local_tran = NULL;
@@ -2857,8 +2893,8 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
  * If @new_bs is non-NULL, the parent of @child must already be drained through
  * @child and the caller must hold the AioContext lock for @new_bs.
  */
-static void bdrv_replace_child_noperm(BdrvChild *child,
-                                      BlockDriverState *new_bs)
+static void GRAPH_WRLOCK
+bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
 {
     BlockDriverState *old_bs = child->bs;
     int new_bs_quiesce_counter;
@@ -2893,8 +2929,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
 
-    /* TODO Pull this up into the callers to avoid polling here */
-    bdrv_graph_wrlock(new_bs);
     if (old_bs) {
         if (child->klass->detach) {
             child->klass->detach(child);
@@ -2910,7 +2944,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
             child->klass->attach(child);
         }
     }
-    bdrv_graph_wrunlock();
 
     /*
      * If the parent was drained through this BdrvChild previously, but new_bs
@@ -2945,12 +2978,14 @@ typedef struct BdrvAttachChildCommonState {
     AioContext *old_child_ctx;
 } BdrvAttachChildCommonState;
 
-static void bdrv_attach_child_common_abort(void *opaque)
+static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque)
 {
     BdrvAttachChildCommonState *s = opaque;
     BlockDriverState *bs = s->child->bs;
 
     GLOBAL_STATE_CODE();
+    assert_bdrv_graph_writable();
+
     bdrv_replace_child_noperm(s->child, NULL);
 
     if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
@@ -2975,7 +3010,7 @@ static void bdrv_attach_child_common_abort(void *opaque)
         tran_commit(tran);
     }
 
-    bdrv_unref(bs);
+    bdrv_schedule_unref(bs);
     bdrv_child_free(s->child);
 }
 
@@ -2989,19 +3024,23 @@ static TransactionActionDrv bdrv_attach_child_common_drv = {
  *
  * Function doesn't update permissions, caller is responsible for this.
  *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ *
  * Returns new created child.
  *
  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
  */
-static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
-                                           const char *child_name,
-                                           const BdrvChildClass *child_class,
-                                           BdrvChildRole child_role,
-                                           uint64_t perm, uint64_t shared_perm,
-                                           void *opaque,
-                                           Transaction *tran, Error **errp)
+static BdrvChild * GRAPH_WRLOCK
+bdrv_attach_child_common(BlockDriverState *child_bs,
+                         const char *child_name,
+                         const BdrvChildClass *child_class,
+                         BdrvChildRole child_role,
+                         uint64_t perm, uint64_t shared_perm,
+                         void *opaque,
+                         Transaction *tran, Error **errp)
 {
     BdrvChild *new_child;
     AioContext *parent_ctx, *new_child_ctx;
@@ -3033,18 +3072,19 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
                                               &local_err);
 
         if (ret < 0 && child_class->change_aio_ctx) {
-            Transaction *tran = tran_new();
+            Transaction *aio_ctx_tran = tran_new();
             GHashTable *visited = g_hash_table_new(NULL, NULL);
             bool ret_child;
 
             g_hash_table_add(visited, new_child);
             ret_child = child_class->change_aio_ctx(new_child, child_ctx,
-                                                    visited, tran, NULL);
+                                                    visited, aio_ctx_tran,
+                                                    NULL);
             if (ret_child == true) {
                 error_free(local_err);
                 ret = 0;
             }
-            tran_finalize(tran, ret_child == true ? 0 : -1);
+            tran_finalize(aio_ctx_tran, ret_child == true ? 0 : -1);
             g_hash_table_destroy(visited);
         }
 
@@ -3104,14 +3144,18 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
  */
-static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
-                                           BlockDriverState *child_bs,
-                                           const char *child_name,
-                                           const BdrvChildClass *child_class,
-                                           BdrvChildRole child_role,
-                                           Transaction *tran,
-                                           Error **errp)
+static BdrvChild * GRAPH_WRLOCK
+bdrv_attach_child_noperm(BlockDriverState *parent_bs,
+                         BlockDriverState *child_bs,
+                         const char *child_name,
+                         const BdrvChildClass *child_class,
+                         BdrvChildRole child_role,
+                         Transaction *tran,
+                         Error **errp)
 {
     uint64_t perm, shared_perm;
 
@@ -3156,6 +3200,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
 
     GLOBAL_STATE_CODE();
 
+    bdrv_graph_wrlock(child_bs);
+
     child = bdrv_attach_child_common(child_bs, child_name, child_class,
                                    child_role, perm, shared_perm, opaque,
                                    tran, errp);
@@ -3168,6 +3214,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
 
 out:
     tran_finalize(tran, ret);
+    bdrv_graph_wrunlock();
 
     bdrv_unref(child_bs);
 
@@ -3213,7 +3260,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 out:
     tran_finalize(tran, ret);
 
-    bdrv_unref(child_bs);
+    bdrv_schedule_unref(child_bs);
 
     return ret < 0 ? NULL : child;
 }
@@ -3243,7 +3290,7 @@ void bdrv_root_unref_child(BdrvChild *child)
                                     NULL);
     }
 
-    bdrv_unref(child_bs);
+    bdrv_schedule_unref(child_bs);
 }
 
 typedef struct BdrvSetInheritsFrom {
@@ -3287,8 +3334,9 @@ static void bdrv_set_inherits_from(BlockDriverState *bs,
  * @root that point to @root, where necessary.
  * @tran is allowed to be NULL. In this case no rollback is possible
  */
-static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
-                                     Transaction *tran)
+static void GRAPH_WRLOCK
+bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
+                         Transaction *tran)
 {
     BdrvChild *c;
 
@@ -3325,7 +3373,8 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
 }
 
 
-static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
+static void GRAPH_RDLOCK
+bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
 {
     BdrvChild *c;
     GLOBAL_STATE_CODE();
@@ -3366,16 +3415,23 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
  * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
  * callers which don't need their own reference any more must call bdrv_unref().
  *
+ * If the respective child is already present (i.e. we're detaching a node),
+ * that child node must be drained.
+ *
  * Function doesn't update permissions, caller is responsible for this.
  *
  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
  */
-static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
-                                           BlockDriverState *child_bs,
-                                           bool is_backing,
-                                           Transaction *tran, Error **errp)
+static int GRAPH_WRLOCK
+bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
+                                BlockDriverState *child_bs,
+                                bool is_backing,
+                                Transaction *tran, Error **errp)
 {
     bool update_inherits_from =
         bdrv_inherits_from_recursive(child_bs, parent_bs);
@@ -3426,6 +3482,7 @@ static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
     }
 
     if (child) {
+        assert(child->bs->quiesce_counter);
         bdrv_unset_inherits_from(parent_bs, child, tran);
         bdrv_remove_child(child, tran);
     }
@@ -3452,9 +3509,7 @@ static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
     }
 
 out:
-    bdrv_graph_rdlock_main_loop();
     bdrv_refresh_limits(parent_bs, tran, NULL);
-    bdrv_graph_rdunlock_main_loop();
 
     return 0;
 }
@@ -3463,10 +3518,17 @@ out:
  * The caller must hold the AioContext lock for @backing_hd. Both @bs and
  * @backing_hd can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
+ *
+ * If a backing child is already present (i.e. we're detaching a node), that
+ * child node must be drained.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
  */
-static int bdrv_set_backing_noperm(BlockDriverState *bs,
-                                   BlockDriverState *backing_hd,
-                                   Transaction *tran, Error **errp)
+static int GRAPH_WRLOCK
+bdrv_set_backing_noperm(BlockDriverState *bs,
+                        BlockDriverState *backing_hd,
+                        Transaction *tran, Error **errp)
 {
     GLOBAL_STATE_CODE();
     return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
@@ -3481,6 +3543,10 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs,
 
     GLOBAL_STATE_CODE();
     assert(bs->quiesce_counter > 0);
+    if (bs->backing) {
+        assert(bs->backing->bs->quiesce_counter > 0);
+    }
+    bdrv_graph_wrlock(backing_hd);
 
     ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
     if (ret < 0) {
@@ -3490,18 +3556,22 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs,
     ret = bdrv_refresh_perms(bs, tran, errp);
 out:
     tran_finalize(tran, ret);
+    bdrv_graph_wrunlock();
     return ret;
 }
 
 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
                         Error **errp)
 {
+    BlockDriverState *drain_bs = bs->backing ? bs->backing->bs : bs;
     int ret;
     GLOBAL_STATE_CODE();
 
-    bdrv_drained_begin(bs);
+    bdrv_ref(drain_bs);
+    bdrv_drained_begin(drain_bs);
     ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
-    bdrv_drained_end(bs);
+    bdrv_drained_end(drain_bs);
+    bdrv_unref(drain_bs);
 
     return ret;
 }
@@ -3711,11 +3781,13 @@ BdrvChild *bdrv_open_child(const char *filename,
         return NULL;
     }
 
+    bdrv_graph_wrlock(NULL);
     ctx = bdrv_get_aio_context(bs);
     aio_context_acquire(ctx);
     child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
                               errp);
     aio_context_release(ctx);
+    bdrv_graph_wrunlock();
 
     return child;
 }
@@ -3900,6 +3972,9 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
     GLOBAL_STATE_CODE();
     assert(!qemu_in_coroutine());
 
+    /* TODO We'll eventually have to take a writer lock in this function */
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
+
     if (reference) {
         bool options_non_empty = options ? qdict_size(options) : false;
         qobject_unref(options);
@@ -4539,7 +4614,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
      * reconfiguring the fd and that's why it does it in raw_check_perm(), not
      * in raw_reopen_prepare() which is called with "old" permissions.
      */
+    bdrv_graph_rdlock_main_loop();
     ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
+    bdrv_graph_rdunlock_main_loop();
+
     if (ret < 0) {
         goto abort;
     }
@@ -4560,7 +4638,9 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
         aio_context_release(ctx);
     }
 
+    bdrv_graph_wrlock(NULL);
     tran_commit(tran);
+    bdrv_graph_wrunlock();
 
     QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
         BlockDriverState *bs = bs_entry->state.bs;
@@ -4577,7 +4657,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
     goto cleanup;
 
 abort:
+    bdrv_graph_wrlock(NULL);
     tran_abort(tran);
+    bdrv_graph_wrunlock();
+
     QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
         if (bs_entry->prepared) {
             ctx = bdrv_get_aio_context(bs_entry->state.bs);
@@ -4643,6 +4726,9 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
  * true and reopen_state->new_backing_bs contains a pointer to the new
  * backing BlockDriverState (or NULL).
  *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ *
  * Return 0 on success, otherwise return < 0 and set @errp.
  *
  * The caller must hold the AioContext lock of @reopen_state->bs.
@@ -4727,6 +4813,11 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
         reopen_state->old_file_bs = old_child_bs;
     }
 
+    if (old_child_bs) {
+        bdrv_ref(old_child_bs);
+        bdrv_drained_begin(old_child_bs);
+    }
+
     old_ctx = bdrv_get_aio_context(bs);
     ctx = bdrv_get_aio_context(new_child_bs);
     if (old_ctx != ctx) {
@@ -4734,14 +4825,23 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
         aio_context_acquire(ctx);
     }
 
+    bdrv_graph_wrlock(new_child_bs);
+
     ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
                                           tran, errp);
 
+    bdrv_graph_wrunlock();
+
     if (old_ctx != ctx) {
         aio_context_release(ctx);
         aio_context_acquire(old_ctx);
     }
 
+    if (old_child_bs) {
+        bdrv_drained_end(old_child_bs);
+        bdrv_unref(old_child_bs);
+    }
+
     return ret;
 }
 
@@ -4762,6 +4862,9 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
  * commit() for any other BDS that have been left in a prepare() state
  *
  * The caller must hold the AioContext lock of @reopen_state->bs.
+ *
+ * After calling this function, the transaction @change_child_tran may only be
+ * completed while holding a writer lock for the graph.
  */
 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue,
@@ -5063,9 +5166,11 @@ static void bdrv_close(BlockDriverState *bs)
         bs->drv = NULL;
     }
 
+    bdrv_graph_wrlock(NULL);
     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
         bdrv_unref_child(bs, child);
     }
+    bdrv_graph_wrunlock();
 
     assert(!bs->backing);
     assert(!bs->file);
@@ -5120,7 +5225,7 @@ void bdrv_close_all(void)
     assert(QTAILQ_EMPTY(&all_bdrv_states));
 }
 
-static bool should_update_child(BdrvChild *c, BlockDriverState *to)
+static bool GRAPH_RDLOCK should_update_child(BdrvChild *c, BlockDriverState *to)
 {
     GQueue *queue;
     GHashTable *found;
@@ -5209,45 +5314,47 @@ static TransactionActionDrv bdrv_remove_child_drv = {
     .commit = bdrv_remove_child_commit,
 };
 
-/* Function doesn't update permissions, caller is responsible for this. */
-static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
+/*
+ * Function doesn't update permissions, caller is responsible for this.
+ *
+ * @child->bs (if non-NULL) must be drained.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ */
+static void GRAPH_WRLOCK bdrv_remove_child(BdrvChild *child, Transaction *tran)
 {
     if (!child) {
         return;
     }
 
     if (child->bs) {
-        BlockDriverState *bs = child->bs;
-        bdrv_drained_begin(bs);
+        assert(child->quiesced_parent);
         bdrv_replace_child_tran(child, NULL, tran);
-        bdrv_drained_end(bs);
     }
 
     tran_add(tran, &bdrv_remove_child_drv, child);
 }
 
-static void undrain_on_clean_cb(void *opaque)
-{
-    bdrv_drained_end(opaque);
-}
-
-static TransactionActionDrv undrain_on_clean = {
-    .clean = undrain_on_clean_cb,
-};
-
-static int bdrv_replace_node_noperm(BlockDriverState *from,
-                                    BlockDriverState *to,
-                                    bool auto_skip, Transaction *tran,
-                                    Error **errp)
+/*
+ * Both @from and @to (if non-NULL) must be drained. @to must be kept drained
+ * until the transaction is completed.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ */
+static int GRAPH_WRLOCK
+bdrv_replace_node_noperm(BlockDriverState *from,
+                         BlockDriverState *to,
+                         bool auto_skip, Transaction *tran,
+                         Error **errp)
 {
     BdrvChild *c, *next;
 
     GLOBAL_STATE_CODE();
 
-    bdrv_drained_begin(from);
-    bdrv_drained_begin(to);
-    tran_add(tran, &undrain_on_clean, from);
-    tran_add(tran, &undrain_on_clean, to);
+    assert(from->quiesce_counter);
+    assert(to->quiesce_counter);
 
     QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
         assert(c->bs == from);
@@ -5310,6 +5417,9 @@ static int bdrv_replace_node_common(BlockDriverState *from,
     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
     assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
     bdrv_drained_begin(from);
+    bdrv_drained_begin(to);
+
+    bdrv_graph_wrlock(to);
 
     /*
      * Do the replacement without permission update.
@@ -5323,6 +5433,7 @@ static int bdrv_replace_node_common(BlockDriverState *from,
     }
 
     if (detach_subchain) {
+        /* to_cow_parent is already drained because from is drained */
         bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
     }
 
@@ -5338,7 +5449,9 @@ static int bdrv_replace_node_common(BlockDriverState *from,
 
 out:
     tran_finalize(tran, ret);
+    bdrv_graph_wrunlock();
 
+    bdrv_drained_end(to);
     bdrv_drained_end(from);
     bdrv_unref(from);
 
@@ -5388,6 +5501,22 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
     assert(!bs_new->backing);
 
     old_context = bdrv_get_aio_context(bs_top);
+    bdrv_drained_begin(bs_top);
+
+    /*
+     * bdrv_drained_begin() requires that only the AioContext of the drained
+     * node is locked, and at this point it can still differ from the AioContext
+     * of bs_top.
+     */
+    new_context = bdrv_get_aio_context(bs_new);
+    aio_context_release(old_context);
+    aio_context_acquire(new_context);
+    bdrv_drained_begin(bs_new);
+    aio_context_release(new_context);
+    aio_context_acquire(old_context);
+    new_context = NULL;
+
+    bdrv_graph_wrlock(bs_top);
 
     child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
                                      &child_of_bds, bdrv_backing_role(bs_new),
@@ -5398,10 +5527,9 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
     }
 
     /*
-     * bdrv_attach_child_noperm could change the AioContext of bs_top.
-     * bdrv_replace_node_noperm calls bdrv_drained_begin, so let's temporarily
-     * hold the new AioContext, since bdrv_drained_begin calls BDRV_POLL_WHILE
-     * that assumes the new lock is taken.
+     * bdrv_attach_child_noperm could change the AioContext of bs_top and
+     * bs_new, but at least they are in the same AioContext now. This is the
+     * AioContext that we need to lock for the rest of the function.
      */
     new_context = bdrv_get_aio_context(bs_top);
 
@@ -5419,9 +5547,11 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
 out:
     tran_finalize(tran, ret);
 
-    bdrv_graph_rdlock_main_loop();
     bdrv_refresh_limits(bs_top, NULL, NULL);
-    bdrv_graph_rdunlock_main_loop();
+    bdrv_graph_wrunlock();
+
+    bdrv_drained_end(bs_top);
+    bdrv_drained_end(bs_new);
 
     if (new_context && old_context != new_context) {
         aio_context_release(new_context);
@@ -5445,6 +5575,7 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
     bdrv_ref(old_bs);
     bdrv_drained_begin(old_bs);
     bdrv_drained_begin(new_bs);
+    bdrv_graph_wrlock(new_bs);
 
     bdrv_replace_child_tran(child, new_bs, tran);
 
@@ -5455,6 +5586,7 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
 
     tran_finalize(tran, ret);
 
+    bdrv_graph_wrunlock();
     bdrv_drained_end(old_bs);
     bdrv_drained_end(new_bs);
     bdrv_unref(old_bs);
@@ -5476,6 +5608,8 @@ static void bdrv_delete(BlockDriverState *bs)
 
     bdrv_close(bs);
 
+    qemu_mutex_destroy(&bs->reqs_lock);
+
     g_free(bs);
 }
 
@@ -5808,9 +5942,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
         backing_file_str = base->filename;
     }
 
+    bdrv_graph_rdlock_main_loop();
     QLIST_FOREACH(c, &top->parents, next_parent) {
         updated_children = g_slist_prepend(updated_children, c);
     }
+    bdrv_graph_rdunlock_main_loop();
 
     /*
      * It seems correct to pass detach_subchain=true here, but it triggers
@@ -6073,12 +6209,12 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
     QLIST_FOREACH(drv, &bdrv_drivers, list) {
         if (drv->format_name) {
             bool found = false;
-            int i = count;
 
             if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
                 continue;
             }
 
+            i = count;
             while (formats && i && !found) {
                 found = !strcmp(formats[--i], drv->format_name);
             }
@@ -6733,6 +6869,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
     BdrvDirtyBitmap *bm;
 
     GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
 
     if (!bs->drv)  {
         return -ENOMEDIUM;
@@ -6863,6 +7000,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
     uint64_t cumulative_perms, cumulative_shared_perms;
 
     GLOBAL_STATE_CODE();
+    GRAPH_RDLOCK_GUARD_MAINLOOP();
 
     if (!bs->drv) {
         return -ENOMEDIUM;
@@ -7041,6 +7179,23 @@ void bdrv_unref(BlockDriverState *bs)
     }
 }
 
+/*
+ * Release a BlockDriverState reference while holding the graph write lock.
+ *
+ * Calling bdrv_unref() directly is forbidden while holding the graph lock
+ * because bdrv_close() both involves polling and taking the graph lock
+ * internally. bdrv_schedule_unref() instead delays decreasing the refcount and
+ * possibly closing @bs until the graph lock is released.
+ */
+void bdrv_schedule_unref(BlockDriverState *bs)
+{
+    if (!bs) {
+        return;
+    }
+    aio_bh_schedule_oneshot(qemu_get_aio_context(),
+                            (QEMUBHFunc *) bdrv_unref, bs);
+}
+
 struct BdrvOpBlocker {
     Error *reason;
     QLIST_ENTRY(BdrvOpBlocker) list;
@@ -7537,17 +7692,21 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
         return true;
     }
 
+    bdrv_graph_rdlock_main_loop();
     QLIST_FOREACH(c, &bs->parents, next_parent) {
         if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
+            bdrv_graph_rdunlock_main_loop();
             return false;
         }
     }
 
     QLIST_FOREACH(c, &bs->children, next) {
         if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
+            bdrv_graph_rdunlock_main_loop();
             return false;
         }
     }
+    bdrv_graph_rdunlock_main_loop();
 
     state = g_new(BdrvStateSetAioContext, 1);
     *state = (BdrvStateSetAioContext) {