]> git.proxmox.com Git - pve-qemu.git/blame - debian/patches/pve/0043-alloc-track-fix-deadlock-during-drop.patch
backup: factor out & clean up gathering device info into helper
[pve-qemu.git] / debian / patches / pve / 0043-alloc-track-fix-deadlock-during-drop.patch
CommitLineData
bf251437
FE
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Fiona Ebner <f.ebner@proxmox.com>
3Date: Thu, 6 Apr 2023 14:59:31 +0200
4Subject: [PATCH] alloc-track: fix deadlock during drop
5
6by replacing the block node directly after changing the backing file
7instead of rescheduling it.
8
9With changes in QEMU 8.0, calling bdrv_get_info (and bdrv_unref)
10during drop can lead to a deadlock when using iothread (only triggered
11with multiple disks, except during debugging where it also triggered
12with one disk sometimes):
131. job_unref_locked acquires the AioContext and calls job->driver->free
142. track_drop gets scheduled
153. bdrv_graph_wrlock is called and polls which leads to track_drop being
16 called
174. track_drop acquires the AioContext recursively
185. bdrv_get_info is a wrapped coroutine (since 8.0) and thus polls for
19 bdrv_co_get_info. This releases the AioContext, but only once! The
20 documentation for the AIO_WAIT_WHILE macro states that the
21 AioContext lock needs to be acquired exactly once, but there does
22 not seem to be a way for track_drop to know if it acquired the lock
23 recursively or not (without adding further hacks).
246. Because the AioContext is still held by the main thread once, it can't
25 be acquired before entering bdrv_co_get_info in co_schedule_bh_cb
26 which happens in the iothread
27
28When doing the operation in change_backing_file, the AioContext has
29already been acquired by the caller, so the issue with the recursive
30lock goes away.
31
32The comment explaining why delaying the replace is necessary is
33> we need to schedule this for later however, since when this function
34> is called, the blockjob modifying us is probably not done yet and
35> has a blocker on 'bs'
36
37However, there is no check for blockers in bdrv_replace_node. It would
38need to be done by us, the caller, with check_to_replace_node.
39Furthermore, the mirror job also does its call to bdrv_replace_node
40while there is an active blocker (inserted by mirror itself) and they
41use a specialized version to check for blockers instead of
42check_to_replace_node there. Alloc-track could also do something
43similar to check for other blockers, but it should be fine to rely on
44Proxmox VE that no other operation with the blockdev is going on.
45
46Mirror also drains the target before replacing the node, but the
47target can have other users. In case of alloc-track the file child
48should not be accessible by anybody else and so there can't be an
49in-flight operation for the file child when alloc-track is drained.
50
51The rescheduling based on refcounting is a hack and it doesn't seem to
52be necessary anymore. It's not clear what the original issue from the
53comment was. Testing with older builds with track_drop done directly
54without rescheduling also didn't lead to any noticeable issue for me.
55
56One issue it might have been is the one fixed by b1e1af394d
57("block/stream: Drain subtree around graph change"), where
58block-stream had a use-after-free if the base node changed at an
59inconvenient time (which alloc-track's auto-drop does).
60
61It's also not possible to just not auto-replace the alloc-track. Not
62replacing it at all leads to other operations like block resize
63hanging, and there is no good way to replace it manually via QMP
64(there is x-blockdev-change, but it is experimental and doesn't
65implement the required operation yet). Also, it's just cleaner in
66general to not leave unnecessary block nodes lying around.
67
68Suggested-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
69Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
8dd76cc5 70Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
bf251437
FE
71---
72 block/alloc-track.c | 54 ++++++++++++++-------------------------------
73 1 file changed, 16 insertions(+), 38 deletions(-)
74
75diff --git a/block/alloc-track.c b/block/alloc-track.c
76index b75d7c6460..76da140a68 100644
77--- a/block/alloc-track.c
78+++ b/block/alloc-track.c
79@@ -25,7 +25,6 @@
80
81 typedef enum DropState {
82 DropNone,
83- DropRequested,
84 DropInProgress,
85 } DropState;
86
87@@ -268,37 +267,6 @@ static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
88 }
89 }
90
91-static void track_drop(void *opaque)
92-{
93- BlockDriverState *bs = (BlockDriverState*)opaque;
94- BlockDriverState *file = bs->file->bs;
95- BDRVAllocTrackState *s = bs->opaque;
96-
97- assert(file);
98-
99- /* we rely on the fact that we're not used anywhere else, so let's wait
100- * until we're only used once - in the drive connected to the guest (and one
101- * ref is held by bdrv_ref in track_change_backing_file) */
102- if (bs->refcnt > 2) {
103- aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
104- return;
105- }
106- AioContext *aio_context = bdrv_get_aio_context(bs);
107- aio_context_acquire(aio_context);
108-
109- bdrv_drained_begin(bs);
110-
111- /* now that we're drained, we can safely set 'DropInProgress' */
112- s->drop_state = DropInProgress;
113- bdrv_child_refresh_perms(bs, bs->file, &error_abort);
114-
115- bdrv_replace_node(bs, file, &error_abort);
116- bdrv_set_backing_hd(bs, NULL, &error_abort);
117- bdrv_drained_end(bs);
118- bdrv_unref(bs);
119- aio_context_release(aio_context);
120-}
121-
122 static int track_change_backing_file(BlockDriverState *bs,
123 const char *backing_file,
124 const char *backing_fmt)
125@@ -308,13 +276,23 @@ static int track_change_backing_file(BlockDriverState *bs,
126 backing_file == NULL && backing_fmt == NULL)
127 {
128 /* backing file has been disconnected, there's no longer any use for
129- * this node, so let's remove ourselves from the block graph - we need
130- * to schedule this for later however, since when this function is
131- * called, the blockjob modifying us is probably not done yet and has a
132- * blocker on 'bs' */
133- s->drop_state = DropRequested;
134+ * this node, so let's remove ourselves from the block graph */
135+ BlockDriverState *file = bs->file->bs;
136+
137+ /* Just to be sure, because bdrv_replace_node unrefs it */
138 bdrv_ref(bs);
139- aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
140+ bdrv_drained_begin(bs);
141+
142+ /* now that we're drained, we can safely set 'DropInProgress' */
143+ s->drop_state = DropInProgress;
144+
145+ bdrv_child_refresh_perms(bs, bs->file, &error_abort);
146+
147+ bdrv_replace_node(bs, file, &error_abort);
148+ bdrv_set_backing_hd(bs, NULL, &error_abort);
149+
150+ bdrv_drained_end(bs);
151+ bdrv_unref(bs);
152 }
153
154 return 0;