]>
Commit | Line | Data |
---|---|---|
bf251437 FE |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Fiona Ebner <f.ebner@proxmox.com> | |
3 | Date: Thu, 6 Apr 2023 14:59:31 +0200 | |
4 | Subject: [PATCH] alloc-track: fix deadlock during drop | |
5 | ||
6 | by replacing the block node directly after changing the backing file | |
7 | instead of rescheduling it. | |
8 | ||
9 | With changes in QEMU 8.0, calling bdrv_get_info (and bdrv_unref) | |
10 | during drop can lead to a deadlock when using iothread (only triggered | |
11 | with multiple disks, except during debugging where it also triggered | |
12 | with one disk sometimes): | |
13 | 1. job_unref_locked acquires the AioContext and calls job->driver->free | |
14 | 2. track_drop gets scheduled | |
15 | 3. bdrv_graph_wrlock is called and polls which leads to track_drop being | |
16 | called | |
17 | 4. track_drop acquires the AioContext recursively | |
18 | 5. bdrv_get_info is a wrapped coroutine (since 8.0) and thus polls for | |
19 | bdrv_co_get_info. This releases the AioContext, but only once! The | |
20 | documentation for the AIO_WAIT_WHILE macro states that the | |
21 | AioContext lock needs to be acquired exactly once, but there does | |
22 | not seem to be a way for track_drop to know if it acquired the lock | |
23 | recursively or not (without adding further hacks). | |
24 | 6. Because the AioContext is still held by the main thread once, it can't | |
25 | be acquired before entering bdrv_co_get_info in co_schedule_bh_cb | |
26 | which happens in the iothread | |
27 | ||
28 | When doing the operation in change_backing_file, the AioContext has | |
29 | already been acquired by the caller, so the issue with the recursive | |
30 | lock goes away. | |
31 | ||
32 | The comment explaining why delaying the replace is necessary is | |
33 | > we need to schedule this for later however, since when this function | |
34 | > is called, the blockjob modifying us is probably not done yet and | |
35 | > has a blocker on 'bs' | |
36 | ||
37 | However, there is no check for blockers in bdrv_replace_node. It would | |
38 | need to be done by us, the caller, with check_to_replace_node. | |
39 | Furthermore, the mirror job also does its call to bdrv_replace_node | |
40 | while there is an active blocker (inserted by mirror itself) and it | |
41 | uses a specialized version to check for blockers instead of | |
42 | check_to_replace_node there. Alloc-track could also do something | |
43 | similar to check for other blockers, but it should be fine to rely on | |
44 | Proxmox VE that no other operation with the blockdev is going on. | |
45 | ||
46 | Mirror also drains the target before replacing the node, but the | |
47 | target can have other users. In case of alloc-track the file child | |
48 | should not be accessible by anybody else and so there can't be an | |
49 | in-flight operation for the file child when alloc-track is drained. | |
50 | ||
51 | The rescheduling based on refcounting is a hack and it doesn't seem to | |
52 | be necessary anymore. It's not clear what the original issue from the | |
53 | comment was. Testing with older builds with track_drop done directly | |
54 | without rescheduling also didn't lead to any noticeable issue for me. | |
55 | ||
56 | One issue it might have been is the one fixed by b1e1af394d | |
57 | ("block/stream: Drain subtree around graph change"), where | |
58 | block-stream had a use-after-free if the base node changed at an | |
59 | inconvenient time (which alloc-track's auto-drop does). | |
60 | ||
61 | It's also not possible to just not auto-replace the alloc-track. Not | |
62 | replacing it at all leads to other operations like block resize | |
63 | hanging, and there is no good way to replace it manually via QMP | |
64 | (there is x-blockdev-change, but it is experimental and doesn't | |
65 | implement the required operation yet). Also, it's just cleaner in | |
66 | general to not leave unnecessary block nodes lying around. | |
67 | ||
68 | Suggested-by: Wolfgang Bumiller <w.bumiller@proxmox.com> | |
69 | Signed-off-by: Fiona Ebner <f.ebner@proxmox.com> | |
8dd76cc5 | 70 | Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com> |
bf251437 FE |
71 | --- |
72 | block/alloc-track.c | 54 ++++++++++++++------------------------------- | |
73 | 1 file changed, 16 insertions(+), 38 deletions(-) | |
74 | ||
75 | diff --git a/block/alloc-track.c b/block/alloc-track.c | |
76 | index b75d7c6460..76da140a68 100644 | |
77 | --- a/block/alloc-track.c | |
78 | +++ b/block/alloc-track.c | |
79 | @@ -25,7 +25,6 @@ | |
80 | ||
81 | typedef enum DropState { | |
82 | DropNone, | |
83 | - DropRequested, | |
84 | DropInProgress, | |
85 | } DropState; | |
86 | ||
87 | @@ -268,37 +267,6 @@ static void track_child_perm(BlockDriverState *bs, BdrvChild *c, | |
88 | } | |
89 | } | |
90 | ||
91 | -static void track_drop(void *opaque) | |
92 | -{ | |
93 | - BlockDriverState *bs = (BlockDriverState*)opaque; | |
94 | - BlockDriverState *file = bs->file->bs; | |
95 | - BDRVAllocTrackState *s = bs->opaque; | |
96 | - | |
97 | - assert(file); | |
98 | - | |
99 | - /* we rely on the fact that we're not used anywhere else, so let's wait | |
100 | - * until we're only used once - in the drive connected to the guest (and one | |
101 | - * ref is held by bdrv_ref in track_change_backing_file) */ | |
102 | - if (bs->refcnt > 2) { | |
103 | - aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque); | |
104 | - return; | |
105 | - } | |
106 | - AioContext *aio_context = bdrv_get_aio_context(bs); | |
107 | - aio_context_acquire(aio_context); | |
108 | - | |
109 | - bdrv_drained_begin(bs); | |
110 | - | |
111 | - /* now that we're drained, we can safely set 'DropInProgress' */ | |
112 | - s->drop_state = DropInProgress; | |
113 | - bdrv_child_refresh_perms(bs, bs->file, &error_abort); | |
114 | - | |
115 | - bdrv_replace_node(bs, file, &error_abort); | |
116 | - bdrv_set_backing_hd(bs, NULL, &error_abort); | |
117 | - bdrv_drained_end(bs); | |
118 | - bdrv_unref(bs); | |
119 | - aio_context_release(aio_context); | |
120 | -} | |
121 | - | |
122 | static int track_change_backing_file(BlockDriverState *bs, | |
123 | const char *backing_file, | |
124 | const char *backing_fmt) | |
125 | @@ -308,13 +276,23 @@ static int track_change_backing_file(BlockDriverState *bs, | |
126 | backing_file == NULL && backing_fmt == NULL) | |
127 | { | |
128 | /* backing file has been disconnected, there's no longer any use for | |
129 | - * this node, so let's remove ourselves from the block graph - we need | |
130 | - * to schedule this for later however, since when this function is | |
131 | - * called, the blockjob modifying us is probably not done yet and has a | |
132 | - * blocker on 'bs' */ | |
133 | - s->drop_state = DropRequested; | |
134 | + * this node, so let's remove ourselves from the block graph */ | |
135 | + BlockDriverState *file = bs->file->bs; | |
136 | + | |
137 | + /* Just to be sure, because bdrv_replace_node unrefs it */ | |
138 | bdrv_ref(bs); | |
139 | - aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs); | |
140 | + bdrv_drained_begin(bs); | |
141 | + | |
142 | + /* now that we're drained, we can safely set 'DropInProgress' */ | |
143 | + s->drop_state = DropInProgress; | |
144 | + | |
145 | + bdrv_child_refresh_perms(bs, bs->file, &error_abort); | |
146 | + | |
147 | + bdrv_replace_node(bs, file, &error_abort); | |
148 | + bdrv_set_backing_hd(bs, NULL, &error_abort); | |
149 | + | |
150 | + bdrv_drained_end(bs); | |
151 | + bdrv_unref(bs); | |
152 | } | |
153 | ||
154 | return 0; |