/*
 * Graph lock: rwlock to protect block layer graph manipulations (add/remove
 * edges and nodes)
 *
 * Copyright (c) 2022 Red Hat
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "block/graph-lock.h"
#include "block/block.h"
#include "block/block_int.h"

/* Dummy lock object to use for Thread Safety Analysis (TSA) */
BdrvGraphLock graph_lock;

/* Protects aio_context_list and orphaned_reader_count */
static QemuMutex aio_context_list_lock;

/* Written and read with atomic operations. */
static int has_writer;
/*
 * A reader coroutine could move from one AioContext to another.
 * If this happens, there is no problem from the point of view of
 * the counters. The problem is that the total count becomes
 * unbalanced if one of the two AioContexts gets deleted.
 * The count of readers must remain correct, so the AioContext's
 * balance is transferred to this global variable.
 * Protected by aio_context_list_lock.
 */
static uint32_t orphaned_reader_count;

/* Queue of readers waiting for the writer to finish */
static CoQueue reader_queue;

struct BdrvGraphRWlock {
    /* How many readers are currently reading the graph. */
    uint32_t reader_count;

    /*
     * List of BdrvGraphRWlock kept in graph-lock.c
     * Protected by aio_context_list_lock
     */
    QTAILQ_ENTRY(BdrvGraphRWlock) next_aio;
};

/*
 * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock
 * can safely modify only its own counter, without reading or writing the
 * others', which improves performance by avoiding cacheline bounces.
 */
static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list =
    QTAILQ_HEAD_INITIALIZER(aio_context_list);

static void __attribute__((__constructor__)) bdrv_init_graph_lock(void)
{
    qemu_mutex_init(&aio_context_list_lock);
    qemu_co_queue_init(&reader_queue);
}

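/* Allocate @ctx's per-AioContext reader counter and add it to the global list */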
void register_aiocontext(AioContext *ctx)
{
    ctx->bdrv_graph = g_new0(BdrvGraphRWlock, 1);
    QEMU_LOCK_GUARD(&aio_context_list_lock);
    assert(ctx->bdrv_graph->reader_count == 0);
    QTAILQ_INSERT_TAIL(&aio_context_list, ctx->bdrv_graph, next_aio);
}

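/* Fold @ctx's reader balance into orphaned_reader_count before freeing its counter */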
void unregister_aiocontext(AioContext *ctx)
{
    QEMU_LOCK_GUARD(&aio_context_list_lock);
    orphaned_reader_count += ctx->bdrv_graph->reader_count;
    QTAILQ_REMOVE(&aio_context_list, ctx->bdrv_graph, next_aio);
    g_free(ctx->bdrv_graph);
}

static uint32_t reader_count(void)
{
    BdrvGraphRWlock *bdrv_graph;
    uint32_t rd;

    QEMU_LOCK_GUARD(&aio_context_list_lock);

    /* rd can temporarily be negative, but the total will *always* be >= 0 */
    rd = orphaned_reader_count;
    QTAILQ_FOREACH(bdrv_graph, &aio_context_list, next_aio) {
        rd += qatomic_read(&bdrv_graph->reader_count);
    }

    /* shouldn't overflow unless there are 2^31 readers */
    assert((int32_t)rd >= 0);
    return rd;
}

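/*
 * Acquire the exclusive writer lock: drain all block devices so that newly
 * arriving I/O cannot starve the writer, then poll until no reader holds the
 * lock.
 */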
void no_coroutine_fn bdrv_graph_wrlock(BlockDriverState *bs)
{
    AioContext *ctx = NULL;

    GLOBAL_STATE_CODE();
    assert(!qatomic_read(&has_writer));
    assert(!qemu_in_coroutine());

    /*
     * Release only non-mainloop AioContext. The mainloop often relies on the
     * BQL and doesn't lock the main AioContext before doing things.
     */
    if (bs) {
        ctx = bdrv_get_aio_context(bs);
        if (ctx != qemu_get_aio_context()) {
            aio_context_release(ctx);
        } else {
            ctx = NULL;
        }
    }

    /* Make sure that constantly arriving new I/O doesn't cause starvation */
    bdrv_drain_all_begin_nopoll();

    /*
     * reader_count == 0: this means a reader will read has_writer as 1
     * reader_count >= 1: we don't know if a reader read has_writer == 0 or 1,
     *                    but we need to wait.
     * Wait by allowing other coroutines (and possibly readers) to continue.
     */
    do {
        /*
         * has_writer must be 0 while polling, otherwise we get a deadlock if
         * any callback involved during AIO_WAIT_WHILE() tries to acquire the
         * reader lock.
         */
        qatomic_set(&has_writer, 0);
        AIO_WAIT_WHILE_UNLOCKED(NULL, reader_count() >= 1);
        qatomic_set(&has_writer, 1);

        /*
         * We want to only check reader_count() after has_writer = 1 is visible
         * to other threads. That way no more readers can sneak in after we've
         * determined reader_count() == 0.
         */
        smp_mb();
    } while (reader_count() >= 1);

    bdrv_drain_all_end();

    if (ctx) {
        aio_context_acquire(bdrv_get_aio_context(bs));
    }
}

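/* Release the writer lock and wake up all reader coroutines waiting in reader_queue */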
void bdrv_graph_wrunlock(void)
{
    GLOBAL_STATE_CODE();
    assert(qatomic_read(&has_writer));

    WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
        /*
         * No need for memory barriers: this pairs with the slow path of
         * rdlock(), and both take the lock.
         */
        qatomic_store_release(&has_writer, 0);

        /* Wake up all coroutines that are waiting to read the graph */
        qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
    }

    /*
     * Run any BHs that were scheduled during the wrlock section and that
     * callers might expect to have finished (in particular, this is important
     * for bdrv_schedule_unref()).
     *
     * Do this only after restarting coroutines so that nested event loops in
     * BHs don't deadlock if their condition relies on the coroutine making
     * progress.
     */
    aio_bh_poll(qemu_get_aio_context());
}

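/*
 * Take the reader lock: increment this AioContext's reader_count and, if a
 * writer is active, sleep on reader_queue until bdrv_graph_wrunlock() wakes
 * the readers up.
 */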
void coroutine_fn bdrv_graph_co_rdlock(void)
{
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

    for (;;) {
        qatomic_set(&bdrv_graph->reader_count,
                    bdrv_graph->reader_count + 1);
        /* make sure writer sees reader_count before we check has_writer */
        smp_mb();

        /*
         * has_writer == 0: this means writer will read reader_count as >= 1
         * has_writer == 1: we don't know if writer read reader_count == 0
         *                  or > 0, but we need to wait anyway because
         *                  it will write.
         */
        if (!qatomic_read(&has_writer)) {
            break;
        }

        /*
         * Synchronize access with reader_count() in bdrv_graph_wrlock().
         * Case 1:
         * If this critical section gets executed first, reader_count will
         * decrease and the reader will go to sleep.
         * Then the writer will read reader_count that does not take into
         * account this reader, and if there's no other reader it will
         * enter the write section.
         * Case 2:
         * If the reader_count() critical section gets executed first,
         * then the writer will read reader_count >= 1.
         * It will wait in AIO_WAIT_WHILE(), but once it releases the lock
         * we will enter this critical section and call aio_wait_kick().
         */
        WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
            /*
             * Additional check when we use the above lock to synchronize
             * with bdrv_graph_wrunlock().
             * Case 1:
             * If this gets executed first, has_writer is still 1, so we reduce
             * reader_count and go to sleep.
             * Then the writer will set has_writer to 0 and wake up all readers,
             * us included.
             * Case 2:
             * If the bdrv_graph_wrunlock() critical section gets executed
             * first, then it will set has_writer to 0 and wake up all other
             * readers.
             * Then we execute this critical section, and therefore must check
             * again for has_writer, otherwise we sleep without any writer
             * actually running.
             */
            if (!qatomic_read(&has_writer)) {
                return;
            }

            /* slow path where reader sleeps */
            bdrv_graph->reader_count--;
            aio_wait_kick();
            qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
        }
    }
}

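/* Drop the reader lock: decrement this AioContext's reader_count and kick a polling writer */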
void coroutine_fn bdrv_graph_co_rdunlock(void)
{
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

    qatomic_store_release(&bdrv_graph->reader_count,
                          bdrv_graph->reader_count - 1);
    /* make sure writer sees reader_count before we check has_writer */
    smp_mb();

    /*
     * has_writer == 0: this means the writer will read the decreased
     *                  reader_count
     * has_writer == 1: we don't know if the writer read the old or the new
     *                  reader_count. Therefore, kick again so that on the
     *                  next iteration the writer is sure to read the
     *                  updated value.
     */
    if (qatomic_read(&has_writer)) {
        aio_wait_kick();
    }
}

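/*
 * No-op implementations for the main loop: graph changes only happen there
 * outside coroutine context (bdrv_graph_wrlock() asserts the same
 * GLOBAL_STATE_CODE() and !qemu_in_coroutine() conditions), so code running
 * under those conditions cannot race with a graph writer and only the
 * assertions are needed.
 */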
void bdrv_graph_rdlock_main_loop(void)
{
    GLOBAL_STATE_CODE();
    assert(!qemu_in_coroutine());
}

void bdrv_graph_rdunlock_main_loop(void)
{
    GLOBAL_STATE_CODE();
    assert(!qemu_in_coroutine());
}

void assert_bdrv_graph_readable(void)
{
    /* reader_count() is slow due to aio_context_list_lock lock contention */
#ifdef CONFIG_DEBUG_GRAPH_LOCK
    assert(qemu_in_main_thread() || reader_count());
#endif
}

void assert_bdrv_graph_writable(void)
{
    assert(qemu_in_main_thread());
    assert(qatomic_read(&has_writer));
}