]>
Commit | Line | Data |
---|---|---|
73fd282e SH |
1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* | |
3 | * Linux io_uring file descriptor monitoring | |
4 | * | |
5 | * The Linux io_uring API supports file descriptor monitoring with a few | |
6 | * advantages over existing APIs like poll(2) and epoll(7): | |
7 | * | |
8 | * 1. Userspace polling of events is possible because the completion queue (cq | |
9 | * ring) is shared between the kernel and userspace. This allows | |
10 | * applications that rely on userspace polling to also monitor file | |
11 | * descriptors in the same userspace polling loop. | |
12 | * | |
13 | * 2. Submission and completion is batched and done together in a single system | |
14 | * call. This minimizes the number of system calls. | |
15 | * | |
16 | * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than | |
17 | * poll(2). | |
18 | * | |
19 | * 4. Nanosecond timeouts are supported so it requires fewer syscalls than | |
20 | * epoll(7). | |
21 | * | |
22 | * This code only monitors file descriptors and does not do asynchronous disk | |
23 | * I/O. Implementing disk I/O efficiently has other requirements and should | |
24 | * use a separate io_uring so it does not make sense to unify the code. | |
25 | * | |
26 | * File descriptor monitoring is implemented using the following operations: | |
27 | * | |
28 | * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored. | |
29 | * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When | |
30 | * the poll mask changes for a file descriptor it is first removed and then | |
31 | * re-added with the new poll mask, so this operation is also used as part | |
32 | * of modifying an existing monitored file descriptor. | |
33 | * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait | |
34 | * for events. This operation self-cancels if another event completes | |
35 | * before the timeout. | |
36 | * | |
37 | * io_uring calls the submission queue the "sq ring" and the completion queue | |
38 | * the "cq ring". Ring entries are called "sqe" and "cqe", respectively. | |
39 | * | |
40 | * The code is structured so that sq/cq rings are only modified within | |
41 | * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on | |
42 | * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD | |
43 | * and/or IORING_OP_POLL_REMOVE sqes for them. | |
44 | */ | |
45 | ||
46 | #include "qemu/osdep.h" | |
47 | #include <poll.h> | |
48 | #include "qemu/rcu_queue.h" | |
49 | #include "aio-posix.h" | |
50 | ||
enum {
    FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */

    /* AioHandler::flags */
    FDMON_IO_URING_PENDING = (1 << 0), /* node is queued on ctx->submit_list */
    FDMON_IO_URING_ADD = (1 << 1),     /* submit IORING_OP_POLL_ADD for node */
    FDMON_IO_URING_REMOVE = (1 << 2),  /* submit IORING_OP_POLL_REMOVE; sticky
                                        * until the POLL_ADD cqe arrives (see
                                        * dequeue() and process_cqe()) */
};
59 | ||
60 | static inline int poll_events_from_pfd(int pfd_events) | |
61 | { | |
62 | return (pfd_events & G_IO_IN ? POLLIN : 0) | | |
63 | (pfd_events & G_IO_OUT ? POLLOUT : 0) | | |
64 | (pfd_events & G_IO_HUP ? POLLHUP : 0) | | |
65 | (pfd_events & G_IO_ERR ? POLLERR : 0); | |
66 | } | |
67 | ||
68 | static inline int pfd_events_from_poll(int poll_events) | |
69 | { | |
70 | return (poll_events & POLLIN ? G_IO_IN : 0) | | |
71 | (poll_events & POLLOUT ? G_IO_OUT : 0) | | |
72 | (poll_events & POLLHUP ? G_IO_HUP : 0) | | |
73 | (poll_events & POLLERR ? G_IO_ERR : 0); | |
74 | } | |
75 | ||
76 | /* | |
77 | * Returns an sqe for submitting a request. Only be called within | |
78 | * fdmon_io_uring_wait(). | |
79 | */ | |
80 | static struct io_uring_sqe *get_sqe(AioContext *ctx) | |
81 | { | |
82 | struct io_uring *ring = &ctx->fdmon_io_uring; | |
83 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); | |
84 | int ret; | |
85 | ||
86 | if (likely(sqe)) { | |
87 | return sqe; | |
88 | } | |
89 | ||
90 | /* No free sqes left, submit pending sqes first */ | |
636b836d SH |
91 | do { |
92 | ret = io_uring_submit(ring); | |
93 | } while (ret == -EINTR); | |
94 | ||
73fd282e SH |
95 | assert(ret > 1); |
96 | sqe = io_uring_get_sqe(ring); | |
97 | assert(sqe); | |
98 | return sqe; | |
99 | } | |
100 | ||
/*
 * Atomically enqueue an AioHandler for sq ring submission.
 *
 * Sets FDMON_IO_URING_PENDING plus the caller's @flags on @node and, if the
 * node was not already pending, inserts it onto @head.  fill_sq_ring()
 * consumes the list later.
 */
static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
{
    unsigned old_flags;

    /*
     * Only the caller that transitions PENDING from 0 to 1 inserts the node,
     * so the node appears on the list at most once even with concurrent
     * enqueue() calls.
     */
    old_flags = qatomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
    if (!(old_flags & FDMON_IO_URING_PENDING)) {
        QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
    }
}
111 | ||
/*
 * Dequeue an AioHandler for sq ring submission.  Called by fill_sq_ring().
 *
 * Returns NULL when @head is empty.  Otherwise stores the node's flag bits
 * (as they were before PENDING and ADD were cleared) into *@flags.
 */
static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
{
    AioHandler *node = QSLIST_FIRST(head);

    if (!node) {
        return NULL;
    }

    /* Doesn't need to be atomic since fill_sq_ring() moves the list */
    QSLIST_REMOVE_HEAD(head, node_submitted);

    /*
     * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
     * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
     * telling process_cqe() to delete the AioHandler when its
     * IORING_OP_POLL_ADD completes.
     */
    *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
                                               FDMON_IO_URING_ADD));
    return node;
}
134 | ||
/*
 * FDMonOps->update() implementation: queue sq ring work for the next
 * fdmon_io_uring_wait() call.  @new_node is a handler being armed,
 * @old_node a handler being removed; either may be NULL.
 */
static void fdmon_io_uring_update(AioContext *ctx,
                                  AioHandler *old_node,
                                  AioHandler *new_node)
{
    if (new_node) {
        enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
    }

    if (old_node) {
        /*
         * Deletion is tricky because IORING_OP_POLL_ADD and
         * IORING_OP_POLL_REMOVE are async. We need to wait for the original
         * IORING_OP_POLL_ADD to complete before this handler can be freed
         * safely.
         *
         * It's possible that the file descriptor becomes ready and the
         * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
         * submitted, too.
         *
         * Mark this handler deleted right now but don't place it on
         * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
         * entry to make QLIST_IS_INSERTED() think this handler has been
         * inserted and other code recognizes this AioHandler as deleted.
         *
         * Once the original IORING_OP_POLL_ADD completes we enqueue the
         * handler on the real ctx->deleted_aio_handlers list to be freed.
         */
        assert(!QLIST_IS_INSERTED(old_node, node_deleted));
        old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;

        enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
    }
}
168 | ||
169 | static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) | |
170 | { | |
171 | struct io_uring_sqe *sqe = get_sqe(ctx); | |
172 | int events = poll_events_from_pfd(node->pfd.events); | |
173 | ||
174 | io_uring_prep_poll_add(sqe, node->pfd.fd, events); | |
175 | io_uring_sqe_set_data(sqe, node); | |
176 | } | |
177 | ||
/* Prepare an IORING_OP_POLL_REMOVE sqe canceling @node's poll request */
static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
{
    struct io_uring_sqe *sqe = get_sqe(ctx);

    /*
     * The request to cancel is identified by the user_data value that
     * add_poll_add_sqe() attached, i.e. the node pointer.  Newer liburing
     * takes it as a 64-bit integer, older versions as a pointer.
     */
#ifdef LIBURING_HAVE_DATA64
    io_uring_prep_poll_remove(sqe, (__u64)(uintptr_t)node);
#else
    io_uring_prep_poll_remove(sqe, node);
#endif
}
188 | ||
189 | /* Add a timeout that self-cancels when another cqe becomes ready */ | |
190 | static void add_timeout_sqe(AioContext *ctx, int64_t ns) | |
191 | { | |
192 | struct io_uring_sqe *sqe; | |
193 | struct __kernel_timespec ts = { | |
194 | .tv_sec = ns / NANOSECONDS_PER_SECOND, | |
195 | .tv_nsec = ns % NANOSECONDS_PER_SECOND, | |
196 | }; | |
197 | ||
198 | sqe = get_sqe(ctx); | |
199 | io_uring_prep_timeout(sqe, &ts, 1, 0); | |
200 | } | |
201 | ||
202 | /* Add sqes from ctx->submit_list for submission */ | |
203 | static void fill_sq_ring(AioContext *ctx) | |
204 | { | |
205 | AioHandlerSList submit_list; | |
206 | AioHandler *node; | |
207 | unsigned flags; | |
208 | ||
209 | QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list); | |
210 | ||
211 | while ((node = dequeue(&submit_list, &flags))) { | |
212 | /* Order matters, just in case both flags were set */ | |
213 | if (flags & FDMON_IO_URING_ADD) { | |
214 | add_poll_add_sqe(ctx, node); | |
215 | } | |
216 | if (flags & FDMON_IO_URING_REMOVE) { | |
217 | add_poll_remove_sqe(ctx, node); | |
218 | } | |
219 | } | |
220 | } | |
221 | ||
/*
 * Handle one completion.  Returns true if a handler became ready; returns
 * false for completions that carry no handler work (timeouts, poll removals,
 * and poll completions for handlers that are being deleted).
 */
static bool process_cqe(AioContext *ctx,
                        AioHandlerList *ready_list,
                        struct io_uring_cqe *cqe)
{
    AioHandler *node = io_uring_cqe_get_data(cqe);
    unsigned flags;

    /* poll_timeout and poll_remove have a zero user_data field */
    if (!node) {
        return false;
    }

    /*
     * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
     * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
     * bit before IORING_OP_POLL_REMOVE is submitted.
     */
    flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
    if (flags & FDMON_IO_URING_REMOVE) {
        /* The POLL_ADD is done; the handler can now really be freed */
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        return false;
    }

    /* cqe->res holds the poll(2) revents for a completed POLL_ADD */
    aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));

    /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
    add_poll_add_sqe(ctx, node);
    return true;
}
252 | ||
253 | static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list) | |
254 | { | |
255 | struct io_uring *ring = &ctx->fdmon_io_uring; | |
256 | struct io_uring_cqe *cqe; | |
257 | unsigned num_cqes = 0; | |
258 | unsigned num_ready = 0; | |
259 | unsigned head; | |
260 | ||
261 | io_uring_for_each_cqe(ring, head, cqe) { | |
262 | if (process_cqe(ctx, ready_list, cqe)) { | |
263 | num_ready++; | |
264 | } | |
265 | ||
266 | num_cqes++; | |
267 | } | |
268 | ||
269 | io_uring_cq_advance(ring, num_cqes); | |
270 | return num_ready; | |
271 | } | |
272 | ||
/*
 * FDMonOps->wait() implementation: submit queued sqes, wait for completions,
 * and append ready handlers to @ready_list.
 *
 * @timeout is in nanoseconds: 0 polls without blocking, a positive value
 * blocks at most that long (via a timeout sqe), and a negative value blocks
 * until at least one cqe is ready.  Returns the number of ready handlers.
 */
static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
                               int64_t timeout)
{
    unsigned wait_nr = 1; /* block until at least one cqe is ready */
    int ret;

    if (timeout == 0) {
        wait_nr = 0; /* non-blocking */
    } else if (timeout > 0) {
        add_timeout_sqe(ctx, timeout);
    }

    fill_sq_ring(ctx);

    do {
        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
    } while (ret == -EINTR);

    assert(ret >= 0);

    return process_cq_ring(ctx, ready_list);
}
295 | ||
aa38e19f SH |
296 | static bool fdmon_io_uring_need_wait(AioContext *ctx) |
297 | { | |
ff807d55 SH |
298 | /* Have io_uring events completed? */ |
299 | if (io_uring_cq_ready(&ctx->fdmon_io_uring)) { | |
300 | return true; | |
301 | } | |
302 | ||
ae60ab7e SH |
303 | /* Are there pending sqes to submit? */ |
304 | if (io_uring_sq_ready(&ctx->fdmon_io_uring)) { | |
305 | return true; | |
306 | } | |
307 | ||
308 | /* Do we need to process AioHandlers for io_uring changes? */ | |
ff807d55 SH |
309 | if (!QSLIST_EMPTY_RCU(&ctx->submit_list)) { |
310 | return true; | |
311 | } | |
312 | ||
60f782b6 | 313 | return false; |
aa38e19f SH |
314 | } |
315 | ||
73fd282e SH |
316 | static const FDMonOps fdmon_io_uring_ops = { |
317 | .update = fdmon_io_uring_update, | |
318 | .wait = fdmon_io_uring_wait, | |
aa38e19f | 319 | .need_wait = fdmon_io_uring_need_wait, |
73fd282e SH |
320 | }; |
321 | ||
322 | bool fdmon_io_uring_setup(AioContext *ctx) | |
323 | { | |
324 | int ret; | |
325 | ||
326 | ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0); | |
327 | if (ret != 0) { | |
328 | return false; | |
329 | } | |
330 | ||
331 | QSLIST_INIT(&ctx->submit_list); | |
332 | ctx->fdmon_ops = &fdmon_io_uring_ops; | |
333 | return true; | |
334 | } | |
335 | ||
/*
 * Tear down io_uring monitoring for @ctx and fall back to fdmon_poll.
 * No-op when @ctx is not currently using fdmon_io_uring_ops.
 */
void fdmon_io_uring_destroy(AioContext *ctx)
{
    if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
        AioHandler *node;

        io_uring_queue_exit(&ctx->fdmon_io_uring);

        /* Move handlers due to be removed onto the deleted list */
        while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
            /*
             * The ring is gone, so no POLL_ADD cqe will ever arrive; clear
             * all fdmon flags by hand instead of going through dequeue().
             */
            unsigned flags = qatomic_fetch_and(&node->flags,
                                              ~(FDMON_IO_URING_PENDING |
                                                FDMON_IO_URING_ADD |
                                                FDMON_IO_URING_REMOVE));

            /* Handlers awaiting removal can be freed immediately */
            if (flags & FDMON_IO_URING_REMOVE) {
                QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
            }

            QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
        }

        ctx->fdmon_ops = &fdmon_poll_ops;
    }
}