]>
Commit | Line | Data |
---|---|---|
73fd282e SH |
1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* | |
3 | * Linux io_uring file descriptor monitoring | |
4 | * | |
5 | * The Linux io_uring API supports file descriptor monitoring with a few | |
6 | * advantages over existing APIs like poll(2) and epoll(7): | |
7 | * | |
8 | * 1. Userspace polling of events is possible because the completion queue (cq | |
9 | * ring) is shared between the kernel and userspace. This allows | |
10 | * applications that rely on userspace polling to also monitor file | |
11 | * descriptors in the same userspace polling loop. | |
12 | * | |
13 | * 2. Submission and completion is batched and done together in a single system | |
14 | * call. This minimizes the number of system calls. | |
15 | * | |
16 | * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than | |
17 | * poll(2). | |
18 | * | |
19 | * 4. Nanosecond timeouts are supported so it requires fewer syscalls than | |
20 | * epoll(7). | |
21 | * | |
22 | * This code only monitors file descriptors and does not do asynchronous disk | |
23 | * I/O. Implementing disk I/O efficiently has other requirements and should | |
24 | * use a separate io_uring so it does not make sense to unify the code. | |
25 | * | |
26 | * File descriptor monitoring is implemented using the following operations: | |
27 | * | |
28 | * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored. | |
29 | * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When | |
30 | * the poll mask changes for a file descriptor it is first removed and then | |
31 | * re-added with the new poll mask, so this operation is also used as part | |
32 | * of modifying an existing monitored file descriptor. | |
33 | * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait | |
34 | * for events. This operation self-cancels if another event completes | |
35 | * before the timeout. | |
36 | * | |
37 | * io_uring calls the submission queue the "sq ring" and the completion queue | |
38 | * the "cq ring". Ring entries are called "sqe" and "cqe", respectively. | |
39 | * | |
40 | * The code is structured so that sq/cq rings are only modified within | |
41 | * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on | |
42 | * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD | |
43 | * and/or IORING_OP_POLL_REMOVE sqes for them. | |
44 | */ | |
45 | ||
#include "qemu/osdep.h"
#include <errno.h>
#include <poll.h>
#include "qemu/rcu_queue.h"
#include "aio-posix.h"
50 | ||
/* Ring sizing and the state bits kept in AioHandler::flags */
enum {
    /* Number of entries requested for the sq/cq rings */
    FDMON_IO_URING_ENTRIES = 128,

    /* AioHandler::flags bits */
    FDMON_IO_URING_PENDING = 0x1, /* handler is queued on a submit list */
    FDMON_IO_URING_ADD     = 0x2, /* IORING_OP_POLL_ADD is to be submitted */
    FDMON_IO_URING_REMOVE  = 0x4, /* IORING_OP_POLL_REMOVE is to be submitted */
};
59 | ||
60 | static inline int poll_events_from_pfd(int pfd_events) | |
61 | { | |
62 | return (pfd_events & G_IO_IN ? POLLIN : 0) | | |
63 | (pfd_events & G_IO_OUT ? POLLOUT : 0) | | |
64 | (pfd_events & G_IO_HUP ? POLLHUP : 0) | | |
65 | (pfd_events & G_IO_ERR ? POLLERR : 0); | |
66 | } | |
67 | ||
68 | static inline int pfd_events_from_poll(int poll_events) | |
69 | { | |
70 | return (poll_events & POLLIN ? G_IO_IN : 0) | | |
71 | (poll_events & POLLOUT ? G_IO_OUT : 0) | | |
72 | (poll_events & POLLHUP ? G_IO_HUP : 0) | | |
73 | (poll_events & POLLERR ? G_IO_ERR : 0); | |
74 | } | |
75 | ||
76 | /* | |
77 | * Returns an sqe for submitting a request. Only be called within | |
78 | * fdmon_io_uring_wait(). | |
79 | */ | |
80 | static struct io_uring_sqe *get_sqe(AioContext *ctx) | |
81 | { | |
82 | struct io_uring *ring = &ctx->fdmon_io_uring; | |
83 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); | |
84 | int ret; | |
85 | ||
86 | if (likely(sqe)) { | |
87 | return sqe; | |
88 | } | |
89 | ||
90 | /* No free sqes left, submit pending sqes first */ | |
91 | ret = io_uring_submit(ring); | |
92 | assert(ret > 1); | |
93 | sqe = io_uring_get_sqe(ring); | |
94 | assert(sqe); | |
95 | return sqe; | |
96 | } | |
97 | ||
98 | /* Atomically enqueue an AioHandler for sq ring submission */ | |
99 | static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags) | |
100 | { | |
101 | unsigned old_flags; | |
102 | ||
103 | old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags); | |
104 | if (!(old_flags & FDMON_IO_URING_PENDING)) { | |
105 | QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted); | |
106 | } | |
107 | } | |
108 | ||
109 | /* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */ | |
110 | static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags) | |
111 | { | |
112 | AioHandler *node = QSLIST_FIRST(head); | |
113 | ||
114 | if (!node) { | |
115 | return NULL; | |
116 | } | |
117 | ||
118 | /* Doesn't need to be atomic since fill_sq_ring() moves the list */ | |
119 | QSLIST_REMOVE_HEAD(head, node_submitted); | |
120 | ||
121 | /* | |
122 | * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two | |
123 | * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and | |
124 | * telling process_cqe() to delete the AioHandler when its | |
125 | * IORING_OP_POLL_ADD completes. | |
126 | */ | |
127 | *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING | | |
128 | FDMON_IO_URING_ADD)); | |
129 | return node; | |
130 | } | |
131 | ||
132 | static void fdmon_io_uring_update(AioContext *ctx, | |
133 | AioHandler *old_node, | |
134 | AioHandler *new_node) | |
135 | { | |
136 | if (new_node) { | |
137 | enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD); | |
138 | } | |
139 | ||
140 | if (old_node) { | |
141 | /* | |
142 | * Deletion is tricky because IORING_OP_POLL_ADD and | |
143 | * IORING_OP_POLL_REMOVE are async. We need to wait for the original | |
144 | * IORING_OP_POLL_ADD to complete before this handler can be freed | |
145 | * safely. | |
146 | * | |
147 | * It's possible that the file descriptor becomes ready and the | |
148 | * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is | |
149 | * submitted, too. | |
150 | * | |
151 | * Mark this handler deleted right now but don't place it on | |
152 | * ctx->deleted_aio_handlers yet. Instead, manually fudge the list | |
153 | * entry to make QLIST_IS_INSERTED() think this handler has been | |
154 | * inserted and other code recognizes this AioHandler as deleted. | |
155 | * | |
156 | * Once the original IORING_OP_POLL_ADD completes we enqueue the | |
157 | * handler on the real ctx->deleted_aio_handlers list to be freed. | |
158 | */ | |
159 | assert(!QLIST_IS_INSERTED(old_node, node_deleted)); | |
160 | old_node->node_deleted.le_prev = &old_node->node_deleted.le_next; | |
161 | ||
162 | enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE); | |
163 | } | |
164 | } | |
165 | ||
166 | static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) | |
167 | { | |
168 | struct io_uring_sqe *sqe = get_sqe(ctx); | |
169 | int events = poll_events_from_pfd(node->pfd.events); | |
170 | ||
171 | io_uring_prep_poll_add(sqe, node->pfd.fd, events); | |
172 | io_uring_sqe_set_data(sqe, node); | |
173 | } | |
174 | ||
175 | static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) | |
176 | { | |
177 | struct io_uring_sqe *sqe = get_sqe(ctx); | |
178 | ||
179 | io_uring_prep_poll_remove(sqe, node); | |
180 | } | |
181 | ||
182 | /* Add a timeout that self-cancels when another cqe becomes ready */ | |
183 | static void add_timeout_sqe(AioContext *ctx, int64_t ns) | |
184 | { | |
185 | struct io_uring_sqe *sqe; | |
186 | struct __kernel_timespec ts = { | |
187 | .tv_sec = ns / NANOSECONDS_PER_SECOND, | |
188 | .tv_nsec = ns % NANOSECONDS_PER_SECOND, | |
189 | }; | |
190 | ||
191 | sqe = get_sqe(ctx); | |
192 | io_uring_prep_timeout(sqe, &ts, 1, 0); | |
193 | } | |
194 | ||
195 | /* Add sqes from ctx->submit_list for submission */ | |
196 | static void fill_sq_ring(AioContext *ctx) | |
197 | { | |
198 | AioHandlerSList submit_list; | |
199 | AioHandler *node; | |
200 | unsigned flags; | |
201 | ||
202 | QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list); | |
203 | ||
204 | while ((node = dequeue(&submit_list, &flags))) { | |
205 | /* Order matters, just in case both flags were set */ | |
206 | if (flags & FDMON_IO_URING_ADD) { | |
207 | add_poll_add_sqe(ctx, node); | |
208 | } | |
209 | if (flags & FDMON_IO_URING_REMOVE) { | |
210 | add_poll_remove_sqe(ctx, node); | |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | /* Returns true if a handler became ready */ | |
216 | static bool process_cqe(AioContext *ctx, | |
217 | AioHandlerList *ready_list, | |
218 | struct io_uring_cqe *cqe) | |
219 | { | |
220 | AioHandler *node = io_uring_cqe_get_data(cqe); | |
221 | unsigned flags; | |
222 | ||
223 | /* poll_timeout and poll_remove have a zero user_data field */ | |
224 | if (!node) { | |
225 | return false; | |
226 | } | |
227 | ||
228 | /* | |
229 | * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race | |
230 | * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE | |
231 | * bit before IORING_OP_POLL_REMOVE is submitted. | |
232 | */ | |
233 | flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE); | |
234 | if (flags & FDMON_IO_URING_REMOVE) { | |
235 | QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); | |
236 | return false; | |
237 | } | |
238 | ||
239 | aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); | |
240 | ||
241 | /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */ | |
242 | add_poll_add_sqe(ctx, node); | |
243 | return true; | |
244 | } | |
245 | ||
246 | static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list) | |
247 | { | |
248 | struct io_uring *ring = &ctx->fdmon_io_uring; | |
249 | struct io_uring_cqe *cqe; | |
250 | unsigned num_cqes = 0; | |
251 | unsigned num_ready = 0; | |
252 | unsigned head; | |
253 | ||
254 | io_uring_for_each_cqe(ring, head, cqe) { | |
255 | if (process_cqe(ctx, ready_list, cqe)) { | |
256 | num_ready++; | |
257 | } | |
258 | ||
259 | num_cqes++; | |
260 | } | |
261 | ||
262 | io_uring_cq_advance(ring, num_cqes); | |
263 | return num_ready; | |
264 | } | |
265 | ||
266 | static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | |
267 | int64_t timeout) | |
268 | { | |
269 | unsigned wait_nr = 1; /* block until at least one cqe is ready */ | |
270 | int ret; | |
271 | ||
272 | /* Fall back while external clients are disabled */ | |
273 | if (atomic_read(&ctx->external_disable_cnt)) { | |
274 | return fdmon_poll_ops.wait(ctx, ready_list, timeout); | |
275 | } | |
276 | ||
277 | if (timeout == 0) { | |
278 | wait_nr = 0; /* non-blocking */ | |
279 | } else if (timeout > 0) { | |
280 | add_timeout_sqe(ctx, timeout); | |
281 | } | |
282 | ||
283 | fill_sq_ring(ctx); | |
284 | ||
285 | ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); | |
286 | assert(ret >= 0); | |
287 | ||
288 | return process_cq_ring(ctx, ready_list); | |
289 | } | |
290 | ||
291 | static const FDMonOps fdmon_io_uring_ops = { | |
292 | .update = fdmon_io_uring_update, | |
293 | .wait = fdmon_io_uring_wait, | |
294 | }; | |
295 | ||
296 | bool fdmon_io_uring_setup(AioContext *ctx) | |
297 | { | |
298 | int ret; | |
299 | ||
300 | ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0); | |
301 | if (ret != 0) { | |
302 | return false; | |
303 | } | |
304 | ||
305 | QSLIST_INIT(&ctx->submit_list); | |
306 | ctx->fdmon_ops = &fdmon_io_uring_ops; | |
307 | return true; | |
308 | } | |
309 | ||
310 | void fdmon_io_uring_destroy(AioContext *ctx) | |
311 | { | |
312 | if (ctx->fdmon_ops == &fdmon_io_uring_ops) { | |
313 | AioHandler *node; | |
314 | ||
315 | io_uring_queue_exit(&ctx->fdmon_io_uring); | |
316 | ||
317 | /* No need to submit these anymore, just free them. */ | |
318 | while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) { | |
319 | QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted); | |
320 | QLIST_REMOVE(node, node); | |
321 | g_free(node); | |
322 | } | |
323 | ||
324 | ctx->fdmon_ops = &fdmon_poll_ops; | |
325 | } | |
326 | } |