]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/mainloop.c
mainloop: add io_uring support
[mirror_lxc.git] / src / lxc / mainloop.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <sys/poll.h>
12 #include <sys/epoll.h>
13 #include <unistd.h>
14
15 #include "config.h"
16 #include "log.h"
17 #include "macro.h"
18 #include "mainloop.h"
19
20 lxc_log_define(mainloop, lxc);
21
/*
 * Cancellation state bits stored in mainloop_handler::flags by the io_uring
 * backend.
 */
/* Set while a poll-remove request for the handler is being submitted. */
#define CANCEL_RAISED (1 << 0)
/* Set once a completion shows the original poll request is gone. */
#define CANCEL_RECEIVED (1 << 1)
/* Set once a completion shows the removal itself is done. */
#define CANCEL_SUCCESS (1 << 2)
25
26 struct mainloop_handler {
27 struct lxc_list *list;
28 int fd;
29 void *data;
30 lxc_mainloop_callback_t callback;
31 lxc_mainloop_cleanup_t cleanup;
32 const char *handler_name;
33 unsigned int flags;
34 };
35
/* Upper bound on the number of events fetched by one epoll_wait() call. */
#define MAX_EVENTS 10
37
/*
 * Forward declaration: delete_handler() calls this before either of the
 * (liburing / non-liburing) definitions further down is visible.
 */
static int __io_uring_disarm(struct lxc_async_descr *descr,
			     struct mainloop_handler *handler);
40
41 static void delete_handler(struct lxc_async_descr *descr,
42 struct mainloop_handler *handler, bool oneshot)
43 {
44 int ret = 0;
45 struct lxc_list *list;
46
47 if (descr->type == LXC_MAINLOOP_IO_URING) {
48 /*
49 * For a oneshot handler we don't have to do anything. If we
50 * end up here we know that an event for this handler has been
51 * generated before and since this is a oneshot handler it
52 * means that it has been deactivated. So the only thing we
53 * need to do is to call the registered cleanup handler and
54 * remove the handlerfrom the list.
55 */
56 if (!oneshot)
57 ret = __io_uring_disarm(descr, handler);
58 } else {
59 ret = epoll_ctl(descr->epfd, EPOLL_CTL_DEL, handler->fd, NULL);
60 }
61 if (ret < 0)
62 SYSWARN("Failed to delete \"%d\" for \"%s\"", handler->fd, handler->handler_name);
63
64 if (handler->cleanup) {
65 ret = handler->cleanup(handler->fd, handler->data);
66 if (ret < 0)
67 SYSWARN("Failed to call cleanup \"%s\" handler", handler->handler_name);
68 }
69
70 list = move_ptr(handler->list);
71 lxc_list_del(list);
72 free(list->elem);
73 free(list);
74 }
75
76 #ifndef HAVE_LIBURING
/*
 * Placeholder for builds without liburing: there is no io_uring loop to run,
 * so the call fails with ENOSYS via ret_errno().
 */
static inline int __lxc_mainloop_io_uring(struct lxc_async_descr *descr,
					  int timeout_ms)
{
	(void)descr;
	(void)timeout_ms;

	return ret_errno(ENOSYS);
}
82
/*
 * Placeholder for builds without liburing: handlers cannot be armed on an
 * io_uring instance, so the call fails with ENOSYS via ret_errno().
 */
static int __io_uring_arm(struct lxc_async_descr *descr,
			  struct mainloop_handler *handler, bool oneshot)
{
	(void)descr;
	(void)handler;
	(void)oneshot;

	return ret_errno(ENOSYS);
}
88
/*
 * Placeholder for builds without liburing: there is nothing to disarm, so the
 * call fails with ENOSYS via ret_errno().
 */
static int __io_uring_disarm(struct lxc_async_descr *descr,
			     struct mainloop_handler *handler)
{
	(void)descr;
	(void)handler;

	return ret_errno(ENOSYS);
}
94
/*
 * Placeholder for builds without liburing: always fails with ENOSYS via
 * ret_errno(). lxc_mainloop_open() checks for -ENOSYS to fall back to epoll.
 */
static inline int __io_uring_open(struct lxc_async_descr *descr)
{
	(void)descr;

	return ret_errno(ENOSYS);
}
99
100 #else
101
102 static inline int __io_uring_open(struct lxc_async_descr *descr)
103 {
104 int ret;
105 *descr = (struct lxc_async_descr){
106 .epfd = -EBADF,
107 };
108
109 descr->ring = mmap(NULL, sizeof(struct io_uring), PROT_READ | PROT_WRITE,
110 MAP_SHARED | MAP_POPULATE | MAP_ANONYMOUS, -1, 0);
111 if (descr->ring == MAP_FAILED)
112 return syserror("Failed to mmap io_uring memory");
113
114 ret = io_uring_queue_init(512, descr->ring, IORING_SETUP_SQPOLL);
115 if (ret) {
116 SYSERROR("Failed to initialize io_uring instance");
117 goto on_error;
118 }
119
120 ret = io_uring_ring_dontfork(descr->ring);
121 if (ret) {
122 SYSERROR("Failed to prevent inheritance of io_uring mmaped region");
123 goto on_error;
124 }
125
126 descr->type = LXC_MAINLOOP_IO_URING;
127 TRACE("Created io-uring instance");
128 return 0;
129
130 on_error:
131 ret = munmap(descr->ring, sizeof(struct io_uring));
132 if (ret < 0)
133 SYSWARN("Failed to unmap io_uring mmaped memory");
134
135 return ret_errno(ENOSYS);
136 }
137
138 static int __io_uring_arm(struct lxc_async_descr *descr,
139 struct mainloop_handler *handler, bool oneshot)
140 {
141 int ret;
142 struct io_uring_sqe *sqe;
143
144 sqe = io_uring_get_sqe(descr->ring);
145 if (!sqe)
146 return syserror_set(ENOENT, "Failed to get submission queue entry");
147
148 io_uring_prep_poll_add(sqe, handler->fd, EPOLLIN);
149
150 /*
151 * Raise IORING_POLL_ADD_MULTI to set up a multishot poll. The same sqe
152 * will now produce multiple cqes. A cqe produced from a multishot sqe
153 * will raise IORING_CQE_F_MORE in cqe->flags.
154 * Some devices can't be used with IORING_POLL_ADD_MULTI. This can only
155 * be detected at completion time. The IORING_CQE_F_MORE flag will not
156 * raised in cqe->flags. This includes terminal devices. So
157 * unfortunately we can't use multishot for them although we really
158 * would like to. But instead we will need to resubmit them. The
159 * io_uring based mainloop will deal cases whwere multishot doesn't
160 * work and resubmit the request. The handler just needs to inform the
161 * mainloop that it wants to keep the handler.
162 */
163 if (!oneshot)
164 sqe->len |= IORING_POLL_ADD_MULTI;
165
166 io_uring_sqe_set_data(sqe, handler);
167 ret = io_uring_submit(descr->ring);
168 if (ret < 0) {
169 if (!oneshot && ret == -EINVAL) {
170 /* The kernel might not yet support multishot. */
171 sqe->len &= ~IORING_POLL_ADD_MULTI;
172 ret = io_uring_submit(descr->ring);
173 }
174 }
175 if (ret < 0)
176 return syserror_ret(ret, "Failed to add \"%s\" handler", handler->handler_name);
177
178 TRACE("Added \"%s\" handler", handler->handler_name);
179 return 0;
180 }
181
182 static int __io_uring_disarm(struct lxc_async_descr *descr,
183 struct mainloop_handler *handler)
184 {
185 int ret;
186 struct io_uring_sqe *sqe;
187
188 sqe = io_uring_get_sqe(descr->ring);
189 if (!sqe)
190 return syserror_set(ENOENT,
191 "Failed to get submission queue entry");
192
193 io_uring_prep_poll_remove(sqe, handler);
194 handler->flags |= CANCEL_RAISED;
195 io_uring_sqe_set_data(sqe, handler);
196 ret = io_uring_submit(descr->ring);
197 if (ret < 0) {
198 handler->flags &= ~CANCEL_RAISED;
199 return syserror_ret(ret, "Failed to remove \"%s\" handler",
200 handler->handler_name);
201 }
202
203 TRACE("Removed handler \"%s\"", handler->handler_name);
204 return ret;
205 }
206
/*
 * Split a timeout given in milliseconds into the seconds and nanoseconds
 * fields of @ts.
 */
static void msec_to_ts(struct __kernel_timespec *ts, unsigned int timeout_ms)
{
	const unsigned int msec_per_sec = 1000;
	const unsigned int nsec_per_msec = 1000000;

	ts->tv_sec = timeout_ms / msec_per_sec;
	ts->tv_nsec = (timeout_ms % msec_per_sec) * nsec_per_msec;
}
212
213 static int __lxc_mainloop_io_uring(struct lxc_async_descr *descr, int timeout_ms)
214 {
215 struct __kernel_timespec ts;
216
217 if (timeout_ms >= 0)
218 msec_to_ts(&ts, timeout_ms);
219
220 for (;;) {
221 int ret;
222 __s32 mask = 0;
223 bool oneshot = false;
224 struct io_uring_cqe *cqe = NULL;
225 struct mainloop_handler *handler = NULL;
226
227 if (timeout_ms >= 0)
228 ret = io_uring_wait_cqe_timeout(descr->ring, &cqe, &ts);
229 else
230 ret = io_uring_wait_cqe(descr->ring, &cqe);
231 if (ret < 0) {
232 if (ret == -EINTR)
233 continue;
234
235 if (ret == -ETIME)
236 return 0;
237
238 return syserror_ret(ret, "Failed to wait for completion");
239 }
240
241 ret = LXC_MAINLOOP_CONTINUE;
242 oneshot = !(cqe->flags & IORING_CQE_F_MORE);
243 mask = cqe->res;
244 handler = io_uring_cqe_get_data(cqe);
245 io_uring_cqe_seen(descr->ring, cqe);
246
247 switch (mask) {
248 case -ECANCELED:
249 handler->flags |= CANCEL_RECEIVED;
250 TRACE("Canceled \"%s\" handler", handler->handler_name);
251 goto out;
252 case -ENOENT:
253 handler->flags = CANCEL_SUCCESS | CANCEL_RECEIVED;
254 TRACE("No sqe for \"%s\" handler", handler->handler_name);
255 goto out;
256 case -EALREADY:
257 TRACE("Repeat sqe remove request for \"%s\" handler", handler->handler_name);
258 goto out;
259 case 0:
260 handler->flags |= CANCEL_SUCCESS;
261 TRACE("Removed \"%s\" handler", handler->handler_name);
262 goto out;
263 default:
264 /*
265 * We need to always remove the handler for a
266 * successful oneshot request.
267 */
268 if (oneshot)
269 handler->flags = CANCEL_SUCCESS | CANCEL_RECEIVED;
270 }
271
272 ret = handler->callback(handler->fd, mask, handler->data, descr);
273 switch (ret) {
274 case LXC_MAINLOOP_CONTINUE:
275 /* We're operating in oneshot mode so we need to rearm. */
276 if (oneshot && __io_uring_arm(descr, handler, true))
277 return -1;
278 break;
279 case LXC_MAINLOOP_DISARM:
280 if (has_exact_flags(handler->flags, (CANCEL_SUCCESS | CANCEL_RECEIVED)))
281 delete_handler(descr, handler, oneshot);
282 break;
283 case LXC_MAINLOOP_CLOSE:
284 return log_trace(0, "Closing from \"%s\"", handler->handler_name);
285 case LXC_MAINLOOP_ERROR:
286 return syserror_ret(-1, "Closing with error from \"%s\"", handler->handler_name);
287 }
288
289 out:
290 if (lxc_list_empty(&descr->handlers))
291 return error_ret(0, "Closing because there are no more handlers");
292 }
293 }
294 #endif
295
296 static int __lxc_mainloop_epoll(struct lxc_async_descr *descr, int timeout_ms)
297 {
298 int i, nfds, ret;
299 struct mainloop_handler *handler;
300 struct epoll_event events[MAX_EVENTS];
301
302 for (;;) {
303 nfds = epoll_wait(descr->epfd, events, MAX_EVENTS, timeout_ms);
304 if (nfds < 0) {
305 if (errno == EINTR)
306 continue;
307
308 return -errno;
309 }
310
311 for (i = 0; i < nfds; i++) {
312 handler = events[i].data.ptr;
313
314 /* If the handler returns a positive value, exit the
315 * mainloop.
316 */
317 ret = handler->callback(handler->fd, events[i].events,
318 handler->data, descr);
319 switch (ret) {
320 case LXC_MAINLOOP_DISARM:
321 delete_handler(descr, handler, false);
322 __fallthrough;
323 case LXC_MAINLOOP_CONTINUE:
324 break;
325 case LXC_MAINLOOP_CLOSE:
326 return 0;
327 case LXC_MAINLOOP_ERROR:
328 return -1;
329 }
330 }
331
332 if (nfds == 0)
333 return 0;
334
335 if (lxc_list_empty(&descr->handlers))
336 return 0;
337 }
338 }
339
340 int lxc_mainloop(struct lxc_async_descr *descr, int timeout_ms)
341 {
342 if (descr->type == LXC_MAINLOOP_IO_URING)
343 return __lxc_mainloop_io_uring(descr, timeout_ms);
344
345 return __lxc_mainloop_epoll(descr, timeout_ms);
346 }
347
348 static int __lxc_mainloop_add_handler_events(struct lxc_async_descr *descr,
349 int fd, int events,
350 lxc_mainloop_callback_t callback,
351 lxc_mainloop_cleanup_t cleanup,
352 void *data, bool oneshot,
353 const char *handler_name)
354 {
355 __do_free struct mainloop_handler *handler = NULL;
356 __do_free struct lxc_list *list = NULL;
357 int ret;
358 struct epoll_event ev;
359
360 if (fd < 0)
361 return ret_errno(EBADF);
362
363 if (!callback || !cleanup || !events || !handler_name)
364 return ret_errno(EINVAL);
365
366 handler = zalloc(sizeof(*handler));
367 if (!handler)
368 return ret_errno(ENOMEM);
369
370 handler->callback = callback;
371 handler->cleanup = cleanup;
372 handler->fd = fd;
373 handler->data = data;
374 handler->handler_name = handler_name;
375
376 if (descr->type == LXC_MAINLOOP_IO_URING) {
377 ret = __io_uring_arm(descr, handler, oneshot);
378 } else {
379 ev.events = events;
380 ev.data.ptr = handler;
381 ret = epoll_ctl(descr->epfd, EPOLL_CTL_ADD, fd, &ev);
382 }
383 if (ret < 0)
384 return -errno;
385
386 list = lxc_list_new();
387 if (!list)
388 return ret_errno(ENOMEM);
389
390 handler->list = list;
391 lxc_list_add_elem(list, move_ptr(handler));;
392 lxc_list_add_tail(&descr->handlers, move_ptr(list));
393 return 0;
394 }
395
396 int lxc_mainloop_add_handler_events(struct lxc_async_descr *descr, int fd,
397 int events,
398 lxc_mainloop_callback_t callback,
399 lxc_mainloop_cleanup_t cleanup,
400 void *data, const char *handler_name)
401 {
402 return __lxc_mainloop_add_handler_events(descr, fd, events,
403 callback, cleanup,
404 data, false, handler_name);
405 }
406
407 int lxc_mainloop_add_handler(struct lxc_async_descr *descr, int fd,
408 lxc_mainloop_callback_t callback,
409 lxc_mainloop_cleanup_t cleanup,
410 void *data, const char *handler_name)
411 {
412 return __lxc_mainloop_add_handler_events(descr, fd, EPOLLIN,
413 callback, cleanup,
414 data, false, handler_name);
415 }
416
417 int lxc_mainloop_add_oneshot_handler(struct lxc_async_descr *descr, int fd,
418 lxc_mainloop_callback_t callback,
419 lxc_mainloop_cleanup_t cleanup,
420 void *data, const char *handler_name)
421 {
422 return __lxc_mainloop_add_handler_events(descr, fd, EPOLLIN,
423 callback, cleanup,
424 data, true, handler_name);
425 }
426
427 int lxc_mainloop_del_handler(struct lxc_async_descr *descr, int fd)
428 {
429 int ret;
430 struct lxc_list *iterator = NULL;
431
432 lxc_list_for_each(iterator, &descr->handlers) {
433 struct mainloop_handler *handler = iterator->elem;
434
435 if (handler->fd != fd)
436 continue;
437
438 if (descr->type == LXC_MAINLOOP_IO_URING)
439 ret = __io_uring_disarm(descr, handler);
440 else
441 ret = epoll_ctl(descr->epfd, EPOLL_CTL_DEL, fd, NULL);
442 if (ret < 0)
443 return syserror("Failed to disarm \"%s\"", handler->handler_name);
444
445 /*
446 * For io_uring the deletion happens at completion time. Either
447 * we get ENOENT if the request was oneshot and it had already
448 * triggered or we get ECANCELED for the original sqe and 0 for
449 * the cancellation request.
450 */
451 if (descr->type == LXC_MAINLOOP_EPOLL) {
452 lxc_list_del(iterator);
453 free(iterator->elem);
454 free(iterator);
455 }
456
457 return 0;
458 }
459
460 return ret_errno(EINVAL);
461 }
462
463 static inline int __epoll_open(struct lxc_async_descr *descr)
464 {
465 *descr = (struct lxc_async_descr){
466 .epfd = -EBADF,
467 };
468
469 descr->epfd = epoll_create1(EPOLL_CLOEXEC);
470 if (descr->epfd < 0)
471 return syserror("Failed to create epoll instance");
472
473 descr->type = LXC_MAINLOOP_EPOLL;
474 TRACE("Created epoll instance");
475 return 0;
476 }
477
478 int lxc_mainloop_open(struct lxc_async_descr *descr)
479 {
480 int ret;
481
482 ret = __io_uring_open(descr);
483 if (ret == -ENOSYS)
484 ret = __epoll_open(descr);
485 if (ret < 0)
486 return syserror("Failed to create mainloop instance");
487
488 lxc_list_init(&descr->handlers);
489 return 0;
490 }
491
492 void lxc_mainloop_close(struct lxc_async_descr *descr)
493 {
494 struct lxc_list *iterator, *next;
495
496 iterator = descr->handlers.next;
497 while (iterator != &descr->handlers) {
498 next = iterator->next;
499
500 lxc_list_del(iterator);
501 free(iterator->elem);
502 free(iterator);
503 iterator = next;
504 }
505
506 if (descr->type == LXC_MAINLOOP_IO_URING) {
507 #ifdef HAVE_LIBURING
508 io_uring_queue_exit(descr->ring);
509 munmap(descr->ring, sizeof(struct io_uring));
510 #else
511 ERROR("Unsupported io_uring mainloop");
512 #endif
513 } else {
514 close_prot_errno_disarm(descr->epfd);
515 }
516 }