]> git.proxmox.com Git - mirror_lxcfs.git/blame - cgroups/cgfsng.c
bindings: add infrastructure for cgroup2 support
[mirror_lxcfs.git] / cgroups / cgfsng.c
CommitLineData
5fbea8a6
CB
1/* SPDX-License-Identifier: LGPL-2.1+ */
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
22#include <linux/kdev_t.h>
23#include <linux/types.h>
24#include <poll.h>
25#include <signal.h>
26#include <stdint.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/mount.h>
31#include <sys/types.h>
32#include <unistd.h>
33
34#include "cgroup.h"
35#include "cgroup2_devices.h"
36#include "cgroup_utils.h"
37#include "macro.h"
38#include "memory_utils.h"
39
/* Release a NULL-terminated array of heap-allocated strings, including the
 * array itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (clist) {
		/* Free every string up to (not including) the terminator. */
		for (cur = clist; *cur; cur++)
			free(*cur);

		free(clist);
	}
}
52
/* Grow a NULL-terminated pointer array by one slot.
 *
 * @list points at the array pointer; *list may be NULL, which is treated as an
 * empty list. The array is resized through must_realloc() so that it can hold
 * the existing entries, one new slot and the NULL terminator, and the
 * terminator is written right after the new slot.
 *
 * Returns the index of the new slot. The slot itself is not initialized here;
 * callers are expected to store a value into it immediately.
 *
 * NOTE(review): this relies on must_realloc() never returning NULL ("Do not
 * fail" in the original contract) - confirm against its definition.
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	/* Count existing entries; the terminator currently sits at that index. */
	if (*list)
		while ((*list)[newentry])
			newentry++;

	/* The elements are "void *", so size the array by the element type
	 * rather than by the type of the array pointer.
	 */
	*list = must_realloc(*list, (newentry + 2) * sizeof(**list));
	(*list)[newentry + 1] = NULL;

	return newentry;
}
70
/* Report whether @entry compares equal (strcmp) to any string of the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (!list)
		return false;

	for (cur = list; *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
87
/* Build a newly allocated string consisting of "name=" followed by @entry,
 * e.g. "systemd" becomes "name=systemd". The buffer comes from must_realloc()
 * and is owned by the caller.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	const size_t prefix_len = STRLITERALLEN("name=");
	size_t entry_len = strlen(entry);
	char *result;

	/* Prefix + entry + terminating NUL. */
	result = must_realloc(NULL, prefix_len + entry_len + 1);

	memcpy(result, "name=", prefix_len);
	memcpy(result + prefix_len, entry, entry_len);
	result[prefix_len + entry_len] = '\0';

	return result;
}
105
/* Append a copy of controller @entry to the NULL-terminated list *@clist
 * (which must be NULL on the first call).
 *
 * @klist is the list of kernel subsystems and @nlist the list of named
 * subsystems:
 *  - an entry found in both lists is ambiguous and is silently skipped;
 *  - an entry that already starts with "name=" or is a kernel subsystem is
 *    copied verbatim;
 *  - anything else is stored with a "name=" prefix.
 *
 * The list stays NULL-terminated.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	bool in_kernel = string_in_list(klist, entry);
	int slot;

	/* Both a kernel and a named subsystem: we can't tell which one it is. */
	if (in_kernel && string_in_list(nlist, entry))
		return;

	slot = append_null_to_list((void ***)clist);

	if (in_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}
137
138/* Given a handler's cgroup data, return the struct hierarchy for the controller
139 * @c, or NULL if there is none.
140 */
141static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops,
142 const char *controller)
143{
144 int i;
145
146 errno = ENOENT;
147
148 if (!ops->hierarchies)
149 return NULL;
150
151 for (i = 0; ops->hierarchies[i]; i++) {
152 if (!controller) {
153 /* This is the empty unified hierarchy. */
154 if (ops->hierarchies[i]->controllers &&
155 !ops->hierarchies[i]->controllers[0])
156 return ops->hierarchies[i];
157 continue;
158 } else if (pure_unified_layout(ops) &&
159 strcmp(controller, "devices") == 0) {
160 if (ops->unified->bpf_device_controller)
161 return ops->unified;
162 break;
163 }
164
165 if (string_in_list(ops->hierarchies[i]->controllers, controller))
166 return ops->hierarchies[i];
167 }
168
169 return NULL;
170}
171
/* Thin alias for cgfsng_get_hierarchy(); this is the function installed as the
 * get_hierarchy callback of the cgfsng driver.
 */
static inline struct hierarchy *get_hierarchy(struct cgroup_ops *ops,
					      const char *controller)
{
	struct hierarchy *found = cgfsng_get_hierarchy(ops, controller);

	return found;
}
177
/* Return true if at least one string appears in both NULL-terminated lists
 * @l1 and @l2. If either list is NULL there is no intersection.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (l1 && l2) {
		for (cur = l1; *cur; cur++)
			if (string_in_list(l2, *cur))
				return true;
	}

	return false;
}
195
196/* For a null-terminated list of controllers @clist, return true if any of those
197 * controllers is already listed the null-terminated list of hierarchies @hlist.
198 * Realistically, if one is present, all must be present.
199 */
200static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
201{
202 int i;
203
204 if (!hlist)
205 return false;
206
207 for (i = 0; hlist[i]; i++)
208 if (controller_lists_intersect(hlist[i]->controllers, clist))
209 return true;
210
211 return false;
212}
213
214/* Get the controllers from a mountinfo line There are other ways we could get
215 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
216 * could parse the mount options. But we simply assume that the mountpoint must
217 * be /sys/fs/cgroup/controller-list
218 */
219static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
220 int type, char **controllers)
221{
222 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
223 * for legacy hierarchies.
224 */
225 int i;
226 char *p2, *tok;
227 char *p = line, *sep = ",";
228 char **aret = NULL;
229
230 for (i = 0; i < 4; i++) {
231 p = strchr(p, ' ');
232 if (!p)
233 return NULL;
234 p++;
235 }
236
237 /* Note, if we change how mountinfo works, then our caller will need to
238 * verify /sys/fs/cgroup/ in this field.
239 */
240 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
241 return NULL;
242
243 p += 15;
244 p2 = strchr(p, ' ');
245 if (!p2)
246 return NULL;
247 *p2 = '\0';
248
249 if (type == CGROUP_SUPER_MAGIC) {
250 __do_free char *dup = NULL;
251
252 /* strdup() here for v1 hierarchies. Otherwise
253 * lxc_iterate_parts() will destroy mountpoints such as
254 * "/sys/fs/cgroup/cpu,cpuacct".
255 */
256 dup = must_copy_string(p);
257 if (!dup)
258 return NULL;
259
260 lxc_iterate_parts (tok, dup, sep)
261 must_append_controller(klist, nlist, &aret, tok);
262 *controllers = move_ptr(dup);
263 }
264 *p2 = ' ';
265
266 return aret;
267}
268
/* Create a heap-allocated controller list that contains no controllers: the
 * first element is already the NULL terminator. The non-NULL array pointer is
 * what distinguishes "no controllers" from "no list".
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot = append_null_to_list((void ***)&empty);

	empty[slot] = NULL;

	return empty;
}
278
279static char **cg_unified_get_controllers(const char *file)
280{
281 __do_free char *buf = NULL;
282 char *sep = " \t\n";
283 char **aret = NULL;
284 char *tok;
285
286 buf = read_file(file);
287 if (!buf)
288 return NULL;
289
290 lxc_iterate_parts(tok, buf, sep) {
291 int newentry;
292 char *copy;
293
294 newentry = append_null_to_list((void ***)&aret);
295 copy = must_copy_string(tok);
296 aret[newentry] = copy;
297 }
298
299 return aret;
300}
301
302static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
303 char *container_base_path, int type)
304{
305 struct hierarchy *new;
306 int newentry;
307
308 new = zalloc(sizeof(*new));
309 new->controllers = clist;
310 new->mountpoint = mountpoint;
311 new->container_base_path = container_base_path;
312 new->version = type;
313
314 newentry = append_null_to_list((void ***)h);
315 (*h)[newentry] = new;
316 return new;
317}
318
319/* Get a copy of the mountpoint from @line, which is a line from
320 * /proc/self/mountinfo.
321 */
322static char *cg_hybrid_get_mountpoint(char *line)
323{
324 int i;
325 size_t len;
326 char *p2;
327 char *p = line, *sret = NULL;
328
329 for (i = 0; i < 4; i++) {
330 p = strchr(p, ' ');
331 if (!p)
332 return NULL;
333 p++;
334 }
335
336 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
337 return NULL;
338
339 p2 = strchr(p + 15, ' ');
340 if (!p2)
341 return NULL;
342 *p2 = '\0';
343
344 len = strlen(p);
345 sret = must_realloc(NULL, len + 1);
346 memcpy(sret, p, len);
347 sret[len] = '\0';
348 return sret;
349}
350
/* Append a heap-allocated copy of @entry to the NULL-terminated string list
 * *@list (which may be NULL for an empty list).
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
360
361static int get_existing_subsystems(char ***klist, char ***nlist)
362{
363 __do_free char *line = NULL;
364 __do_fclose FILE *f = NULL;
365 size_t len = 0;
366
367 f = fopen("/proc/self/cgroup", "r");
368 if (!f)
369 return -1;
370
371 while (getline(&line, &len, f) != -1) {
372 char *p, *p2, *tok;
373 p = strchr(line, ':');
374 if (!p)
375 continue;
376 p++;
377 p2 = strchr(p, ':');
378 if (!p2)
379 continue;
380 *p2 = '\0';
381
382 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
383 * contains an entry of the form:
384 *
385 * 0::/some/path
386 *
387 * In this case we use "cgroup2" as controller name.
388 */
389 if ((p2 - p) == 0) {
390 must_append_string(klist, "cgroup2");
391 continue;
392 }
393
394 lxc_iterate_parts(tok, p, ",") {
395 if (strncmp(tok, "name=", 5) == 0)
396 must_append_string(nlist, tok);
397 else
398 must_append_string(klist, tok);
399 }
400 }
401
402 return 0;
403}
404
/* Strip trailing newline characters from @s in place. The first character is
 * never removed, so a string consisting of a single "\n" is left as is.
 */
static void trim(char *s)
{
	size_t len = strlen(s);

	for (; len > 1 && s[len - 1] == '\n'; len--)
		s[len - 1] = '\0';
}
413
414/* __cg_mount_direct
415 *
416 * Mount cgroup hierarchies directly without using bind-mounts. The main
417 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
418 * cgroups for the LXC_AUTO_CGROUP_FULL option.
419 */
420static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath)
421{
422 __do_free char *controllers = NULL;
423 char *fstype = "cgroup2";
424 unsigned long flags = 0;
425 int ret;
426
427 flags |= MS_NOSUID;
428 flags |= MS_NOEXEC;
429 flags |= MS_NODEV;
430 flags |= MS_RELATIME;
431
432 if (h->version != CGROUP2_SUPER_MAGIC) {
433 controllers = lxc_string_join(",", (const char **)h->controllers, false);
434 if (!controllers)
435 return -ENOMEM;
436 fstype = "cgroup";
437 }
438
439 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
440 if (ret < 0)
441 return -1;
442
443 return 0;
444}
445
/* Mount hierarchy @h at @controllerpath; simply forwards to
 * __cg_mount_direct() and returns its result.
 */
static inline int cg_mount_cgroup_full(struct hierarchy *h,
				       const char *controllerpath)
{
	int ret = __cg_mount_direct(h, controllerpath);

	return ret;
}
451
452static bool cgfsng_mount(struct cgroup_ops *ops, const char *root)
453{
454 __do_free char *cgroup_root = NULL;
455 int ret;
456 bool retval = false;
457
458 if (!ops)
459 return ret_set_errno(false, ENOENT);
460
461 if (!ops->hierarchies)
462 return true;
463
464 cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
465 if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
466 return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;
467
468 /* mount tmpfs */
469 ret = safe_mount(NULL, cgroup_root, "tmpfs",
470 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
471 "size=10240k,mode=755", root);
472 if (ret < 0)
473 goto on_error;
474
475 for (int i = 0; ops->hierarchies[i]; i++) {
476 __do_free char *controllerpath = NULL;
477 struct hierarchy *h = ops->hierarchies[i];
478 char *controller = strrchr(h->mountpoint, '/');
479
480 if (!controller)
481 continue;
482 controller++;
483
484 controllerpath = must_make_path(cgroup_root, controller, NULL);
485 if (dir_exists(controllerpath))
486 continue;
487
488 ret = mkdir(controllerpath, 0755);
489 if (ret < 0)
490 log_error_errno(goto on_error, errno,
491 "Error creating cgroup path: %s",
492 controllerpath);
493
494 ret = cg_mount_cgroup_full( h, controllerpath);
495 if (ret < 0)
496 goto on_error;
497 }
498 retval = true;
499
500on_error:
501 return retval;
502}
503
504static int recursive_count_nrtasks(char *dirname)
505{
506 __do_free char *path = NULL;
507 __do_closedir DIR *dir = NULL;
508 struct dirent *direntp;
509 int count = 0, ret;
510
511 dir = opendir(dirname);
512 if (!dir)
513 return 0;
514
515 while ((direntp = readdir(dir))) {
516 struct stat mystat;
517
518 if (!strcmp(direntp->d_name, ".") ||
519 !strcmp(direntp->d_name, ".."))
520 continue;
521
522 path = must_make_path(dirname, direntp->d_name, NULL);
523
524 if (lstat(path, &mystat))
525 continue;
526
527 if (!S_ISDIR(mystat.st_mode))
528 continue;
529
530 count += recursive_count_nrtasks(path);
531 }
532
533 path = must_make_path(dirname, "cgroup.procs", NULL);
534 ret = lxc_count_file_lines(path);
535 if (ret != -1)
536 count += ret;
537
538 return count;
539}
540
541static int cgfsng_nrtasks(struct cgroup_ops *ops)
542{
543 __do_free char *path = NULL;
544
545 if (!ops)
546 return ret_set_errno(-1, ENOENT);
547
548 if (!ops->container_cgroup || !ops->hierarchies)
549 return ret_set_errno(-1, EINVAL);
550
551 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
552 return recursive_count_nrtasks(path);
553}
554
555static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
556{
557 int i = 0;
558
559 if (!ops)
560 return ret_set_errno(-1, ENOENT);
561
562 if (!ops->hierarchies)
563 return 0;
564
565 for (; ops->hierarchies[i]; i++)
566 ;
567
568 return i;
569}
570
571static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
572{
573 int i;
574
575 if (!ops)
576 return ret_set_errno(false, ENOENT);
577
578 if (!ops->hierarchies)
579 return false;
580
581 /* sanity check n */
582 for (i = 0; i < n; i++)
583 if (!ops->hierarchies[i])
584 return ret_set_errno(false, ENOENT);
585
586 *out = ops->hierarchies[i]->controllers;
587
588 return true;
589}
590
591/* At startup, parse_hierarchies finds all the info we need about cgroup
592 * mountpoints and current cgroups, and stores it in @d.
593 */
594static int cg_hybrid_init(struct cgroup_ops *ops)
595{
596 __do_free char *basecginfo = NULL;
597 __do_free char *line = NULL;
598 __do_fclose FILE *f = NULL;
599 int ret;
600 size_t len = 0;
601 char **klist = NULL, **nlist = NULL;
602
603 /* Root spawned containers escape the current cgroup, so use init's
604 * cgroups as our base in that case.
605 */
606 basecginfo = read_file("/proc/1/cgroup");
607 if (!basecginfo)
608 return ret_set_errno(-1, ENOMEM);
609
610 ret = get_existing_subsystems(&klist, &nlist);
611 if (ret < 0)
612 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
613
614 f = fopen("/proc/self/mountinfo", "r");
615 if (!f)
616 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
617
618 while (getline(&line, &len, f) != -1) {
619 int type;
620 struct hierarchy *new;
621 char *base_cgroup = NULL, *mountpoint = NULL;
622 char **controller_list = NULL;
623 __do_free char *controllers = NULL;
624
625 type = get_cgroup_version(line);
626 if (type == 0)
627 continue;
628
629 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
630 continue;
631
632 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
633 if (type == CGROUP2_SUPER_MAGIC)
634 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
635 else if (type == CGROUP_SUPER_MAGIC)
636 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
637 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
638 if (type == CGROUP_SUPER_MAGIC)
639 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
640 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
641 if (type == CGROUP2_SUPER_MAGIC)
642 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
643 }
644
645 controller_list = cg_hybrid_get_controllers(klist, nlist, line,
646 type, &controllers);
647 if (!controller_list && type == CGROUP_SUPER_MAGIC)
648 continue;
649
650 if (type == CGROUP_SUPER_MAGIC)
651 if (controller_list_is_dup(ops->hierarchies, controller_list))
652 ret_set_errno(goto next, EEXIST);
653
654 mountpoint = cg_hybrid_get_mountpoint(line);
655 if (!mountpoint)
656 log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
657
658 if (type == CGROUP_SUPER_MAGIC) {
659 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
660 } else {
661 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
662 }
663 if (!base_cgroup)
664 log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint);
665
666 trim(base_cgroup);
667 prune_init_scope(base_cgroup);
668
669 if (type == CGROUP2_SUPER_MAGIC) {
670 char *cgv2_ctrl_path;
671
672 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
673 "cgroup.controllers",
674 NULL);
675
676 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
677 free(cgv2_ctrl_path);
678 if (!controller_list)
679 controller_list = cg_unified_make_empty_controller();
680 }
681
682 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
683 new->__controllers = move_ptr(controllers);
684 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
685 ops->unified = new;
686
687 continue;
688
689 next:
690 free_string_list(controller_list);
691 free(mountpoint);
692 free(base_cgroup);
693 }
694
695 free_string_list(klist);
696 free_string_list(nlist);
697
698 return 0;
699}
700
701static int cg_unified_init(struct cgroup_ops *ops)
702{
703 __do_free char *subtree_path = NULL;
704 int ret;
705 char *mountpoint;
706 char **delegatable;
707 struct hierarchy *new;
708 char *base_cgroup = NULL;
709
710 ret = unified_cgroup_hierarchy();
711 if (ret == -ENOMEDIUM)
712 return ret_errno(ENOMEDIUM);
713
714 if (ret != CGROUP2_SUPER_MAGIC)
715 return 0;
716
717 base_cgroup = cg_unified_get_current_cgroup(1);
718 if (!base_cgroup)
719 return ret_errno(EINVAL);
720 prune_init_scope(base_cgroup);
721
722 /*
723 * We assume that the cgroup we're currently in has been delegated to
724 * us and we are free to further delege all of the controllers listed
725 * in cgroup.controllers further down the hierarchy.
726 */
727 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
728 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
729 delegatable = cg_unified_get_controllers(subtree_path);
730 if (!delegatable)
731 delegatable = cg_unified_make_empty_controller();
732
733 /* TODO: If the user requested specific controllers via lxc.cgroup.use
734 * we should verify here. The reason I'm not doing it right is that I'm
735 * not convinced that lxc.cgroup.use will be the future since it is a
736 * global property. I much rather have an option that lets you request
737 * controllers per container.
738 */
739
740 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
741
742 if (bpf_devices_cgroup_supported())
743 new->bpf_device_controller = 1;
744
745 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
746 ops->unified = new;
747 return CGROUP2_SUPER_MAGIC;
748}
749
750static int cg_init(struct cgroup_ops *ops)
751{
752 int ret;
753
754 ret = cg_unified_init(ops);
755 if (ret < 0)
756 return -1;
757
758 if (ret == CGROUP2_SUPER_MAGIC)
759 return 0;
760
761 return cg_hybrid_init(ops);
762}
763
764struct cgroup_ops *cgfsng_ops_init(void)
765{
766 __do_free struct cgroup_ops *cgfsng_ops = NULL;
767
768 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
769 if (!cgfsng_ops)
770 return ret_set_errno(NULL, ENOMEM);
771
772 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
773 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
774
775 if (cg_init(cgfsng_ops))
776 return NULL;
777
778 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
779 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
780 cgfsng_ops->get_hierarchy = get_hierarchy;
781 cgfsng_ops->driver = "cgfsng";
782 cgfsng_ops->version = "1.0.0";
783 cgfsng_ops->mount = cgfsng_mount;
784 cgfsng_ops->nrtasks = cgfsng_nrtasks;
785
786 return move_ptr(cgfsng_ops);
787}