]> git.proxmox.com Git - mirror_lxcfs.git/blame - cgroups/cgfsng.c
tree-wide: make fopen() calls cloexec
[mirror_lxcfs.git] / cgroups / cgfsng.c
CommitLineData
5fbea8a6
CB
1/* SPDX-License-Identifier: LGPL-2.1+ */
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15#ifndef _GNU_SOURCE
1f5596dd 16#define _GNU_SOURCE
5fbea8a6 17#endif
1f5596dd
CB
18
19#ifndef FUSE_USE_VERSION
20#define FUSE_USE_VERSION 26
21#endif
22
23#define _FILE_OFFSET_BITS 64
24
5fbea8a6
CB
25#include <ctype.h>
26#include <dirent.h>
27#include <errno.h>
28#include <grp.h>
29#include <linux/kdev_t.h>
30#include <linux/types.h>
31#include <poll.h>
32#include <signal.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <sys/mount.h>
38#include <sys/types.h>
39#include <unistd.h>
40
1f5596dd
CB
41#include "../config.h"
42#include "../macro.h"
43#include "../memory_utils.h"
5fbea8a6
CB
44#include "cgroup.h"
45#include "cgroup2_devices.h"
46#include "cgroup_utils.h"
5fbea8a6
CB
47
/* Release every string held in the NULL-terminated array @clist and then the
 * array itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **cursor;

	if (clist == NULL)
		return;

	cursor = clist;
	while (*cursor != NULL) {
		free(*cursor);
		cursor++;
	}

	free(clist);
}
60
/* Grow the NULL-terminated pointer array *@list by one slot.
 *
 * The current number of entries is counted (zero when *@list is NULL), the
 * array is resized through must_realloc() to hold one more entry plus the
 * terminator, and the new terminator is written.
 *
 * Returns the index of the freshly available slot, i.e. the one right before
 * the new terminating NULL. The caller is expected to store a value there.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list != NULL) {
		while ((*list)[count] != NULL)
			count++;
	}

	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
78
/* Report whether @entry compares equal (strcmp) to any string in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cursor;

	if (list == NULL)
		return false;

	for (cursor = list; *cursor != NULL; cursor++) {
		if (!strcmp(*cursor, entry))
			return true;
	}

	return false;
}
95
/* Build a newly allocated string consisting of "name=" followed by @entry,
 * e.g. "systemd" becomes "name=systemd". The buffer comes from must_realloc()
 * and is owned by the caller.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	const size_t prefix_len = STRLITERALLEN("name=");
	const size_t entry_len = strlen(entry);
	char *result;

	/* prefix + entry + terminating NUL */
	result = must_realloc(NULL, prefix_len + entry_len + 1);

	memcpy(result, "name=", prefix_len);
	memcpy(result + prefix_len, entry, entry_len);
	result[prefix_len + entry_len] = '\0';

	return result;
}
113
/* Append a copy of controller @entry to the NULL-terminated list *@clist.
 * *@clist must be NULL on the first call; the list stays NULL-terminated.
 *
 * Named subsystems are handled here as well:
 *   - an entry already starting with "name=" is copied verbatim;
 *   - an entry found in the kernel list @klist is copied verbatim;
 *   - anything else is stored with a "name=" prefix.
 * An entry present in both @klist and @nlist is ambiguous and silently
 * skipped.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry))
		return;

	slot = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}
145
146/* Given a handler's cgroup data, return the struct hierarchy for the controller
147 * @c, or NULL if there is none.
148 */
149static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops,
150 const char *controller)
151{
152 int i;
153
154 errno = ENOENT;
155
156 if (!ops->hierarchies)
157 return NULL;
158
159 for (i = 0; ops->hierarchies[i]; i++) {
160 if (!controller) {
161 /* This is the empty unified hierarchy. */
162 if (ops->hierarchies[i]->controllers &&
163 !ops->hierarchies[i]->controllers[0])
164 return ops->hierarchies[i];
165 continue;
166 } else if (pure_unified_layout(ops) &&
167 strcmp(controller, "devices") == 0) {
168 if (ops->unified->bpf_device_controller)
169 return ops->unified;
170 break;
171 }
172
173 if (string_in_list(ops->hierarchies[i]->controllers, controller))
174 return ops->hierarchies[i];
175 }
176
177 return NULL;
178}
179
/* Thin forwarding wrapper around cgfsng_get_hierarchy(). */
static inline struct hierarchy *get_hierarchy(struct cgroup_ops *ops,
					      const char *controller)
{
	struct hierarchy *found = cgfsng_get_hierarchy(ops, controller);

	return found;
}
185
/* Return true when at least one string appears in both NULL-terminated lists
 * @l1 and @l2. If either list is NULL the answer is false.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cursor;

	if (l1 == NULL || l2 == NULL)
		return false;

	for (cursor = l1; *cursor != NULL; cursor++)
		if (string_in_list(l2, *cursor))
			return true;

	return false;
}
203
204/* For a null-terminated list of controllers @clist, return true if any of those
205 * controllers is already listed the null-terminated list of hierarchies @hlist.
206 * Realistically, if one is present, all must be present.
207 */
208static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
209{
210 int i;
211
212 if (!hlist)
213 return false;
214
215 for (i = 0; hlist[i]; i++)
216 if (controller_lists_intersect(hlist[i]->controllers, clist))
217 return true;
218
219 return false;
220}
221
222/* Get the controllers from a mountinfo line There are other ways we could get
223 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
224 * could parse the mount options. But we simply assume that the mountpoint must
225 * be /sys/fs/cgroup/controller-list
226 */
227static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
228 int type, char **controllers)
229{
230 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
231 * for legacy hierarchies.
232 */
233 int i;
234 char *p2, *tok;
235 char *p = line, *sep = ",";
236 char **aret = NULL;
237
238 for (i = 0; i < 4; i++) {
239 p = strchr(p, ' ');
240 if (!p)
241 return NULL;
242 p++;
243 }
244
245 /* Note, if we change how mountinfo works, then our caller will need to
246 * verify /sys/fs/cgroup/ in this field.
247 */
248 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
249 return NULL;
250
251 p += 15;
252 p2 = strchr(p, ' ');
253 if (!p2)
254 return NULL;
255 *p2 = '\0';
256
257 if (type == CGROUP_SUPER_MAGIC) {
258 __do_free char *dup = NULL;
259
260 /* strdup() here for v1 hierarchies. Otherwise
261 * lxc_iterate_parts() will destroy mountpoints such as
262 * "/sys/fs/cgroup/cpu,cpuacct".
263 */
264 dup = must_copy_string(p);
265 if (!dup)
266 return NULL;
267
268 lxc_iterate_parts (tok, dup, sep)
269 must_append_controller(klist, nlist, &aret, tok);
270 *controllers = move_ptr(dup);
271 }
272 *p2 = ' ';
273
274 return aret;
275}
276
/* Allocate a controller list that contains no controllers: the returned array
 * holds NULL in its first slot followed by the terminating NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot = append_null_to_list((void ***)&empty);

	empty[slot] = NULL;
	return empty;
}
286
287static char **cg_unified_get_controllers(const char *file)
288{
289 __do_free char *buf = NULL;
290 char *sep = " \t\n";
291 char **aret = NULL;
292 char *tok;
293
294 buf = read_file(file);
295 if (!buf)
296 return NULL;
297
298 lxc_iterate_parts(tok, buf, sep) {
299 int newentry;
300 char *copy;
301
302 newentry = append_null_to_list((void ***)&aret);
303 copy = must_copy_string(tok);
304 aret[newentry] = copy;
305 }
306
307 return aret;
308}
309
310static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
0fd1b770 311 char *base_path, int type)
5fbea8a6
CB
312{
313 struct hierarchy *new;
314 int newentry;
315
316 new = zalloc(sizeof(*new));
317 new->controllers = clist;
318 new->mountpoint = mountpoint;
0fd1b770 319 new->base_path = base_path;
5fbea8a6
CB
320 new->version = type;
321
322 newentry = append_null_to_list((void ***)h);
323 (*h)[newentry] = new;
324 return new;
325}
326
327/* Get a copy of the mountpoint from @line, which is a line from
328 * /proc/self/mountinfo.
329 */
330static char *cg_hybrid_get_mountpoint(char *line)
331{
332 int i;
333 size_t len;
334 char *p2;
335 char *p = line, *sret = NULL;
336
337 for (i = 0; i < 4; i++) {
338 p = strchr(p, ' ');
339 if (!p)
340 return NULL;
341 p++;
342 }
343
344 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
345 return NULL;
346
347 p2 = strchr(p + 15, ' ');
348 if (!p2)
349 return NULL;
350 *p2 = '\0';
351
352 len = strlen(p);
353 sret = must_realloc(NULL, len + 1);
354 memcpy(sret, p, len);
355 sret[len] = '\0';
356 return sret;
357}
358
/* Append a copy of @entry (made with must_copy_string()) to the
 * NULL-terminated string array *@list, growing the array as needed.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
368
369static int get_existing_subsystems(char ***klist, char ***nlist)
370{
371 __do_free char *line = NULL;
372 __do_fclose FILE *f = NULL;
373 size_t len = 0;
374
dbb1f822 375 f = fopen("/proc/self/cgroup", "re");
5fbea8a6
CB
376 if (!f)
377 return -1;
378
379 while (getline(&line, &len, f) != -1) {
380 char *p, *p2, *tok;
381 p = strchr(line, ':');
382 if (!p)
383 continue;
384 p++;
385 p2 = strchr(p, ':');
386 if (!p2)
387 continue;
388 *p2 = '\0';
389
390 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
391 * contains an entry of the form:
392 *
393 * 0::/some/path
394 *
395 * In this case we use "cgroup2" as controller name.
396 */
397 if ((p2 - p) == 0) {
398 must_append_string(klist, "cgroup2");
399 continue;
400 }
401
402 lxc_iterate_parts(tok, p, ",") {
403 if (strncmp(tok, "name=", 5) == 0)
404 must_append_string(nlist, tok);
405 else
406 must_append_string(klist, tok);
407 }
408 }
409
410 return 0;
411}
412
/* Strip trailing newline characters from @s in place. The first character of
 * the string is never removed, so a string consisting of a single "\n" is
 * left untouched.
 */
static void trim(char *s)
{
	size_t end = strlen(s);

	for (; end > 1 && s[end - 1] == '\n'; end--)
		s[end - 1] = '\0';
}
421
422/* __cg_mount_direct
423 *
424 * Mount cgroup hierarchies directly without using bind-mounts. The main
425 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
426 * cgroups for the LXC_AUTO_CGROUP_FULL option.
427 */
428static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath)
429{
430 __do_free char *controllers = NULL;
431 char *fstype = "cgroup2";
432 unsigned long flags = 0;
433 int ret;
434
435 flags |= MS_NOSUID;
436 flags |= MS_NOEXEC;
437 flags |= MS_NODEV;
438 flags |= MS_RELATIME;
439
440 if (h->version != CGROUP2_SUPER_MAGIC) {
441 controllers = lxc_string_join(",", (const char **)h->controllers, false);
442 if (!controllers)
443 return -ENOMEM;
444 fstype = "cgroup";
445 }
446
447 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
448 if (ret < 0)
449 return -1;
450
451 return 0;
452}
453
/* Mount hierarchy @h at @controllerpath; forwards to __cg_mount_direct() and
 * returns its result.
 */
static inline int cg_mount_cgroup_full(struct hierarchy *h,
				       const char *controllerpath)
{
	int ret = __cg_mount_direct(h, controllerpath);

	return ret;
}
459
460static bool cgfsng_mount(struct cgroup_ops *ops, const char *root)
461{
462 __do_free char *cgroup_root = NULL;
463 int ret;
464 bool retval = false;
465
466 if (!ops)
467 return ret_set_errno(false, ENOENT);
468
469 if (!ops->hierarchies)
470 return true;
471
472 cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
473 if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
474 return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;
475
476 /* mount tmpfs */
477 ret = safe_mount(NULL, cgroup_root, "tmpfs",
478 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
479 "size=10240k,mode=755", root);
480 if (ret < 0)
481 goto on_error;
482
483 for (int i = 0; ops->hierarchies[i]; i++) {
484 __do_free char *controllerpath = NULL;
485 struct hierarchy *h = ops->hierarchies[i];
486 char *controller = strrchr(h->mountpoint, '/');
487
488 if (!controller)
489 continue;
490 controller++;
491
492 controllerpath = must_make_path(cgroup_root, controller, NULL);
493 if (dir_exists(controllerpath))
494 continue;
495
496 ret = mkdir(controllerpath, 0755);
497 if (ret < 0)
498 log_error_errno(goto on_error, errno,
499 "Error creating cgroup path: %s",
500 controllerpath);
501
502 ret = cg_mount_cgroup_full( h, controllerpath);
503 if (ret < 0)
504 goto on_error;
505 }
506 retval = true;
507
508on_error:
509 return retval;
510}
511
5fbea8a6
CB
512static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
513{
514 int i = 0;
515
516 if (!ops)
517 return ret_set_errno(-1, ENOENT);
518
519 if (!ops->hierarchies)
520 return 0;
521
522 for (; ops->hierarchies[i]; i++)
523 ;
524
525 return i;
526}
527
528static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
529{
530 int i;
531
532 if (!ops)
533 return ret_set_errno(false, ENOENT);
534
535 if (!ops->hierarchies)
536 return false;
537
538 /* sanity check n */
539 for (i = 0; i < n; i++)
540 if (!ops->hierarchies[i])
541 return ret_set_errno(false, ENOENT);
542
543 *out = ops->hierarchies[i]->controllers;
544
545 return true;
546}
547
1ca6a467
CB
548static bool cgfsng_get(struct cgroup_ops *ops, const char *controller,
549 const char *cgroup, const char *file, char **value)
550{
551 __do_free char *path = NULL;
552 struct hierarchy *h;
553
554 h = ops->get_hierarchy(ops, controller);
555 if (!h)
556 return false;
557
075387cd 558 path = must_make_path(dot_or_empty(cgroup), cgroup, file, NULL);
1ca6a467
CB
559 *value = readat_file(h->fd, path);
560 return *value != NULL;
561}
562
66c5e848
CB
563static int cgfsng_get_memory(struct cgroup_ops *ops, const char *cgroup,
564 const char *file, char **value)
565{
566 __do_free char *path = NULL;
567 struct hierarchy *h;
568 int ret;
569
570 h = ops->get_hierarchy(ops, "memory");
571 if (!h)
572 return -1;
573
574 if (!is_unified_hierarchy(h)) {
575 if (strcmp(file, "memory.max") == 0)
576 file = "memory.limit_in_bytes";
577 else if (strcmp(file, "memory.swap.max") == 0)
578 file = "memory.memsw.limit_in_bytes";
579 else if (strcmp(file, "memory.swap.current") == 0)
580 file = "memory.memsw.usage_in_bytes";
581 else if (strcmp(file, "memory.current") == 0)
582 file = "memory.usage_in_bytes";
583 ret = CGROUP_SUPER_MAGIC;
584 } else {
585 ret = CGROUP2_SUPER_MAGIC;
586 }
587
075387cd 588 path = must_make_path(dot_or_empty(cgroup), cgroup, file, NULL);
66c5e848
CB
589 *value = readat_file(h->fd, path);
590 if (!*value)
591 ret = -1;
592
593 return ret;
594}
595
acff9786
CB
596static int cgfsng_get_memory_stats_fd(struct cgroup_ops *ops, const char *cgroup)
597{
598 __do_free char *path = NULL;
599 struct hierarchy *h;
600
601 h = ops->get_hierarchy(ops, "memory");
602 if (!h)
603 return -1;
604
605 path = must_make_path(dot_or_empty(cgroup), cgroup, "memory.stat", NULL);
606 return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
607}
608
66c5e848
CB
/* Read "memory.current" for @cgroup; see cgfsng_get_memory() for the return
 * value and the legacy name translation.
 */
static int cgfsng_get_memory_current(struct cgroup_ops *ops, const char *cgroup,
				     char **value)
{
	const char *file = "memory.current";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
614
/* Read "memory.swap.current" for @cgroup; see cgfsng_get_memory() for the
 * return value and the legacy name translation.
 */
static int cgfsng_get_memory_swap_current(struct cgroup_ops *ops,
					  const char *cgroup, char **value)
{
	const char *file = "memory.swap.current";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
620
/* Read "memory.max" for @cgroup; see cgfsng_get_memory() for the return value
 * and the legacy name translation.
 */
static int cgfsng_get_memory_max(struct cgroup_ops *ops, const char *cgroup,
				 char **value)
{
	const char *file = "memory.max";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
626
/* Read "memory.swap.max" for @cgroup; see cgfsng_get_memory() for the return
 * value and the legacy name translation.
 */
static int cgfsng_get_memory_swap_max(struct cgroup_ops *ops,
				      const char *cgroup, char **value)
{
	const char *file = "memory.swap.max";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
632
/* Read "memory.stat" for @cgroup; see cgfsng_get_memory() for the return
 * value.
 */
static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup,
				   char **value)
{
	const char *file = "memory.stat";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
638
2c3bcd9e
CB
/* Read the cpuset of the cgroup directory referred to by @cgroup_fd.
 *
 * "cpuset.cpus" is tried first; when it cannot be read or is empty,
 * "cpuset.cpus.effective" is tried. Returns the first non-empty contents
 * (owned by the caller) or NULL when both are unreadable or empty.
 */
static char *readat_cpuset(int cgroup_fd)
{
	char *cpus;

	cpus = readat_file(cgroup_fd, "cpuset.cpus");
	if (cpus && cpus[0] != '\0')
		return cpus;
	free(cpus);

	cpus = readat_file(cgroup_fd, "cpuset.cpus.effective");
	if (cpus && cpus[0] != '\0')
		return cpus;
	free(cpus);

	return NULL;
}
654
655static int cgfsng_get_cpuset_cpus(struct cgroup_ops *ops, const char *cgroup,
656 char **value)
657{
658 __do_close_prot_errno int cgroup_fd = -EBADF;
659 __do_free char *path = NULL;
660 char *v;
661 struct hierarchy *h;
662 int ret;
663
664 h = ops->get_hierarchy(ops, "cpuset");
665 if (!h)
666 return -1;
667
668 if (!is_unified_hierarchy(h))
669 ret = CGROUP_SUPER_MAGIC;
670 else
671 ret = CGROUP2_SUPER_MAGIC;
672
673 *value = NULL;
075387cd 674 path = must_make_path(dot_or_empty(cgroup), cgroup, NULL);
2c3bcd9e
CB
675 cgroup_fd = openat_safe(h->fd, path);
676 if (cgroup_fd < 0) {
677 return -1;
678 }
679 v = readat_cpuset(cgroup_fd);
680 if (v) {
681 *value = v;
682 return ret;
683 }
684
685 /*
686 * cpuset.cpus and cpuset.cpus.effective are empty so we need to look
687 * the nearest ancestor with a non-empty cpuset.cpus{.effective} file.
688 */
689 for (;;) {
690 int fd;
691
692 fd = openat_safe(cgroup_fd, "../");
693 if (fd < 0 || !is_cgroup_fd(fd)) {
694 fprintf(stderr, "2222: %s\n", strerror(errno));
695 return -1;
696 }
697
698 close_prot_errno_replace(cgroup_fd, fd);
699
700 v = readat_cpuset(fd);
701 if (v) {
702 *value = v;
703 return ret;
704 }
705 }
706
707 return -1;
708}
709
9a9484ab
CB
710static int cgfsng_get_io(struct cgroup_ops *ops, const char *cgroup,
711 const char *file, char **value)
712{
713 __do_free char *path = NULL;
714 struct hierarchy *h;
715 int ret;
716
717 h = ops->get_hierarchy(ops, "blkio");
718 if (!h)
719 return -1;
720
721 if (!is_unified_hierarchy(h))
722 ret = CGROUP_SUPER_MAGIC;
723 else
724 ret = CGROUP2_SUPER_MAGIC;
725
726 path = must_make_path(dot_or_empty(cgroup), cgroup, file, NULL);
727 *value = readat_file(h->fd, path);
728 if (!*value) {
729 if (errno == ENOENT)
730 errno = EOPNOTSUPP;
731 return ret_errno(errno);
732 }
733
734 return ret;
735}
736
/* Read "blkio.io_service_bytes_recursive" for @cgroup through cgfsng_get_io()
 * and return its result.
 */
static int cgfsng_get_io_service_bytes(struct cgroup_ops *ops,
				       const char *cgroup, char **value)
{
	const char *file = "blkio.io_service_bytes_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
742
/* Read "blkio.io_service_time_recursive" for @cgroup through cgfsng_get_io()
 * and return its result.
 */
static int cgfsng_get_io_service_time(struct cgroup_ops *ops,
				      const char *cgroup, char **value)
{
	const char *file = "blkio.io_service_time_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
748
/* Read "blkio.io_serviced_recursive" for @cgroup through cgfsng_get_io() and
 * return its result.
 */
static int cgfsng_get_io_serviced(struct cgroup_ops *ops, const char *cgroup,
				  char **value)
{
	const char *file = "blkio.io_serviced_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
754
/* Read "blkio.io_merged_recursive" for @cgroup through cgfsng_get_io() and
 * return its result.
 */
static int cgfsng_get_io_merged(struct cgroup_ops *ops, const char *cgroup,
				char **value)
{
	const char *file = "blkio.io_merged_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
760
/* Read "blkio.io_wait_time_recursive" for @cgroup through cgfsng_get_io() and
 * return its result.
 */
static int cgfsng_get_io_wait_time(struct cgroup_ops *ops, const char *cgroup,
				   char **value)
{
	const char *file = "blkio.io_wait_time_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
766
77f4399a
CB
767static bool cgfsng_can_use_cpuview(struct cgroup_ops *ops)
768{
769 struct hierarchy *cpu, *cpuacct;
770
771 if (pure_unified_layout(ops))
772 return false;
773
774 cpu = ops->get_hierarchy(ops, "cpu");
775 if (!cpu || is_unified_hierarchy(cpu))
776 return false;
777
778 cpuacct = ops->get_hierarchy(ops, "cpuacct");
779 if (!cpuacct || is_unified_hierarchy(cpuacct))
780 return false;
781
782 return true;
783}
784
5fbea8a6
CB
785/* At startup, parse_hierarchies finds all the info we need about cgroup
786 * mountpoints and current cgroups, and stores it in @d.
787 */
788static int cg_hybrid_init(struct cgroup_ops *ops)
789{
790 __do_free char *basecginfo = NULL;
791 __do_free char *line = NULL;
792 __do_fclose FILE *f = NULL;
793 int ret;
794 size_t len = 0;
795 char **klist = NULL, **nlist = NULL;
796
797 /* Root spawned containers escape the current cgroup, so use init's
798 * cgroups as our base in that case.
799 */
800 basecginfo = read_file("/proc/1/cgroup");
801 if (!basecginfo)
802 return ret_set_errno(-1, ENOMEM);
803
804 ret = get_existing_subsystems(&klist, &nlist);
805 if (ret < 0)
806 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
807
dbb1f822 808 f = fopen("/proc/self/mountinfo", "re");
5fbea8a6
CB
809 if (!f)
810 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
811
812 while (getline(&line, &len, f) != -1) {
813 int type;
814 struct hierarchy *new;
815 char *base_cgroup = NULL, *mountpoint = NULL;
816 char **controller_list = NULL;
817 __do_free char *controllers = NULL;
818
819 type = get_cgroup_version(line);
820 if (type == 0)
821 continue;
822
823 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
824 continue;
825
826 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
827 if (type == CGROUP2_SUPER_MAGIC)
828 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
829 else if (type == CGROUP_SUPER_MAGIC)
830 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
831 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
832 if (type == CGROUP_SUPER_MAGIC)
833 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
834 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
835 if (type == CGROUP2_SUPER_MAGIC)
836 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
837 }
838
839 controller_list = cg_hybrid_get_controllers(klist, nlist, line,
840 type, &controllers);
841 if (!controller_list && type == CGROUP_SUPER_MAGIC)
842 continue;
843
844 if (type == CGROUP_SUPER_MAGIC)
845 if (controller_list_is_dup(ops->hierarchies, controller_list))
846 ret_set_errno(goto next, EEXIST);
847
848 mountpoint = cg_hybrid_get_mountpoint(line);
849 if (!mountpoint)
850 log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
851
0fd1b770 852 if (type == CGROUP_SUPER_MAGIC)
5fbea8a6 853 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
0fd1b770 854 else
5fbea8a6 855 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
5fbea8a6
CB
856 if (!base_cgroup)
857 log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint);
858
859 trim(base_cgroup);
860 prune_init_scope(base_cgroup);
861
862 if (type == CGROUP2_SUPER_MAGIC) {
863 char *cgv2_ctrl_path;
864
865 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
866 "cgroup.controllers",
867 NULL);
868
869 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
870 free(cgv2_ctrl_path);
871 if (!controller_list)
872 controller_list = cg_unified_make_empty_controller();
873 }
874
875 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
876 new->__controllers = move_ptr(controllers);
877 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
878 ops->unified = new;
879
880 continue;
881
882 next:
883 free_string_list(controller_list);
884 free(mountpoint);
885 free(base_cgroup);
886 }
887
888 free_string_list(klist);
889 free_string_list(nlist);
890
891 return 0;
892}
893
894static int cg_unified_init(struct cgroup_ops *ops)
895{
896 __do_free char *subtree_path = NULL;
897 int ret;
898 char *mountpoint;
899 char **delegatable;
900 struct hierarchy *new;
901 char *base_cgroup = NULL;
902
903 ret = unified_cgroup_hierarchy();
904 if (ret == -ENOMEDIUM)
905 return ret_errno(ENOMEDIUM);
906
907 if (ret != CGROUP2_SUPER_MAGIC)
908 return 0;
909
910 base_cgroup = cg_unified_get_current_cgroup(1);
911 if (!base_cgroup)
912 return ret_errno(EINVAL);
913 prune_init_scope(base_cgroup);
914
915 /*
916 * We assume that the cgroup we're currently in has been delegated to
917 * us and we are free to further delege all of the controllers listed
918 * in cgroup.controllers further down the hierarchy.
919 */
920 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
921 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
922 delegatable = cg_unified_get_controllers(subtree_path);
923 if (!delegatable)
924 delegatable = cg_unified_make_empty_controller();
925
926 /* TODO: If the user requested specific controllers via lxc.cgroup.use
927 * we should verify here. The reason I'm not doing it right is that I'm
928 * not convinced that lxc.cgroup.use will be the future since it is a
929 * global property. I much rather have an option that lets you request
930 * controllers per container.
931 */
932
933 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
934
935 if (bpf_devices_cgroup_supported())
936 new->bpf_device_controller = 1;
937
938 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
939 ops->unified = new;
940 return CGROUP2_SUPER_MAGIC;
941}
942
943static int cg_init(struct cgroup_ops *ops)
944{
945 int ret;
946
947 ret = cg_unified_init(ops);
948 if (ret < 0)
949 return -1;
950
951 if (ret == CGROUP2_SUPER_MAGIC)
952 return 0;
953
954 return cg_hybrid_init(ops);
955}
956
957struct cgroup_ops *cgfsng_ops_init(void)
958{
959 __do_free struct cgroup_ops *cgfsng_ops = NULL;
960
961 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
962 if (!cgfsng_ops)
963 return ret_set_errno(NULL, ENOMEM);
964
965 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
966 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
967
968 if (cg_init(cgfsng_ops))
969 return NULL;
970
971 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
1ca6a467 972 cgfsng_ops->get = cgfsng_get;
5fbea8a6
CB
973 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
974 cgfsng_ops->get_hierarchy = get_hierarchy;
975 cgfsng_ops->driver = "cgfsng";
976 cgfsng_ops->version = "1.0.0";
977 cgfsng_ops->mount = cgfsng_mount;
5fbea8a6 978
66c5e848 979 /* memory */
acff9786 980 cgfsng_ops->get_memory_stats_fd = cgfsng_get_memory_stats_fd;
66c5e848
CB
981 cgfsng_ops->get_memory_stats = cgfsng_get_memory_stats;
982 cgfsng_ops->get_memory_max = cgfsng_get_memory_max;
983 cgfsng_ops->get_memory_swap_max = cgfsng_get_memory_swap_max;
984 cgfsng_ops->get_memory_current = cgfsng_get_memory_current;
985 cgfsng_ops->get_memory_swap_current = cgfsng_get_memory_swap_current;
986
2c3bcd9e
CB
987 /* cpuset */
988 cgfsng_ops->get_cpuset_cpus = cgfsng_get_cpuset_cpus;
77f4399a 989 cgfsng_ops->can_use_cpuview = cgfsng_can_use_cpuview;
2c3bcd9e 990
9a9484ab
CB
991 /* blkio */
992 cgfsng_ops->get_io_service_bytes = cgfsng_get_io_service_bytes;
993 cgfsng_ops->get_io_service_time = cgfsng_get_io_service_time;
994 cgfsng_ops->get_io_serviced = cgfsng_get_io_serviced;
995 cgfsng_ops->get_io_merged = cgfsng_get_io_merged;
996 cgfsng_ops->get_io_wait_time = cgfsng_get_io_wait_time;
997
998
5fbea8a6
CB
999 return move_ptr(cgfsng_ops);
1000}