]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/cgroups/cgfsng.c
proc_fuse: add /proc/slabinfo with slab accounting memcg
[mirror_lxcfs.git] / src / cgroups / cgfsng.c
CommitLineData
5fbea8a6
CB
1/* SPDX-License-Identifier: LGPL-2.1+ */
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15#ifndef _GNU_SOURCE
1f5596dd 16#define _GNU_SOURCE
5fbea8a6 17#endif
1f5596dd 18
f834b6bf
SP
19#include "../config.h"
20
21#ifdef HAVE_FUSE3
22#ifndef FUSE_USE_VERSION
23#define FUSE_USE_VERSION 30
24#endif
25#else
1f5596dd
CB
26#ifndef FUSE_USE_VERSION
27#define FUSE_USE_VERSION 26
28#endif
f834b6bf 29#endif
1f5596dd
CB
30
31#define _FILE_OFFSET_BITS 64
32
5fbea8a6
CB
33#include <ctype.h>
34#include <dirent.h>
35#include <errno.h>
36#include <grp.h>
37#include <linux/kdev_t.h>
38#include <linux/types.h>
39#include <poll.h>
40#include <signal.h>
41#include <stdint.h>
42#include <stdio.h>
43#include <stdlib.h>
44#include <string.h>
45#include <sys/mount.h>
46#include <sys/types.h>
47#include <unistd.h>
48
1f5596dd
CB
49#include "../macro.h"
50#include "../memory_utils.h"
757a63e7 51#include "../utils.h"
5fbea8a6
CB
52#include "cgroup.h"
53#include "cgroup2_devices.h"
54#include "cgroup_utils.h"
5fbea8a6 55
5fbea8a6
CB
/* Grow the NULL-terminated pointer array *@list by one slot. Do not fail.
 *
 * *@list may be NULL, in which case it is treated as an empty array. After the
 * call the array has room for one more element plus the terminating NULL,
 * which is written here. The returned index is the slot just before the new
 * terminator; the caller is expected to store its new element there.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	/* Count the entries currently in use. */
	if (*list) {
		while ((*list)[count])
			count++;
	}

	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
73
/* Report whether @entry equals one of the strings in the NULL-terminated
 * array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (!list)
		return false;

	for (cur = list; *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
90
/* Build a newly allocated string consisting of "name=" followed by @entry,
 * e.g. "systemd" becomes "name=systemd". Do not fail. The caller owns the
 * returned string.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	const size_t prefix_len = STRLITERALLEN("name=");
	size_t entry_len = strlen(entry);
	char *result;

	/* prefix + entry + terminating NUL */
	result = must_realloc(NULL, prefix_len + entry_len + 1);

	memcpy(result, "name=", prefix_len);
	memcpy(result + prefix_len, entry, entry_len);
	result[prefix_len + entry_len] = '\0';

	return result;
}
108
/* Append a copy of the controller @entry to the NULL-terminated list *@clist.
 * Do not fail. *@clist must be NULL on the first call.
 *
 * @klist holds the kernel subsystems and @nlist the named subsystems:
 *   - an entry present in both lists is ambiguous and is silently skipped;
 *   - an entry that already starts with "name=" or is a kernel subsystem is
 *     copied verbatim;
 *   - anything else is stored with a "name=" prefix.
 * (TODO: the ambiguous case could sometimes be resolved by remounting or by
 * comparing mountpoint contents with the current cgroup.)
 *
 * The list stays NULL-terminated.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry))
		return;

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}
140
141/* Given a handler's cgroup data, return the struct hierarchy for the controller
142 * @c, or NULL if there is none.
143 */
144static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops,
145 const char *controller)
146{
147 int i;
148
149 errno = ENOENT;
150
151 if (!ops->hierarchies)
152 return NULL;
153
154 for (i = 0; ops->hierarchies[i]; i++) {
155 if (!controller) {
156 /* This is the empty unified hierarchy. */
157 if (ops->hierarchies[i]->controllers &&
158 !ops->hierarchies[i]->controllers[0])
159 return ops->hierarchies[i];
160 continue;
161 } else if (pure_unified_layout(ops) &&
162 strcmp(controller, "devices") == 0) {
163 if (ops->unified->bpf_device_controller)
164 return ops->unified;
165 break;
166 }
167
168 if (string_in_list(ops->hierarchies[i]->controllers, controller))
169 return ops->hierarchies[i];
170 }
171
172 return NULL;
173}
174
5fbea8a6
CB
/* Return true when the NULL-terminated string lists @l1 and @l2 share at
 * least one string. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (!l1 || !l2)
		return false;

	for (cur = l1; *cur; cur++)
		if (string_in_list(l2, *cur))
			return true;

	return false;
}
192
193/* For a null-terminated list of controllers @clist, return true if any of those
194 * controllers is already listed the null-terminated list of hierarchies @hlist.
195 * Realistically, if one is present, all must be present.
196 */
197static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
198{
199 int i;
200
201 if (!hlist)
202 return false;
203
204 for (i = 0; hlist[i]; i++)
205 if (controller_lists_intersect(hlist[i]->controllers, clist))
206 return true;
207
208 return false;
209}
210
211/* Get the controllers from a mountinfo line There are other ways we could get
212 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
213 * could parse the mount options. But we simply assume that the mountpoint must
214 * be /sys/fs/cgroup/controller-list
215 */
216static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
217 int type, char **controllers)
218{
219 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
220 * for legacy hierarchies.
221 */
222 int i;
223 char *p2, *tok;
224 char *p = line, *sep = ",";
225 char **aret = NULL;
226
227 for (i = 0; i < 4; i++) {
228 p = strchr(p, ' ');
229 if (!p)
230 return NULL;
231 p++;
232 }
233
234 /* Note, if we change how mountinfo works, then our caller will need to
235 * verify /sys/fs/cgroup/ in this field.
236 */
237 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
238 return NULL;
239
240 p += 15;
241 p2 = strchr(p, ' ');
242 if (!p2)
243 return NULL;
244 *p2 = '\0';
245
246 if (type == CGROUP_SUPER_MAGIC) {
247 __do_free char *dup = NULL;
248
249 /* strdup() here for v1 hierarchies. Otherwise
250 * lxc_iterate_parts() will destroy mountpoints such as
251 * "/sys/fs/cgroup/cpu,cpuacct".
252 */
253 dup = must_copy_string(p);
254 if (!dup)
255 return NULL;
256
257 lxc_iterate_parts (tok, dup, sep)
258 must_append_controller(klist, nlist, &aret, tok);
259 *controllers = move_ptr(dup);
260 }
261 *p2 = ' ';
262
263 return aret;
264}
265
/* Allocate a controller list that contains no controllers. The result is a
 * non-NULL array whose first element is NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **list = NULL;
	int slot = append_null_to_list((void ***)&list);

	list[slot] = NULL;

	return list;
}
275
276static char **cg_unified_get_controllers(const char *file)
277{
278 __do_free char *buf = NULL;
279 char *sep = " \t\n";
280 char **aret = NULL;
281 char *tok;
282
283 buf = read_file(file);
284 if (!buf)
285 return NULL;
286
287 lxc_iterate_parts(tok, buf, sep) {
288 int newentry;
289 char *copy;
290
291 newentry = append_null_to_list((void ***)&aret);
292 copy = must_copy_string(tok);
293 aret[newentry] = copy;
294 }
295
296 return aret;
297}
298
299static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
0fd1b770 300 char *base_path, int type)
5fbea8a6
CB
301{
302 struct hierarchy *new;
303 int newentry;
304
305 new = zalloc(sizeof(*new));
306 new->controllers = clist;
307 new->mountpoint = mountpoint;
0fd1b770 308 new->base_path = base_path;
5fbea8a6
CB
309 new->version = type;
310
311 newentry = append_null_to_list((void ***)h);
312 (*h)[newentry] = new;
313 return new;
314}
315
316/* Get a copy of the mountpoint from @line, which is a line from
317 * /proc/self/mountinfo.
318 */
319static char *cg_hybrid_get_mountpoint(char *line)
320{
321 int i;
322 size_t len;
323 char *p2;
324 char *p = line, *sret = NULL;
325
326 for (i = 0; i < 4; i++) {
327 p = strchr(p, ' ');
328 if (!p)
329 return NULL;
330 p++;
331 }
332
333 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
334 return NULL;
335
336 p2 = strchr(p + 15, ' ');
337 if (!p2)
338 return NULL;
339 *p2 = '\0';
340
341 len = strlen(p);
342 sret = must_realloc(NULL, len + 1);
343 memcpy(sret, p, len);
344 sret[len] = '\0';
345 return sret;
346}
347
/* Append a copy of @entry to the NULL-terminated string list *@list, growing
 * the list as needed. Do not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
357
358static int get_existing_subsystems(char ***klist, char ***nlist)
359{
360 __do_free char *line = NULL;
361 __do_fclose FILE *f = NULL;
362 size_t len = 0;
363
dbb1f822 364 f = fopen("/proc/self/cgroup", "re");
5fbea8a6
CB
365 if (!f)
366 return -1;
367
368 while (getline(&line, &len, f) != -1) {
369 char *p, *p2, *tok;
370 p = strchr(line, ':');
371 if (!p)
372 continue;
373 p++;
374 p2 = strchr(p, ':');
375 if (!p2)
376 continue;
377 *p2 = '\0';
378
379 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
380 * contains an entry of the form:
381 *
382 * 0::/some/path
383 *
384 * In this case we use "cgroup2" as controller name.
385 */
386 if ((p2 - p) == 0) {
387 must_append_string(klist, "cgroup2");
388 continue;
389 }
390
391 lxc_iterate_parts(tok, p, ",") {
392 if (strncmp(tok, "name=", 5) == 0)
393 must_append_string(nlist, tok);
394 else
395 must_append_string(klist, tok);
396 }
397 }
398
399 return 0;
400}
401
/* Strip trailing newline characters from @s in place. The first character of
 * the string is never removed, so a string consisting of a single "\n" is
 * left untouched.
 */
static void trim(char *s)
{
	size_t n = strlen(s);

	for (; n > 1 && s[n - 1] == '\n'; n--)
		s[n - 1] = '\0';
}
410
411/* __cg_mount_direct
412 *
413 * Mount cgroup hierarchies directly without using bind-mounts. The main
414 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
415 * cgroups for the LXC_AUTO_CGROUP_FULL option.
416 */
417static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath)
418{
419 __do_free char *controllers = NULL;
420 char *fstype = "cgroup2";
421 unsigned long flags = 0;
422 int ret;
423
424 flags |= MS_NOSUID;
425 flags |= MS_NOEXEC;
426 flags |= MS_NODEV;
427 flags |= MS_RELATIME;
428
429 if (h->version != CGROUP2_SUPER_MAGIC) {
430 controllers = lxc_string_join(",", (const char **)h->controllers, false);
431 if (!controllers)
432 return -ENOMEM;
433 fstype = "cgroup";
434 }
435
436 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
437 if (ret < 0)
438 return -1;
439
440 return 0;
441}
442
/* Mount hierarchy @h at @controllerpath; thin wrapper that forwards to
 * __cg_mount_direct() and passes its result through unchanged.
 */
static inline int cg_mount_cgroup_full(struct hierarchy *h,
				       const char *controllerpath)
{
	int ret = __cg_mount_direct(h, controllerpath);

	return ret;
}
448
449static bool cgfsng_mount(struct cgroup_ops *ops, const char *root)
450{
451 __do_free char *cgroup_root = NULL;
452 int ret;
453 bool retval = false;
454
455 if (!ops)
456 return ret_set_errno(false, ENOENT);
457
458 if (!ops->hierarchies)
459 return true;
460
461 cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
462 if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
463 return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;
464
465 /* mount tmpfs */
466 ret = safe_mount(NULL, cgroup_root, "tmpfs",
467 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
468 "size=10240k,mode=755", root);
469 if (ret < 0)
470 goto on_error;
471
472 for (int i = 0; ops->hierarchies[i]; i++) {
473 __do_free char *controllerpath = NULL;
474 struct hierarchy *h = ops->hierarchies[i];
475 char *controller = strrchr(h->mountpoint, '/');
476
477 if (!controller)
478 continue;
479 controller++;
480
481 controllerpath = must_make_path(cgroup_root, controller, NULL);
482 if (dir_exists(controllerpath))
483 continue;
484
485 ret = mkdir(controllerpath, 0755);
486 if (ret < 0)
487 log_error_errno(goto on_error, errno,
488 "Error creating cgroup path: %s",
489 controllerpath);
490
8a03c08b 491 ret = cg_mount_cgroup_full(h, controllerpath);
5fbea8a6
CB
492 if (ret < 0)
493 goto on_error;
494 }
495 retval = true;
496
497on_error:
498 return retval;
499}
500
5fbea8a6
CB
501static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
502{
503 int i = 0;
504
505 if (!ops)
506 return ret_set_errno(-1, ENOENT);
507
508 if (!ops->hierarchies)
509 return 0;
510
511 for (; ops->hierarchies[i]; i++)
512 ;
513
514 return i;
515}
516
517static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
518{
519 int i;
520
521 if (!ops)
522 return ret_set_errno(false, ENOENT);
523
524 if (!ops->hierarchies)
525 return false;
526
527 /* sanity check n */
528 for (i = 0; i < n; i++)
529 if (!ops->hierarchies[i])
530 return ret_set_errno(false, ENOENT);
531
532 *out = ops->hierarchies[i]->controllers;
533
534 return true;
535}
536
1ca6a467
CB
537static bool cgfsng_get(struct cgroup_ops *ops, const char *controller,
538 const char *cgroup, const char *file, char **value)
539{
540 __do_free char *path = NULL;
541 struct hierarchy *h;
542
543 h = ops->get_hierarchy(ops, controller);
544 if (!h)
545 return false;
546
925d5849 547 path = must_make_path_relative(cgroup, file, NULL);
1ca6a467
CB
548 *value = readat_file(h->fd, path);
549 return *value != NULL;
550}
551
66c5e848
CB
552static int cgfsng_get_memory(struct cgroup_ops *ops, const char *cgroup,
553 const char *file, char **value)
554{
555 __do_free char *path = NULL;
556 struct hierarchy *h;
114eb8b8 557 int cgroup2_root_fd, layout, ret;
66c5e848
CB
558
559 h = ops->get_hierarchy(ops, "memory");
560 if (!h)
561 return -1;
562
563 if (!is_unified_hierarchy(h)) {
564 if (strcmp(file, "memory.max") == 0)
565 file = "memory.limit_in_bytes";
566 else if (strcmp(file, "memory.swap.max") == 0)
567 file = "memory.memsw.limit_in_bytes";
568 else if (strcmp(file, "memory.swap.current") == 0)
569 file = "memory.memsw.usage_in_bytes";
570 else if (strcmp(file, "memory.current") == 0)
571 file = "memory.usage_in_bytes";
114eb8b8
CB
572 layout = CGROUP_SUPER_MAGIC;
573 cgroup2_root_fd = -EBADF;
66c5e848 574 } else {
114eb8b8
CB
575 layout = CGROUP2_SUPER_MAGIC;
576 cgroup2_root_fd = ops->cgroup2_root_fd;
66c5e848
CB
577 }
578
114eb8b8
CB
579 path = must_make_path_relative(cgroup, NULL);
580 ret = cgroup_walkup_to_root(cgroup2_root_fd, h->fd, path, file, value);
581 if (ret < 0)
582 return ret;
583 if (ret == 1) {
584 *value = strdup("");
585 if (!*value)
586 return -ENOMEM;
587 }
66c5e848 588
114eb8b8 589 return layout;
66c5e848
CB
590}
591
acff9786
CB
592static int cgfsng_get_memory_stats_fd(struct cgroup_ops *ops, const char *cgroup)
593{
594 __do_free char *path = NULL;
595 struct hierarchy *h;
596
597 h = ops->get_hierarchy(ops, "memory");
598 if (!h)
599 return -1;
600
925d5849 601 path = must_make_path_relative(cgroup, "memory.stat", NULL);
acff9786
CB
602 return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
603}
604
66c5e848
CB
/* Fetch "memory.current" for @cgroup; the result of cgfsng_get_memory() is
 * passed through unchanged.
 */
static int cgfsng_get_memory_current(struct cgroup_ops *ops, const char *cgroup,
				     char **value)
{
	const char *file = "memory.current";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
610
/* Fetch "memory.swap.current" for @cgroup; the result of cgfsng_get_memory()
 * is passed through unchanged.
 */
static int cgfsng_get_memory_swap_current(struct cgroup_ops *ops,
					  const char *cgroup, char **value)
{
	const char *file = "memory.swap.current";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
616
/* Fetch "memory.max" for @cgroup; the result of cgfsng_get_memory() is passed
 * through unchanged.
 */
static int cgfsng_get_memory_max(struct cgroup_ops *ops, const char *cgroup,
				 char **value)
{
	const char *file = "memory.max";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
622
4032b735
SG
/* Fetch "memory.swappiness" for @cgroup; the result of cgfsng_get_memory() is
 * passed through unchanged.
 */
static int cgfsng_get_memory_swappiness(struct cgroup_ops *ops, const char *cgroup,
					char **value)
{
	const char *file = "memory.swappiness";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
628
66c5e848
CB
/* Fetch "memory.swap.max" for @cgroup; the result of cgfsng_get_memory() is
 * passed through unchanged.
 */
static int cgfsng_get_memory_swap_max(struct cgroup_ops *ops,
				      const char *cgroup, char **value)
{
	const char *file = "memory.swap.max";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
634
6cc153e6
FS
635static int cgfsng_get_memory_slabinfo_fd(struct cgroup_ops *ops, const char *cgroup)
636{
637 __do_free char *path = NULL;
638 struct hierarchy *h;
639
640 h = ops->get_hierarchy(ops, "memory");
641 if (!h)
642 return -1;
643
644 if (faccessat(h->fd, "memory.kmem.slabinfo", F_OK, 0))
645 return -1;
646
647 path = must_make_path_relative(cgroup, "memory.kmem.slabinfo", NULL);
648 return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
649}
650
c6805016
CB
651static bool cgfsng_can_use_swap(struct cgroup_ops *ops)
652{
653 bool has_swap = false;
654 struct hierarchy *h;
655
656 h = ops->get_hierarchy(ops, "memory");
657 if (!h)
658 return false;
659
660 if (is_unified_hierarchy(h)) {
661 if (faccessat(h->fd, "memory.swap.max", F_OK, 0))
662 return false;
663
664 if (faccessat(h->fd, "memory.swap.current", F_OK, 0))
665 return false;
666
667 has_swap = true;
668 } else {
669 if (faccessat(h->fd, "memory.memsw.limit_in_bytes", F_OK, 0))
670 return false;
671
672 if (faccessat(h->fd, "memory.memsw.usage_in_bytes", F_OK, 0))
673 return false;
674
675 has_swap = true;
676 }
677
678 return has_swap;
679}
680
66c5e848
CB
/* Fetch "memory.stat" for @cgroup; the result of cgfsng_get_memory() is
 * passed through unchanged.
 */
static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup,
				   char **value)
{
	const char *file = "memory.stat";

	return cgfsng_get_memory(ops, cgroup, file, value);
}
686
2c3bcd9e
CB
687static char *readat_cpuset(int cgroup_fd)
688{
689 __do_free char *val = NULL;
690
691 val = readat_file(cgroup_fd, "cpuset.cpus");
692 if (val && strcmp(val, "") != 0)
693 return move_ptr(val);
694
695 free_disarm(val);
696 val = readat_file(cgroup_fd, "cpuset.cpus.effective");
697 if (val && strcmp(val, "") != 0)
698 return move_ptr(val);
699
700 return NULL;
701}
702
703static int cgfsng_get_cpuset_cpus(struct cgroup_ops *ops, const char *cgroup,
704 char **value)
705{
05b7a16d 706 __do_close int cgroup_fd = -EBADF;
2c3bcd9e
CB
707 __do_free char *path = NULL;
708 char *v;
709 struct hierarchy *h;
710 int ret;
711
712 h = ops->get_hierarchy(ops, "cpuset");
713 if (!h)
714 return -1;
715
716 if (!is_unified_hierarchy(h))
717 ret = CGROUP_SUPER_MAGIC;
718 else
719 ret = CGROUP2_SUPER_MAGIC;
720
721 *value = NULL;
925d5849 722 path = must_make_path_relative(cgroup, NULL);
2c3bcd9e
CB
723 cgroup_fd = openat_safe(h->fd, path);
724 if (cgroup_fd < 0) {
725 return -1;
726 }
727 v = readat_cpuset(cgroup_fd);
728 if (v) {
729 *value = v;
730 return ret;
731 }
732
733 /*
734 * cpuset.cpus and cpuset.cpus.effective are empty so we need to look
735 * the nearest ancestor with a non-empty cpuset.cpus{.effective} file.
736 */
737 for (;;) {
738 int fd;
739
740 fd = openat_safe(cgroup_fd, "../");
227479b5 741 if (fd < 0 || !is_cgroup_fd(fd))
2c3bcd9e 742 return -1;
2c3bcd9e
CB
743
744 close_prot_errno_replace(cgroup_fd, fd);
745
746 v = readat_cpuset(fd);
747 if (v) {
748 *value = v;
749 return ret;
750 }
751 }
752
753 return -1;
754}
755
9a9484ab
CB
756static int cgfsng_get_io(struct cgroup_ops *ops, const char *cgroup,
757 const char *file, char **value)
758{
759 __do_free char *path = NULL;
760 struct hierarchy *h;
761 int ret;
762
763 h = ops->get_hierarchy(ops, "blkio");
764 if (!h)
765 return -1;
766
767 if (!is_unified_hierarchy(h))
768 ret = CGROUP_SUPER_MAGIC;
769 else
770 ret = CGROUP2_SUPER_MAGIC;
771
925d5849 772 path = must_make_path_relative(cgroup, file, NULL);
9a9484ab
CB
773 *value = readat_file(h->fd, path);
774 if (!*value) {
775 if (errno == ENOENT)
776 errno = EOPNOTSUPP;
777 return ret_errno(errno);
778 }
779
780 return ret;
781}
782
/* Fetch "blkio.io_service_bytes_recursive" for @cgroup; the result of
 * cgfsng_get_io() is passed through unchanged.
 */
static int cgfsng_get_io_service_bytes(struct cgroup_ops *ops,
				       const char *cgroup, char **value)
{
	const char *file = "blkio.io_service_bytes_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
788
/* Fetch "blkio.io_service_time_recursive" for @cgroup; the result of
 * cgfsng_get_io() is passed through unchanged.
 */
static int cgfsng_get_io_service_time(struct cgroup_ops *ops,
				      const char *cgroup, char **value)
{
	const char *file = "blkio.io_service_time_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
794
/* Fetch "blkio.io_serviced_recursive" for @cgroup; the result of
 * cgfsng_get_io() is passed through unchanged.
 */
static int cgfsng_get_io_serviced(struct cgroup_ops *ops, const char *cgroup,
				  char **value)
{
	const char *file = "blkio.io_serviced_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
800
/* Fetch "blkio.io_merged_recursive" for @cgroup; the result of
 * cgfsng_get_io() is passed through unchanged.
 */
static int cgfsng_get_io_merged(struct cgroup_ops *ops, const char *cgroup,
				char **value)
{
	const char *file = "blkio.io_merged_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
806
/* Fetch "blkio.io_wait_time_recursive" for @cgroup; the result of
 * cgfsng_get_io() is passed through unchanged.
 */
static int cgfsng_get_io_wait_time(struct cgroup_ops *ops, const char *cgroup,
				   char **value)
{
	const char *file = "blkio.io_wait_time_recursive";

	return cgfsng_get_io(ops, cgroup, file, value);
}
812
77f4399a
CB
813static bool cgfsng_can_use_cpuview(struct cgroup_ops *ops)
814{
815 struct hierarchy *cpu, *cpuacct;
816
817 if (pure_unified_layout(ops))
9844eea7 818 return true;
77f4399a
CB
819
820 cpu = ops->get_hierarchy(ops, "cpu");
821 if (!cpu || is_unified_hierarchy(cpu))
822 return false;
823
824 cpuacct = ops->get_hierarchy(ops, "cpuacct");
825 if (!cpuacct || is_unified_hierarchy(cpuacct))
826 return false;
827
828 return true;
829}
830
5fbea8a6
CB
831/* At startup, parse_hierarchies finds all the info we need about cgroup
832 * mountpoints and current cgroups, and stores it in @d.
833 */
834static int cg_hybrid_init(struct cgroup_ops *ops)
835{
836 __do_free char *basecginfo = NULL;
837 __do_free char *line = NULL;
757a63e7 838 __do_free void *fopen_cache = NULL;
5fbea8a6
CB
839 __do_fclose FILE *f = NULL;
840 int ret;
841 size_t len = 0;
842 char **klist = NULL, **nlist = NULL;
843
844 /* Root spawned containers escape the current cgroup, so use init's
845 * cgroups as our base in that case.
846 */
847 basecginfo = read_file("/proc/1/cgroup");
848 if (!basecginfo)
849 return ret_set_errno(-1, ENOMEM);
850
851 ret = get_existing_subsystems(&klist, &nlist);
852 if (ret < 0)
853 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
854
757a63e7 855 f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
5fbea8a6
CB
856 if (!f)
857 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
858
859 while (getline(&line, &len, f) != -1) {
860 int type;
861 struct hierarchy *new;
862 char *base_cgroup = NULL, *mountpoint = NULL;
863 char **controller_list = NULL;
864 __do_free char *controllers = NULL;
865
866 type = get_cgroup_version(line);
867 if (type == 0)
868 continue;
869
870 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
871 continue;
872
873 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
874 if (type == CGROUP2_SUPER_MAGIC)
875 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
876 else if (type == CGROUP_SUPER_MAGIC)
877 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
878 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
879 if (type == CGROUP_SUPER_MAGIC)
880 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
881 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
882 if (type == CGROUP2_SUPER_MAGIC)
883 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
884 }
885
886 controller_list = cg_hybrid_get_controllers(klist, nlist, line,
887 type, &controllers);
888 if (!controller_list && type == CGROUP_SUPER_MAGIC)
889 continue;
890
891 if (type == CGROUP_SUPER_MAGIC)
892 if (controller_list_is_dup(ops->hierarchies, controller_list))
893 ret_set_errno(goto next, EEXIST);
894
895 mountpoint = cg_hybrid_get_mountpoint(line);
896 if (!mountpoint)
897 log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
898
0fd1b770 899 if (type == CGROUP_SUPER_MAGIC)
5fbea8a6 900 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
0fd1b770 901 else
5fbea8a6 902 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
5fbea8a6
CB
903 if (!base_cgroup)
904 log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint);
905
906 trim(base_cgroup);
907 prune_init_scope(base_cgroup);
908
909 if (type == CGROUP2_SUPER_MAGIC) {
910 char *cgv2_ctrl_path;
911
912 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
913 "cgroup.controllers",
914 NULL);
915
916 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
917 free(cgv2_ctrl_path);
918 if (!controller_list)
919 controller_list = cg_unified_make_empty_controller();
920 }
921
922 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
923 new->__controllers = move_ptr(controllers);
924 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
925 ops->unified = new;
926
927 continue;
928
929 next:
930 free_string_list(controller_list);
931 free(mountpoint);
932 free(base_cgroup);
933 }
934
935 free_string_list(klist);
936 free_string_list(nlist);
937
938 return 0;
939}
940
941static int cg_unified_init(struct cgroup_ops *ops)
942{
943 __do_free char *subtree_path = NULL;
944 int ret;
945 char *mountpoint;
946 char **delegatable;
947 struct hierarchy *new;
948 char *base_cgroup = NULL;
949
950 ret = unified_cgroup_hierarchy();
951 if (ret == -ENOMEDIUM)
952 return ret_errno(ENOMEDIUM);
953
954 if (ret != CGROUP2_SUPER_MAGIC)
955 return 0;
956
957 base_cgroup = cg_unified_get_current_cgroup(1);
958 if (!base_cgroup)
959 return ret_errno(EINVAL);
960 prune_init_scope(base_cgroup);
961
962 /*
963 * We assume that the cgroup we're currently in has been delegated to
964 * us and we are free to further delege all of the controllers listed
965 * in cgroup.controllers further down the hierarchy.
966 */
967 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
968 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
969 delegatable = cg_unified_get_controllers(subtree_path);
970 if (!delegatable)
971 delegatable = cg_unified_make_empty_controller();
972
973 /* TODO: If the user requested specific controllers via lxc.cgroup.use
974 * we should verify here. The reason I'm not doing it right is that I'm
975 * not convinced that lxc.cgroup.use will be the future since it is a
976 * global property. I much rather have an option that lets you request
977 * controllers per container.
978 */
979
980 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
981
982 if (bpf_devices_cgroup_supported())
983 new->bpf_device_controller = 1;
984
985 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
986 ops->unified = new;
114eb8b8
CB
987
988 ops->cgroup2_root_fd = open(DEFAULT_CGROUP_MOUNTPOINT, O_DIRECTORY | O_PATH | O_CLOEXEC);
989 if (ops->cgroup2_root_fd < 0)
990 return -errno;
991
5fbea8a6
CB
992 return CGROUP2_SUPER_MAGIC;
993}
994
995static int cg_init(struct cgroup_ops *ops)
996{
997 int ret;
998
999 ret = cg_unified_init(ops);
1000 if (ret < 0)
1001 return -1;
1002
1003 if (ret == CGROUP2_SUPER_MAGIC)
1004 return 0;
1005
1006 return cg_hybrid_init(ops);
1007}
1008
1009struct cgroup_ops *cgfsng_ops_init(void)
1010{
1011 __do_free struct cgroup_ops *cgfsng_ops = NULL;
1012
114eb8b8 1013 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
5fbea8a6
CB
1014 if (!cgfsng_ops)
1015 return ret_set_errno(NULL, ENOMEM);
1016
5fbea8a6 1017 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
114eb8b8
CB
1018 cgfsng_ops->mntns_fd = -EBADF;
1019 cgfsng_ops->cgroup2_root_fd = -EBADF;
5fbea8a6
CB
1020
1021 if (cg_init(cgfsng_ops))
1022 return NULL;
1023
1024 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
1ca6a467 1025 cgfsng_ops->get = cgfsng_get;
5fbea8a6 1026 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
8a03c08b 1027 cgfsng_ops->get_hierarchy = cgfsng_get_hierarchy;
5fbea8a6
CB
1028 cgfsng_ops->driver = "cgfsng";
1029 cgfsng_ops->version = "1.0.0";
1030 cgfsng_ops->mount = cgfsng_mount;
5fbea8a6 1031
66c5e848 1032 /* memory */
acff9786 1033 cgfsng_ops->get_memory_stats_fd = cgfsng_get_memory_stats_fd;
66c5e848
CB
1034 cgfsng_ops->get_memory_stats = cgfsng_get_memory_stats;
1035 cgfsng_ops->get_memory_max = cgfsng_get_memory_max;
4032b735 1036 cgfsng_ops->get_memory_swappiness = cgfsng_get_memory_swappiness;
66c5e848
CB
1037 cgfsng_ops->get_memory_swap_max = cgfsng_get_memory_swap_max;
1038 cgfsng_ops->get_memory_current = cgfsng_get_memory_current;
1039 cgfsng_ops->get_memory_swap_current = cgfsng_get_memory_swap_current;
6cc153e6 1040 cgfsng_ops->get_memory_slabinfo_fd = cgfsng_get_memory_slabinfo_fd;
c6805016 1041 cgfsng_ops->can_use_swap = cgfsng_can_use_swap;
66c5e848 1042
2c3bcd9e
CB
1043 /* cpuset */
1044 cgfsng_ops->get_cpuset_cpus = cgfsng_get_cpuset_cpus;
77f4399a 1045 cgfsng_ops->can_use_cpuview = cgfsng_can_use_cpuview;
2c3bcd9e 1046
9a9484ab
CB
1047 /* blkio */
1048 cgfsng_ops->get_io_service_bytes = cgfsng_get_io_service_bytes;
1049 cgfsng_ops->get_io_service_time = cgfsng_get_io_service_time;
1050 cgfsng_ops->get_io_serviced = cgfsng_get_io_serviced;
1051 cgfsng_ops->get_io_merged = cgfsng_get_io_merged;
1052 cgfsng_ops->get_io_wait_time = cgfsng_get_io_wait_time;
1053
1054
5fbea8a6
CB
1055 return move_ptr(cgfsng_ops);
1056}