]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
tree-wide: s/__unused/__lxc_unused/g
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
d38dd64a
CB
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
a54694f8
CB
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
d38dd64a
CB
22#include <linux/kdev_t.h>
23#include <linux/types.h>
942e193e
CB
24#include <poll.h>
25#include <signal.h>
a54694f8 26#include <stdint.h>
ccb4cabe
SH
27#include <stdio.h>
28#include <stdlib.h>
a54694f8 29#include <string.h>
438c4581 30#include <sys/types.h>
d38dd64a 31#include <unistd.h>
c8bf519d 32
b635e92d 33#include "caps.h"
ccb4cabe 34#include "cgroup.h"
bf651989 35#include "cgroup2_devices.h"
6328fd9c 36#include "cgroup_utils.h"
ccb4cabe 37#include "commands.h"
43654d34 38#include "conf.h"
d38dd64a 39#include "config.h"
a54694f8 40#include "log.h"
c19ad94b 41#include "macro.h"
018051e3 42#include "mainloop.h"
861cb8c2 43#include "memory_utils.h"
43654d34 44#include "storage/storage.h"
a54694f8 45#include "utils.h"
ccb4cabe 46
64e82f8b
DJ
47#ifndef HAVE_STRLCPY
48#include "include/strlcpy.h"
49#endif
50
3ebe2fbd
DJ
51#ifndef HAVE_STRLCAT
52#include "include/strlcat.h"
53#endif
54
ac2cecc4 55lxc_log_define(cgfsng, cgroup);
ccb4cabe 56
ccb4cabe
SH
/* Release every string of the NULL-terminated array @clist and then the array
 * itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (clist == NULL)
		return;

	for (cur = clist; *cur != NULL; cur++)
		free(*cur);

	free(clist);
}
69
8b8db2f6
CB
/* Grow the NULL-terminated pointer array *@list by one slot. @list may point to
 * a NULL pointer, in which case a fresh array is allocated. The array is
 * resized through must_realloc() and the new last slot is set to NULL so the
 * list stays terminated.
 *
 * Returns the index of the slot in front of the terminator, i.e. the slot the
 * caller is expected to fill in. That slot is not initialized for a freshly
 * allocated list.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list != NULL) {
		while ((*list)[count] != NULL)
			count++;
	}

	/* One slot for the new entry plus one for the terminating NULL. */
	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
87
8073018d
CB
/* Check whether @entry compares equal (strcmp()) to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (list == NULL)
		return false;

	for (cur = list; *cur != NULL; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
104
ac010944
CB
/* Build a newly allocated string consisting of "name=" followed by @entry,
 * e.g. "systemd" becomes "name=systemd". The buffer comes from must_realloc()
 * and is owned by the caller.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	const size_t prefix_len = STRLITERALLEN("name=");
	size_t entry_len = strlen(entry);
	char *result;

	/* prefix + entry + terminating NUL byte */
	result = must_realloc(NULL, entry_len + 6);

	memcpy(result, "name=", prefix_len);
	memcpy(result + prefix_len, entry, entry_len);
	result[entry_len + 5] = '\0';

	return result;
}
122
42a993b4
CB
/* Append a copy of the controller @entry to the NULL-terminated list *@clist.
 * *@clist must be NULL on the first call; the list stays NULL-terminated.
 *
 * Named subsystems are handled here as well:
 * - an entry that already starts with "name=" or that is listed in the kernel
 *   subsystem list @klist is copied verbatim;
 * - any other entry gets "name=" prepended;
 * - an entry found in both @klist and the named subsystem list @nlist is
 *   ambiguous and is refused: an error is logged and nothing is appended.
 *   (TODO: We could work around this in some cases by just remounting to be
 *   unambiguous, or by comparing mountpoint contents with current cgroup.)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}
157
2a63b5cb
CB
158static inline bool pure_unified_layout(const struct cgroup_ops *ops)
159{
160 return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
161}
162
5ae0207c
CB
163/* Given a handler's cgroup data, return the struct hierarchy for the controller
164 * @c, or NULL if there is none.
ccb4cabe 165 */
27a5132c 166struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
ccb4cabe
SH
167{
168 int i;
169
27a5132c
CB
170 errno = ENOENT;
171
172 if (!ops->hierarchies) {
173 TRACE("There are no useable cgroup controllers");
ccb4cabe 174 return NULL;
27a5132c 175 }
d6337a5f 176
2202afc9 177 for (i = 0; ops->hierarchies[i]; i++) {
27a5132c 178 if (!controller) {
d6337a5f 179 /* This is the empty unified hierarchy. */
2202afc9
CB
180 if (ops->hierarchies[i]->controllers &&
181 !ops->hierarchies[i]->controllers[0])
182 return ops->hierarchies[i];
106f1f38 183 continue;
2a63b5cb
CB
184 } else if (pure_unified_layout(ops) &&
185 strcmp(controller, "devices") == 0) {
186 if (ops->unified->bpf_device_controller)
187 return ops->unified;
188 break;
d6337a5f
CB
189 }
190
27a5132c 191 if (string_in_list(ops->hierarchies[i]->controllers, controller))
2202afc9 192 return ops->hierarchies[i];
ccb4cabe 193 }
d6337a5f 194
27a5132c
CB
195 if (controller)
196 WARN("There is no useable %s controller", controller);
197 else
198 WARN("There is no empty unified cgroup hierarchy");
199
ccb4cabe
SH
200 return NULL;
201}
202
a54694f8
CB
#define BATCH_SIZE 50
/* Make sure the buffer *@mem can hold @newlen bytes. Memory is handed out in
 * multiples of BATCH_SIZE bytes, so a reallocation only happens when *@mem is
 * still NULL or when @newlen needs more batches than @oldlen did.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
213
/* Append the @newlen bytes of @new plus one trailing byte (the terminating NUL
 * of @new) to the buffer *@dest, which currently holds @oldlen bytes. The
 * buffer is grown via batch_realloc() as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
222
223/* Slurp in a whole file */
d6337a5f 224static char *read_file(const char *fnam)
a54694f8 225{
d97919ab
CB
226 __do_free char *line = NULL;
227 __do_fclose FILE *f = NULL;
a54694f8 228 int linelen;
d97919ab
CB
229 char *buf = NULL;
230 size_t len = 0, fulllen = 0;
a54694f8
CB
231
232 f = fopen(fnam, "r");
233 if (!f)
234 return NULL;
235 while ((linelen = getline(&line, &len, f)) != -1) {
236 append_line(&buf, fulllen, line, linelen);
237 fulllen += linelen;
238 }
a54694f8
CB
239 return buf;
240}
241
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Bit helpers for bit arrays stored as arrays of uint32_t words. Bit @bit
 * lives in word @bit / NBITS at position @bit % NBITS. The caller must make
 * sure @bitarr has at least BITS_TO_LONGS(@bit + 1) words.
 *
 * The mask is built from an unsigned 32-bit one: shifting a signed int 1 into
 * bit 31 is undefined behaviour.
 */

/* Set bit @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
261
262/* Create cpumask from cpulist aka turn:
263 *
264 * 0,2-3
265 *
d5d468f6 266 * into bit array
a54694f8
CB
267 *
268 * 1 0 1 1
269 */
270static uint32_t *lxc_cpumask(char *buf, size_t nbits)
271{
272 char *token;
d5d468f6
CB
273 size_t arrlen;
274 uint32_t *bitarr;
d5d468f6
CB
275
276 arrlen = BITS_TO_LONGS(nbits);
277 bitarr = calloc(arrlen, sizeof(uint32_t));
a54694f8
CB
278 if (!bitarr)
279 return NULL;
280
0be0d78f 281 lxc_iterate_parts(token, buf, ",") {
a54694f8 282 errno = 0;
d5d468f6
CB
283 unsigned end, start;
284 char *range;
a54694f8 285
d5d468f6
CB
286 start = strtoul(token, NULL, 0);
287 end = start;
288 range = strchr(token, '-');
a54694f8
CB
289 if (range)
290 end = strtoul(range + 1, NULL, 0);
d5d468f6 291
a54694f8
CB
292 if (!(start <= end)) {
293 free(bitarr);
294 return NULL;
295 }
296
297 if (end >= nbits) {
298 free(bitarr);
299 return NULL;
300 }
301
302 while (start <= end)
303 set_bit(start++, bitarr);
304 }
305
306 return bitarr;
307}
308
a54694f8
CB
309/* Turn cpumask into simple, comma-separated cpulist. */
310static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
311{
a54694f8 312 int ret;
414c6719 313 size_t i;
24cac6af 314 char *tmp = NULL;
a54694f8 315 char **cpulist = NULL;
c19ad94b 316 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
a54694f8
CB
317
318 for (i = 0; i <= nbits; i++) {
414c6719
CB
319 if (!is_set(i, bitarr))
320 continue;
321
979a0d93
CB
322 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
323 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
414c6719
CB
324 lxc_free_array((void **)cpulist, free);
325 return NULL;
326 }
327
328 ret = lxc_append_string(&cpulist, numstr);
329 if (ret < 0) {
330 lxc_free_array((void **)cpulist, free);
331 return NULL;
a54694f8
CB
332 }
333 }
414c6719
CB
334
335 if (!cpulist)
336 return NULL;
337
24cac6af
L
338 tmp = lxc_string_join(",", (const char **)cpulist, false);
339 lxc_free_array((void **)cpulist, free);
340
341 return tmp;
a54694f8
CB
342}
343
/* Return the last number that appears in the cpulist @cpulist, e.g. 3 for
 * "0,2-3" and 7 for "0-3,7". The number is the one following the last ','
 * or '-' in the string, or the whole string if neither separator occurs.
 * For a sorted cpulist this is the highest cpu index.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *comma, *dash;
	char *last = cpulist;
	size_t cpus = 0;

	comma = strrchr(cpulist, ',');
	dash = strrchr(cpulist, '-');

	/* Pick whichever separator comes last. The two pointers are only
	 * compared when both point into @cpulist, never against NULL.
	 */
	if (comma && (!dash || comma > dash))
		last = comma + 1;
	else if (dash)
		last = dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
374
6f9584d8 375#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
36f70181 376#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
a3926f6a 377static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
a54694f8 378{
d97919ab 379 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
36f70181
CB
380 *offlinecpus = NULL, *posscpus = NULL;
381 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
382 *possmask = NULL;
a54694f8
CB
383 int ret;
384 ssize_t i;
d97919ab 385 char oldv;
7717e175 386 char *lastslash;
36f70181 387 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
6f9584d8 388 bool bret = false, flipped_bit = false;
a54694f8
CB
389
390 lastslash = strrchr(path, '/');
59ac3b88
CB
391 if (!lastslash) {
392 ERROR("Failed to detect \"/\" in \"%s\"", path);
a54694f8
CB
393 return bret;
394 }
395 oldv = *lastslash;
396 *lastslash = '\0';
397 fpath = must_make_path(path, "cpuset.cpus", NULL);
f68ea354 398 *lastslash = oldv;
a54694f8 399 posscpus = read_file(fpath);
6f9584d8 400 if (!posscpus) {
59ac3b88 401 SYSERROR("Failed to read file \"%s\"", fpath);
d97919ab 402 return false;
6f9584d8 403 }
a54694f8
CB
404
405 /* Get maximum number of cpus found in possible cpuset. */
406 maxposs = get_max_cpus(posscpus);
92d5ea57 407 if (maxposs < 0 || maxposs >= INT_MAX - 1)
d97919ab 408 return false;
a54694f8 409
36f70181
CB
410 if (file_exists(__ISOL_CPUS)) {
411 isolcpus = read_file(__ISOL_CPUS);
412 if (!isolcpus) {
413 SYSERROR("Failed to read file \"%s\"", __ISOL_CPUS);
414 return false;
65d29cbc 415 }
6f9584d8 416
36f70181
CB
417 if (isdigit(isolcpus[0])) {
418 /* Get maximum number of cpus found in isolated cpuset. */
419 maxisol = get_max_cpus(isolcpus);
420 if (maxisol < 0 || maxisol >= INT_MAX - 1)
421 return false;
6f9584d8 422 }
36f70181
CB
423
424 if (maxposs < maxisol)
425 maxposs = maxisol;
426 maxposs++;
427 } else {
428 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
a54694f8
CB
429 }
430
36f70181
CB
431 if (file_exists(__OFFLINE_CPUS)) {
432 offlinecpus = read_file(__OFFLINE_CPUS);
433 if (!offlinecpus) {
434 SYSERROR("Failed to read file \"%s\"", __OFFLINE_CPUS);
435 return false;
436 }
437
438 if (isdigit(offlinecpus[0])) {
439 /* Get maximum number of cpus found in offline cpuset. */
440 maxoffline = get_max_cpus(offlinecpus);
441 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
442 return false;
443 }
444
445 if (maxposs < maxoffline)
446 maxposs = maxoffline;
447 maxposs++;
448 } else {
449 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
450 }
a54694f8 451
dcd14a3d
CB
452 if ((maxisol == 0) && (maxoffline == 0)) {
453 cpulist = move_ptr(posscpus);
36f70181 454 goto copy_parent;
dcd14a3d 455 }
a54694f8
CB
456
457 possmask = lxc_cpumask(posscpus, maxposs);
6f9584d8 458 if (!possmask) {
59ac3b88 459 ERROR("Failed to create cpumask for possible cpus");
d97919ab 460 return false;
6f9584d8 461 }
a54694f8 462
36f70181
CB
463 if (maxisol > 0) {
464 isolmask = lxc_cpumask(isolcpus, maxposs);
465 if (!isolmask) {
466 ERROR("Failed to create cpumask for isolated cpus");
467 return false;
468 }
469 }
470
471 if (maxoffline > 0) {
472 offlinemask = lxc_cpumask(offlinecpus, maxposs);
473 if (!offlinemask) {
474 ERROR("Failed to create cpumask for offline cpus");
475 return false;
476 }
6f9584d8 477 }
a54694f8
CB
478
479 for (i = 0; i <= maxposs; i++) {
36f70181
CB
480 if ((isolmask && !is_set(i, isolmask)) ||
481 (offlinemask && !is_set(i, offlinemask)) ||
482 !is_set(i, possmask))
59ac3b88
CB
483 continue;
484
485 flipped_bit = true;
486 clear_bit(i, possmask);
a54694f8
CB
487 }
488
6f9584d8 489 if (!flipped_bit) {
b31d62b8
CB
490 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
491 TRACE("No isolated or offline cpus present in cpuset");
492 } else {
493 cpulist = move_ptr(posscpus);
494 TRACE("Removed isolated or offline cpus from cpuset");
6f9584d8 495 }
6f9584d8 496 if (!cpulist) {
59ac3b88 497 ERROR("Failed to create cpu list");
d97919ab 498 return false;
6f9584d8 499 }
a54694f8
CB
500
501copy_parent:
36f70181 502 if (!am_initialized) {
36f70181
CB
503 fpath = must_make_path(path, "cpuset.cpus", NULL);
504 ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false,
505 0666);
36f70181
CB
506 if (ret < 0) {
507 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
508 return false;
509 }
510
511 TRACE("Copied cpu settings of parent cgroup");
6f9584d8
CB
512 }
513
d97919ab 514 return true;
a54694f8
CB
515}
516
e3a3fecf
SH
517/* Copy contents of parent(@path)/@file to @path/@file */
518static bool copy_parent_file(char *path, char *file)
519{
d97919ab 520 __do_free char *child_path = NULL, *parent_path = NULL, *value = NULL;
e3a3fecf 521 int ret;
d97919ab 522 char oldv;
b095a8eb 523 int len = 0;
d97919ab 524 char *lastslash = NULL;
e3a3fecf
SH
525
526 lastslash = strrchr(path, '/');
b095a8eb
CB
527 if (!lastslash) {
528 ERROR("Failed to detect \"/\" in \"%s\"", path);
e3a3fecf
SH
529 return false;
530 }
531 oldv = *lastslash;
532 *lastslash = '\0';
d97919ab
CB
533 parent_path = must_make_path(path, file, NULL);
534 len = lxc_read_from_file(parent_path, NULL, 0);
b53a0853
CB
535 if (len <= 0) {
536 SYSERROR("Failed to determine buffer size");
537 return false;
538 }
b095a8eb 539
f25a2044 540 value = must_realloc(NULL, len + 1);
d97919ab 541 ret = lxc_read_from_file(parent_path, value, len);
b53a0853
CB
542 if (ret != len) {
543 SYSERROR("Failed to read from parent file \"%s\"", parent_path);
544 return false;
545 }
b095a8eb 546
e3a3fecf 547 *lastslash = oldv;
d97919ab
CB
548 child_path = must_make_path(path, file, NULL);
549 ret = lxc_write_to_file(child_path, value, len, false, 0666);
e3a3fecf 550 if (ret < 0)
d97919ab 551 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, child_path);
e3a3fecf 552 return ret >= 0;
e3a3fecf
SH
553}
554
7793add3
CB
555/* Initialize the cpuset hierarchy in first directory of @gname and set
556 * cgroup.clone_children so that children inherit settings. Since the
557 * h->base_path is populated by init or ourselves, we know it is already
558 * initialized.
e3a3fecf 559 */
a3926f6a 560static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf 561{
d97919ab 562 __do_free char *cgpath = NULL, *clonechildrenpath = NULL;
7793add3
CB
563 int ret;
564 char v;
d97919ab 565 char *slash;
e3a3fecf
SH
566
567 if (!string_in_list(h->controllers, "cpuset"))
568 return true;
569
570 if (*cgname == '/')
571 cgname++;
572 slash = strchr(cgname, '/');
573 if (slash)
574 *slash = '\0';
575
bb221ad1 576 cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
e3a3fecf
SH
577 if (slash)
578 *slash = '/';
7793add3
CB
579
580 ret = mkdir(cgpath, 0755);
581 if (ret < 0) {
582 if (errno != EEXIST) {
583 SYSERROR("Failed to create directory \"%s\"", cgpath);
7793add3
CB
584 return false;
585 }
e3a3fecf 586 }
6f9584d8 587
f8390327 588 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c 589 /* unified hierarchy doesn't have clone_children */
d97919ab 590 if (!file_exists(clonechildrenpath))
e3a3fecf 591 return true;
7793add3
CB
592
593 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
594 if (ret < 0) {
595 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
e3a3fecf
SH
596 return false;
597 }
598
a54694f8 599 /* Make sure any isolated cpus are removed from cpuset.cpus. */
a3926f6a 600 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
7793add3 601 SYSERROR("Failed to remove isolated cpus");
a54694f8 602 return false;
6f9584d8 603 }
a54694f8 604
7793add3 605 /* Already set for us by someone else. */
b28c2810
CB
606 if (v == '1')
607 TRACE("\"cgroup.clone_children\" was already set to \"1\"");
e3a3fecf
SH
608
609 /* copy parent's settings */
a54694f8 610 if (!copy_parent_file(cgpath, "cpuset.mems")) {
7793add3 611 SYSERROR("Failed to copy \"cpuset.mems\" settings");
e3a3fecf
SH
612 return false;
613 }
e3a3fecf 614
7cea5905 615 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
7793add3 616 if (ret < 0) {
e3a3fecf 617 /* Set clone_children so children inherit our settings */
7793add3 618 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
e3a3fecf
SH
619 return false;
620 }
d97919ab 621
e3a3fecf
SH
622 return true;
623}
624
5c0089ae
CB
/* Return true if at least one string occurs in both NULL-terminated string
 * lists @l1 and @l2. A NULL list never intersects with anything.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (l1 == NULL || l2 == NULL)
		return false;

	for (cur = l1; *cur != NULL; cur++)
		if (string_in_list(l2, *cur))
			return true;

	return false;
}
642
258449e5
CB
643/* For a null-terminated list of controllers @clist, return true if any of those
644 * controllers is already listed the null-terminated list of hierarchies @hlist.
645 * Realistically, if one is present, all must be present.
ccb4cabe
SH
646 */
647static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
648{
649 int i;
650
651 if (!hlist)
652 return false;
258449e5 653
ccb4cabe
SH
654 for (i = 0; hlist[i]; i++)
655 if (controller_lists_intersect(hlist[i]->controllers, clist))
656 return true;
ccb4cabe 657
258449e5 658 return false;
ccb4cabe
SH
659}
660
f57ac67f
CB
661/* Return true if the controller @entry is found in the null-terminated list of
662 * hierarchies @hlist.
ccb4cabe
SH
663 */
664static bool controller_found(struct hierarchy **hlist, char *entry)
665{
666 int i;
d6337a5f 667
ccb4cabe
SH
668 if (!hlist)
669 return false;
670
671 for (i = 0; hlist[i]; i++)
672 if (string_in_list(hlist[i]->controllers, entry))
673 return true;
d6337a5f 674
ccb4cabe
SH
675 return false;
676}
677
e1c27ab0
CB
678/* Return true if all of the controllers which we require have been found. The
679 * required list is freezer and anything in lxc.cgroup.use.
ccb4cabe 680 */
2202afc9 681static bool all_controllers_found(struct cgroup_ops *ops)
ccb4cabe 682{
b7b18fc5 683 char **cur;
2202afc9 684 struct hierarchy **hlist = ops->hierarchies;
ccb4cabe 685
2202afc9 686 if (!ops->cgroup_use)
ccb4cabe 687 return true;
c2712f64 688
b7b18fc5
CB
689 for (cur = ops->cgroup_use; cur && *cur; cur++)
690 if (!controller_found(hlist, *cur)) {
691 ERROR("No %s controller mountpoint found", *cur);
ccb4cabe
SH
692 return false;
693 }
c2712f64 694
ccb4cabe
SH
695 return true;
696}
697
f205f10c
CB
698/* Get the controllers from a mountinfo line There are other ways we could get
699 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
700 * could parse the mount options. But we simply assume that the mountpoint must
701 * be /sys/fs/cgroup/controller-list
ccb4cabe 702 */
a3926f6a
CB
703static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
704 int type)
ccb4cabe 705{
f205f10c
CB
706 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
707 * for legacy hierarchies.
708 */
ccb4cabe 709 int i;
d97919ab 710 char *p2, *tok;
0be0d78f 711 char *p = line, *sep = ",";
411ac6d8 712 char **aret = NULL;
6328fd9c 713
ccb4cabe 714 for (i = 0; i < 4; i++) {
235f1815 715 p = strchr(p, ' ');
ccb4cabe
SH
716 if (!p)
717 return NULL;
718 p++;
719 }
a55f31bd 720
f205f10c
CB
721 /* Note, if we change how mountinfo works, then our caller will need to
722 * verify /sys/fs/cgroup/ in this field.
723 */
dca9587a
CB
724 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) {
725 ERROR("Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
ccb4cabe 726 return NULL;
5059aae9 727 }
d6337a5f 728
ccb4cabe 729 p += 15;
235f1815 730 p2 = strchr(p, ' ');
ccb4cabe 731 if (!p2) {
2202afc9 732 ERROR("Corrupt mountinfo");
ccb4cabe
SH
733 return NULL;
734 }
735 *p2 = '\0';
6328fd9c 736
d6337a5f 737 if (type == CGROUP_SUPER_MAGIC) {
88396101 738 __do_free char *dup = NULL;
d97919ab 739
0be0d78f
CB
740 /* strdup() here for v1 hierarchies. Otherwise
741 * lxc_iterate_parts() will destroy mountpoints such as
742 * "/sys/fs/cgroup/cpu,cpuacct".
d6337a5f 743 */
d97919ab 744 dup = must_copy_string(p);
d6337a5f
CB
745 if (!dup)
746 return NULL;
747
d97919ab 748 lxc_iterate_parts (tok, dup, sep)
d6337a5f 749 must_append_controller(klist, nlist, &aret, tok);
411ac6d8 750 }
d6337a5f 751 *p2 = ' ';
f205f10c 752
d6337a5f
CB
753 return aret;
754}
411ac6d8 755
d6337a5f
CB
/* Allocate a controller list that exists but has no entries, i.e. whose first
 * element is already NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **list = NULL;
	int slot;

	slot = append_null_to_list((void ***)&list);
	list[slot] = NULL;

	return list;
}
765
766static char **cg_unified_get_controllers(const char *file)
767{
d97919ab 768 __do_free char *buf = NULL;
0be0d78f 769 char *sep = " \t\n";
d6337a5f 770 char **aret = NULL;
2a63b5cb 771 char *tok;
d6337a5f
CB
772
773 buf = read_file(file);
774 if (!buf)
411ac6d8 775 return NULL;
6328fd9c 776
0be0d78f 777 lxc_iterate_parts(tok, buf, sep) {
d6337a5f
CB
778 int newentry;
779 char *copy;
780
781 newentry = append_null_to_list((void ***)&aret);
782 copy = must_copy_string(tok);
783 aret[newentry] = copy;
ccb4cabe
SH
784 }
785
786 return aret;
787}
788
2202afc9 789static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
bb221ad1 790 char *container_base_path, int type)
ccb4cabe
SH
791{
792 struct hierarchy *new;
793 int newentry;
794
f25a2044 795 new = must_realloc(NULL, sizeof(*new));
ccb4cabe
SH
796 new->controllers = clist;
797 new->mountpoint = mountpoint;
bb221ad1 798 new->container_base_path = container_base_path;
eb697136 799 new->container_full_path = NULL;
e09b62f9 800 new->monitor_full_path = NULL;
d6337a5f 801 new->version = type;
a6ca2ed8 802 new->cgroup2_chown = NULL;
6328fd9c 803
2202afc9
CB
804 newentry = append_null_to_list((void ***)h);
805 (*h)[newentry] = new;
d6337a5f 806 return new;
ccb4cabe
SH
807}
808
798c3b33
CB
809/* Get a copy of the mountpoint from @line, which is a line from
810 * /proc/self/mountinfo.
ccb4cabe 811 */
a3926f6a 812static char *cg_hybrid_get_mountpoint(char *line)
ccb4cabe
SH
813{
814 int i;
ccb4cabe 815 size_t len;
798c3b33
CB
816 char *p2;
817 char *p = line, *sret = NULL;
ccb4cabe
SH
818
819 for (i = 0; i < 4; i++) {
235f1815 820 p = strchr(p, ' ');
ccb4cabe
SH
821 if (!p)
822 return NULL;
823 p++;
824 }
d6337a5f 825
dca9587a 826 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
d6337a5f
CB
827 return NULL;
828
829 p2 = strchr(p + 15, ' ');
830 if (!p2)
831 return NULL;
832 *p2 = '\0';
833
ccb4cabe 834 len = strlen(p);
f25a2044 835 sret = must_realloc(NULL, len + 1);
ccb4cabe
SH
836 memcpy(sret, p, len);
837 sret[len] = '\0';
838 return sret;
839}
840
/* Given a multi-line string, return a null-terminated copy of the current
 * line, without its newline. Returns NULL if @p contains no newline.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t line_len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	line_len = newline - p;
	copy = must_realloc(NULL, line_len + 1);
	memcpy(copy, p, line_len);
	copy[line_len] = '\0';

	return copy;
}
856
bced39de
CB
857/* cgline: pointer to character after the first ':' in a line in a \n-terminated
858 * /proc/self/cgroup file. Check whether controller c is present.
ccb4cabe
SH
859 */
860static bool controller_in_clist(char *cgline, char *c)
861{
d97919ab
CB
862 __do_free char *tmp = NULL;
863 char *tok, *eol;
ccb4cabe
SH
864 size_t len;
865
235f1815 866 eol = strchr(cgline, ':');
ccb4cabe
SH
867 if (!eol)
868 return false;
869
870 len = eol - cgline;
861cb8c2 871 tmp = must_realloc(NULL, len + 1);
ccb4cabe
SH
872 memcpy(tmp, cgline, len);
873 tmp[len] = '\0';
874
d97919ab
CB
875 lxc_iterate_parts(tok, tmp, ",")
876 if (strcmp(tok, c) == 0)
ccb4cabe 877 return true;
d6337a5f 878
ccb4cabe
SH
879 return false;
880}
881
c3ef912e
CB
882/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
883 * @controller.
ccb4cabe 884 */
c3ef912e
CB
885static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
886 int type)
ccb4cabe
SH
887{
888 char *p = basecginfo;
6328fd9c 889
d6337a5f
CB
890 for (;;) {
891 bool is_cgv2_base_cgroup = false;
892
6328fd9c 893 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
894 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
895 is_cgv2_base_cgroup = true;
ccb4cabe 896
235f1815 897 p = strchr(p, ':');
ccb4cabe
SH
898 if (!p)
899 return NULL;
900 p++;
d6337a5f
CB
901
902 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 903 p = strchr(p, ':');
ccb4cabe
SH
904 if (!p)
905 return NULL;
906 p++;
907 return copy_to_eol(p);
908 }
909
235f1815 910 p = strchr(p, '\n');
ccb4cabe
SH
911 if (!p)
912 return NULL;
913 p++;
914 }
915}
916
ccb4cabe
SH
/* Append a copy of @entry to the NULL-terminated string list *@list. *@list
 * may be NULL, in which case the list is created.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
926
d6337a5f 927static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 928{
d97919ab
CB
929 __do_free char *line = NULL;
930 __do_fclose FILE *f = NULL;
ccb4cabe
SH
931 size_t len = 0;
932
d6337a5f
CB
933 f = fopen("/proc/self/cgroup", "r");
934 if (!f)
935 return -1;
936
ccb4cabe 937 while (getline(&line, &len, f) != -1) {
0be0d78f 938 char *p, *p2, *tok;
235f1815 939 p = strchr(line, ':');
ccb4cabe
SH
940 if (!p)
941 continue;
942 p++;
235f1815 943 p2 = strchr(p, ':');
ccb4cabe
SH
944 if (!p2)
945 continue;
946 *p2 = '\0';
ff8d6ee9 947
6328fd9c
CB
948 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
949 * contains an entry of the form:
ff8d6ee9
CB
950 *
951 * 0::/some/path
952 *
6328fd9c 953 * In this case we use "cgroup2" as controller name.
ff8d6ee9 954 */
6328fd9c
CB
955 if ((p2 - p) == 0) {
956 must_append_string(klist, "cgroup2");
ff8d6ee9 957 continue;
6328fd9c 958 }
ff8d6ee9 959
0be0d78f 960 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
961 if (strncmp(tok, "name=", 5) == 0)
962 must_append_string(nlist, tok);
963 else
964 must_append_string(klist, tok);
965 }
966 }
967
d6337a5f 968 return 0;
ccb4cabe
SH
969}
970
/* Strip trailing newlines from @s in place. The first character of @s is never
 * removed, so a string consisting of a single newline is left as it is.
 */
static void trim(char *s)
{
	size_t end;

	for (end = strlen(s); end > 1 && s[end - 1] == '\n'; end--)
		s[end - 1] = '\0';
}
979
2202afc9 980static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
ccb4cabe
SH
981{
982 int i;
27d84737 983 struct hierarchy **it;
41c33dbe 984
2202afc9
CB
985 if (!ops->hierarchies) {
986 TRACE(" No hierarchies found");
ccb4cabe
SH
987 return;
988 }
27d84737 989
2202afc9
CB
990 TRACE(" Hierarchies:");
991 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
ccb4cabe 992 int j;
27d84737
CB
993 char **cit;
994
bb221ad1 995 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
2202afc9
CB
996 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
997 TRACE(" controllers:");
a7b0cc4c 998 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
2202afc9 999 TRACE(" %d: %s", j, *cit);
ccb4cabe
SH
1000 }
1001}
41c33dbe 1002
a3926f6a
CB
/* Log @basecginfo followed by the kernel subsystems in @klist and the named
 * subsystems in @nlist at TRACE level. Both lists may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	char **cur;
	int idx;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	idx = 0;
	for (cur = klist; cur && *cur; cur++)
		TRACE("kernel subsystem %d: %s", idx++, *cur);

	idx = 0;
	for (cur = nlist; cur && *cur; cur++)
		TRACE("named subsystem %d: %s", idx++, *cur);
}
ccb4cabe 1018
2202afc9
CB
1019static int cgroup_rmdir(struct hierarchy **hierarchies,
1020 const char *container_cgroup)
c71d83e1 1021{
2202afc9 1022 int i;
d6337a5f 1023
2202afc9
CB
1024 if (!container_cgroup || !hierarchies)
1025 return 0;
d6337a5f 1026
2202afc9
CB
1027 for (i = 0; hierarchies[i]; i++) {
1028 int ret;
1029 struct hierarchy *h = hierarchies[i];
d6337a5f 1030
eb697136 1031 if (!h->container_full_path)
2202afc9
CB
1032 continue;
1033
eb697136 1034 ret = recursive_destroy(h->container_full_path);
2202afc9 1035 if (ret < 0)
eb697136 1036 WARN("Failed to destroy \"%s\"", h->container_full_path);
2202afc9 1037
eb697136
CB
1038 free(h->container_full_path);
1039 h->container_full_path = NULL;
2202afc9 1040 }
d6337a5f 1041
c71d83e1 1042 return 0;
d6337a5f
CB
1043}
1044
2202afc9
CB
/* Argument bundle for callbacks that take a single void pointer, such as
 * cgroup_rmdir_wrapper().
 */
struct generic_userns_exec_data {
	/* NULL-terminated list of hierarchies to operate on */
	struct hierarchy **hierarchies;
	/* cgroup of the container */
	const char *container_cgroup;
	/* container configuration; provides the id mapping information */
	struct lxc_conf *conf;
	/* target uid in parent namespace */
	uid_t origuid;
	/* NOTE(review): not used by the wrappers visible here - confirm use */
	char *path;
};
d6337a5f 1052
2202afc9
CB
1053static int cgroup_rmdir_wrapper(void *data)
1054{
1055 int ret;
1056 struct generic_userns_exec_data *arg = data;
1057 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1058 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
d6337a5f 1059
2202afc9
CB
1060 ret = setresgid(nsgid, nsgid, nsgid);
1061 if (ret < 0) {
1062 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1063 (int)nsgid, (int)nsgid);
1064 return -1;
1065 }
d6337a5f 1066
2202afc9
CB
1067 ret = setresuid(nsuid, nsuid, nsuid);
1068 if (ret < 0) {
1069 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1070 (int)nsuid, (int)nsuid);
1071 return -1;
1072 }
d6337a5f 1073
2202afc9
CB
1074 ret = setgroups(0, NULL);
1075 if (ret < 0 && errno != EPERM) {
1076 SYSERROR("Failed to setgroups(0, NULL)");
1077 return -1;
1078 }
d6337a5f 1079
2202afc9 1080 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
d6337a5f
CB
1081}
1082
434c8e15
CB
1083__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
1084 struct lxc_handler *handler)
d6337a5f
CB
1085{
1086 int ret;
2202afc9 1087 struct generic_userns_exec_data wrap;
bd8ef4e4 1088
fc1c3af9
CB
1089 if (!ops)
1090 log_error_errno(return, ENOENT, "Called with uninitialized cgroup operations");
1091
69b4a4bb
CB
1092 if (!ops->hierarchies)
1093 return;
1094
fc1c3af9
CB
1095 if (!handler)
1096 log_error_errno(return, EINVAL, "Called with uninitialized handler");
1097
1098 if (!handler->conf)
1099 log_error_errno(return, EINVAL, "Called with uninitialized conf");
1100
4160c3a0 1101 wrap.origuid = 0;
2202afc9
CB
1102 wrap.container_cgroup = ops->container_cgroup;
1103 wrap.hierarchies = ops->hierarchies;
1104 wrap.conf = handler->conf;
4160c3a0 1105
bf651989
CB
1106#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
1107 ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
1108 if (ret < 0)
1109 WARN("Failed to detach bpf program from cgroup");
1110#endif
1111
2202afc9
CB
1112 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1113 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
bd8ef4e4 1114 "cgroup_rmdir_wrapper");
ccb4cabe 1115 else
2202afc9 1116 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
bd8ef4e4
CB
1117 if (ret < 0) {
1118 WARN("Failed to destroy cgroups");
ccb4cabe 1119 return;
ccb4cabe 1120 }
ccb4cabe
SH
1121}
1122
434c8e15
CB
1123__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1124 struct lxc_handler *handler)
1125{
1126 int len;
434c8e15 1127 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
b376d3d0
CB
1128 struct lxc_conf *conf;
1129
1130 if (!ops)
1131 log_error_errno(return, ENOENT, "Called with uninitialized cgroup operations");
434c8e15
CB
1132
1133 if (!ops->hierarchies)
1134 return;
1135
b376d3d0
CB
1136 if (!handler)
1137 log_error_errno(return, EINVAL, "Called with uninitialized handler");
1138
1139 if (!handler->conf)
1140 log_error_errno(return, EINVAL, "Called with uninitialized conf");
1141
1142 conf = handler->conf;
1143
434c8e15
CB
1144 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1145 if (len < 0 || (size_t)len >= sizeof(pidstr))
1146 return;
1147
1148 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1149 __do_free char *pivot_path = NULL;
434c8e15 1150 int ret;
23e5c045 1151 char *chop;
ecedb5de 1152 char pivot_cgroup[] = PIVOT_CGROUP;
434c8e15
CB
1153 struct hierarchy *h = ops->hierarchies[i];
1154
1155 if (!h->monitor_full_path)
1156 continue;
1157
1158 if (conf && conf->cgroup_meta.dir)
1159 pivot_path = must_make_path(h->mountpoint,
1160 h->container_base_path,
1161 conf->cgroup_meta.dir,
625ad37b 1162 PIVOT_CGROUP,
434c8e15
CB
1163 "cgroup.procs", NULL);
1164 else
1165 pivot_path = must_make_path(h->mountpoint,
1166 h->container_base_path,
625ad37b 1167 PIVOT_CGROUP,
434c8e15
CB
1168 "cgroup.procs", NULL);
1169
23e5c045
CB
1170 chop = strrchr(pivot_path, '/');
1171 if (chop)
1172 *chop = '\0';
1173
ecedb5de
CB
1174 /*
1175 * Make sure not to pass in the ro string literal PIVOT_CGROUP
1176 * here.
1177 */
b376d3d0
CB
1178 if (!cg_legacy_handle_cpuset_hierarchy(h, pivot_cgroup))
1179 log_warn_errno(continue,
1180 errno, "Failed to handle legacy cpuset controller");
ecedb5de 1181
434c8e15 1182 ret = mkdir_p(pivot_path, 0755);
b376d3d0
CB
1183 if (ret < 0 && errno != EEXIST)
1184 log_warn_errno(continue, errno,
1185 "Failed to create cgroup \"%s\"\n",
1186 pivot_path);
434c8e15 1187
23e5c045
CB
1188 if (chop)
1189 *chop = '/';
1190
434c8e15
CB
1191 /* Move ourselves into the pivot cgroup to delete our own
1192 * cgroup.
1193 */
1194 ret = lxc_write_to_file(pivot_path, pidstr, len, false, 0666);
b376d3d0
CB
1195 if (ret != 0)
1196 log_warn_errno(continue, errno,
1197 "Failed to move monitor %s to \"%s\"\n",
1198 pidstr, pivot_path);
434c8e15
CB
1199
1200 ret = recursive_destroy(h->monitor_full_path);
1201 if (ret < 0)
1202 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
434c8e15
CB
1203 }
1204}
1205
6099dd5a
CB
1206static int mkdir_eexist_on_last(const char *dir, mode_t mode)
1207{
1208 const char *tmp = dir;
1209 const char *orig = dir;
1210 size_t orig_len;
1211
1212 orig_len = strlen(dir);
1213 do {
6453ba56 1214 __do_free char *makeme = NULL;
6099dd5a
CB
1215 int ret;
1216 size_t cur_len;
6099dd5a
CB
1217
1218 dir = tmp + strspn(tmp, "/");
1219 tmp = dir + strcspn(dir, "/");
1220
1221 errno = ENOMEM;
1222 cur_len = dir - orig;
1223 makeme = strndup(orig, cur_len);
1224 if (!makeme)
1225 return -1;
1226
1227 ret = mkdir(makeme, mode);
1228 if (ret < 0) {
1229 if ((errno != EEXIST) || (orig_len == cur_len)) {
1230 SYSERROR("Failed to create directory \"%s\"", makeme);
6099dd5a
CB
1231 return -1;
1232 }
1233 }
6099dd5a
CB
1234 } while (tmp != dir);
1235
1236 return 0;
1237}
1238
72068e74
CB
1239static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1240{
1241 int ret;
1242
ef185360
CB
1243 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1244 ERROR("Failed to handle legacy cpuset controller");
1245 return false;
1246 }
1247
72068e74 1248 h->monitor_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
6099dd5a
CB
1249 ret = mkdir_eexist_on_last(h->monitor_full_path, 0755);
1250 if (ret < 0) {
1251 ERROR("Failed to create cgroup \"%s\"", h->monitor_full_path);
ee455be4
CB
1252 return false;
1253 }
72068e74 1254
c581d2a6 1255 return true;
72068e74
CB
1256}
1257
1258static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
ccb4cabe 1259{
0c3deb94
CB
1260 int ret;
1261
ef185360
CB
1262 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1263 ERROR("Failed to handle legacy cpuset controller");
1264 return false;
1265 }
1266
bb221ad1 1267 h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
6099dd5a
CB
1268 ret = mkdir_eexist_on_last(h->container_full_path, 0755);
1269 if (ret < 0) {
1270 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
d8da679e 1271 return false;
6f9584d8 1272 }
0c3deb94 1273
c581d2a6 1274 return true;
ccb4cabe
SH
1275}
1276
72068e74 1277static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
ccb4cabe 1278{
e56639fb 1279 int ret;
72068e74
CB
1280 char *full_path;
1281
1282 if (monitor)
1283 full_path = h->monitor_full_path;
1284 else
1285 full_path = h->container_full_path;
e56639fb 1286
72068e74 1287 ret = rmdir(full_path);
e56639fb 1288 if (ret < 0)
72068e74
CB
1289 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", full_path);
1290
1291 free(full_path);
1292
1293 if (monitor)
1294 h->monitor_full_path = NULL;
1295 else
1296 h->container_full_path = NULL;
1297}
1298
b857f4be 1299__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
f2668eea 1300 struct lxc_handler *handler)
72068e74 1301{
d97919ab
CB
1302 __do_free char *monitor_cgroup = NULL;
1303 char *offset, *tmp;
ebc10afe 1304 int i, idx = 0;
5ce03bc0 1305 size_t len;
0d66e29a 1306 struct lxc_conf *conf;
72068e74 1307
0d66e29a
CB
1308 if (!ops)
1309 return ret_set_errno(false, ENOENT);
e56639fb 1310
69b4a4bb
CB
1311 if (!ops->hierarchies)
1312 return true;
1313
0d66e29a
CB
1314 if (ops->monitor_cgroup)
1315 return ret_set_errno(false, EEXIST);
1316
1317 if (!handler || !handler->conf)
1318 return ret_set_errno(false, EINVAL);
1319
1320 conf = handler->conf;
1321
72068e74 1322 if (conf->cgroup_meta.dir)
5ce03bc0
CB
1323 tmp = lxc_string_join("/",
1324 (const char *[]){conf->cgroup_meta.dir,
1325 ops->monitor_pattern,
1326 handler->name, NULL},
1327 false);
72068e74 1328 else
5ce03bc0
CB
1329 tmp = must_make_path(ops->monitor_pattern, handler->name, NULL);
1330 if (!tmp)
0d66e29a 1331 return ret_set_errno(false, ENOMEM);
72068e74 1332
5ce03bc0 1333 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
5407d095 1334 monitor_cgroup = must_realloc(tmp, len);
5ce03bc0 1335 offset = monitor_cgroup + len - 5;
5407d095 1336 *offset = 0;
5ce03bc0
CB
1337
1338 do {
0d66e29a
CB
1339 if (idx)
1340 sprintf(offset, "-%d", idx);
72068e74 1341
ebc10afe 1342 for (i = 0; ops->hierarchies[i]; i++) {
f2668eea
CB
1343 if (!monitor_create_path_for_hierarchy(ops->hierarchies[i],
1344 monitor_cgroup)) {
1345 ERROR("Failed to create cgroup \"%s\"",
1346 ops->hierarchies[i]->monitor_full_path);
5ce03bc0 1347 for (int j = 0; j < i; j++)
f2668eea
CB
1348 remove_path_for_hierarchy(ops->hierarchies[j],
1349 monitor_cgroup,
1350 true);
5ce03bc0
CB
1351
1352 idx++;
1353 break;
1354 }
1355 }
ebc10afe 1356 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
5ce03bc0 1357
d97919ab 1358 if (idx == 1000)
0d66e29a 1359 return ret_set_errno(false, ERANGE);
72068e74 1360
c581d2a6 1361 ops->monitor_cgroup = move_ptr(monitor_cgroup);
6e8703a4 1362 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
ccb4cabe
SH
1363}
1364
cecad0c1
CB
1365/* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1366 * next cgroup_pattern-1, -2, ..., -999.
ccb4cabe 1367 */
b857f4be 1368__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
6439f06e 1369 struct lxc_handler *handler)
ccb4cabe 1370{
d97919ab 1371 __do_free char *container_cgroup = NULL, *tmp = NULL;
bad788b0 1372 int i, ret;
ccb4cabe 1373 size_t len;
d97919ab 1374 char *offset;
7d531e9b 1375 int idx = 0;
2202afc9 1376 struct lxc_conf *conf = handler->conf;
ccb4cabe 1377
d97919ab 1378 if (ops->container_cgroup)
ccb4cabe 1379 return false;
43654d34 1380
2202afc9 1381 if (!conf)
ccb4cabe 1382 return false;
ccb4cabe 1383
69b4a4bb
CB
1384 if (!ops->hierarchies)
1385 return true;
1386
2202afc9 1387 if (conf->cgroup_meta.dir)
3ec12d39 1388 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
43654d34 1389 else
2202afc9 1390 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
ccb4cabe
SH
1391 if (!tmp) {
1392 ERROR("Failed expanding cgroup name pattern");
1393 return false;
1394 }
64e82f8b 1395
1a0e70ac 1396 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
f25a2044 1397 container_cgroup = must_realloc(NULL, len);
64e82f8b 1398 (void)strlcpy(container_cgroup, tmp, len);
0c3deb94 1399 offset = container_cgroup + len - 5;
ccb4cabe 1400
d97919ab 1401 do {
c74da4ab 1402 if (idx) {
bad788b0 1403 ret = snprintf(offset, 5, "-%d", idx);
c74da4ab
CB
1404 if (ret < 0 || (size_t)ret >= 5)
1405 return false;
1406 }
bb30b52a 1407
d97919ab 1408 for (i = 0; ops->hierarchies[i]; i++) {
d99d5c93
CB
1409 if (!container_create_path_for_hierarchy(ops->hierarchies[i],
1410 container_cgroup)) {
1411 ERROR("Failed to create cgroup \"%s\"",
1412 ops->hierarchies[i]->container_full_path);
d97919ab 1413 for (int j = 0; j < i; j++)
d99d5c93
CB
1414 remove_path_for_hierarchy(ops->hierarchies[j],
1415 container_cgroup,
1416 false);
d97919ab
CB
1417 idx++;
1418 break;
66b66624
CB
1419 }
1420 }
d97919ab 1421 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
cecad0c1 1422
d97919ab
CB
1423 if (idx == 1000)
1424 return false;
cecad0c1 1425
c74da4ab 1426 INFO("The container process uses \"%s\" as cgroup", container_cgroup);
e4edf5d7 1427 ops->container_cgroup = move_ptr(container_cgroup);
bad788b0
CB
1428
1429 if (ops->unified && ops->unified->container_full_path) {
1430 ret = open(ops->unified->container_full_path,
1431 O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1432 if (ret < 0)
1433 return log_error_errno(false,
1434 errno, "Failed to open file descriptor for unified hierarchy");
1435 ops->unified_fd = ret;
1436 }
1437
ccb4cabe 1438 return true;
ccb4cabe
SH
1439}
1440
c581d2a6
CB
1441__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1442 struct lxc_handler *handler)
ccb4cabe 1443{
c581d2a6
CB
1444 int monitor_len, transient_len;
1445 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1446 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1447
797fa65e
CB
1448 if (!ops)
1449 return ret_set_errno(false, ENOENT);
1450
69b4a4bb
CB
1451 if (!ops->hierarchies)
1452 return true;
1453
797fa65e
CB
1454 if (!ops->monitor_cgroup)
1455 return ret_set_errno(false, ENOENT);
1456
1457 if (!handler || !handler->conf)
1458 return ret_set_errno(false, EINVAL);
1459
c581d2a6
CB
1460 monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1461 if (handler->transient_pid > 0)
1462 transient_len = snprintf(transient, sizeof(transient), "%d",
1463 handler->transient_pid);
ccb4cabe 1464
eeef32bb 1465 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1466 __do_free char *path = NULL;
c581d2a6 1467 int ret;
08768001 1468
c581d2a6
CB
1469 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1470 "cgroup.procs", NULL);
1471 ret = lxc_writeat(-1, path, monitor, monitor_len);
1472 if (ret != 0)
1473 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
1474
1475 if (handler->transient_pid < 0)
1476 return true;
1477
1478 ret = lxc_writeat(-1, path, transient, transient_len);
1479 if (ret != 0)
1480 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
ccb4cabe 1481 }
c581d2a6 1482 handler->transient_pid = -1;
ccb4cabe
SH
1483
1484 return true;
1485}
1486
c581d2a6
CB
1487__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1488 struct lxc_handler *handler)
eeef32bb 1489{
c581d2a6
CB
1490 int len;
1491 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
eeef32bb 1492
c581d2a6
CB
1493 if (!ops->hierarchies)
1494 return true;
1495
1496 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1497
1498 for (int i = 0; ops->hierarchies[i]; i++) {
1499 __do_free char *path = NULL;
1500 int ret;
1501
1502 path = must_make_path(ops->hierarchies[i]->container_full_path,
1503 "cgroup.procs", NULL);
1504 ret = lxc_writeat(-1, path, pidstr, len);
1505 if (ret != 0)
1506 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
1507 }
1508
1509 return true;
eeef32bb
CB
1510}
1511
6efacf80
CB
/*
 * chown() @path to @chown_uid:@chown_gid and then chmod() it to @chmod_mode.
 *
 * Returns 0 when both calls succeed. On the first failure a warning is logged
 * and -1 is returned; chmod() is not attempted if chown() failed.
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	if (chown(path, chown_uid, chown_gid) < 0) {
		SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
		return -1;
	}

	return 0;
}
1531
1532/* chgrp the container cgroups to container group. We leave
c0888dfe
SH
1533 * the container owner as cgroup owner. So we must make the
1534 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1535 *
1536 * Also chown the tasks and cgroup.procs files. Those may not
1537 * exist depending on kernel version.
c0888dfe 1538 */
ccb4cabe
SH
1539static int chown_cgroup_wrapper(void *data)
1540{
6a720d74 1541 int ret;
4160c3a0
CB
1542 uid_t destuid;
1543 struct generic_userns_exec_data *arg = data;
1544 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1545 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1546
6efacf80 1547 ret = setresgid(nsgid, nsgid, nsgid);
803e4123
CB
1548 if (ret < 0)
1549 return log_error_errno(-1, errno,
1550 "Failed to setresgid(%d, %d, %d)",
1551 (int)nsgid, (int)nsgid, (int)nsgid);
6efacf80
CB
1552
1553 ret = setresuid(nsuid, nsuid, nsuid);
803e4123
CB
1554 if (ret < 0)
1555 return log_error_errno(-1, errno,
1556 "Failed to setresuid(%d, %d, %d)",
1557 (int)nsuid, (int)nsuid, (int)nsuid);
6efacf80
CB
1558
1559 ret = setgroups(0, NULL);
803e4123
CB
1560 if (ret < 0 && errno != EPERM)
1561 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
ccb4cabe
SH
1562
1563 destuid = get_ns_uid(arg->origuid);
b962868f
CB
1564 if (destuid == LXC_INVALID_UID)
1565 destuid = 0;
ccb4cabe 1566
6a720d74 1567 for (int i = 0; arg->hierarchies[i]; i++) {
d97919ab 1568 __do_free char *fullpath = NULL;
eb697136 1569 char *path = arg->hierarchies[i]->container_full_path;
43647298 1570
63e42fee 1571 ret = chowmod(path, destuid, nsgid, 0775);
6efacf80 1572 if (ret < 0)
803e4123
CB
1573 log_info_errno(continue,
1574 errno, "Failed to change %s to uid %d and gid %d and mode 0755",
1575 path, destuid, nsgid);
c0888dfe 1576
6efacf80
CB
1577 /* Failures to chown() these are inconvenient but not
1578 * detrimental We leave these owned by the container launcher,
1579 * so that container root can write to the files to attach. We
1580 * chmod() them 664 so that container systemd can write to the
1581 * files (which systemd in wily insists on doing).
ab8f5424 1582 */
6efacf80 1583
2202afc9 1584 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
6efacf80 1585 fullpath = must_make_path(path, "tasks", NULL);
803e4123
CB
1586 ret = chowmod(fullpath, destuid, nsgid, 0664);
1587 if (ret < 0)
1588 SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
1589 fullpath, destuid, nsgid);
6efacf80 1590 }
43647298
SH
1591
1592 fullpath = must_make_path(path, "cgroup.procs", NULL);
803e4123
CB
1593 ret = chowmod(fullpath, destuid, nsgid, 0664);
1594 if (ret < 0)
1595 SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
1596 fullpath, destuid, nsgid);
0e17357c 1597
2202afc9 1598 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1599 continue;
1600
a6ca2ed8
CB
1601 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++) {
1602 fullpath = must_make_path(path, *p, NULL);
803e4123
CB
1603 ret = chowmod(fullpath, destuid, nsgid, 0664);
1604 if (ret < 0)
1605 SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
1606 fullpath, destuid, nsgid);
a6ca2ed8 1607 }
ccb4cabe
SH
1608 }
1609
1610 return 0;
1611}
1612
b857f4be 1613__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
fb55e009 1614 struct lxc_conf *conf)
ccb4cabe 1615{
4160c3a0 1616 struct generic_userns_exec_data wrap;
ccb4cabe 1617
ccb4cabe
SH
1618 if (lxc_list_empty(&conf->id_map))
1619 return true;
1620
69b4a4bb
CB
1621 if (!ops->hierarchies)
1622 return true;
1623
ccb4cabe 1624 wrap.origuid = geteuid();
4160c3a0 1625 wrap.path = NULL;
2202afc9 1626 wrap.hierarchies = ops->hierarchies;
4160c3a0 1627 wrap.conf = conf;
ccb4cabe 1628
c9b7c33e
CB
1629 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1630 "chown_cgroup_wrapper") < 0) {
f7faba6c 1631 ERROR("Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1632 return false;
1633 }
1634
1635 return true;
1636}
1637
8aa1044f
SH
1638/* cgroup-full:* is done, no need to create subdirs */
1639static bool cg_mount_needs_subdirs(int type)
1640{
1641 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1642 return false;
a3926f6a 1643
8aa1044f
SH
1644 return true;
1645}
1646
886cac86
CB
1647/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1648 * remount controller ro if needed and bindmount the cgroupfs onto
25fa6f8c 1649 * control/the/cg/path.
8aa1044f 1650 */
6812d833
CB
1651static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1652 char *controllerpath, char *cgpath,
1653 const char *container_cgroup)
8aa1044f 1654{
d97919ab 1655 __do_free char *sourcepath = NULL;
5285689c 1656 int ret, remount_flags;
886cac86
CB
1657 int flags = MS_BIND;
1658
8aa1044f 1659 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
886cac86
CB
1660 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1661 if (ret < 0) {
1662 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1663 controllerpath, controllerpath);
8aa1044f
SH
1664 return -1;
1665 }
886cac86 1666
5285689c
CB
1667 remount_flags = add_required_remount_flags(controllerpath,
1668 controllerpath,
1669 flags | MS_REMOUNT);
886cac86 1670 ret = mount(controllerpath, controllerpath, "cgroup",
8186c5c7
CB
1671 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1672 NULL);
886cac86
CB
1673 if (ret < 0) {
1674 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
8aa1044f
SH
1675 return -1;
1676 }
886cac86 1677
8aa1044f
SH
1678 INFO("Remounted %s read-only", controllerpath);
1679 }
886cac86 1680
bb221ad1 1681 sourcepath = must_make_path(h->mountpoint, h->container_base_path,
886cac86 1682 container_cgroup, NULL);
8aa1044f
SH
1683 if (type == LXC_AUTO_CGROUP_RO)
1684 flags |= MS_RDONLY;
886cac86
CB
1685
1686 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1687 if (ret < 0) {
1688 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
8aa1044f
SH
1689 return -1;
1690 }
886cac86 1691 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
f8c40ffa
L
1692
1693 if (flags & MS_RDONLY) {
5285689c
CB
1694 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1695 flags | MS_REMOUNT);
1696 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
886cac86
CB
1697 if (ret < 0) {
1698 SYSERROR("Failed to remount \"%s\" ro", cgpath);
f8c40ffa
L
1699 return -1;
1700 }
5285689c 1701 INFO("Remounted %s read-only", cgpath);
f8c40ffa
L
1702 }
1703
886cac86 1704 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
8aa1044f
SH
1705 return 0;
1706}
1707
6812d833
CB
1708/* __cg_mount_direct
1709 *
1710 * Mount cgroup hierarchies directly without using bind-mounts. The main
1711 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1712 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1713 */
1714static int __cg_mount_direct(int type, struct hierarchy *h,
1715 const char *controllerpath)
b635e92d 1716{
d97919ab 1717 __do_free char *controllers = NULL;
a760603e
CB
1718 char *fstype = "cgroup2";
1719 unsigned long flags = 0;
f6b54668 1720 int ret;
b635e92d 1721
a760603e
CB
1722 flags |= MS_NOSUID;
1723 flags |= MS_NOEXEC;
1724 flags |= MS_NODEV;
1725 flags |= MS_RELATIME;
1726
1727 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1728 flags |= MS_RDONLY;
1729
d6337a5f 1730 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1731 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1732 if (!controllers)
1733 return -ENOMEM;
1734 fstype = "cgroup";
b635e92d
CB
1735 }
1736
a760603e 1737 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d 1738 if (ret < 0) {
6812d833 1739 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1740 return -1;
1741 }
1742
6812d833 1743 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1744 return 0;
1745}
1746
6812d833
CB
/*
 * Mount hierarchy @h at @controllerpath for a container that uses a cgroup
 * namespace. This is a direct mount for every @type.
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	int ret = __cg_mount_direct(type, h, controllerpath);

	return ret;
}
1752
1753static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1754 const char *controllerpath)
1755{
1756 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1757 return 0;
1758
1759 return __cg_mount_direct(type, h, controllerpath);
1760}
1761
b857f4be 1762__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
8d661d38
CB
1763 struct lxc_handler *handler,
1764 const char *root, int type)
ccb4cabe 1765{
6607d6e9 1766 __do_free char *cgroup_root = NULL;
dfa835ac 1767 int ret;
affd10fa 1768 bool has_cgns = false, retval = false, wants_force_mount = false;
8aa1044f 1769
69b4a4bb
CB
1770 if (!ops->hierarchies)
1771 return true;
1772
8aa1044f
SH
1773 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1774 return true;
1775
3f69fb12
SY
1776 if (type & LXC_AUTO_CGROUP_FORCE) {
1777 type &= ~LXC_AUTO_CGROUP_FORCE;
1778 wants_force_mount = true;
1779 }
b635e92d 1780
3f69fb12
SY
1781 if (!wants_force_mount){
1782 if (!lxc_list_empty(&handler->conf->keepcaps))
1783 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1784 else
1785 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1786 }
8aa1044f 1787
3f69fb12
SY
1788 has_cgns = cgns_supported();
1789 if (has_cgns && !wants_force_mount)
1790 return true;
8aa1044f
SH
1791
1792 if (type == LXC_AUTO_CGROUP_NOSPEC)
1793 type = LXC_AUTO_CGROUP_MIXED;
1794 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1795 type = LXC_AUTO_CGROUP_FULL_MIXED;
1796
dca9587a 1797 cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
8d661d38 1798 if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
8d661d38
CB
1799 if (has_cgns && wants_force_mount) {
1800 /* If cgroup namespaces are supported but the container
1801 * will not have CAP_SYS_ADMIN after it has started we
1802 * need to mount the cgroups manually.
1803 */
1804 return cg_mount_in_cgroup_namespace(type, ops->unified,
6607d6e9 1805 cgroup_root) == 0;
8d661d38
CB
1806 }
1807
6607d6e9 1808 return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
8d661d38
CB
1809 }
1810
1811 /* mount tmpfs */
6607d6e9 1812 ret = safe_mount(NULL, cgroup_root, "tmpfs",
3f69fb12
SY
1813 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1814 "size=10240k,mode=755", root);
1815 if (ret < 0)
1816 goto on_error;
8aa1044f 1817
dfa835ac 1818 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1819 __do_free char *controllerpath = NULL, *path2 = NULL;
2202afc9 1820 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1821 char *controller = strrchr(h->mountpoint, '/');
8aa1044f
SH
1822
1823 if (!controller)
1824 continue;
1825 controller++;
affd10fa 1826
6607d6e9 1827 controllerpath = must_make_path(cgroup_root, controller, NULL);
d97919ab 1828 if (dir_exists(controllerpath))
8aa1044f 1829 continue;
affd10fa 1830
3f69fb12
SY
1831 ret = mkdir(controllerpath, 0755);
1832 if (ret < 0) {
8aa1044f 1833 SYSERROR("Error creating cgroup path: %s", controllerpath);
3f69fb12 1834 goto on_error;
8aa1044f 1835 }
b635e92d 1836
3f69fb12 1837 if (has_cgns && wants_force_mount) {
b635e92d
CB
1838 /* If cgroup namespaces are supported but the container
1839 * will not have CAP_SYS_ADMIN after it has started we
1840 * need to mount the cgroups manually.
1841 */
3f69fb12 1842 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
3f69fb12
SY
1843 if (ret < 0)
1844 goto on_error;
1845
b635e92d
CB
1846 continue;
1847 }
1848
6812d833 1849 ret = cg_mount_cgroup_full(type, h, controllerpath);
d97919ab 1850 if (ret < 0)
3f69fb12 1851 goto on_error;
3f69fb12 1852
d97919ab 1853 if (!cg_mount_needs_subdirs(type))
8aa1044f 1854 continue;
3f69fb12 1855
bb221ad1 1856 path2 = must_make_path(controllerpath, h->container_base_path,
2202afc9 1857 ops->container_cgroup, NULL);
3f69fb12 1858 ret = mkdir_p(path2, 0755);
d97919ab 1859 if (ret < 0)
3f69fb12 1860 goto on_error;
2f62fb00 1861
6812d833 1862 ret = cg_legacy_mount_controllers(type, h, controllerpath,
2202afc9 1863 path2, ops->container_cgroup);
3f69fb12
SY
1864 if (ret < 0)
1865 goto on_error;
8aa1044f
SH
1866 }
1867 retval = true;
1868
3f69fb12 1869on_error:
8aa1044f 1870 return retval;
ccb4cabe
SH
1871}
1872
1873static int recursive_count_nrtasks(char *dirname)
1874{
d97919ab 1875 __do_free char *path = NULL;
88396101 1876 __do_closedir DIR *dir = NULL;
74f96976 1877 struct dirent *direntp;
ccb4cabe 1878 int count = 0, ret;
ccb4cabe
SH
1879
1880 dir = opendir(dirname);
1881 if (!dir)
1882 return 0;
1883
74f96976 1884 while ((direntp = readdir(dir))) {
ccb4cabe
SH
1885 struct stat mystat;
1886
ccb4cabe
SH
1887 if (!strcmp(direntp->d_name, ".") ||
1888 !strcmp(direntp->d_name, ".."))
1889 continue;
1890
1891 path = must_make_path(dirname, direntp->d_name, NULL);
1892
1893 if (lstat(path, &mystat))
d97919ab 1894 continue;
ccb4cabe
SH
1895
1896 if (!S_ISDIR(mystat.st_mode))
d97919ab 1897 continue;
ccb4cabe
SH
1898
1899 count += recursive_count_nrtasks(path);
ccb4cabe
SH
1900 }
1901
1902 path = must_make_path(dirname, "cgroup.procs", NULL);
1903 ret = lxc_count_file_lines(path);
1904 if (ret != -1)
1905 count += ret;
ccb4cabe
SH
1906
1907 return count;
1908}
1909
b857f4be 1910__cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
3135c5d4 1911{
d97919ab 1912 __do_free char *path = NULL;
ccb4cabe 1913
2202afc9 1914 if (!ops->container_cgroup || !ops->hierarchies)
ccb4cabe 1915 return -1;
a3926f6a 1916
eb697136 1917 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
3312a94f 1918 return recursive_count_nrtasks(path);
ccb4cabe
SH
1919}
1920
11c23867 1921/* Only root needs to escape to the cgroup of its init. */
b857f4be 1922__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
fb55e009 1923 struct lxc_conf *conf)
ccb4cabe 1924{
69b4a4bb 1925 if (conf->cgroup_meta.relative || geteuid() || !ops->hierarchies)
ccb4cabe
SH
1926 return true;
1927
779b3d82 1928 for (int i = 0; ops->hierarchies[i]; i++) {
11c23867 1929 int ret;
88396101 1930 __do_free char *fullpath = NULL;
11c23867 1931
2202afc9 1932 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
bb221ad1 1933 ops->hierarchies[i]->container_base_path,
11c23867 1934 "cgroup.procs", NULL);
7cea5905 1935 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
11c23867
CB
1936 if (ret != 0) {
1937 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
6df334d1 1938 return false;
ccb4cabe 1939 }
ccb4cabe
SH
1940 }
1941
6df334d1 1942 return true;
ccb4cabe
SH
1943}
1944
b857f4be 1945__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
36662416 1946{
69b4a4bb
CB
1947 int i = 0;
1948
1949 if (!ops->hierarchies)
1950 return 0;
36662416 1951
69b4a4bb 1952 for (; ops->hierarchies[i]; i++)
36662416
TA
1953 ;
1954
1955 return i;
1956}
1957
b857f4be 1958__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
36662416
TA
1959{
1960 int i;
1961
69b4a4bb
CB
1962 if (!ops->hierarchies)
1963 return false;
1964
36662416 1965 /* sanity check n */
6b38e644 1966 for (i = 0; i < n; i++)
2202afc9 1967 if (!ops->hierarchies[i])
36662416 1968 return false;
36662416 1969
2202afc9 1970 *out = ops->hierarchies[i]->controllers;
36662416
TA
1971
1972 return true;
1973}
1974
942e193e 1975
ee3a7775 1976static bool cg_legacy_freeze(struct cgroup_ops *ops)
ccb4cabe 1977{
ee3a7775 1978 __do_free char *path = NULL;
d6337a5f 1979 struct hierarchy *h;
ccb4cabe 1980
ee3a7775
CB
1981 h = get_hierarchy(ops, "freezer");
1982 if (!h)
d2203230 1983 return ret_set_errno(-1, ENOENT);
81468ea7 1984
ee3a7775 1985 path = must_make_path(h->container_full_path, "freezer.state", NULL);
018051e3 1986 return lxc_write_to_file(path, "FROZEN", STRLITERALLEN("FROZEN"), false, 0666);
ee3a7775 1987}
942e193e 1988
018051e3
CB
1989static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1990 struct lxc_epoll_descr *descr)
ee3a7775 1991{
018051e3
CB
1992 __do_close_prot_errno int duped_fd = -EBADF;
1993 __do_free char *line = NULL;
ee3a7775 1994 __do_fclose FILE *f = NULL;
018051e3
CB
1995 int state = PTR_TO_INT(cbdata);
1996 size_t len;
1997 const char *state_string;
1998
1999 duped_fd = dup(fd);
2000 if (duped_fd < 0)
2001 return LXC_MAINLOOP_ERROR;
2002
2003 if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1)
2004 return LXC_MAINLOOP_ERROR;
2005
2006 f = fdopen(duped_fd, "re");
2007 if (!f)
2008 return LXC_MAINLOOP_ERROR;
2009 move_fd(duped_fd);
2010
2011 if (state == 1)
2012 state_string = "frozen 1";
2013 else
2014 state_string = "frozen 0";
2015
2016 while (getline(&line, &len, f) != -1)
2017 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
2018 return LXC_MAINLOOP_CLOSE;
2019
2020 return LXC_MAINLOOP_CONTINUE;
2021}
2022
2023static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
2024{
2025 __do_close_prot_errno int fd = -EBADF;
2026 __do_free char *path = NULL;
2027 __do_lxc_mainloop_close struct lxc_epoll_descr *descr_ptr = NULL;
2028 int ret;
2029 struct lxc_epoll_descr descr;
ee3a7775 2030 struct hierarchy *h;
942e193e
CB
2031
2032 h = ops->unified;
457ca9aa 2033 if (!h)
d2203230 2034 return ret_set_errno(-1, ENOENT);
d6337a5f 2035
018051e3 2036 if (!h->container_full_path)
d2203230 2037 return ret_set_errno(-1, EEXIST);
d6337a5f 2038
018051e3
CB
2039 if (timeout != 0) {
2040 __do_free char *events_file = NULL;
942e193e 2041
018051e3
CB
2042 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
2043 fd = open(events_file, O_RDONLY | O_CLOEXEC);
2044 if (fd < 0)
d2203230 2045 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
942e193e 2046
018051e3
CB
2047 ret = lxc_mainloop_open(&descr);
2048 if (ret)
d2203230 2049 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");
942e193e 2050
018051e3
CB
2051 /* automatically cleaned up now */
2052 descr_ptr = &descr;
942e193e 2053
018051e3
CB
2054 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
2055 if (ret < 0)
d2203230 2056 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
018051e3 2057 }
942e193e 2058
018051e3
CB
2059 path = must_make_path(h->container_full_path, "cgroup.freeze", NULL);
2060 ret = lxc_write_to_file(path, "1", 1, false, 0666);
2061 if (ret < 0)
d2203230 2062 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
018051e3
CB
2063
2064 if (timeout != 0 && lxc_mainloop(&descr, timeout))
d2203230 2065 return log_error_errno(-1, errno, "Failed to wait for container to be frozen");
018051e3
CB
2066
2067 return 0;
942e193e
CB
2068}
2069
018051e3 2070__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 2071{
81468ea7 2072 if (!ops->hierarchies)
d2203230 2073 return ret_set_errno(-1, ENOENT);
81468ea7 2074
ee3a7775
CB
2075 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2076 return cg_legacy_freeze(ops);
942e193e 2077
018051e3 2078 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
2079}
2080
018051e3 2081static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775
CB
2082{
2083 __do_free char *path = NULL;
2084 struct hierarchy *h;
2085
2086 h = get_hierarchy(ops, "freezer");
2087 if (!h)
d2203230 2088 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2089
2090 path = must_make_path(h->container_full_path, "freezer.state", NULL);
018051e3 2091 return lxc_write_to_file(path, "THAWED", STRLITERALLEN("THAWED"), false, 0666);
ee3a7775
CB
2092}
2093
018051e3 2094static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775 2095{
018051e3 2096 __do_close_prot_errno int fd = -EBADF;
ee3a7775 2097 __do_free char *path = NULL;
018051e3
CB
2098 __do_lxc_mainloop_close struct lxc_epoll_descr *descr_ptr = NULL;
2099 int ret;
2100 struct lxc_epoll_descr descr;
ee3a7775 2101 struct hierarchy *h;
942e193e
CB
2102
2103 h = ops->unified;
2104 if (!h)
d2203230 2105 return ret_set_errno(-1, ENOENT);
018051e3
CB
2106
2107 if (!h->container_full_path)
d2203230 2108 return ret_set_errno(-1, EEXIST);
018051e3
CB
2109
2110 if (timeout != 0) {
2111 __do_free char *events_file = NULL;
2112
2113 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
2114 fd = open(events_file, O_RDONLY | O_CLOEXEC);
2115 if (fd < 0)
d2203230 2116 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
018051e3
CB
2117
2118 ret = lxc_mainloop_open(&descr);
2119 if (ret)
d2203230 2120 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");
018051e3
CB
2121
2122 /* automatically cleaned up now */
2123 descr_ptr = &descr;
2124
2125 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
2126 if (ret < 0)
d2203230 2127 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
018051e3 2128 }
942e193e 2129
ee3a7775 2130 path = must_make_path(h->container_full_path, "cgroup.freeze", NULL);
018051e3
CB
2131 ret = lxc_write_to_file(path, "0", 1, false, 0666);
2132 if (ret < 0)
d2203230 2133 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
018051e3
CB
2134
2135 if (timeout != 0 && lxc_mainloop(&descr, timeout))
d2203230 2136 return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");
018051e3
CB
2137
2138 return 0;
ee3a7775
CB
2139}
2140
018051e3 2141__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2142{
2143 if (!ops->hierarchies)
d2203230 2144 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2145
2146 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2147 return cg_legacy_unfreeze(ops);
2148
018051e3 2149 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2150}
2151
b857f4be 2152__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
fb55e009 2153 const char *controller)
ccb4cabe 2154{
d6337a5f
CB
2155 struct hierarchy *h;
2156
2202afc9 2157 h = get_hierarchy(ops, controller);
106f1f38 2158 if (!h) {
2202afc9
CB
2159 WARN("Failed to find hierarchy for controller \"%s\"",
2160 controller ? controller : "(null)");
ccb4cabe 2161 return NULL;
106f1f38 2162 }
ccb4cabe 2163
eb697136 2164 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
371f834d
SH
2165}
2166
c40c8209
CB
2167/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2168 * which must be freed by the caller.
371f834d 2169 */
c40c8209
CB
2170static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2171 const char *inpath,
2172 const char *filename)
371f834d 2173{
371f834d 2174 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2175}
2176
900b6606 2177static int cgroup_attach_leaf(int unified_fd, int64_t pid)
c2aed66d 2178{
ad275c16 2179 int idx = 1;
c2aed66d 2180 int ret;
900b6606 2181 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
ad275c16 2182 char attach_cgroup[STRLITERALLEN("lxc-1000/cgroup.procs") + 1];
900b6606 2183 size_t pidstr_len;
c2aed66d 2184
ad275c16
CB
2185 /* Create leaf cgroup. */
2186 ret = mkdirat(unified_fd, "lxc", 0755);
2187 if (ret < 0 && errno != EEXIST)
2188 return log_error_errno(-1, errno, "Failed to create leaf cgroup \"lxc\"");
2189
900b6606 2190 pidstr_len = sprintf(pidstr, INT64_FMT, pid);
ad275c16
CB
2191 ret = lxc_writeat(unified_fd, "lxc/cgroup.procs", pidstr, pidstr_len);
2192 if (ret < 0)
2193 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
c2aed66d 2194 if (ret == 0)
bad788b0 2195 return 0;
ad275c16 2196
bad788b0
CB
2197 /* this is a non-leaf node */
2198 if (errno != EBUSY)
d2203230 2199 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2200
c2aed66d 2201 do {
bad788b0 2202 char *slash;
c2aed66d 2203
ad275c16 2204 sprintf(attach_cgroup, "lxc-%d/cgroup.procs", idx);
bad788b0
CB
2205 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs");
2206 *slash = '\0';
ad275c16 2207
bad788b0 2208 ret = mkdirat(unified_fd, attach_cgroup, 0755);
c2aed66d 2209 if (ret < 0 && errno != EEXIST)
d2203230 2210 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
c2aed66d 2211
bad788b0 2212 *slash = '/';
ad275c16 2213
bad788b0 2214 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
c2aed66d 2215 if (ret == 0)
bad788b0 2216 return 0;
c2aed66d
CB
2217
2218 /* this is a non-leaf node */
2219 if (errno != EBUSY)
d2203230 2220 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2221
edae86e9
CB
2222 idx++;
2223 } while (idx < 1000);
c2aed66d 2224
ad275c16 2225 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d
CB
2226}
2227
900b6606
CB
2228int cgroup_attach(const char *name, const char *lxcpath, int64_t pid)
2229{
2230 __do_close_prot_errno int unified_fd = -EBADF;
2231
2232 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
2233 if (unified_fd < 0)
2234 return -1;
2235
2236 return cgroup_attach_leaf(unified_fd, pid);
2237}
2238
2239/* Technically, we're always at a delegation boundary here (This is especially
2240 * true when cgroup namespaces are available.). The reasoning is that in order
2241 * for us to have been able to start a container in the first place the root
2242 * cgroup must have been a leaf node. Now, either the container's init system
2243 * has populated the cgroup and kept it as a leaf node or it has created
2244 * subtrees. In the former case we will simply attach to the leaf node we
2245 * created when we started the container in the latter case we create our own
2246 * cgroup for the attaching process.
2247 */
2248static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2249 const char *lxcpath, pid_t pid,
2250 const char *controller)
2251{
2252 __do_close_prot_errno int unified_fd = -EBADF;
2253 int ret;
2254
2255 ret = cgroup_attach(name, lxcpath, pid);
2256 if (ret < 0) {
2257 __do_free char *path = NULL, *cgroup = NULL;
2258
2259 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2260 /* not running */
2261 if (!cgroup)
2262 return 0;
2263
2264 path = must_make_path(h->mountpoint, cgroup, NULL);
2265 unified_fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2266 }
2267 if (unified_fd < 0)
2268 return -1;
2269
2270 return cgroup_attach_leaf(unified_fd, pid);
2271}
2272
b857f4be 2273__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
fb55e009 2274 const char *lxcpath, pid_t pid)
ccb4cabe 2275{
81b5d48a 2276 int len, ret;
a3650c0c 2277 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 2278
69b4a4bb
CB
2279 if (!ops->hierarchies)
2280 return true;
2281
a3650c0c
CB
2282 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2283 if (len < 0 || (size_t)len >= sizeof(pidstr))
ccb4cabe
SH
2284 return false;
2285
81b5d48a 2286 for (int i = 0; ops->hierarchies[i]; i++) {
c05b17bd 2287 __do_free char *fullpath = NULL, *path = NULL;
2202afc9 2288 struct hierarchy *h = ops->hierarchies[i];
ccb4cabe 2289
c2aed66d 2290 if (h->version == CGROUP2_SUPER_MAGIC) {
900b6606 2291 ret = __cg_unified_attach(h, name, lxcpath, pid,
a3926f6a 2292 h->controllers[0]);
c2aed66d
CB
2293 if (ret < 0)
2294 return false;
2295
2296 continue;
2297 }
2298
ccb4cabe 2299 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2300 /* not running */
2301 if (!path)
e2cb2e74 2302 return false;
ccb4cabe 2303
371f834d 2304 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
7cea5905 2305 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
c2aed66d 2306 if (ret < 0) {
ccb4cabe 2307 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
ccb4cabe
SH
2308 return false;
2309 }
ccb4cabe
SH
2310 }
2311
ccb4cabe
SH
2312 return true;
2313}
2314
e2bd2b13
CB
2315/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2316 * don't have a cgroup_data set up, so we ask the running container through the
2317 * commands API for the cgroup path.
ccb4cabe 2318 */
b857f4be 2319__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2320 char *value, size_t len, const char *name,
2321 const char *lxcpath)
ccb4cabe 2322{
d97919ab 2323 __do_free char *path = NULL;
88396101 2324 __do_free char *controller = NULL;
d97919ab 2325 char *p;
0069cc61 2326 struct hierarchy *h;
861cb8c2 2327 int ret = -1;
ccb4cabe 2328
861cb8c2 2329 controller = must_copy_string(filename);
0069cc61
CB
2330 p = strchr(controller, '.');
2331 if (p)
ccb4cabe
SH
2332 *p = '\0';
2333
0069cc61
CB
2334 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2335 /* not running */
2336 if (!path)
ccb4cabe
SH
2337 return -1;
2338
2202afc9 2339 h = get_hierarchy(ops, controller);
ccb4cabe 2340 if (h) {
88396101 2341 __do_free char *fullpath = NULL;
0069cc61
CB
2342
2343 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2344 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2345 }
ccb4cabe
SH
2346
2347 return ret;
2348}
2349
2a63b5cb
CB
2350static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2351 const char *val)
2352{
2353 int count, ret;
2354 char temp[50];
2355
2356 if (strcmp("devices.allow", key) == 0)
2357 device->allow = 1;
2358 else
2359 device->allow = 0;
2360
2361 if (strcmp(val, "a") == 0) {
2362 /* global rule */
2363 device->type = 'a';
2364 device->major = -1;
2365 device->minor = -1;
fda39d45
CB
2366 device->global_rule = device->allow
2367 ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
2368 : LXC_BPF_DEVICE_CGROUP_WHITELIST;
2a63b5cb
CB
2369 device->allow = -1;
2370 return 0;
2371 } else {
fda39d45 2372 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2a63b5cb
CB
2373 }
2374
2375 switch (*val) {
2376 case 'a':
2377 __fallthrough;
2378 case 'b':
2379 __fallthrough;
2380 case 'c':
2381 device->type = *val;
2382 break;
2383 default:
2384 return -1;
2385 }
2386
2387 val++;
2388 if (!isspace(*val))
2389 return -1;
2390 val++;
2391 if (*val == '*') {
2392 device->major = -1;
2393 val++;
2394 } else if (isdigit(*val)) {
2395 memset(temp, 0, sizeof(temp));
2396 for (count = 0; count < sizeof(temp) - 1; count++) {
2397 temp[count] = *val;
2398 val++;
2399 if (!isdigit(*val))
2400 break;
2401 }
2402 ret = lxc_safe_int(temp, &device->major);
2403 if (ret)
2404 return -1;
2405 } else {
2406 return -1;
2407 }
2408 if (*val != ':')
2409 return -1;
2410 val++;
2411
2412 /* read minor */
2413 if (*val == '*') {
2414 device->minor = -1;
2415 val++;
2416 } else if (isdigit(*val)) {
2417 memset(temp, 0, sizeof(temp));
2418 for (count = 0; count < sizeof(temp) - 1; count++) {
2419 temp[count] = *val;
2420 val++;
2421 if (!isdigit(*val))
2422 break;
2423 }
2424 ret = lxc_safe_int(temp, &device->minor);
2425 if (ret)
2426 return -1;
2427 } else {
2428 return -1;
2429 }
2430 if (!isspace(*val))
2431 return -1;
2432 for (val++, count = 0; count < 3; count++, val++) {
2433 switch (*val) {
2434 case 'r':
2435 device->access[count] = *val;
2436 break;
2437 case 'w':
2438 device->access[count] = *val;
2439 break;
2440 case 'm':
2441 device->access[count] = *val;
2442 break;
2443 case '\n':
2444 case '\0':
2445 count = 3;
2446 break;
2447 default:
2448 return -1;
2449 }
2450 }
2451
2452 return 0;
2453}
2454
eec533e3
CB
2455/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2456 * don't have a cgroup_data set up, so we ask the running container through the
2457 * commands API for the cgroup path.
ccb4cabe 2458 */
b857f4be 2459__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2a63b5cb 2460 const char *key, const char *value,
fb55e009 2461 const char *name, const char *lxcpath)
ccb4cabe 2462{
d97919ab 2463 __do_free char *path = NULL;
88396101 2464 __do_free char *controller = NULL;
d97919ab 2465 char *p;
87777968 2466 struct hierarchy *h;
861cb8c2 2467 int ret = -1;
ccb4cabe 2468
2a63b5cb 2469 controller = must_copy_string(key);
87777968
CB
2470 p = strchr(controller, '.');
2471 if (p)
ccb4cabe
SH
2472 *p = '\0';
2473
2a63b5cb
CB
2474 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2475 struct device_item device = {0};
2476
2477 ret = device_cgroup_rule_parse(&device, key, value);
2478 if (ret < 0)
d2203230 2479 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2a63b5cb
CB
2480 key, value);
2481
2482 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2483 if (ret < 0)
2484 return -1;
2485
2486 return 0;
2487 }
2488
87777968
CB
2489 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2490 /* not running */
2491 if (!path)
ccb4cabe
SH
2492 return -1;
2493
2202afc9 2494 h = get_hierarchy(ops, controller);
ccb4cabe 2495 if (h) {
88396101 2496 __do_free char *fullpath = NULL;
87777968 2497
2a63b5cb 2498 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
7cea5905 2499 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe 2500 }
ccb4cabe
SH
2501
2502 return ret;
2503}
2504
91d1a13a 2505/* take devices cgroup line
72add155
SH
2506 * /dev/foo rwx
2507 * and convert it to a valid
2508 * type major:minor mode
91d1a13a
CB
2509 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2510 * the output.
72add155
SH
2511 */
2512static int convert_devpath(const char *invalue, char *dest)
2513{
88396101 2514 __do_free char *path = NULL;
2a06d041 2515 int n_parts;
d97919ab 2516 char *p, type;
72add155 2517 unsigned long minor, major;
91d1a13a 2518 struct stat sb;
2a06d041
CB
2519 int ret = -EINVAL;
2520 char *mode = NULL;
72add155
SH
2521
2522 path = must_copy_string(invalue);
2523
91d1a13a
CB
2524 /* Read path followed by mode. Ignore any trailing text.
2525 * A ' # comment' would be legal. Technically other text is not
2526 * legal, we could check for that if we cared to.
72add155 2527 */
0dbdb99e 2528 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2529 if (*p != ' ')
2530 continue;
2531 *p = '\0';
91d1a13a 2532
2c2d6c49
SH
2533 if (n_parts != 1)
2534 break;
2535 p++;
2536 n_parts++;
91d1a13a 2537
2c2d6c49
SH
2538 while (*p == ' ')
2539 p++;
91d1a13a 2540
2c2d6c49 2541 mode = p;
91d1a13a 2542
2c2d6c49
SH
2543 if (*p == '\0')
2544 goto out;
72add155 2545 }
2c2d6c49
SH
2546
2547 if (n_parts == 1)
72add155 2548 goto out;
72add155
SH
2549
2550 ret = stat(path, &sb);
2551 if (ret < 0)
2552 goto out;
2553
72add155
SH
2554 mode_t m = sb.st_mode & S_IFMT;
2555 switch (m) {
2556 case S_IFBLK:
2557 type = 'b';
2558 break;
2559 case S_IFCHR:
2560 type = 'c';
2561 break;
2c2d6c49 2562 default:
91d1a13a 2563 ERROR("Unsupported device type %i for \"%s\"", m, path);
72add155
SH
2564 ret = -EINVAL;
2565 goto out;
2566 }
2c2d6c49
SH
2567
2568 major = MAJOR(sb.st_rdev);
2569 minor = MINOR(sb.st_rdev);
2570 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2571 if (ret < 0 || ret >= 50) {
2a06d041
CB
2572 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2573 "chars)", type, major, minor, mode);
72add155
SH
2574 ret = -ENAMETOOLONG;
2575 goto out;
2576 }
2577 ret = 0;
2578
2579out:
72add155
SH
2580 return ret;
2581}
2582
90e97284
CB
2583/* Called from setup_limits - here we have the container's cgroup_data because
2584 * we created the cgroups.
ccb4cabe 2585 */
2202afc9
CB
2586static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2587 const char *value)
ccb4cabe 2588{
88396101 2589 __do_free char *controller = NULL;
d97919ab
CB
2590 __do_free char *fullpath = NULL;
2591 char *p;
1a0e70ac
CB
2592 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2593 char converted_value[50];
b3646d7e
CB
2594 struct hierarchy *h;
2595 int ret = 0;
64e82f8b 2596
861cb8c2 2597 controller = must_copy_string(filename);
ab1a6cac
CB
2598 p = strchr(controller, '.');
2599 if (p)
ccb4cabe
SH
2600 *p = '\0';
2601
c8bf519d 2602 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2603 ret = convert_devpath(value, converted_value);
2604 if (ret < 0)
c8bf519d 2605 return ret;
72add155 2606 value = converted_value;
c8bf519d 2607 }
2608
2202afc9 2609 h = get_hierarchy(ops, controller);
b3646d7e
CB
2610 if (!h) {
2611 ERROR("Failed to setup limits for the \"%s\" controller. "
2612 "The controller seems to be unused by \"cgfsng\" cgroup "
2613 "driver or not enabled on the cgroup hierarchy",
2614 controller);
d1953b26 2615 errno = ENOENT;
ab1a6cac 2616 return -ENOENT;
ccb4cabe 2617 }
b3646d7e 2618
eb697136 2619 fullpath = must_make_path(h->container_full_path, filename, NULL);
7cea5905 2620 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe
SH
2621 return ret;
2622}
2623
c581d2a6
CB
2624__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2625 struct lxc_conf *conf,
2626 bool do_devices)
ccb4cabe 2627{
d97919ab 2628 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
c581d2a6 2629 struct lxc_list *cgroup_settings = &conf->cgroup;
d97919ab 2630 struct lxc_list *iterator, *next;
ccb4cabe 2631 struct lxc_cgroup *cg;
ccb4cabe
SH
2632 bool ret = false;
2633
2634 if (lxc_list_empty(cgroup_settings))
2635 return true;
2636
69b4a4bb
CB
2637 if (!ops->hierarchies)
2638 return false;
2639
ccb4cabe 2640 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2641 if (!sorted_cgroup_settings)
ccb4cabe 2642 return false;
ccb4cabe 2643
ccb4cabe
SH
2644 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2645 cg = iterator->elem;
2646
2647 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2202afc9 2648 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
ccb4cabe 2649 if (do_devices && (errno == EACCES || errno == EPERM)) {
c347df58
CB
2650 WARN("Failed to set \"%s\" to \"%s\"",
2651 cg->subsystem, cg->value);
ccb4cabe
SH
2652 continue;
2653 }
c347df58
CB
2654 WARN("Failed to set \"%s\" to \"%s\"",
2655 cg->subsystem, cg->value);
ccb4cabe
SH
2656 goto out;
2657 }
c347df58
CB
2658 DEBUG("Set controller \"%s\" set to \"%s\"",
2659 cg->subsystem, cg->value);
ccb4cabe 2660 }
ccb4cabe
SH
2661 }
2662
2663 ret = true;
6b38e644 2664 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2665out:
ccb4cabe
SH
2666 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2667 lxc_list_del(iterator);
2668 free(iterator);
2669 }
d97919ab 2670
ccb4cabe
SH
2671 return ret;
2672}
2673
/*
 * Parse the device rule @key=@val and add it to the device list of @conf.
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 *
 * Without HAVE_STRUCT_BPF_CGROUP_DEV_CTX this does nothing and returns 0.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item device_item = {0};

	if (device_cgroup_rule_parse(&device_item, key, val) < 0)
		return log_error_errno(-1, EINVAL,
				       "Failed to parse device string %s=%s",
				       key, val);

	if (bpf_list_add_device(conf, &device_item) < 0)
		return -1;
#endif
	return 0;
}
2698
c581d2a6
CB
2699__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2700 struct lxc_handler *handler)
6b38e644
CB
2701{
2702 struct lxc_list *iterator;
2202afc9 2703 struct hierarchy *h = ops->unified;
c581d2a6
CB
2704 struct lxc_conf *conf = handler->conf;
2705 struct lxc_list *cgroup_settings = &conf->cgroup2;
6b38e644
CB
2706
2707 if (lxc_list_empty(cgroup_settings))
2708 return true;
2709
2710 if (!h)
2711 return false;
2712
bf651989 2713 lxc_list_for_each (iterator, cgroup_settings) {
88396101 2714 __do_free char *fullpath = NULL;
6b38e644 2715 int ret;
6b38e644
CB
2716 struct lxc_cgroup *cg = iterator->elem;
2717
bf651989 2718 if (strncmp("devices", cg->subsystem, 7) == 0) {
4bfb655e 2719 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
bf651989
CB
2720 cg->value);
2721 } else {
2722 fullpath = must_make_path(h->container_full_path,
2723 cg->subsystem, NULL);
2724 ret = lxc_write_to_file(fullpath, cg->value,
2725 strlen(cg->value), false, 0666);
2726 if (ret < 0) {
2727 SYSERROR("Failed to set \"%s\" to \"%s\"",
2728 cg->subsystem, cg->value);
2729 return false;
2730 }
6b38e644
CB
2731 }
2732 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2733 }
2734
2735 INFO("Limits for the unified cgroup hierarchy have been setup");
2736 return true;
2737}
2738
bf651989
CB
2739__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
2740 struct lxc_handler *handler)
2741{
2742#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
2a63b5cb
CB
2743 __do_bpf_program_free struct bpf_program *devices = NULL;
2744 struct lxc_conf *conf = handler->conf;
2745 struct hierarchy *unified = ops->unified;
bf651989 2746 int ret;
2a63b5cb
CB
2747 struct lxc_list *it;
2748 struct bpf_program *devices_old;
bf651989 2749
9994db51
CB
2750 if (!unified || !unified->bpf_device_controller ||
2751 !unified->container_full_path || lxc_list_empty(&conf->devices))
bf651989
CB
2752 return true;
2753
2a63b5cb
CB
2754 devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
2755 if (!devices)
d47ff01b
CB
2756 return log_error_errno(false, ENOMEM,
2757 "Failed to create new bpf program");
2a63b5cb
CB
2758
2759 ret = bpf_program_init(devices);
bf651989 2760 if (ret)
d47ff01b
CB
2761 return log_error_errno(false, ENOMEM,
2762 "Failed to initialize bpf program");
2a63b5cb
CB
2763
2764 lxc_list_for_each(it, &conf->devices) {
2765 struct device_item *cur = it->elem;
2766
2767 ret = bpf_program_append_device(devices, cur);
2768 if (ret)
d47ff01b
CB
2769 return log_error_errno(false,
2770 ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2771 cur->type, cur->major,
2772 cur->minor, cur->access,
2773 cur->allow, cur->global_rule);
2a63b5cb
CB
2774 TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2775 cur->type, cur->major, cur->minor, cur->access,
2776 cur->allow, cur->global_rule);
2777 }
2778
2779 ret = bpf_program_finalize(devices);
2780 if (ret)
d47ff01b
CB
2781 return log_error_errno(false, ENOMEM,
2782 "Failed to finalize bpf program");
bf651989 2783
2a63b5cb
CB
2784 ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
2785 unified->container_full_path,
cce5a3d7
CB
2786 BPF_F_ALLOW_MULTI);
2787 if (ret)
d47ff01b
CB
2788 return log_error_errno(false, ENOMEM,
2789 "Failed to attach bpf program");
cce5a3d7
CB
2790
2791 /* Replace old bpf program. */
2a63b5cb
CB
2792 devices_old = move_ptr(conf->cgroup2_devices);
2793 conf->cgroup2_devices = move_ptr(devices);
2794 devices = move_ptr(devices_old);
bf651989 2795#endif
cce5a3d7 2796 return true;
bf651989
CB
2797}
2798
c581d2a6 2799bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
6b38e644 2800{
c581d2a6
CB
2801 __do_free char *add_controllers = NULL, *base_path = NULL;
2802 struct hierarchy *unified = ops->unified;
2803 ssize_t parts_len;
2804 char **it;
2805 size_t full_len = 0;
2806 char **parts = NULL;
2807 bool bret = false;
6b38e644 2808
c581d2a6
CB
2809 if (!ops->hierarchies || !pure_unified_layout(ops) ||
2810 !unified->controllers[0])
bf651989
CB
2811 return true;
2812
c581d2a6
CB
2813 /* For now we simply enable all controllers that we have detected by
2814 * creating a string like "+memory +pids +cpu +io".
2815 * TODO: In the near future we might want to support "-<controller>"
2816 * etc. but whether supporting semantics like this make sense will need
2817 * some thinking.
2818 */
2819 for (it = unified->controllers; it && *it; it++) {
2820 full_len += strlen(*it) + 2;
2821 add_controllers = must_realloc(add_controllers, full_len + 1);
2822
2823 if (unified->controllers[0] == *it)
2824 add_controllers[0] = '\0';
2825
2826 (void)strlcat(add_controllers, "+", full_len + 1);
2827 (void)strlcat(add_controllers, *it, full_len + 1);
2828
2829 if ((it + 1) && *(it + 1))
2830 (void)strlcat(add_controllers, " ", full_len + 1);
2831 }
2832
2833 parts = lxc_string_split(cgroup, '/');
2834 if (!parts)
2835 goto on_error;
2836
2837 parts_len = lxc_array_len((void **)parts);
2838 if (parts_len > 0)
2839 parts_len--;
2840
2841 base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
2842 for (ssize_t i = -1; i < parts_len; i++) {
2843 int ret;
2844 __do_free char *target = NULL;
2845
2846 if (i >= 0)
2847 base_path = must_append_path(base_path, parts[i], NULL);
2848 target = must_make_path(base_path, "cgroup.subtree_control", NULL);
2849 ret = lxc_writeat(-1, target, add_controllers, full_len);
61fbc369
CB
2850 if (ret < 0)
2851 log_error_errno(goto on_error,
2852 errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
2853 add_controllers, target);
c581d2a6
CB
2854 TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
2855 }
2856
2857 bret = true;
2858
2859on_error:
2860 lxc_free_array((void **)parts, free);
2861 return bret;
2862}
2863
2864__cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2865{
61fbc369
CB
2866 if (!ops)
2867 return ret_set_errno(false, ENOENT);
2868
c581d2a6
CB
2869 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2870}
2871
2872__cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2873{
61fbc369
CB
2874 if (!ops)
2875 return ret_set_errno(false, ENOENT);
2876
c581d2a6 2877 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
2878}
2879
b7b18fc5
CB
2880static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2881 char **controllers)
2882{
b7b18fc5
CB
2883 if (!ops->cgroup_use)
2884 return true;
2885
431e2c54 2886 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
2887 bool found = false;
2888
431e2c54 2889 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
b7b18fc5
CB
2890 if (strcmp(*cur_use, *cur_ctrl) != 0)
2891 continue;
2892
2893 found = true;
2894 break;
2895 }
2896
2897 if (found)
2898 continue;
2899
2900 return false;
2901 }
2902
2903 return true;
2904}
2905
a6ca2ed8
CB
2906static void cg_unified_delegate(char ***delegate)
2907{
d606c4e9 2908 __do_free char *buf = NULL;
a6ca2ed8 2909 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
d606c4e9
CB
2910 char *token;
2911 int idx;
a6ca2ed8 2912
d606c4e9
CB
2913 buf = read_file("/sys/kernel/cgroup/delegate");
2914 if (!buf) {
a6ca2ed8
CB
2915 for (char **p = standard; p && *p; p++) {
2916 idx = append_null_to_list((void ***)delegate);
2917 (*delegate)[idx] = must_copy_string(*p);
2918 }
d606c4e9
CB
2919 log_warn_errno(return, errno, "Failed to read /sys/kernel/cgroup/delegate");
2920 }
a6ca2ed8 2921
d606c4e9
CB
2922 lxc_iterate_parts (token, buf, " \t\n") {
2923 /*
2924 * We always need to chown this for both cgroup and
2925 * cgroup2.
2926 */
2927 if (strcmp(token, "cgroup.procs") == 0)
2928 continue;
2929
2930 idx = append_null_to_list((void ***)delegate);
2931 (*delegate)[idx] = must_copy_string(token);
a6ca2ed8
CB
2932 }
2933}
2934
2202afc9
CB
2935/* At startup, parse_hierarchies finds all the info we need about cgroup
2936 * mountpoints and current cgroups, and stores it in @d.
2937 */
341e6516 2938static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
2202afc9 2939{
88396101 2940 __do_free char *basecginfo = NULL;
d97919ab
CB
2941 __do_free char *line = NULL;
2942 __do_fclose FILE *f = NULL;
2202afc9 2943 int ret;
2202afc9 2944 size_t len = 0;
2202afc9
CB
2945 char **klist = NULL, **nlist = NULL;
2946
2947 /* Root spawned containers escape the current cgroup, so use init's
2948 * cgroups as our base in that case.
2949 */
9caee129 2950 if (!relative && (geteuid() == 0))
2202afc9
CB
2951 basecginfo = read_file("/proc/1/cgroup");
2952 else
2953 basecginfo = read_file("/proc/self/cgroup");
2954 if (!basecginfo)
341e6516 2955 return ret_set_errno(-1, ENOMEM);
2202afc9
CB
2956
2957 ret = get_existing_subsystems(&klist, &nlist);
341e6516
CB
2958 if (ret < 0)
2959 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
2202afc9
CB
2960
2961 f = fopen("/proc/self/mountinfo", "r");
341e6516
CB
2962 if (!f)
2963 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
2202afc9
CB
2964
2965 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2966
2967 while (getline(&line, &len, f) != -1) {
2968 int type;
2969 bool writeable;
2970 struct hierarchy *new;
2971 char *base_cgroup = NULL, *mountpoint = NULL;
2972 char **controller_list = NULL;
2973
2974 type = get_cgroup_version(line);
2975 if (type == 0)
2976 continue;
2977
2978 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2979 continue;
2980
2981 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2982 if (type == CGROUP2_SUPER_MAGIC)
2983 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2984 else if (type == CGROUP_SUPER_MAGIC)
2985 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2986 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2987 if (type == CGROUP_SUPER_MAGIC)
2988 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2989 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2990 if (type == CGROUP2_SUPER_MAGIC)
2991 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2992 }
2993
2994 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2995 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2996 continue;
2997
2998 if (type == CGROUP_SUPER_MAGIC)
2999 if (controller_list_is_dup(ops->hierarchies, controller_list))
341e6516 3000 log_trace_errno(goto next, EEXIST, "Skipping duplicating controller");
2202afc9
CB
3001
3002 mountpoint = cg_hybrid_get_mountpoint(line);
341e6516
CB
3003 if (!mountpoint)
3004 log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
2202afc9
CB
3005
3006 if (type == CGROUP_SUPER_MAGIC)
3007 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3008 else
3009 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
341e6516
CB
3010 if (!base_cgroup)
3011 log_error_errno(goto next, EINVAL, "Failed to find current cgroup");
2202afc9
CB
3012
3013 trim(base_cgroup);
3014 prune_init_scope(base_cgroup);
3015 if (type == CGROUP2_SUPER_MAGIC)
3016 writeable = test_writeable_v2(mountpoint, base_cgroup);
3017 else
3018 writeable = test_writeable_v1(mountpoint, base_cgroup);
3019 if (!writeable)
341e6516 3020 log_trace_errno(goto next, EROFS, "The %s group is not writeable", base_cgroup);
2202afc9
CB
3021
3022 if (type == CGROUP2_SUPER_MAGIC) {
3023 char *cgv2_ctrl_path;
3024
3025 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
3026 "cgroup.controllers",
3027 NULL);
3028
3029 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
3030 free(cgv2_ctrl_path);
3031 if (!controller_list) {
3032 controller_list = cg_unified_make_empty_controller();
3033 TRACE("No controllers are enabled for "
3034 "delegation in the unified hierarchy");
3035 }
3036 }
3037
b7b18fc5
CB
3038 /* Exclude all controllers that cgroup use does not want. */
3039 if (!cgroup_use_wants_controllers(ops, controller_list))
341e6516 3040 log_trace_errno(goto next, EINVAL, "Skipping controller");
b7b18fc5 3041
2202afc9 3042 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
a6ca2ed8
CB
3043 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
3044 if (unprivileged)
3045 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3046 ops->unified = new;
a6ca2ed8 3047 }
2202afc9
CB
3048
3049 continue;
3050
3051 next:
3052 free_string_list(controller_list);
3053 free(mountpoint);
3054 free(base_cgroup);
3055 }
3056
3057 free_string_list(klist);
3058 free_string_list(nlist);
3059
2202afc9
CB
3060 TRACE("Writable cgroup hierarchies:");
3061 lxc_cgfsng_print_hierarchies(ops);
3062
3063 /* verify that all controllers in cgroup.use and all crucial
3064 * controllers are accounted for
3065 */
3066 if (!all_controllers_found(ops))
341e6516 3067 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
2202afc9 3068
341e6516 3069 return 0;
2202afc9
CB
3070}
3071
2202afc9 3072/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 3073static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 3074{
88396101 3075 __do_free char *basecginfo = NULL;
d97919ab 3076 char *base_cgroup;
2202afc9
CB
3077 char *copy = NULL;
3078
9caee129 3079 if (!relative && (geteuid() == 0))
2202afc9
CB
3080 basecginfo = read_file("/proc/1/cgroup");
3081 else
3082 basecginfo = read_file("/proc/self/cgroup");
3083 if (!basecginfo)
3084 return NULL;
3085
3086 base_cgroup = strstr(basecginfo, "0::/");
3087 if (!base_cgroup)
3088 goto cleanup_on_err;
3089
3090 base_cgroup = base_cgroup + 3;
3091 copy = copy_to_eol(base_cgroup);
3092 if (!copy)
3093 goto cleanup_on_err;
3094
3095cleanup_on_err:
2202afc9
CB
3096 if (copy)
3097 trim(copy);
3098
3099 return copy;
3100}
3101
a6ca2ed8
CB
3102static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3103 bool unprivileged)
2202afc9 3104{
d97919ab 3105 __do_free char *subtree_path = NULL;
2202afc9 3106 int ret;
7717e175 3107 char *mountpoint;
2202afc9 3108 char **delegatable;
a6ca2ed8 3109 struct hierarchy *new;
2202afc9
CB
3110 char *base_cgroup = NULL;
3111
d47ff01b 3112 ret = unified_cgroup_hierarchy();
2202afc9 3113 if (ret == -ENOMEDIUM)
d2203230 3114 return ret_errno(ENOMEDIUM);
2202afc9
CB
3115
3116 if (ret != CGROUP2_SUPER_MAGIC)
3117 return 0;
3118
9caee129 3119 base_cgroup = cg_unified_get_current_cgroup(relative);
2202afc9 3120 if (!base_cgroup)
d2203230 3121 return ret_errno(EINVAL);
c581d2a6
CB
3122 if (!relative)
3123 prune_init_scope(base_cgroup);
2202afc9 3124
d606c4e9
CB
3125 /*
3126 * We assume that the cgroup we're currently in has been delegated to
3127 * us and we are free to further delege all of the controllers listed
3128 * in cgroup.controllers further down the hierarchy.
2202afc9 3129 */
dca9587a 3130 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
c581d2a6 3131 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
2202afc9 3132 delegatable = cg_unified_get_controllers(subtree_path);
2202afc9
CB
3133 if (!delegatable)
3134 delegatable = cg_unified_make_empty_controller();
3135 if (!delegatable[0])
3136 TRACE("No controllers are enabled for delegation");
3137
3138 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3139 * we should verify here. The reason I'm not doing it right is that I'm
3140 * not convinced that lxc.cgroup.use will be the future since it is a
3141 * global property. I much rather have an option that lets you request
3142 * controllers per container.
3143 */
3144
a6ca2ed8 3145 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
d606c4e9 3146 if (unprivileged)
a6ca2ed8 3147 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3148
2a63b5cb
CB
3149 if (bpf_devices_cgroup_supported())
3150 new->bpf_device_controller = 1;
3151
2202afc9 3152 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
908e0ee5 3153 ops->unified = new;
2202afc9
CB
3154 return CGROUP2_SUPER_MAGIC;
3155}
3156
341e6516 3157static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
3158{
3159 int ret;
3160 const char *tmp;
9caee129 3161 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
3162
3163 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 3164 if (tmp) {
88396101 3165 __do_free char *pin = NULL;
d97919ab 3166 char *chop, *cur;
b7b18fc5
CB
3167
3168 pin = must_copy_string(tmp);
3169 chop = pin;
3170
d97919ab 3171 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 3172 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 3173 }
2202afc9 3174
a6ca2ed8 3175 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9 3176 if (ret < 0)
341e6516 3177 return -1;
2202afc9
CB
3178
3179 if (ret == CGROUP2_SUPER_MAGIC)
341e6516 3180 return 0;
2202afc9 3181
a6ca2ed8 3182 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
3183}
3184
341e6516 3185__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3186{
3187 const char *cgroup_pattern;
3188
341e6516
CB
3189 if (!ops)
3190 return ret_set_errno(-1, ENOENT);
3191
2202afc9
CB
3192 /* copy system-wide cgroup information */
3193 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3194 if (!cgroup_pattern) {
3195 /* lxc.cgroup.pattern is only NULL on error. */
3196 ERROR("Failed to retrieve cgroup pattern");
341e6516 3197 return ret_set_errno(-1, ENOMEM);
2202afc9
CB
3198 }
3199 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
625ad37b 3200 ops->monitor_pattern = MONITOR_CGROUP;
2202afc9 3201
341e6516 3202 return 0;
2202afc9
CB
3203}
3204
5a087e05 3205struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 3206{
a64edc1c 3207 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9
CB
3208
3209 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3210 if (!cgfsng_ops)
341e6516 3211 return ret_set_errno(NULL, ENOMEM);
2202afc9
CB
3212
3213 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3214 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3215
341e6516 3216 if (cg_init(cgfsng_ops, conf))
2202afc9 3217 return NULL;
2202afc9 3218
bad788b0
CB
3219 cgfsng_ops->unified_fd = -EBADF;
3220
2202afc9 3221 cgfsng_ops->data_init = cgfsng_data_init;
434c8e15
CB
3222 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3223 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
72068e74 3224 cgfsng_ops->monitor_create = cgfsng_monitor_create;
eeef32bb 3225 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
c581d2a6
CB
3226 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3227 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
e8b181f5
CB
3228 cgfsng_ops->payload_create = cgfsng_payload_create;
3229 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2202afc9
CB
3230 cgfsng_ops->escape = cgfsng_escape;
3231 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3232 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3233 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3234 cgfsng_ops->get = cgfsng_get;
3235 cgfsng_ops->set = cgfsng_set;
942e193e 3236 cgfsng_ops->freeze = cgfsng_freeze;
2202afc9 3237 cgfsng_ops->unfreeze = cgfsng_unfreeze;
c581d2a6 3238 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
2202afc9
CB
3239 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3240 cgfsng_ops->driver = "cgfsng";
3241 cgfsng_ops->version = "1.0.0";
3242 cgfsng_ops->attach = cgfsng_attach;
3243 cgfsng_ops->chown = cgfsng_chown;
3244 cgfsng_ops->mount = cgfsng_mount;
3245 cgfsng_ops->nrtasks = cgfsng_nrtasks;
bf651989 3246 cgfsng_ops->devices_activate = cgfsng_devices_activate;
2202afc9 3247
a64edc1c 3248 return move_ptr(cgfsng_ops);
2202afc9 3249}