/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "conf.h"
#include "config.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "storage/storage.h"
#include "utils.h"

#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif

#ifndef HAVE_STRLCAT
#include "include/strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

static void free_string_list(char **clist)
{
	int i;

	if (!clist)
		return;

	for (i = 0; clist[i]; i++)
		free(clist[i]);

	free(clist);
}

/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	if (*list)
		for (; (*list)[newentry]; newentry++)
			;

	*list = must_realloc(*list, (newentry + 2) * sizeof(void **));
	(*list)[newentry + 1] = NULL;
	return newentry;
}

/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	int i;

	if (!list)
		return false;

	for (i = 0; list[i]; i++)
		if (strcmp(list[i], entry) == 0)
			return true;

	return false;
}

/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t len;
	char *prefixed;

	len = strlen(entry);
	prefixed = must_realloc(NULL, len + 6);

	memcpy(prefixed, "name=", STRLITERALLEN("name="));
	memcpy(prefixed + STRLITERALLEN("name="), entry, len);
	prefixed[len + 5] = '\0';

	return prefixed;
}

/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int newentry;
	char *copy;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	newentry = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0)
		copy = must_copy_string(entry);
	else if (string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[newentry] = copy;
}

static inline bool pure_unified_layout(const struct cgroup_ops *ops)
{
	return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
}

/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @controller, or NULL if there is none.
 */
struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	int i;

	errno = ENOENT;

	if (!ops->hierarchies) {
		TRACE("There are no useable cgroup controllers");
		return NULL;
	}

	for (i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers &&
			    !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];
			continue;
		} else if (pure_unified_layout(ops) &&
			   strcmp(controller, "devices") == 0) {
			if (ops->unified->bpf_device_controller)
				return ops->unified;
			break;
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return NULL;
}

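/*
 * Helpers for read_file(): the buffer is grown in BATCH_SIZE chunks so that
 * appending one line at a time via getline() does not trigger a realloc()
 * for every single line.
 */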
#define BATCH_SIZE 50
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
	}
}

static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t full = oldlen + newlen;

	batch_realloc(dest, oldlen, full + 1);

	memcpy(*dest + oldlen, new, newlen + 1);
}

/* Slurp in a whole file */
static char *read_file(const char *fnam)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int linelen;
	char *buf = NULL;
	size_t len = 0, fulllen = 0;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;
	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}
	return buf;
}

/* Taken over and modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1 << (bit % NBITS));
}

static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
}

static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
}

/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token;
	size_t arrlen;
	__do_free uint32_t *bitarr = NULL;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}

/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
	int ret;
	size_t i;
	char *tmp = NULL;
	char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};

	for (i = 0; i <= nbits; i++) {
		if (!is_set(i, bitarr))
			continue;

		ret = snprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
			lxc_free_array((void **)cpulist, free);
			return NULL;
		}

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0) {
			lxc_free_array((void **)cpulist, free);
			return ret_set_errno(NULL, ENOMEM);
		}
	}

	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	tmp = lxc_string_join(",", (const char **)cpulist, false);
	lxc_free_array((void **)cpulist, free);

	return tmp;
}

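/*
 * Return the highest cpu index mentioned in a cpulist string by looking at
 * the start of its last entry. Illustrative example: for "0-3,8-15" the last
 * ',' and the last '-' are located, the later of the two positions wins, and
 * strtoul() parses "15", so 15 is returned.
 */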
static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2;
	char *maxcpus = cpulist;
	size_t cpus = 0;

	c1 = strrchr(maxcpus, ',');
	if (c1)
		c1++;

	c2 = strrchr(maxcpus, '-');
	if (c2)
		c2++;

	if (!c1 && !c2)
		c1 = maxcpus;
	else if (c1 > c2)
		c2 = c1;
	else if (c1 < c2)
		c1 = c2;
	else if (!c1 && c2)
		c1 = c2;

	errno = 0;
	cpus = strtoul(c1, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}

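/*
 * Filter the parent cgroup's cpuset.cpus against the kernel's isolated and
 * offline cpu lists and, if the child cgroup has not been initialized yet,
 * write the filtered list to the child's cpuset.cpus.
 */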
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
					  char *child_cgroup, bool am_initialized)
{
	__do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
		       *offlinecpus = NULL, *posscpus = NULL;
	__do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
			   *possmask = NULL;
	int ret;
	ssize_t i;
	ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
	bool flipped_bit = false;

	fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0 || maxposs >= INT_MAX - 1)
		return false;

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file(__ISOL_CPUS);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		if (isdigit(isolcpus[0])) {
			/* Get maximum number of cpus found in isolated cpuset. */
			maxisol = get_max_cpus(isolcpus);
			if (maxisol < 0 || maxisol >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxisol)
			maxposs = maxisol;
		maxposs++;
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file(__OFFLINE_CPUS);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		if (isdigit(offlinecpus[0])) {
			/* Get maximum number of cpus found in offline cpuset. */
			maxoffline = get_max_cpus(offlinecpus);
			if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxoffline)
			maxposs = maxoffline;
		maxposs++;
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	if ((maxisol == 0) && (maxoffline == 0)) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (maxisol > 0) {
		isolmask = lxc_cpumask(isolcpus, maxposs);
		if (!isolmask)
			return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
	}

	if (maxoffline > 0) {
		offlinemask = lxc_cpumask(offlinecpus, maxposs);
		if (!offlinemask)
			return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
	}

	for (i = 0; i <= maxposs; i++) {
		if ((isolmask && !is_set(i, isolmask)) ||
		    (offlinemask && !is_set(i, offlinemask)) ||
		    !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	if (!flipped_bit) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false, errno,
					       "Failed to write cpu list to \"%s/cpuset.cpus\"",
					       child_cgroup);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}

/* Copy the contents of @parent_cgroup/@file to @child_cgroup/@file. */
static bool copy_parent_file(const char *parent_cgroup,
			     const char *child_cgroup, const char *file)
{
	__do_free char *parent_file = NULL, *value = NULL;
	int len = 0;
	int ret;

	parent_file = must_make_path(parent_cgroup, file, NULL);
	len = lxc_read_from_file(parent_file, NULL, 0);
	if (len <= 0)
		return log_error_errno(false, errno,
				       "Failed to determine buffer size");

	value = must_realloc(NULL, len + 1);
	value[len] = '\0';
	ret = lxc_read_from_file(parent_file, value, len);
	if (ret != len)
		return log_error_errno(false, errno,
				       "Failed to read from parent file \"%s\"",
				       parent_file);

	ret = lxc_write_openat(child_cgroup, file, value, len);
	if (ret < 0 && errno != EACCES)
		return log_error_errno(false, errno,
				       "Failed to write \"%s\" to file \"%s/%s\"",
				       value, child_cgroup, file);
	return true;
}

static bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->version == CGROUP2_SUPER_MAGIC;
}

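/*
 * Illustrative example: for a leaf like "lxc.payload.c1/nested" only the
 * first path component, "lxc.payload.c1", is created and initialized by the
 * function below; deeper components are left to the regular cgroup creation
 * path. (The concrete names here are examples only.)
 */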
/*
 * Initialize the cpuset hierarchy in the first directory of @cgroup_leaf and
 * set cgroup.clone_children so that children inherit settings. Since the
 * h->container_base_path is populated by init or ourselves, we know it is
 * already initialized.
 *
 * Returns -1 on error, 0 when we didn't create a cgroup, and 1 if we created
 * a cgroup.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
					     const char *cgroup_leaf)
{
	__do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
	__do_close_prot_errno int cgroup_fd = -EBADF;
	int fret = -1;
	int ret;
	char v;
	char *leaf, *slash;

	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	if (!cgroup_leaf)
		return ret_set_errno(-1, EINVAL);

	dup = strdup(cgroup_leaf);
	if (!dup)
		return ret_set_errno(-1, ENOMEM);

	parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);

	leaf = dup;
	leaf += strspn(leaf, "/");
	slash = strchr(leaf, '/');
	if (slash)
		*slash = '\0';
	child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
	if (slash)
		*slash = '/';

	fret = 1;
	ret = mkdir(child_cgroup, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);

		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(child_cgroup);
	if (cgroup_fd < 0)
		return -1;

	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* copy parent's settings */
	if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);

	return fret;
}

/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	int i;

	if (!l1 || !l2)
		return false;

	for (i = 0; l1[i]; i++) {
		if (string_in_list(l2, l1[i]))
			return true;
	}

	return false;
}

/* For a null-terminated list of controllers @clist, return true if any of those
 * controllers is already listed in the null-terminated list of hierarchies
 * @hlist. Realistically, if one is present, all must be present.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
	int i;

	if (!hlist)
		return false;

	for (i = 0; hlist[i]; i++)
		if (controller_lists_intersect(hlist[i]->controllers, clist))
			return true;

	return false;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_found(struct hierarchy **hlist, char *entry)
{
	int i;

	if (!hlist)
		return false;

	for (i = 0; hlist[i]; i++)
		if (string_in_list(hlist[i]->controllers, entry))
			return true;

	return false;
}

/* Return true if all of the controllers which we require have been found. The
 * required list is freezer and anything in lxc.cgroup.use.
 */
static bool all_controllers_found(struct cgroup_ops *ops)
{
	char **cur;
	struct hierarchy **hlist = ops->hierarchies;

	if (!ops->cgroup_use)
		return true;

	for (cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_found(hlist, *cur)) {
			ERROR("No %s controller mountpoint found", *cur);
			return false;
		}

	return true;
}

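/*
 * Illustrative mountinfo line (the fifth space-separated field is the mount
 * point that must live under /sys/fs/cgroup/):
 *
 *   34 25 0:29 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,cpu,cpuacct
 *
 * The concrete values above are made up; only the field layout matters here.
 */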
/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";
	char **aret = NULL;

	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) {
		ERROR("Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
		return NULL;
	}

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2) {
		ERROR("Corrupt mountinfo");
		return NULL;
	}
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	*p2 = ' ';

	return aret;
}

static char **cg_unified_make_empty_controller(void)
{
	int newentry;
	char **aret = NULL;

	newentry = append_null_to_list((void ***)&aret);
	aret[newentry] = NULL;
	return aret;
}

static char **cg_unified_get_controllers(const char *file)
{
	__do_free char *buf = NULL;
	char *sep = " \t\n";
	char **aret = NULL;
	char *tok;

	buf = read_file(file);
	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, sep) {
		int newentry;
		char *copy;

		newentry = append_null_to_list((void ***)&aret);
		copy = must_copy_string(tok);
		aret[newentry] = copy;
	}

	return aret;
}

static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
				       char *container_base_path, int type)
{
	struct hierarchy *new;
	int newentry;

	new = must_realloc(NULL, sizeof(*new));
	new->controllers = clist;
	new->mountpoint = mountpoint;
	new->container_base_path = container_base_path;
	new->container_full_path = NULL;
	new->monitor_full_path = NULL;
	new->version = type;
	new->cgroup2_chown = NULL;

	newentry = append_null_to_list((void ***)h);
	(*h)[newentry] = new;
	return new;
}

/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int i;
	size_t len;
	char *p2;
	char *p = line, *sret = NULL;

	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;
	*p2 = '\0';

	len = strlen(p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}

/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
	char *p2 = strchr(p, '\n'), *sret;
	size_t len;

	if (!p2)
		return NULL;

	len = p2 - p;
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}
835
bced39de
CB
836/* cgline: pointer to character after the first ':' in a line in a \n-terminated
837 * /proc/self/cgroup file. Check whether controller c is present.
ccb4cabe
SH
838 */
839static bool controller_in_clist(char *cgline, char *c)
840{
d97919ab
CB
841 __do_free char *tmp = NULL;
842 char *tok, *eol;
ccb4cabe
SH
843 size_t len;
844
235f1815 845 eol = strchr(cgline, ':');
ccb4cabe
SH
846 if (!eol)
847 return false;
848
849 len = eol - cgline;
861cb8c2 850 tmp = must_realloc(NULL, len + 1);
ccb4cabe
SH
851 memcpy(tmp, cgline, len);
852 tmp[len] = '\0';
853
d97919ab
CB
854 lxc_iterate_parts(tok, tmp, ",")
855 if (strcmp(tok, c) == 0)
ccb4cabe 856 return true;
d6337a5f 857
ccb4cabe
SH
858 return false;
859}
860
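/*
 * Illustrative /proc/self/cgroup content (the values are examples only):
 *
 *   12:cpu,cpuacct:/lxc/c1        <- legacy (v1) entry
 *   0::/some/path                 <- unified (v2) entry
 */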
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
					  int type)
{
	char *p = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
			is_cgv2_base_cgroup = true;

		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}

static void must_append_string(char ***list, char *entry)
{
	int newentry;
	char *copy;

	newentry = append_null_to_list((void ***)list);
	copy = must_copy_string(entry);
	(*list)[newentry] = copy;
}

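/*
 * Parse /proc/self/cgroup and collect the currently mounted subsystems:
 * kernel controllers go into @klist, "name=" controllers into @nlist, and a
 * cgroup2 entry ("0::/...") is recorded as the pseudo controller "cgroup2".
 */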
d6337a5f 906static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 907{
d97919ab
CB
908 __do_free char *line = NULL;
909 __do_fclose FILE *f = NULL;
ccb4cabe
SH
910 size_t len = 0;
911
d6337a5f
CB
912 f = fopen("/proc/self/cgroup", "r");
913 if (!f)
914 return -1;
915
ccb4cabe 916 while (getline(&line, &len, f) != -1) {
0be0d78f 917 char *p, *p2, *tok;
235f1815 918 p = strchr(line, ':');
ccb4cabe
SH
919 if (!p)
920 continue;
921 p++;
235f1815 922 p2 = strchr(p, ':');
ccb4cabe
SH
923 if (!p2)
924 continue;
925 *p2 = '\0';
ff8d6ee9 926
6328fd9c
CB
927 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
928 * contains an entry of the form:
ff8d6ee9
CB
929 *
930 * 0::/some/path
931 *
6328fd9c 932 * In this case we use "cgroup2" as controller name.
ff8d6ee9 933 */
6328fd9c
CB
934 if ((p2 - p) == 0) {
935 must_append_string(klist, "cgroup2");
ff8d6ee9 936 continue;
6328fd9c 937 }
ff8d6ee9 938
0be0d78f 939 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
940 if (strncmp(tok, "name=", 5) == 0)
941 must_append_string(nlist, tok);
942 else
943 must_append_string(klist, tok);
944 }
945 }
946
d6337a5f 947 return 0;
ccb4cabe
SH
948}

static void trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 1) && (s[len - 1] == '\n'))
		s[--len] = '\0';
}

static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
{
	int i;
	struct hierarchy **it;

	if (!ops->hierarchies) {
		TRACE("  No hierarchies found");
		return;
	}

	TRACE("  Hierarchies:");
	for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
		int j;
		char **cit;

		TRACE("  %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
		TRACE("      mountpoint:  %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
		TRACE("      controllers:");
		for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
			TRACE("      %d: %s", j, *cit);
	}
}

static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}

static int cgroup_rmdir(struct hierarchy **hierarchies,
			const char *container_cgroup)
{
	int i;

	if (!container_cgroup || !hierarchies)
		return 0;

	for (i = 0; hierarchies[i]; i++) {
		int ret;
		struct hierarchy *h = hierarchies[i];

		if (!h->container_full_path)
			continue;

		ret = recursive_destroy(h->container_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->container_full_path);

		free(h->container_full_path);
		h->container_full_path = NULL;
	}

	return 0;
}

struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *container_cgroup;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};

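/*
 * Helper run via userns_exec_1(): switch to the uid/gid the container's root
 * maps to (0 when an id mapping for root exists, otherwise init_uid/init_gid)
 * before recursively removing the container's cgroups.
 */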
static int cgroup_rmdir_wrapper(void *data)
{
	int ret;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0) {
		SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
			 (int)nsgid, (int)nsgid);
		return -1;
	}

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0) {
		SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
			 (int)nsuid, (int)nsuid);
		return -1;
	}

	ret = setgroups(0, NULL);
	if (ret < 0 && errno != EPERM) {
		SYSERROR("Failed to setgroups(0, NULL)");
		return -1;
	}

	return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;
	struct generic_userns_exec_data wrap;

	if (!ops)
		log_error_errno(return, ENOENT, "Called with uninitialized cgroup operations");

	if (!ops->hierarchies)
		return;

	if (!handler)
		log_error_errno(return, EINVAL, "Called with uninitialized handler");

	if (!handler->conf)
		log_error_errno(return, EINVAL, "Called with uninitialized conf");

	wrap.origuid = 0;
	wrap.container_cgroup = ops->container_cgroup;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = handler->conf;

#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");
#endif

	if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	else
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	if (ret < 0) {
		WARN("Failed to destroy cgroups");
		return;
	}
}

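/*
 * The monitor cannot remove a cgroup it still lives in. So each hierarchy's
 * monitor cgroup is destroyed by first moving the monitor process into a
 * separate pivot cgroup (CGROUP_PIVOT) and then recursively removing its old
 * monitor cgroup.
 */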
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	struct lxc_conf *conf;

	if (!ops)
		log_error_errno(return, ENOENT, "Called with uninitialized cgroup operations");

	if (!ops->hierarchies)
		return;

	if (!handler)
		log_error_errno(return, EINVAL, "Called with uninitialized handler");

	if (!handler->conf)
		log_error_errno(return, EINVAL, "Called with uninitialized conf");

	conf = handler->conf;

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *pivot_path = NULL;
		char pivot_cgroup[] = CGROUP_PIVOT;
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (!h->monitor_full_path)
			continue;

		if (conf && conf->cgroup_meta.dir)
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    conf->cgroup_meta.dir,
						    CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    CGROUP_PIVOT, NULL);

		/*
		 * Make sure not to pass in the ro string literal CGROUP_PIVOT
		 * here.
		 */
		if (cg_legacy_handle_cpuset_hierarchy(h, pivot_cgroup) < 0)
			log_warn_errno(continue, errno, "Failed to handle legacy cpuset controller");

		ret = mkdir_p(pivot_path, 0755);
		if (ret < 0 && errno != EEXIST)
			log_warn_errno(continue, errno,
				       "Failed to create cgroup \"%s\"\n",
				       pivot_path);

		/*
		 * Move ourselves into the pivot cgroup to delete our own
		 * cgroup.
		 */
		ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
		if (ret != 0)
			log_warn_errno(continue, errno,
				       "Failed to move monitor %s to \"%s\"\n",
				       pidstr, pivot_path);

		ret = recursive_destroy(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}

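/*
 * Create each component of @dir with mkdir(). An EEXIST error is tolerated
 * for intermediate components but treated as a failure for the final one.
 */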
static int mkdir_eexist_on_last(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	size_t orig_len;

	orig_len = strlen(dir);
	do {
		__do_free char *makeme = NULL;
		int ret;
		size_t cur_len;

		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		errno = ENOMEM;
		cur_len = dir - orig;
		makeme = strndup(orig, cur_len);
		if (!makeme)
			return -1;

		ret = mkdir(makeme, mode);
		if (ret < 0) {
			if ((errno != EEXIST) || (orig_len == cur_len)) {
				SYSERROR("Failed to create directory \"%s\"", makeme);
				return -1;
			}
		}
	} while (tmp != dir);

	return 0;
}

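/*
 * Create the cgroup @cgroup_leaf under @h->mountpoint/@h->container_base_path
 * and remember the resulting path in h->container_full_path (@payload true)
 * or h->monitor_full_path (@payload false).
 */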
static bool create_cgroup_tree(struct hierarchy *h, const char *cgroup_tree,
			       const char *cgroup_leaf, bool payload)
{
	__do_free char *path = NULL;
	int ret, ret_cpuset;

	path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
	if (dir_exists(path))
		return log_warn_errno(false, errno, "The %s cgroup already existed", path);

	ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
	if (ret_cpuset < 0)
		return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");

	ret = mkdir_eexist_on_last(path, 0755);
	if (ret < 0) {
		/*
		 * This is the cpuset controller and
		 * cg_legacy_handle_cpuset_hierarchy() has created our target
		 * directory for us to ensure correct initialization.
		 */
		if (ret_cpuset != 1 || cgroup_tree)
			return log_error_errno(false, errno, "Failed to create %s cgroup", path);
	}

	if (payload)
		h->container_full_path = move_ptr(path);
	else
		h->monitor_full_path = move_ptr(path);

	return true;
}

static void cgroup_remove_leaf(struct hierarchy *h, bool payload)
{
	__do_free char *full_path = NULL;

	if (payload)
		full_path = h->container_full_path;
	else
		full_path = h->monitor_full_path;

	if (rmdir(full_path))
		SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);

	if (payload)
		h->container_full_path = NULL;
	else
		h->monitor_full_path = NULL;
}

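/*
 * The monitor cgroup name is built from DEFAULT_MONITOR_CGROUP_PREFIX, the
 * container name and a spare CGROUP_CREATE_RETRY suffix. If creation fails in
 * some hierarchy, the suffix is rewritten to "-1", "-2", ... up to "-999"
 * before giving up.
 */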
__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;
	cgroup_tree = conf->cgroup_meta.dir;

	if (cgroup_tree)
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	else
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;
	cgroup_tree = conf->cgroup_meta.dir;

	if (cgroup_tree)
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	else
		container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	if (!container_cgroup)
		return ret_set_errno(false, ENOMEM);

	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	if (ops->unified && ops->unified->container_full_path) {
		int ret;

		ret = open(ops->unified->container_full_path,
			   O_DIRECTORY | O_RDONLY | O_CLOEXEC);
		if (ret < 0)
			return log_error_errno(false, errno,
					       "Failed to open file descriptor for unified hierarchy");
		ops->unified_fd = ret;
	}

	ops->container_cgroup = move_ptr(container_cgroup);
	INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
	return true;
}

1401}
1402
c581d2a6
CB
1403__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1404 struct lxc_handler *handler)
ccb4cabe 1405{
c581d2a6
CB
1406 int monitor_len, transient_len;
1407 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1408 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1409
797fa65e
CB
1410 if (!ops)
1411 return ret_set_errno(false, ENOENT);
1412
69b4a4bb
CB
1413 if (!ops->hierarchies)
1414 return true;
1415
797fa65e
CB
1416 if (!ops->monitor_cgroup)
1417 return ret_set_errno(false, ENOENT);
1418
1419 if (!handler || !handler->conf)
1420 return ret_set_errno(false, EINVAL);
1421
c581d2a6
CB
1422 monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1423 if (handler->transient_pid > 0)
1424 transient_len = snprintf(transient, sizeof(transient), "%d",
1425 handler->transient_pid);
ccb4cabe 1426
eeef32bb 1427 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1428 __do_free char *path = NULL;
c581d2a6 1429 int ret;
08768001 1430
c581d2a6
CB
1431 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1432 "cgroup.procs", NULL);
1433 ret = lxc_writeat(-1, path, monitor, monitor_len);
1434 if (ret != 0)
1435 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
1436
1437 if (handler->transient_pid < 0)
1438 return true;
1439
1440 ret = lxc_writeat(-1, path, transient, transient_len);
1441 if (ret != 0)
1442 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
ccb4cabe 1443 }
c581d2a6 1444 handler->transient_pid = -1;
ccb4cabe
SH
1445
1446 return true;
1447}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *path = NULL;
		int ret;

		path = must_make_path(ops->hierarchies[i]->container_full_path,
				      "cgroup.procs", NULL);
		ret = lxc_writeat(-1, path, pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
	}

	return true;
}

static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	int ret;

	ret = chown(path, chown_uid, chown_gid);
	if (ret < 0) {
		SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
		return -1;
	}

	ret = chmod(path, chmod_mode);
	if (ret < 0) {
		SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
		return -1;
	}

	return 0;
}

/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno,
				       "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno,
				       "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	ret = setgroups(0, NULL);
	if (ret < 0 && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		char *path = arg->hierarchies[i]->container_full_path;

		ret = chowmod(path, destuid, nsgid, 0775);
		if (ret < 0)
			log_info_errno(continue, errno,
				       "Failed to change %s to uid %d and gid %d and mode 0775",
				       path, destuid, nsgid);

		/* Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
			fullpath = must_make_path(path, "tasks", NULL);
			ret = chowmod(fullpath, destuid, nsgid, 0664);
			if (ret < 0)
				SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
					fullpath, destuid, nsgid);
		}

		fullpath = must_make_path(path, "cgroup.procs", NULL);
		ret = chowmod(fullpath, destuid, nsgid, 0664);
		if (ret < 0)
			SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
				fullpath, destuid, nsgid);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++) {
			fullpath = must_make_path(path, *p, NULL);
			ret = chowmod(fullpath, destuid, nsgid, 0664);
			if (ret < 0)
				SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
					fullpath, destuid, nsgid);
		}
	}

	return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
				      struct lxc_conf *conf)
{
	struct generic_userns_exec_data wrap;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (lxc_list_empty(&conf->id_map))
		return true;

	wrap.origuid = geteuid();
	wrap.path = NULL;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = conf;

	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
		return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

	return true;
}

/* cgroup-full:* is done, no need to create subdirs */
static bool cg_mount_needs_subdirs(int type)
{
	if (type >= LXC_AUTO_CGROUP_FULL_RO)
		return false;

	return true;
}

/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0) {
			SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
				 controllerpath, controllerpath);
			return -1;
		}

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0) {
			SYSERROR("Failed to remount \"%s\" ro", controllerpath);
			return -1;
		}

		INFO("Remounted %s read-only", controllerpath);
	}

	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0) {
		SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
		return -1;
	}
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0) {
			SYSERROR("Failed to remount \"%s\" ro", cgpath);
			return -1;
		}
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}

/* __cg_mount_direct
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cg_mount_direct(int type, struct hierarchy *h,
			     const char *controllerpath)
{
	__do_free char *controllers = NULL;
	char *fstype = "cgroup2";
	unsigned long flags = 0;
	int ret;

	flags |= MS_NOSUID;
	flags |= MS_NOEXEC;
	flags |= MS_NODEV;
	flags |= MS_RELATIME;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
		flags |= MS_RDONLY;

	if (h->version != CGROUP2_SUPER_MAGIC) {
		controllers = lxc_string_join(",", (const char **)h->controllers, false);
		if (!controllers)
			return -ENOMEM;
		fstype = "cgroup";
	}

	ret = mount("cgroup", controllerpath, fstype, flags, controllers);
	if (ret < 0) {
		SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
		return -1;
	}

	DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
	return 0;
}

static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}

static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
				       const char *controllerpath)
{
	if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
		return 0;

	return __cg_mount_direct(type, h, controllerpath);
}

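/*
 * Set up the cgroup mounts inside the container's rootfs: a tmpfs on
 * <root>/sys/fs/cgroup, one directory per hierarchy, and then either a direct
 * (non-bind) cgroup mount or a bind mount of the container's cgroup,
 * depending on cgroup namespace support, CAP_SYS_ADMIN and the requested
 * LXC_AUTO_CGROUP_* mode.
 */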
b857f4be 1739__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
8d661d38
CB
1740 struct lxc_handler *handler,
1741 const char *root, int type)
ccb4cabe 1742{
6607d6e9 1743 __do_free char *cgroup_root = NULL;
dfa835ac 1744 int ret;
affd10fa 1745 bool has_cgns = false, retval = false, wants_force_mount = false;
8aa1044f 1746
9585ccb3
CB
1747 if (!ops)
1748 return ret_set_errno(false, ENOENT);
1749
69b4a4bb
CB
1750 if (!ops->hierarchies)
1751 return true;
1752
9585ccb3
CB
1753 if (!handler || !handler->conf)
1754 return ret_set_errno(false, EINVAL);
1755
8aa1044f
SH
1756 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1757 return true;
1758
3f69fb12
SY
1759 if (type & LXC_AUTO_CGROUP_FORCE) {
1760 type &= ~LXC_AUTO_CGROUP_FORCE;
1761 wants_force_mount = true;
1762 }
b635e92d 1763
3f69fb12
SY
1764 if (!wants_force_mount) {
1765 if (!lxc_list_empty(&handler->conf->keepcaps))
1766 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1767 else
1768 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1769 }
8aa1044f 1770
3f69fb12
SY
1771 has_cgns = cgns_supported();
1772 if (has_cgns && !wants_force_mount)
1773 return true;
8aa1044f
SH
1774
1775 if (type == LXC_AUTO_CGROUP_NOSPEC)
1776 type = LXC_AUTO_CGROUP_MIXED;
1777 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1778 type = LXC_AUTO_CGROUP_FULL_MIXED;
1779
dca9587a 1780 cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
8d661d38 1781 if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
8d661d38
CB
1782 if (has_cgns && wants_force_mount) {
1783 /* If cgroup namespaces are supported but the container
1784 * will not have CAP_SYS_ADMIN after it has started we
1785 * need to mount the cgroups manually.
1786 */
1787 return cg_mount_in_cgroup_namespace(type, ops->unified,
6607d6e9 1788 cgroup_root) == 0;
8d661d38
CB
1789 }
1790
6607d6e9 1791 return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
8d661d38
CB
1792 }
1793
1794 /* mount tmpfs */
6607d6e9 1795 ret = safe_mount(NULL, cgroup_root, "tmpfs",
3f69fb12
SY
1796 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1797 "size=10240k,mode=755", root);
1798 if (ret < 0)
1799 goto on_error;
8aa1044f 1800
dfa835ac 1801 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1802 __do_free char *controllerpath = NULL, *path2 = NULL;
2202afc9 1803 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1804 char *controller = strrchr(h->mountpoint, '/');
8aa1044f
SH
1805
1806 if (!controller)
1807 continue;
1808 controller++;
affd10fa 1809
6607d6e9 1810 controllerpath = must_make_path(cgroup_root, controller, NULL);
d97919ab 1811 if (dir_exists(controllerpath))
8aa1044f 1812 continue;
affd10fa 1813
3f69fb12 1814 ret = mkdir(controllerpath, 0755);
9585ccb3
CB
1815 if (ret < 0)
1816 log_error_errno(goto on_error, errno,
1817 "Error creating cgroup path: %s",
1818 controllerpath);
b635e92d 1819
3f69fb12 1820 if (has_cgns && wants_force_mount) {
b635e92d
CB
1821 /* If cgroup namespaces are supported but the container
1822 * will not have CAP_SYS_ADMIN after it has started we
1823 * need to mount the cgroups manually.
1824 */
3f69fb12 1825 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
3f69fb12
SY
1826 if (ret < 0)
1827 goto on_error;
1828
b635e92d
CB
1829 continue;
1830 }
1831
6812d833 1832 ret = cg_mount_cgroup_full(type, h, controllerpath);
d97919ab 1833 if (ret < 0)
3f69fb12 1834 goto on_error;
3f69fb12 1835
d97919ab 1836 if (!cg_mount_needs_subdirs(type))
8aa1044f 1837 continue;
3f69fb12 1838
bb221ad1 1839 path2 = must_make_path(controllerpath, h->container_base_path,
2202afc9 1840 ops->container_cgroup, NULL);
3f69fb12 1841 ret = mkdir_p(path2, 0755);
d97919ab 1842 if (ret < 0)
3f69fb12 1843 goto on_error;
2f62fb00 1844
6812d833 1845 ret = cg_legacy_mount_controllers(type, h, controllerpath,
2202afc9 1846 path2, ops->container_cgroup);
3f69fb12
SY
1847 if (ret < 0)
1848 goto on_error;
8aa1044f
SH
1849 }
1850 retval = true;
1851
3f69fb12 1852on_error:
8aa1044f 1853 return retval;
ccb4cabe
SH
1854}
1855
1856static int recursive_count_nrtasks(char *dirname)
1857{
d97919ab 1858 __do_free char *path = NULL;
88396101 1859 __do_closedir DIR *dir = NULL;
74f96976 1860 struct dirent *direntp;
ccb4cabe 1861 int count = 0, ret;
ccb4cabe
SH
1862
1863 dir = opendir(dirname);
1864 if (!dir)
1865 return 0;
1866
74f96976 1867 while ((direntp = readdir(dir))) {
ccb4cabe
SH
1868 struct stat mystat;
1869
ccb4cabe
SH
1870 if (!strcmp(direntp->d_name, ".") ||
1871 !strcmp(direntp->d_name, ".."))
1872 continue;
1873
1874 path = must_make_path(dirname, direntp->d_name, NULL);
1875
1876 if (lstat(path, &mystat))
d97919ab 1877 continue;
ccb4cabe
SH
1878
1879 if (!S_ISDIR(mystat.st_mode))
d97919ab 1880 continue;
ccb4cabe
SH
1881
1882 count += recursive_count_nrtasks(path);
ccb4cabe
SH
1883 }
1884
1885 path = must_make_path(dirname, "cgroup.procs", NULL);
1886 ret = lxc_count_file_lines(path);
1887 if (ret != -1)
1888 count += ret;
ccb4cabe
SH
1889
1890 return count;
1891}
1892
b857f4be 1893__cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
3135c5d4 1894{
d97919ab 1895 __do_free char *path = NULL;
ccb4cabe 1896
1aae36a9
CB
1897 if (!ops)
1898 return ret_set_errno(-1, ENOENT);
1899
2202afc9 1900 if (!ops->container_cgroup || !ops->hierarchies)
1aae36a9 1901 return ret_set_errno(-1, EINVAL);
a3926f6a 1902
eb697136 1903 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
3312a94f 1904 return recursive_count_nrtasks(path);
ccb4cabe
SH
1905}
1906
11c23867 1907/* Only root needs to escape to the cgroup of its init. */
b857f4be 1908__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
52d08ab0 1909 struct lxc_conf *conf)
ccb4cabe 1910{
52d08ab0
CB
1911 if (!ops)
1912 return ret_set_errno(false, ENOENT);
1913
1914 if (!ops->hierarchies)
1915 return true;
1916
1917 if (!conf)
1918 return ret_set_errno(false, EINVAL);
1919
1920 if (conf->cgroup_meta.relative || geteuid())
ccb4cabe
SH
1921 return true;
1922
779b3d82 1923 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1924 __do_free char *fullpath = NULL;
52d08ab0 1925 int ret;
11c23867 1926
52d08ab0
CB
1927 fullpath =
1928 must_make_path(ops->hierarchies[i]->mountpoint,
1929 ops->hierarchies[i]->container_base_path,
1930 "cgroup.procs", NULL);
7cea5905 1931 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
52d08ab0
CB
1932 if (ret != 0)
1933 return log_error_errno(false,
1934 errno, "Failed to escape to cgroup \"%s\"",
1935 fullpath);
ccb4cabe
SH
1936 }
1937
6df334d1 1938 return true;
ccb4cabe
SH
1939}
1940
b857f4be 1941__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
36662416 1942{
69b4a4bb
CB
1943 int i = 0;
1944
e3ffb28b
CB
1945 if (!ops)
1946 return ret_set_errno(-1, ENOENT);
1947
69b4a4bb
CB
1948 if (!ops->hierarchies)
1949 return 0;
36662416 1950
69b4a4bb 1951 for (; ops->hierarchies[i]; i++)
36662416
TA
1952 ;
1953
1954 return i;
1955}
1956
aa48a34f
CB
1957__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
1958 char ***out)
36662416
TA
1959{
1960 int i;
1961
aa48a34f
CB
1962 if (!ops)
1963 return ret_set_errno(false, ENOENT);
1964
69b4a4bb
CB
1965 if (!ops->hierarchies)
1966 return false;
1967
36662416 1968 /* sanity check n */
6b38e644 1969 for (i = 0; i < n; i++)
2202afc9 1970 if (!ops->hierarchies[i])
aa48a34f 1971 return ret_set_errno(false, ENOENT);
36662416 1972
2202afc9 1973 *out = ops->hierarchies[i]->controllers;
36662416
TA
1974
1975 return true;
1976}
1977
ee3a7775 1978static bool cg_legacy_freeze(struct cgroup_ops *ops)
ccb4cabe 1979{
d6337a5f 1980 struct hierarchy *h;
ccb4cabe 1981
ee3a7775
CB
1982 h = get_hierarchy(ops, "freezer");
1983 if (!h)
d2203230 1984 return ret_set_errno(-1, ENOENT);
81468ea7 1985
c04a6d4e
CB
1986 return lxc_write_openat(h->container_full_path, "freezer.state",
1987 "FROZEN", STRLITERALLEN("FROZEN"));
ee3a7775 1988}
942e193e 1989
018051e3
CB
1990static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1991 struct lxc_epoll_descr *descr)
ee3a7775 1992{
018051e3
CB
1993 __do_close_prot_errno int duped_fd = -EBADF;
1994 __do_free char *line = NULL;
ee3a7775 1995 __do_fclose FILE *f = NULL;
018051e3
CB
1996 int state = PTR_TO_INT(cbdata);
1997 size_t len;
1998 const char *state_string;
1999
2000 duped_fd = dup(fd);
2001 if (duped_fd < 0)
2002 return LXC_MAINLOOP_ERROR;
2003
2004 if (lseek(duped_fd, 0, SEEK_SET) < 0)
2005 return LXC_MAINLOOP_ERROR;
2006
2007 f = fdopen(duped_fd, "re");
2008 if (!f)
2009 return LXC_MAINLOOP_ERROR;
2010 move_fd(duped_fd);
2011
2012 if (state == 1)
2013 state_string = "frozen 1";
2014 else
2015 state_string = "frozen 0";
2016
2017 while (getline(&line, &len, f) != -1)
2018 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
2019 return LXC_MAINLOOP_CLOSE;
2020
2021 return LXC_MAINLOOP_CONTINUE;
2022}
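/* For reference, on kernels that implement the cgroup2 freezer the
 * cgroup.events file read above contains key/value lines such as
 *
 *	populated 1
 *	frozen 0
 *
 * so matching on "frozen 1"/"frozen 0" is enough to detect the state change
 * the mainloop is waiting for.
 */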
2023
2024static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
2025{
2026 __do_close_prot_errno int fd = -EBADF;
018051e3
CB
2027 __do_lxc_mainloop_close struct lxc_epoll_descr *descr_ptr = NULL;
2028 int ret;
2029 struct lxc_epoll_descr descr;
ee3a7775 2030 struct hierarchy *h;
942e193e
CB
2031
2032 h = ops->unified;
457ca9aa 2033 if (!h)
d2203230 2034 return ret_set_errno(-1, ENOENT);
d6337a5f 2035
018051e3 2036 if (!h->container_full_path)
d2203230 2037 return ret_set_errno(-1, EEXIST);
d6337a5f 2038
018051e3
CB
2039 if (timeout != 0) {
2040 __do_free char *events_file = NULL;
942e193e 2041
018051e3
CB
2042 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
2043 fd = open(events_file, O_RDONLY | O_CLOEXEC);
2044 if (fd < 0)
d2203230 2045 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
942e193e 2046
018051e3
CB
2047 ret = lxc_mainloop_open(&descr);
2048 if (ret)
d2203230 2049 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");
942e193e 2050
018051e3
CB
2051 /* automatically cleaned up now */
2052 descr_ptr = &descr;
942e193e 2053
018051e3
CB
2054 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
2055 if (ret < 0)
d2203230 2056 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
018051e3 2057 }
942e193e 2058
c04a6d4e 2059 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
018051e3 2060 if (ret < 0)
d2203230 2061 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
018051e3
CB
2062
2063 if (timeout != 0 && lxc_mainloop(&descr, timeout))
d2203230 2064 return log_error_errno(-1, errno, "Failed to wait for container to be frozen");
018051e3
CB
2065
2066 return 0;
942e193e
CB
2067}
2068
018051e3 2069__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 2070{
81468ea7 2071 if (!ops->hierarchies)
d2203230 2072 return ret_set_errno(-1, ENOENT);
81468ea7 2073
ee3a7775
CB
2074 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2075 return cg_legacy_freeze(ops);
942e193e 2076
018051e3 2077 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
2078}
2079
018051e3 2080static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 2081{
ee3a7775
CB
2082 struct hierarchy *h;
2083
2084 h = get_hierarchy(ops, "freezer");
2085 if (!h)
d2203230 2086 return ret_set_errno(-1, ENOENT);
ee3a7775 2087
c04a6d4e
CB
2088 return lxc_write_openat(h->container_full_path, "freezer.state",
2089 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
2090}
2091
018051e3 2092static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775 2093{
018051e3 2094 __do_close_prot_errno int fd = -EBADF;
018051e3
CB
2095 __do_lxc_mainloop_close struct lxc_epoll_descr *descr_ptr = NULL;
2096 int ret;
2097 struct lxc_epoll_descr descr;
ee3a7775 2098 struct hierarchy *h;
942e193e
CB
2099
2100 h = ops->unified;
2101 if (!h)
d2203230 2102 return ret_set_errno(-1, ENOENT);
018051e3
CB
2103
2104 if (!h->container_full_path)
d2203230 2105 return ret_set_errno(-1, EEXIST);
018051e3
CB
2106
2107 if (timeout != 0) {
2108 __do_free char *events_file = NULL;
2109
2110 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
2111 fd = open(events_file, O_RDONLY | O_CLOEXEC);
2112 if (fd < 0)
d2203230 2113 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
018051e3
CB
2114
2115 ret = lxc_mainloop_open(&descr);
2116 if (ret)
d2203230 2117 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");
018051e3
CB
2118
2119 /* automatically cleaned up now */
2120 descr_ptr = &descr;
2121
2122 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
2123 if (ret < 0)
d2203230 2124 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
018051e3 2125 }
942e193e 2126
c04a6d4e 2127 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
018051e3 2128 if (ret < 0)
d2203230 2129 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
018051e3
CB
2130
2131 if (timeout != 0 && lxc_mainloop(&descr, timeout))
d2203230 2132 return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");
018051e3
CB
2133
2134 return 0;
ee3a7775
CB
2135}
2136
018051e3 2137__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2138{
2139 if (!ops->hierarchies)
d2203230 2140 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2141
2142 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2143 return cg_legacy_unfreeze(ops);
2144
018051e3 2145 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2146}
2147
b857f4be 2148__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
6bdf9691 2149 const char *controller)
ccb4cabe 2150{
d6337a5f
CB
2151 struct hierarchy *h;
2152
2202afc9 2153 h = get_hierarchy(ops, controller);
6bdf9691
CB
2154 if (!h)
2155 return log_warn_errno(NULL,
2156 ENOENT, "Failed to find hierarchy for controller \"%s\"",
2157 controller ? controller : "(null)");
ccb4cabe 2158
6bdf9691
CB
2159 return h->container_full_path
2160 ? h->container_full_path + strlen(h->mountpoint)
2161 : NULL;
371f834d
SH
2162}
2163
c40c8209
CB
2164/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2165 * which must be freed by the caller.
371f834d 2166 */
c40c8209
CB
2167static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2168 const char *inpath,
2169 const char *filename)
371f834d 2170{
371f834d 2171 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2172}
2173
900b6606 2174static int cgroup_attach_leaf(int unified_fd, int64_t pid)
c2aed66d 2175{
ad275c16 2176 int idx = 1;
c2aed66d 2177 int ret;
900b6606 2178 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
ad275c16 2179 char attach_cgroup[STRLITERALLEN("lxc-1000/cgroup.procs") + 1];
900b6606 2180 size_t pidstr_len;
c2aed66d 2181
ad275c16
CB
2182 /* Create leaf cgroup. */
2183 ret = mkdirat(unified_fd, "lxc", 0755);
2184 if (ret < 0 && errno != EEXIST)
2185 return log_error_errno(-1, errno, "Failed to create leaf cgroup \"lxc\"");
2186
900b6606 2187 pidstr_len = sprintf(pidstr, INT64_FMT, pid);
ad275c16
CB
2188 ret = lxc_writeat(unified_fd, "lxc/cgroup.procs", pidstr, pidstr_len);
2189 if (ret < 0)
2190 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
c2aed66d 2191 if (ret == 0)
bad788b0 2192 return 0;
ad275c16 2193
bad788b0
CB
2194 /* this is a non-leaf node */
2195 if (errno != EBUSY)
d2203230 2196 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2197
c2aed66d 2198 do {
bad788b0 2199 char *slash;
c2aed66d 2200
ad275c16 2201 sprintf(attach_cgroup, "lxc-%d/cgroup.procs", idx);
bad788b0
CB
2202 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs");
2203 *slash = '\0';
ad275c16 2204
bad788b0 2205 ret = mkdirat(unified_fd, attach_cgroup, 0755);
c2aed66d 2206 if (ret < 0 && errno != EEXIST)
d2203230 2207 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
c2aed66d 2208
bad788b0 2209 *slash = '/';
ad275c16 2210
bad788b0 2211 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
c2aed66d 2212 if (ret == 0)
bad788b0 2213 return 0;
c2aed66d
CB
2214
2215 /* this is a non-leaf node */
2216 if (errno != EBUSY)
d2203230 2217 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2218
edae86e9
CB
2219 idx++;
2220 } while (idx < 1000);
c2aed66d 2221
ad275c16 2222 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d
CB
2223}
2224
900b6606
CB
2225int cgroup_attach(const char *name, const char *lxcpath, int64_t pid)
2226{
2227 __do_close_prot_errno int unified_fd = -EBADF;
2228
2229 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
2230 if (unified_fd < 0)
2231 return -1;
2232
2233 return cgroup_attach_leaf(unified_fd, pid);
2234}
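/* Usage sketch (container name and path are made up): attaching an external
 * task to a running container's unified cgroup comes down to
 *
 *	if (cgroup_attach("c1", "/var/lib/lxc", pid) < 0)
 *		SYSERROR("Failed to attach %d to the unified cgroup of \"c1\"", (int)pid);
 */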
2235
2236/* Technically, we're always at a delegation boundary here (This is especially
2237 * true when cgroup namespaces are available.). The reasoning is that in order
2238 * for us to have been able to start a container in the first place the root
2239 * cgroup must have been a leaf node. Now, either the container's init system
2240 * has populated the cgroup and kept it as a leaf node or it has created
2241 * subtrees. In the former case we will simply attach to the leaf node we
2242 * created when we started the container; in the latter case we create our own
2243 * cgroup for the attaching process.
2244 */
2245static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2246 const char *lxcpath, pid_t pid,
2247 const char *controller)
2248{
2249 __do_close_prot_errno int unified_fd = -EBADF;
2250 int ret;
2251
2252 ret = cgroup_attach(name, lxcpath, pid);
2253 if (ret < 0) {
2254 __do_free char *path = NULL, *cgroup = NULL;
2255
2256 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2257 /* not running */
2258 if (!cgroup)
2259 return 0;
2260
2261 path = must_make_path(h->mountpoint, cgroup, NULL);
2262 unified_fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2263 }
2264 if (unified_fd < 0)
2265 return -1;
2266
2267 return cgroup_attach_leaf(unified_fd, pid);
2268}
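/* Illustration of the fallback above: if the container's cgroup is no longer a
 * leaf, cgroup_attach_leaf() first tries to join "lxc/cgroup.procs" below it,
 * then "cgroup.procs" itself, and on EBUSY falls back to creating
 * "lxc-1/cgroup.procs", "lxc-2/cgroup.procs", ... until a writable leaf is
 * found, so the attaching task never violates the cgroup2 no-internal-process
 * constraint.
 */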
2269
b857f4be 2270__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
fb55e009 2271 const char *lxcpath, pid_t pid)
ccb4cabe 2272{
81b5d48a 2273 int len, ret;
a3650c0c 2274 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 2275
ab9a452d
CB
2276 if (!ops)
2277 return ret_set_errno(false, ENOENT);
2278
69b4a4bb
CB
2279 if (!ops->hierarchies)
2280 return true;
2281
a3650c0c
CB
2282 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2283 if (len < 0 || (size_t)len >= sizeof(pidstr))
ccb4cabe
SH
2284 return false;
2285
81b5d48a 2286 for (int i = 0; ops->hierarchies[i]; i++) {
c05b17bd 2287 __do_free char *fullpath = NULL, *path = NULL;
2202afc9 2288 struct hierarchy *h = ops->hierarchies[i];
ccb4cabe 2289
c2aed66d 2290 if (h->version == CGROUP2_SUPER_MAGIC) {
900b6606 2291 ret = __cg_unified_attach(h, name, lxcpath, pid,
a3926f6a 2292 h->controllers[0]);
c2aed66d
CB
2293 if (ret < 0)
2294 return false;
2295
2296 continue;
2297 }
2298
ccb4cabe 2299 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2300 /* not running */
2301 if (!path)
e2cb2e74 2302 return false;
ccb4cabe 2303
371f834d 2304 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
7cea5905 2305 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
ab9a452d
CB
2306 if (ret < 0)
2307 return log_error_errno(false, errno,
2308 "Failed to attach %d to %s",
2309 (int)pid, fullpath);
ccb4cabe
SH
2310 }
2311
ccb4cabe
SH
2312 return true;
2313}
2314
e2bd2b13
CB
2315/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2316 * don't have a cgroup_data set up, so we ask the running container through the
2317 * commands API for the cgroup path.
ccb4cabe 2318 */
b857f4be 2319__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2320 char *value, size_t len, const char *name,
2321 const char *lxcpath)
ccb4cabe 2322{
d97919ab 2323 __do_free char *path = NULL;
88396101 2324 __do_free char *controller = NULL;
d97919ab 2325 char *p;
0069cc61 2326 struct hierarchy *h;
861cb8c2 2327 int ret = -1;
ccb4cabe 2328
a358028a
CB
2329 if (!ops)
2330 return ret_set_errno(-1, ENOENT);
2331
861cb8c2 2332 controller = must_copy_string(filename);
0069cc61
CB
2333 p = strchr(controller, '.');
2334 if (p)
ccb4cabe
SH
2335 *p = '\0';
2336
0069cc61
CB
2337 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2338 /* not running */
2339 if (!path)
ccb4cabe
SH
2340 return -1;
2341
2202afc9 2342 h = get_hierarchy(ops, controller);
ccb4cabe 2343 if (h) {
88396101 2344 __do_free char *fullpath = NULL;
0069cc61
CB
2345
2346 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2347 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2348 }
ccb4cabe
SH
2349
2350 return ret;
2351}
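/* Usage sketch, assuming ops was obtained from cgroup_init()/cgfsng_ops_init()
 * and the container "c1" in "/var/lib/lxc" is running (names are illustrative):
 *
 *	char buf[4096];
 *	int ret = ops->get(ops, "memory.limit_in_bytes", buf, sizeof(buf),
 *			   "c1", "/var/lib/lxc");
 *	if (ret < 0)
 *		ERROR("Failed to read \"memory.limit_in_bytes\"");
 */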
2352
cb3fc90c
CB
2353static int device_cgroup_parse_access(struct device_item *device, const char *val)
2354{
2355 for (int count = 0; count < 3; count++, val++) {
2356 switch (*val) {
2357 case 'r':
2358 device->access[count] = *val;
2359 break;
2360 case 'w':
2361 device->access[count] = *val;
2362 break;
2363 case 'm':
2364 device->access[count] = *val;
2365 break;
2366 case '\n':
2367 case '\0':
2368 count = 3;
2369 break;
2370 default:
2371 return ret_errno(EINVAL);
2372 }
2373 }
2374
2375 return 0;
2376}
2377
2a63b5cb
CB
2378static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2379 const char *val)
2380{
2381 int count, ret;
2382 char temp[50];
2383
2384 if (strcmp("devices.allow", key) == 0)
2385 device->allow = 1;
2386 else
2387 device->allow = 0;
2388
2389 if (strcmp(val, "a") == 0) {
2390 /* global rule */
2391 device->type = 'a';
2392 device->major = -1;
2393 device->minor = -1;
fda39d45
CB
2394 device->global_rule = device->allow
2395 ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
2396 : LXC_BPF_DEVICE_CGROUP_WHITELIST;
2a63b5cb
CB
2397 device->allow = -1;
2398 return 0;
2399 } else {
fda39d45 2400 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2a63b5cb
CB
2401 }
2402
2403 switch (*val) {
2404 case 'a':
2405 __fallthrough;
2406 case 'b':
2407 __fallthrough;
2408 case 'c':
2409 device->type = *val;
2410 break;
2411 default:
2412 return -1;
2413 }
2414
2415 val++;
2416 if (!isspace(*val))
2417 return -1;
2418 val++;
2419 if (*val == '*') {
2420 device->major = -1;
2421 val++;
2422 } else if (isdigit(*val)) {
2423 memset(temp, 0, sizeof(temp));
2424 for (count = 0; count < sizeof(temp) - 1; count++) {
2425 temp[count] = *val;
2426 val++;
2427 if (!isdigit(*val))
2428 break;
2429 }
2430 ret = lxc_safe_int(temp, &device->major);
2431 if (ret)
2432 return -1;
2433 } else {
2434 return -1;
2435 }
2436 if (*val != ':')
2437 return -1;
2438 val++;
2439
2440 /* read minor */
2441 if (*val == '*') {
2442 device->minor = -1;
2443 val++;
2444 } else if (isdigit(*val)) {
2445 memset(temp, 0, sizeof(temp));
2446 for (count = 0; count < sizeof(temp) - 1; count++) {
2447 temp[count] = *val;
2448 val++;
2449 if (!isdigit(*val))
2450 break;
2451 }
2452 ret = lxc_safe_int(temp, &device->minor);
2453 if (ret)
2454 return -1;
2455 } else {
2456 return -1;
2457 }
2458 if (!isspace(*val))
2459 return -1;
2a63b5cb 2460
cb3fc90c 2461 return device_cgroup_parse_access(device, ++val);
2a63b5cb
CB
2462}
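/* Worked example: for key "devices.allow" and value "c 1:3 rwm" the parser
 * above fills in
 *
 *	device->allow       = 1
 *	device->type        = 'c'
 *	device->major       = 1
 *	device->minor       = 3
 *	device->access      = "rwm"
 *	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE
 *
 * while a bare value of "a" turns the entry into a global rule with
 * major = minor = -1 and no per-device matching.
 */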
2463
eec533e3
CB
2464/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2465 * don't have a cgroup_data set up, so we ask the running container through the
2466 * commands API for the cgroup path.
ccb4cabe 2467 */
b857f4be 2468__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2a63b5cb 2469 const char *key, const char *value,
fb55e009 2470 const char *name, const char *lxcpath)
ccb4cabe 2471{
d97919ab 2472 __do_free char *path = NULL;
88396101 2473 __do_free char *controller = NULL;
d97919ab 2474 char *p;
87777968 2475 struct hierarchy *h;
861cb8c2 2476 int ret = -1;
ccb4cabe 2477
a358028a
CB
2478 if (!ops)
2479 return ret_set_errno(-1, ENOENT);
2480
2a63b5cb 2481 controller = must_copy_string(key);
87777968
CB
2482 p = strchr(controller, '.');
2483 if (p)
ccb4cabe
SH
2484 *p = '\0';
2485
2a63b5cb
CB
2486 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2487 struct device_item device = {0};
2488
2489 ret = device_cgroup_rule_parse(&device, key, value);
2490 if (ret < 0)
d2203230 2491 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2a63b5cb
CB
2492 key, value);
2493
2494 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2495 if (ret < 0)
2496 return -1;
2497
2498 return 0;
2499 }
2500
87777968
CB
2501 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2502 /* not running */
2503 if (!path)
ccb4cabe
SH
2504 return -1;
2505
2202afc9 2506 h = get_hierarchy(ops, controller);
ccb4cabe 2507 if (h) {
88396101 2508 __do_free char *fullpath = NULL;
87777968 2509
2a63b5cb 2510 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
7cea5905 2511 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe 2512 }
ccb4cabe
SH
2513
2514 return ret;
2515}
2516
91d1a13a 2517/* take devices cgroup line
72add155
SH
2518 * /dev/foo rwx
2519 * and convert it to a valid
2520 * type major:minor mode
91d1a13a
CB
2521 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2522 * the output.
72add155 2523 */
cb3fc90c
CB
2524static int device_cgroup_rule_parse_devpath(struct device_item *device,
2525 const char *devpath)
72add155 2526{
88396101 2527 __do_free char *path = NULL;
2a06d041 2528 char *mode = NULL;
cb3fc90c
CB
2529 int n_parts, ret;
2530 char *p;
2531 struct stat sb;
72add155 2532
cb3fc90c 2533 path = must_copy_string(devpath);
72add155 2534
cb3fc90c
CB
2535 /*
2536 * Read path followed by mode. Ignore any trailing text.
91d1a13a
CB
2537 * A ' # comment' would be legal. Technically other text is not
2538 * legal, we could check for that if we cared to.
72add155 2539 */
0dbdb99e 2540 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2541 if (*p != ' ')
2542 continue;
2543 *p = '\0';
91d1a13a 2544
2c2d6c49
SH
2545 if (n_parts != 1)
2546 break;
2547 p++;
2548 n_parts++;
91d1a13a 2549
2c2d6c49
SH
2550 while (*p == ' ')
2551 p++;
91d1a13a 2552
2c2d6c49 2553 mode = p;
91d1a13a 2554
2c2d6c49 2555 if (*p == '\0')
cb3fc90c 2556 return ret_set_errno(-1, EINVAL);
72add155 2557 }
2c2d6c49 2558
cb3fc90c
CB
2559 if (device_cgroup_parse_access(device, mode) < 0)
2560 return -1;
2561
2c2d6c49 2562 if (n_parts == 1)
cb3fc90c 2563 return ret_set_errno(-1, EINVAL);
72add155
SH
2564
2565 ret = stat(path, &sb);
2566 if (ret < 0)
cb3fc90c 2567 return ret_set_errno(-1, errno);
72add155 2568
72add155
SH
2569 mode_t m = sb.st_mode & S_IFMT;
2570 switch (m) {
2571 case S_IFBLK:
cb3fc90c 2572 device->type = 'b';
72add155
SH
2573 break;
2574 case S_IFCHR:
cb3fc90c 2575 device->type = 'c';
72add155 2576 break;
2c2d6c49 2577 default:
cb3fc90c
CB
2578 return log_error_errno(-1, EINVAL,
2579 "Unsupported device type %i for \"%s\"",
2580 m, path);
72add155 2581 }
2c2d6c49 2582
cb3fc90c
CB
2583 device->major = MAJOR(sb.st_rdev);
2584 device->minor = MINOR(sb.st_rdev);
2585 device->allow = 1;
2586 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
72add155 2587
cb3fc90c
CB
2588 return 0;
2589}
2590
2591static int convert_devpath(const char *invalue, char *dest)
2592{
2593 struct device_item device = {0};
2594 int ret;
2595
2596 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2597 if (ret < 0)
2598 return -1;
2599
2600 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2601 device.minor, device.access);
2602 if (ret < 0 || ret >= 50)
2603 return log_error_errno(-1,
2604 ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2605 device.type, device.major, device.minor,
2606 device.access);
2607
2608 return 0;
72add155
SH
2609}
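/* Worked example: given the configuration value "/dev/null rwm",
 * device_cgroup_rule_parse_devpath() stats the node (char device 1:3 on a
 * typical system) and convert_devpath() emits the legacy cgroup line
 *
 *	c 1:3 rwm
 *
 * which is what devices.allow/devices.deny expect.
 */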
2610
90e97284
CB
2611/* Called from setup_limits - here we have the container's cgroup_data because
2612 * we created the cgroups.
ccb4cabe 2613 */
2202afc9
CB
2614static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2615 const char *value)
ccb4cabe 2616{
88396101 2617 __do_free char *controller = NULL;
d97919ab 2618 char *p;
1a0e70ac
CB
2619 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2620 char converted_value[50];
b3646d7e 2621 struct hierarchy *h;
64e82f8b 2622
861cb8c2 2623 controller = must_copy_string(filename);
ab1a6cac
CB
2624 p = strchr(controller, '.');
2625 if (p)
ccb4cabe
SH
2626 *p = '\0';
2627
c8bf519d 2628 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
c04a6d4e
CB
2629 int ret;
2630
72add155
SH
2631 ret = convert_devpath(value, converted_value);
2632 if (ret < 0)
c8bf519d 2633 return ret;
72add155 2634 value = converted_value;
c8bf519d 2635 }
2636
2202afc9 2637 h = get_hierarchy(ops, controller);
b3646d7e
CB
2638 if (!h) {
2639 ERROR("Failed to setup limits for the \"%s\" controller. "
2640 "The controller seems to be unused by \"cgfsng\" cgroup "
2641 "driver or not enabled on the cgroup hierarchy",
2642 controller);
d1953b26 2643 errno = ENOENT;
ab1a6cac 2644 return -ENOENT;
ccb4cabe 2645 }
b3646d7e 2646
c04a6d4e 2647 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
ccb4cabe
SH
2648}
2649
c581d2a6
CB
2650__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2651 struct lxc_conf *conf,
2652 bool do_devices)
ccb4cabe 2653{
d97919ab 2654 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
c581d2a6 2655 struct lxc_list *cgroup_settings;
d97919ab 2656 struct lxc_list *iterator, *next;
ccb4cabe 2657 struct lxc_cgroup *cg;
ccb4cabe
SH
2658 bool ret = false;
2659
92ca7eb5
CB
2660 if (!ops)
2661 return ret_set_errno(false, ENOENT);
2662
2663 if (!conf)
2664 return ret_set_errno(false, EINVAL);
2665
2666 cgroup_settings = &conf->cgroup;
ccb4cabe
SH
2667 if (lxc_list_empty(cgroup_settings))
2668 return true;
2669
69b4a4bb 2670 if (!ops->hierarchies)
92ca7eb5 2671 return ret_set_errno(false, EINVAL);
69b4a4bb 2672
ccb4cabe 2673 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2674 if (!sorted_cgroup_settings)
ccb4cabe 2675 return false;
ccb4cabe 2676
ccb4cabe
SH
2677 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2678 cg = iterator->elem;
2679
2680 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2202afc9 2681 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
92ca7eb5
CB
2682 if (do_devices && (errno == EACCES || errno == EPERM))
2683 log_warn_errno(continue,
2684 errno, "Failed to set \"%s\" to \"%s\"",
2685 cg->subsystem, cg->value);
2686 log_warn_errno(goto out, errno,
2687 "Failed to set \"%s\" to \"%s\"",
2688 cg->subsystem, cg->value);
ccb4cabe 2689 }
c347df58
CB
2690 DEBUG("Set controller \"%s\" set to \"%s\"",
2691 cg->subsystem, cg->value);
ccb4cabe 2692 }
ccb4cabe
SH
2693 }
2694
2695 ret = true;
6b38e644 2696 INFO("Limits for the legacy cgroup hierarchies have been set up");
ccb4cabe 2697out:
ccb4cabe
SH
2698 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2699 lxc_list_del(iterator);
2700 free(iterator);
2701 }
d97919ab 2702
ccb4cabe
SH
2703 return ret;
2704}
2705
bf651989
CB
2706/*
2707 * Some of the parsing logic comes from the original cgroup device v1
2708 * implementation in the kernel.
2709 */
4bfb655e
CB
2710static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2711 struct lxc_conf *conf, const char *key,
bf651989
CB
2712 const char *val)
2713{
2714#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
4bfb655e 2715 struct device_item device_item = {0};
2a63b5cb 2716 int ret;
bf651989 2717
cb3fc90c
CB
2718 if (strcmp("devices.allow", key) == 0 && *val == '/')
2719 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2720 else
2721 ret = device_cgroup_rule_parse(&device_item, key, val);
2a63b5cb 2722 if (ret < 0)
d2203230 2723 return log_error_errno(-1, EINVAL,
2a63b5cb
CB
2724 "Failed to parse device string %s=%s",
2725 key, val);
4bfb655e
CB
2726
2727 ret = bpf_list_add_device(conf, &device_item);
2a63b5cb 2728 if (ret < 0)
4bfb655e 2729 return -1;
bf651989
CB
2730#endif
2731 return 0;
2732}
2733
c581d2a6
CB
2734__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2735 struct lxc_handler *handler)
6b38e644 2736{
7e31931f
CB
2737 struct lxc_list *cgroup_settings, *iterator;
2738 struct hierarchy *h;
2739 struct lxc_conf *conf;
6b38e644 2740
7e31931f
CB
2741 if (!ops)
2742 return ret_set_errno(false, ENOENT);
2743
2744 if (!ops->hierarchies)
6b38e644
CB
2745 return true;
2746
7e31931f
CB
2747 if (!ops->container_cgroup)
2748 return ret_set_errno(false, EINVAL);
2749
2750 if (!handler || !handler->conf)
2751 return ret_set_errno(false, EINVAL);
2752 conf = handler->conf;
2753
2754 if (lxc_list_empty(&conf->cgroup2))
2755 return true;
2756 cgroup_settings = &conf->cgroup2;
2757
2758 if (!ops->unified)
6b38e644 2759 return false;
7e31931f 2760 h = ops->unified;
6b38e644 2761
bf651989 2762 lxc_list_for_each (iterator, cgroup_settings) {
6b38e644 2763 struct lxc_cgroup *cg = iterator->elem;
c04a6d4e 2764 int ret;
6b38e644 2765
bf651989 2766 if (strncmp("devices", cg->subsystem, 7) == 0) {
4bfb655e 2767 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
bf651989
CB
2768 cg->value);
2769 } else {
c04a6d4e
CB
2770 ret = lxc_write_openat(h->container_full_path,
2771 cg->subsystem, cg->value,
2772 strlen(cg->value));
7e31931f
CB
2773 if (ret < 0)
2774 return log_error_errno(false,
2775 errno, "Failed to set \"%s\" to \"%s\"",
2776 cg->subsystem, cg->value);
6b38e644
CB
2777 }
2778 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2779 }
2780
7e31931f 2781 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
6b38e644
CB
2782}
2783
bf651989
CB
2784__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
2785 struct lxc_handler *handler)
2786{
2787#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
2a63b5cb 2788 __do_bpf_program_free struct bpf_program *devices = NULL;
bf651989 2789 int ret;
e552bd1a
CB
2790 struct lxc_conf *conf;
2791 struct hierarchy *unified;
2a63b5cb
CB
2792 struct lxc_list *it;
2793 struct bpf_program *devices_old;
bf651989 2794
e552bd1a
CB
2795 if (!ops)
2796 return ret_set_errno(false, ENOENT);
2797
2798 if (!ops->hierarchies)
2799 return true;
2800
2801 if (!ops->container_cgroup)
2802 return ret_set_errno(false, EEXIST);
2803
2804 if (!handler || !handler->conf)
2805 return ret_set_errno(false, EINVAL);
2806 conf = handler->conf;
2807
2808 unified = ops->unified;
9994db51
CB
2809 if (!unified || !unified->bpf_device_controller ||
2810 !unified->container_full_path || lxc_list_empty(&conf->devices))
bf651989
CB
2811 return true;
2812
2a63b5cb
CB
2813 devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
2814 if (!devices)
d47ff01b
CB
2815 return log_error_errno(false, ENOMEM,
2816 "Failed to create new bpf program");
2a63b5cb
CB
2817
2818 ret = bpf_program_init(devices);
bf651989 2819 if (ret)
d47ff01b
CB
2820 return log_error_errno(false, ENOMEM,
2821 "Failed to initialize bpf program");
2a63b5cb
CB
2822
2823 lxc_list_for_each(it, &conf->devices) {
2824 struct device_item *cur = it->elem;
2825
2826 ret = bpf_program_append_device(devices, cur);
2827 if (ret)
d47ff01b
CB
2828 return log_error_errno(false,
2829 ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2830 cur->type, cur->major,
2831 cur->minor, cur->access,
2832 cur->allow, cur->global_rule);
2a63b5cb
CB
2833 TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2834 cur->type, cur->major, cur->minor, cur->access,
2835 cur->allow, cur->global_rule);
2836 }
2837
2838 ret = bpf_program_finalize(devices);
2839 if (ret)
d47ff01b
CB
2840 return log_error_errno(false, ENOMEM,
2841 "Failed to finalize bpf program");
bf651989 2842
2a63b5cb
CB
2843 ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
2844 unified->container_full_path,
cce5a3d7
CB
2845 BPF_F_ALLOW_MULTI);
2846 if (ret)
d47ff01b
CB
2847 return log_error_errno(false, ENOMEM,
2848 "Failed to attach bpf program");
cce5a3d7
CB
2849
2850 /* Replace old bpf program. */
2a63b5cb
CB
2851 devices_old = move_ptr(conf->cgroup2_devices);
2852 conf->cgroup2_devices = move_ptr(devices);
2853 devices = move_ptr(devices_old);
bf651989 2854#endif
cce5a3d7 2855 return true;
bf651989
CB
2856}
2857
c581d2a6 2858bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
6b38e644 2859{
c581d2a6
CB
2860 __do_free char *add_controllers = NULL, *base_path = NULL;
2861 struct hierarchy *unified = ops->unified;
2862 ssize_t parts_len;
2863 char **it;
2864 size_t full_len = 0;
2865 char **parts = NULL;
2866 bool bret = false;
6b38e644 2867
c581d2a6
CB
2868 if (!ops->hierarchies || !pure_unified_layout(ops) ||
2869 !unified->controllers[0])
bf651989
CB
2870 return true;
2871
c581d2a6
CB
2872 /* For now we simply enable all controllers that we have detected by
2873 * creating a string like "+memory +pids +cpu +io".
2874 * TODO: In the near future we might want to support "-<controller>"
2875 * etc. but whether supporting semantics like this makes sense will need
2876 * some thinking.
2877 */
2878 for (it = unified->controllers; it && *it; it++) {
2879 full_len += strlen(*it) + 2;
2880 add_controllers = must_realloc(add_controllers, full_len + 1);
2881
2882 if (unified->controllers[0] == *it)
2883 add_controllers[0] = '\0';
2884
2885 (void)strlcat(add_controllers, "+", full_len + 1);
2886 (void)strlcat(add_controllers, *it, full_len + 1);
2887
2888 if ((it + 1) && *(it + 1))
2889 (void)strlcat(add_controllers, " ", full_len + 1);
2890 }
2891
2892 parts = lxc_string_split(cgroup, '/');
2893 if (!parts)
2894 goto on_error;
2895
2896 parts_len = lxc_array_len((void **)parts);
2897 if (parts_len > 0)
2898 parts_len--;
2899
2900 base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
2901 for (ssize_t i = -1; i < parts_len; i++) {
2902 int ret;
2903 __do_free char *target = NULL;
2904
2905 if (i >= 0)
2906 base_path = must_append_path(base_path, parts[i], NULL);
2907 target = must_make_path(base_path, "cgroup.subtree_control", NULL);
2908 ret = lxc_writeat(-1, target, add_controllers, full_len);
61fbc369
CB
2909 if (ret < 0)
2910 log_error_errno(goto on_error,
2911 errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
2912 add_controllers, target);
c581d2a6
CB
2913 TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
2914 }
2915
2916 bret = true;
2917
2918on_error:
2919 lxc_free_array((void **)parts, free);
2920 return bret;
2921}
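/* Illustration: with detected controllers "memory pids cpu io" and a cgroup of
 * "lxc.payload.c1" (name made up), the loop above amounts to a single
 *
 *	lxc_writeat(-1, "<mountpoint>/<base>/cgroup.subtree_control",
 *		    "+memory +pids +cpu +io", strlen("+memory +pids +cpu +io"));
 *
 * and, for nested cgroups such as "a/b", additionally to the same write in
 * ".../a/cgroup.subtree_control". The final path component is deliberately
 * skipped so the controllers are enabled in every ancestor of the new cgroup
 * but not inside the cgroup itself. Path placeholders are illustrative.
 */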
2922
2923__cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2924{
61fbc369
CB
2925 if (!ops)
2926 return ret_set_errno(false, ENOENT);
2927
c581d2a6
CB
2928 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2929}
2930
2931__cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2932{
61fbc369
CB
2933 if (!ops)
2934 return ret_set_errno(false, ENOENT);
2935
c581d2a6 2936 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
2937}
2938
b7b18fc5
CB
2939static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2940 char **controllers)
2941{
b7b18fc5
CB
2942 if (!ops->cgroup_use)
2943 return true;
2944
431e2c54 2945 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
2946 bool found = false;
2947
431e2c54 2948 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
b7b18fc5
CB
2949 if (strcmp(*cur_use, *cur_ctrl) != 0)
2950 continue;
2951
2952 found = true;
2953 break;
2954 }
2955
2956 if (found)
2957 continue;
2958
2959 return false;
2960 }
2961
2962 return true;
2963}
2964
a6ca2ed8
CB
2965static void cg_unified_delegate(char ***delegate)
2966{
d606c4e9 2967 __do_free char *buf = NULL;
a6ca2ed8 2968 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
d606c4e9
CB
2969 char *token;
2970 int idx;
a6ca2ed8 2971
d606c4e9
CB
2972 buf = read_file("/sys/kernel/cgroup/delegate");
2973 if (!buf) {
a6ca2ed8
CB
2974 for (char **p = standard; p && *p; p++) {
2975 idx = append_null_to_list((void ***)delegate);
2976 (*delegate)[idx] = must_copy_string(*p);
2977 }
d606c4e9
CB
2978 log_warn_errno(return, errno, "Failed to read /sys/kernel/cgroup/delegate");
2979 }
a6ca2ed8 2980
d606c4e9
CB
2981 lxc_iterate_parts (token, buf, " \t\n") {
2982 /*
2983 * We always need to chown this for both cgroup and
2984 * cgroup2.
2985 */
2986 if (strcmp(token, "cgroup.procs") == 0)
2987 continue;
2988
2989 idx = append_null_to_list((void ***)delegate);
2990 (*delegate)[idx] = must_copy_string(token);
a6ca2ed8
CB
2991 }
2992}
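/* For reference, /sys/kernel/cgroup/delegate on a recent kernel typically
 * lists something like
 *
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *
 * (possibly more, depending on the kernel). Everything except cgroup.procs,
 * which is always chowned anyway, ends up in the cgroup2_chown list so that an
 * unprivileged container can write these files.
 */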
2993
2202afc9
CB
2994/* At startup, parse_hierarchies finds all the info we need about cgroup
2995 * mountpoints and current cgroups, and stores it in @d.
2996 */
341e6516 2997static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
2202afc9 2998{
88396101 2999 __do_free char *basecginfo = NULL;
d97919ab
CB
3000 __do_free char *line = NULL;
3001 __do_fclose FILE *f = NULL;
2202afc9 3002 int ret;
2202afc9 3003 size_t len = 0;
2202afc9
CB
3004 char **klist = NULL, **nlist = NULL;
3005
3006 /* Root spawned containers escape the current cgroup, so use init's
3007 * cgroups as our base in that case.
3008 */
9caee129 3009 if (!relative && (geteuid() == 0))
2202afc9
CB
3010 basecginfo = read_file("/proc/1/cgroup");
3011 else
3012 basecginfo = read_file("/proc/self/cgroup");
3013 if (!basecginfo)
341e6516 3014 return ret_set_errno(-1, ENOMEM);
2202afc9
CB
3015
3016 ret = get_existing_subsystems(&klist, &nlist);
341e6516
CB
3017 if (ret < 0)
3018 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
2202afc9
CB
3019
3020 f = fopen("/proc/self/mountinfo", "r");
341e6516
CB
3021 if (!f)
3022 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
2202afc9
CB
3023
3024 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
3025
3026 while (getline(&line, &len, f) != -1) {
3027 int type;
3028 bool writeable;
3029 struct hierarchy *new;
3030 char *base_cgroup = NULL, *mountpoint = NULL;
3031 char **controller_list = NULL;
3032
3033 type = get_cgroup_version(line);
3034 if (type == 0)
3035 continue;
3036
3037 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
3038 continue;
3039
3040 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
3041 if (type == CGROUP2_SUPER_MAGIC)
3042 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3043 else if (type == CGROUP_SUPER_MAGIC)
3044 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3045 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
3046 if (type == CGROUP_SUPER_MAGIC)
3047 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3048 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3049 if (type == CGROUP2_SUPER_MAGIC)
3050 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3051 }
3052
3053 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
3054 if (!controller_list && type == CGROUP_SUPER_MAGIC)
3055 continue;
3056
3057 if (type == CGROUP_SUPER_MAGIC)
3058 if (controller_list_is_dup(ops->hierarchies, controller_list))
341e6516 3059 log_trace_errno(goto next, EEXIST, "Skipping duplicate controller");
2202afc9
CB
3060
3061 mountpoint = cg_hybrid_get_mountpoint(line);
341e6516
CB
3062 if (!mountpoint)
3063 log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
2202afc9
CB
3064
3065 if (type == CGROUP_SUPER_MAGIC)
3066 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3067 else
3068 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
341e6516
CB
3069 if (!base_cgroup)
3070 log_error_errno(goto next, EINVAL, "Failed to find current cgroup");
2202afc9
CB
3071
3072 trim(base_cgroup);
3073 prune_init_scope(base_cgroup);
3074 if (type == CGROUP2_SUPER_MAGIC)
3075 writeable = test_writeable_v2(mountpoint, base_cgroup);
3076 else
3077 writeable = test_writeable_v1(mountpoint, base_cgroup);
3078 if (!writeable)
341e6516 3079 log_trace_errno(goto next, EROFS, "The %s group is not writeable", base_cgroup);
2202afc9
CB
3080
3081 if (type == CGROUP2_SUPER_MAGIC) {
3082 char *cgv2_ctrl_path;
3083
3084 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
3085 "cgroup.controllers",
3086 NULL);
3087
3088 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
3089 free(cgv2_ctrl_path);
3090 if (!controller_list) {
3091 controller_list = cg_unified_make_empty_controller();
3092 TRACE("No controllers are enabled for "
3093 "delegation in the unified hierarchy");
3094 }
3095 }
3096
b7b18fc5
CB
3097 /* Exclude all controllers that lxc.cgroup.use does not want. */
3098 if (!cgroup_use_wants_controllers(ops, controller_list))
341e6516 3099 log_trace_errno(goto next, EINVAL, "Skipping controller");
b7b18fc5 3100
2202afc9 3101 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
a6ca2ed8
CB
3102 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
3103 if (unprivileged)
3104 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3105 ops->unified = new;
a6ca2ed8 3106 }
2202afc9
CB
3107
3108 continue;
3109
3110 next:
3111 free_string_list(controller_list);
3112 free(mountpoint);
3113 free(base_cgroup);
3114 }
3115
3116 free_string_list(klist);
3117 free_string_list(nlist);
3118
2202afc9
CB
3119 TRACE("Writable cgroup hierarchies:");
3120 lxc_cgfsng_print_hierarchies(ops);
3121
3122 /* verify that all controllers in cgroup.use and all crucial
3123 * controllers are accounted for
3124 */
3125 if (!all_controllers_found(ops))
341e6516 3126 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
2202afc9 3127
341e6516 3128 return 0;
2202afc9
CB
3129}
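/* Example: a legacy controller appears in /proc/self/mountinfo as a line
 * roughly like (field values are illustrative)
 *
 *	34 25 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,memory
 *
 * from which the helpers above derive the mountpoint "/sys/fs/cgroup/memory",
 * the controller list { "memory" }, and the calling process's current cgroup
 * in that hierarchy via /proc/self/cgroup (or /proc/1/cgroup for root).
 */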
3130
2202afc9 3131/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 3132static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 3133{
88396101 3134 __do_free char *basecginfo = NULL;
d97919ab 3135 char *base_cgroup;
2202afc9
CB
3136 char *copy = NULL;
3137
9caee129 3138 if (!relative && (geteuid() == 0))
2202afc9
CB
3139 basecginfo = read_file("/proc/1/cgroup");
3140 else
3141 basecginfo = read_file("/proc/self/cgroup");
3142 if (!basecginfo)
3143 return NULL;
3144
3145 base_cgroup = strstr(basecginfo, "0::/");
3146 if (!base_cgroup)
3147 goto cleanup_on_err;
3148
3149 base_cgroup = base_cgroup + 3;
3150 copy = copy_to_eol(base_cgroup);
3151 if (!copy)
3152 goto cleanup_on_err;
3153
3154cleanup_on_err:
2202afc9
CB
3155 if (copy)
3156 trim(copy);
3157
3158 return copy;
3159}
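/* Example: on a pure cgroup2 host /proc/self/cgroup contains a single line
 * such as
 *
 *	0::/user.slice/user-1000.slice/session-2.scope
 *
 * from which the function above returns the trimmed cgroup
 * "/user.slice/user-1000.slice/session-2.scope".
 */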
3160
a6ca2ed8
CB
3161static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3162 bool unprivileged)
2202afc9 3163{
d97919ab 3164 __do_free char *subtree_path = NULL;
2202afc9 3165 int ret;
7717e175 3166 char *mountpoint;
2202afc9 3167 char **delegatable;
a6ca2ed8 3168 struct hierarchy *new;
2202afc9
CB
3169 char *base_cgroup = NULL;
3170
d47ff01b 3171 ret = unified_cgroup_hierarchy();
2202afc9 3172 if (ret == -ENOMEDIUM)
d2203230 3173 return ret_errno(ENOMEDIUM);
2202afc9
CB
3174
3175 if (ret != CGROUP2_SUPER_MAGIC)
3176 return 0;
3177
9caee129 3178 base_cgroup = cg_unified_get_current_cgroup(relative);
2202afc9 3179 if (!base_cgroup)
d2203230 3180 return ret_errno(EINVAL);
c581d2a6
CB
3181 if (!relative)
3182 prune_init_scope(base_cgroup);
2202afc9 3183
d606c4e9
CB
3184 /*
3185 * We assume that the cgroup we're currently in has been delegated to
3186 * us and we are free to further delegate all of the controllers listed
3187 * in cgroup.controllers further down the hierarchy.
2202afc9 3188 */
dca9587a 3189 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
c581d2a6 3190 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
2202afc9 3191 delegatable = cg_unified_get_controllers(subtree_path);
2202afc9
CB
3192 if (!delegatable)
3193 delegatable = cg_unified_make_empty_controller();
3194 if (!delegatable[0])
3195 TRACE("No controllers are enabled for delegation");
3196
3197 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3198 * we should verify here. The reason I'm not doing it right now is that I'm
3199 * not convinced that lxc.cgroup.use will be the future since it is a
3200 * global property. I'd much rather have an option that lets you request
3201 * controllers per container.
3202 */
3203
a6ca2ed8 3204 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
d606c4e9 3205 if (unprivileged)
a6ca2ed8 3206 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3207
2a63b5cb
CB
3208 if (bpf_devices_cgroup_supported())
3209 new->bpf_device_controller = 1;
3210
2202afc9 3211 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
908e0ee5 3212 ops->unified = new;
2202afc9
CB
3213 return CGROUP2_SUPER_MAGIC;
3214}
3215
341e6516 3216static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
3217{
3218 int ret;
3219 const char *tmp;
9caee129 3220 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
3221
3222 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 3223 if (tmp) {
88396101 3224 __do_free char *pin = NULL;
d97919ab 3225 char *chop, *cur;
b7b18fc5
CB
3226
3227 pin = must_copy_string(tmp);
3228 chop = pin;
3229
d97919ab 3230 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 3231 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 3232 }
2202afc9 3233
a6ca2ed8 3234 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9 3235 if (ret < 0)
341e6516 3236 return -1;
2202afc9
CB
3237
3238 if (ret == CGROUP2_SUPER_MAGIC)
341e6516 3239 return 0;
2202afc9 3240
a6ca2ed8 3241 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
3242}
3243
341e6516 3244__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3245{
3246 const char *cgroup_pattern;
3247
341e6516
CB
3248 if (!ops)
3249 return ret_set_errno(-1, ENOENT);
3250
2202afc9
CB
3251 /* copy system-wide cgroup information */
3252 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3253 if (!cgroup_pattern) {
3254 /* lxc.cgroup.pattern is only NULL on error. */
3255 ERROR("Failed to retrieve cgroup pattern");
341e6516 3256 return ret_set_errno(-1, ENOMEM);
2202afc9
CB
3257 }
3258 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3259
341e6516 3260 return 0;
2202afc9
CB
3261}
3262
5a087e05 3263struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 3264{
a64edc1c 3265 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9
CB
3266
3267 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3268 if (!cgfsng_ops)
341e6516 3269 return ret_set_errno(NULL, ENOMEM);
2202afc9
CB
3270
3271 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3272 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3273
341e6516 3274 if (cg_init(cgfsng_ops, conf))
2202afc9 3275 return NULL;
2202afc9 3276
bad788b0
CB
3277 cgfsng_ops->unified_fd = -EBADF;
3278
2202afc9 3279 cgfsng_ops->data_init = cgfsng_data_init;
434c8e15
CB
3280 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3281 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
72068e74 3282 cgfsng_ops->monitor_create = cgfsng_monitor_create;
eeef32bb 3283 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
c581d2a6
CB
3284 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3285 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
e8b181f5
CB
3286 cgfsng_ops->payload_create = cgfsng_payload_create;
3287 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2202afc9
CB
3288 cgfsng_ops->escape = cgfsng_escape;
3289 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3290 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3291 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3292 cgfsng_ops->get = cgfsng_get;
3293 cgfsng_ops->set = cgfsng_set;
942e193e 3294 cgfsng_ops->freeze = cgfsng_freeze;
2202afc9 3295 cgfsng_ops->unfreeze = cgfsng_unfreeze;
c581d2a6 3296 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
2202afc9
CB
3297 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3298 cgfsng_ops->driver = "cgfsng";
3299 cgfsng_ops->version = "1.0.0";
3300 cgfsng_ops->attach = cgfsng_attach;
3301 cgfsng_ops->chown = cgfsng_chown;
3302 cgfsng_ops->mount = cgfsng_mount;
3303 cgfsng_ops->nrtasks = cgfsng_nrtasks;
bf651989 3304 cgfsng_ops->devices_activate = cgfsng_devices_activate;
2202afc9 3305
a64edc1c 3306 return move_ptr(cgfsng_ops);
2202afc9 3307}