]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cgroups: move check for valid monitor process up
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
d38dd64a
CB
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
a54694f8
CB
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
d38dd64a
CB
22#include <linux/kdev_t.h>
23#include <linux/types.h>
942e193e
CB
24#include <poll.h>
25#include <signal.h>
a54694f8 26#include <stdint.h>
ccb4cabe
SH
27#include <stdio.h>
28#include <stdlib.h>
a54694f8 29#include <string.h>
438c4581 30#include <sys/types.h>
d38dd64a 31#include <unistd.h>
c8bf519d 32
b635e92d 33#include "caps.h"
ccb4cabe 34#include "cgroup.h"
bf651989 35#include "cgroup2_devices.h"
6328fd9c 36#include "cgroup_utils.h"
ccb4cabe 37#include "commands.h"
43654d34 38#include "conf.h"
d38dd64a 39#include "config.h"
a54694f8 40#include "log.h"
c19ad94b 41#include "macro.h"
018051e3 42#include "mainloop.h"
861cb8c2 43#include "memory_utils.h"
43654d34 44#include "storage/storage.h"
a54694f8 45#include "utils.h"
ccb4cabe 46
64e82f8b
DJ
47#ifndef HAVE_STRLCPY
48#include "include/strlcpy.h"
49#endif
50
3ebe2fbd
DJ
51#ifndef HAVE_STRLCAT
52#include "include/strlcat.h"
53#endif
54
ac2cecc4 55lxc_log_define(cgfsng, cgroup);
ccb4cabe 56
8b8db2f6
CB
57/* Given a pointer to a null-terminated array of pointers, realloc to add one
58 * entry, and point the new entry to NULL. Do not fail. Return the index to the
59 * second-to-last entry - that is, the one which is now available for use
60 * (keeping the list null-terminated).
ccb4cabe
SH
61 */
62static int append_null_to_list(void ***list)
63{
64 int newentry = 0;
65
66 if (*list)
8b8db2f6
CB
67 for (; (*list)[newentry]; newentry++)
68 ;
ccb4cabe
SH
69
70 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
71 (*list)[newentry + 1] = NULL;
72 return newentry;
73}
74
8073018d
CB
75/* Given a null-terminated array of strings, check whether @entry is one of the
76 * strings.
ccb4cabe
SH
77 */
/* Return true if @entry occurs in the NULL-terminated string array @list. */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **it = list; *it; it++)
		if (strcmp(*it, entry) == 0)
			return true;

	return false;
}
89
ac010944
CB
90/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
91 * "name=systemd". Do not fail.
92 */
/* Return a newly-allocated copy of @entry with "name=" prepended, i.e. turn
 * "systemd" into "name=systemd". Never fails (must_realloc() aborts on OOM).
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t entry_len = strlen(entry);
	char *result;

	result = must_realloc(NULL, entry_len + STRLITERALLEN("name=") + 1);
	memcpy(result, "name=", STRLITERALLEN("name="));
	memcpy(result + STRLITERALLEN("name="), entry, entry_len);
	result[STRLITERALLEN("name=") + entry_len] = '\0';

	return result;
}
107
42a993b4
CB
108/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
109 * we are called.
ccb4cabe 110 *
42a993b4
CB
111 * We also handle named subsystems here. Any controller which is not a kernel
112 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
113 * we refuse to use because we're not sure which we have here.
114 * (TODO: We could work around this in some cases by just remounting to be
115 * unambiguous, or by comparing mountpoint contents with current cgroup.)
ccb4cabe
SH
116 *
117 * The last entry will always be NULL.
118 */
42a993b4
CB
/* Append controller @entry to @clist (@clist must be NULL on the first call;
 * the list stays NULL-terminated). Named subsystems that are not kernel
 * subsystems get a "name=" prefix. An entry that is both a kernel and a named
 * subsystem is ambiguous and refused. Does not fail.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	char *dup;
	int slot;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	/* Entries already prefixed or known to the kernel are copied as-is;
	 * everything else is a named subsystem and gets the "name=" prefix.
	 */
	if (strncmp(entry, "name=", 5) == 0 || string_in_list(klist, entry))
		dup = must_copy_string(entry);
	else
		dup = cg_legacy_must_prefix_named(entry);

	(*clist)[slot] = dup;
}
142
5ae0207c
CB
143/* Given a handler's cgroup data, return the struct hierarchy for the controller
144 * @c, or NULL if there is none.
ccb4cabe 145 */
27a5132c 146struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
ccb4cabe 147{
77c3e9a2
CB
148 if (!ops->hierarchies)
149 return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");
d6337a5f 150
77c3e9a2 151 for (int i = 0; ops->hierarchies[i]; i++) {
27a5132c 152 if (!controller) {
d6337a5f 153 /* This is the empty unified hierarchy. */
2202afc9
CB
154 if (ops->hierarchies[i]->controllers &&
155 !ops->hierarchies[i]->controllers[0])
156 return ops->hierarchies[i];
106f1f38 157 continue;
2a63b5cb
CB
158 } else if (pure_unified_layout(ops) &&
159 strcmp(controller, "devices") == 0) {
160 if (ops->unified->bpf_device_controller)
161 return ops->unified;
162 break;
d6337a5f
CB
163 }
164
27a5132c 165 if (string_in_list(ops->hierarchies[i]->controllers, controller))
2202afc9 166 return ops->hierarchies[i];
ccb4cabe 167 }
d6337a5f 168
27a5132c
CB
169 if (controller)
170 WARN("There is no useable %s controller", controller);
171 else
172 WARN("There is no empty unified cgroup hierarchy");
173
77c3e9a2 174 return ret_set_errno(NULL, ENOENT);
ccb4cabe
SH
175}
176
a54694f8
CB
177#define BATCH_SIZE 50
178static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
179{
180 int newbatches = (newlen / BATCH_SIZE) + 1;
181 int oldbatches = (oldlen / BATCH_SIZE) + 1;
182
77c3e9a2 183 if (!*mem || newbatches > oldbatches)
a54694f8 184 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
a54694f8
CB
185}
186
/* Append the @newlen bytes at @new (plus the trailing NUL) to @dest, which
 * currently holds @oldlen bytes.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t total = oldlen + newlen;

	batch_realloc(dest, oldlen, total + 1);
	memcpy(*dest + oldlen, new, newlen + 1);
}
195
196/* Slurp in a whole file */
d6337a5f 197static char *read_file(const char *fnam)
a54694f8 198{
77c3e9a2 199 __do_free char *buf = NULL, *line = NULL;
d97919ab 200 __do_fclose FILE *f = NULL;
d97919ab 201 size_t len = 0, fulllen = 0;
77c3e9a2 202 int linelen;
a54694f8 203
4110345b 204 f = fopen(fnam, "re");
a54694f8
CB
205 if (!f)
206 return NULL;
77c3e9a2 207
a54694f8
CB
208 while ((linelen = getline(&line, &len, f)) != -1) {
209 append_line(&buf, fulllen, line, linelen);
210 fulllen += linelen;
211 }
77c3e9a2
CB
212
213 return move_ptr(buf);
a54694f8
CB
214}
215
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in the uint32_t bit array @bitarr.
 * Note: the shift operand must be unsigned - left-shifting (signed) 1 into
 * the sign bit (bit % NBITS == 31) is undefined behavior.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1U << (bit % NBITS));
}

/* Clear bit @bit in the uint32_t bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1U << (bit % NBITS));
}

/* Return whether bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1U << (bit % NBITS))) != 0;
}
235
236/* Create cpumask from cpulist aka turn:
237 *
238 * 0,2-3
239 *
d5d468f6 240 * into bit array
a54694f8
CB
241 *
242 * 1 0 1 1
243 */
244static uint32_t *lxc_cpumask(char *buf, size_t nbits)
245{
77c3e9a2 246 __do_free uint32_t *bitarr = NULL;
a54694f8 247 char *token;
d5d468f6 248 size_t arrlen;
d5d468f6
CB
249
250 arrlen = BITS_TO_LONGS(nbits);
251 bitarr = calloc(arrlen, sizeof(uint32_t));
a54694f8 252 if (!bitarr)
c5b8049e 253 return ret_set_errno(NULL, ENOMEM);
a54694f8 254
0be0d78f 255 lxc_iterate_parts(token, buf, ",") {
a54694f8 256 errno = 0;
d5d468f6
CB
257 unsigned end, start;
258 char *range;
a54694f8 259
d5d468f6
CB
260 start = strtoul(token, NULL, 0);
261 end = start;
262 range = strchr(token, '-');
a54694f8
CB
263 if (range)
264 end = strtoul(range + 1, NULL, 0);
d5d468f6 265
c5b8049e
CB
266 if (!(start <= end))
267 return ret_set_errno(NULL, EINVAL);
a54694f8 268
c5b8049e
CB
269 if (end >= nbits)
270 return ret_set_errno(NULL, EINVAL);
a54694f8
CB
271
272 while (start <= end)
273 set_bit(start++, bitarr);
274 }
275
c5b8049e 276 return move_ptr(bitarr);
a54694f8
CB
277}
278
a54694f8
CB
279/* Turn cpumask into simple, comma-separated cpulist. */
280static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
281{
f761d24d 282 __do_free_string_list char **cpulist = NULL;
c19ad94b 283 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
77c3e9a2 284 int ret;
a54694f8 285
77c3e9a2 286 for (size_t i = 0; i <= nbits; i++) {
414c6719
CB
287 if (!is_set(i, bitarr))
288 continue;
289
979a0d93 290 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
f761d24d 291 if (ret < 0 || (size_t)ret >= sizeof(numstr))
414c6719 292 return NULL;
414c6719
CB
293
294 ret = lxc_append_string(&cpulist, numstr);
f761d24d 295 if (ret < 0)
c5b8049e 296 return ret_set_errno(NULL, ENOMEM);
a54694f8 297 }
414c6719
CB
298
299 if (!cpulist)
c5b8049e 300 return ret_set_errno(NULL, ENOMEM);
414c6719 301
f761d24d 302 return lxc_string_join(",", (const char **)cpulist, false);
a54694f8
CB
303}
304
/* Parse the highest cpu number mentioned in @cpulist (e.g. "0,2-3" -> 3).
 * The last cpu number begins after the final ',' or '-', whichever occurs
 * later in the string. Returns -1 on parse failure.
 *
 * The previous implementation relationally compared a possibly-NULL pointer
 * (undefined behavior) and contained an unreachable branch; the candidate
 * selection below spells out the NULL cases explicitly instead.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	if (after_comma)
		after_comma++;

	after_dash = strrchr(cpulist, '-');
	if (after_dash)
		after_dash++;

	if (!after_comma && !after_dash)
		last = cpulist;
	else if (!after_comma)
		last = after_dash;
	else if (!after_dash)
		last = after_comma;
	else
		last = (after_comma > after_dash) ? after_comma : after_dash;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
335
6f9584d8 336#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
36f70181 337#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
c5b8049e
CB
338static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
339 char *child_cgroup, bool am_initialized)
a54694f8 340{
d97919ab 341 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
36f70181
CB
342 *offlinecpus = NULL, *posscpus = NULL;
343 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
344 *possmask = NULL;
a54694f8
CB
345 int ret;
346 ssize_t i;
36f70181 347 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
c5b8049e 348 bool flipped_bit = false;
a54694f8 349
c5b8049e 350 fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
a54694f8 351 posscpus = read_file(fpath);
c5b8049e
CB
352 if (!posscpus)
353 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
a54694f8
CB
354
355 /* Get maximum number of cpus found in possible cpuset. */
356 maxposs = get_max_cpus(posscpus);
92d5ea57 357 if (maxposs < 0 || maxposs >= INT_MAX - 1)
d97919ab 358 return false;
a54694f8 359
36f70181
CB
360 if (file_exists(__ISOL_CPUS)) {
361 isolcpus = read_file(__ISOL_CPUS);
c5b8049e
CB
362 if (!isolcpus)
363 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
6f9584d8 364
36f70181
CB
365 if (isdigit(isolcpus[0])) {
366 /* Get maximum number of cpus found in isolated cpuset. */
367 maxisol = get_max_cpus(isolcpus);
368 if (maxisol < 0 || maxisol >= INT_MAX - 1)
369 return false;
6f9584d8 370 }
36f70181
CB
371
372 if (maxposs < maxisol)
373 maxposs = maxisol;
374 maxposs++;
375 } else {
376 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
a54694f8
CB
377 }
378
36f70181
CB
379 if (file_exists(__OFFLINE_CPUS)) {
380 offlinecpus = read_file(__OFFLINE_CPUS);
c5b8049e
CB
381 if (!offlinecpus)
382 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
36f70181
CB
383
384 if (isdigit(offlinecpus[0])) {
385 /* Get maximum number of cpus found in offline cpuset. */
386 maxoffline = get_max_cpus(offlinecpus);
387 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
388 return false;
389 }
390
391 if (maxposs < maxoffline)
392 maxposs = maxoffline;
393 maxposs++;
394 } else {
395 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
396 }
a54694f8 397
dcd14a3d
CB
398 if ((maxisol == 0) && (maxoffline == 0)) {
399 cpulist = move_ptr(posscpus);
36f70181 400 goto copy_parent;
dcd14a3d 401 }
a54694f8
CB
402
403 possmask = lxc_cpumask(posscpus, maxposs);
c5b8049e
CB
404 if (!possmask)
405 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
a54694f8 406
36f70181
CB
407 if (maxisol > 0) {
408 isolmask = lxc_cpumask(isolcpus, maxposs);
c5b8049e
CB
409 if (!isolmask)
410 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
36f70181
CB
411 }
412
413 if (maxoffline > 0) {
414 offlinemask = lxc_cpumask(offlinecpus, maxposs);
c5b8049e
CB
415 if (!offlinemask)
416 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
6f9584d8 417 }
a54694f8
CB
418
419 for (i = 0; i <= maxposs; i++) {
36f70181
CB
420 if ((isolmask && !is_set(i, isolmask)) ||
421 (offlinemask && !is_set(i, offlinemask)) ||
422 !is_set(i, possmask))
59ac3b88
CB
423 continue;
424
425 flipped_bit = true;
426 clear_bit(i, possmask);
a54694f8
CB
427 }
428
6f9584d8 429 if (!flipped_bit) {
b31d62b8
CB
430 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
431 TRACE("No isolated or offline cpus present in cpuset");
432 } else {
433 cpulist = move_ptr(posscpus);
434 TRACE("Removed isolated or offline cpus from cpuset");
6f9584d8 435 }
c5b8049e
CB
436 if (!cpulist)
437 return log_error_errno(false, errno, "Failed to create cpu list");
a54694f8
CB
438
439copy_parent:
36f70181 440 if (!am_initialized) {
c5b8049e 441 ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
c04a6d4e
CB
442 if (ret < 0)
443 return log_error_errno(false,
444 errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
c5b8049e 445 child_cgroup);
36f70181
CB
446
447 TRACE("Copied cpu settings of parent cgroup");
6f9584d8
CB
448 }
449
d97919ab 450 return true;
a54694f8
CB
451}
452
e3a3fecf 453/* Copy contents of parent(@path)/@file to @path/@file */
c5b8049e
CB
454static bool copy_parent_file(const char *parent_cgroup,
455 const char *child_cgroup, const char *file)
e3a3fecf 456{
c5b8049e 457 __do_free char *parent_file = NULL, *value = NULL;
b095a8eb 458 int len = 0;
fe70edee 459 int ret;
e3a3fecf 460
c5b8049e
CB
461 parent_file = must_make_path(parent_cgroup, file, NULL);
462 len = lxc_read_from_file(parent_file, NULL, 0);
fe70edee 463 if (len <= 0)
77c3e9a2 464 return log_error_errno(false, errno, "Failed to determine buffer size");
b095a8eb 465
f25a2044 466 value = must_realloc(NULL, len + 1);
fe70edee 467 value[len] = '\0';
c5b8049e 468 ret = lxc_read_from_file(parent_file, value, len);
fe70edee 469 if (ret != len)
77c3e9a2 470 return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);
b095a8eb 471
c5b8049e 472 ret = lxc_write_openat(child_cgroup, file, value, len);
fe70edee 473 if (ret < 0 && errno != EACCES)
77c3e9a2 474 return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
c5b8049e 475 value, child_cgroup, file);
fe70edee 476 return true;
e3a3fecf
SH
477}
478
77c3e9a2 479static inline bool is_unified_hierarchy(const struct hierarchy *h)
c04a6d4e
CB
480{
481 return h->version == CGROUP2_SUPER_MAGIC;
482}
483
f990d3bf
CB
484/*
485 * Initialize the cpuset hierarchy in first directory of @cgroup_leaf and set
7793add3
CB
486 * cgroup.clone_children so that children inherit settings. Since the
487 * h->base_path is populated by init or ourselves, we know it is already
488 * initialized.
fe70edee
CB
489 *
490 * returns -1 on error, 0 when we didn't created a cgroup, 1 if we created a
491 * cgroup.
e3a3fecf 492 */
f990d3bf
CB
493static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
494 const char *cgroup_leaf)
e3a3fecf 495{
c5b8049e 496 __do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
f62cf1d4 497 __do_close int cgroup_fd = -EBADF;
c5b8049e 498 int fret = -1;
7793add3
CB
499 int ret;
500 char v;
f990d3bf 501 char *leaf, *slash;
e3a3fecf 502
c04a6d4e 503 if (is_unified_hierarchy(h))
fe70edee 504 return 0;
c04a6d4e 505
e3a3fecf 506 if (!string_in_list(h->controllers, "cpuset"))
fe70edee 507 return 0;
e3a3fecf 508
f990d3bf
CB
509 if (!cgroup_leaf)
510 return ret_set_errno(-1, EINVAL);
511
512 dup = strdup(cgroup_leaf);
513 if (!dup)
514 return ret_set_errno(-1, ENOMEM);
515
c5b8049e
CB
516 parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
517
518 leaf = dup;
f990d3bf
CB
519 leaf += strspn(leaf, "/");
520 slash = strchr(leaf, '/');
e3a3fecf
SH
521 if (slash)
522 *slash = '\0';
c5b8049e 523 child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
e3a3fecf
SH
524 if (slash)
525 *slash = '/';
7793add3 526
fe70edee 527 fret = 1;
c5b8049e 528 ret = mkdir(child_cgroup, 0755);
7793add3 529 if (ret < 0) {
fe70edee 530 if (errno != EEXIST)
c5b8049e 531 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);
fe70edee
CB
532
533 fret = 0;
e3a3fecf 534 }
6f9584d8 535
c5b8049e 536 cgroup_fd = lxc_open_dirfd(child_cgroup);
c04a6d4e 537 if (cgroup_fd < 0)
fe70edee 538 return -1;
7793add3 539
c04a6d4e 540 ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
fe70edee 541 if (ret < 0)
c5b8049e 542 return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);
e3a3fecf 543
a54694f8 544 /* Make sure any isolated cpus are removed from cpuset.cpus. */
c5b8049e 545 if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
fe70edee 546 return log_error_errno(-1, errno, "Failed to remove isolated cpus");
a54694f8 547
7793add3 548 /* Already set for us by someone else. */
b28c2810
CB
549 if (v == '1')
550 TRACE("\"cgroup.clone_children\" was already set to \"1\"");
e3a3fecf
SH
551
552 /* copy parent's settings */
c5b8049e 553 if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
fe70edee 554 return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");
e3a3fecf 555
fe70edee 556 /* Set clone_children so children inherit our settings */
c04a6d4e 557 ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
fe70edee 558 if (ret < 0)
c5b8049e 559 return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);
d97919ab 560
fe70edee 561 return fret;
e3a3fecf
SH
562}
563
5c0089ae
CB
/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (char **it = l1; *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
578
258449e5
CB
579/* For a null-terminated list of controllers @clist, return true if any of those
580 * controllers is already listed the null-terminated list of hierarchies @hlist.
581 * Realistically, if one is present, all must be present.
ccb4cabe
SH
582 */
583static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
584{
ccb4cabe
SH
585 if (!hlist)
586 return false;
258449e5 587
77c3e9a2 588 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
589 if (controller_lists_intersect(hlist[i]->controllers, clist))
590 return true;
ccb4cabe 591
258449e5 592 return false;
ccb4cabe
SH
593}
594
f57ac67f
CB
595/* Return true if the controller @entry is found in the null-terminated list of
596 * hierarchies @hlist.
ccb4cabe
SH
597 */
598static bool controller_found(struct hierarchy **hlist, char *entry)
599{
ccb4cabe
SH
600 if (!hlist)
601 return false;
602
77c3e9a2 603 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
604 if (string_in_list(hlist[i]->controllers, entry))
605 return true;
d6337a5f 606
ccb4cabe
SH
607 return false;
608}
609
e1c27ab0
CB
610/* Return true if all of the controllers which we require have been found. The
611 * required list is freezer and anything in lxc.cgroup.use.
ccb4cabe 612 */
2202afc9 613static bool all_controllers_found(struct cgroup_ops *ops)
ccb4cabe 614{
77c3e9a2 615 struct hierarchy **hlist;
ccb4cabe 616
2202afc9 617 if (!ops->cgroup_use)
ccb4cabe 618 return true;
c2712f64 619
77c3e9a2
CB
620 hlist = ops->hierarchies;
621 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
622 if (!controller_found(hlist, *cur))
623 return log_error(false, "No %s controller mountpoint found", *cur);
c2712f64 624
ccb4cabe
SH
625 return true;
626}
627
f205f10c
CB
628/* Get the controllers from a mountinfo line There are other ways we could get
629 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
630 * could parse the mount options. But we simply assume that the mountpoint must
631 * be /sys/fs/cgroup/controller-list
ccb4cabe 632 */
a3926f6a
CB
633static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
634 int type)
ccb4cabe 635{
f205f10c
CB
636 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
637 * for legacy hierarchies.
638 */
f761d24d 639 __do_free_string_list char **aret = NULL;
ccb4cabe 640 int i;
d97919ab 641 char *p2, *tok;
0be0d78f 642 char *p = line, *sep = ",";
6328fd9c 643
ccb4cabe 644 for (i = 0; i < 4; i++) {
235f1815 645 p = strchr(p, ' ');
ccb4cabe
SH
646 if (!p)
647 return NULL;
648 p++;
649 }
a55f31bd 650
f205f10c
CB
651 /* Note, if we change how mountinfo works, then our caller will need to
652 * verify /sys/fs/cgroup/ in this field.
653 */
77c3e9a2
CB
654 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
655 return log_error(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
d6337a5f 656
ccb4cabe 657 p += 15;
235f1815 658 p2 = strchr(p, ' ');
77c3e9a2
CB
659 if (!p2)
660 return log_error(NULL, "Corrupt mountinfo");
ccb4cabe 661 *p2 = '\0';
6328fd9c 662
d6337a5f 663 if (type == CGROUP_SUPER_MAGIC) {
88396101 664 __do_free char *dup = NULL;
d97919ab 665
0be0d78f
CB
666 /* strdup() here for v1 hierarchies. Otherwise
667 * lxc_iterate_parts() will destroy mountpoints such as
668 * "/sys/fs/cgroup/cpu,cpuacct".
d6337a5f 669 */
d97919ab 670 dup = must_copy_string(p);
d6337a5f
CB
671 if (!dup)
672 return NULL;
673
d97919ab 674 lxc_iterate_parts (tok, dup, sep)
d6337a5f 675 must_append_controller(klist, nlist, &aret, tok);
411ac6d8 676 }
d6337a5f 677 *p2 = ' ';
f205f10c 678
f761d24d 679 return move_ptr(aret);
d6337a5f 680}
411ac6d8 681
d6337a5f
CB
682static char **cg_unified_make_empty_controller(void)
683{
f761d24d 684 __do_free_string_list char **aret = NULL;
d6337a5f 685 int newentry;
d6337a5f
CB
686
687 newentry = append_null_to_list((void ***)&aret);
688 aret[newentry] = NULL;
f761d24d 689 return move_ptr(aret);
d6337a5f
CB
690}
691
692static char **cg_unified_get_controllers(const char *file)
693{
d97919ab 694 __do_free char *buf = NULL;
f761d24d 695 __do_free_string_list char **aret = NULL;
0be0d78f 696 char *sep = " \t\n";
2a63b5cb 697 char *tok;
d6337a5f
CB
698
699 buf = read_file(file);
700 if (!buf)
411ac6d8 701 return NULL;
6328fd9c 702
0be0d78f 703 lxc_iterate_parts(tok, buf, sep) {
d6337a5f
CB
704 int newentry;
705 char *copy;
706
707 newentry = append_null_to_list((void ***)&aret);
708 copy = must_copy_string(tok);
709 aret[newentry] = copy;
ccb4cabe
SH
710 }
711
f761d24d 712 return move_ptr(aret);
ccb4cabe
SH
713}
714
2202afc9 715static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
bb221ad1 716 char *container_base_path, int type)
ccb4cabe
SH
717{
718 struct hierarchy *new;
719 int newentry;
720
1973b62a 721 new = zalloc(sizeof(*new));
ccb4cabe
SH
722 new->controllers = clist;
723 new->mountpoint = mountpoint;
bb221ad1 724 new->container_base_path = container_base_path;
d6337a5f 725 new->version = type;
1973b62a
CB
726 new->cgfd_con = -EBADF;
727 new->cgfd_mon = -EBADF;
6328fd9c 728
2202afc9
CB
729 newentry = append_null_to_list((void ***)h);
730 (*h)[newentry] = new;
d6337a5f 731 return new;
ccb4cabe
SH
732}
733
798c3b33
CB
734/* Get a copy of the mountpoint from @line, which is a line from
735 * /proc/self/mountinfo.
ccb4cabe 736 */
a3926f6a 737static char *cg_hybrid_get_mountpoint(char *line)
ccb4cabe 738{
77c3e9a2 739 char *p = line, *sret = NULL;
ccb4cabe 740 size_t len;
798c3b33 741 char *p2;
ccb4cabe 742
77c3e9a2 743 for (int i = 0; i < 4; i++) {
235f1815 744 p = strchr(p, ' ');
ccb4cabe
SH
745 if (!p)
746 return NULL;
747 p++;
748 }
d6337a5f 749
dca9587a 750 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
d6337a5f
CB
751 return NULL;
752
753 p2 = strchr(p + 15, ' ');
754 if (!p2)
755 return NULL;
756 *p2 = '\0';
757
ccb4cabe 758 len = strlen(p);
f25a2044 759 sret = must_realloc(NULL, len + 1);
ccb4cabe
SH
760 memcpy(sret, p, len);
761 sret[len] = '\0';
77c3e9a2 762
ccb4cabe
SH
763 return sret;
764}
765
/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
	char *eol, *sret;
	size_t len;

	eol = strchr(p, '\n');
	if (!eol)
		return NULL;

	len = eol - p;
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}
783
bced39de
CB
784/* cgline: pointer to character after the first ':' in a line in a \n-terminated
785 * /proc/self/cgroup file. Check whether controller c is present.
ccb4cabe
SH
786 */
787static bool controller_in_clist(char *cgline, char *c)
788{
d97919ab
CB
789 __do_free char *tmp = NULL;
790 char *tok, *eol;
ccb4cabe
SH
791 size_t len;
792
235f1815 793 eol = strchr(cgline, ':');
ccb4cabe
SH
794 if (!eol)
795 return false;
796
797 len = eol - cgline;
861cb8c2 798 tmp = must_realloc(NULL, len + 1);
ccb4cabe
SH
799 memcpy(tmp, cgline, len);
800 tmp[len] = '\0';
801
d97919ab
CB
802 lxc_iterate_parts(tok, tmp, ",")
803 if (strcmp(tok, c) == 0)
ccb4cabe 804 return true;
d6337a5f 805
ccb4cabe
SH
806 return false;
807}
808
c3ef912e
CB
809/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
810 * @controller.
ccb4cabe 811 */
c3ef912e
CB
812static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
813 int type)
ccb4cabe
SH
814{
815 char *p = basecginfo;
6328fd9c 816
d6337a5f
CB
817 for (;;) {
818 bool is_cgv2_base_cgroup = false;
819
6328fd9c 820 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
821 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
822 is_cgv2_base_cgroup = true;
ccb4cabe 823
235f1815 824 p = strchr(p, ':');
ccb4cabe
SH
825 if (!p)
826 return NULL;
827 p++;
d6337a5f
CB
828
829 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 830 p = strchr(p, ':');
ccb4cabe
SH
831 if (!p)
832 return NULL;
833 p++;
834 return copy_to_eol(p);
835 }
836
235f1815 837 p = strchr(p, '\n');
ccb4cabe
SH
838 if (!p)
839 return NULL;
840 p++;
841 }
842}
843
ccb4cabe
SH
/* Append a copy of @entry to the NULL-terminated list @list. Does not fail. */
static void must_append_string(char ***list, char *entry)
{
	int slot;

	slot = append_null_to_list((void ***)list);
	(*list)[slot] = must_copy_string(entry);
}
853
d6337a5f 854static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 855{
d97919ab
CB
856 __do_free char *line = NULL;
857 __do_fclose FILE *f = NULL;
ccb4cabe
SH
858 size_t len = 0;
859
4110345b 860 f = fopen("/proc/self/cgroup", "re");
d6337a5f
CB
861 if (!f)
862 return -1;
863
ccb4cabe 864 while (getline(&line, &len, f) != -1) {
0be0d78f 865 char *p, *p2, *tok;
235f1815 866 p = strchr(line, ':');
ccb4cabe
SH
867 if (!p)
868 continue;
869 p++;
235f1815 870 p2 = strchr(p, ':');
ccb4cabe
SH
871 if (!p2)
872 continue;
873 *p2 = '\0';
ff8d6ee9 874
6328fd9c
CB
875 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
876 * contains an entry of the form:
ff8d6ee9
CB
877 *
878 * 0::/some/path
879 *
6328fd9c 880 * In this case we use "cgroup2" as controller name.
ff8d6ee9 881 */
6328fd9c
CB
882 if ((p2 - p) == 0) {
883 must_append_string(klist, "cgroup2");
ff8d6ee9 884 continue;
6328fd9c 885 }
ff8d6ee9 886
0be0d78f 887 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
888 if (strncmp(tok, "name=", 5) == 0)
889 must_append_string(nlist, tok);
890 else
891 must_append_string(klist, tok);
892 }
893 }
894
d6337a5f 895 return 0;
ccb4cabe
SH
896}
897
/* Strip trailing newlines from @s in place and return it. */
static char *trim(char *s)
{
	size_t len = strlen(s);

	/* NOTE(review): a string that is exactly "\n" is left untouched by
	 * the len > 1 guard - confirm callers rely on this.
	 */
	while ((len > 1) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}
908
2202afc9 909static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
ccb4cabe
SH
910{
911 int i;
27d84737 912 struct hierarchy **it;
41c33dbe 913
2202afc9
CB
914 if (!ops->hierarchies) {
915 TRACE(" No hierarchies found");
ccb4cabe
SH
916 return;
917 }
27d84737 918
2202afc9
CB
919 TRACE(" Hierarchies:");
920 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
ccb4cabe 921 int j;
27d84737
CB
922 char **cit;
923
bb221ad1 924 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
2202afc9
CB
925 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
926 TRACE(" controllers:");
a7b0cc4c 927 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
2202afc9 928 TRACE(" %d: %s", j, *cit);
ccb4cabe
SH
929 }
930}
41c33dbe 931
a3926f6a
CB
/* Dump @basecginfo (a copy of /proc/<pid>/cgroup) and the parsed kernel/named
 * subsystem lists to the TRACE log.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}
ccb4cabe 947
2202afc9
CB
948static int cgroup_rmdir(struct hierarchy **hierarchies,
949 const char *container_cgroup)
c71d83e1 950{
2202afc9
CB
951 if (!container_cgroup || !hierarchies)
952 return 0;
d6337a5f 953
8e64b673 954 for (int i = 0; hierarchies[i]; i++) {
2202afc9 955 struct hierarchy *h = hierarchies[i];
77c3e9a2 956 int ret;
d6337a5f 957
eb697136 958 if (!h->container_full_path)
2202afc9
CB
959 continue;
960
eb697136 961 ret = recursive_destroy(h->container_full_path);
2202afc9 962 if (ret < 0)
eb697136 963 WARN("Failed to destroy \"%s\"", h->container_full_path);
2202afc9 964
77c3e9a2 965 free_disarm(h->container_full_path);
2202afc9 966 }
d6337a5f 967
c71d83e1 968 return 0;
d6337a5f
CB
969}
970
2202afc9
CB
971struct generic_userns_exec_data {
972 struct hierarchy **hierarchies;
973 const char *container_cgroup;
974 struct lxc_conf *conf;
975 uid_t origuid; /* target uid in parent namespace */
976 char *path;
977};
d6337a5f 978
2202afc9
CB
979static int cgroup_rmdir_wrapper(void *data)
980{
2202afc9
CB
981 struct generic_userns_exec_data *arg = data;
982 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
983 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
8e64b673 984 int ret;
d6337a5f 985
b58214ac
CB
986 if (!lxc_setgroups(0, NULL) && errno != EPERM)
987 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
988
2202afc9 989 ret = setresgid(nsgid, nsgid, nsgid);
8e64b673 990 if (ret < 0)
77c3e9a2 991 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
8e64b673 992 (int)nsgid, (int)nsgid, (int)nsgid);
d6337a5f 993
2202afc9 994 ret = setresuid(nsuid, nsuid, nsuid);
8e64b673 995 if (ret < 0)
77c3e9a2 996 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
8e64b673 997 (int)nsuid, (int)nsuid, (int)nsuid);
d6337a5f 998
2202afc9 999 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
d6337a5f
CB
1000}
1001
434c8e15
CB
/* Tear down the container (payload) cgroups. When an id mapping is configured
 * the removal runs inside the user namespace via userns_exec_1(); otherwise it
 * happens directly. Any attached cgroup2 device bpf program is detached first.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy when no hierarchies were set up. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	/* Detach the device-controller bpf program before removing cgroups. */
	ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");
#endif

	if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.container_cgroup = ops->container_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		/* Mapped container: remove cgroups from inside the userns. */
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	} else {
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
1046
434c8e15
CB
1047__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1048 struct lxc_handler *handler)
1049{
1050 int len;
434c8e15 1051 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1973b62a 1052 const struct lxc_conf *conf;
b376d3d0 1053
fc3b9533
CB
1054 if (!ops) {
1055 ERROR("Called with uninitialized cgroup operations");
1056 return;
1057 }
434c8e15
CB
1058
1059 if (!ops->hierarchies)
1060 return;
1061
fc3b9533
CB
1062 if (!handler) {
1063 ERROR("Called with uninitialized handler");
1064 return;
1065 }
b376d3d0 1066
fc3b9533
CB
1067 if (!handler->conf) {
1068 ERROR("Called with uninitialized conf");
1069 return;
1070 }
1973b62a
CB
1071 conf = handler->conf;
1072
434c8e15
CB
1073 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1074 if (len < 0 || (size_t)len >= sizeof(pidstr))
1075 return;
1076
1077 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1078 __do_free char *pivot_path = NULL;
434c8e15 1079 struct hierarchy *h = ops->hierarchies[i];
fe70edee 1080 int ret;
434c8e15
CB
1081
1082 if (!h->monitor_full_path)
1083 continue;
1084
c468e4d4
CB
1085 /* Monitor might have died before we entered the cgroup. */
1086 if (handler->monitor_pid <= 0) {
1087 WARN("No valid monitor process found while destroying cgroups");
1088 goto try_recursive_destroy;
1089 }
1090
1973b62a
CB
1091 if (conf && conf->cgroup_meta.dir)
1092 pivot_path = must_make_path(h->mountpoint,
1093 h->container_base_path,
1094 conf->cgroup_meta.dir,
1095 CGROUP_PIVOT, NULL);
1096 else
1097 pivot_path = must_make_path(h->mountpoint,
1098 h->container_base_path,
1099 CGROUP_PIVOT, NULL);
1100
1101 ret = mkdir_p(pivot_path, 0755);
fc3b9533
CB
1102 if (ret < 0 && errno != EEXIST) {
1103 ERROR("Failed to create %s", pivot_path);
1104 goto try_recursive_destroy;
1105 }
1973b62a 1106
c468e4d4
CB
1107 ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
1108 if (ret != 0) {
1109 SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
1110 continue;
fc3b9533 1111 }
434c8e15 1112
1973b62a 1113try_recursive_destroy:
434c8e15
CB
1114 ret = recursive_destroy(h->monitor_full_path);
1115 if (ret < 0)
1116 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
434c8e15
CB
1117 }
1118}
1119
6099dd5a
CB
1120static int mkdir_eexist_on_last(const char *dir, mode_t mode)
1121{
1122 const char *tmp = dir;
1123 const char *orig = dir;
1124 size_t orig_len;
1125
1126 orig_len = strlen(dir);
1127 do {
6453ba56 1128 __do_free char *makeme = NULL;
6099dd5a
CB
1129 int ret;
1130 size_t cur_len;
6099dd5a
CB
1131
1132 dir = tmp + strspn(tmp, "/");
1133 tmp = dir + strcspn(dir, "/");
1134
6099dd5a
CB
1135 cur_len = dir - orig;
1136 makeme = strndup(orig, cur_len);
1137 if (!makeme)
77c3e9a2 1138 return ret_set_errno(-1, ENOMEM);
6099dd5a
CB
1139
1140 ret = mkdir(makeme, mode);
77c3e9a2
CB
1141 if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
1142 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
6099dd5a
CB
1143 } while (tmp != dir);
1144
1145 return 0;
1146}
1147
fe70edee 1148static bool create_cgroup_tree(struct hierarchy *h, const char *cgroup_tree,
f990d3bf 1149 const char *cgroup_leaf, bool payload)
72068e74 1150{
fe70edee
CB
1151 __do_free char *path = NULL;
1152 int ret, ret_cpuset;
72068e74 1153
fe70edee
CB
1154 path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
1155 if (dir_exists(path))
1156 return log_warn_errno(false, errno, "The %s cgroup already existed", path);
72068e74 1157
fe70edee
CB
1158 ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
1159 if (ret_cpuset < 0)
1160 return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");
0c3deb94 1161
fe70edee 1162 ret = mkdir_eexist_on_last(path, 0755);
6099dd5a 1163 if (ret < 0) {
fe70edee
CB
1164 /*
1165 * This is the cpuset controller and
1166 * cg_legacy_handle_cpuset_hierarchy() has created our target
1167 * directory for us to ensure correct initialization.
1168 */
1169 if (ret_cpuset != 1 || cgroup_tree)
1170 return log_error_errno(false, errno, "Failed to create %s cgroup", path);
6f9584d8 1171 }
0c3deb94 1172
1973b62a
CB
1173 if (payload) {
1174 h->cgfd_con = lxc_open_dirfd(path);
1175 if (h->cgfd_con < 0)
1176 return log_error_errno(false, errno, "Failed to open %s", path);
fe70edee 1177 h->container_full_path = move_ptr(path);
1973b62a
CB
1178 } else {
1179 h->cgfd_mon = lxc_open_dirfd(path);
1180 if (h->cgfd_mon < 0)
1181 return log_error_errno(false, errno, "Failed to open %s", path);
fe70edee 1182 h->monitor_full_path = move_ptr(path);
1973b62a 1183 }
fe70edee 1184
c581d2a6 1185 return true;
ccb4cabe
SH
1186}
1187
fe70edee 1188static void cgroup_remove_leaf(struct hierarchy *h, bool payload)
ccb4cabe 1189{
fe70edee 1190 __do_free char *full_path = NULL;
72068e74 1191
1973b62a 1192 if (payload) {
f62cf1d4 1193 __lxc_unused __do_close int fd = move_fd(h->cgfd_con);
d6bdd182 1194 full_path = move_ptr(h->container_full_path);
1973b62a 1195 } else {
f62cf1d4 1196 __lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
d6bdd182 1197 full_path = move_ptr(h->monitor_full_path);
1973b62a 1198 }
e56639fb 1199
d6bdd182 1200 if (full_path && rmdir(full_path))
fe70edee 1201 SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
72068e74
CB
1202}
1203
/* Create the monitor cgroup in every hierarchy. The cgroup name is derived
 * from lxc.cgroup.dir, the configured cgroup pattern, or a default prefix plus
 * the container name, with a "-<idx>" retry suffix appended on collisions
 * (up to 999 retries).
 */
__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	/* Pick the cgroup name: explicit dir > pattern > built-in default.
	 * CGROUP_CREATE_RETRY reserves room for the retry suffix.
	 */
	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* @suffix points at the reserved retry-suffix area; start without one. */
	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		/* On any creation failure, roll back the leaves created so
		 * far and retry with the next suffix.
		 */
		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}
1278
fe70edee
CB
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	/* Pick the cgroup name: explicit dir > pattern > built-in default.
	 * CGROUP_CREATE_RETRY reserves room for the retry suffix.
	 */
	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP,
					       CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	}
	if (!container_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* @suffix points at the reserved retry-suffix area; start without one. */
	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		/* On any creation failure, roll back the leaves created so
		 * far and retry with the next suffix.
		 */
		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->container_cgroup = move_ptr(container_cgroup);
	INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
	return true;
}
1358
c581d2a6
CB
1359__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1360 struct lxc_handler *handler)
ccb4cabe 1361{
c581d2a6
CB
1362 int monitor_len, transient_len;
1363 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1364 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1365
797fa65e
CB
1366 if (!ops)
1367 return ret_set_errno(false, ENOENT);
1368
69b4a4bb
CB
1369 if (!ops->hierarchies)
1370 return true;
1371
797fa65e
CB
1372 if (!ops->monitor_cgroup)
1373 return ret_set_errno(false, ENOENT);
1374
1375 if (!handler || !handler->conf)
1376 return ret_set_errno(false, EINVAL);
1377
c581d2a6
CB
1378 monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1379 if (handler->transient_pid > 0)
1973b62a 1380 transient_len = snprintf(transient, sizeof(transient), "%d", handler->transient_pid);
ccb4cabe 1381
eeef32bb 1382 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1383 struct hierarchy *h = ops->hierarchies[i];
c581d2a6 1384 int ret;
08768001 1385
1973b62a
CB
1386 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
1387 if (ret)
1388 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
c581d2a6
CB
1389
1390 if (handler->transient_pid < 0)
1391 return true;
1392
1973b62a
CB
1393 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
1394 if (ret)
1395 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
1396
1397 /*
78eb6aa6 1398 * we don't keep the fds for non-unified hierarchies around
1973b62a 1399 * mainly because we don't make use of them anymore after the
78eb6aa6 1400 * core cgroup setup is done but also because there are quite a
1973b62a
CB
1401 * lot of them.
1402 */
1403 if (!is_unified_hierarchy(h))
1404 close_prot_errno_disarm(h->cgfd_mon);
ccb4cabe 1405 }
c581d2a6 1406 handler->transient_pid = -1;
ccb4cabe
SH
1407
1408 return true;
1409}
1410
c581d2a6
CB
1411__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1412 struct lxc_handler *handler)
eeef32bb 1413{
c581d2a6
CB
1414 int len;
1415 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
eeef32bb 1416
4490328e
CB
1417 if (!ops)
1418 return ret_set_errno(false, ENOENT);
1419
c581d2a6
CB
1420 if (!ops->hierarchies)
1421 return true;
1422
4490328e
CB
1423 if (!ops->container_cgroup)
1424 return ret_set_errno(false, ENOENT);
1425
1426 if (!handler || !handler->conf)
1427 return ret_set_errno(false, EINVAL);
1428
c581d2a6
CB
1429 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1430
1431 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1432 struct hierarchy *h = ops->hierarchies[i];
c581d2a6
CB
1433 int ret;
1434
1973b62a 1435 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
c581d2a6 1436 if (ret != 0)
1973b62a 1437 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
c581d2a6
CB
1438 }
1439
1440 return true;
eeef32bb
CB
1441}
1442
1973b62a
CB
1443static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1444 gid_t chown_gid, mode_t chmod_mode)
6efacf80
CB
1445{
1446 int ret;
1447
1973b62a
CB
1448 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1449 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1450 if (ret < 0)
1451 return log_warn_errno(-1,
1452 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1453 dirfd, path, (int)chown_uid,
1454 (int)chown_gid);
6efacf80 1455
1973b62a
CB
1456 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1457 if (ret < 0)
1458 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1459 dirfd, path, (int)chmod_mode);
6efacf80
CB
1460
1461 return 0;
1462}
1463
1464/* chgrp the container cgroups to container group. We leave
c0888dfe
SH
1465 * the container owner as cgroup owner. So we must make the
1466 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1467 *
1468 * Also chown the tasks and cgroup.procs files. Those may not
1469 * exist depending on kernel version.
c0888dfe 1470 */
ccb4cabe
SH
1471static int chown_cgroup_wrapper(void *data)
1472{
6a720d74 1473 int ret;
4160c3a0
CB
1474 uid_t destuid;
1475 struct generic_userns_exec_data *arg = data;
1476 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1477 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1478
b58214ac
CB
1479 if (!lxc_setgroups(0, NULL) && errno != EPERM)
1480 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1481
6efacf80 1482 ret = setresgid(nsgid, nsgid, nsgid);
803e4123 1483 if (ret < 0)
77c3e9a2 1484 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
803e4123 1485 (int)nsgid, (int)nsgid, (int)nsgid);
6efacf80
CB
1486
1487 ret = setresuid(nsuid, nsuid, nsuid);
803e4123 1488 if (ret < 0)
77c3e9a2 1489 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
803e4123 1490 (int)nsuid, (int)nsuid, (int)nsuid);
6efacf80 1491
ccb4cabe 1492 destuid = get_ns_uid(arg->origuid);
b962868f
CB
1493 if (destuid == LXC_INVALID_UID)
1494 destuid = 0;
ccb4cabe 1495
6a720d74 1496 for (int i = 0; arg->hierarchies[i]; i++) {
1973b62a 1497 int dirfd = arg->hierarchies[i]->cgfd_con;
43647298 1498
1973b62a 1499 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
c0888dfe 1500
1973b62a
CB
1501 /*
1502 * Failures to chown() these are inconvenient but not
6efacf80
CB
1503 * detrimental We leave these owned by the container launcher,
1504 * so that container root can write to the files to attach. We
1505 * chmod() them 664 so that container systemd can write to the
1506 * files (which systemd in wily insists on doing).
ab8f5424 1507 */
6efacf80 1508
1973b62a
CB
1509 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
1510 (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
43647298 1511
1973b62a 1512 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
0e17357c 1513
2202afc9 1514 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1515 continue;
1516
1973b62a
CB
1517 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
1518 (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
ccb4cabe
SH
1519 }
1520
1521 return 0;
1522}
1523
b857f4be 1524__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
c98bbf71 1525 struct lxc_conf *conf)
ccb4cabe 1526{
4160c3a0 1527 struct generic_userns_exec_data wrap;
ccb4cabe 1528
c98bbf71
CB
1529 if (!ops)
1530 return ret_set_errno(false, ENOENT);
ccb4cabe 1531
69b4a4bb
CB
1532 if (!ops->hierarchies)
1533 return true;
1534
c98bbf71
CB
1535 if (!ops->container_cgroup)
1536 return ret_set_errno(false, ENOENT);
1537
1538 if (!conf)
1539 return ret_set_errno(false, EINVAL);
1540
1541 if (lxc_list_empty(&conf->id_map))
1542 return true;
1543
ccb4cabe 1544 wrap.origuid = geteuid();
4160c3a0 1545 wrap.path = NULL;
2202afc9 1546 wrap.hierarchies = ops->hierarchies;
4160c3a0 1547 wrap.conf = conf;
ccb4cabe 1548
c98bbf71
CB
1549 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1550 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1551
1552 return true;
1553}
1554
78eb6aa6
CB
1555__cgfsng_ops void cgfsng_payload_finalize(struct cgroup_ops *ops)
1556{
1557 if (!ops)
1558 return;
1559
1560 if (!ops->hierarchies)
1561 return;
1562
1563 for (int i = 0; ops->hierarchies[i]; i++) {
1564 struct hierarchy *h = ops->hierarchies[i];
1565 /*
1566 * we don't keep the fds for non-unified hierarchies around
1567 * mainly because we don't make use of them anymore after the
1568 * core cgroup setup is done but also because there are quite a
1569 * lot of them.
1570 */
1571 if (!is_unified_hierarchy(h))
1572 close_prot_errno_disarm(h->cgfd_con);
1573 }
1574}
1575
8aa1044f 1576/* cgroup-full:* is done, no need to create subdirs */
77c3e9a2 1577static inline bool cg_mount_needs_subdirs(int type)
8aa1044f 1578{
77c3e9a2 1579 return !(type >= LXC_AUTO_CGROUP_FULL_RO);
8aa1044f
SH
1580}
1581
886cac86
CB
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* For read-only/mixed layouts the controller mountpoint itself is
	 * bind-mounted over itself and then remounted read-only.
	 */
	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind the container's own cgroup from the host hierarchy onto the
	 * prepared path inside the container rootfs.
	 */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* MS_RDONLY on a bind mount only takes effect on remount. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1635
6812d833
CB
1636/* __cg_mount_direct
1637 *
1638 * Mount cgroup hierarchies directly without using bind-mounts. The main
1639 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1640 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1641 */
1642static int __cg_mount_direct(int type, struct hierarchy *h,
1643 const char *controllerpath)
b635e92d 1644{
d97919ab 1645 __do_free char *controllers = NULL;
a760603e
CB
1646 char *fstype = "cgroup2";
1647 unsigned long flags = 0;
f6b54668 1648 int ret;
b635e92d 1649
a760603e
CB
1650 flags |= MS_NOSUID;
1651 flags |= MS_NOEXEC;
1652 flags |= MS_NODEV;
1653 flags |= MS_RELATIME;
1654
1655 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1656 flags |= MS_RDONLY;
1657
d6337a5f 1658 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1659 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1660 if (!controllers)
1661 return -ENOMEM;
1662 fstype = "cgroup";
b635e92d
CB
1663 }
1664
a760603e 1665 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
77c3e9a2
CB
1666 if (ret < 0)
1667 return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s",
1668 controllerpath, fstype);
b635e92d 1669
6812d833 1670 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1671 return 0;
1672}
1673
6812d833
CB
/* Mount a hierarchy directly; used when the container has a cgroup namespace
 * but will lack CAP_SYS_ADMIN, so it cannot mount cgroups itself.
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}
1679
1680static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1681 const char *controllerpath)
1682{
1683 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1684 return 0;
1685
1686 return __cg_mount_direct(type, h, controllerpath);
1687}
1688
/* Mount the cgroup filesystems inside the container rootfs according to the
 * LXC_AUTO_CGROUP_* @type. Skipped entirely when cgroup namespaces are
 * available and no forced mount is required.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler,
				      const char *root, int type)
{
	__do_free char *cgroup_root = NULL;
	bool has_cgns = false, wants_force_mount = false;
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
		return true;

	if (type & LXC_AUTO_CGROUP_FORCE) {
		type &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	/* Force-mount when the container will not retain CAP_SYS_ADMIN and
	 * so cannot mount cgroups itself.
	 */
	if (!wants_force_mount){
		if (!lxc_list_empty(&handler->conf->keepcaps))
			wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
		else
			wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
	}

	/* With cgroup namespaces and no forced mount nothing needs doing. */
	has_cgns = cgns_supported();
	if (has_cgns && !wants_force_mount)
		return true;

	if (type == LXC_AUTO_CGROUP_NOSPEC)
		type = LXC_AUTO_CGROUP_MIXED;
	else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
		type = LXC_AUTO_CGROUP_FULL_MIXED;

	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
	/* Pure unified layout: a single cgroup2 mount suffices. */
	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
		if (has_cgns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			return cg_mount_in_cgroup_namespace(type, ops->unified, cgroup_root) == 0;
		}

		return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
	}

	/* mount tmpfs */
	ret = safe_mount(NULL, cgroup_root, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		return false;

	/* Legacy/hybrid layout: one subdirectory per hierarchy under the
	 * tmpfs, each mounted according to @type.
	 */
	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *controllerpath = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(cgroup_root, controller, NULL);
		if (dir_exists(controllerpath))
			continue;

		ret = mkdir(controllerpath, 0755);
		if (ret < 0)
			return log_error_errno(false, errno, "Error creating cgroup path: %s", controllerpath);

		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
			if (ret < 0)
				return false;

			continue;
		}

		ret = cg_mount_cgroup_full(type, h, controllerpath);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(type))
			continue;

		/* Create and mount the container's own cgroup subdirectory. */
		path2 = must_make_path(controllerpath, h->container_base_path,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0)
			return false;

		ret = cg_legacy_mount_controllers(type, h, controllerpath,
						  path2, ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}
1801
11c23867 1802/* Only root needs to escape to the cgroup of its init. */
b857f4be 1803__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
52d08ab0 1804 struct lxc_conf *conf)
ccb4cabe 1805{
52d08ab0
CB
1806 if (!ops)
1807 return ret_set_errno(false, ENOENT);
1808
1809 if (!ops->hierarchies)
1810 return true;
1811
1812 if (!conf)
1813 return ret_set_errno(false, EINVAL);
1814
1815 if (conf->cgroup_meta.relative || geteuid())
ccb4cabe
SH
1816 return true;
1817
779b3d82 1818 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1819 __do_free char *fullpath = NULL;
52d08ab0 1820 int ret;
11c23867 1821
52d08ab0
CB
1822 fullpath =
1823 must_make_path(ops->hierarchies[i]->mountpoint,
1824 ops->hierarchies[i]->container_base_path,
1825 "cgroup.procs", NULL);
7cea5905 1826 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
52d08ab0 1827 if (ret != 0)
77c3e9a2 1828 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
ccb4cabe
SH
1829 }
1830
6df334d1 1831 return true;
ccb4cabe
SH
1832}
1833
b857f4be 1834__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
36662416 1835{
69b4a4bb
CB
1836 int i = 0;
1837
e3ffb28b
CB
1838 if (!ops)
1839 return ret_set_errno(-1, ENOENT);
1840
69b4a4bb
CB
1841 if (!ops->hierarchies)
1842 return 0;
36662416 1843
69b4a4bb 1844 for (; ops->hierarchies[i]; i++)
36662416
TA
1845 ;
1846
1847 return i;
1848}
1849
aa48a34f
CB
1850__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
1851 char ***out)
36662416
TA
1852{
1853 int i;
1854
aa48a34f
CB
1855 if (!ops)
1856 return ret_set_errno(false, ENOENT);
1857
69b4a4bb 1858 if (!ops->hierarchies)
77c3e9a2 1859 return ret_set_errno(false, ENOENT);
69b4a4bb 1860
36662416 1861 /* sanity check n */
6b38e644 1862 for (i = 0; i < n; i++)
2202afc9 1863 if (!ops->hierarchies[i])
aa48a34f 1864 return ret_set_errno(false, ENOENT);
36662416 1865
2202afc9 1866 *out = ops->hierarchies[i]->controllers;
36662416
TA
1867
1868 return true;
1869}
1870
/* Freeze the container on a legacy (v1) layout by writing "FROZEN" to the
 * freezer controller's freezer.state file.
 */
static bool cg_legacy_freeze(struct cgroup_ops *ops)
{
	struct hierarchy *h;

	h = get_hierarchy(ops, "freezer");
	if (!h)
		return ret_set_errno(-1, ENOENT);

	/* NOTE(review): both return paths produce int values (-1 and
	 * lxc_write_openat()'s result) that are implicitly converted to the
	 * bool return type — confirm callers expect this convention.
	 */
	return lxc_write_openat(h->container_full_path, "freezer.state",
				"FROZEN", STRLITERALLEN("FROZEN"));
}
942e193e 1882
018051e3
CB
1883static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1884 struct lxc_epoll_descr *descr)
ee3a7775 1885{
f62cf1d4 1886 __do_close int duped_fd = -EBADF;
018051e3 1887 __do_free char *line = NULL;
ee3a7775 1888 __do_fclose FILE *f = NULL;
018051e3
CB
1889 int state = PTR_TO_INT(cbdata);
1890 size_t len;
1891 const char *state_string;
1892
1893 duped_fd = dup(fd);
1894 if (duped_fd < 0)
1895 return LXC_MAINLOOP_ERROR;
1896
1897 if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1)
1898 return LXC_MAINLOOP_ERROR;
1899
1900 f = fdopen(duped_fd, "re");
1901 if (!f)
1902 return LXC_MAINLOOP_ERROR;
1903 move_fd(duped_fd);
1904
1905 if (state == 1)
1906 state_string = "frozen 1";
1907 else
1908 state_string = "frozen 0";
1909
1910 while (getline(&line, &len, f) != -1)
1911 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
1912 return LXC_MAINLOOP_CLOSE;
1913
1914 return LXC_MAINLOOP_CONTINUE;
1915}
1916
/* Freeze the container on the unified (cgroup2) hierarchy by writing "1" to
 * cgroup.freeze. When @timeout is non-zero, additionally poll cgroup.events
 * via a temporary epoll mainloop until the "frozen 1" notification appears or
 * the timeout expires.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cbdata 1: the callback waits for "frozen 1" in cgroup.events. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	/* Block until the freezer callback closes the loop or @timeout expires. */
	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be frozen");

	return 0;
}
1961
018051e3 1962__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 1963{
81468ea7 1964 if (!ops->hierarchies)
d2203230 1965 return ret_set_errno(-1, ENOENT);
81468ea7 1966
ee3a7775
CB
1967 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1968 return cg_legacy_freeze(ops);
942e193e 1969
018051e3 1970 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
1971}
1972
018051e3 1973static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 1974{
ee3a7775
CB
1975 struct hierarchy *h;
1976
1977 h = get_hierarchy(ops, "freezer");
1978 if (!h)
d2203230 1979 return ret_set_errno(-1, ENOENT);
ee3a7775 1980
c04a6d4e
CB
1981 return lxc_write_openat(h->container_full_path, "freezer.state",
1982 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
1983}
1984
/* Unfreeze the container on the unified (cgroup2) hierarchy by writing "0" to
 * cgroup.freeze. When @timeout is non-zero, additionally poll cgroup.events
 * via a temporary epoll mainloop until the "frozen 0" notification appears or
 * the timeout expires.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cbdata 0: the callback waits for "frozen 0" in cgroup.events. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	/* Block until the freezer callback closes the loop or @timeout expires. */
	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");

	return 0;
}
2029
018051e3 2030__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2031{
2032 if (!ops->hierarchies)
d2203230 2033 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2034
2035 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2036 return cg_legacy_unfreeze(ops);
2037
018051e3 2038 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2039}
2040
b857f4be 2041__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
6bdf9691 2042 const char *controller)
ccb4cabe 2043{
d6337a5f
CB
2044 struct hierarchy *h;
2045
2202afc9 2046 h = get_hierarchy(ops, controller);
6bdf9691 2047 if (!h)
77c3e9a2 2048 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
6bdf9691 2049 controller ? controller : "(null)");
ccb4cabe 2050
6bdf9691
CB
2051 return h->container_full_path
2052 ? h->container_full_path + strlen(h->mountpoint)
2053 : NULL;
371f834d
SH
2054}
2055
c40c8209
CB
/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
 * which must be freed by the caller.
 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	/* "<mountpoint>/<inpath>/<filename>"; must_make_path() allocates. */
	return must_make_path(h->mountpoint, inpath, filename, NULL);
}
2065
/* Attach @pid to a leaf cgroup below the directory referred to by
 * @unified_fd. cgroup2 only allows processes in leaf cgroups, so writes to
 * cgroup.procs of a cgroup with children fail with EBUSY; in that case fall
 * back to creating numbered ".lxc-<n>" leaf cgroups until one accepts the
 * pid, removing any directory we created whose cgroup.procs write failed.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	size_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
	/* First try the ".lxc" leaf, then the cgroup itself (in case it is
	 * still an unpopulated leaf node).
	 */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return 0;

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-1000/cgroup.procs") + 1];
		char *slash;

		ret = snprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0 || (size_t)ret >= sizeof(attach_cgroup))
			return ret_errno(EIO);

		/* Temporarily terminate the string at the '/' so the same
		 * buffer serves both as the directory name for mkdirat() and
		 * as the cgroup.procs path for lxc_writeat().
		 */
		slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs");
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		/* Only remove directories we created ourselves. */
		if (ret == 0)
			rm = true;

		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return 0;

		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	/* Gave up after 1000 attempts; errno still holds the last failure. */
	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2125
4b86fefd
CB
/* Argument bundle for cgroup_unified_attach_wrapper(), used when the attach
 * has to be performed from within the container's user namespace.
 */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container config (provides id mappings) */
	int unified_fd;			/* fd of the unified cgroup directory */
	pid_t pid;			/* pid to move into the cgroup */
};
2131
2132static int cgroup_unified_attach_wrapper(void *data)
2133{
2134 struct userns_exec_unified_attach_data *args = data;
4b86fefd
CB
2135
2136 if (!args->conf || args->unified_fd < 0 || args->pid <= 0)
2137 return ret_errno(EINVAL);
2138
4b86fefd
CB
2139 return cgroup_attach_leaf(args->conf, args->unified_fd, args->pid);
2140}
2141
7581a82f
CB
2142int cgroup_attach(const struct lxc_conf *conf, const char *name,
2143 const char *lxcpath, pid_t pid)
900b6606 2144{
f62cf1d4 2145 __do_close int unified_fd = -EBADF;
7581a82f
CB
2146 int ret;
2147
2148 if (!conf || !name || !lxcpath || pid <= 0)
2149 return ret_errno(EINVAL);
900b6606
CB
2150
2151 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
2152 if (unified_fd < 0)
7581a82f 2153 return ret_errno(EBADF);
900b6606 2154
4b86fefd
CB
2155 if (!lxc_list_empty(&conf->id_map)) {
2156 struct userns_exec_unified_attach_data args = {
2157 .conf = conf,
2158 .unified_fd = unified_fd,
2159 .pid = pid,
2160 };
ba7ca43b 2161
edf88289 2162 ret = userns_exec_minimal(conf, cgroup_unified_attach_wrapper, &args);
4b86fefd
CB
2163 } else {
2164 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2165 }
7581a82f 2166
4b86fefd 2167 return ret;
900b6606
CB
2168}
2169
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Preferred path: attach via the fd provided by the command server. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	/* -EBADF means "no cgroup2 fd available" and selects the fallback. */
	if (ret != -EBADF)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = must_make_path(h->mountpoint, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/* Same namespace rule as cgroup_attach(): id-mapped containers need
	 * the write performed from inside their user namespace.
	 */
	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = userns_exec_minimal(conf, cgroup_unified_attach_wrapper, &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2223
7581a82f
CB
/* Attach @pid to every hierarchy of the running container @name: cgroup2
 * hierarchies go through __cg_unified_attach(), legacy hierarchies are
 * written directly via the monitor's view of the container's cgroup path.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Nothing to attach to is not an error. */
	if (!ops->hierarchies)
		return true;

	len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->version == CGROUP2_SUPER_MAGIC) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		/* Legacy hierarchy: ask the running container for its cgroup
		 * path and append the pid to cgroup.procs.
		 */
		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		/* not running */
		if (!path)
			return false;

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2269
e2bd2b13
CB
2270/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2271 * don't have a cgroup_data set up, so we ask the running container through the
2272 * commands API for the cgroup path.
ccb4cabe 2273 */
b857f4be 2274__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2275 char *value, size_t len, const char *name,
2276 const char *lxcpath)
ccb4cabe 2277{
d97919ab 2278 __do_free char *path = NULL;
88396101 2279 __do_free char *controller = NULL;
d97919ab 2280 char *p;
0069cc61 2281 struct hierarchy *h;
861cb8c2 2282 int ret = -1;
ccb4cabe 2283
a358028a
CB
2284 if (!ops)
2285 return ret_set_errno(-1, ENOENT);
2286
861cb8c2 2287 controller = must_copy_string(filename);
0069cc61
CB
2288 p = strchr(controller, '.');
2289 if (p)
ccb4cabe
SH
2290 *p = '\0';
2291
0069cc61
CB
2292 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2293 /* not running */
2294 if (!path)
ccb4cabe
SH
2295 return -1;
2296
2202afc9 2297 h = get_hierarchy(ops, controller);
ccb4cabe 2298 if (h) {
88396101 2299 __do_free char *fullpath = NULL;
0069cc61
CB
2300
2301 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2302 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2303 }
ccb4cabe
SH
2304
2305 return ret;
2306}
2307
cb3fc90c
CB
2308static int device_cgroup_parse_access(struct device_item *device, const char *val)
2309{
2310 for (int count = 0; count < 3; count++, val++) {
2311 switch (*val) {
2312 case 'r':
2313 device->access[count] = *val;
2314 break;
2315 case 'w':
2316 device->access[count] = *val;
2317 break;
2318 case 'm':
2319 device->access[count] = *val;
2320 break;
2321 case '\n':
2322 case '\0':
2323 count = 3;
2324 break;
2325 default:
2326 return ret_errno(EINVAL);
2327 }
2328 }
2329
2330 return 0;
2331}
2332
2a63b5cb
CB
/* Parse a devices cgroup rule into @device. @val is either "a" (a global
 * rule covering all devices) or "<type> <major|*>:<minor|*> <access>", e.g.
 * "c 1:3 rwm". @key ("devices.allow" or "devices.deny") selects whether the
 * rule allows or denies access.
 *
 * Returns 0 on success, -1 on malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strcmp("devices.allow", key) == 0)
		device->allow = 1;
	else
		device->allow = 0;

	if (strcmp(val, "a") == 0) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		/* A global allow rule switches the eBPF program to blacklist
		 * mode (everything allowed unless denied) and vice versa.
		 */
		device->global_rule = device->allow
					  ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
					  : LXC_BPF_DEVICE_CGROUP_WHITELIST;
		device->allow = -1;
		return 0;
	}

	/* local rule */
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	/* Device type: 'a' (all), 'b' (block), or 'c' (char). */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* Read major: either the wildcard '*' or a decimal number. */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* Whatever follows the separating space is the access string. */
	return device_cgroup_parse_access(device, ++val);
}
2419
eec533e3
CB
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	/* Derive the controller from the key, e.g. "memory.max" -> "memory". */
	controller = must_copy_string(key);
	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/* On a pure cgroup2 layout device limits are enforced via an eBPF
	 * program installed through the command server, not via files.
	 */
	if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
		struct device_item device = {0};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2472
91d1a13a 2473/* take devices cgroup line
72add155
SH
2474 * /dev/foo rwx
2475 * and convert it to a valid
2476 * type major:minor mode
91d1a13a
CB
2477 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2478 * the output.
72add155 2479 */
cb3fc90c
CB
2480static int device_cgroup_rule_parse_devpath(struct device_item *device,
2481 const char *devpath)
72add155 2482{
88396101 2483 __do_free char *path = NULL;
2a06d041 2484 char *mode = NULL;
cb3fc90c
CB
2485 int n_parts, ret;
2486 char *p;
2487 struct stat sb;
72add155 2488
cb3fc90c 2489 path = must_copy_string(devpath);
72add155 2490
cb3fc90c
CB
2491 /*
2492 * Read path followed by mode. Ignore any trailing text.
91d1a13a
CB
2493 * A ' # comment' would be legal. Technically other text is not
2494 * legal, we could check for that if we cared to.
72add155 2495 */
0dbdb99e 2496 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2497 if (*p != ' ')
2498 continue;
2499 *p = '\0';
91d1a13a 2500
2c2d6c49
SH
2501 if (n_parts != 1)
2502 break;
2503 p++;
2504 n_parts++;
91d1a13a 2505
2c2d6c49
SH
2506 while (*p == ' ')
2507 p++;
91d1a13a 2508
2c2d6c49 2509 mode = p;
91d1a13a 2510
2c2d6c49 2511 if (*p == '\0')
cb3fc90c 2512 return ret_set_errno(-1, EINVAL);
72add155 2513 }
2c2d6c49 2514
cb3fc90c
CB
2515 if (device_cgroup_parse_access(device, mode) < 0)
2516 return -1;
2517
2c2d6c49 2518 if (n_parts == 1)
cb3fc90c 2519 return ret_set_errno(-1, EINVAL);
72add155
SH
2520
2521 ret = stat(path, &sb);
2522 if (ret < 0)
cb3fc90c 2523 return ret_set_errno(-1, errno);
72add155 2524
72add155
SH
2525 mode_t m = sb.st_mode & S_IFMT;
2526 switch (m) {
2527 case S_IFBLK:
cb3fc90c 2528 device->type = 'b';
72add155
SH
2529 break;
2530 case S_IFCHR:
cb3fc90c 2531 device->type = 'c';
72add155 2532 break;
2c2d6c49 2533 default:
77c3e9a2 2534 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
72add155 2535 }
2c2d6c49 2536
cb3fc90c
CB
2537 device->major = MAJOR(sb.st_rdev);
2538 device->minor = MINOR(sb.st_rdev);
2539 device->allow = 1;
2540 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
72add155 2541
cb3fc90c
CB
2542 return 0;
2543}
2544
2545static int convert_devpath(const char *invalue, char *dest)
2546{
2547 struct device_item device = {0};
2548 int ret;
2549
2550 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2551 if (ret < 0)
2552 return -1;
2553
2554 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2555 device.minor, device.access);
2556 if (ret < 0 || ret >= 50)
77c3e9a2
CB
2557 return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2558 device.type, device.major, device.minor, device.access);
cb3fc90c
CB
2559
2560 return 0;
72add155
SH
2561}
2562
90e97284
CB
2563/* Called from setup_limits - here we have the container's cgroup_data because
2564 * we created the cgroups.
ccb4cabe 2565 */
2202afc9
CB
2566static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2567 const char *value)
ccb4cabe 2568{
88396101 2569 __do_free char *controller = NULL;
d97919ab 2570 char *p;
1a0e70ac
CB
2571 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2572 char converted_value[50];
b3646d7e 2573 struct hierarchy *h;
64e82f8b 2574
861cb8c2 2575 controller = must_copy_string(filename);
ab1a6cac
CB
2576 p = strchr(controller, '.');
2577 if (p)
ccb4cabe
SH
2578 *p = '\0';
2579
c8bf519d 2580 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
c04a6d4e
CB
2581 int ret;
2582
72add155
SH
2583 ret = convert_devpath(value, converted_value);
2584 if (ret < 0)
c8bf519d 2585 return ret;
72add155 2586 value = converted_value;
c8bf519d 2587 }
2588
2202afc9 2589 h = get_hierarchy(ops, controller);
77c3e9a2
CB
2590 if (!h)
2591 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
b3646d7e 2592
c04a6d4e 2593 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
ccb4cabe
SH
2594}
2595
c581d2a6
CB
2596__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2597 struct lxc_conf *conf,
2598 bool do_devices)
ccb4cabe 2599{
d97919ab 2600 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
c581d2a6 2601 struct lxc_list *cgroup_settings = &conf->cgroup;
d97919ab 2602 struct lxc_list *iterator, *next;
ccb4cabe 2603 struct lxc_cgroup *cg;
ccb4cabe
SH
2604 bool ret = false;
2605
92ca7eb5
CB
2606 if (!ops)
2607 return ret_set_errno(false, ENOENT);
2608
2609 if (!conf)
2610 return ret_set_errno(false, EINVAL);
2611
2612 cgroup_settings = &conf->cgroup;
ccb4cabe
SH
2613 if (lxc_list_empty(cgroup_settings))
2614 return true;
2615
69b4a4bb 2616 if (!ops->hierarchies)
92ca7eb5 2617 return ret_set_errno(false, EINVAL);
69b4a4bb 2618
ccb4cabe 2619 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2620 if (!sorted_cgroup_settings)
ccb4cabe 2621 return false;
ccb4cabe 2622
ccb4cabe
SH
2623 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2624 cg = iterator->elem;
2625
2626 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2202afc9 2627 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
fc3b9533
CB
2628 if (do_devices && (errno == EACCES || errno == EPERM)) {
2629 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2630 continue;
2631 }
2632 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2633 goto out;
ccb4cabe 2634 }
77c3e9a2 2635 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
ccb4cabe 2636 }
ccb4cabe
SH
2637 }
2638
2639 ret = true;
6b38e644 2640 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2641out:
ccb4cabe
SH
2642 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2643 lxc_list_del(iterator);
2644 free(iterator);
2645 }
d97919ab 2646
ccb4cabe
SH
2647 return ret;
2648}
2649
bf651989
CB
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
/* Parse one devices key/value pair and queue it on @conf's device list for
 * later inclusion in the container's eBPF device program. A no-op (returning
 * 0) when the toolchain lacks BPF device cgroup support.
 *
 * Returns 0 on success, -1 on parse or list failure.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item device_item = {0};
	int ret;

	/* "devices.allow = /dev/foo rwm" carries a path; everything else is
	 * already in "type major:minor mode" form.
	 */
	if (strcmp("devices.allow", key) == 0 && *val == '/')
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);

	ret = bpf_list_add_device(conf, &device_item);
	if (ret < 0)
		return -1;
#endif
	return 0;
}
2675
c581d2a6
CB
2676__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2677 struct lxc_handler *handler)
6b38e644 2678{
7e31931f
CB
2679 struct lxc_list *cgroup_settings, *iterator;
2680 struct hierarchy *h;
2681 struct lxc_conf *conf;
6b38e644 2682
7e31931f
CB
2683 if (!ops)
2684 return ret_set_errno(false, ENOENT);
2685
2686 if (!ops->hierarchies)
6b38e644
CB
2687 return true;
2688
7e31931f
CB
2689 if (!ops->container_cgroup)
2690 return ret_set_errno(false, EINVAL);
2691
2692 if (!handler || !handler->conf)
2693 return ret_set_errno(false, EINVAL);
2694 conf = handler->conf;
2695
2696 if (lxc_list_empty(&conf->cgroup2))
2697 return true;
2698 cgroup_settings = &conf->cgroup2;
2699
2700 if (!ops->unified)
6b38e644 2701 return false;
7e31931f 2702 h = ops->unified;
6b38e644 2703
bf651989 2704 lxc_list_for_each (iterator, cgroup_settings) {
6b38e644 2705 struct lxc_cgroup *cg = iterator->elem;
c04a6d4e 2706 int ret;
6b38e644 2707
bf651989 2708 if (strncmp("devices", cg->subsystem, 7) == 0) {
4bfb655e 2709 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
bf651989
CB
2710 cg->value);
2711 } else {
c04a6d4e
CB
2712 ret = lxc_write_openat(h->container_full_path,
2713 cg->subsystem, cg->value,
2714 strlen(cg->value));
7e31931f 2715 if (ret < 0)
77c3e9a2 2716 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"",
7e31931f 2717 cg->subsystem, cg->value);
6b38e644
CB
2718 }
2719 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2720 }
2721
7e31931f 2722 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
6b38e644
CB
2723}
2724
bf651989
CB
/* Build the container's eBPF device-cgroup program from the rules collected
 * in @conf->devices, attach it to the unified cgroup, and swap it in as the
 * active program (the previous one is handed to the __do_bpf_program_free
 * cleanup). A no-op returning true when BPF device cgroup support is absent
 * or nothing needs to be enforced.
 *
 * Returns true on success, false with errno set on failure.
 */
__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
					  struct lxc_handler *handler)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	__do_bpf_program_free struct bpf_program *devices = NULL;
	int ret;
	struct lxc_conf *conf;
	struct hierarchy *unified;
	struct lxc_list *it;
	struct bpf_program *devices_old;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	unified = ops->unified;
	/* Nothing to do without a unified hierarchy that supports the bpf
	 * device controller or without any device rules.
	 */
	if (!unified || !unified->bpf_device_controller ||
	    !unified->container_full_path || lxc_list_empty(&conf->devices))
		return true;

	devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
	if (!devices)
		return log_error_errno(false, ENOMEM, "Failed to create new bpf program");

	ret = bpf_program_init(devices);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");

	/* Append every collected rule to the program. */
	lxc_list_for_each(it, &conf->devices) {
		struct device_item *cur = it->elem;

		ret = bpf_program_append_device(devices, cur);
		if (ret)
			return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
					       cur->type,
					       cur->major,
					       cur->minor,
					       cur->access,
					       cur->allow,
					       cur->global_rule);
		TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
		      cur->type,
		      cur->major,
		      cur->minor,
		      cur->access,
		      cur->allow,
		      cur->global_rule);
	}

	ret = bpf_program_finalize(devices);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");

	/* BPF_F_ALLOW_MULTI: allow additional programs on the same cgroup. */
	ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
					unified->container_full_path,
					BPF_F_ALLOW_MULTI);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to attach bpf program");

	/* Replace old bpf program. */
	devices_old = move_ptr(conf->cgroup2_devices);
	conf->cgroup2_devices = move_ptr(devices);
	/* Hand the old program to the cleanup attribute for freeing. */
	devices = move_ptr(devices_old);
#endif
	return true;
}
2800
/* Enable all detected controllers (as a "+memory +pids +cpu +io" style
 * string) in cgroup.subtree_control at every level from the container's base
 * path down to (but not including) the final component of @cgroup, so the
 * child cgroup can use them. Only meaningful on a pure unified layout.
 *
 * Returns true on success (or when there is nothing to delegate), false on
 * failure.
 */
bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_free char *add_controllers = NULL, *base_path = NULL;
	__do_free_string_list char **parts = NULL;
	struct hierarchy *unified = ops->unified;
	ssize_t parts_len;
	char **it;
	size_t full_len = 0;

	if (!ops->hierarchies || !pure_unified_layout(ops) ||
	    !unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		/* "+<name>" plus either a separating space or the NUL. */
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* must_realloc() does not zero: terminate the fresh buffer
		 * before the first strlcat().
		 */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	parts = lxc_string_split(cgroup, '/');
	if (!parts)
		return false;

	/* Skip the last component: subtree_control is written in each parent,
	 * never in the leaf itself.
	 */
	parts_len = lxc_array_len((void **)parts);
	if (parts_len > 0)
		parts_len--;

	base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
	/* i == -1 handles the base path itself before descending into parts. */
	for (ssize_t i = -1; i < parts_len; i++) {
		int ret;
		__do_free char *target = NULL;

		if (i >= 0)
			base_path = must_append_path(base_path, parts[i], NULL);
		target = must_make_path(base_path, "cgroup.subtree_control", NULL);
		ret = lxc_writeat(-1, target, add_controllers, full_len);
		if (ret < 0)
			return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
					       add_controllers, target);
		TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
	}

	return true;
}
2859
2860__cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2861{
61fbc369
CB
2862 if (!ops)
2863 return ret_set_errno(false, ENOENT);
2864
c581d2a6
CB
2865 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2866}
2867
2868__cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2869{
61fbc369
CB
2870 if (!ops)
2871 return ret_set_errno(false, ENOENT);
2872
c581d2a6 2873 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
2874}
2875
b7b18fc5
CB
2876static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2877 char **controllers)
2878{
b7b18fc5
CB
2879 if (!ops->cgroup_use)
2880 return true;
2881
431e2c54 2882 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
2883 bool found = false;
2884
431e2c54 2885 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
b7b18fc5
CB
2886 if (strcmp(*cur_use, *cur_ctrl) != 0)
2887 continue;
2888
2889 found = true;
2890 break;
2891 }
2892
2893 if (found)
2894 continue;
2895
2896 return false;
2897 }
2898
2899 return true;
2900}
2901
a6ca2ed8
CB
/* Build the list of cgroup2 files that must be chowned when delegating a
 * cgroup to an unprivileged user. The authoritative list is read from
 * /sys/kernel/cgroup/delegate; when that file is unreadable (e.g. older
 * kernels) a built-in fallback list is used instead.
 *
 * @delegate: in/out NULL-terminated string list the entries are appended to.
 */
static void cg_unified_delegate(char ***delegate)
{
	__do_free char *buf = NULL;
	/* Fallback used when the kernel does not expose the delegate file. */
	char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
	char *token;
	int idx;

	buf = read_file("/sys/kernel/cgroup/delegate");
	if (!buf) {
		for (char **p = standard; p && *p; p++) {
			idx = append_null_to_list((void ***)delegate);
			(*delegate)[idx] = must_copy_string(*p);
		}
		SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
		return;
	}

	/* Tokenize the whitespace-separated file contents. */
	lxc_iterate_parts (token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2, so it is handled separately and skipped here.
		 */
		if (strcmp(token, "cgroup.procs") == 0)
			continue;

		idx = append_null_to_list((void ***)delegate);
		(*delegate)[idx] = must_copy_string(token);
	}
}
2931
2202afc9
CB
/* At startup, parse_hierarchies finds all the info we need about cgroup
 * mountpoints and current cgroups, and stores it in @d.
 *
 * Scans /proc/self/mountinfo for cgroup (v1) and cgroup2 mounts, determines
 * the overall layout (legacy/unified/hybrid), and records every writable,
 * wanted hierarchy in @ops->hierarchies. Returns 0 on success, -1 (with
 * errno set) on failure.
 */
static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
{
	__do_free char *basecginfo = NULL, *line = NULL;
	__do_free_string_list char **klist = NULL, **nlist = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		basecginfo = read_file("/proc/1/cgroup");
	else
		basecginfo = read_file("/proc/self/cgroup");
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);

	while (getline(&line, &len, f) != -1) {
		__do_free char *base_cgroup = NULL, *mountpoint = NULL;
		__do_free_string_list char **controller_list = NULL;
		int type;
		bool writeable;
		struct hierarchy *new;

		/* 0 means this mountinfo line is not a cgroup mount. */
		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only one unified hierarchy can exist; skip duplicates. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		/* Layout state machine: seeing both v1 and v2 mounts makes
		 * this a hybrid layout.
		 */
		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list)) {
				TRACE("Skipping duplicating controller");
				continue;
			}

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint) {
			ERROR("Failed parsing mountpoint from \"%s\"", line);
			continue;
		}

		/* Find our current cgroup in this hierarchy. For v1 the
		 * lookup is keyed by the first controller of the mount.
		 */
		if (type == CGROUP_SUPER_MAGIC)
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		else
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		if (!base_cgroup) {
			ERROR("Failed to find current cgroup");
			continue;
		}

		trim(base_cgroup);
		prune_init_scope(base_cgroup);
		/* Only hierarchies we can actually write to are usable. */
		if (type == CGROUP2_SUPER_MAGIC)
			writeable = test_writeable_v2(mountpoint, base_cgroup);
		else
			writeable = test_writeable_v1(mountpoint, base_cgroup);
		if (!writeable) {
			TRACE("The %s group is not writeable", base_cgroup);
			continue;
		}

		/* For cgroup2 the controller list comes from the
		 * cgroup.controllers file, not the mountinfo line.
		 */
		if (type == CGROUP2_SUPER_MAGIC) {
			char *cgv2_ctrl_path;

			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
							"cgroup.controllers",
							NULL);

			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
			free(cgv2_ctrl_path);
			if (!controller_list) {
				controller_list = cg_unified_make_empty_controller();
				TRACE("No controllers are enabled for "
				      "delegation in the unified hierarchy");
			}
		}

		/* Exclude all controllers that cgroup use does not want. */
		if (!cgroup_use_wants_controllers(ops, controller_list)) {
			TRACE("Skipping controller");
			continue;
		}

		/* Ownership of the three strings/lists moves into the new
		 * hierarchy entry.
		 */
		new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
		if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
			if (unprivileged)
				cg_unified_delegate(&new->cgroup2_chown);
			ops->unified = new;
		}
	}

	TRACE("Writable cgroup hierarchies:");
	lxc_cgfsng_print_hierarchies(ops);

	/* verify that all controllers in cgroup.use and all crucial
	 * controllers are accounted for
	 */
	if (!all_controllers_found(ops))
		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");

	return 0;
}
3067
2202afc9 3068/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 3069static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 3070{
88396101 3071 __do_free char *basecginfo = NULL;
d7314671 3072 char *copy;
d97919ab 3073 char *base_cgroup;
2202afc9 3074
9caee129 3075 if (!relative && (geteuid() == 0))
2202afc9
CB
3076 basecginfo = read_file("/proc/1/cgroup");
3077 else
3078 basecginfo = read_file("/proc/self/cgroup");
3079 if (!basecginfo)
3080 return NULL;
3081
3082 base_cgroup = strstr(basecginfo, "0::/");
3083 if (!base_cgroup)
d7314671 3084 return NULL;
2202afc9
CB
3085
3086 base_cgroup = base_cgroup + 3;
3087 copy = copy_to_eol(base_cgroup);
3088 if (!copy)
d7314671 3089 return NULL;
2202afc9 3090
d7314671 3091 return trim(copy);
2202afc9
CB
3092}
3093
a6ca2ed8
CB
/* Initialize @ops for a pure cgroup2 (unified) layout.
 *
 * Returns CGROUP2_SUPER_MAGIC when a unified layout was detected and set up,
 * 0 when the system is not on a pure unified layout (caller falls back to
 * hybrid init), or a negative errno-style value on error.
 */
static int cg_unified_init(struct cgroup_ops *ops, bool relative,
			   bool unprivileged)
{
	__do_free char *subtree_path = NULL;
	int ret;
	char *mountpoint;
	char **delegatable;
	struct hierarchy *new;
	char *base_cgroup = NULL;

	ret = unified_cgroup_hierarchy();
	if (ret == -ENOMEDIUM)
		return ret_errno(ENOMEDIUM);

	/* Not a pure unified layout: let the hybrid path handle it. */
	if (ret != CGROUP2_SUPER_MAGIC)
		return 0;

	base_cgroup = cg_unified_get_current_cgroup(relative);
	if (!base_cgroup)
		return ret_errno(EINVAL);
	if (!relative)
		prune_init_scope(base_cgroup);

	/*
	 * We assume that the cgroup we're currently in has been delegated to
	 * us and we are free to further delegate all of the controllers listed
	 * in cgroup.controllers further down the hierarchy.
	 */
	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
	delegatable = cg_unified_get_controllers(subtree_path);
	if (!delegatable)
		delegatable = cg_unified_make_empty_controller();
	if (!delegatable[0])
		TRACE("No controllers are enabled for delegation");

	/* TODO: If the user requested specific controllers via lxc.cgroup.use
	 * we should verify here. The reason I'm not doing it right is that I'm
	 * not convinced that lxc.cgroup.use will be the future since it is a
	 * global property. I much rather have an option that lets you request
	 * controllers per container.
	 */

	/* Ownership of delegatable/mountpoint/base_cgroup moves into the
	 * hierarchy entry.
	 */
	new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
	if (unprivileged)
		cg_unified_delegate(&new->cgroup2_chown);

	/* Device control on cgroup2 is implemented via bpf programs. */
	if (bpf_devices_cgroup_supported())
		new->bpf_device_controller = 1;

	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
	ops->unified = new;

	return CGROUP2_SUPER_MAGIC;
}
3149
341e6516 3150static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
3151{
3152 int ret;
3153 const char *tmp;
9caee129 3154 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
3155
3156 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 3157 if (tmp) {
88396101 3158 __do_free char *pin = NULL;
d97919ab 3159 char *chop, *cur;
b7b18fc5
CB
3160
3161 pin = must_copy_string(tmp);
3162 chop = pin;
3163
d97919ab 3164 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 3165 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 3166 }
2202afc9 3167
a6ca2ed8 3168 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9 3169 if (ret < 0)
341e6516 3170 return -1;
2202afc9
CB
3171
3172 if (ret == CGROUP2_SUPER_MAGIC)
341e6516 3173 return 0;
2202afc9 3174
a6ca2ed8 3175 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
3176}
3177
341e6516 3178__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3179{
3180 const char *cgroup_pattern;
3181
341e6516
CB
3182 if (!ops)
3183 return ret_set_errno(-1, ENOENT);
3184
2202afc9
CB
3185 /* copy system-wide cgroup information */
3186 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
b3ed2061
CB
3187 if (cgroup_pattern && strcmp(cgroup_pattern, "") != 0)
3188 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2202afc9 3189
341e6516 3190 return 0;
2202afc9
CB
3191}
3192
5a087e05 3193struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 3194{
a64edc1c 3195 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9
CB
3196
3197 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3198 if (!cgfsng_ops)
341e6516 3199 return ret_set_errno(NULL, ENOMEM);
2202afc9
CB
3200
3201 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3202 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3203
341e6516 3204 if (cg_init(cgfsng_ops, conf))
2202afc9 3205 return NULL;
2202afc9
CB
3206
3207 cgfsng_ops->data_init = cgfsng_data_init;
434c8e15
CB
3208 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3209 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
72068e74 3210 cgfsng_ops->monitor_create = cgfsng_monitor_create;
eeef32bb 3211 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
c581d2a6
CB
3212 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3213 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
e8b181f5
CB
3214 cgfsng_ops->payload_create = cgfsng_payload_create;
3215 cgfsng_ops->payload_enter = cgfsng_payload_enter;
78eb6aa6 3216 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
2202afc9
CB
3217 cgfsng_ops->escape = cgfsng_escape;
3218 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3219 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3220 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3221 cgfsng_ops->get = cgfsng_get;
3222 cgfsng_ops->set = cgfsng_set;
942e193e 3223 cgfsng_ops->freeze = cgfsng_freeze;
2202afc9 3224 cgfsng_ops->unfreeze = cgfsng_unfreeze;
c581d2a6 3225 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
2202afc9
CB
3226 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3227 cgfsng_ops->driver = "cgfsng";
3228 cgfsng_ops->version = "1.0.0";
3229 cgfsng_ops->attach = cgfsng_attach;
3230 cgfsng_ops->chown = cgfsng_chown;
3231 cgfsng_ops->mount = cgfsng_mount;
bf651989 3232 cgfsng_ops->devices_activate = cgfsng_devices_activate;
2202afc9 3233
a64edc1c 3234 return move_ptr(cgfsng_ops);
2202afc9 3235}