]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
Merge pull request #3331 from brauner/2020-03-27/fixes
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
d38dd64a
CB
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
a54694f8
CB
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
d38dd64a
CB
22#include <linux/kdev_t.h>
23#include <linux/types.h>
942e193e
CB
24#include <poll.h>
25#include <signal.h>
a54694f8 26#include <stdint.h>
ccb4cabe
SH
27#include <stdio.h>
28#include <stdlib.h>
a54694f8 29#include <string.h>
438c4581 30#include <sys/types.h>
d38dd64a 31#include <unistd.h>
c8bf519d 32
b635e92d 33#include "caps.h"
ccb4cabe 34#include "cgroup.h"
bf651989 35#include "cgroup2_devices.h"
6328fd9c 36#include "cgroup_utils.h"
ccb4cabe 37#include "commands.h"
43654d34 38#include "conf.h"
d38dd64a 39#include "config.h"
a54694f8 40#include "log.h"
c19ad94b 41#include "macro.h"
018051e3 42#include "mainloop.h"
861cb8c2 43#include "memory_utils.h"
43654d34 44#include "storage/storage.h"
a54694f8 45#include "utils.h"
ccb4cabe 46
64e82f8b
DJ
47#ifndef HAVE_STRLCPY
48#include "include/strlcpy.h"
49#endif
50
3ebe2fbd
DJ
51#ifndef HAVE_STRLCAT
52#include "include/strlcat.h"
53#endif
54
ac2cecc4 55lxc_log_define(cgfsng, cgroup);
ccb4cabe 56
8b8db2f6
CB
57/* Given a pointer to a null-terminated array of pointers, realloc to add one
58 * entry, and point the new entry to NULL. Do not fail. Return the index to the
59 * second-to-last entry - that is, the one which is now available for use
60 * (keeping the list null-terminated).
ccb4cabe
SH
61 */
62static int append_null_to_list(void ***list)
63{
64 int newentry = 0;
65
66 if (*list)
8b8db2f6
CB
67 for (; (*list)[newentry]; newentry++)
68 ;
ccb4cabe
SH
69
70 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
71 (*list)[newentry + 1] = NULL;
72 return newentry;
73}
74
8073018d
CB
75/* Given a null-terminated array of strings, check whether @entry is one of the
76 * strings.
ccb4cabe
SH
77 */
78static bool string_in_list(char **list, const char *entry)
79{
ccb4cabe
SH
80 if (!list)
81 return false;
d6337a5f 82
77c3e9a2 83 for (int i = 0; list[i]; i++)
ccb4cabe
SH
84 if (strcmp(list[i], entry) == 0)
85 return true;
86
87 return false;
88}
89
ac010944
CB
90/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
91 * "name=systemd". Do not fail.
92 */
93static char *cg_legacy_must_prefix_named(char *entry)
94{
95 size_t len;
96 char *prefixed;
97
98 len = strlen(entry);
f25a2044 99 prefixed = must_realloc(NULL, len + 6);
ac010944 100
6333c915
CB
101 memcpy(prefixed, "name=", STRLITERALLEN("name="));
102 memcpy(prefixed + STRLITERALLEN("name="), entry, len);
ac010944 103 prefixed[len + 5] = '\0';
99bb3fa8 104
ac010944
CB
105 return prefixed;
106}
107
42a993b4
CB
108/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
109 * we are called.
ccb4cabe 110 *
42a993b4
CB
111 * We also handle named subsystems here. Any controller which is not a kernel
112 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
113 * we refuse to use because we're not sure which we have here.
114 * (TODO: We could work around this in some cases by just remounting to be
115 * unambiguous, or by comparing mountpoint contents with current cgroup.)
ccb4cabe
SH
116 *
117 * The last entry will always be NULL.
118 */
42a993b4
CB
119static void must_append_controller(char **klist, char **nlist, char ***clist,
120 char *entry)
ccb4cabe
SH
121{
122 int newentry;
123 char *copy;
124
125 if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
c2712f64 126 ERROR("Refusing to use ambiguous controller \"%s\"", entry);
ccb4cabe
SH
127 ERROR("It is both a named and kernel subsystem");
128 return;
129 }
130
131 newentry = append_null_to_list((void ***)clist);
132
133 if (strncmp(entry, "name=", 5) == 0)
134 copy = must_copy_string(entry);
135 else if (string_in_list(klist, entry))
136 copy = must_copy_string(entry);
137 else
7745483d 138 copy = cg_legacy_must_prefix_named(entry);
ccb4cabe
SH
139
140 (*clist)[newentry] = copy;
141}
142
5ae0207c
CB
143/* Given a handler's cgroup data, return the struct hierarchy for the controller
144 * @c, or NULL if there is none.
ccb4cabe 145 */
27a5132c 146struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
ccb4cabe 147{
77c3e9a2
CB
148 if (!ops->hierarchies)
149 return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");
d6337a5f 150
77c3e9a2 151 for (int i = 0; ops->hierarchies[i]; i++) {
27a5132c 152 if (!controller) {
d6337a5f 153 /* This is the empty unified hierarchy. */
2202afc9
CB
154 if (ops->hierarchies[i]->controllers &&
155 !ops->hierarchies[i]->controllers[0])
156 return ops->hierarchies[i];
106f1f38 157 continue;
2a63b5cb
CB
158 } else if (pure_unified_layout(ops) &&
159 strcmp(controller, "devices") == 0) {
160 if (ops->unified->bpf_device_controller)
161 return ops->unified;
162 break;
d6337a5f
CB
163 }
164
27a5132c 165 if (string_in_list(ops->hierarchies[i]->controllers, controller))
2202afc9 166 return ops->hierarchies[i];
ccb4cabe 167 }
d6337a5f 168
27a5132c
CB
169 if (controller)
170 WARN("There is no useable %s controller", controller);
171 else
172 WARN("There is no empty unified cgroup hierarchy");
173
77c3e9a2 174 return ret_set_errno(NULL, ENOENT);
ccb4cabe
SH
175}
176
a54694f8
CB
#define BATCH_SIZE 50
/* Grow *mem in BATCH_SIZE chunks so repeated appends don't reallocate on
 * every call; only reallocates when @newlen crosses into a new batch.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || want > have)
		*mem = must_realloc(*mem, want * BATCH_SIZE);
}
186
/* Append the @newlen bytes of @new, plus its NUL terminator, to *dest which
 * currently holds @oldlen bytes. Aborts on allocation failure.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t total = oldlen + newlen;

	batch_realloc(dest, oldlen, total + 1);
	memcpy(*dest + oldlen, new, newlen + 1);
}
195
196/* Slurp in a whole file */
d6337a5f 197static char *read_file(const char *fnam)
a54694f8 198{
77c3e9a2 199 __do_free char *buf = NULL, *line = NULL;
d97919ab 200 __do_fclose FILE *f = NULL;
d97919ab 201 size_t len = 0, fulllen = 0;
77c3e9a2 202 int linelen;
a54694f8 203
4110345b 204 f = fopen(fnam, "re");
a54694f8
CB
205 if (!f)
206 return NULL;
77c3e9a2 207
a54694f8
CB
208 while ((linelen = getline(&line, &len, f)) != -1) {
209 append_line(&buf, fulllen, line, linelen);
210 fulllen += linelen;
211 }
77c3e9a2
CB
212
213 return move_ptr(buf);
a54694f8
CB
214}
215
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in the bit array @bitarr.
 *
 * Use an unsigned constant for the mask: with bit % NBITS == 31, the old
 * (1 << 31) shifts into the sign bit of a 32-bit int, which is signed
 * overflow, i.e. undefined behavior (C11 6.5.7p4).
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
235
236/* Create cpumask from cpulist aka turn:
237 *
238 * 0,2-3
239 *
d5d468f6 240 * into bit array
a54694f8
CB
241 *
242 * 1 0 1 1
243 */
244static uint32_t *lxc_cpumask(char *buf, size_t nbits)
245{
77c3e9a2 246 __do_free uint32_t *bitarr = NULL;
a54694f8 247 char *token;
d5d468f6 248 size_t arrlen;
d5d468f6
CB
249
250 arrlen = BITS_TO_LONGS(nbits);
251 bitarr = calloc(arrlen, sizeof(uint32_t));
a54694f8 252 if (!bitarr)
c5b8049e 253 return ret_set_errno(NULL, ENOMEM);
a54694f8 254
0be0d78f 255 lxc_iterate_parts(token, buf, ",") {
a54694f8 256 errno = 0;
d5d468f6
CB
257 unsigned end, start;
258 char *range;
a54694f8 259
d5d468f6
CB
260 start = strtoul(token, NULL, 0);
261 end = start;
262 range = strchr(token, '-');
a54694f8
CB
263 if (range)
264 end = strtoul(range + 1, NULL, 0);
d5d468f6 265
c5b8049e
CB
266 if (!(start <= end))
267 return ret_set_errno(NULL, EINVAL);
a54694f8 268
c5b8049e
CB
269 if (end >= nbits)
270 return ret_set_errno(NULL, EINVAL);
a54694f8
CB
271
272 while (start <= end)
273 set_bit(start++, bitarr);
274 }
275
c5b8049e 276 return move_ptr(bitarr);
a54694f8
CB
277}
278
a54694f8
CB
279/* Turn cpumask into simple, comma-separated cpulist. */
280static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
281{
f761d24d 282 __do_free_string_list char **cpulist = NULL;
c19ad94b 283 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
77c3e9a2 284 int ret;
a54694f8 285
77c3e9a2 286 for (size_t i = 0; i <= nbits; i++) {
414c6719
CB
287 if (!is_set(i, bitarr))
288 continue;
289
979a0d93 290 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
f761d24d 291 if (ret < 0 || (size_t)ret >= sizeof(numstr))
414c6719 292 return NULL;
414c6719
CB
293
294 ret = lxc_append_string(&cpulist, numstr);
f761d24d 295 if (ret < 0)
c5b8049e 296 return ret_set_errno(NULL, ENOMEM);
a54694f8 297 }
414c6719
CB
298
299 if (!cpulist)
c5b8049e 300 return ret_set_errno(NULL, ENOMEM);
414c6719 301
f761d24d 302 return lxc_string_join(",", (const char **)cpulist, false);
a54694f8
CB
303}
304
/* Return the highest cpu number referenced in @cpulist — a comma-separated
 * list of cpu numbers and "a-b" ranges, e.g. "0,2-3" — or -1 on parse
 * failure. Relies on kernel-produced lists being ordered, so the maximum is
 * in the last entry.
 *
 * Rewritten to avoid relationally comparing pointers of which one may be NULL
 * (undefined behavior) and to drop the unreachable `!c1 && c2` branch of the
 * original chain; the selected pointer is identical in every case.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2;
	char *maxcpus = cpulist;
	size_t cpus = 0;

	/* Start of the last comma-separated entry... */
	c1 = strrchr(maxcpus, ',');
	if (c1)
		c1++;

	/* ...and the end of the last "a-b" range. */
	c2 = strrchr(maxcpus, '-');
	if (c2)
		c2++;

	if (!c1 && !c2)
		c1 = maxcpus;		/* single plain number */
	else if (!c1)
		c1 = c2;		/* only a range present */
	else if (c2 && c2 > c1)
		c1 = c2;		/* range end lies after last comma */

	errno = 0;
	cpus = strtoul(c1, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
335
6f9584d8 336#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
36f70181 337#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
c5b8049e
CB
338static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
339 char *child_cgroup, bool am_initialized)
a54694f8 340{
d97919ab 341 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
36f70181
CB
342 *offlinecpus = NULL, *posscpus = NULL;
343 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
344 *possmask = NULL;
a54694f8
CB
345 int ret;
346 ssize_t i;
36f70181 347 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
c5b8049e 348 bool flipped_bit = false;
a54694f8 349
c5b8049e 350 fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
a54694f8 351 posscpus = read_file(fpath);
c5b8049e
CB
352 if (!posscpus)
353 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
a54694f8
CB
354
355 /* Get maximum number of cpus found in possible cpuset. */
356 maxposs = get_max_cpus(posscpus);
92d5ea57 357 if (maxposs < 0 || maxposs >= INT_MAX - 1)
d97919ab 358 return false;
a54694f8 359
36f70181
CB
360 if (file_exists(__ISOL_CPUS)) {
361 isolcpus = read_file(__ISOL_CPUS);
c5b8049e
CB
362 if (!isolcpus)
363 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
6f9584d8 364
36f70181
CB
365 if (isdigit(isolcpus[0])) {
366 /* Get maximum number of cpus found in isolated cpuset. */
367 maxisol = get_max_cpus(isolcpus);
368 if (maxisol < 0 || maxisol >= INT_MAX - 1)
369 return false;
6f9584d8 370 }
36f70181
CB
371
372 if (maxposs < maxisol)
373 maxposs = maxisol;
374 maxposs++;
375 } else {
376 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
a54694f8
CB
377 }
378
36f70181
CB
379 if (file_exists(__OFFLINE_CPUS)) {
380 offlinecpus = read_file(__OFFLINE_CPUS);
c5b8049e
CB
381 if (!offlinecpus)
382 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
36f70181
CB
383
384 if (isdigit(offlinecpus[0])) {
385 /* Get maximum number of cpus found in offline cpuset. */
386 maxoffline = get_max_cpus(offlinecpus);
387 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
388 return false;
389 }
390
391 if (maxposs < maxoffline)
392 maxposs = maxoffline;
393 maxposs++;
394 } else {
395 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
396 }
a54694f8 397
dcd14a3d
CB
398 if ((maxisol == 0) && (maxoffline == 0)) {
399 cpulist = move_ptr(posscpus);
36f70181 400 goto copy_parent;
dcd14a3d 401 }
a54694f8
CB
402
403 possmask = lxc_cpumask(posscpus, maxposs);
c5b8049e
CB
404 if (!possmask)
405 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
a54694f8 406
36f70181
CB
407 if (maxisol > 0) {
408 isolmask = lxc_cpumask(isolcpus, maxposs);
c5b8049e
CB
409 if (!isolmask)
410 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
36f70181
CB
411 }
412
413 if (maxoffline > 0) {
414 offlinemask = lxc_cpumask(offlinecpus, maxposs);
c5b8049e
CB
415 if (!offlinemask)
416 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
6f9584d8 417 }
a54694f8
CB
418
419 for (i = 0; i <= maxposs; i++) {
36f70181
CB
420 if ((isolmask && !is_set(i, isolmask)) ||
421 (offlinemask && !is_set(i, offlinemask)) ||
422 !is_set(i, possmask))
59ac3b88
CB
423 continue;
424
425 flipped_bit = true;
426 clear_bit(i, possmask);
a54694f8
CB
427 }
428
6f9584d8 429 if (!flipped_bit) {
b31d62b8
CB
430 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
431 TRACE("No isolated or offline cpus present in cpuset");
432 } else {
433 cpulist = move_ptr(posscpus);
434 TRACE("Removed isolated or offline cpus from cpuset");
6f9584d8 435 }
c5b8049e
CB
436 if (!cpulist)
437 return log_error_errno(false, errno, "Failed to create cpu list");
a54694f8
CB
438
439copy_parent:
36f70181 440 if (!am_initialized) {
c5b8049e 441 ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
c04a6d4e
CB
442 if (ret < 0)
443 return log_error_errno(false,
444 errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
c5b8049e 445 child_cgroup);
36f70181
CB
446
447 TRACE("Copied cpu settings of parent cgroup");
6f9584d8
CB
448 }
449
d97919ab 450 return true;
a54694f8
CB
451}
452
e3a3fecf 453/* Copy contents of parent(@path)/@file to @path/@file */
c5b8049e
CB
454static bool copy_parent_file(const char *parent_cgroup,
455 const char *child_cgroup, const char *file)
e3a3fecf 456{
c5b8049e 457 __do_free char *parent_file = NULL, *value = NULL;
b095a8eb 458 int len = 0;
fe70edee 459 int ret;
e3a3fecf 460
c5b8049e
CB
461 parent_file = must_make_path(parent_cgroup, file, NULL);
462 len = lxc_read_from_file(parent_file, NULL, 0);
fe70edee 463 if (len <= 0)
77c3e9a2 464 return log_error_errno(false, errno, "Failed to determine buffer size");
b095a8eb 465
f25a2044 466 value = must_realloc(NULL, len + 1);
fe70edee 467 value[len] = '\0';
c5b8049e 468 ret = lxc_read_from_file(parent_file, value, len);
fe70edee 469 if (ret != len)
77c3e9a2 470 return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);
b095a8eb 471
c5b8049e 472 ret = lxc_write_openat(child_cgroup, file, value, len);
fe70edee 473 if (ret < 0 && errno != EACCES)
77c3e9a2 474 return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
c5b8049e 475 value, child_cgroup, file);
fe70edee 476 return true;
e3a3fecf
SH
477}
478
77c3e9a2 479static inline bool is_unified_hierarchy(const struct hierarchy *h)
c04a6d4e
CB
480{
481 return h->version == CGROUP2_SUPER_MAGIC;
482}
483
f990d3bf
CB
484/*
485 * Initialize the cpuset hierarchy in first directory of @cgroup_leaf and set
7793add3
CB
486 * cgroup.clone_children so that children inherit settings. Since the
487 * h->base_path is populated by init or ourselves, we know it is already
488 * initialized.
fe70edee
CB
489 *
490 * returns -1 on error, 0 when we didn't created a cgroup, 1 if we created a
491 * cgroup.
e3a3fecf 492 */
f990d3bf
CB
493static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
494 const char *cgroup_leaf)
e3a3fecf 495{
c5b8049e 496 __do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
f62cf1d4 497 __do_close int cgroup_fd = -EBADF;
c5b8049e 498 int fret = -1;
7793add3
CB
499 int ret;
500 char v;
f990d3bf 501 char *leaf, *slash;
e3a3fecf 502
c04a6d4e 503 if (is_unified_hierarchy(h))
fe70edee 504 return 0;
c04a6d4e 505
e3a3fecf 506 if (!string_in_list(h->controllers, "cpuset"))
fe70edee 507 return 0;
e3a3fecf 508
f990d3bf
CB
509 if (!cgroup_leaf)
510 return ret_set_errno(-1, EINVAL);
511
512 dup = strdup(cgroup_leaf);
513 if (!dup)
514 return ret_set_errno(-1, ENOMEM);
515
c5b8049e
CB
516 parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
517
518 leaf = dup;
f990d3bf
CB
519 leaf += strspn(leaf, "/");
520 slash = strchr(leaf, '/');
e3a3fecf
SH
521 if (slash)
522 *slash = '\0';
c5b8049e 523 child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
e3a3fecf
SH
524 if (slash)
525 *slash = '/';
7793add3 526
fe70edee 527 fret = 1;
c5b8049e 528 ret = mkdir(child_cgroup, 0755);
7793add3 529 if (ret < 0) {
fe70edee 530 if (errno != EEXIST)
c5b8049e 531 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);
fe70edee
CB
532
533 fret = 0;
e3a3fecf 534 }
6f9584d8 535
c5b8049e 536 cgroup_fd = lxc_open_dirfd(child_cgroup);
c04a6d4e 537 if (cgroup_fd < 0)
fe70edee 538 return -1;
7793add3 539
c04a6d4e 540 ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
fe70edee 541 if (ret < 0)
c5b8049e 542 return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);
e3a3fecf 543
a54694f8 544 /* Make sure any isolated cpus are removed from cpuset.cpus. */
c5b8049e 545 if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
fe70edee 546 return log_error_errno(-1, errno, "Failed to remove isolated cpus");
a54694f8 547
7793add3 548 /* Already set for us by someone else. */
b28c2810
CB
549 if (v == '1')
550 TRACE("\"cgroup.clone_children\" was already set to \"1\"");
e3a3fecf
SH
551
552 /* copy parent's settings */
c5b8049e 553 if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
fe70edee 554 return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");
e3a3fecf 555
fe70edee 556 /* Set clone_children so children inherit our settings */
c04a6d4e 557 ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
fe70edee 558 if (ret < 0)
c5b8049e 559 return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);
d97919ab 560
fe70edee 561 return fret;
e3a3fecf
SH
562}
563
/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (char **it = l1; *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
578
258449e5
CB
579/* For a null-terminated list of controllers @clist, return true if any of those
580 * controllers is already listed the null-terminated list of hierarchies @hlist.
581 * Realistically, if one is present, all must be present.
ccb4cabe
SH
582 */
583static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
584{
ccb4cabe
SH
585 if (!hlist)
586 return false;
258449e5 587
77c3e9a2 588 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
589 if (controller_lists_intersect(hlist[i]->controllers, clist))
590 return true;
ccb4cabe 591
258449e5 592 return false;
ccb4cabe
SH
593}
594
f57ac67f
CB
595/* Return true if the controller @entry is found in the null-terminated list of
596 * hierarchies @hlist.
ccb4cabe
SH
597 */
598static bool controller_found(struct hierarchy **hlist, char *entry)
599{
ccb4cabe
SH
600 if (!hlist)
601 return false;
602
77c3e9a2 603 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
604 if (string_in_list(hlist[i]->controllers, entry))
605 return true;
d6337a5f 606
ccb4cabe
SH
607 return false;
608}
609
e1c27ab0
CB
610/* Return true if all of the controllers which we require have been found. The
611 * required list is freezer and anything in lxc.cgroup.use.
ccb4cabe 612 */
2202afc9 613static bool all_controllers_found(struct cgroup_ops *ops)
ccb4cabe 614{
77c3e9a2 615 struct hierarchy **hlist;
ccb4cabe 616
2202afc9 617 if (!ops->cgroup_use)
ccb4cabe 618 return true;
c2712f64 619
77c3e9a2
CB
620 hlist = ops->hierarchies;
621 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
622 if (!controller_found(hlist, *cur))
623 return log_error(false, "No %s controller mountpoint found", *cur);
c2712f64 624
ccb4cabe
SH
625 return true;
626}
627
f205f10c
CB
628/* Get the controllers from a mountinfo line There are other ways we could get
629 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
630 * could parse the mount options. But we simply assume that the mountpoint must
631 * be /sys/fs/cgroup/controller-list
ccb4cabe 632 */
a3926f6a
CB
633static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
634 int type)
ccb4cabe 635{
f205f10c
CB
636 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
637 * for legacy hierarchies.
638 */
f761d24d 639 __do_free_string_list char **aret = NULL;
ccb4cabe 640 int i;
d97919ab 641 char *p2, *tok;
0be0d78f 642 char *p = line, *sep = ",";
6328fd9c 643
ccb4cabe 644 for (i = 0; i < 4; i++) {
235f1815 645 p = strchr(p, ' ');
ccb4cabe
SH
646 if (!p)
647 return NULL;
648 p++;
649 }
a55f31bd 650
f205f10c
CB
651 /* Note, if we change how mountinfo works, then our caller will need to
652 * verify /sys/fs/cgroup/ in this field.
653 */
77c3e9a2
CB
654 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
655 return log_error(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
d6337a5f 656
ccb4cabe 657 p += 15;
235f1815 658 p2 = strchr(p, ' ');
77c3e9a2
CB
659 if (!p2)
660 return log_error(NULL, "Corrupt mountinfo");
ccb4cabe 661 *p2 = '\0';
6328fd9c 662
d6337a5f 663 if (type == CGROUP_SUPER_MAGIC) {
88396101 664 __do_free char *dup = NULL;
d97919ab 665
0be0d78f
CB
666 /* strdup() here for v1 hierarchies. Otherwise
667 * lxc_iterate_parts() will destroy mountpoints such as
668 * "/sys/fs/cgroup/cpu,cpuacct".
d6337a5f 669 */
d97919ab 670 dup = must_copy_string(p);
d6337a5f
CB
671 if (!dup)
672 return NULL;
673
d97919ab 674 lxc_iterate_parts (tok, dup, sep)
d6337a5f 675 must_append_controller(klist, nlist, &aret, tok);
411ac6d8 676 }
d6337a5f 677 *p2 = ' ';
f205f10c 678
f761d24d 679 return move_ptr(aret);
d6337a5f 680}
411ac6d8 681
d6337a5f
CB
682static char **cg_unified_make_empty_controller(void)
683{
f761d24d 684 __do_free_string_list char **aret = NULL;
d6337a5f 685 int newentry;
d6337a5f
CB
686
687 newentry = append_null_to_list((void ***)&aret);
688 aret[newentry] = NULL;
f761d24d 689 return move_ptr(aret);
d6337a5f
CB
690}
691
692static char **cg_unified_get_controllers(const char *file)
693{
d97919ab 694 __do_free char *buf = NULL;
f761d24d 695 __do_free_string_list char **aret = NULL;
0be0d78f 696 char *sep = " \t\n";
2a63b5cb 697 char *tok;
d6337a5f
CB
698
699 buf = read_file(file);
700 if (!buf)
411ac6d8 701 return NULL;
6328fd9c 702
0be0d78f 703 lxc_iterate_parts(tok, buf, sep) {
d6337a5f
CB
704 int newentry;
705 char *copy;
706
707 newentry = append_null_to_list((void ***)&aret);
708 copy = must_copy_string(tok);
709 aret[newentry] = copy;
ccb4cabe
SH
710 }
711
f761d24d 712 return move_ptr(aret);
ccb4cabe
SH
713}
714
2202afc9 715static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
bb221ad1 716 char *container_base_path, int type)
ccb4cabe
SH
717{
718 struct hierarchy *new;
719 int newentry;
720
1973b62a 721 new = zalloc(sizeof(*new));
ccb4cabe
SH
722 new->controllers = clist;
723 new->mountpoint = mountpoint;
bb221ad1 724 new->container_base_path = container_base_path;
d6337a5f 725 new->version = type;
1973b62a
CB
726 new->cgfd_con = -EBADF;
727 new->cgfd_mon = -EBADF;
6328fd9c 728
2202afc9
CB
729 newentry = append_null_to_list((void ***)h);
730 (*h)[newentry] = new;
d6337a5f 731 return new;
ccb4cabe
SH
732}
733
798c3b33
CB
734/* Get a copy of the mountpoint from @line, which is a line from
735 * /proc/self/mountinfo.
ccb4cabe 736 */
a3926f6a 737static char *cg_hybrid_get_mountpoint(char *line)
ccb4cabe 738{
77c3e9a2 739 char *p = line, *sret = NULL;
ccb4cabe 740 size_t len;
798c3b33 741 char *p2;
ccb4cabe 742
77c3e9a2 743 for (int i = 0; i < 4; i++) {
235f1815 744 p = strchr(p, ' ');
ccb4cabe
SH
745 if (!p)
746 return NULL;
747 p++;
748 }
d6337a5f 749
dca9587a 750 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
d6337a5f
CB
751 return NULL;
752
753 p2 = strchr(p + 15, ' ');
754 if (!p2)
755 return NULL;
756 *p2 = '\0';
757
ccb4cabe 758 len = strlen(p);
f25a2044 759 sret = must_realloc(NULL, len + 1);
ccb4cabe
SH
760 memcpy(sret, p, len);
761 sret[len] = '\0';
77c3e9a2 762
ccb4cabe
SH
763 return sret;
764}
765
/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
	char *newline, *sret;
	size_t len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	len = newline - p;
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}
783
bced39de
CB
784/* cgline: pointer to character after the first ':' in a line in a \n-terminated
785 * /proc/self/cgroup file. Check whether controller c is present.
ccb4cabe
SH
786 */
787static bool controller_in_clist(char *cgline, char *c)
788{
d97919ab
CB
789 __do_free char *tmp = NULL;
790 char *tok, *eol;
ccb4cabe
SH
791 size_t len;
792
235f1815 793 eol = strchr(cgline, ':');
ccb4cabe
SH
794 if (!eol)
795 return false;
796
797 len = eol - cgline;
861cb8c2 798 tmp = must_realloc(NULL, len + 1);
ccb4cabe
SH
799 memcpy(tmp, cgline, len);
800 tmp[len] = '\0';
801
d97919ab
CB
802 lxc_iterate_parts(tok, tmp, ",")
803 if (strcmp(tok, c) == 0)
ccb4cabe 804 return true;
d6337a5f 805
ccb4cabe
SH
806 return false;
807}
808
c3ef912e
CB
809/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
810 * @controller.
ccb4cabe 811 */
c3ef912e
CB
812static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
813 int type)
ccb4cabe
SH
814{
815 char *p = basecginfo;
6328fd9c 816
d6337a5f
CB
817 for (;;) {
818 bool is_cgv2_base_cgroup = false;
819
6328fd9c 820 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
821 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
822 is_cgv2_base_cgroup = true;
ccb4cabe 823
235f1815 824 p = strchr(p, ':');
ccb4cabe
SH
825 if (!p)
826 return NULL;
827 p++;
d6337a5f
CB
828
829 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 830 p = strchr(p, ':');
ccb4cabe
SH
831 if (!p)
832 return NULL;
833 p++;
834 return copy_to_eol(p);
835 }
836
235f1815 837 p = strchr(p, '\n');
ccb4cabe
SH
838 if (!p)
839 return NULL;
840 p++;
841 }
842}
843
/* Append a copy of @entry to the NULL-terminated string list @list. Does not
 * fail (aborts on allocation failure).
 */
static void must_append_string(char ***list, char *entry)
{
	int slot;

	slot = append_null_to_list((void ***)list);
	(*list)[slot] = must_copy_string(entry);
}
853
d6337a5f 854static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 855{
d97919ab
CB
856 __do_free char *line = NULL;
857 __do_fclose FILE *f = NULL;
ccb4cabe
SH
858 size_t len = 0;
859
4110345b 860 f = fopen("/proc/self/cgroup", "re");
d6337a5f
CB
861 if (!f)
862 return -1;
863
ccb4cabe 864 while (getline(&line, &len, f) != -1) {
0be0d78f 865 char *p, *p2, *tok;
235f1815 866 p = strchr(line, ':');
ccb4cabe
SH
867 if (!p)
868 continue;
869 p++;
235f1815 870 p2 = strchr(p, ':');
ccb4cabe
SH
871 if (!p2)
872 continue;
873 *p2 = '\0';
ff8d6ee9 874
6328fd9c
CB
875 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
876 * contains an entry of the form:
ff8d6ee9
CB
877 *
878 * 0::/some/path
879 *
6328fd9c 880 * In this case we use "cgroup2" as controller name.
ff8d6ee9 881 */
6328fd9c
CB
882 if ((p2 - p) == 0) {
883 must_append_string(klist, "cgroup2");
ff8d6ee9 884 continue;
6328fd9c 885 }
ff8d6ee9 886
0be0d78f 887 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
888 if (strncmp(tok, "name=", 5) == 0)
889 must_append_string(nlist, tok);
890 else
891 must_append_string(klist, tok);
892 }
893 }
894
d6337a5f 895 return 0;
ccb4cabe
SH
896}
897
/* Strip trailing newlines from @s in place and return it. A string consisting
 * of a single character is left untouched (the `len > 1` guard).
 */
static char *trim(char *s)
{
	size_t len = strlen(s);

	while ((len > 1) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}
908
2202afc9 909static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
ccb4cabe
SH
910{
911 int i;
27d84737 912 struct hierarchy **it;
41c33dbe 913
2202afc9
CB
914 if (!ops->hierarchies) {
915 TRACE(" No hierarchies found");
ccb4cabe
SH
916 return;
917 }
27d84737 918
2202afc9
CB
919 TRACE(" Hierarchies:");
920 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
ccb4cabe 921 int j;
27d84737
CB
922 char **cit;
923
bb221ad1 924 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
2202afc9
CB
925 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
926 TRACE(" controllers:");
a7b0cc4c 927 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
2202afc9 928 TRACE(" %d: %s", j, *cit);
ccb4cabe
SH
929 }
930}
41c33dbe 931
/* Dump the raw /proc/self/cgroup contents plus the parsed kernel (@klist) and
 * named (@nlist) subsystem lists to the TRACE log.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	char **it;
	int k;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}
ccb4cabe 947
2202afc9
CB
948static int cgroup_rmdir(struct hierarchy **hierarchies,
949 const char *container_cgroup)
c71d83e1 950{
2202afc9
CB
951 if (!container_cgroup || !hierarchies)
952 return 0;
d6337a5f 953
8e64b673 954 for (int i = 0; hierarchies[i]; i++) {
2202afc9 955 struct hierarchy *h = hierarchies[i];
77c3e9a2 956 int ret;
d6337a5f 957
eb697136 958 if (!h->container_full_path)
2202afc9
CB
959 continue;
960
eb697136 961 ret = recursive_destroy(h->container_full_path);
2202afc9 962 if (ret < 0)
eb697136 963 WARN("Failed to destroy \"%s\"", h->container_full_path);
2202afc9 964
77c3e9a2 965 free_disarm(h->container_full_path);
2202afc9 966 }
d6337a5f 967
c71d83e1 968 return 0;
d6337a5f
CB
969}
970
2202afc9
CB
971struct generic_userns_exec_data {
972 struct hierarchy **hierarchies;
973 const char *container_cgroup;
974 struct lxc_conf *conf;
975 uid_t origuid; /* target uid in parent namespace */
976 char *path;
977};
d6337a5f 978
2202afc9
CB
/* userns_exec_1() callback: switch to the container's root credentials
 * inside the user namespace, then remove the container cgroups.
 *
 * If a uid/gid mapping exists, in-namespace root (0) is used; otherwise
 * fall back to the configured init uid/gid. Returns 0 on success, -1 with
 * errno set on failure.
 */
static int cgroup_rmdir_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	/* Drop supplementary groups first; EPERM is tolerated (presumably
	 * for unprivileged setups — NOTE(review): confirm lxc_setgroups()
	 * semantics). */
	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* gid before uid: once the uid is dropped we may no longer have the
	 * privilege to change groups. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
}
1001
434c8e15
CB
/* Tear down the payload (container) cgroups in all hierarchies.
 *
 * Detaches any cgroup2 device bpf program first, then removes the cgroup
 * directories — inside the container's user namespace when an id mapping
 * is configured, directly otherwise. Failures are logged, never fatal.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	/* The device controller bpf program must be detached before the
	 * cgroup directory can be removed. */
	ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");
#endif

	if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) {
		/* Privileges over the cgroup files may only exist inside the
		 * container's user namespace, so remove them from there. */
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.container_cgroup = ops->container_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	} else {
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
1046
434c8e15
CB
/* Tear down the monitor cgroups in all hierarchies.
 *
 * For each hierarchy the monitor process is first moved into a pivot
 * cgroup (CGROUP_PIVOT, optionally under lxc.cgroup.dir) so that its own
 * cgroup becomes empty and can be recursively destroyed. If creating or
 * entering the pivot cgroup fails we still attempt the destroy; all
 * failures are logged, never fatal.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (!h->monitor_full_path)
			continue;

		/* Honor a configured lxc.cgroup.dir when locating the pivot
		 * cgroup. */
		if (conf && conf->cgroup_meta.dir)
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    conf->cgroup_meta.dir,
						    CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    CGROUP_PIVOT, NULL);

		ret = mkdir_p(pivot_path, 0755);
		if (ret < 0 && errno != EEXIST) {
			ERROR("Failed to create %s", pivot_path);
			/* Best effort: try to destroy anyway. */
			goto try_recursive_destroy;
		}

		/* Evacuate the monitor so its cgroup is empty and removable. */
		ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

try_recursive_destroy:
		ret = recursive_destroy(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}
1113
6099dd5a
CB
/* mkdir -p like helper: create @dir and all missing parents with @mode.
 *
 * EEXIST is ignored for every intermediate component but reported as an
 * error for the final component (orig_len == cur_len), so the caller can
 * detect that the target cgroup already existed.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int mkdir_eexist_on_last(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	size_t orig_len;

	orig_len = strlen(dir);
	do {
		__do_free char *makeme = NULL;
		int ret;
		size_t cur_len;

		/* Advance over one run of '/' separators, then over one path
		 * component; dir points at the start of the component, tmp at
		 * its end. On the final pass (trailing component consumed)
		 * dir == tmp == end of string, making cur_len == orig_len. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		/* Prefix up to (but excluding) the current component. */
		cur_len = dir - orig;
		makeme = strndup(orig, cur_len);
		if (!makeme)
			return ret_set_errno(-1, ENOMEM);

		ret = mkdir(makeme, mode);
		if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
	} while (tmp != dir);

	return 0;
}
1141
fe70edee 1142static bool create_cgroup_tree(struct hierarchy *h, const char *cgroup_tree,
f990d3bf 1143 const char *cgroup_leaf, bool payload)
72068e74 1144{
fe70edee
CB
1145 __do_free char *path = NULL;
1146 int ret, ret_cpuset;
72068e74 1147
fe70edee
CB
1148 path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
1149 if (dir_exists(path))
1150 return log_warn_errno(false, errno, "The %s cgroup already existed", path);
72068e74 1151
fe70edee
CB
1152 ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
1153 if (ret_cpuset < 0)
1154 return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");
0c3deb94 1155
fe70edee 1156 ret = mkdir_eexist_on_last(path, 0755);
6099dd5a 1157 if (ret < 0) {
fe70edee
CB
1158 /*
1159 * This is the cpuset controller and
1160 * cg_legacy_handle_cpuset_hierarchy() has created our target
1161 * directory for us to ensure correct initialization.
1162 */
1163 if (ret_cpuset != 1 || cgroup_tree)
1164 return log_error_errno(false, errno, "Failed to create %s cgroup", path);
6f9584d8 1165 }
0c3deb94 1166
1973b62a
CB
1167 if (payload) {
1168 h->cgfd_con = lxc_open_dirfd(path);
1169 if (h->cgfd_con < 0)
1170 return log_error_errno(false, errno, "Failed to open %s", path);
fe70edee 1171 h->container_full_path = move_ptr(path);
1973b62a
CB
1172 } else {
1173 h->cgfd_mon = lxc_open_dirfd(path);
1174 if (h->cgfd_mon < 0)
1175 return log_error_errno(false, errno, "Failed to open %s", path);
fe70edee 1176 h->monitor_full_path = move_ptr(path);
1973b62a 1177 }
fe70edee 1178
c581d2a6 1179 return true;
ccb4cabe
SH
1180}
1181
fe70edee 1182static void cgroup_remove_leaf(struct hierarchy *h, bool payload)
ccb4cabe 1183{
fe70edee 1184 __do_free char *full_path = NULL;
72068e74 1185
1973b62a 1186 if (payload) {
f62cf1d4 1187 __lxc_unused __do_close int fd = move_fd(h->cgfd_con);
d6bdd182 1188 full_path = move_ptr(h->container_full_path);
1973b62a 1189 } else {
f62cf1d4 1190 __lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
d6bdd182 1191 full_path = move_ptr(h->monitor_full_path);
1973b62a 1192 }
e56639fb 1193
d6bdd182 1194 if (full_path && rmdir(full_path))
fe70edee 1195 SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
72068e74
CB
1196}
1197
b857f4be 1198__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
f2668eea 1199 struct lxc_handler *handler)
72068e74 1200{
b3ed2061 1201 __do_free char *monitor_cgroup = NULL, *__cgroup_tree = NULL;
fe70edee
CB
1202 const char *cgroup_tree;
1203 int idx = 0;
1204 int i;
5ce03bc0 1205 size_t len;
fe70edee 1206 char *suffix;
0d66e29a 1207 struct lxc_conf *conf;
72068e74 1208
0d66e29a
CB
1209 if (!ops)
1210 return ret_set_errno(false, ENOENT);
e56639fb 1211
69b4a4bb
CB
1212 if (!ops->hierarchies)
1213 return true;
1214
0d66e29a
CB
1215 if (ops->monitor_cgroup)
1216 return ret_set_errno(false, EEXIST);
1217
1218 if (!handler || !handler->conf)
1219 return ret_set_errno(false, EINVAL);
1220
1221 conf = handler->conf;
1222
b3ed2061
CB
1223 if (conf->cgroup_meta.dir) {
1224 cgroup_tree = conf->cgroup_meta.dir;
fe70edee
CB
1225 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1226 DEFAULT_MONITOR_CGROUP_PREFIX,
1227 handler->name,
1228 CGROUP_CREATE_RETRY, NULL);
b3ed2061
CB
1229 } else if (ops->cgroup_pattern) {
1230 __cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
d6bdd182
CB
1231 if (!__cgroup_tree)
1232 return ret_set_errno(false, ENOMEM);
1233
b3ed2061 1234 cgroup_tree = __cgroup_tree;
d6bdd182
CB
1235 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1236 DEFAULT_MONITOR_CGROUP,
b3ed2061
CB
1237 CGROUP_CREATE_RETRY, NULL);
1238 } else {
d6bdd182 1239 cgroup_tree = NULL;
fe70edee
CB
1240 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1241 handler->name,
1242 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1243 }
fe70edee 1244 if (!monitor_cgroup)
0d66e29a 1245 return ret_set_errno(false, ENOMEM);
72068e74 1246
fe70edee
CB
1247 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1248 *suffix = '\0';
5ce03bc0 1249 do {
0d66e29a 1250 if (idx)
fe70edee 1251 sprintf(suffix, "-%d", idx);
72068e74 1252
ebc10afe 1253 for (i = 0; ops->hierarchies[i]; i++) {
fe70edee
CB
1254 if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
1255 continue;
1256
1257 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
1258 for (int j = 0; j < i; j++)
1259 cgroup_remove_leaf(ops->hierarchies[j], false);
1260
1261 idx++;
1262 break;
5ce03bc0 1263 }
ebc10afe 1264 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
5ce03bc0 1265
d97919ab 1266 if (idx == 1000)
0d66e29a 1267 return ret_set_errno(false, ERANGE);
72068e74 1268
c581d2a6 1269 ops->monitor_cgroup = move_ptr(monitor_cgroup);
6e8703a4 1270 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
ccb4cabe
SH
1271}
1272
fe70edee
CB
1273/*
1274 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
cecad0c1 1275 * next cgroup_pattern-1, -2, ..., -999.
ccb4cabe 1276 */
b857f4be 1277__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
f3839f12 1278 struct lxc_handler *handler)
ccb4cabe 1279{
b3ed2061 1280 __do_free char *container_cgroup = NULL, *__cgroup_tree = NULL;
fe70edee 1281 const char *cgroup_tree;
f3839f12 1282 int idx = 0;
fe70edee 1283 int i;
ccb4cabe 1284 size_t len;
fe70edee 1285 char *suffix;
f3839f12 1286 struct lxc_conf *conf;
43654d34 1287
f3839f12
CB
1288 if (!ops)
1289 return ret_set_errno(false, ENOENT);
ccb4cabe 1290
69b4a4bb
CB
1291 if (!ops->hierarchies)
1292 return true;
1293
f3839f12
CB
1294 if (ops->container_cgroup)
1295 return ret_set_errno(false, EEXIST);
1296
1297 if (!handler || !handler->conf)
1298 return ret_set_errno(false, EINVAL);
1299
1300 conf = handler->conf;
1301
b3ed2061
CB
1302 if (conf->cgroup_meta.dir) {
1303 cgroup_tree = conf->cgroup_meta.dir;
fe70edee
CB
1304 container_cgroup = must_concat(&len, cgroup_tree, "/",
1305 DEFAULT_PAYLOAD_CGROUP_PREFIX,
1306 handler->name,
1307 CGROUP_CREATE_RETRY, NULL);
b3ed2061
CB
1308 } else if (ops->cgroup_pattern) {
1309 __cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
d6bdd182
CB
1310 if (!__cgroup_tree)
1311 return ret_set_errno(false, ENOMEM);
1312
b3ed2061 1313 cgroup_tree = __cgroup_tree;
d6bdd182
CB
1314 container_cgroup = must_concat(&len, cgroup_tree, "/",
1315 DEFAULT_PAYLOAD_CGROUP,
b3ed2061
CB
1316 CGROUP_CREATE_RETRY, NULL);
1317 } else {
d6bdd182 1318 cgroup_tree = NULL;
fe70edee
CB
1319 container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
1320 handler->name,
1321 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1322 }
fe70edee
CB
1323 if (!container_cgroup)
1324 return ret_set_errno(false, ENOMEM);
ccb4cabe 1325
fe70edee
CB
1326 suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1327 *suffix = '\0';
d97919ab 1328 do {
f3839f12 1329 if (idx)
fe70edee 1330 sprintf(suffix, "-%d", idx);
bb30b52a 1331
d97919ab 1332 for (i = 0; ops->hierarchies[i]; i++) {
fe70edee
CB
1333 if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
1334 continue;
1335
1336 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
1337 for (int j = 0; j < i; j++)
1338 cgroup_remove_leaf(ops->hierarchies[j], true);
1339
1340 idx++;
1341 break;
66b66624 1342 }
d97919ab 1343 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
cecad0c1 1344
d97919ab 1345 if (idx == 1000)
f3839f12 1346 return ret_set_errno(false, ERANGE);
cecad0c1 1347
fe70edee
CB
1348 ops->container_cgroup = move_ptr(container_cgroup);
1349 INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
ccb4cabe 1350 return true;
ccb4cabe
SH
1351}
1352
c581d2a6
CB
1353__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1354 struct lxc_handler *handler)
ccb4cabe 1355{
c581d2a6
CB
1356 int monitor_len, transient_len;
1357 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1358 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1359
797fa65e
CB
1360 if (!ops)
1361 return ret_set_errno(false, ENOENT);
1362
69b4a4bb
CB
1363 if (!ops->hierarchies)
1364 return true;
1365
797fa65e
CB
1366 if (!ops->monitor_cgroup)
1367 return ret_set_errno(false, ENOENT);
1368
1369 if (!handler || !handler->conf)
1370 return ret_set_errno(false, EINVAL);
1371
c581d2a6
CB
1372 monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1373 if (handler->transient_pid > 0)
1973b62a 1374 transient_len = snprintf(transient, sizeof(transient), "%d", handler->transient_pid);
ccb4cabe 1375
eeef32bb 1376 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1377 struct hierarchy *h = ops->hierarchies[i];
c581d2a6 1378 int ret;
08768001 1379
1973b62a
CB
1380 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
1381 if (ret)
1382 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
c581d2a6
CB
1383
1384 if (handler->transient_pid < 0)
1385 return true;
1386
1973b62a
CB
1387 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
1388 if (ret)
1389 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
1390
1391 /*
78eb6aa6 1392 * we don't keep the fds for non-unified hierarchies around
1973b62a 1393 * mainly because we don't make use of them anymore after the
78eb6aa6 1394 * core cgroup setup is done but also because there are quite a
1973b62a
CB
1395 * lot of them.
1396 */
1397 if (!is_unified_hierarchy(h))
1398 close_prot_errno_disarm(h->cgfd_mon);
ccb4cabe 1399 }
c581d2a6 1400 handler->transient_pid = -1;
ccb4cabe
SH
1401
1402 return true;
1403}
1404
c581d2a6
CB
1405__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1406 struct lxc_handler *handler)
eeef32bb 1407{
c581d2a6
CB
1408 int len;
1409 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
eeef32bb 1410
4490328e
CB
1411 if (!ops)
1412 return ret_set_errno(false, ENOENT);
1413
c581d2a6
CB
1414 if (!ops->hierarchies)
1415 return true;
1416
4490328e
CB
1417 if (!ops->container_cgroup)
1418 return ret_set_errno(false, ENOENT);
1419
1420 if (!handler || !handler->conf)
1421 return ret_set_errno(false, EINVAL);
1422
c581d2a6
CB
1423 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1424
1425 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1426 struct hierarchy *h = ops->hierarchies[i];
c581d2a6
CB
1427 int ret;
1428
1973b62a 1429 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
c581d2a6 1430 if (ret != 0)
1973b62a 1431 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
c581d2a6
CB
1432 }
1433
1434 return true;
eeef32bb
CB
1435}
1436
1973b62a
CB
1437static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1438 gid_t chown_gid, mode_t chmod_mode)
6efacf80
CB
1439{
1440 int ret;
1441
1973b62a
CB
1442 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1443 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1444 if (ret < 0)
1445 return log_warn_errno(-1,
1446 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1447 dirfd, path, (int)chown_uid,
1448 (int)chown_gid);
6efacf80 1449
1973b62a
CB
1450 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1451 if (ret < 0)
1452 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1453 dirfd, path, (int)chmod_mode);
6efacf80
CB
1454
1455 return 0;
1456}
1457
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs as a userns_exec_1() callback: switches to in-namespace root (or
 * the configured init uid/gid when no mapping exists) before touching the
 * files. Always returns 0; individual chown/chmod failures are only
 * warned about.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* Drop supplementary groups; EPERM is tolerated (presumably for
	 * unprivileged setups — NOTE(review): confirm lxc_setgroups()
	 * semantics). */
	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* gid before uid: after dropping the uid we may lack the privilege
	 * to change groups. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the parent-namespace owner uid into this user namespace. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->cgfd_con;

		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		/* cgroup2 delegation files the container must be able to
		 * write (e.g. cgroup.subtree_control — see cgroup2_chown). */
		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1517
b857f4be 1518__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
c98bbf71 1519 struct lxc_conf *conf)
ccb4cabe 1520{
4160c3a0 1521 struct generic_userns_exec_data wrap;
ccb4cabe 1522
c98bbf71
CB
1523 if (!ops)
1524 return ret_set_errno(false, ENOENT);
ccb4cabe 1525
69b4a4bb
CB
1526 if (!ops->hierarchies)
1527 return true;
1528
c98bbf71
CB
1529 if (!ops->container_cgroup)
1530 return ret_set_errno(false, ENOENT);
1531
1532 if (!conf)
1533 return ret_set_errno(false, EINVAL);
1534
1535 if (lxc_list_empty(&conf->id_map))
1536 return true;
1537
ccb4cabe 1538 wrap.origuid = geteuid();
4160c3a0 1539 wrap.path = NULL;
2202afc9 1540 wrap.hierarchies = ops->hierarchies;
4160c3a0 1541 wrap.conf = conf;
ccb4cabe 1542
c98bbf71
CB
1543 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1544 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1545
1546 return true;
1547}
1548
78eb6aa6
CB
1549__cgfsng_ops void cgfsng_payload_finalize(struct cgroup_ops *ops)
1550{
1551 if (!ops)
1552 return;
1553
1554 if (!ops->hierarchies)
1555 return;
1556
1557 for (int i = 0; ops->hierarchies[i]; i++) {
1558 struct hierarchy *h = ops->hierarchies[i];
1559 /*
1560 * we don't keep the fds for non-unified hierarchies around
1561 * mainly because we don't make use of them anymore after the
1562 * core cgroup setup is done but also because there are quite a
1563 * lot of them.
1564 */
1565 if (!is_unified_hierarchy(h))
1566 close_prot_errno_disarm(h->cgfd_con);
1567 }
1568}
1569
8aa1044f 1570/* cgroup-full:* is done, no need to create subdirs */
77c3e9a2 1571static inline bool cg_mount_needs_subdirs(int type)
8aa1044f 1572{
77c3e9a2 1573 return !(type >= LXC_AUTO_CGROUP_FULL_RO);
8aa1044f
SH
1574}
1575
886cac86
CB
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		/* MS_RDONLY on a bind mount only takes effect through a
		 * second MS_REMOUNT pass, hence bind first, remount ro after. */
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind the container's own cgroup directory over the target path so
	 * the container sees (and for MIXED may write) only its subtree. */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		/* Same two-step dance: the ro flag needs an MS_REMOUNT pass. */
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1629
6812d833
CB
1630/* __cg_mount_direct
1631 *
1632 * Mount cgroup hierarchies directly without using bind-mounts. The main
1633 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1634 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1635 */
1636static int __cg_mount_direct(int type, struct hierarchy *h,
1637 const char *controllerpath)
b635e92d 1638{
d97919ab 1639 __do_free char *controllers = NULL;
a760603e
CB
1640 char *fstype = "cgroup2";
1641 unsigned long flags = 0;
f6b54668 1642 int ret;
b635e92d 1643
a760603e
CB
1644 flags |= MS_NOSUID;
1645 flags |= MS_NOEXEC;
1646 flags |= MS_NODEV;
1647 flags |= MS_RELATIME;
1648
1649 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1650 flags |= MS_RDONLY;
1651
d6337a5f 1652 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1653 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1654 if (!controllers)
1655 return -ENOMEM;
1656 fstype = "cgroup";
b635e92d
CB
1657 }
1658
a760603e 1659 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
77c3e9a2
CB
1660 if (ret < 0)
1661 return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s",
1662 controllerpath, fstype);
b635e92d 1663
6812d833 1664 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1665 return 0;
1666}
1667
6812d833
CB
/* Direct-mount a hierarchy for a container running in a cgroup namespace
 * (thin, intention-revealing alias for __cg_mount_direct()). */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}
1673
1674static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1675 const char *controllerpath)
1676{
1677 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1678 return 0;
1679
1680 return __cg_mount_direct(type, h, controllerpath);
1681}
1682
b857f4be 1683__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
8d661d38
CB
1684 struct lxc_handler *handler,
1685 const char *root, int type)
ccb4cabe 1686{
6607d6e9 1687 __do_free char *cgroup_root = NULL;
d7314671 1688 bool has_cgns = false, wants_force_mount = false;
dfa835ac 1689 int ret;
8aa1044f 1690
9585ccb3
CB
1691 if (!ops)
1692 return ret_set_errno(false, ENOENT);
1693
69b4a4bb
CB
1694 if (!ops->hierarchies)
1695 return true;
1696
9585ccb3
CB
1697 if (!handler || !handler->conf)
1698 return ret_set_errno(false, EINVAL);
1699
8aa1044f
SH
1700 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1701 return true;
1702
3f69fb12
SY
1703 if (type & LXC_AUTO_CGROUP_FORCE) {
1704 type &= ~LXC_AUTO_CGROUP_FORCE;
1705 wants_force_mount = true;
1706 }
b635e92d 1707
3f69fb12
SY
1708 if (!wants_force_mount){
1709 if (!lxc_list_empty(&handler->conf->keepcaps))
1710 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1711 else
1712 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1713 }
8aa1044f 1714
3f69fb12
SY
1715 has_cgns = cgns_supported();
1716 if (has_cgns && !wants_force_mount)
1717 return true;
8aa1044f
SH
1718
1719 if (type == LXC_AUTO_CGROUP_NOSPEC)
1720 type = LXC_AUTO_CGROUP_MIXED;
1721 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1722 type = LXC_AUTO_CGROUP_FULL_MIXED;
1723
dca9587a 1724 cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
8d661d38 1725 if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
8d661d38 1726 if (has_cgns && wants_force_mount) {
d7314671
CB
1727 /*
1728 * If cgroup namespaces are supported but the container
8d661d38
CB
1729 * will not have CAP_SYS_ADMIN after it has started we
1730 * need to mount the cgroups manually.
1731 */
d7314671 1732 return cg_mount_in_cgroup_namespace(type, ops->unified, cgroup_root) == 0;
8d661d38
CB
1733 }
1734
6607d6e9 1735 return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
8d661d38
CB
1736 }
1737
1738 /* mount tmpfs */
6607d6e9 1739 ret = safe_mount(NULL, cgroup_root, "tmpfs",
3f69fb12
SY
1740 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1741 "size=10240k,mode=755", root);
1742 if (ret < 0)
d7314671 1743 return false;
8aa1044f 1744
dfa835ac 1745 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1746 __do_free char *controllerpath = NULL, *path2 = NULL;
2202afc9 1747 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1748 char *controller = strrchr(h->mountpoint, '/');
8aa1044f
SH
1749
1750 if (!controller)
1751 continue;
1752 controller++;
affd10fa 1753
6607d6e9 1754 controllerpath = must_make_path(cgroup_root, controller, NULL);
d97919ab 1755 if (dir_exists(controllerpath))
8aa1044f 1756 continue;
affd10fa 1757
3f69fb12 1758 ret = mkdir(controllerpath, 0755);
d7314671
CB
1759 if (ret < 0)
1760 return log_error_errno(false, errno, "Error creating cgroup path: %s", controllerpath);
b635e92d 1761
3f69fb12 1762 if (has_cgns && wants_force_mount) {
b635e92d
CB
1763 /* If cgroup namespaces are supported but the container
1764 * will not have CAP_SYS_ADMIN after it has started we
1765 * need to mount the cgroups manually.
1766 */
3f69fb12 1767 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
3f69fb12 1768 if (ret < 0)
d7314671 1769 return false;
3f69fb12 1770
b635e92d
CB
1771 continue;
1772 }
1773
6812d833 1774 ret = cg_mount_cgroup_full(type, h, controllerpath);
d97919ab 1775 if (ret < 0)
d7314671 1776 return false;
3f69fb12 1777
d97919ab 1778 if (!cg_mount_needs_subdirs(type))
8aa1044f 1779 continue;
3f69fb12 1780
bb221ad1 1781 path2 = must_make_path(controllerpath, h->container_base_path,
2202afc9 1782 ops->container_cgroup, NULL);
3f69fb12 1783 ret = mkdir_p(path2, 0755);
d97919ab 1784 if (ret < 0)
d7314671 1785 return false;
2f62fb00 1786
6812d833 1787 ret = cg_legacy_mount_controllers(type, h, controllerpath,
2202afc9 1788 path2, ops->container_cgroup);
3f69fb12 1789 if (ret < 0)
d7314671 1790 return false;
8aa1044f 1791 }
8aa1044f 1792
d7314671 1793 return true;
ccb4cabe
SH
1794}
1795
/* Only root needs to escape to the cgroup of its init. */
/* Move the calling process into the base cgroup ("cgroup.procs" of the
 * container_base_path) of every hierarchy, so subsequent cgroup creation
 * happens relative to that base. No-op for unprivileged callers or when
 * lxc.cgroup.relative is set. Returns true on success, false with errno
 * set on failure.
 */
__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
				       struct lxc_conf *conf)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (conf->cgroup_meta.relative || geteuid())
		return true;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		int ret;

		fullpath =
			must_make_path(ops->hierarchies[i]->mountpoint,
				       ops->hierarchies[i]->container_base_path,
				       "cgroup.procs", NULL);
		/* Writing "0" moves the current process. */
		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
	}

	return true;
}
1827
b857f4be 1828__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
36662416 1829{
69b4a4bb
CB
1830 int i = 0;
1831
e3ffb28b
CB
1832 if (!ops)
1833 return ret_set_errno(-1, ENOENT);
1834
69b4a4bb
CB
1835 if (!ops->hierarchies)
1836 return 0;
36662416 1837
69b4a4bb 1838 for (; ops->hierarchies[i]; i++)
36662416
TA
1839 ;
1840
1841 return i;
1842}
1843
aa48a34f
CB
1844__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
1845 char ***out)
36662416
TA
1846{
1847 int i;
1848
aa48a34f
CB
1849 if (!ops)
1850 return ret_set_errno(false, ENOENT);
1851
69b4a4bb 1852 if (!ops->hierarchies)
77c3e9a2 1853 return ret_set_errno(false, ENOENT);
69b4a4bb 1854
36662416 1855 /* sanity check n */
6b38e644 1856 for (i = 0; i < n; i++)
2202afc9 1857 if (!ops->hierarchies[i])
aa48a34f 1858 return ret_set_errno(false, ENOENT);
36662416 1859
2202afc9 1860 *out = ops->hierarchies[i]->controllers;
36662416
TA
1861
1862 return true;
1863}
1864
ee3a7775 1865static bool cg_legacy_freeze(struct cgroup_ops *ops)
ccb4cabe 1866{
d6337a5f 1867 struct hierarchy *h;
ccb4cabe 1868
ee3a7775
CB
1869 h = get_hierarchy(ops, "freezer");
1870 if (!h)
d2203230 1871 return ret_set_errno(-1, ENOENT);
81468ea7 1872
c04a6d4e
CB
1873 return lxc_write_openat(h->container_full_path, "freezer.state",
1874 "FROZEN", STRLITERALLEN("FROZEN"));
ee3a7775 1875}
942e193e 1876
018051e3
CB
1877static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1878 struct lxc_epoll_descr *descr)
ee3a7775 1879{
f62cf1d4 1880 __do_close int duped_fd = -EBADF;
018051e3 1881 __do_free char *line = NULL;
ee3a7775 1882 __do_fclose FILE *f = NULL;
018051e3
CB
1883 int state = PTR_TO_INT(cbdata);
1884 size_t len;
1885 const char *state_string;
1886
1887 duped_fd = dup(fd);
1888 if (duped_fd < 0)
1889 return LXC_MAINLOOP_ERROR;
1890
1891 if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1)
1892 return LXC_MAINLOOP_ERROR;
1893
1894 f = fdopen(duped_fd, "re");
1895 if (!f)
1896 return LXC_MAINLOOP_ERROR;
1897 move_fd(duped_fd);
1898
1899 if (state == 1)
1900 state_string = "frozen 1";
1901 else
1902 state_string = "frozen 0";
1903
1904 while (getline(&line, &len, f) != -1)
1905 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
1906 return LXC_MAINLOOP_CLOSE;
1907
1908 return LXC_MAINLOOP_CONTINUE;
1909}
1910
1911static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1912{
f62cf1d4 1913 __do_close int fd = -EBADF;
eafc1bb6 1914 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
018051e3
CB
1915 int ret;
1916 struct lxc_epoll_descr descr;
ee3a7775 1917 struct hierarchy *h;
942e193e
CB
1918
1919 h = ops->unified;
457ca9aa 1920 if (!h)
d2203230 1921 return ret_set_errno(-1, ENOENT);
d6337a5f 1922
018051e3 1923 if (!h->container_full_path)
d2203230 1924 return ret_set_errno(-1, EEXIST);
d6337a5f 1925
018051e3
CB
1926 if (timeout != 0) {
1927 __do_free char *events_file = NULL;
942e193e 1928
018051e3
CB
1929 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
1930 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1931 if (fd < 0)
d2203230 1932 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
942e193e 1933
018051e3
CB
1934 ret = lxc_mainloop_open(&descr);
1935 if (ret)
d2203230 1936 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");
942e193e 1937
018051e3
CB
1938 /* automatically cleaned up now */
1939 descr_ptr = &descr;
942e193e 1940
018051e3
CB
1941 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
1942 if (ret < 0)
d2203230 1943 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
018051e3 1944 }
942e193e 1945
c04a6d4e 1946 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
018051e3 1947 if (ret < 0)
d2203230 1948 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
018051e3
CB
1949
1950 if (timeout != 0 && lxc_mainloop(&descr, timeout))
d2203230 1951 return log_error_errno(-1, errno, "Failed to wait for container to be frozen");
018051e3
CB
1952
1953 return 0;
942e193e
CB
1954}
1955
018051e3 1956__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 1957{
81468ea7 1958 if (!ops->hierarchies)
d2203230 1959 return ret_set_errno(-1, ENOENT);
81468ea7 1960
ee3a7775
CB
1961 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1962 return cg_legacy_freeze(ops);
942e193e 1963
018051e3 1964 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
1965}
1966
018051e3 1967static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 1968{
ee3a7775
CB
1969 struct hierarchy *h;
1970
1971 h = get_hierarchy(ops, "freezer");
1972 if (!h)
d2203230 1973 return ret_set_errno(-1, ENOENT);
ee3a7775 1974
c04a6d4e
CB
1975 return lxc_write_openat(h->container_full_path, "freezer.state",
1976 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
1977}
1978
/* cg_unified_unfreeze: thaw a cgroup v2 container by writing "0" to
 * cgroup.freeze. When @timeout is non-zero, wait (via an epoll watch on
 * cgroup.events handled by freezer_cgroup_events_cb) until the kernel reports
 * "frozen 0" or the timeout expires. Returns 0 on success, -1 on error.
 */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close)struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	/* Without a container path there is nothing to write to. */
	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cbdata 0 => wait until "frozen 0" shows up in cgroup.events. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
	if (ret < 0)
		/* NOTE(review): message says "open" but the failure is the
		 * write of cgroup.freeze -- consider rewording.
		 */
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");

	return 0;
}
2023
018051e3 2024__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2025{
2026 if (!ops->hierarchies)
d2203230 2027 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2028
2029 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2030 return cg_legacy_unfreeze(ops);
2031
018051e3 2032 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2033}
2034
b857f4be 2035__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
6bdf9691 2036 const char *controller)
ccb4cabe 2037{
d6337a5f
CB
2038 struct hierarchy *h;
2039
2202afc9 2040 h = get_hierarchy(ops, controller);
6bdf9691 2041 if (!h)
77c3e9a2 2042 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
6bdf9691 2043 controller ? controller : "(null)");
ccb4cabe 2044
6bdf9691
CB
2045 return h->container_full_path
2046 ? h->container_full_path + strlen(h->mountpoint)
2047 : NULL;
371f834d
SH
2048}
2049
c40c8209
CB
/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
 * which must be freed by the caller.
 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	/* must_make_path() allocates (or aborts on OOM); ownership of the
	 * returned string passes to the caller.
	 */
	return must_make_path(h->mountpoint, inpath, filename, NULL);
}
2059
ba7ca43b
CB
2060#define LXC_UNIFIED_ATTACH_CGROUP_LEN STRLITERALLEN("/lxc-1000/cgroup.procs")
2061static int cgroup_attach_leaf(const struct lxc_conf *conf, char *unified_path,
2062 int unified_fd, pid_t pid)
c2aed66d 2063{
ba7ca43b 2064 __do_free char *path = NULL;
ad275c16 2065 int idx = 1;
c2aed66d 2066 int ret;
900b6606
CB
2067 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2068 size_t pidstr_len;
c2aed66d 2069
ad275c16
CB
2070 /* Create leaf cgroup. */
2071 ret = mkdirat(unified_fd, "lxc", 0755);
2072 if (ret < 0 && errno != EEXIST)
2073 return log_error_errno(-1, errno, "Failed to create leaf cgroup \"lxc\"");
2074
ba7ca43b
CB
2075 path = must_make_path(unified_path, "lxc", NULL);
2076 ret = chown_mapped_root(path, conf);
2077 if (ret < 0)
2078 return log_error_errno(-1, errno, "Failed to chown \"%s\"", path);
2079
7581a82f 2080 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
ad275c16
CB
2081 ret = lxc_writeat(unified_fd, "lxc/cgroup.procs", pidstr, pidstr_len);
2082 if (ret < 0)
2083 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
c2aed66d 2084 if (ret == 0)
bad788b0 2085 return 0;
ad275c16 2086
bad788b0
CB
2087 /* this is a non-leaf node */
2088 if (errno != EBUSY)
d2203230 2089 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2090
ba7ca43b 2091 free_disarm(path);
c2aed66d 2092 do {
7581a82f 2093 bool rm = false;
ba7ca43b 2094 char attach_cgroup[LXC_UNIFIED_ATTACH_CGROUP_LEN + 1];
bad788b0 2095 char *slash;
c2aed66d 2096
ad275c16 2097 sprintf(attach_cgroup, "lxc-%d/cgroup.procs", idx);
bad788b0
CB
2098 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs");
2099 *slash = '\0';
ad275c16 2100
bad788b0 2101 ret = mkdirat(unified_fd, attach_cgroup, 0755);
c2aed66d 2102 if (ret < 0 && errno != EEXIST)
d2203230 2103 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
7581a82f
CB
2104 if (ret == 0)
2105 rm = true;
c2aed66d 2106
bad788b0 2107 *slash = '/';
ad275c16 2108
ba7ca43b
CB
2109 path = must_make_path(unified_path, attach_cgroup, NULL);
2110 ret = chown_mapped_root(path, conf);
2111 if (ret < 0)
2112 return log_error_errno(-1, errno, "Failed to chown \"%s\"", path);
2113 free_disarm(path);
2114
bad788b0 2115 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
c2aed66d 2116 if (ret == 0)
bad788b0 2117 return 0;
c2aed66d 2118
7581a82f
CB
2119 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2120 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2121
c2aed66d
CB
2122 /* this is a non-leaf node */
2123 if (errno != EBUSY)
d2203230 2124 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2125
edae86e9
CB
2126 idx++;
2127 } while (idx < 1000);
c2aed66d 2128
ad275c16 2129 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d
CB
2130}
2131
7581a82f
CB
/* cgroup_attach: attach @pid to the unified cgroup of the running container
 * @name at @lxcpath using the cgroup2 fd handed out by the container's
 * command socket.
 *
 * Returns 0 on success and a negative errno-style value on error. -EBADF
 * specifically means no cgroup2 fd could be retrieved (e.g. the container
 * does not use the unified hierarchy); callers use that to fall back to
 * path-based attachment.
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *buf = NULL;
	int ret;
	ssize_t len;
	struct stat st;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/* Recover the filesystem path of the cgroup the fd refers to by
	 * reading the fd's own link target (empty pathname + AT_EMPTY_PATH);
	 * st_size of the link gives the needed buffer size.
	 * NOTE(review): readlinkat(fd, "", ...) resolving to the target path
	 * presumably relies on /proc-style magic-link semantics of the fd
	 * returned by lxc_cmd_get_cgroup2_fd() -- confirm.
	 */
	ret = fstatat(unified_fd, "", &st, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
	if (ret < 0)
		return -errno;

	if (st.st_size == 0)
		return ret_errno(EINVAL);

	buf = zalloc(st.st_size);
	if (!buf)
		return ret_errno(ENOMEM);

	len = readlinkat(unified_fd, "", buf, st.st_size);
	if (len < 0)
		return -errno;
	if (len >= st.st_size)
		return ret_errno(E2BIG);

	return cgroup_attach_leaf(conf, buf, unified_fd, pid);
}
2167
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Preferred route: let the container's command handler do the work. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	/* -EBADF means no cgroup2 fd was available; anything else is fatal. */
	if (ret != -EBADF)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = must_make_path(h->mountpoint, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	return cgroup_attach_leaf(conf, path, unified_fd, pid);
}
2209
7581a82f
CB
2210__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2211 const struct lxc_conf *conf,
2212 const char *name, const char *lxcpath,
2213 pid_t pid)
ccb4cabe 2214{
81b5d48a 2215 int len, ret;
a3650c0c 2216 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 2217
ab9a452d
CB
2218 if (!ops)
2219 return ret_set_errno(false, ENOENT);
2220
69b4a4bb
CB
2221 if (!ops->hierarchies)
2222 return true;
2223
a3650c0c
CB
2224 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2225 if (len < 0 || (size_t)len >= sizeof(pidstr))
ccb4cabe
SH
2226 return false;
2227
81b5d48a 2228 for (int i = 0; ops->hierarchies[i]; i++) {
c05b17bd 2229 __do_free char *fullpath = NULL, *path = NULL;
2202afc9 2230 struct hierarchy *h = ops->hierarchies[i];
ccb4cabe 2231
c2aed66d 2232 if (h->version == CGROUP2_SUPER_MAGIC) {
7581a82f 2233 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
a3926f6a 2234 h->controllers[0]);
c2aed66d
CB
2235 if (ret < 0)
2236 return false;
2237
2238 continue;
2239 }
2240
ccb4cabe 2241 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2242 /* not running */
2243 if (!path)
e2cb2e74 2244 return false;
ccb4cabe 2245
371f834d 2246 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
7cea5905 2247 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
ab9a452d 2248 if (ret < 0)
77c3e9a2 2249 return log_error_errno(false, errno, "Failed to attach %d to %s",
ab9a452d 2250 (int)pid, fullpath);
ccb4cabe
SH
2251 }
2252
ccb4cabe
SH
2253 return true;
2254}
2255
e2bd2b13
CB
2256/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2257 * don't have a cgroup_data set up, so we ask the running container through the
2258 * commands API for the cgroup path.
ccb4cabe 2259 */
b857f4be 2260__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2261 char *value, size_t len, const char *name,
2262 const char *lxcpath)
ccb4cabe 2263{
d97919ab 2264 __do_free char *path = NULL;
88396101 2265 __do_free char *controller = NULL;
d97919ab 2266 char *p;
0069cc61 2267 struct hierarchy *h;
861cb8c2 2268 int ret = -1;
ccb4cabe 2269
a358028a
CB
2270 if (!ops)
2271 return ret_set_errno(-1, ENOENT);
2272
861cb8c2 2273 controller = must_copy_string(filename);
0069cc61
CB
2274 p = strchr(controller, '.');
2275 if (p)
ccb4cabe
SH
2276 *p = '\0';
2277
0069cc61
CB
2278 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2279 /* not running */
2280 if (!path)
ccb4cabe
SH
2281 return -1;
2282
2202afc9 2283 h = get_hierarchy(ops, controller);
ccb4cabe 2284 if (h) {
88396101 2285 __do_free char *fullpath = NULL;
0069cc61
CB
2286
2287 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2288 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2289 }
ccb4cabe
SH
2290
2291 return ret;
2292}
2293
cb3fc90c
CB
2294static int device_cgroup_parse_access(struct device_item *device, const char *val)
2295{
2296 for (int count = 0; count < 3; count++, val++) {
2297 switch (*val) {
2298 case 'r':
2299 device->access[count] = *val;
2300 break;
2301 case 'w':
2302 device->access[count] = *val;
2303 break;
2304 case 'm':
2305 device->access[count] = *val;
2306 break;
2307 case '\n':
2308 case '\0':
2309 count = 3;
2310 break;
2311 default:
2312 return ret_errno(EINVAL);
2313 }
2314 }
2315
2316 return 0;
2317}
2318
2a63b5cb
CB
/* device_cgroup_rule_parse: parse a cgroup2 devices rule such as "c 4:0 rwm"
 * (or the global rule "a") into @device.
 *
 * @key selects allow vs deny ("devices.allow" sets allow = 1); @val is the
 * rule text: type ('a', 'b' or 'c'), a space, "major:minor" where either side
 * may be '*' (meaning any, stored as -1), a space, then up to three access
 * characters out of 'r', 'w', 'm'.
 * Returns 0 on success, -1 (or negative errno) on malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strcmp("devices.allow", key) == 0)
		device->allow = 1;
	else
		device->allow = 0;

	if (strcmp(val, "a") == 0) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		device->global_rule = device->allow
					  ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
					  : LXC_BPF_DEVICE_CGROUP_WHITELIST;
		/* allow/deny is encoded in global_rule; mark allow unused */
		device->allow = -1;
		return 0;
	}

	/* local rule */
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	/* device type */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* exactly one whitespace character separates type and major */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: '*' (any) or a decimal number */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* the remainder is the access string, e.g. "rwm" */
	return device_cgroup_parse_access(device, ++val);
}
2405
eec533e3
CB
2406/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2407 * don't have a cgroup_data set up, so we ask the running container through the
2408 * commands API for the cgroup path.
ccb4cabe 2409 */
b857f4be 2410__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2a63b5cb 2411 const char *key, const char *value,
fb55e009 2412 const char *name, const char *lxcpath)
ccb4cabe 2413{
d97919ab 2414 __do_free char *path = NULL;
88396101 2415 __do_free char *controller = NULL;
d97919ab 2416 char *p;
87777968 2417 struct hierarchy *h;
861cb8c2 2418 int ret = -1;
ccb4cabe 2419
a358028a
CB
2420 if (!ops)
2421 return ret_set_errno(-1, ENOENT);
2422
2a63b5cb 2423 controller = must_copy_string(key);
87777968
CB
2424 p = strchr(controller, '.');
2425 if (p)
ccb4cabe
SH
2426 *p = '\0';
2427
2a63b5cb
CB
2428 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2429 struct device_item device = {0};
2430
2431 ret = device_cgroup_rule_parse(&device, key, value);
2432 if (ret < 0)
d2203230 2433 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2a63b5cb
CB
2434 key, value);
2435
2436 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2437 if (ret < 0)
2438 return -1;
2439
2440 return 0;
2441 }
2442
87777968
CB
2443 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2444 /* not running */
2445 if (!path)
ccb4cabe
SH
2446 return -1;
2447
2202afc9 2448 h = get_hierarchy(ops, controller);
ccb4cabe 2449 if (h) {
88396101 2450 __do_free char *fullpath = NULL;
87777968 2451
2a63b5cb 2452 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
7cea5905 2453 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe 2454 }
ccb4cabe
SH
2455
2456 return ret;
2457}
2458
91d1a13a 2459/* take devices cgroup line
72add155
SH
2460 * /dev/foo rwx
2461 * and convert it to a valid
2462 * type major:minor mode
91d1a13a
CB
2463 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2464 * the output.
72add155 2465 */
cb3fc90c
CB
2466static int device_cgroup_rule_parse_devpath(struct device_item *device,
2467 const char *devpath)
72add155 2468{
88396101 2469 __do_free char *path = NULL;
2a06d041 2470 char *mode = NULL;
cb3fc90c
CB
2471 int n_parts, ret;
2472 char *p;
2473 struct stat sb;
72add155 2474
cb3fc90c 2475 path = must_copy_string(devpath);
72add155 2476
cb3fc90c
CB
2477 /*
2478 * Read path followed by mode. Ignore any trailing text.
91d1a13a
CB
2479 * A ' # comment' would be legal. Technically other text is not
2480 * legal, we could check for that if we cared to.
72add155 2481 */
0dbdb99e 2482 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2483 if (*p != ' ')
2484 continue;
2485 *p = '\0';
91d1a13a 2486
2c2d6c49
SH
2487 if (n_parts != 1)
2488 break;
2489 p++;
2490 n_parts++;
91d1a13a 2491
2c2d6c49
SH
2492 while (*p == ' ')
2493 p++;
91d1a13a 2494
2c2d6c49 2495 mode = p;
91d1a13a 2496
2c2d6c49 2497 if (*p == '\0')
cb3fc90c 2498 return ret_set_errno(-1, EINVAL);
72add155 2499 }
2c2d6c49 2500
cb3fc90c
CB
2501 if (device_cgroup_parse_access(device, mode) < 0)
2502 return -1;
2503
2c2d6c49 2504 if (n_parts == 1)
cb3fc90c 2505 return ret_set_errno(-1, EINVAL);
72add155
SH
2506
2507 ret = stat(path, &sb);
2508 if (ret < 0)
cb3fc90c 2509 return ret_set_errno(-1, errno);
72add155 2510
72add155
SH
2511 mode_t m = sb.st_mode & S_IFMT;
2512 switch (m) {
2513 case S_IFBLK:
cb3fc90c 2514 device->type = 'b';
72add155
SH
2515 break;
2516 case S_IFCHR:
cb3fc90c 2517 device->type = 'c';
72add155 2518 break;
2c2d6c49 2519 default:
77c3e9a2 2520 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
72add155 2521 }
2c2d6c49 2522
cb3fc90c
CB
2523 device->major = MAJOR(sb.st_rdev);
2524 device->minor = MINOR(sb.st_rdev);
2525 device->allow = 1;
2526 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
72add155 2527
cb3fc90c
CB
2528 return 0;
2529}
2530
2531static int convert_devpath(const char *invalue, char *dest)
2532{
2533 struct device_item device = {0};
2534 int ret;
2535
2536 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2537 if (ret < 0)
2538 return -1;
2539
2540 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2541 device.minor, device.access);
2542 if (ret < 0 || ret >= 50)
77c3e9a2
CB
2543 return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2544 device.type, device.major, device.minor, device.access);
cb3fc90c
CB
2545
2546 return 0;
72add155
SH
2547}
2548
90e97284
CB
2549/* Called from setup_limits - here we have the container's cgroup_data because
2550 * we created the cgroups.
ccb4cabe 2551 */
2202afc9
CB
2552static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2553 const char *value)
ccb4cabe 2554{
88396101 2555 __do_free char *controller = NULL;
d97919ab 2556 char *p;
1a0e70ac
CB
2557 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2558 char converted_value[50];
b3646d7e 2559 struct hierarchy *h;
64e82f8b 2560
861cb8c2 2561 controller = must_copy_string(filename);
ab1a6cac
CB
2562 p = strchr(controller, '.');
2563 if (p)
ccb4cabe
SH
2564 *p = '\0';
2565
c8bf519d 2566 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
c04a6d4e
CB
2567 int ret;
2568
72add155
SH
2569 ret = convert_devpath(value, converted_value);
2570 if (ret < 0)
c8bf519d 2571 return ret;
72add155 2572 value = converted_value;
c8bf519d 2573 }
2574
2202afc9 2575 h = get_hierarchy(ops, controller);
77c3e9a2
CB
2576 if (!h)
2577 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
b3646d7e 2578
c04a6d4e 2579 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
ccb4cabe
SH
2580}
2581
c581d2a6
CB
/* cgfsng_setup_limits_legacy: apply the legacy (cgroup v1) settings from
 * @conf->cgroup to the container's cgroups. With @do_devices true only the
 * "devices.*" keys are applied, with it false only the non-device keys, so
 * the caller can sequence device rules separately. Returns true on success.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	__do_free struct lxc_list *sorted_cgroup_settings = NULL;
	/* NOTE(review): this initializer computes &conf->cgroup before the
	 * NULL check on conf below; consider assigning after the check.
	 */
	struct lxc_list *cgroup_settings = &conf->cgroup;
	struct lxc_list *iterator, *next;
	struct lxc_cgroup *cg;
	bool ret = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	/* Sort settings so dependent keys are applied in a workable order. */
	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
	if (!sorted_cgroup_settings)
		return false;

	lxc_list_for_each(iterator, sorted_cgroup_settings) {
		cg = iterator->elem;

		/* Apply device keys iff do_devices, non-device keys iff not. */
		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
				/* Unprivileged containers may legitimately be
				 * denied device rule changes; warn and move on.
				 */
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
				goto out;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
		}
	}

	ret = true;
	INFO("Limits for the legacy cgroup hierarchies have been setup");
out:
	/* Free only the list nodes; the elements belong to @conf. */
	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
		lxc_list_del(iterator);
		free(iterator);
	}

	return ret;
}
2635
bf651989
CB
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
/* bpf_device_cgroup_prepare: parse the devices key/value pair and queue the
 * resulting rule on @conf's bpf device list. Without
 * HAVE_STRUCT_BPF_CGROUP_DEV_CTX this is a deliberate no-op returning 0.
 * Returns 0 on success, -1 on error.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item device_item = {0};
	int ret;

	/* A value starting with '/' is a device path ("/dev/foo rwm");
	 * anything else is a "type major:minor access" rule.
	 */
	if (strcmp("devices.allow", key) == 0 && *val == '/')
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);

	ret = bpf_list_add_device(conf, &device_item);
	if (ret < 0)
		return -1;
#endif
	return 0;
}
2661
c581d2a6
CB
2662__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2663 struct lxc_handler *handler)
6b38e644 2664{
7e31931f
CB
2665 struct lxc_list *cgroup_settings, *iterator;
2666 struct hierarchy *h;
2667 struct lxc_conf *conf;
6b38e644 2668
7e31931f
CB
2669 if (!ops)
2670 return ret_set_errno(false, ENOENT);
2671
2672 if (!ops->hierarchies)
6b38e644
CB
2673 return true;
2674
7e31931f
CB
2675 if (!ops->container_cgroup)
2676 return ret_set_errno(false, EINVAL);
2677
2678 if (!handler || !handler->conf)
2679 return ret_set_errno(false, EINVAL);
2680 conf = handler->conf;
2681
2682 if (lxc_list_empty(&conf->cgroup2))
2683 return true;
2684 cgroup_settings = &conf->cgroup2;
2685
2686 if (!ops->unified)
6b38e644 2687 return false;
7e31931f 2688 h = ops->unified;
6b38e644 2689
bf651989 2690 lxc_list_for_each (iterator, cgroup_settings) {
6b38e644 2691 struct lxc_cgroup *cg = iterator->elem;
c04a6d4e 2692 int ret;
6b38e644 2693
bf651989 2694 if (strncmp("devices", cg->subsystem, 7) == 0) {
4bfb655e 2695 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
bf651989
CB
2696 cg->value);
2697 } else {
c04a6d4e
CB
2698 ret = lxc_write_openat(h->container_full_path,
2699 cg->subsystem, cg->value,
2700 strlen(cg->value));
7e31931f 2701 if (ret < 0)
77c3e9a2 2702 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"",
7e31931f 2703 cg->subsystem, cg->value);
6b38e644
CB
2704 }
2705 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2706 }
2707
7e31931f 2708 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
6b38e644
CB
2709}
2710
bf651989
CB
/* cgfsng_devices_activate: build a bpf device program from the device rules
 * collected in @handler->conf->devices, attach it (BPF_F_ALLOW_MULTI) to the
 * container's unified cgroup and store it in conf->cgroup2_devices, replacing
 * any previous program. A no-op returning true when bpf device cgroups are
 * unsupported, unused, or no rules are configured.
 */
__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
					  struct lxc_handler *handler)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	/* freed automatically unless ownership is moved to conf below */
	__do_bpf_program_free struct bpf_program *devices = NULL;
	int ret;
	struct lxc_conf *conf;
	struct hierarchy *unified;
	struct lxc_list *it;
	struct bpf_program *devices_old;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	unified = ops->unified;
	if (!unified || !unified->bpf_device_controller ||
	    !unified->container_full_path || lxc_list_empty(&conf->devices))
		return true;

	devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
	if (!devices)
		return log_error_errno(false, ENOMEM, "Failed to create new bpf program");

	ret = bpf_program_init(devices);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");

	/* Append every configured device rule to the program. */
	lxc_list_for_each(it, &conf->devices) {
		struct device_item *cur = it->elem;

		ret = bpf_program_append_device(devices, cur);
		if (ret)
			return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
					       cur->type,
					       cur->major,
					       cur->minor,
					       cur->access,
					       cur->allow,
					       cur->global_rule);
		TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
		      cur->type,
		      cur->major,
		      cur->minor,
		      cur->access,
		      cur->allow,
		      cur->global_rule);
	}

	ret = bpf_program_finalize(devices);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");

	ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
					unified->container_full_path,
					BPF_F_ALLOW_MULTI);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to attach bpf program");

	/* Replace old bpf program. The swap hands the new program to @conf
	 * and routes the old one into the cleanup variable.
	 */
	devices_old = move_ptr(conf->cgroup2_devices);
	conf->cgroup2_devices = move_ptr(devices);
	devices = move_ptr(devices_old);
#endif
	return true;
}
2786
/* __cgfsng_delegate_controllers: enable all detected cgroup2 controllers in
 * cgroup.subtree_control at every level from the container's base path down
 * along @cgroup. Builds a single "+ctrl1 +ctrl2 ..." string and writes it at
 * each level. Returns true on success (or when there is nothing to do).
 */
bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_free char *add_controllers = NULL, *base_path = NULL;
	__do_free_string_list char **parts = NULL;
	struct hierarchy *unified = ops->unified;
	ssize_t parts_len;
	char **it;
	size_t full_len = 0;

	if (!ops->hierarchies || !pure_unified_layout(ops) ||
	    !unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		/* "+<name>" plus separator (or terminator) */
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* first iteration: start with an empty string for strlcat */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	parts = lxc_string_split(cgroup, '/');
	if (!parts)
		return false;

	/* Skip the final component: the leaf itself needs no delegation. */
	parts_len = lxc_array_len((void **)parts);
	if (parts_len > 0)
		parts_len--;

	base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
	/* i == -1 writes at the base path itself, then one level per part.
	 * NOTE(review): must_append_path() reassigns base_path; presumably it
	 * reallocates/frees the old buffer -- confirm no leak.
	 */
	for (ssize_t i = -1; i < parts_len; i++) {
		int ret;
		__do_free char *target = NULL;

		if (i >= 0)
			base_path = must_append_path(base_path, parts[i], NULL);
		target = must_make_path(base_path, "cgroup.subtree_control", NULL);
		ret = lxc_writeat(-1, target, add_controllers, full_len);
		if (ret < 0)
			return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
					       add_controllers, target);
		TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
	}

	return true;
}
2845
2846__cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2847{
61fbc369
CB
2848 if (!ops)
2849 return ret_set_errno(false, ENOENT);
2850
c581d2a6
CB
2851 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2852}
2853
2854__cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2855{
61fbc369
CB
2856 if (!ops)
2857 return ret_set_errno(false, ENOENT);
2858
c581d2a6 2859 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
2860}
2861
b7b18fc5
CB
2862static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2863 char **controllers)
2864{
b7b18fc5
CB
2865 if (!ops->cgroup_use)
2866 return true;
2867
431e2c54 2868 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
2869 bool found = false;
2870
431e2c54 2871 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
b7b18fc5
CB
2872 if (strcmp(*cur_use, *cur_ctrl) != 0)
2873 continue;
2874
2875 found = true;
2876 break;
2877 }
2878
2879 if (found)
2880 continue;
2881
2882 return false;
2883 }
2884
2885 return true;
2886}
2887
/* Build the list of cgroup2 files that must be chowned when delegating a
 * cgroup subtree to an unprivileged user.
 *
 * The list is read from /sys/kernel/cgroup/delegate; when that file cannot
 * be read we fall back to a static default list and log a warning.
 *
 * @delegate: NULL-terminated string array grown via append_null_to_list();
 *            entries are heap-allocated copies owned by the caller.
 */
static void cg_unified_delegate(char ***delegate)
{
	__do_free char *buf = NULL;
	/* Fallback when the kernel does not expose the delegate file. */
	char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
	char *token;
	int idx;

	buf = read_file("/sys/kernel/cgroup/delegate");
	if (!buf) {
		for (char **p = standard; p && *p; p++) {
			idx = append_null_to_list((void ***)delegate);
			(*delegate)[idx] = must_copy_string(*p);
		}
		SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
		return;
	}

	/* The file lists whitespace-separated file names. */
	lxc_iterate_parts (token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2, so it is handled elsewhere and skipped here.
		 */
		if (strcmp(token, "cgroup.procs") == 0)
			continue;

		idx = append_null_to_list((void ***)delegate);
		(*delegate)[idx] = must_copy_string(token);
	}
}
2917
/* At startup, parse_hierarchies finds all the info we need about cgroup
 * mountpoints and current cgroups, and stores it in @d.
 *
 * Scans /proc/self/mountinfo for legacy (v1) and unified (v2) cgroup mounts,
 * records every writable hierarchy in ops->hierarchies, and classifies the
 * overall layout (legacy / unified / hybrid) as mounts are discovered.
 *
 * @relative:     when false and running as real root, use PID 1's cgroups as
 *                the base instead of our own.
 * @unprivileged: when true, collect the cgroup2 delegate file list for the
 *                unified hierarchy so it can be chowned later.
 *
 * Returns 0 on success, -1 (with errno set) on failure.
 */
static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
{
	__do_free char *basecginfo = NULL, *line = NULL;
	__do_free_string_list char **klist = NULL, **nlist = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		basecginfo = read_file("/proc/1/cgroup");
	else
		basecginfo = read_file("/proc/self/cgroup");
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	/* klist: kernel-known controllers; nlist: named (name=) hierarchies. */
	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);

	while (getline(&line, &len, f) != -1) {
		__do_free char *base_cgroup = NULL, *mountpoint = NULL;
		__do_free_string_list char **controller_list = NULL;
		int type;
		bool writeable;
		struct hierarchy *new;

		/* 0 means this mountinfo line is not a cgroup mount at all. */
		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only one unified hierarchy can exist; skip duplicates. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		/* Layout state machine: seeing both v1 and v2 mounts in any
		 * order upgrades the classification to hybrid.
		 */
		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		/* The same v1 controller set may be mounted more than once. */
		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list)) {
				TRACE("Skipping duplicating controller");
				continue;
			}

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint) {
			ERROR("Failed parsing mountpoint from \"%s\"", line);
			continue;
		}

		/* Locate our current cgroup in this hierarchy from the
		 * /proc/<pid>/cgroup data read above.
		 */
		if (type == CGROUP_SUPER_MAGIC)
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		else
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		if (!base_cgroup) {
			ERROR("Failed to find current cgroup");
			continue;
		}

		trim(base_cgroup);
		prune_init_scope(base_cgroup);
		/* Hierarchies we cannot write to are useless to us. */
		if (type == CGROUP2_SUPER_MAGIC)
			writeable = test_writeable_v2(mountpoint, base_cgroup);
		else
			writeable = test_writeable_v1(mountpoint, base_cgroup);
		if (!writeable) {
			TRACE("The %s group is not writeable", base_cgroup);
			continue;
		}

		/* For cgroup2 the controller list comes from the hierarchy's
		 * own cgroup.controllers file, not from the mount options.
		 */
		if (type == CGROUP2_SUPER_MAGIC) {
			char *cgv2_ctrl_path;

			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
							"cgroup.controllers",
							NULL);

			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
			free(cgv2_ctrl_path);
			if (!controller_list) {
				controller_list = cg_unified_make_empty_controller();
				TRACE("No controllers are enabled for "
				      "delegation in the unified hierarchy");
			}
		}

		/* Exclude all controllers that cgroup use does not want. */
		if (!cgroup_use_wants_controllers(ops, controller_list)) {
			TRACE("Skipping controller");
			continue;
		}

		/* Ownership of the three strings/lists moves into the new
		 * hierarchy entry via move_ptr().
		 */
		new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
		if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
			if (unprivileged)
				cg_unified_delegate(&new->cgroup2_chown);
			ops->unified = new;
		}
	}

	TRACE("Writable cgroup hierarchies:");
	lxc_cgfsng_print_hierarchies(ops);

	/* verify that all controllers in cgroup.use and all crucial
	 * controllers are accounted for
	 */
	if (!all_controllers_found(ops))
		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");

	return 0;
}
3053
2202afc9 3054/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 3055static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 3056{
88396101 3057 __do_free char *basecginfo = NULL;
d7314671 3058 char *copy;
d97919ab 3059 char *base_cgroup;
2202afc9 3060
9caee129 3061 if (!relative && (geteuid() == 0))
2202afc9
CB
3062 basecginfo = read_file("/proc/1/cgroup");
3063 else
3064 basecginfo = read_file("/proc/self/cgroup");
3065 if (!basecginfo)
3066 return NULL;
3067
3068 base_cgroup = strstr(basecginfo, "0::/");
3069 if (!base_cgroup)
d7314671 3070 return NULL;
2202afc9
CB
3071
3072 base_cgroup = base_cgroup + 3;
3073 copy = copy_to_eol(base_cgroup);
3074 if (!copy)
d7314671 3075 return NULL;
2202afc9 3076
d7314671 3077 return trim(copy);
2202afc9
CB
3078}
3079
a6ca2ed8
CB
3080static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3081 bool unprivileged)
2202afc9 3082{
d97919ab 3083 __do_free char *subtree_path = NULL;
2202afc9 3084 int ret;
7717e175 3085 char *mountpoint;
2202afc9 3086 char **delegatable;
a6ca2ed8 3087 struct hierarchy *new;
2202afc9
CB
3088 char *base_cgroup = NULL;
3089
d47ff01b 3090 ret = unified_cgroup_hierarchy();
2202afc9 3091 if (ret == -ENOMEDIUM)
d2203230 3092 return ret_errno(ENOMEDIUM);
2202afc9
CB
3093
3094 if (ret != CGROUP2_SUPER_MAGIC)
3095 return 0;
3096
9caee129 3097 base_cgroup = cg_unified_get_current_cgroup(relative);
2202afc9 3098 if (!base_cgroup)
d2203230 3099 return ret_errno(EINVAL);
c581d2a6
CB
3100 if (!relative)
3101 prune_init_scope(base_cgroup);
2202afc9 3102
d606c4e9
CB
3103 /*
3104 * We assume that the cgroup we're currently in has been delegated to
3105 * us and we are free to further delege all of the controllers listed
3106 * in cgroup.controllers further down the hierarchy.
2202afc9 3107 */
dca9587a 3108 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
c581d2a6 3109 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
2202afc9 3110 delegatable = cg_unified_get_controllers(subtree_path);
2202afc9
CB
3111 if (!delegatable)
3112 delegatable = cg_unified_make_empty_controller();
3113 if (!delegatable[0])
3114 TRACE("No controllers are enabled for delegation");
3115
3116 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3117 * we should verify here. The reason I'm not doing it right is that I'm
3118 * not convinced that lxc.cgroup.use will be the future since it is a
3119 * global property. I much rather have an option that lets you request
3120 * controllers per container.
3121 */
3122
a6ca2ed8 3123 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
d606c4e9 3124 if (unprivileged)
a6ca2ed8 3125 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3126
2a63b5cb
CB
3127 if (bpf_devices_cgroup_supported())
3128 new->bpf_device_controller = 1;
3129
2202afc9 3130 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
908e0ee5 3131 ops->unified = new;
77c3e9a2 3132
2202afc9
CB
3133 return CGROUP2_SUPER_MAGIC;
3134}
3135
341e6516 3136static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
3137{
3138 int ret;
3139 const char *tmp;
9caee129 3140 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
3141
3142 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 3143 if (tmp) {
88396101 3144 __do_free char *pin = NULL;
d97919ab 3145 char *chop, *cur;
b7b18fc5
CB
3146
3147 pin = must_copy_string(tmp);
3148 chop = pin;
3149
d97919ab 3150 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 3151 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 3152 }
2202afc9 3153
a6ca2ed8 3154 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9 3155 if (ret < 0)
341e6516 3156 return -1;
2202afc9
CB
3157
3158 if (ret == CGROUP2_SUPER_MAGIC)
341e6516 3159 return 0;
2202afc9 3160
a6ca2ed8 3161 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
3162}
3163
341e6516 3164__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3165{
3166 const char *cgroup_pattern;
3167
341e6516
CB
3168 if (!ops)
3169 return ret_set_errno(-1, ENOENT);
3170
2202afc9
CB
3171 /* copy system-wide cgroup information */
3172 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
b3ed2061
CB
3173 if (cgroup_pattern && strcmp(cgroup_pattern, "") != 0)
3174 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2202afc9 3175
341e6516 3176 return 0;
2202afc9
CB
3177}
3178
5a087e05 3179struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 3180{
a64edc1c 3181 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9
CB
3182
3183 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3184 if (!cgfsng_ops)
341e6516 3185 return ret_set_errno(NULL, ENOMEM);
2202afc9
CB
3186
3187 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3188 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3189
341e6516 3190 if (cg_init(cgfsng_ops, conf))
2202afc9 3191 return NULL;
2202afc9
CB
3192
3193 cgfsng_ops->data_init = cgfsng_data_init;
434c8e15
CB
3194 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3195 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
72068e74 3196 cgfsng_ops->monitor_create = cgfsng_monitor_create;
eeef32bb 3197 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
c581d2a6
CB
3198 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3199 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
e8b181f5
CB
3200 cgfsng_ops->payload_create = cgfsng_payload_create;
3201 cgfsng_ops->payload_enter = cgfsng_payload_enter;
78eb6aa6 3202 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
2202afc9
CB
3203 cgfsng_ops->escape = cgfsng_escape;
3204 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3205 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3206 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3207 cgfsng_ops->get = cgfsng_get;
3208 cgfsng_ops->set = cgfsng_set;
942e193e 3209 cgfsng_ops->freeze = cgfsng_freeze;
2202afc9 3210 cgfsng_ops->unfreeze = cgfsng_unfreeze;
c581d2a6 3211 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
2202afc9
CB
3212 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3213 cgfsng_ops->driver = "cgfsng";
3214 cgfsng_ops->version = "1.0.0";
3215 cgfsng_ops->attach = cgfsng_attach;
3216 cgfsng_ops->chown = cgfsng_chown;
3217 cgfsng_ops->mount = cgfsng_mount;
bf651989 3218 cgfsng_ops->devices_activate = cgfsng_devices_activate;
2202afc9 3219
a64edc1c 3220 return move_ptr(cgfsng_ops);
2202afc9 3221}