]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
attach: convert to strequal()
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
d38dd64a
CB
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
a54694f8
CB
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
d38dd64a
CB
22#include <linux/kdev_t.h>
23#include <linux/types.h>
942e193e
CB
24#include <poll.h>
25#include <signal.h>
a54694f8 26#include <stdint.h>
ccb4cabe
SH
27#include <stdio.h>
28#include <stdlib.h>
a54694f8 29#include <string.h>
385e58e8 30#include <sys/epoll.h>
438c4581 31#include <sys/types.h>
d38dd64a 32#include <unistd.h>
c8bf519d 33
d1783ef4 34#include "af_unix.h"
b635e92d 35#include "caps.h"
ccb4cabe 36#include "cgroup.h"
bf651989 37#include "cgroup2_devices.h"
6328fd9c 38#include "cgroup_utils.h"
ccb4cabe 39#include "commands.h"
c8af3332 40#include "commands_utils.h"
43654d34 41#include "conf.h"
d38dd64a 42#include "config.h"
a54694f8 43#include "log.h"
c19ad94b 44#include "macro.h"
018051e3 45#include "mainloop.h"
861cb8c2 46#include "memory_utils.h"
74ed30d7 47#include "mount_utils.h"
43654d34 48#include "storage/storage.h"
315f8a4e 49#include "syscall_wrappers.h"
a54694f8 50#include "utils.h"
ccb4cabe 51
64e82f8b
DJ
52#ifndef HAVE_STRLCPY
53#include "include/strlcpy.h"
54#endif
55
3ebe2fbd
DJ
56#ifndef HAVE_STRLCAT
57#include "include/strlcat.h"
58#endif
59
ac2cecc4 60lxc_log_define(cgfsng, cgroup);
ccb4cabe 61
8b8db2f6
CB
62/* Given a pointer to a null-terminated array of pointers, realloc to add one
63 * entry, and point the new entry to NULL. Do not fail. Return the index to the
64 * second-to-last entry - that is, the one which is now available for use
65 * (keeping the list null-terminated).
ccb4cabe
SH
66 */
67static int append_null_to_list(void ***list)
68{
69 int newentry = 0;
70
71 if (*list)
8b8db2f6
CB
72 for (; (*list)[newentry]; newentry++)
73 ;
ccb4cabe
SH
74
75 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
76 (*list)[newentry + 1] = NULL;
77 return newentry;
78}
79
8073018d
CB
80/* Given a null-terminated array of strings, check whether @entry is one of the
81 * strings.
ccb4cabe
SH
82 */
83static bool string_in_list(char **list, const char *entry)
84{
ccb4cabe
SH
85 if (!list)
86 return false;
d6337a5f 87
77c3e9a2 88 for (int i = 0; list[i]; i++)
8b99a20a 89 if (strequal(list[i], entry))
ccb4cabe
SH
90 return true;
91
92 return false;
93}
94
ac010944
CB
/* Return a newly allocated copy of @entry with "name=" prepended,
 * i.e. turn "systemd" into "name=systemd". Allocation cannot fail
 * (must_realloc() aborts on OOM).
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t entry_len = strlen(entry);
	char *result;

	/* "name=" (STRLITERALLEN == 5) plus entry plus NUL terminator. */
	result = must_realloc(NULL, entry_len + STRLITERALLEN("name=") + 1);
	memcpy(result, "name=", STRLITERALLEN("name="));
	memcpy(result + STRLITERALLEN("name="), entry, entry_len);
	result[STRLITERALLEN("name=") + entry_len] = '\0';

	return result;
}
112
42a993b4
CB
/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int newentry;
	char *copy;

	/* Ambiguous: the same name appears as both a kernel ("cpu") and a
	 * named ("name=cpu") subsystem; refuse rather than guess.
	 */
	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	/* Reserve a slot; append_null_to_list() keeps *clist NULL-terminated. */
	newentry = append_null_to_list((void ***)clist);

	/* Already "name="-prefixed or a known kernel subsystem: copy as-is;
	 * otherwise it is a bare named subsystem and gets the "name=" prefix.
	 */
	if (strncmp(entry, "name=", 5) == 0)
		copy = must_copy_string(entry);
	else if (string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[newentry] = copy;
}
147
5ae0207c
CB
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 *
 * A NULL @controller requests the empty unified (cgroup2) hierarchy. On a
 * pure unified layout the "devices" and "freezer" controllers are special:
 * they only exist as bpf/cgroup2 features, so they resolve to ops->unified
 * when the corresponding capability flag is set, and fail otherwise.
 *
 * On failure NULL is returned with errno set to ENOENT.
 */
static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				/* Only usable when the bpf device controller
				 * has been set up; otherwise fall through to
				 * the WARN + ENOENT below.
				 */
				if (ops->unified->bpf_device_controller)
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				if (ops->unified->freezer_controller)
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
194
a54694f8
CB
/* Taken over modified from the kernel sources.
 *
 * Bit-array helpers over uint32_t words. Callers must keep @bit within
 * the bits the array was allocated for; no bounds checking is done here.
 */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in @bitarr.
 * The shift operand must be unsigned: "1 << 31" shifts into the sign bit
 * of a 32-bit int, which is undefined behavior; "1U << 31" is well-defined.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1U << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1U << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1U << (bit % NBITS))) != 0;
}
214
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * @buf is consumed by lxc_iterate_parts() (mutated in place). @nbits is
 * the number of valid bit positions; any cpu number >= @nbits is rejected
 * with EINVAL. Returns a calloc'd array of BITS_TO_LONGS(@nbits) words,
 * or NULL with errno set (ENOMEM/EINVAL).
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	/* Each comma-separated token is either "N" or a range "N-M". */
	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		/* NOTE(review): strtoul() failures are not detected here
		 * (no endptr check); a non-numeric token parses as 0.
		 */
		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		/* Reject inverted ranges ... */
		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		/* ... and cpu numbers outside the allocated bit range. */
		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}
257
a54694f8
CB
258/* Turn cpumask into simple, comma-separated cpulist. */
259static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
260{
f761d24d 261 __do_free_string_list char **cpulist = NULL;
c19ad94b 262 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
77c3e9a2 263 int ret;
a54694f8 264
77c3e9a2 265 for (size_t i = 0; i <= nbits; i++) {
414c6719
CB
266 if (!is_set(i, bitarr))
267 continue;
268
0bba27c1
CB
269 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
270 if (ret < 0)
414c6719 271 return NULL;
414c6719
CB
272
273 ret = lxc_append_string(&cpulist, numstr);
f761d24d 274 if (ret < 0)
c5b8049e 275 return ret_set_errno(NULL, ENOMEM);
a54694f8 276 }
414c6719
CB
277
278 if (!cpulist)
c5b8049e 279 return ret_set_errno(NULL, ENOMEM);
414c6719 280
f761d24d 281 return lxc_string_join(",", (const char **)cpulist, false);
a54694f8
CB
282}
283
/* Return the highest cpu number mentioned in cpu list @cpulist
 * (e.g. "0,2-3" -> 3, "0-7" -> 7, "5" -> 5), or -1 on parse failure.
 *
 * The list is ordered, so the maximum is the number after the final ','
 * and/or the final '-', whichever starts later in the string.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus;

	after_comma = strrchr(cpulist, ',');
	if (after_comma)
		after_comma++;

	after_dash = strrchr(cpulist, '-');
	if (after_dash)
		after_dash++;

	/* Pick whichever separator occurs later; with no separator the list
	 * is a single number. The pointers are only compared relationally
	 * when both point into @cpulist — the previous version compared
	 * NULL against an in-object pointer, which is undefined behavior,
	 * and carried an unreachable "!c1 && c2" branch.
	 */
	if (!after_comma && !after_dash)
		last = cpulist;
	else if (!after_dash)
		last = after_comma;
	else if (!after_comma)
		last = after_dash;
	else
		last = (after_comma > after_dash) ? after_comma : after_dash;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return (ssize_t)cpus;
}
314
6f9584d8 315#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
36f70181 316#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
c5b8049e
CB
317static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
318 char *child_cgroup, bool am_initialized)
a54694f8 319{
d97919ab 320 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
36f70181
CB
321 *offlinecpus = NULL, *posscpus = NULL;
322 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
323 *possmask = NULL;
a54694f8
CB
324 int ret;
325 ssize_t i;
36f70181 326 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
c5b8049e 327 bool flipped_bit = false;
a54694f8 328
c5b8049e 329 fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
46bf13b7 330 posscpus = read_file_at(-EBADF, fpath, PROTECT_OPEN, 0);
c5b8049e
CB
331 if (!posscpus)
332 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
a54694f8
CB
333
334 /* Get maximum number of cpus found in possible cpuset. */
335 maxposs = get_max_cpus(posscpus);
92d5ea57 336 if (maxposs < 0 || maxposs >= INT_MAX - 1)
d97919ab 337 return false;
a54694f8 338
36f70181 339 if (file_exists(__ISOL_CPUS)) {
46bf13b7 340 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
c5b8049e
CB
341 if (!isolcpus)
342 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
6f9584d8 343
36f70181
CB
344 if (isdigit(isolcpus[0])) {
345 /* Get maximum number of cpus found in isolated cpuset. */
346 maxisol = get_max_cpus(isolcpus);
347 if (maxisol < 0 || maxisol >= INT_MAX - 1)
348 return false;
6f9584d8 349 }
36f70181
CB
350
351 if (maxposs < maxisol)
352 maxposs = maxisol;
353 maxposs++;
354 } else {
355 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
a54694f8
CB
356 }
357
36f70181 358 if (file_exists(__OFFLINE_CPUS)) {
46bf13b7 359 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
c5b8049e
CB
360 if (!offlinecpus)
361 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
36f70181
CB
362
363 if (isdigit(offlinecpus[0])) {
364 /* Get maximum number of cpus found in offline cpuset. */
365 maxoffline = get_max_cpus(offlinecpus);
366 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
367 return false;
368 }
369
370 if (maxposs < maxoffline)
371 maxposs = maxoffline;
372 maxposs++;
373 } else {
374 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
375 }
a54694f8 376
dcd14a3d
CB
377 if ((maxisol == 0) && (maxoffline == 0)) {
378 cpulist = move_ptr(posscpus);
36f70181 379 goto copy_parent;
dcd14a3d 380 }
a54694f8
CB
381
382 possmask = lxc_cpumask(posscpus, maxposs);
c5b8049e
CB
383 if (!possmask)
384 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
a54694f8 385
36f70181
CB
386 if (maxisol > 0) {
387 isolmask = lxc_cpumask(isolcpus, maxposs);
c5b8049e
CB
388 if (!isolmask)
389 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
36f70181
CB
390 }
391
392 if (maxoffline > 0) {
393 offlinemask = lxc_cpumask(offlinecpus, maxposs);
c5b8049e
CB
394 if (!offlinemask)
395 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
6f9584d8 396 }
a54694f8
CB
397
398 for (i = 0; i <= maxposs; i++) {
36f70181
CB
399 if ((isolmask && !is_set(i, isolmask)) ||
400 (offlinemask && !is_set(i, offlinemask)) ||
401 !is_set(i, possmask))
59ac3b88
CB
402 continue;
403
404 flipped_bit = true;
405 clear_bit(i, possmask);
a54694f8
CB
406 }
407
6f9584d8 408 if (!flipped_bit) {
b31d62b8
CB
409 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
410 TRACE("No isolated or offline cpus present in cpuset");
411 } else {
412 cpulist = move_ptr(posscpus);
413 TRACE("Removed isolated or offline cpus from cpuset");
6f9584d8 414 }
c5b8049e
CB
415 if (!cpulist)
416 return log_error_errno(false, errno, "Failed to create cpu list");
a54694f8
CB
417
418copy_parent:
36f70181 419 if (!am_initialized) {
c5b8049e 420 ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
c04a6d4e
CB
421 if (ret < 0)
422 return log_error_errno(false,
423 errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
c5b8049e 424 child_cgroup);
36f70181
CB
425
426 TRACE("Copied cpu settings of parent cgroup");
6f9584d8
CB
427 }
428
d97919ab 429 return true;
a54694f8
CB
430}
431
e3a3fecf 432/* Copy contents of parent(@path)/@file to @path/@file */
c5b8049e
CB
433static bool copy_parent_file(const char *parent_cgroup,
434 const char *child_cgroup, const char *file)
e3a3fecf 435{
c5b8049e 436 __do_free char *parent_file = NULL, *value = NULL;
b095a8eb 437 int len = 0;
fe70edee 438 int ret;
e3a3fecf 439
c5b8049e
CB
440 parent_file = must_make_path(parent_cgroup, file, NULL);
441 len = lxc_read_from_file(parent_file, NULL, 0);
fe70edee 442 if (len <= 0)
77c3e9a2 443 return log_error_errno(false, errno, "Failed to determine buffer size");
b095a8eb 444
f25a2044 445 value = must_realloc(NULL, len + 1);
fe70edee 446 value[len] = '\0';
c5b8049e 447 ret = lxc_read_from_file(parent_file, value, len);
fe70edee 448 if (ret != len)
77c3e9a2 449 return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);
b095a8eb 450
c5b8049e 451 ret = lxc_write_openat(child_cgroup, file, value, len);
fe70edee 452 if (ret < 0 && errno != EACCES)
77c3e9a2 453 return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
c5b8049e 454 value, child_cgroup, file);
fe70edee 455 return true;
e3a3fecf
SH
456}
457
77c3e9a2 458static inline bool is_unified_hierarchy(const struct hierarchy *h)
c04a6d4e
CB
459{
460 return h->version == CGROUP2_SUPER_MAGIC;
461}
462
f990d3bf
CB
/*
 * Initialize the cpuset hierarchy in first directory of @cgroup_leaf and set
 * cgroup.clone_children so that children inherit settings. Since the
 * h->base_path is populated by init or ourselves, we know it is already
 * initialized.
 *
 * Returns -1 on error, 0 when we didn't create a cgroup, 1 if we created a
 * cgroup.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
					     const char *cgroup_leaf)
{
	__do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
	__do_close int cgroup_fd = -EBADF;
	int fret = -1;
	int ret;
	char v;
	char *leaf, *slash;

	/* Only the legacy cpuset controller needs this treatment. */
	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	if (!cgroup_leaf)
		return ret_set_errno(-1, EINVAL);

	dup = strdup(cgroup_leaf);
	if (!dup)
		return ret_set_errno(-1, ENOMEM);

	parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);

	/* Isolate the first path component of @cgroup_leaf: skip leading
	 * slashes, temporarily terminate at the next slash, build the child
	 * path, then restore the string.
	 */
	leaf = dup;
	leaf += strspn(leaf, "/");
	slash = strchr(leaf, '/');
	if (slash)
		*slash = '\0';
	child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
	if (slash)
		*slash = '/';

	/* Assume we create the directory; downgrade to 0 if it existed. */
	fret = 1;
	ret = mkdir(child_cgroup, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);

		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(child_cgroup);
	if (cgroup_fd < 0)
		return -1;

	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* copy parent's settings */
	if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);

	return fret;
}
542
5c0089ae
CB
543/* Given two null-terminated lists of strings, return true if any string is in
544 * both.
ccb4cabe
SH
545 */
546static bool controller_lists_intersect(char **l1, char **l2)
547{
ccb4cabe
SH
548 if (!l1 || !l2)
549 return false;
550
77c3e9a2 551 for (int i = 0; l1[i]; i++)
ccb4cabe
SH
552 if (string_in_list(l2, l1[i]))
553 return true;
5c0089ae 554
ccb4cabe
SH
555 return false;
556}
557
258449e5
CB
558/* For a null-terminated list of controllers @clist, return true if any of those
559 * controllers is already listed the null-terminated list of hierarchies @hlist.
560 * Realistically, if one is present, all must be present.
ccb4cabe
SH
561 */
562static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
563{
ccb4cabe
SH
564 if (!hlist)
565 return false;
258449e5 566
77c3e9a2 567 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
568 if (controller_lists_intersect(hlist[i]->controllers, clist))
569 return true;
ccb4cabe 570
258449e5 571 return false;
ccb4cabe
SH
572}
573
f57ac67f
CB
574/* Return true if the controller @entry is found in the null-terminated list of
575 * hierarchies @hlist.
ccb4cabe
SH
576 */
577static bool controller_found(struct hierarchy **hlist, char *entry)
578{
ccb4cabe
SH
579 if (!hlist)
580 return false;
581
77c3e9a2 582 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
583 if (string_in_list(hlist[i]->controllers, entry))
584 return true;
d6337a5f 585
ccb4cabe
SH
586 return false;
587}
588
e1c27ab0
CB
589/* Return true if all of the controllers which we require have been found. The
590 * required list is freezer and anything in lxc.cgroup.use.
ccb4cabe 591 */
2202afc9 592static bool all_controllers_found(struct cgroup_ops *ops)
ccb4cabe 593{
77c3e9a2 594 struct hierarchy **hlist;
ccb4cabe 595
2202afc9 596 if (!ops->cgroup_use)
ccb4cabe 597 return true;
c2712f64 598
77c3e9a2
CB
599 hlist = ops->hierarchies;
600 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
601 if (!controller_found(hlist, *cur))
602 return log_error(false, "No %s controller mountpoint found", *cur);
c2712f64 603
ccb4cabe
SH
604 return true;
605}
606
f205f10c
CB
/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	__do_free_string_list char **aret = NULL;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";

	/* Skip the first four space-separated mountinfo fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	/* 15 == strlen(DEFAULT_CGROUP_MOUNTPOINT "/"), i.e. "/sys/fs/cgroup/". */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2)
		return log_error(NULL, "Corrupt mountinfo");
	/* Temporarily terminate the field; restored before returning. */
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	*p2 = ' ';

	return move_ptr(aret);
}
411ac6d8 660
d6337a5f
CB
661static char **cg_unified_make_empty_controller(void)
662{
f761d24d 663 __do_free_string_list char **aret = NULL;
d6337a5f 664 int newentry;
d6337a5f
CB
665
666 newentry = append_null_to_list((void ***)&aret);
667 aret[newentry] = NULL;
f761d24d 668 return move_ptr(aret);
d6337a5f
CB
669}
670
d23cb29e 671static char **cg_unified_get_controllers(int dfd, const char *file)
d6337a5f 672{
d97919ab 673 __do_free char *buf = NULL;
f761d24d 674 __do_free_string_list char **aret = NULL;
0be0d78f 675 char *sep = " \t\n";
2a63b5cb 676 char *tok;
d6337a5f 677
46bf13b7 678 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
d6337a5f 679 if (!buf)
411ac6d8 680 return NULL;
6328fd9c 681
0be0d78f 682 lxc_iterate_parts(tok, buf, sep) {
d6337a5f
CB
683 int newentry;
684 char *copy;
685
686 newentry = append_null_to_list((void ***)&aret);
687 copy = must_copy_string(tok);
688 aret[newentry] = copy;
ccb4cabe
SH
689 }
690
f761d24d 691 return move_ptr(aret);
ccb4cabe
SH
692}
693
2202afc9 694static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
bb221ad1 695 char *container_base_path, int type)
ccb4cabe
SH
696{
697 struct hierarchy *new;
698 int newentry;
699
1973b62a 700 new = zalloc(sizeof(*new));
6e214b74
CB
701 if (!new)
702 return ret_set_errno(NULL, ENOMEM);
ccb4cabe
SH
703 new->controllers = clist;
704 new->mountpoint = mountpoint;
bb221ad1 705 new->container_base_path = container_base_path;
d6337a5f 706 new->version = type;
1973b62a 707 new->cgfd_con = -EBADF;
a900cbaf 708 new->cgfd_limit = -EBADF;
1973b62a 709 new->cgfd_mon = -EBADF;
6328fd9c 710
2202afc9
CB
711 newentry = append_null_to_list((void ***)h);
712 (*h)[newentry] = new;
d6337a5f 713 return new;
ccb4cabe
SH
714}
715
798c3b33
CB
/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo. Returns NULL unless the mountpoint lives under
 * /sys/fs/cgroup/.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	char *p = line, *sret = NULL;
	size_t len;
	char *p2;

	/* Skip the first four space-separated mountinfo fields; the fifth
	 * is the mountpoint.
	 */
	for (int i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* 15 == strlen(DEFAULT_CGROUP_MOUNTPOINT "/"). */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;
	*p2 = '\0';

	len = strlen(p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}
747
f523291e 748/* Given a multi-line string, return a null-terminated copy of the current line. */
ccb4cabe
SH
749static char *copy_to_eol(char *p)
750{
77c3e9a2 751 char *p2, *sret;
ccb4cabe
SH
752 size_t len;
753
77c3e9a2 754 p2 = strchr(p, '\n');
ccb4cabe
SH
755 if (!p2)
756 return NULL;
757
758 len = p2 - p;
f25a2044 759 sret = must_realloc(NULL, len + 1);
ccb4cabe
SH
760 memcpy(sret, p, len);
761 sret[len] = '\0';
77c3e9a2 762
ccb4cabe
SH
763 return sret;
764}
765
bced39de
CB
766/* cgline: pointer to character after the first ':' in a line in a \n-terminated
767 * /proc/self/cgroup file. Check whether controller c is present.
ccb4cabe
SH
768 */
769static bool controller_in_clist(char *cgline, char *c)
770{
d97919ab
CB
771 __do_free char *tmp = NULL;
772 char *tok, *eol;
ccb4cabe
SH
773 size_t len;
774
235f1815 775 eol = strchr(cgline, ':');
ccb4cabe
SH
776 if (!eol)
777 return false;
778
779 len = eol - cgline;
861cb8c2 780 tmp = must_realloc(NULL, len + 1);
ccb4cabe
SH
781 memcpy(tmp, cgline, len);
782 tmp[len] = '\0';
783
d97919ab 784 lxc_iterate_parts(tok, tmp, ",")
8b99a20a 785 if (strequal(tok, c))
ccb4cabe 786 return true;
d6337a5f 787
ccb4cabe
SH
788 return false;
789}
790
c3ef912e
CB
791/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
792 * @controller.
ccb4cabe 793 */
c3ef912e
CB
794static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
795 int type)
ccb4cabe
SH
796{
797 char *p = basecginfo;
6328fd9c 798
d6337a5f
CB
799 for (;;) {
800 bool is_cgv2_base_cgroup = false;
801
6328fd9c 802 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
803 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
804 is_cgv2_base_cgroup = true;
ccb4cabe 805
235f1815 806 p = strchr(p, ':');
ccb4cabe
SH
807 if (!p)
808 return NULL;
809 p++;
d6337a5f
CB
810
811 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 812 p = strchr(p, ':');
ccb4cabe
SH
813 if (!p)
814 return NULL;
815 p++;
816 return copy_to_eol(p);
817 }
818
235f1815 819 p = strchr(p, '\n');
ccb4cabe
SH
820 if (!p)
821 return NULL;
822 p++;
823 }
824}
825
ccb4cabe
SH
826static void must_append_string(char ***list, char *entry)
827{
6dfb18bf 828 int newentry;
ccb4cabe
SH
829 char *copy;
830
6dfb18bf 831 newentry = append_null_to_list((void ***)list);
ccb4cabe
SH
832 copy = must_copy_string(entry);
833 (*list)[newentry] = copy;
834}
835
d6337a5f 836static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 837{
d97919ab
CB
838 __do_free char *line = NULL;
839 __do_fclose FILE *f = NULL;
ccb4cabe
SH
840 size_t len = 0;
841
4110345b 842 f = fopen("/proc/self/cgroup", "re");
d6337a5f
CB
843 if (!f)
844 return -1;
845
ccb4cabe 846 while (getline(&line, &len, f) != -1) {
0be0d78f 847 char *p, *p2, *tok;
235f1815 848 p = strchr(line, ':');
ccb4cabe
SH
849 if (!p)
850 continue;
851 p++;
235f1815 852 p2 = strchr(p, ':');
ccb4cabe
SH
853 if (!p2)
854 continue;
855 *p2 = '\0';
ff8d6ee9 856
6328fd9c
CB
857 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
858 * contains an entry of the form:
ff8d6ee9
CB
859 *
860 * 0::/some/path
861 *
6328fd9c 862 * In this case we use "cgroup2" as controller name.
ff8d6ee9 863 */
6328fd9c
CB
864 if ((p2 - p) == 0) {
865 must_append_string(klist, "cgroup2");
ff8d6ee9 866 continue;
6328fd9c 867 }
ff8d6ee9 868
0be0d78f 869 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
870 if (strncmp(tok, "name=", 5) == 0)
871 must_append_string(nlist, tok);
872 else
873 must_append_string(klist, tok);
874 }
875 }
876
d6337a5f 877 return 0;
ccb4cabe
SH
878}
879
d7314671 880static char *trim(char *s)
ccb4cabe 881{
7689dfd7
CB
882 size_t len;
883
884 len = strlen(s);
2c28d76b 885 while ((len > 1) && (s[len - 1] == '\n'))
ccb4cabe 886 s[--len] = '\0';
d7314671
CB
887
888 return s;
ccb4cabe
SH
889}
890
2202afc9 891static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
ccb4cabe
SH
892{
893 int i;
27d84737 894 struct hierarchy **it;
41c33dbe 895
2202afc9
CB
896 if (!ops->hierarchies) {
897 TRACE(" No hierarchies found");
ccb4cabe
SH
898 return;
899 }
27d84737 900
2202afc9
CB
901 TRACE(" Hierarchies:");
902 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
ccb4cabe 903 int j;
27d84737
CB
904 char **cit;
905
bb221ad1 906 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
2202afc9
CB
907 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
908 TRACE(" controllers:");
a7b0cc4c 909 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
2202afc9 910 TRACE(" %d: %s", j, *cit);
ccb4cabe
SH
911 }
912}
41c33dbe 913
a3926f6a
CB
/* Dump the raw /proc/self/cgroup contents plus the parsed kernel and
 * named subsystem lists to the TRACE log for debugging.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}
ccb4cabe 929
59eac805 930static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *container_cgroup)
c71d83e1 931{
2202afc9
CB
932 if (!container_cgroup || !hierarchies)
933 return 0;
d6337a5f 934
8e64b673 935 for (int i = 0; hierarchies[i]; i++) {
2202afc9 936 struct hierarchy *h = hierarchies[i];
77c3e9a2 937 int ret;
d6337a5f 938
a900cbaf 939 if (!h->container_limit_path)
2202afc9
CB
940 continue;
941
a900cbaf 942 ret = lxc_rm_rf(h->container_limit_path);
2202afc9 943 if (ret < 0)
a900cbaf 944 WARN("Failed to destroy \"%s\"", h->container_limit_path);
2202afc9 945
a900cbaf
WB
946 if (h->container_limit_path != h->container_full_path)
947 free_disarm(h->container_limit_path);
77c3e9a2 948 free_disarm(h->container_full_path);
2202afc9 949 }
d6337a5f 950
c71d83e1 951 return 0;
d6337a5f
CB
952}
953
2202afc9
CB
/* Argument bundle handed to userns_exec_1() callbacks so cgroup
 * operations can run with the container's id mappings applied.
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies;	/* hierarchies to operate on */
	const char *container_cgroup;	/* the container's cgroup name */
	struct lxc_conf *conf;		/* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path;	/* NOTE(review): not used by the callbacks visible in this file */
};
d6337a5f 961
de6fe132 962static int cgroup_tree_remove_wrapper(void *data)
2202afc9 963{
2202afc9
CB
964 struct generic_userns_exec_data *arg = data;
965 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
966 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
8e64b673 967 int ret;
d6337a5f 968
8917c382 969 if (!lxc_drop_groups() && errno != EPERM)
b58214ac
CB
970 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
971
2202afc9 972 ret = setresgid(nsgid, nsgid, nsgid);
8e64b673 973 if (ret < 0)
77c3e9a2 974 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
8e64b673 975 (int)nsgid, (int)nsgid, (int)nsgid);
d6337a5f 976
2202afc9 977 ret = setresuid(nsuid, nsuid, nsuid);
8e64b673 978 if (ret < 0)
77c3e9a2 979 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
8e64b673 980 (int)nsuid, (int)nsuid, (int)nsuid);
d6337a5f 981
de6fe132 982 return cgroup_tree_remove(arg->hierarchies, arg->container_cgroup);
d6337a5f
CB
983}
984
434c8e15
CB
985__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
986 struct lxc_handler *handler)
d6337a5f
CB
987{
988 int ret;
bd8ef4e4 989
fc3b9533
CB
990 if (!ops) {
991 ERROR("Called with uninitialized cgroup operations");
992 return;
993 }
fc1c3af9 994
69b4a4bb
CB
995 if (!ops->hierarchies)
996 return;
997
fc3b9533
CB
998 if (!handler) {
999 ERROR("Called with uninitialized handler");
1000 return;
1001 }
fc1c3af9 1002
fc3b9533
CB
1003 if (!handler->conf) {
1004 ERROR("Called with uninitialized conf");
1005 return;
1006 }
fc1c3af9 1007
bf651989 1008#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
31b84c7a 1009 ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
bf651989
CB
1010 if (ret < 0)
1011 WARN("Failed to detach bpf program from cgroup");
1012#endif
1013
bb6dbaf0 1014 if (!lxc_list_empty(&handler->conf->id_map)) {
8e64b673 1015 struct generic_userns_exec_data wrap = {
77c3e9a2
CB
1016 .conf = handler->conf,
1017 .container_cgroup = ops->container_cgroup,
1018 .hierarchies = ops->hierarchies,
1019 .origuid = 0,
8e64b673 1020 };
de6fe132
CB
1021 ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
1022 &wrap, "cgroup_tree_remove_wrapper");
8e64b673 1023 } else {
de6fe132 1024 ret = cgroup_tree_remove(ops->hierarchies, ops->container_cgroup);
ccb4cabe 1025 }
8e64b673 1026 if (ret < 0)
fc3b9533 1027 SYSWARN("Failed to destroy cgroups");
ccb4cabe
SH
1028}
1029
434c8e15
CB
1030__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1031 struct lxc_handler *handler)
1032{
1033 int len;
434c8e15 1034 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1973b62a 1035 const struct lxc_conf *conf;
b376d3d0 1036
fc3b9533
CB
1037 if (!ops) {
1038 ERROR("Called with uninitialized cgroup operations");
1039 return;
1040 }
434c8e15
CB
1041
1042 if (!ops->hierarchies)
1043 return;
1044
fc3b9533
CB
1045 if (!handler) {
1046 ERROR("Called with uninitialized handler");
1047 return;
1048 }
b376d3d0 1049
fc3b9533
CB
1050 if (!handler->conf) {
1051 ERROR("Called with uninitialized conf");
1052 return;
1053 }
1973b62a
CB
1054 conf = handler->conf;
1055
0bba27c1
CB
1056 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1057 if (len < 0)
434c8e15
CB
1058 return;
1059
1060 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1061 __do_free char *pivot_path = NULL;
434c8e15 1062 struct hierarchy *h = ops->hierarchies[i];
77ffeed2 1063 size_t offset;
fe70edee 1064 int ret;
434c8e15
CB
1065
1066 if (!h->monitor_full_path)
1067 continue;
1068
c468e4d4
CB
1069 /* Monitor might have died before we entered the cgroup. */
1070 if (handler->monitor_pid <= 0) {
1071 WARN("No valid monitor process found while destroying cgroups");
8408a9cc 1072 goto try_lxc_rm_rf;
c468e4d4
CB
1073 }
1074
bb6dbaf0 1075 if (conf->cgroup_meta.monitor_pivot_dir)
7696c1f9
RJ
1076 pivot_path = must_make_path(h->mountpoint, h->container_base_path,
1077 conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
bb6dbaf0 1078 else if (conf->cgroup_meta.monitor_dir)
77ffeed2
CB
1079 pivot_path = must_make_path(h->mountpoint, h->container_base_path,
1080 conf->cgroup_meta.monitor_dir, CGROUP_PIVOT, NULL);
bb6dbaf0 1081 else if (conf->cgroup_meta.dir)
77ffeed2
CB
1082 pivot_path = must_make_path(h->mountpoint, h->container_base_path,
1083 conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
1973b62a 1084 else
77ffeed2 1085 pivot_path = must_make_path(h->mountpoint, h->container_base_path,
1973b62a
CB
1086 CGROUP_PIVOT, NULL);
1087
77ffeed2
CB
1088 offset = strlen(h->mountpoint) + strlen(h->container_base_path);
1089
1090 if (cg_legacy_handle_cpuset_hierarchy(h, pivot_path + offset))
1091 SYSWARN("Failed to initialize cpuset %s/" CGROUP_PIVOT, pivot_path);
1092
1973b62a 1093 ret = mkdir_p(pivot_path, 0755);
fc3b9533
CB
1094 if (ret < 0 && errno != EEXIST) {
1095 ERROR("Failed to create %s", pivot_path);
8408a9cc 1096 goto try_lxc_rm_rf;
fc3b9533 1097 }
1973b62a 1098
c468e4d4
CB
1099 ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
1100 if (ret != 0) {
1101 SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
1102 continue;
fc3b9533 1103 }
434c8e15 1104
8408a9cc
CB
1105try_lxc_rm_rf:
1106 ret = lxc_rm_rf(h->monitor_full_path);
434c8e15
CB
1107 if (ret < 0)
1108 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
434c8e15
CB
1109 }
1110}
1111
6099dd5a
CB
1112static int mkdir_eexist_on_last(const char *dir, mode_t mode)
1113{
1114 const char *tmp = dir;
1115 const char *orig = dir;
1116 size_t orig_len;
1117
1118 orig_len = strlen(dir);
1119 do {
6453ba56 1120 __do_free char *makeme = NULL;
6099dd5a
CB
1121 int ret;
1122 size_t cur_len;
6099dd5a
CB
1123
1124 dir = tmp + strspn(tmp, "/");
1125 tmp = dir + strcspn(dir, "/");
1126
6099dd5a
CB
1127 cur_len = dir - orig;
1128 makeme = strndup(orig, cur_len);
1129 if (!makeme)
77c3e9a2 1130 return ret_set_errno(-1, ENOMEM);
6099dd5a
CB
1131
1132 ret = mkdir(makeme, mode);
77c3e9a2 1133 if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
04a49a14 1134 return log_warn_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
6099dd5a
CB
1135 } while (tmp != dir);
1136
1137 return 0;
1138}
1139
432faf20
WB
1140static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
1141 struct hierarchy *h, const char *cgroup_tree,
a900cbaf
WB
1142 const char *cgroup_leaf, bool payload,
1143 const char *cgroup_limit_dir)
72068e74 1144{
432faf20 1145 __do_free char *path = NULL, *limit_path = NULL;
fe70edee 1146 int ret, ret_cpuset;
72068e74 1147
fe70edee
CB
1148 path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
1149 if (dir_exists(path))
1150 return log_warn_errno(false, errno, "The %s cgroup already existed", path);
72068e74 1151
fe70edee
CB
1152 ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
1153 if (ret_cpuset < 0)
1154 return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");
0c3deb94 1155
432faf20
WB
1156 if (payload && cgroup_limit_dir) {
1157 /* with isolation both parts need to not already exist */
1158 limit_path = must_make_path(h->mountpoint,
1159 h->container_base_path,
1160 cgroup_limit_dir, NULL);
1161
1162 ret = mkdir_eexist_on_last(limit_path, 0755);
1163 if (ret < 0)
04a49a14
CB
1164 return log_debug_errno(false,
1165 errno, "Failed to create %s limiting cgroup",
1166 limit_path);
432faf20
WB
1167
1168 h->cgfd_limit = lxc_open_dirfd(limit_path);
1169 if (h->cgfd_limit < 0)
1170 return log_error_errno(false, errno,
1171 "Failed to open %s", path);
1172 h->container_limit_path = move_ptr(limit_path);
1173
1174 /*
1175 * With isolation the devices legacy cgroup needs to be
1176 * iinitialized early, as it typically contains an 'a' (all)
1177 * line, which is not possible once a subdirectory has been
1178 * created.
1179 */
ec4d463d
CB
1180 if (string_in_list(h->controllers, "devices") &&
1181 !ops->setup_limits_legacy(ops, conf, true))
1182 return log_error(false, "Failed to setup legacy device limits");
432faf20
WB
1183 }
1184
fe70edee 1185 ret = mkdir_eexist_on_last(path, 0755);
6099dd5a 1186 if (ret < 0) {
fe70edee
CB
1187 /*
1188 * This is the cpuset controller and
1189 * cg_legacy_handle_cpuset_hierarchy() has created our target
1190 * directory for us to ensure correct initialization.
1191 */
1192 if (ret_cpuset != 1 || cgroup_tree)
04a49a14 1193 return log_debug_errno(false, errno, "Failed to create %s cgroup", path);
6f9584d8 1194 }
0c3deb94 1195
1973b62a
CB
1196 if (payload) {
1197 h->cgfd_con = lxc_open_dirfd(path);
1198 if (h->cgfd_con < 0)
1199 return log_error_errno(false, errno, "Failed to open %s", path);
fe70edee 1200 h->container_full_path = move_ptr(path);
432faf20 1201 if (h->cgfd_limit < 0)
a900cbaf 1202 h->cgfd_limit = h->cgfd_con;
432faf20
WB
1203 if (!h->container_limit_path)
1204 h->container_limit_path = h->container_full_path;
1973b62a
CB
1205 } else {
1206 h->cgfd_mon = lxc_open_dirfd(path);
1207 if (h->cgfd_mon < 0)
1208 return log_error_errno(false, errno, "Failed to open %s", path);
fe70edee 1209 h->monitor_full_path = move_ptr(path);
1973b62a 1210 }
fe70edee 1211
c581d2a6 1212 return true;
ccb4cabe
SH
1213}
1214
de6fe132 1215static void cgroup_tree_leaf_remove(struct hierarchy *h, bool payload)
ccb4cabe 1216{
a900cbaf
WB
1217 __do_free char *full_path = NULL, *__limit_path = NULL;
1218 char *limit_path = NULL;
72068e74 1219
1973b62a 1220 if (payload) {
f62cf1d4 1221 __lxc_unused __do_close int fd = move_fd(h->cgfd_con);
d6bdd182 1222 full_path = move_ptr(h->container_full_path);
a900cbaf
WB
1223 limit_path = move_ptr(h->container_limit_path);
1224 if (limit_path != full_path)
1225 __limit_path = limit_path;
1973b62a 1226 } else {
f62cf1d4 1227 __lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
d6bdd182 1228 full_path = move_ptr(h->monitor_full_path);
1973b62a 1229 }
e56639fb 1230
d6bdd182 1231 if (full_path && rmdir(full_path))
fe70edee 1232 SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
a900cbaf
WB
1233 if (limit_path && rmdir(limit_path))
1234 SYSWARN("Failed to rmdir(\"%s\") cgroup", limit_path);
1235}
1236
1237/*
1238 * Check we have no lxc.cgroup.dir, and that lxc.cgroup.dir.limit_prefix is a
1239 * proper prefix directory of lxc.cgroup.dir.payload.
1240 *
1241 * Returns the prefix length if it is set, otherwise zero on success.
1242 */
1243static bool check_cgroup_dir_config(struct lxc_conf *conf)
1244{
1245 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
1246 *container_dir = conf->cgroup_meta.container_dir,
1247 *namespace_dir = conf->cgroup_meta.namespace_dir;
a900cbaf
WB
1248
1249 /* none of the new options are set, all is fine */
1250 if (!monitor_dir && !container_dir && !namespace_dir)
1251 return true;
1252
1253 /* some are set, make sure lxc.cgroup.dir is not also set*/
1254 if (conf->cgroup_meta.dir)
1255 return log_error_errno(false, EINVAL,
1256 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
1257
1258 /* make sure both monitor and payload are set */
1259 if (!monitor_dir || !container_dir)
1260 return log_error_errno(false, EINVAL,
1261 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
1262
1263 /* namespace_dir may be empty */
1264 return true;
72068e74
CB
1265}
1266
59eac805 1267__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
72068e74 1268{
b3ed2061 1269 __do_free char *monitor_cgroup = NULL, *__cgroup_tree = NULL;
fe70edee
CB
1270 const char *cgroup_tree;
1271 int idx = 0;
1272 int i;
5ce03bc0 1273 size_t len;
a900cbaf 1274 char *suffix = NULL;
0d66e29a 1275 struct lxc_conf *conf;
72068e74 1276
0d66e29a
CB
1277 if (!ops)
1278 return ret_set_errno(false, ENOENT);
e56639fb 1279
69b4a4bb
CB
1280 if (!ops->hierarchies)
1281 return true;
1282
0d66e29a
CB
1283 if (ops->monitor_cgroup)
1284 return ret_set_errno(false, EEXIST);
1285
1286 if (!handler || !handler->conf)
1287 return ret_set_errno(false, EINVAL);
1288
1289 conf = handler->conf;
1290
a900cbaf
WB
1291 if (!check_cgroup_dir_config(conf))
1292 return false;
1293
1294 if (conf->cgroup_meta.monitor_dir) {
1295 cgroup_tree = NULL;
1296 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
1297 } else if (conf->cgroup_meta.dir) {
b3ed2061 1298 cgroup_tree = conf->cgroup_meta.dir;
fe70edee
CB
1299 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1300 DEFAULT_MONITOR_CGROUP_PREFIX,
1301 handler->name,
1302 CGROUP_CREATE_RETRY, NULL);
b3ed2061
CB
1303 } else if (ops->cgroup_pattern) {
1304 __cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
d6bdd182
CB
1305 if (!__cgroup_tree)
1306 return ret_set_errno(false, ENOMEM);
1307
b3ed2061 1308 cgroup_tree = __cgroup_tree;
d6bdd182
CB
1309 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1310 DEFAULT_MONITOR_CGROUP,
b3ed2061
CB
1311 CGROUP_CREATE_RETRY, NULL);
1312 } else {
d6bdd182 1313 cgroup_tree = NULL;
fe70edee
CB
1314 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1315 handler->name,
1316 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1317 }
fe70edee 1318 if (!monitor_cgroup)
0d66e29a 1319 return ret_set_errno(false, ENOMEM);
72068e74 1320
a900cbaf
WB
1321 if (!conf->cgroup_meta.monitor_dir) {
1322 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1323 *suffix = '\0';
1324 }
5ce03bc0 1325 do {
a900cbaf 1326 if (idx && suffix)
fe70edee 1327 sprintf(suffix, "-%d", idx);
72068e74 1328
ebc10afe 1329 for (i = 0; ops->hierarchies[i]; i++) {
432faf20
WB
1330 if (cgroup_tree_create(ops, handler->conf,
1331 ops->hierarchies[i], cgroup_tree,
1332 monitor_cgroup, false, NULL))
fe70edee
CB
1333 continue;
1334
04a49a14 1335 DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
fe70edee 1336 for (int j = 0; j < i; j++)
de6fe132 1337 cgroup_tree_leaf_remove(ops->hierarchies[j], false);
fe70edee
CB
1338
1339 idx++;
1340 break;
5ce03bc0 1341 }
a900cbaf 1342 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
5ce03bc0 1343
a900cbaf 1344 if (idx == 1000 || (!suffix && idx != 0))
04a49a14 1345 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
72068e74 1346
c581d2a6 1347 ops->monitor_cgroup = move_ptr(monitor_cgroup);
6e8703a4 1348 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
ccb4cabe
SH
1349}
1350
fe70edee
CB
1351/*
1352 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
cecad0c1 1353 * next cgroup_pattern-1, -2, ..., -999.
ccb4cabe 1354 */
59eac805 1355__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
ccb4cabe 1356{
a900cbaf
WB
1357 __do_free char *container_cgroup = NULL,
1358 *__cgroup_tree = NULL,
1359 *limiting_cgroup = NULL;
fe70edee 1360 const char *cgroup_tree;
f3839f12 1361 int idx = 0;
fe70edee 1362 int i;
ccb4cabe 1363 size_t len;
a900cbaf 1364 char *suffix = NULL;
f3839f12 1365 struct lxc_conf *conf;
43654d34 1366
f3839f12
CB
1367 if (!ops)
1368 return ret_set_errno(false, ENOENT);
ccb4cabe 1369
69b4a4bb
CB
1370 if (!ops->hierarchies)
1371 return true;
1372
f3839f12
CB
1373 if (ops->container_cgroup)
1374 return ret_set_errno(false, EEXIST);
1375
1376 if (!handler || !handler->conf)
1377 return ret_set_errno(false, EINVAL);
1378
1379 conf = handler->conf;
1380
a900cbaf
WB
1381 if (!check_cgroup_dir_config(conf))
1382 return false;
1383
1384 if (conf->cgroup_meta.container_dir) {
1385 cgroup_tree = NULL;
1386
1387 limiting_cgroup = strdup(conf->cgroup_meta.container_dir);
1388 if (!limiting_cgroup)
1389 return ret_set_errno(false, ENOMEM);
1390
432faf20
WB
1391 if (conf->cgroup_meta.namespace_dir) {
1392 container_cgroup = must_make_path(limiting_cgroup,
1393 conf->cgroup_meta.namespace_dir,
1394 NULL);
1395 } else {
1396 /* explicit paths but without isolation */
1397 container_cgroup = move_ptr(limiting_cgroup);
1398 }
a900cbaf 1399 } else if (conf->cgroup_meta.dir) {
b3ed2061 1400 cgroup_tree = conf->cgroup_meta.dir;
fe70edee
CB
1401 container_cgroup = must_concat(&len, cgroup_tree, "/",
1402 DEFAULT_PAYLOAD_CGROUP_PREFIX,
1403 handler->name,
1404 CGROUP_CREATE_RETRY, NULL);
b3ed2061
CB
1405 } else if (ops->cgroup_pattern) {
1406 __cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
d6bdd182
CB
1407 if (!__cgroup_tree)
1408 return ret_set_errno(false, ENOMEM);
1409
b3ed2061 1410 cgroup_tree = __cgroup_tree;
d6bdd182
CB
1411 container_cgroup = must_concat(&len, cgroup_tree, "/",
1412 DEFAULT_PAYLOAD_CGROUP,
b3ed2061
CB
1413 CGROUP_CREATE_RETRY, NULL);
1414 } else {
d6bdd182 1415 cgroup_tree = NULL;
fe70edee
CB
1416 container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
1417 handler->name,
1418 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1419 }
fe70edee
CB
1420 if (!container_cgroup)
1421 return ret_set_errno(false, ENOMEM);
ccb4cabe 1422
a900cbaf
WB
1423 if (!conf->cgroup_meta.container_dir) {
1424 suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1425 *suffix = '\0';
1426 }
d97919ab 1427 do {
a900cbaf 1428 if (idx && suffix)
fe70edee 1429 sprintf(suffix, "-%d", idx);
bb30b52a 1430
d97919ab 1431 for (i = 0; ops->hierarchies[i]; i++) {
432faf20
WB
1432 if (cgroup_tree_create(ops, handler->conf,
1433 ops->hierarchies[i], cgroup_tree,
a900cbaf
WB
1434 container_cgroup, true,
1435 limiting_cgroup))
fe70edee
CB
1436 continue;
1437
04a49a14 1438 DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
fe70edee 1439 for (int j = 0; j < i; j++)
de6fe132 1440 cgroup_tree_leaf_remove(ops->hierarchies[j], true);
fe70edee
CB
1441
1442 idx++;
1443 break;
66b66624 1444 }
a900cbaf 1445 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
cecad0c1 1446
a900cbaf 1447 if (idx == 1000 || (!suffix && idx != 0))
04a49a14 1448 return log_error_errno(false, ERANGE, "Failed to create container cgroup");
cecad0c1 1449
fe70edee
CB
1450 ops->container_cgroup = move_ptr(container_cgroup);
1451 INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
ccb4cabe 1452 return true;
ccb4cabe
SH
1453}
1454
c581d2a6
CB
1455__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1456 struct lxc_handler *handler)
ccb4cabe 1457{
fdb0b8ab 1458 int monitor_len, transient_len = 0;
c581d2a6
CB
1459 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1460 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1461
797fa65e
CB
1462 if (!ops)
1463 return ret_set_errno(false, ENOENT);
1464
69b4a4bb
CB
1465 if (!ops->hierarchies)
1466 return true;
1467
797fa65e
CB
1468 if (!ops->monitor_cgroup)
1469 return ret_set_errno(false, ENOENT);
1470
1471 if (!handler || !handler->conf)
1472 return ret_set_errno(false, EINVAL);
1473
0bba27c1
CB
1474 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1475 if (monitor_len < 0)
1476 return false;
1477
1478 if (handler->transient_pid > 0) {
1479 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1480 if (transient_len < 0)
1481 return false;
1482 }
ccb4cabe 1483
eeef32bb 1484 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1485 struct hierarchy *h = ops->hierarchies[i];
c581d2a6 1486 int ret;
08768001 1487
1973b62a
CB
1488 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
1489 if (ret)
1490 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
c581d2a6 1491
ebf88e5b
CB
1492 TRACE("Moved monitor into %s cgroup via %d", h->monitor_full_path, h->cgfd_mon);
1493
34683042 1494 if (handler->transient_pid <= 0)
d1ee8719 1495 continue;
c581d2a6 1496
1973b62a
CB
1497 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
1498 if (ret)
1499 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
1500
ebf88e5b
CB
1501 TRACE("Moved transient process into %s cgroup via %d", h->monitor_full_path, h->cgfd_mon);
1502
1973b62a 1503 /*
78eb6aa6 1504 * we don't keep the fds for non-unified hierarchies around
1973b62a 1505 * mainly because we don't make use of them anymore after the
78eb6aa6 1506 * core cgroup setup is done but also because there are quite a
1973b62a
CB
1507 * lot of them.
1508 */
1509 if (!is_unified_hierarchy(h))
1510 close_prot_errno_disarm(h->cgfd_mon);
ccb4cabe 1511 }
c581d2a6 1512 handler->transient_pid = -1;
ccb4cabe
SH
1513
1514 return true;
1515}
1516
c581d2a6
CB
1517__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1518 struct lxc_handler *handler)
eeef32bb 1519{
c581d2a6
CB
1520 int len;
1521 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
eeef32bb 1522
4490328e
CB
1523 if (!ops)
1524 return ret_set_errno(false, ENOENT);
1525
c581d2a6
CB
1526 if (!ops->hierarchies)
1527 return true;
1528
4490328e
CB
1529 if (!ops->container_cgroup)
1530 return ret_set_errno(false, ENOENT);
1531
1532 if (!handler || !handler->conf)
1533 return ret_set_errno(false, EINVAL);
1534
0bba27c1
CB
1535 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1536 if (len < 0)
1537 return false;
c581d2a6
CB
1538
1539 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1540 struct hierarchy *h = ops->hierarchies[i];
c581d2a6
CB
1541 int ret;
1542
b3a42865
CB
1543 if (is_unified_hierarchy(h) &&
1544 (handler->clone_flags & CLONE_INTO_CGROUP))
f7176c3e
CB
1545 continue;
1546
1973b62a 1547 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
c581d2a6 1548 if (ret != 0)
1973b62a 1549 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
25db3f94
CB
1550
1551 TRACE("Moved container into %s cgroup via %d", h->container_full_path, h->cgfd_con);
c581d2a6
CB
1552 }
1553
1554 return true;
eeef32bb
CB
1555}
1556
1973b62a
CB
1557static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1558 gid_t chown_gid, mode_t chmod_mode)
6efacf80
CB
1559{
1560 int ret;
1561
1973b62a
CB
1562 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1563 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1564 if (ret < 0)
1565 return log_warn_errno(-1,
1566 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1567 dirfd, path, (int)chown_uid,
1568 (int)chown_gid);
6efacf80 1569
1973b62a
CB
1570 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1571 if (ret < 0)
1572 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1573 dirfd, path, (int)chmod_mode);
6efacf80
CB
1574
1575 return 0;
1576}
1577
1578/* chgrp the container cgroups to container group. We leave
c0888dfe
SH
1579 * the container owner as cgroup owner. So we must make the
1580 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1581 *
1582 * Also chown the tasks and cgroup.procs files. Those may not
1583 * exist depending on kernel version.
c0888dfe 1584 */
ccb4cabe
SH
1585static int chown_cgroup_wrapper(void *data)
1586{
6a720d74 1587 int ret;
4160c3a0
CB
1588 uid_t destuid;
1589 struct generic_userns_exec_data *arg = data;
1590 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1591 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1592
8917c382 1593 if (!lxc_drop_groups() && errno != EPERM)
b58214ac
CB
1594 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1595
6efacf80 1596 ret = setresgid(nsgid, nsgid, nsgid);
803e4123 1597 if (ret < 0)
77c3e9a2 1598 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
803e4123 1599 (int)nsgid, (int)nsgid, (int)nsgid);
6efacf80
CB
1600
1601 ret = setresuid(nsuid, nsuid, nsuid);
803e4123 1602 if (ret < 0)
77c3e9a2 1603 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
803e4123 1604 (int)nsuid, (int)nsuid, (int)nsuid);
6efacf80 1605
ccb4cabe 1606 destuid = get_ns_uid(arg->origuid);
b962868f
CB
1607 if (destuid == LXC_INVALID_UID)
1608 destuid = 0;
ccb4cabe 1609
6a720d74 1610 for (int i = 0; arg->hierarchies[i]; i++) {
1973b62a 1611 int dirfd = arg->hierarchies[i]->cgfd_con;
43647298 1612
1973b62a 1613 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
c0888dfe 1614
1973b62a
CB
1615 /*
1616 * Failures to chown() these are inconvenient but not
6efacf80
CB
1617 * detrimental We leave these owned by the container launcher,
1618 * so that container root can write to the files to attach. We
1619 * chmod() them 664 so that container systemd can write to the
1620 * files (which systemd in wily insists on doing).
ab8f5424 1621 */
6efacf80 1622
1973b62a
CB
1623 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
1624 (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
43647298 1625
1973b62a 1626 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
0e17357c 1627
2202afc9 1628 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1629 continue;
1630
1973b62a
CB
1631 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
1632 (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
ccb4cabe
SH
1633 }
1634
1635 return 0;
1636}
1637
b857f4be 1638__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
c98bbf71 1639 struct lxc_conf *conf)
ccb4cabe 1640{
4160c3a0 1641 struct generic_userns_exec_data wrap;
ccb4cabe 1642
c98bbf71
CB
1643 if (!ops)
1644 return ret_set_errno(false, ENOENT);
ccb4cabe 1645
69b4a4bb
CB
1646 if (!ops->hierarchies)
1647 return true;
1648
c98bbf71
CB
1649 if (!ops->container_cgroup)
1650 return ret_set_errno(false, ENOENT);
1651
1652 if (!conf)
1653 return ret_set_errno(false, EINVAL);
1654
1655 if (lxc_list_empty(&conf->id_map))
1656 return true;
1657
ccb4cabe 1658 wrap.origuid = geteuid();
4160c3a0 1659 wrap.path = NULL;
2202afc9 1660 wrap.hierarchies = ops->hierarchies;
4160c3a0 1661 wrap.conf = conf;
ccb4cabe 1662
c98bbf71
CB
1663 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1664 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1665
1666 return true;
1667}
1668
59eac805 1669__cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
78eb6aa6
CB
1670{
1671 if (!ops)
1672 return;
1673
1674 if (!ops->hierarchies)
1675 return;
1676
1677 for (int i = 0; ops->hierarchies[i]; i++) {
1678 struct hierarchy *h = ops->hierarchies[i];
1679 /*
1680 * we don't keep the fds for non-unified hierarchies around
1681 * mainly because we don't make use of them anymore after the
1682 * core cgroup setup is done but also because there are quite a
1683 * lot of them.
1684 */
1685 if (!is_unified_hierarchy(h))
1686 close_prot_errno_disarm(h->cgfd_con);
1687 }
6dcd6f02
CB
1688
1689 /*
1690 * The checking for freezer support should obviously be done at cgroup
1691 * initialization time but that doesn't work reliable. The freezer
1692 * controller has been demoted (rightly so) to a simple file located in
1693 * each non-root cgroup. At the time when the container is created we
1694 * might still be located in /sys/fs/cgroup and so checking for
1695 * cgroup.freeze won't tell us anything because this file doesn't exist
1696 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1697 * find an already existing cgroup and then check within that cgroup
1698 * for the existence of cgroup.freeze but that will only work on
1699 * systemd based hosts. Other init systems might not manage cgroups and
1700 * so no cgroup will exist. So we defer until we have created cgroups
1701 * for our container which means we check here.
1702 */
1703 if (pure_unified_layout(ops) &&
1704 !faccessat(ops->unified->cgfd_con, "cgroup.freeze", F_OK,
1705 AT_SYMLINK_NOFOLLOW)) {
1706 TRACE("Unified hierarchy supports freezer");
1707 ops->unified->freezer_controller = 1;
1708 }
78eb6aa6
CB
1709}
1710
8aa1044f 1711/* cgroup-full:* is done, no need to create subdirs */
77c3e9a2 1712static inline bool cg_mount_needs_subdirs(int type)
8aa1044f 1713{
77c3e9a2 1714 return !(type >= LXC_AUTO_CGROUP_FULL_RO);
8aa1044f
SH
1715}
1716
886cac86
CB
1717/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1718 * remount controller ro if needed and bindmount the cgroupfs onto
25fa6f8c 1719 * control/the/cg/path.
8aa1044f 1720 */
6812d833
CB
1721static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1722 char *controllerpath, char *cgpath,
1723 const char *container_cgroup)
8aa1044f 1724{
d97919ab 1725 __do_free char *sourcepath = NULL;
5285689c 1726 int ret, remount_flags;
886cac86
CB
1727 int flags = MS_BIND;
1728
8aa1044f 1729 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
886cac86 1730 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
77c3e9a2
CB
1731 if (ret < 0)
1732 return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
1733 controllerpath, controllerpath);
886cac86 1734
5285689c
CB
1735 remount_flags = add_required_remount_flags(controllerpath,
1736 controllerpath,
1737 flags | MS_REMOUNT);
886cac86 1738 ret = mount(controllerpath, controllerpath, "cgroup",
8186c5c7
CB
1739 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1740 NULL);
77c3e9a2
CB
1741 if (ret < 0)
1742 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);
886cac86 1743
8aa1044f
SH
1744 INFO("Remounted %s read-only", controllerpath);
1745 }
886cac86 1746
bb221ad1 1747 sourcepath = must_make_path(h->mountpoint, h->container_base_path,
886cac86 1748 container_cgroup, NULL);
8aa1044f
SH
1749 if (type == LXC_AUTO_CGROUP_RO)
1750 flags |= MS_RDONLY;
886cac86
CB
1751
1752 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
77c3e9a2
CB
1753 if (ret < 0)
1754 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
1755 h->controllers[0], cgpath);
886cac86 1756 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
f8c40ffa
L
1757
1758 if (flags & MS_RDONLY) {
5285689c
CB
1759 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1760 flags | MS_REMOUNT);
1761 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
77c3e9a2
CB
1762 if (ret < 0)
1763 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
5285689c 1764 INFO("Remounted %s read-only", cgpath);
f8c40ffa
L
1765 }
1766
886cac86 1767 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
8aa1044f
SH
1768 return 0;
1769}
1770
6812d833
CB
1771/* __cg_mount_direct
1772 *
1773 * Mount cgroup hierarchies directly without using bind-mounts. The main
1774 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1775 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1776 */
1777static int __cg_mount_direct(int type, struct hierarchy *h,
02efd041
CB
1778 struct lxc_rootfs *rootfs,
1779 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
b635e92d 1780{
a099c5db
CB
1781 __do_close int fd_fs = -EBADF;
1782 unsigned int flags = 0;
02efd041
CB
1783 char *fstype;
1784 int ret;
1785
1786 if (dfd_mnt_cgroupfs < 0)
1787 return ret_errno(EINVAL);
1788
a099c5db
CB
1789 flags |= MOUNT_ATTR_NOSUID;
1790 flags |= MOUNT_ATTR_NOEXEC;
1791 flags |= MOUNT_ATTR_NODEV;
1792 flags |= MOUNT_ATTR_RELATIME;
02efd041
CB
1793
1794 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
a099c5db 1795 flags |= MOUNT_ATTR_RDONLY;
02efd041
CB
1796
1797 if (is_unified_hierarchy(h)) {
1798 fstype = "cgroup2";
1799 } else {
1800 fstype = "cgroup";
b635e92d
CB
1801 }
1802
de7f9f33 1803 if (can_use_mount_api()) {
635e7bac
CB
1804 fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
1805 if (fd_fs < 0)
1806 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
1807
1808 if (!is_unified_hierarchy(h)) {
1809 for (const char **it = (const char **)h->controllers; it && *it; it++) {
1810 if (strncmp(*it, "name=", STRLITERALLEN("name=")) == 0)
1811 ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
1812 else
1813 ret = fs_set_property(fd_fs, *it, "");
1814 if (ret < 0)
1815 return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
1816 }
1817 }
1818
1819 ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
1820 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
1821 flags);
1822 } else {
a099c5db
CB
1823 __do_free char *controllers = NULL, *target = NULL;
1824 unsigned int old_flags = 0;
02efd041
CB
1825 const char *rootfs_mnt;
1826
a099c5db
CB
1827 if (!is_unified_hierarchy(h)) {
1828 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1829 if (!controllers)
1830 return ret_errno(ENOMEM);
1831 }
1832
02efd041 1833 rootfs_mnt = get_rootfs_mnt(rootfs);
a099c5db
CB
1834 ret = mnt_attributes_old(flags, &old_flags);
1835 if (ret)
1836 return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");
1837
02efd041 1838 target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
a099c5db 1839 ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
02efd041 1840 }
77c3e9a2 1841 if (ret < 0)
02efd041
CB
1842 return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
1843 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
b635e92d 1844
02efd041
CB
1845 DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
1846 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
b635e92d
CB
1847 return 0;
1848}
1849
6812d833 1850static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
02efd041
CB
1851 struct lxc_rootfs *rootfs,
1852 int dfd_mnt_cgroupfs,
1853 const char *hierarchy_mnt)
6812d833 1854{
02efd041 1855 return __cg_mount_direct(type, h, rootfs, dfd_mnt_cgroupfs, hierarchy_mnt);
6812d833
CB
1856}
1857
1858static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
02efd041
CB
1859 struct lxc_rootfs *rootfs,
1860 int dfd_mnt_cgroupfs,
1861 const char *hierarchy_mnt)
6812d833
CB
1862{
1863 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1864 return 0;
1865
02efd041 1866 return __cg_mount_direct(type, h, rootfs, dfd_mnt_cgroupfs, hierarchy_mnt);
6812d833
CB
1867}
1868
b857f4be 1869__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
315f8a4e 1870 struct lxc_conf *conf, int type)
ccb4cabe 1871{
23a20dbe 1872 __do_close int dfd_mnt_cgroupfs = -EBADF, fd_fs = -EBADF;
6607d6e9 1873 __do_free char *cgroup_root = NULL;
d7314671 1874 bool has_cgns = false, wants_force_mount = false;
315f8a4e 1875 struct lxc_rootfs *rootfs = &conf->rootfs;
02efd041 1876 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
dfa835ac 1877 int ret;
8aa1044f 1878
9585ccb3
CB
1879 if (!ops)
1880 return ret_set_errno(false, ENOENT);
1881
69b4a4bb
CB
1882 if (!ops->hierarchies)
1883 return true;
1884
315f8a4e 1885 if (!conf)
9585ccb3
CB
1886 return ret_set_errno(false, EINVAL);
1887
8aa1044f
SH
1888 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1889 return true;
1890
3f69fb12
SY
1891 if (type & LXC_AUTO_CGROUP_FORCE) {
1892 type &= ~LXC_AUTO_CGROUP_FORCE;
1893 wants_force_mount = true;
1894 }
b635e92d 1895
4547e73e 1896 if (!wants_force_mount) {
315f8a4e 1897 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
4547e73e
CB
1898
1899 /*
1900 * Most recent distro versions currently have init system that
1901 * do support cgroup2 but do not mount it by default unless
1902 * explicitly told so even if the host is cgroup2 only. That
1903 * means they often will fail to boot. Fix this by pre-mounting
1904 * cgroup2 by default. We will likely need to be doing this a
1905 * few years until all distros have switched over to cgroup2 at
1906 * which point we can safely assume that their init systems
1907 * will mount it themselves.
1908 */
1909 if (pure_unified_layout(ops))
1910 wants_force_mount = true;
3f69fb12 1911 }
8aa1044f 1912
3f69fb12
SY
1913 has_cgns = cgns_supported();
1914 if (has_cgns && !wants_force_mount)
1915 return true;
8aa1044f
SH
1916
1917 if (type == LXC_AUTO_CGROUP_NOSPEC)
1918 type = LXC_AUTO_CGROUP_MIXED;
1919 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1920 type = LXC_AUTO_CGROUP_FULL_MIXED;
1921
02efd041
CB
1922 /* This is really the codepath that we want. */
1923 if (pure_unified_layout(ops)) {
ea57e424 1924 dfd_mnt_cgroupfs = open_at(rootfs->dfd_mnt,
02efd041
CB
1925 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1926 PROTECT_OPATH_DIRECTORY,
1927 PROTECT_LOOKUP_BENEATH_XDEV, 0);
1928 if (dfd_mnt_cgroupfs < 0)
1929 return log_error_errno(-errno, errno, "Failed to open %d(%s)",
ea57e424 1930 rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
02efd041 1931
8d661d38 1932 if (has_cgns && wants_force_mount) {
d7314671
CB
1933 /*
1934 * If cgroup namespaces are supported but the container
8d661d38
CB
1935 * will not have CAP_SYS_ADMIN after it has started we
1936 * need to mount the cgroups manually.
1937 */
02efd041 1938 return cg_mount_in_cgroup_namespace(type, ops->unified, rootfs, dfd_mnt_cgroupfs, "") == 0;
8d661d38
CB
1939 }
1940
02efd041 1941 return cg_mount_cgroup_full(type, ops->unified, rootfs, dfd_mnt_cgroupfs, "") == 0;
8d661d38
CB
1942 }
1943
e6d4df78
CB
1944 /*
1945 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
1946 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
1947 * DEFAULT_CGROUP_MOUNTPOINT define.
1948 */
de7f9f33 1949 if (can_use_mount_api()) {
635e7bac
CB
1950 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1951 if (fd_fs < 0)
1952 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
1953
23a20dbe
CB
1954 ret = fs_set_property(fd_fs, "mode", "0755");
1955 if (ret < 0)
1956 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1957
1958 ret = fs_set_property(fd_fs, "size", "10240k");
1959 if (ret < 0)
1960 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1961
1962 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1963 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
1964 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
1965 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
635e7bac
CB
1966 } else {
1967 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1968 ret = safe_mount(NULL, cgroup_root, "tmpfs",
1969 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1970 "size=10240k,mode=755", rootfs_mnt);
8b1f4dd9 1971 }
3f69fb12 1972 if (ret < 0)
02efd041
CB
1973 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
1974 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
8aa1044f 1975
ea57e424 1976 dfd_mnt_cgroupfs = open_at(rootfs->dfd_mnt,
c689b58a
CB
1977 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1978 PROTECT_OPATH_DIRECTORY,
1979 PROTECT_LOOKUP_BENEATH_XDEV, 0);
1980 if (dfd_mnt_cgroupfs < 0)
1981 return log_error_errno(-errno, errno, "Failed to open %d(%s)",
ea57e424 1982 rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
c689b58a 1983
dfa835ac 1984 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1985 __do_free char *controllerpath = NULL, *path2 = NULL;
2202afc9 1986 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1987 char *controller = strrchr(h->mountpoint, '/');
8aa1044f
SH
1988
1989 if (!controller)
1990 continue;
1991 controller++;
affd10fa 1992
c689b58a 1993 ret = mkdirat(dfd_mnt_cgroupfs, controller, 0000);
d7314671 1994 if (ret < 0)
02efd041 1995 return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_cgroupfs, controller);
b635e92d 1996
3f69fb12 1997 if (has_cgns && wants_force_mount) {
02efd041
CB
1998 /*
1999 * If cgroup namespaces are supported but the container
b635e92d
CB
2000 * will not have CAP_SYS_ADMIN after it has started we
2001 * need to mount the cgroups manually.
2002 */
02efd041 2003 ret = cg_mount_in_cgroup_namespace(type, h, rootfs, dfd_mnt_cgroupfs, controller);
3f69fb12 2004 if (ret < 0)
d7314671 2005 return false;
3f69fb12 2006
b635e92d
CB
2007 continue;
2008 }
2009
02efd041
CB
2010 /* Here is where the ancient kernel section begins. */
2011 ret = cg_mount_cgroup_full(type, h, rootfs, dfd_mnt_cgroupfs, controller);
d97919ab 2012 if (ret < 0)
d7314671 2013 return false;
3f69fb12 2014
d97919ab 2015 if (!cg_mount_needs_subdirs(type))
8aa1044f 2016 continue;
3f69fb12 2017
02efd041
CB
2018 controllerpath = must_make_path(cgroup_root, controller, NULL);
2019 if (dir_exists(controllerpath))
2020 continue;
2021
2022 path2 = must_make_path(controllerpath, h->container_base_path, ops->container_cgroup, NULL);
3f69fb12 2023 ret = mkdir_p(path2, 0755);
d97919ab 2024 if (ret < 0)
d7314671 2025 return false;
2f62fb00 2026
02efd041 2027 ret = cg_legacy_mount_controllers(type, h, controllerpath, path2, ops->container_cgroup);
3f69fb12 2028 if (ret < 0)
d7314671 2029 return false;
8aa1044f 2030 }
8aa1044f 2031
d7314671 2032 return true;
ccb4cabe
SH
2033}
2034
11c23867 2035/* Only root needs to escape to the cgroup of its init. */
ff9edd2d
CB
2036__cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
2037 struct lxc_conf *conf)
ccb4cabe 2038{
52d08ab0
CB
2039 if (!ops)
2040 return ret_set_errno(false, ENOENT);
2041
2042 if (!ops->hierarchies)
2043 return true;
2044
2045 if (!conf)
2046 return ret_set_errno(false, EINVAL);
2047
2048 if (conf->cgroup_meta.relative || geteuid())
ccb4cabe
SH
2049 return true;
2050
779b3d82 2051 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 2052 __do_free char *fullpath = NULL;
52d08ab0 2053 int ret;
11c23867 2054
52d08ab0
CB
2055 fullpath =
2056 must_make_path(ops->hierarchies[i]->mountpoint,
2057 ops->hierarchies[i]->container_base_path,
2058 "cgroup.procs", NULL);
7cea5905 2059 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
52d08ab0 2060 if (ret != 0)
77c3e9a2 2061 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
ccb4cabe
SH
2062 }
2063
6df334d1 2064 return true;
ccb4cabe
SH
2065}
2066
ff9edd2d 2067__cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
36662416 2068{
69b4a4bb
CB
2069 int i = 0;
2070
e3ffb28b
CB
2071 if (!ops)
2072 return ret_set_errno(-1, ENOENT);
2073
69b4a4bb
CB
2074 if (!ops->hierarchies)
2075 return 0;
36662416 2076
69b4a4bb 2077 for (; ops->hierarchies[i]; i++)
36662416
TA
2078 ;
2079
2080 return i;
2081}
2082
ff9edd2d
CB
2083__cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
2084 int n, char ***out)
36662416
TA
2085{
2086 int i;
2087
aa48a34f
CB
2088 if (!ops)
2089 return ret_set_errno(false, ENOENT);
2090
69b4a4bb 2091 if (!ops->hierarchies)
77c3e9a2 2092 return ret_set_errno(false, ENOENT);
69b4a4bb 2093
36662416 2094 /* sanity check n */
6b38e644 2095 for (i = 0; i < n; i++)
2202afc9 2096 if (!ops->hierarchies[i])
aa48a34f 2097 return ret_set_errno(false, ENOENT);
36662416 2098
2202afc9 2099 *out = ops->hierarchies[i]->controllers;
36662416
TA
2100
2101 return true;
2102}
2103
ee3a7775 2104static bool cg_legacy_freeze(struct cgroup_ops *ops)
ccb4cabe 2105{
d6337a5f 2106 struct hierarchy *h;
ccb4cabe 2107
ee3a7775
CB
2108 h = get_hierarchy(ops, "freezer");
2109 if (!h)
d2203230 2110 return ret_set_errno(-1, ENOENT);
81468ea7 2111
c04a6d4e
CB
2112 return lxc_write_openat(h->container_full_path, "freezer.state",
2113 "FROZEN", STRLITERALLEN("FROZEN"));
ee3a7775 2114}
942e193e 2115
/*
 * Mainloop callback watching a cgroup2 "cgroup.events" file for a freezer
 * state change.
 *
 * @fd     - fd of the cgroup.events file registered with the mainloop.
 * @cbdata - the desired state encoded as an int (1 = frozen, 0 = unfrozen).
 *
 * Returns LXC_MAINLOOP_CLOSE once the "frozen <state>" line appears,
 * LXC_MAINLOOP_CONTINUE to keep waiting, LXC_MAINLOOP_ERROR on failure.
 */
static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
				    struct lxc_epoll_descr *descr)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int state = PTR_TO_INT(cbdata);
	/* len is only consumed by getline(); since line starts out NULL,
	 * getline allocates the buffer and sets len itself. */
	size_t len;
	const char *state_string;

	f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
	if (!f)
		return LXC_MAINLOOP_ERROR;

	/* The kernel reports the freezer state as a "frozen 0|1" line. */
	if (state == 1)
		state_string = "frozen 1";
	else
		state_string = "frozen 0";

	/* Compare exactly "frozen " plus the one-digit state. */
	while (getline(&line, &len, f) != -1)
		if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
			return LXC_MAINLOOP_CLOSE;

	/* NOTE(review): f is reopened on every callback invocation, so this
	 * rewind appears redundant — presumably kept so the underlying fd's
	 * offset is reset for the next epoll wakeup; confirm fdopen_at()'s
	 * fd-sharing semantics before removing. */
	rewind(f);

	return LXC_MAINLOOP_CONTINUE;
}
2142
/*
 * Common implementation for freezing/unfreezing on the unified (cgroup2)
 * hierarchy: write @state_string to cgroup.freeze and, when @timeout is
 * non-zero, wait on cgroup.events (via an epoll mainloop) until the kernel
 * reports the requested @state_num or the timeout expires.
 *
 * @epoll_error / @wait_error - caller-supplied messages logged when setting
 * up the mainloop or waiting fails.
 *
 * Returns 0 on success, -1 with errno set on error.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	/* Only set up the event wait machinery when the caller wants to block. */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cgroup.events signals changes via EPOLLPRI, not EPOLLIN. */
		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	/* Trigger the state change; "1" freezes, "0" thaws. */
	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
2191
/* Freeze on the unified hierarchy: write "1" to cgroup.freeze and wait. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
2198
018051e3 2199__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 2200{
81468ea7 2201 if (!ops->hierarchies)
d2203230 2202 return ret_set_errno(-1, ENOENT);
81468ea7 2203
ee3a7775
CB
2204 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2205 return cg_legacy_freeze(ops);
942e193e 2206
018051e3 2207 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
2208}
2209
018051e3 2210static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 2211{
ee3a7775
CB
2212 struct hierarchy *h;
2213
2214 h = get_hierarchy(ops, "freezer");
2215 if (!h)
d2203230 2216 return ret_set_errno(-1, ENOENT);
ee3a7775 2217
c04a6d4e
CB
2218 return lxc_write_openat(h->container_full_path, "freezer.state",
2219 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
2220}
2221
018051e3 2222static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775 2223{
443be565
WB
2224 return cg_unified_freeze_do(ops, timeout, "0", 0,
2225 "Failed to create epoll instance to wait for container unfreeze",
2226 "Failed to wait for container to be unfrozen");
ee3a7775
CB
2227}
2228
018051e3 2229__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2230{
2231 if (!ops->hierarchies)
d2203230 2232 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2233
2234 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2235 return cg_legacy_unfreeze(ops);
2236
018051e3 2237 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2238}
2239
a900cbaf
WB
2240static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2241 const char *controller, bool limiting)
ccb4cabe 2242{
d6337a5f
CB
2243 struct hierarchy *h;
2244
2202afc9 2245 h = get_hierarchy(ops, controller);
6bdf9691 2246 if (!h)
77c3e9a2 2247 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
6bdf9691 2248 controller ? controller : "(null)");
ccb4cabe 2249
a900cbaf
WB
2250 if (limiting)
2251 return h->container_limit_path
2252 ? h->container_limit_path + strlen(h->mountpoint)
2253 : NULL;
2254
6bdf9691
CB
2255 return h->container_full_path
2256 ? h->container_full_path + strlen(h->mountpoint)
2257 : NULL;
371f834d
SH
2258}
2259
a900cbaf
WB
2260__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2261 const char *controller)
2262{
2263 return cgfsng_get_cgroup_do(ops, controller, false);
2264}
2265
2266__cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2267 const char *controller)
2268{
2269 return cgfsng_get_cgroup_do(ops, controller, true);
2270}
2271
c40c8209
CB
2272/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2273 * which must be freed by the caller.
371f834d 2274 */
c40c8209
CB
2275static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2276 const char *inpath,
2277 const char *filename)
371f834d 2278{
371f834d 2279 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2280}
2281
/*
 * Attach @pid to a leaf node of the unified cgroup referred to by
 * @unified_fd. cgroup2 only allows processes in leaf cgroups, so we first
 * try a ".lxc" child, then the cgroup itself, and finally probe ".lxc-1",
 * ".lxc-2", ... (up to 1000) until a writable leaf is found.
 *
 * Returns 0 on success, negative value with errno set on error.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Prefer the ".lxc" child; fall back to the cgroup itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	/* EBUSY: probe numbered ".lxc-<idx>" children until one works. */
	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily terminate at the '/' to mkdir just ".lxc-<idx>". */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			rm = true;

		/* Restore the full ".lxc-<idx>/cgroup.procs" path. */
		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		/* Clean up a directory we created but could not use. */
		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2352
d1783ef4
CB
2353static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2354 int unified_fd, int *sk_fd)
2355{
7d849163
CB
2356 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2357 int target_fds[2];
d1783ef4
CB
2358 ssize_t ret;
2359
2360 /* Create leaf cgroup. */
2361 ret = mkdirat(unified_fd, ".lxc", 0755);
2362 if (ret < 0 && errno != EEXIST)
2363 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2364
7043e2b4 2365 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
7d849163 2366 if (target_fd0 < 0)
d1783ef4 2367 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2368 target_fds[0] = target_fd0;
d1783ef4 2369
7043e2b4 2370 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
7d849163 2371 if (target_fd1 < 0)
49df620b 2372 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2373 target_fds[1] = target_fd1;
49df620b
CB
2374
2375 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
d1783ef4 2376 if (ret <= 0)
49df620b 2377 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
7d849163 2378 target_fd0, target_fd1);
d1783ef4 2379
7d849163 2380 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
d1783ef4
CB
2381}
2382
2383static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2384 int *sk_fd, pid_t pid)
2385{
7d849163
CB
2386 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2387 int target_fds[2];
d1783ef4
CB
2388 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2389 size_t pidstr_len;
2390 ssize_t ret;
2391
49df620b 2392 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
d1783ef4
CB
2393 if (ret <= 0)
2394 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
7d849163
CB
2395 target_fd0 = target_fds[0];
2396 target_fd1 = target_fds[1];
d1783ef4
CB
2397
2398 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2399
7d849163
CB
2400 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2401 if (ret > 0 && ret == pidstr_len)
2402 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2403
49df620b 2404 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
7d849163
CB
2405 if (ret > 0 && ret == pidstr_len)
2406 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
d1783ef4 2407
7d849163
CB
2408 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2409 target_fd0, target_fd1);
d1783ef4
CB
2410}
2411
/* Argument bundle shared by the parent/child wrappers that attach a process
 * to a unified cgroup across a user namespace via userns_exec_minimal(). */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container configuration */
	int unified_fd;			/* fd of the unified cgroup to attach into */
	int sk_pair[2];			/* socketpair: [0] parent end, [1] child end */
	pid_t pid;			/* pid of the process to move */
};
2418
d1783ef4
CB
2419static int cgroup_unified_attach_child_wrapper(void *data)
2420{
2421 struct userns_exec_unified_attach_data *args = data;
2422
2423 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2424 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2425 return ret_errno(EINVAL);
2426
2427 close_prot_errno_disarm(args->sk_pair[0]);
2428 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2429 &args->sk_pair[1]);
2430}
2431
2432static int cgroup_unified_attach_parent_wrapper(void *data)
4b86fefd
CB
2433{
2434 struct userns_exec_unified_attach_data *args = data;
4b86fefd 2435
d1783ef4
CB
2436 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2437 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
4b86fefd
CB
2438 return ret_errno(EINVAL);
2439
d1783ef4
CB
2440 close_prot_errno_disarm(args->sk_pair[1]);
2441 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2442 args->pid);
4b86fefd
CB
2443}
2444
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* First try the command-handler fast path. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	/* Only -ENOCGROUP2 means "no unified support there, fall back". */
	if (ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = must_make_path(h->mountpoint, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	if (!lxc_list_empty(&conf->id_map)) {
		/* Mapped container: perform the attach from inside the user
		 * namespace via the parent/child wrapper pair. */
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2506
/*
 * Attach @pid to the container's cgroup in every known hierarchy.
 * cgroup2 hierarchies go through __cg_unified_attach() (which handles leaf
 * cgroups and user namespaces); legacy hierarchies are handled by writing
 * the pid straight into the monitor-reported cgroup.procs file.
 *
 * Returns true on success, false on error.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* No hierarchies means there is nothing to attach to. */
	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->version == CGROUP2_SUPER_MAGIC) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		/* Legacy hierarchy: ask the monitor for the cgroup path. */
		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		/* not running */
		if (!path)
			return false;

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2552
e2bd2b13
CB
2553/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2554 * don't have a cgroup_data set up, so we ask the running container through the
2555 * commands API for the cgroup path.
ccb4cabe 2556 */
b857f4be 2557__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2558 char *value, size_t len, const char *name,
2559 const char *lxcpath)
ccb4cabe 2560{
d97919ab 2561 __do_free char *path = NULL;
88396101 2562 __do_free char *controller = NULL;
d97919ab 2563 char *p;
0069cc61 2564 struct hierarchy *h;
861cb8c2 2565 int ret = -1;
ccb4cabe 2566
a358028a
CB
2567 if (!ops)
2568 return ret_set_errno(-1, ENOENT);
2569
861cb8c2 2570 controller = must_copy_string(filename);
0069cc61
CB
2571 p = strchr(controller, '.');
2572 if (p)
ccb4cabe
SH
2573 *p = '\0';
2574
a900cbaf 2575 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
0069cc61
CB
2576 /* not running */
2577 if (!path)
ccb4cabe
SH
2578 return -1;
2579
2202afc9 2580 h = get_hierarchy(ops, controller);
ccb4cabe 2581 if (h) {
88396101 2582 __do_free char *fullpath = NULL;
0069cc61
CB
2583
2584 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2585 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2586 }
ccb4cabe
SH
2587
2588 return ret;
2589}
2590
cb3fc90c
CB
2591static int device_cgroup_parse_access(struct device_item *device, const char *val)
2592{
2593 for (int count = 0; count < 3; count++, val++) {
2594 switch (*val) {
2595 case 'r':
2596 device->access[count] = *val;
2597 break;
2598 case 'w':
2599 device->access[count] = *val;
2600 break;
2601 case 'm':
2602 device->access[count] = *val;
2603 break;
2604 case '\n':
2605 case '\0':
2606 count = 3;
2607 break;
2608 default:
2609 return ret_errno(EINVAL);
2610 }
2611 }
2612
2613 return 0;
2614}
2615
/*
 * Parse a devices-cgroup rule of the form "<type> <major>:<minor> <access>"
 * (e.g. "c 1:3 rwm") or the global rule "a" into @device.
 *
 * @key selects allow vs. deny ("devices.allow" / "devices.deny").
 * Returns 0 on success, -1 (or -EINVAL from the access parser) on a
 * malformed rule.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strequal("devices.allow", key))
		device->allow = 1;
	else
		device->allow = 0;

	if (strequal(val, "a")) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		/* A global "allow" clears the denylist and vice versa. */
		device->global_rule = device->allow
					  ? LXC_BPF_DEVICE_CGROUP_DENYLIST
					  : LXC_BPF_DEVICE_CGROUP_ALLOWLIST;
		device->allow = -1;
		return 0;
	}

	/* local rule */
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	/* Device type: 'a' (all), 'b' (block), or 'c' (char). */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* Exactly one space must separate type and major. */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* Major number: '*' means "any" (-1), otherwise a decimal integer. */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		/* Copy the digit run into temp (temp stays NUL-terminated). */
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* The remainder (past the space) is the access specifier. */
	return device_cgroup_parse_access(device, ++val);
}
2702
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 *
 * @key is a "<controller>.<file>" cgroup knob (e.g. "memory.max");
 * @value is written verbatim. On a pure cgroup2 layout, "devices.*" keys are
 * translated into BPF device-cgroup rules instead of file writes.
 * Returns 0 on success, negative value on error.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops || is_empty_string(key) || is_empty_string(value) ||
	    is_empty_string(name) || is_empty_string(lxcpath))
		return ret_errno(EINVAL);

	/* The controller name is everything before the first '.'. */
	controller = must_copy_string(key);
	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/* cgroup2 has no devices controller; device rules go through BPF. */
	if (pure_unified_layout(ops) && strequal(controller, "devices")) {
		struct device_item device = {};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2756
91d1a13a 2757/* take devices cgroup line
72add155
SH
2758 * /dev/foo rwx
2759 * and convert it to a valid
2760 * type major:minor mode
91d1a13a
CB
2761 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2762 * the output.
72add155 2763 */
cb3fc90c
CB
2764static int device_cgroup_rule_parse_devpath(struct device_item *device,
2765 const char *devpath)
72add155 2766{
88396101 2767 __do_free char *path = NULL;
2a06d041 2768 char *mode = NULL;
cb3fc90c
CB
2769 int n_parts, ret;
2770 char *p;
2771 struct stat sb;
72add155 2772
cb3fc90c 2773 path = must_copy_string(devpath);
72add155 2774
cb3fc90c
CB
2775 /*
2776 * Read path followed by mode. Ignore any trailing text.
91d1a13a
CB
2777 * A ' # comment' would be legal. Technically other text is not
2778 * legal, we could check for that if we cared to.
72add155 2779 */
0dbdb99e 2780 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2781 if (*p != ' ')
2782 continue;
2783 *p = '\0';
91d1a13a 2784
2c2d6c49
SH
2785 if (n_parts != 1)
2786 break;
2787 p++;
2788 n_parts++;
91d1a13a 2789
2c2d6c49
SH
2790 while (*p == ' ')
2791 p++;
91d1a13a 2792
2c2d6c49 2793 mode = p;
91d1a13a 2794
2c2d6c49 2795 if (*p == '\0')
cb3fc90c 2796 return ret_set_errno(-1, EINVAL);
72add155 2797 }
2c2d6c49 2798
83b25c4d
CB
2799 if (!mode)
2800 return ret_errno(EINVAL);
2801
cb3fc90c
CB
2802 if (device_cgroup_parse_access(device, mode) < 0)
2803 return -1;
2804
72add155
SH
2805 ret = stat(path, &sb);
2806 if (ret < 0)
cb3fc90c 2807 return ret_set_errno(-1, errno);
72add155 2808
72add155
SH
2809 mode_t m = sb.st_mode & S_IFMT;
2810 switch (m) {
2811 case S_IFBLK:
cb3fc90c 2812 device->type = 'b';
72add155
SH
2813 break;
2814 case S_IFCHR:
cb3fc90c 2815 device->type = 'c';
72add155 2816 break;
2c2d6c49 2817 default:
77c3e9a2 2818 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
72add155 2819 }
2c2d6c49 2820
cb3fc90c
CB
2821 device->major = MAJOR(sb.st_rdev);
2822 device->minor = MINOR(sb.st_rdev);
2823 device->allow = 1;
2824 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
72add155 2825
cb3fc90c
CB
2826 return 0;
2827}
2828
2829static int convert_devpath(const char *invalue, char *dest)
2830{
50329f28 2831 struct device_item device = {};
cb3fc90c
CB
2832 int ret;
2833
2834 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2835 if (ret < 0)
2836 return -1;
2837
0bba27c1
CB
2838 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2839 device.minor, device.access);
2840 if (ret < 0)
2841 return log_error_errno(ret, -ret,
2842 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2843 device.type, device.major, device.minor,
2844 device.access);
cb3fc90c
CB
2845
2846 return 0;
72add155
SH
2847}
2848
90e97284
CB
2849/* Called from setup_limits - here we have the container's cgroup_data because
2850 * we created the cgroups.
ccb4cabe 2851 */
2202afc9 2852static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
a900cbaf 2853 const char *value, bool is_cpuset)
ccb4cabe 2854{
88396101 2855 __do_free char *controller = NULL;
d97919ab 2856 char *p;
1a0e70ac
CB
2857 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2858 char converted_value[50];
b3646d7e 2859 struct hierarchy *h;
64e82f8b 2860
861cb8c2 2861 controller = must_copy_string(filename);
ab1a6cac
CB
2862 p = strchr(controller, '.');
2863 if (p)
ccb4cabe
SH
2864 *p = '\0';
2865
8b99a20a 2866 if (strequal("devices.allow", filename) && value[0] == '/') {
c04a6d4e
CB
2867 int ret;
2868
72add155
SH
2869 ret = convert_devpath(value, converted_value);
2870 if (ret < 0)
c8bf519d 2871 return ret;
72add155 2872 value = converted_value;
c8bf519d 2873 }
2874
2202afc9 2875 h = get_hierarchy(ops, controller);
77c3e9a2
CB
2876 if (!h)
2877 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
b3646d7e 2878
a900cbaf
WB
2879 if (is_cpuset) {
2880 int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
2881 if (ret)
2882 return ret;
2883 }
2884 return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
ccb4cabe
SH
2885}
2886
c581d2a6
CB
2887__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2888 struct lxc_conf *conf,
2889 bool do_devices)
ccb4cabe 2890{
d97919ab 2891 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
c581d2a6 2892 struct lxc_list *cgroup_settings = &conf->cgroup;
d97919ab 2893 struct lxc_list *iterator, *next;
ccb4cabe 2894 struct lxc_cgroup *cg;
ccb4cabe
SH
2895 bool ret = false;
2896
92ca7eb5
CB
2897 if (!ops)
2898 return ret_set_errno(false, ENOENT);
2899
2900 if (!conf)
2901 return ret_set_errno(false, EINVAL);
2902
2903 cgroup_settings = &conf->cgroup;
ccb4cabe
SH
2904 if (lxc_list_empty(cgroup_settings))
2905 return true;
2906
69b4a4bb 2907 if (!ops->hierarchies)
92ca7eb5 2908 return ret_set_errno(false, EINVAL);
69b4a4bb 2909
92afbe74 2910 if (pure_unified_layout(ops))
b96aa96f
CB
2911 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2912
ccb4cabe 2913 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2914 if (!sorted_cgroup_settings)
ccb4cabe 2915 return false;
ccb4cabe 2916
ccb4cabe
SH
2917 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2918 cg = iterator->elem;
2919
2920 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
a900cbaf 2921 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strncmp("cpuset", cg->subsystem, 6) == 0)) {
fc3b9533
CB
2922 if (do_devices && (errno == EACCES || errno == EPERM)) {
2923 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2924 continue;
2925 }
2926 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2927 goto out;
ccb4cabe 2928 }
77c3e9a2 2929 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
ccb4cabe 2930 }
ccb4cabe
SH
2931 }
2932
2933 ret = true;
6b38e644 2934 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2935out:
ccb4cabe
SH
2936 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2937 lxc_list_del(iterator);
2938 free(iterator);
2939 }
d97919ab 2940
ccb4cabe
SH
2941 return ret;
2942}
2943
bf651989
CB
2944/*
2945 * Some of the parsing logic comes from the original cgroup device v1
2946 * implementation in the kernel.
2947 */
4bfb655e
CB
/*
 * Parse a single device limit and queue it on the container's bpf device
 * rule list. Some of the parsing logic comes from the original cgroup
 * device v1 implementation in the kernel.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item device_item = {};
	int ret;

	/* A leading '/' means the value is a device-node path. */
	if (strequal("devices.allow", key) && *val == '/')
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);

	if (bpf_list_add_device(conf, &device_item) < 0)
		return -1;
#endif
	return 0;
}
2969
c581d2a6
CB
2970__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2971 struct lxc_handler *handler)
6b38e644 2972{
7e31931f
CB
2973 struct lxc_list *cgroup_settings, *iterator;
2974 struct hierarchy *h;
2975 struct lxc_conf *conf;
6b38e644 2976
7e31931f
CB
2977 if (!ops)
2978 return ret_set_errno(false, ENOENT);
2979
2980 if (!ops->hierarchies)
6b38e644
CB
2981 return true;
2982
7e31931f
CB
2983 if (!ops->container_cgroup)
2984 return ret_set_errno(false, EINVAL);
2985
2986 if (!handler || !handler->conf)
2987 return ret_set_errno(false, EINVAL);
2988 conf = handler->conf;
2989
7e31931f 2990 cgroup_settings = &conf->cgroup2;
0e7a013e
CB
2991 if (lxc_list_empty(cgroup_settings))
2992 return true;
2993
2994 if (!pure_unified_layout(ops))
2995 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
7e31931f
CB
2996
2997 if (!ops->unified)
6b38e644 2998 return false;
7e31931f 2999 h = ops->unified;
6b38e644 3000
bf651989 3001 lxc_list_for_each (iterator, cgroup_settings) {
6b38e644 3002 struct lxc_cgroup *cg = iterator->elem;
c04a6d4e 3003 int ret;
6b38e644 3004
ee9d3ef0
CB
3005 if (strncmp("devices", cg->subsystem, 7) == 0)
3006 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
3007 else
3008 ret = lxc_write_openat(h->container_limit_path, cg->subsystem, cg->value, strlen(cg->value));
3009 if (ret < 0)
3010 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3011
6b38e644
CB
3012 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3013 }
3014
7e31931f 3015 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
6b38e644
CB
3016}
3017
59eac805 3018__cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
bf651989
CB
3019{
3020#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
dcbb9e99 3021 __do_bpf_program_free struct bpf_program *prog = NULL;
bf651989 3022 int ret;
e552bd1a
CB
3023 struct lxc_conf *conf;
3024 struct hierarchy *unified;
2a63b5cb 3025 struct lxc_list *it;
dcbb9e99 3026 struct bpf_program *prog_old;
bf651989 3027
e552bd1a
CB
3028 if (!ops)
3029 return ret_set_errno(false, ENOENT);
3030
3031 if (!ops->hierarchies)
3032 return true;
3033
3034 if (!ops->container_cgroup)
3035 return ret_set_errno(false, EEXIST);
3036
3037 if (!handler || !handler->conf)
3038 return ret_set_errno(false, EINVAL);
3039 conf = handler->conf;
3040
3041 unified = ops->unified;
9994db51
CB
3042 if (!unified || !unified->bpf_device_controller ||
3043 !unified->container_full_path || lxc_list_empty(&conf->devices))
bf651989
CB
3044 return true;
3045
dcbb9e99
CB
3046 prog = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
3047 if (!prog)
77c3e9a2 3048 return log_error_errno(false, ENOMEM, "Failed to create new bpf program");
2a63b5cb 3049
dcbb9e99 3050 ret = bpf_program_init(prog);
bf651989 3051 if (ret)
77c3e9a2 3052 return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");
2a63b5cb
CB
3053
3054 lxc_list_for_each(it, &conf->devices) {
3055 struct device_item *cur = it->elem;
3056
dcbb9e99 3057 ret = bpf_program_append_device(prog, cur);
2a63b5cb 3058 if (ret)
77c3e9a2
CB
3059 return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
3060 cur->type,
3061 cur->major,
3062 cur->minor,
3063 cur->access,
3064 cur->allow,
3065 cur->global_rule);
2a63b5cb 3066 TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
77c3e9a2
CB
3067 cur->type,
3068 cur->major,
3069 cur->minor,
3070 cur->access,
3071 cur->allow,
3072 cur->global_rule);
2a63b5cb
CB
3073 }
3074
dcbb9e99 3075 ret = bpf_program_finalize(prog);
2a63b5cb 3076 if (ret)
77c3e9a2 3077 return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");
bf651989 3078
dcbb9e99 3079 ret = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE,
a900cbaf 3080 unified->container_limit_path,
cce5a3d7
CB
3081 BPF_F_ALLOW_MULTI);
3082 if (ret)
77c3e9a2 3083 return log_error_errno(false, ENOMEM, "Failed to attach bpf program");
cce5a3d7
CB
3084
3085 /* Replace old bpf program. */
dcbb9e99
CB
3086 prog_old = move_ptr(ops->cgroup2_devices);
3087 ops->cgroup2_devices = move_ptr(prog);
3088 prog = move_ptr(prog_old);
bf651989 3089#endif
cce5a3d7 3090 return true;
bf651989
CB
3091}
3092
59eac805 3093static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
6b38e644 3094{
ac01a9b8 3095 __do_close int fd_base = -EBADF;
c581d2a6 3096 __do_free char *add_controllers = NULL, *base_path = NULL;
f761d24d 3097 __do_free_string_list char **parts = NULL;
c581d2a6
CB
3098 struct hierarchy *unified = ops->unified;
3099 ssize_t parts_len;
3100 char **it;
3101 size_t full_len = 0;
6b38e644 3102
c581d2a6
CB
3103 if (!ops->hierarchies || !pure_unified_layout(ops) ||
3104 !unified->controllers[0])
bf651989
CB
3105 return true;
3106
c581d2a6
CB
3107 /* For now we simply enable all controllers that we have detected by
3108 * creating a string like "+memory +pids +cpu +io".
3109 * TODO: In the near future we might want to support "-<controller>"
3110 * etc. but whether supporting semantics like this make sense will need
3111 * some thinking.
3112 */
3113 for (it = unified->controllers; it && *it; it++) {
3114 full_len += strlen(*it) + 2;
3115 add_controllers = must_realloc(add_controllers, full_len + 1);
3116
3117 if (unified->controllers[0] == *it)
3118 add_controllers[0] = '\0';
3119
3120 (void)strlcat(add_controllers, "+", full_len + 1);
3121 (void)strlcat(add_controllers, *it, full_len + 1);
3122
3123 if ((it + 1) && *(it + 1))
3124 (void)strlcat(add_controllers, " ", full_len + 1);
3125 }
3126
ac01a9b8
CB
3127 base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
3128 fd_base = lxc_open_dirfd(base_path);
3129 if (fd_base < 0)
3130 return false;
3131
3132 if (!unified_cgroup_fd(fd_base))
3133 return log_error_errno(false, EINVAL, "File descriptor does not refer to cgroup2 filesystem");
3134
c581d2a6
CB
3135 parts = lxc_string_split(cgroup, '/');
3136 if (!parts)
f761d24d 3137 return false;
c581d2a6
CB
3138
3139 parts_len = lxc_array_len((void **)parts);
3140 if (parts_len > 0)
3141 parts_len--;
3142
c581d2a6
CB
3143 for (ssize_t i = -1; i < parts_len; i++) {
3144 int ret;
c581d2a6 3145
ac01a9b8
CB
3146 if (i >= 0) {
3147 int fd_next;
3148
3149 fd_next = openat(fd_base, parts[i], PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH);
3150 if (fd_next < 0)
3151 return log_error_errno(false, errno, "Failed to open %d(%s)", fd_next, parts[i]);
3152 close_prot_errno_move(fd_base, fd_next);
3153 }
3154
3155 ret = lxc_writeat(fd_base, "cgroup.subtree_control", add_controllers, full_len);
61fbc369 3156 if (ret < 0)
ac01a9b8
CB
3157 return log_error_errno(false, errno,
3158 "Could not enable \"%s\" controllers in the unified cgroup %d(%s)",
3159 add_controllers, fd_base, (i >= 0) ? parts[i] : unified->container_base_path);
3160
3161 TRACE("Enable \"%s\" controllers in the unified cgroup %d(%s)",
3162 add_controllers, fd_base, (i >= 0) ? parts[i] : unified->container_base_path);
c581d2a6
CB
3163 }
3164
f761d24d 3165 return true;
c581d2a6
CB
3166}
3167
59eac805 3168__cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
c581d2a6 3169{
61fbc369
CB
3170 if (!ops)
3171 return ret_set_errno(false, ENOENT);
3172
c581d2a6
CB
3173 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
3174}
3175
59eac805 3176__cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
c581d2a6 3177{
61fbc369
CB
3178 if (!ops)
3179 return ret_set_errno(false, ENOENT);
3180
c581d2a6 3181 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
3182}
3183
b7b18fc5
CB
3184static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
3185 char **controllers)
3186{
b7b18fc5
CB
3187 if (!ops->cgroup_use)
3188 return true;
3189
431e2c54 3190 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
3191 bool found = false;
3192
431e2c54 3193 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
8b99a20a 3194 if (!strequal(*cur_use, *cur_ctrl))
b7b18fc5
CB
3195 continue;
3196
3197 found = true;
3198 break;
3199 }
3200
3201 if (found)
3202 continue;
3203
3204 return false;
3205 }
3206
3207 return true;
3208}
3209
a6ca2ed8
CB
3210static void cg_unified_delegate(char ***delegate)
3211{
d606c4e9 3212 __do_free char *buf = NULL;
a6ca2ed8 3213 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
d606c4e9
CB
3214 char *token;
3215 int idx;
a6ca2ed8 3216
46bf13b7 3217 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
d606c4e9 3218 if (!buf) {
a6ca2ed8
CB
3219 for (char **p = standard; p && *p; p++) {
3220 idx = append_null_to_list((void ***)delegate);
3221 (*delegate)[idx] = must_copy_string(*p);
3222 }
fc3b9533
CB
3223 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
3224 return;
d606c4e9 3225 }
a6ca2ed8 3226
257f04ec 3227 lxc_iterate_parts(token, buf, " \t\n") {
d606c4e9
CB
3228 /*
3229 * We always need to chown this for both cgroup and
3230 * cgroup2.
3231 */
8b99a20a 3232 if (strequal(token, "cgroup.procs"))
d606c4e9
CB
3233 continue;
3234
3235 idx = append_null_to_list((void ***)delegate);
3236 (*delegate)[idx] = must_copy_string(token);
a6ca2ed8
CB
3237 }
3238}
3239
2202afc9
CB
3240/* At startup, parse_hierarchies finds all the info we need about cgroup
3241 * mountpoints and current cgroups, and stores it in @d.
3242 */
341e6516 3243static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
2202afc9 3244{
bbba37f7
CB
3245 __do_free char *basecginfo = NULL, *line = NULL;
3246 __do_free_string_list char **klist = NULL, **nlist = NULL;
d97919ab 3247 __do_fclose FILE *f = NULL;
2202afc9 3248 int ret;
2202afc9 3249 size_t len = 0;
2202afc9
CB
3250
3251 /* Root spawned containers escape the current cgroup, so use init's
3252 * cgroups as our base in that case.
3253 */
9caee129 3254 if (!relative && (geteuid() == 0))
46bf13b7 3255 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
2202afc9 3256 else
46bf13b7 3257 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
2202afc9 3258 if (!basecginfo)
341e6516 3259 return ret_set_errno(-1, ENOMEM);
2202afc9
CB
3260
3261 ret = get_existing_subsystems(&klist, &nlist);
341e6516
CB
3262 if (ret < 0)
3263 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
2202afc9 3264
4110345b 3265 f = fopen("/proc/self/mountinfo", "re");
341e6516
CB
3266 if (!f)
3267 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
2202afc9
CB
3268
3269 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
3270
3271 while (getline(&line, &len, f) != -1) {
bbba37f7
CB
3272 __do_free char *base_cgroup = NULL, *mountpoint = NULL;
3273 __do_free_string_list char **controller_list = NULL;
2202afc9
CB
3274 int type;
3275 bool writeable;
3276 struct hierarchy *new;
2202afc9
CB
3277
3278 type = get_cgroup_version(line);
3279 if (type == 0)
3280 continue;
3281
3282 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
3283 continue;
3284
3285 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
3286 if (type == CGROUP2_SUPER_MAGIC)
3287 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3288 else if (type == CGROUP_SUPER_MAGIC)
3289 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3290 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
3291 if (type == CGROUP_SUPER_MAGIC)
3292 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3293 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3294 if (type == CGROUP2_SUPER_MAGIC)
3295 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3296 }
3297
3298 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
3299 if (!controller_list && type == CGROUP_SUPER_MAGIC)
3300 continue;
3301
3302 if (type == CGROUP_SUPER_MAGIC)
fc3b9533
CB
3303 if (controller_list_is_dup(ops->hierarchies, controller_list)) {
3304 TRACE("Skipping duplicating controller");
3305 continue;
3306 }
2202afc9
CB
3307
3308 mountpoint = cg_hybrid_get_mountpoint(line);
fc3b9533 3309 if (!mountpoint) {
34375fd7 3310 WARN("Failed parsing mountpoint from \"%s\"", line);
fc3b9533
CB
3311 continue;
3312 }
2202afc9
CB
3313
3314 if (type == CGROUP_SUPER_MAGIC)
3315 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3316 else
3317 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
fc3b9533 3318 if (!base_cgroup) {
34375fd7 3319 WARN("Failed to find current cgroup");
fc3b9533
CB
3320 continue;
3321 }
2202afc9
CB
3322
3323 trim(base_cgroup);
3324 prune_init_scope(base_cgroup);
3325 if (type == CGROUP2_SUPER_MAGIC)
3326 writeable = test_writeable_v2(mountpoint, base_cgroup);
3327 else
3328 writeable = test_writeable_v1(mountpoint, base_cgroup);
fc3b9533
CB
3329 if (!writeable) {
3330 TRACE("The %s group is not writeable", base_cgroup);
3331 continue;
3332 }
2202afc9
CB
3333
3334 if (type == CGROUP2_SUPER_MAGIC) {
3335 char *cgv2_ctrl_path;
3336
3337 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
3338 "cgroup.controllers",
3339 NULL);
3340
d23cb29e 3341 controller_list = cg_unified_get_controllers(-EBADF, cgv2_ctrl_path);
2202afc9
CB
3342 free(cgv2_ctrl_path);
3343 if (!controller_list) {
3344 controller_list = cg_unified_make_empty_controller();
3345 TRACE("No controllers are enabled for "
3346 "delegation in the unified hierarchy");
3347 }
3348 }
3349
b7b18fc5 3350 /* Exclude all controllers that cgroup use does not want. */
fc3b9533
CB
3351 if (!cgroup_use_wants_controllers(ops, controller_list)) {
3352 TRACE("Skipping controller");
3353 continue;
3354 }
b7b18fc5 3355
bbba37f7 3356 new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
6e214b74
CB
3357 if (!new)
3358 return log_error_errno(-1, errno, "Failed to add cgroup hierarchy");
a6ca2ed8
CB
3359 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
3360 if (unprivileged)
3361 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3362 ops->unified = new;
a6ca2ed8 3363 }
2202afc9
CB
3364 }
3365
2202afc9
CB
3366 TRACE("Writable cgroup hierarchies:");
3367 lxc_cgfsng_print_hierarchies(ops);
3368
3369 /* verify that all controllers in cgroup.use and all crucial
3370 * controllers are accounted for
3371 */
3372 if (!all_controllers_found(ops))
341e6516 3373 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
2202afc9 3374
341e6516 3375 return 0;
2202afc9
CB
3376}
3377
2202afc9 3378/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 3379static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 3380{
88396101 3381 __do_free char *basecginfo = NULL;
d7314671 3382 char *copy;
d97919ab 3383 char *base_cgroup;
2202afc9 3384
9caee129 3385 if (!relative && (geteuid() == 0))
46bf13b7 3386 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
2202afc9 3387 else
46bf13b7 3388 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
2202afc9
CB
3389 if (!basecginfo)
3390 return NULL;
3391
3392 base_cgroup = strstr(basecginfo, "0::/");
3393 if (!base_cgroup)
d7314671 3394 return NULL;
2202afc9
CB
3395
3396 base_cgroup = base_cgroup + 3;
3397 copy = copy_to_eol(base_cgroup);
3398 if (!copy)
d7314671 3399 return NULL;
2202afc9 3400
d7314671 3401 return trim(copy);
2202afc9
CB
3402}
3403
a6ca2ed8
CB
3404static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3405 bool unprivileged)
2202afc9 3406{
f914ae08
CB
3407 __do_close int cgroup_root_fd = -EBADF;
3408 __do_free char *base_cgroup = NULL, *controllers_path = NULL;
ed75d76e 3409 __do_free_string_list char **delegatable = NULL;
0450b7ce 3410 __do_free struct hierarchy *new = NULL;
2202afc9 3411 int ret;
2202afc9 3412
d47ff01b 3413 ret = unified_cgroup_hierarchy();
2202afc9 3414 if (ret == -ENOMEDIUM)
d2203230 3415 return ret_errno(ENOMEDIUM);
2202afc9
CB
3416
3417 if (ret != CGROUP2_SUPER_MAGIC)
3418 return 0;
3419
9caee129 3420 base_cgroup = cg_unified_get_current_cgroup(relative);
2202afc9 3421 if (!base_cgroup)
d2203230 3422 return ret_errno(EINVAL);
c581d2a6
CB
3423 if (!relative)
3424 prune_init_scope(base_cgroup);
2202afc9 3425
f914ae08
CB
3426 cgroup_root_fd = openat(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3427 O_NOCTTY | O_CLOEXEC | O_NOFOLLOW | O_DIRECTORY);
3428 if (cgroup_root_fd < 0)
3429 return -errno;
3430
d606c4e9
CB
3431 /*
3432 * We assume that the cgroup we're currently in has been delegated to
3433 * us and we are free to further delege all of the controllers listed
3434 * in cgroup.controllers further down the hierarchy.
2202afc9 3435 */
f914ae08
CB
3436 controllers_path = must_make_path_relative(base_cgroup, "cgroup.controllers", NULL);
3437 delegatable = cg_unified_get_controllers(cgroup_root_fd, controllers_path);
2202afc9
CB
3438 if (!delegatable)
3439 delegatable = cg_unified_make_empty_controller();
3440 if (!delegatable[0])
3441 TRACE("No controllers are enabled for delegation");
3442
3443 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3444 * we should verify here. The reason I'm not doing it right is that I'm
3445 * not convinced that lxc.cgroup.use will be the future since it is a
3446 * global property. I much rather have an option that lets you request
3447 * controllers per container.
3448 */
3449
f914ae08 3450 new = add_hierarchy(&ops->hierarchies,
6e214b74 3451 move_ptr(delegatable),
f914ae08
CB
3452 must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
3453 move_ptr(base_cgroup),
3454 CGROUP2_SUPER_MAGIC);
6e214b74
CB
3455 if (!new)
3456 return log_error_errno(-1, errno, "Failed to add unified cgroup hierarchy");
3457
d606c4e9 3458 if (unprivileged)
a6ca2ed8 3459 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 3460
2a63b5cb
CB
3461 if (bpf_devices_cgroup_supported())
3462 new->bpf_device_controller = 1;
3463
2202afc9 3464 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
0450b7ce 3465 ops->unified = move_ptr(new);
77c3e9a2 3466
2202afc9
CB
3467 return CGROUP2_SUPER_MAGIC;
3468}
3469
341e6516 3470static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
3471{
3472 int ret;
3473 const char *tmp;
9caee129 3474 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
3475
3476 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 3477 if (tmp) {
88396101 3478 __do_free char *pin = NULL;
d97919ab 3479 char *chop, *cur;
b7b18fc5
CB
3480
3481 pin = must_copy_string(tmp);
3482 chop = pin;
3483
d97919ab 3484 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 3485 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 3486 }
2202afc9 3487
a6ca2ed8 3488 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9 3489 if (ret < 0)
341e6516 3490 return -1;
2202afc9
CB
3491
3492 if (ret == CGROUP2_SUPER_MAGIC)
341e6516 3493 return 0;
2202afc9 3494
a6ca2ed8 3495 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
3496}
3497
341e6516 3498__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3499{
3500 const char *cgroup_pattern;
3501
341e6516
CB
3502 if (!ops)
3503 return ret_set_errno(-1, ENOENT);
3504
2202afc9
CB
3505 /* copy system-wide cgroup information */
3506 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
8b99a20a 3507 if (cgroup_pattern && !strequal(cgroup_pattern, ""))
b3ed2061 3508 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2202afc9 3509
341e6516 3510 return 0;
2202afc9
CB
3511}
3512
5a087e05 3513struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 3514{
a64edc1c 3515 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9 3516
c5d0238a 3517 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
2202afc9 3518 if (!cgfsng_ops)
341e6516 3519 return ret_set_errno(NULL, ENOMEM);
2202afc9 3520
2202afc9
CB
3521 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3522
341e6516 3523 if (cg_init(cgfsng_ops, conf))
2202afc9 3524 return NULL;
2202afc9 3525
ca76baed
CB
3526 cgfsng_ops->data_init = cgfsng_data_init;
3527 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3528 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3529 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3530 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3531 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3532 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3533 cgfsng_ops->payload_create = cgfsng_payload_create;
3534 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3535 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
ca76baed
CB
3536 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3537 cgfsng_ops->get = cgfsng_get;
3538 cgfsng_ops->set = cgfsng_set;
3539 cgfsng_ops->freeze = cgfsng_freeze;
3540 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3541 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3542 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3543 cgfsng_ops->driver = "cgfsng";
3544 cgfsng_ops->version = "1.0.0";
3545 cgfsng_ops->attach = cgfsng_attach;
3546 cgfsng_ops->chown = cgfsng_chown;
3547 cgfsng_ops->mount = cgfsng_mount;
3548 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3549 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
2202afc9 3550
ff9edd2d
CB
3551 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3552 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3553 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3554
a64edc1c 3555 return move_ptr(cgfsng_ops);
2202afc9 3556}
be835470 3557
029d8e88
CB
3558int cgroup_attach(const struct lxc_conf *conf, const char *name,
3559 const char *lxcpath, pid_t pid)
3560{
3561 __do_close int unified_fd = -EBADF;
3562 int ret;
3563
88c27c53 3564 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
029d8e88
CB
3565 return ret_errno(EINVAL);
3566
3567 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3568 if (unified_fd < 0)
6b55ce0e 3569 return ret_errno(ENOCGROUP2);
029d8e88
CB
3570
3571 if (!lxc_list_empty(&conf->id_map)) {
3572 struct userns_exec_unified_attach_data args = {
3573 .conf = conf,
3574 .unified_fd = unified_fd,
3575 .pid = pid,
3576 };
3577
3578 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3579 if (ret < 0)
3580 return -errno;
3581
3582 ret = userns_exec_minimal(conf,
3583 cgroup_unified_attach_parent_wrapper,
3584 &args,
3585 cgroup_unified_attach_child_wrapper,
3586 &args);
3587 } else {
3588 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3589 }
3590
3591 return ret;
3592}
3593
751a624f 3594/* Connects to command socket therefore isn't callable from command handler. */
bfe2971a 3595int cgroup_get(const char *name, const char *lxcpath,
be835470
CB
3596 const char *filename, char *buf, size_t len)
3597{
3598 __do_close int unified_fd = -EBADF;
3599 ssize_t ret;
3600
bfe2971a 3601 if (is_empty_string(filename) || is_empty_string(name) ||
be835470
CB
3602 is_empty_string(lxcpath))
3603 return ret_errno(EINVAL);
3604
3605 if ((buf && !len) || (len && !buf))
3606 return ret_errno(EINVAL);
3607
ae4fcc7b 3608 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
be835470
CB
3609 if (unified_fd < 0)
3610 return ret_errno(ENOCGROUP2);
3611
3612 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3613 if (ret < 0)
3614 SYSERROR("Failed to read cgroup value");
3615
3616 return ret;
3617}
3618
751a624f 3619/* Connects to command socket therefore isn't callable from command handler. */
bfe2971a 3620int cgroup_set(const char *name, const char *lxcpath,
be835470
CB
3621 const char *filename, const char *value)
3622{
3623 __do_close int unified_fd = -EBADF;
3624 ssize_t ret;
3625
bfe2971a 3626 if (is_empty_string(filename) || is_empty_string(value) ||
be835470
CB
3627 is_empty_string(name) || is_empty_string(lxcpath))
3628 return ret_errno(EINVAL);
3629
ae4fcc7b 3630 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
be835470
CB
3631 if (unified_fd < 0)
3632 return ret_errno(ENOCGROUP2);
3633
3634 if (strncmp(filename, "devices.", STRLITERALLEN("devices.")) == 0) {
3635 struct device_item device = {};
3636
3637 ret = device_cgroup_rule_parse(&device, filename, value);
3638 if (ret < 0)
3639 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3640
3641 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3642 } else {
3643 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3644 }
3645
3646 return ret;
3647}
c8af3332 3648
c9c814f4
CB
3649static int do_cgroup_freeze(int unified_fd,
3650 const char *state_string,
3651 int state_num,
3652 int timeout,
3653 const char *epoll_error,
3654 const char *wait_error)
c8af3332
CB
3655{
3656 __do_close int events_fd = -EBADF;
3657 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3658 int ret;
3659 struct lxc_epoll_descr descr = {};
3660
3661 if (timeout != 0) {
3662 ret = lxc_mainloop_open(&descr);
3663 if (ret)
3664 return log_error_errno(-1, errno, "%s", epoll_error);
3665
3666 /* automatically cleaned up now */
3667 descr_ptr = &descr;
3668
3669 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3670 if (events_fd < 0)
3671 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3672
3673 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3674 if (ret < 0)
3675 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3676 }
3677
3678 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3679 if (ret < 0)
3680 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3681
3682 if (timeout != 0) {
3683 ret = lxc_mainloop(&descr, timeout);
3684 if (ret)
3685 return log_error_errno(-1, errno, "%s", wait_error);
3686 }
3687
3688 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3689}
3690
/* Freeze the cgroup behind @unified_fd, waiting up to @timeout if non-zero. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	static const char *freeze_epoll_error =
		"Failed to create epoll instance to wait for container freeze";
	static const char *freeze_wait_error =
		"Failed to wait for container to be frozen";

	return do_cgroup_freeze(unified_fd, "1", 1, timeout,
				freeze_epoll_error, freeze_wait_error);
}
3697
5ef7547f 3698int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
c8af3332
CB
3699{
3700 __do_close int unified_fd = -EBADF;
3701 int ret;
3702
b57f9b13
CB
3703 if (is_empty_string(name) || is_empty_string(lxcpath))
3704 return ret_errno(EINVAL);
3705
ae4fcc7b 3706 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
c8af3332
CB
3707 if (unified_fd < 0)
3708 return ret_errno(ENOCGROUP2);
3709
3710 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
c9c814f4 3711 ret = __cgroup_freeze(unified_fd, timeout);
c8af3332 3712 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
5ef7547f 3713 return ret;
c8af3332
CB
3714}
3715
/*
 * __cgroup_unfreeze - unfreeze the cgroup behind @unified_fd, waiting up to
 * @timeout if non-zero.
 *
 * Fix: the error strings were copy-pasted from the freeze path and talked
 * about "freeze"/"frozen"; they now describe the unfreeze direction.
 */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3722
5ef7547f 3723int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
c8af3332
CB
3724{
3725 __do_close int unified_fd = -EBADF;
3726 int ret;
3727
b57f9b13
CB
3728 if (is_empty_string(name) || is_empty_string(lxcpath))
3729 return ret_errno(EINVAL);
3730
ae4fcc7b 3731 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
c8af3332
CB
3732 if (unified_fd < 0)
3733 return ret_errno(ENOCGROUP2);
3734
3735 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
c9c814f4 3736 ret = __cgroup_unfreeze(unified_fd, timeout);
c8af3332 3737 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
5ef7547f 3738 return ret;
c8af3332 3739}