git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
d38dd64a
CB
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
a54694f8
CB
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
d38dd64a
CB
22#include <linux/kdev_t.h>
23#include <linux/types.h>
942e193e
CB
24#include <poll.h>
25#include <signal.h>
a54694f8 26#include <stdint.h>
ccb4cabe
SH
27#include <stdio.h>
28#include <stdlib.h>
a54694f8 29#include <string.h>
385e58e8 30#include <sys/epoll.h>
438c4581 31#include <sys/types.h>
d38dd64a 32#include <unistd.h>
c8bf519d 33
d1783ef4 34#include "af_unix.h"
b635e92d 35#include "caps.h"
ccb4cabe 36#include "cgroup.h"
bf651989 37#include "cgroup2_devices.h"
6328fd9c 38#include "cgroup_utils.h"
ccb4cabe 39#include "commands.h"
c8af3332 40#include "commands_utils.h"
43654d34 41#include "conf.h"
d38dd64a 42#include "config.h"
a54694f8 43#include "log.h"
c19ad94b 44#include "macro.h"
018051e3 45#include "mainloop.h"
861cb8c2 46#include "memory_utils.h"
74ed30d7 47#include "mount_utils.h"
43654d34 48#include "storage/storage.h"
600a0163 49#include "string_utils.h"
315f8a4e 50#include "syscall_wrappers.h"
a54694f8 51#include "utils.h"
ccb4cabe 52
64e82f8b
DJ
53#ifndef HAVE_STRLCPY
54#include "include/strlcpy.h"
55#endif
56
3ebe2fbd
DJ
57#ifndef HAVE_STRLCAT
58#include "include/strlcat.h"
59#endif
60
ac2cecc4 61lxc_log_define(cgfsng, cgroup);
ccb4cabe 62
35ec1a38
CB
63/*
64 * Given a pointer to a null-terminated array of pointers, realloc to add one
8b8db2f6
CB
65 * entry, and point the new entry to NULL. Return the index of the
66 * second-to-last entry - that is, the one which is now available for use
67 * (keeping the list null-terminated) - or a negative errno value on failure.
ccb4cabe 68 */
35ec1a38 69static int list_add(void ***list)
ccb4cabe 70{
35ec1a38
CB
71 int idx = 0;
72 void **p;
ccb4cabe
SH
73
74 if (*list)
35ec1a38 75 for (; (*list)[idx]; idx++)
8b8db2f6 76 ;
ccb4cabe 77
35ec1a38
CB
78 p = realloc(*list, (idx + 2) * sizeof(void **));
79 if (!p)
80 return ret_errno(ENOMEM);
81
82 p[idx + 1] = NULL;
83 *list = p;
84
85 return idx;
ccb4cabe
SH
86}
87
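/*
 * A minimal usage sketch (hypothetical caller): grow a NULL-terminated
 * string array by one slot; list_add() keeps the terminator in place.
 *
 *	char **names = NULL;
 *	int idx = list_add((void ***)&names);
 *	if (idx < 0)
 *		return idx;
 *	names[idx] = strdup("memory");
 *
 * After this, names[idx + 1] is already NULL.
 */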
8073018d
CB
88/* Given a null-terminated array of strings, check whether @entry is one of the
89 * strings.
ccb4cabe
SH
90 */
91static bool string_in_list(char **list, const char *entry)
92{
ccb4cabe
SH
93 if (!list)
94 return false;
d6337a5f 95
77c3e9a2 96 for (int i = 0; list[i]; i++)
8b99a20a 97 if (strequal(list[i], entry))
ccb4cabe
SH
98 return true;
99
100 return false;
101}
102
5ae0207c
CB
103/* Given a handler's cgroup data, return the struct hierarchy for the controller
104 * @controller, or NULL if there is none.
ccb4cabe 105 */
59eac805 106static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
ccb4cabe 107{
77c3e9a2
CB
108 if (!ops->hierarchies)
109 return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");
d6337a5f 110
77c3e9a2 111 for (int i = 0; ops->hierarchies[i]; i++) {
27a5132c 112 if (!controller) {
d6337a5f 113 /* This is the empty unified hierarchy. */
09ed8992 114 if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
2202afc9 115 return ops->hierarchies[i];
09ed8992 116
106f1f38 117 continue;
6dcd6f02 118 }
09ed8992 119
6dcd6f02
CB
120 /*
121 * Handle controllers with significant implementation changes
122 * from cgroup to cgroup2.
123 */
124 if (pure_unified_layout(ops)) {
8b99a20a 125 if (strequal(controller, "devices")) {
ca72ccb5 126 if (device_utility_controller(ops->unified))
6dcd6f02
CB
127 return ops->unified;
128
129 break;
8b99a20a 130 } else if (strequal(controller, "freezer")) {
ca72ccb5 131 if (freezer_utility_controller(ops->unified))
6dcd6f02
CB
132 return ops->unified;
133
134 break;
135 }
d6337a5f
CB
136 }
137
27a5132c 138 if (string_in_list(ops->hierarchies[i]->controllers, controller))
2202afc9 139 return ops->hierarchies[i];
ccb4cabe 140 }
d6337a5f 141
27a5132c
CB
142 if (controller)
143 WARN("There is no useable %s controller", controller);
144 else
145 WARN("There is no empty unified cgroup hierarchy");
146
77c3e9a2 147 return ret_set_errno(NULL, ENOENT);
ccb4cabe
SH
148}
149
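/*
 * Example lookups (hypothetical): get_hierarchy(ops, "memory") returns the
 * hierarchy whose controller list contains "memory", and
 * get_hierarchy(ops, NULL) returns the empty unified hierarchy. On a pure
 * cgroup2 host, "devices" and "freezer" resolve to the unified hierarchy
 * only when the corresponding utility controller is available.
 */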
a54694f8
CB
150/* Taken from the kernel sources and modified. */
151#define NBITS 32 /* bits in uint32_t */
152#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
153#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
154
155static void set_bit(unsigned bit, uint32_t *bitarr)
156{
157 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
158}
159
160static void clear_bit(unsigned bit, uint32_t *bitarr)
161{
162 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
163}
164
165static bool is_set(unsigned bit, uint32_t *bitarr)
166{
167 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
168}
169
170/* Create cpumask from cpulist aka turn:
171 *
172 * 0,2-3
173 *
d5d468f6 174 * into bit array
a54694f8
CB
175 *
176 * 1 0 1 1
177 */
178static uint32_t *lxc_cpumask(char *buf, size_t nbits)
179{
77c3e9a2 180 __do_free uint32_t *bitarr = NULL;
a54694f8 181 char *token;
d5d468f6 182 size_t arrlen;
d5d468f6
CB
183
184 arrlen = BITS_TO_LONGS(nbits);
185 bitarr = calloc(arrlen, sizeof(uint32_t));
a54694f8 186 if (!bitarr)
c5b8049e 187 return ret_set_errno(NULL, ENOMEM);
a54694f8 188
0be0d78f 189 lxc_iterate_parts(token, buf, ",") {
a54694f8 190 errno = 0;
d5d468f6
CB
191 unsigned end, start;
192 char *range;
a54694f8 193
d5d468f6
CB
194 start = strtoul(token, NULL, 0);
195 end = start;
196 range = strchr(token, '-');
a54694f8
CB
197 if (range)
198 end = strtoul(range + 1, NULL, 0);
d5d468f6 199
c5b8049e
CB
200 if (!(start <= end))
201 return ret_set_errno(NULL, EINVAL);
a54694f8 202
c5b8049e
CB
203 if (end >= nbits)
204 return ret_set_errno(NULL, EINVAL);
a54694f8
CB
205
206 while (start <= end)
207 set_bit(start++, bitarr);
208 }
209
c5b8049e 210 return move_ptr(bitarr);
a54694f8
CB
211}
212
a54694f8
CB
213/* Turn cpumask into simple, comma-separated cpulist. */
214static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
215{
f761d24d 216 __do_free_string_list char **cpulist = NULL;
c19ad94b 217 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
77c3e9a2 218 int ret;
a54694f8 219
77c3e9a2 220 for (size_t i = 0; i <= nbits; i++) {
414c6719
CB
221 if (!is_set(i, bitarr))
222 continue;
223
0bba27c1
CB
224 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
225 if (ret < 0)
414c6719 226 return NULL;
414c6719
CB
227
228 ret = lxc_append_string(&cpulist, numstr);
f761d24d 229 if (ret < 0)
c5b8049e 230 return ret_set_errno(NULL, ENOMEM);
a54694f8 231 }
414c6719
CB
232
233 if (!cpulist)
c5b8049e 234 return ret_set_errno(NULL, ENOMEM);
414c6719 235
f761d24d 236 return lxc_string_join(",", (const char **)cpulist, false);
a54694f8
CB
237}
238
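/*
 * A minimal round-trip sketch (hypothetical caller): with nbits = 4 the
 * cpulist "0,2-3" becomes a mask with bits 0, 2 and 3 set, and converting
 * it back yields the expanded list "0,2,3".
 *
 *	char buf[] = "0,2-3";
 *	uint32_t *mask = lxc_cpumask(buf, 4);
 *	char *list = mask ? lxc_cpumask_to_cpulist(mask, 4) : NULL;
 */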
239static ssize_t get_max_cpus(char *cpulist)
240{
241 char *c1, *c2;
242 char *maxcpus = cpulist;
243 size_t cpus = 0;
244
245 c1 = strrchr(maxcpus, ',');
246 if (c1)
247 c1++;
248
249 c2 = strrchr(maxcpus, '-');
250 if (c2)
251 c2++;
252
253 if (!c1 && !c2)
254 c1 = maxcpus;
255 else if (c1 > c2)
256 c2 = c1;
257 else if (c1 < c2)
258 c1 = c2;
333987b9 259 else if (!c1 && c2)
a54694f8
CB
260 c1 = c2;
261
a54694f8
CB
262 errno = 0;
263 cpus = strtoul(c1, NULL, 0);
264 if (errno != 0)
265 return -1;
266
267 return cpus;
268}
269
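/*
 * A short sketch of what get_max_cpus() yields (hypothetical inputs): only
 * the last ","- or "-"-delimited token is parsed, so
 *
 *	get_max_cpus("0-3")     -> 3
 *	get_max_cpus("0,2-3,7") -> 7
 *	get_max_cpus("5")       -> 5
 */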
77c3e9a2 270static inline bool is_unified_hierarchy(const struct hierarchy *h)
c04a6d4e 271{
b8572e8c 272 return h->fs_type == UNIFIED_HIERARCHY;
c04a6d4e
CB
273}
274
f57ac67f
CB
275/* Return true if the controller @entry is found in the null-terminated list of
276 * hierarchies @hlist.
ccb4cabe 277 */
c7a1f72a 278static bool controller_available(struct hierarchy **hlist, char *entry)
ccb4cabe 279{
ccb4cabe
SH
280 if (!hlist)
281 return false;
282
77c3e9a2 283 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
284 if (string_in_list(hlist[i]->controllers, entry))
285 return true;
d6337a5f 286
ccb4cabe
SH
287 return false;
288}
289
c7a1f72a 290static bool controllers_available(struct cgroup_ops *ops)
ccb4cabe 291{
77c3e9a2 292 struct hierarchy **hlist;
ccb4cabe 293
2202afc9 294 if (!ops->cgroup_use)
ccb4cabe 295 return true;
c2712f64 296
77c3e9a2
CB
297 hlist = ops->hierarchies;
298 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
c7a1f72a
CB
299 if (!controller_available(hlist, *cur))
300 return log_error(false, "The %s controller was not found", *cur);
c2712f64 301
ccb4cabe
SH
302 return true;
303}
304
63ba9eaf 305static char **list_new(void)
ccb4cabe 306{
63ba9eaf
CB
307 __do_free_string_list char **list = NULL;
308 int idx;
309
310 idx = list_add((void ***)&list);
311 if (idx < 0)
312 return NULL;
a55f31bd 313
63ba9eaf
CB
314 list[idx] = NULL;
315 return move_ptr(list);
35ec1a38 316}
d6337a5f 317
63ba9eaf 318static int list_add_string(char ***list, char *entry)
35ec1a38 319{
63ba9eaf
CB
320 __do_free char *dup = NULL;
321 int idx;
322
323 dup = strdup(entry);
324 if (!dup)
325 return ret_errno(ENOMEM);
326
327 idx = list_add((void ***)list);
328 if (idx < 0)
329 return idx;
330
331 (*list)[idx] = move_ptr(dup);
332 return 0;
333}
334
335static char **list_add_controllers(char *controllers)
336{
337 __do_free_string_list char **list = NULL;
35ec1a38 338 char *it;
6328fd9c 339
35ec1a38 340 lxc_iterate_parts(it, controllers, " \t\n") {
63ba9eaf 341 int ret;
d97919ab 342
63ba9eaf
CB
343 ret = list_add_string(&list, it);
344 if (ret < 0)
d6337a5f 345 return NULL;
411ac6d8 346 }
f205f10c 347
63ba9eaf 348 return move_ptr(list);
d6337a5f
CB
349}
350
35ec1a38 351static char **unified_controllers(int dfd, const char *file)
d6337a5f 352{
d97919ab 353 __do_free char *buf = NULL;
d6337a5f 354
46bf13b7 355 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
d6337a5f 356 if (!buf)
411ac6d8 357 return NULL;
6328fd9c 358
63ba9eaf 359 return list_add_controllers(buf);
ccb4cabe
SH
360}
361
35ec1a38 362static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
060e54d6
CB
363{
364 if (!ops->cgroup_use)
35ec1a38 365 return false;
060e54d6
CB
366
367 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
368 bool found = false;
369
370 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
371 if (!strequal(*cur_use, *cur_ctrl))
372 continue;
373
374 found = true;
375 break;
376 }
377
378 if (found)
379 continue;
380
35ec1a38 381 return true;
060e54d6
CB
382 }
383
35ec1a38 384 return false;
060e54d6
CB
385}
386
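/*
 * A short sketch (hypothetical values): with lxc.cgroup.use = "memory,pids"
 * a hierarchy offering only {"cpu", "cpuacct"} is skipped, while one
 * offering just {"memory"} is kept, since every controller of a kept
 * hierarchy must appear in cgroup_use.
 */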
179754a2
CB
387static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
388 int dfd_base, char *base_cgroup,
b8572e8c 389 char **controllers, cgroupfs_type_magic_t fs_type)
ccb4cabe 390{
600a0163 391 __do_free struct hierarchy *new = NULL;
701be30e 392 int idx;
ccb4cabe 393
35ec1a38 394 if (abspath(base_cgroup))
fc4612cb 395 return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");
060e54d6 396
1973b62a 397 new = zalloc(sizeof(*new));
6e214b74 398 if (!new)
060e54d6 399 return ret_errno(ENOMEM);
c72e7cb5 400
e33870e5 401 new->dfd_con = -EBADF;
c0af7b1c 402 new->dfd_lim = -EBADF;
6a32c817 403 new->dfd_mon = -EBADF;
600a0163 404
44585f1a
CB
405 new->fs_type = fs_type;
406 new->controllers = controllers;
a58be2ad 407 new->at_mnt = mnt;
44585f1a 408 new->at_base = base_cgroup;
35ec1a38 409
44585f1a
CB
410 new->dfd_mnt = dfd_mnt;
411 new->dfd_base = dfd_base;
35ec1a38
CB
412
413 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
414 mnt, maybe_empty(base_cgroup));
060e54d6 415 for (char *const *it = new->controllers; it && *it; it++)
35ec1a38 416 TRACE("The hierarchy contains the %s controller", *it);
6328fd9c 417
35ec1a38 418 idx = list_add((void ***)&ops->hierarchies);
63ba9eaf
CB
419 if (idx < 0)
420 return ret_errno(idx);
421
b8572e8c 422 if (fs_type == UNIFIED_HIERARCHY)
060e54d6 423 ops->unified = new;
701be30e 424 (ops->hierarchies)[idx] = move_ptr(new);
ccb4cabe 425
63ba9eaf 426 return 0;
ccb4cabe
SH
427}
428
c55fe36d 429static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
c71d83e1 430{
c55fe36d 431 if (!path_prune || !hierarchies)
2202afc9 432 return 0;
d6337a5f 433
8e64b673 434 for (int i = 0; hierarchies[i]; i++) {
2202afc9 435 struct hierarchy *h = hierarchies[i];
77c3e9a2 436 int ret;
d6337a5f 437
c55fe36d 438 ret = cgroup_tree_prune(h->dfd_base, path_prune);
2202afc9 439 if (ret < 0)
c55fe36d
CB
440 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
441 else
442 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
2202afc9 443
b1b1a60f 444 free_equal(h->path_lim, h->path_con);
2202afc9 445 }
d6337a5f 446
c71d83e1 447 return 0;
d6337a5f
CB
448}
449
2202afc9
CB
450struct generic_userns_exec_data {
451 struct hierarchy **hierarchies;
c55fe36d 452 const char *path_prune;
2202afc9
CB
453 struct lxc_conf *conf;
454 uid_t origuid; /* target uid in parent namespace */
455 char *path;
456};
d6337a5f 457
de6fe132 458static int cgroup_tree_remove_wrapper(void *data)
2202afc9 459{
2202afc9
CB
460 struct generic_userns_exec_data *arg = data;
461 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
462 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
8e64b673 463 int ret;
d6337a5f 464
8917c382 465 if (!lxc_drop_groups() && errno != EPERM)
b58214ac
CB
466 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
467
2202afc9 468 ret = setresgid(nsgid, nsgid, nsgid);
8e64b673 469 if (ret < 0)
77c3e9a2 470 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
8e64b673 471 (int)nsgid, (int)nsgid, (int)nsgid);
d6337a5f 472
2202afc9 473 ret = setresuid(nsuid, nsuid, nsuid);
8e64b673 474 if (ret < 0)
77c3e9a2 475 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
8e64b673 476 (int)nsuid, (int)nsuid, (int)nsuid);
d6337a5f 477
c55fe36d 478 return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
d6337a5f
CB
479}
480
434c8e15
CB
481__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
482 struct lxc_handler *handler)
d6337a5f
CB
483{
484 int ret;
bd8ef4e4 485
fc3b9533
CB
486 if (!ops) {
487 ERROR("Called with uninitialized cgroup operations");
488 return;
489 }
fc1c3af9 490
69b4a4bb
CB
491 if (!ops->hierarchies)
492 return;
493
fc3b9533
CB
494 if (!handler) {
495 ERROR("Called with uninitialized handler");
496 return;
497 }
fc1c3af9 498
fc3b9533
CB
499 if (!handler->conf) {
500 ERROR("Called with uninitialized conf");
501 return;
502 }
fc1c3af9 503
a6aeb9f1
CB
504 if (!ops->container_limit_cgroup) {
505 WARN("Uninitialized limit cgroup");
506 return;
507 }
508
31b84c7a 509 ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
bf651989
CB
510 if (ret < 0)
511 WARN("Failed to detach bpf program from cgroup");
bf651989 512
bb6dbaf0 513 if (!lxc_list_empty(&handler->conf->id_map)) {
8e64b673 514 struct generic_userns_exec_data wrap = {
77c3e9a2 515 .conf = handler->conf,
c55fe36d 516 .path_prune = ops->container_limit_cgroup,
77c3e9a2
CB
517 .hierarchies = ops->hierarchies,
518 .origuid = 0,
8e64b673 519 };
de6fe132
CB
520 ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
521 &wrap, "cgroup_tree_remove_wrapper");
8e64b673 522 } else {
c55fe36d 523 ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
ccb4cabe 524 }
8e64b673 525 if (ret < 0)
fc3b9533 526 SYSWARN("Failed to destroy cgroups");
ccb4cabe
SH
527}
528
033267c9
CB
529#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
530#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
531static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
532 bool am_initialized)
434c8e15 533{
033267c9
CB
534 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
535 *offlinecpus = NULL, *posscpus = NULL;
536 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
537 *possmask = NULL;
538 int ret;
539 ssize_t i;
540 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
541 bool flipped_bit = false;
b376d3d0 542
033267c9
CB
543 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
544 if (!posscpus)
545 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
546
547 /* Get maximum number of cpus found in possible cpuset. */
548 maxposs = get_max_cpus(posscpus);
549 if (maxposs < 0 || maxposs >= INT_MAX - 1)
550 return false;
551
552 if (file_exists(__ISOL_CPUS)) {
553 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
554 if (!isolcpus)
555 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
556
557 if (isdigit(isolcpus[0])) {
558 /* Get maximum number of cpus found in isolated cpuset. */
559 maxisol = get_max_cpus(isolcpus);
560 if (maxisol < 0 || maxisol >= INT_MAX - 1)
561 return false;
562 }
563
564 if (maxposs < maxisol)
565 maxposs = maxisol;
566 maxposs++;
567 } else {
568 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
fc3b9533 569 }
434c8e15 570
033267c9
CB
571 if (file_exists(__OFFLINE_CPUS)) {
572 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
573 if (!offlinecpus)
574 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
434c8e15 575
033267c9
CB
576 if (isdigit(offlinecpus[0])) {
577 /* Get maximum number of cpus found in offline cpuset. */
578 maxoffline = get_max_cpus(offlinecpus);
579 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
580 return false;
581 }
582
583 if (maxposs < maxoffline)
584 maxposs = maxoffline;
585 maxposs++;
586 } else {
587 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
fc3b9533 588 }
b376d3d0 589
033267c9
CB
590 if ((maxisol == 0) && (maxoffline == 0)) {
591 cpulist = move_ptr(posscpus);
592 goto copy_parent;
fc3b9533 593 }
1973b62a 594
033267c9
CB
595 possmask = lxc_cpumask(posscpus, maxposs);
596 if (!possmask)
597 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
434c8e15 598
033267c9
CB
599 if (maxisol > 0) {
600 isolmask = lxc_cpumask(isolcpus, maxposs);
601 if (!isolmask)
602 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
603 }
434c8e15 604
033267c9
CB
605 if (maxoffline > 0) {
606 offlinemask = lxc_cpumask(offlinecpus, maxposs);
607 if (!offlinemask)
608 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
609 }
610
611 for (i = 0; i <= maxposs; i++) {
612 if ((isolmask && !is_set(i, isolmask)) ||
613 (offlinemask && !is_set(i, offlinemask)) ||
614 !is_set(i, possmask))
434c8e15
CB
615 continue;
616
033267c9
CB
617 flipped_bit = true;
618 clear_bit(i, possmask);
619 }
c468e4d4 620
033267c9
CB
621 if (!flipped_bit) {
622 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
623 TRACE("No isolated or offline cpus present in cpuset");
624 } else {
625 cpulist = move_ptr(posscpus);
626 TRACE("Removed isolated or offline cpus from cpuset");
627 }
628 if (!cpulist)
629 return log_error_errno(false, errno, "Failed to create cpu list");
1973b62a 630
033267c9
CB
631copy_parent:
632 if (!am_initialized) {
633 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
634 if (ret < 0)
635 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
77ffeed2 636
033267c9
CB
637 TRACE("Copied cpu settings of parent cgroup");
638 }
77ffeed2 639
033267c9
CB
640 return true;
641}
1973b62a 642
033267c9
CB
643static bool cpuset1_initialize(int dfd_base, int dfd_next)
644{
645 char mems[PATH_MAX];
646 ssize_t bytes;
647 char v;
434c8e15 648
033267c9
CB
649 /*
650 * Determine whether the base cgroup has cpuset
651 * inheritance turned on.
652 */
653 bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
654 if (bytes < 0)
655 return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
656
657 /*
658 * Initialize cpuset.cpus and remove any isolated
659 * and offline cpus.
660 */
661 if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
662 return syserrno(false, "Failed to initialize cpuset.cpus");
663
664 /* Read cpuset.mems from parent... */
665 bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
666 if (bytes < 0)
667 return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);
668
669 /* ... and copy to first cgroup in the tree... */
670 bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
671 if (bytes < 0)
672 return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);
673
674 /* ... and finally turn on cpuset inheritance. */
675 bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
676 if (bytes < 0)
677 return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
678
679 return log_trace(true, "Initialized cpuset in the legacy hierarchy");
434c8e15
CB
680}
681
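/*
 * The sequence above, roughly, as file operations (a sketch; "parent" is
 * dfd_base and "child" is dfd_next):
 *
 *	read  parent/cgroup.clone_children  -> '0' or '1'
 *	write child/cpuset.cpus             <- parent cpus minus isolated/offline
 *	                                       (skipped if inheritance was on)
 *	read  parent/cpuset.mems
 *	write child/cpuset.mems             <- copied verbatim
 *	write child/cgroup.clone_children   <- "1"
 */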
033267c9
CB
682static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
683 bool cpuset_v1, bool eexist_ignore)
6099dd5a 684{
da42ac7b
CB
685 __do_close int dfd_final = -EBADF;
686 int dfd_cur = dfd_base;
687 int ret = 0;
688 size_t len;
689 char *cur;
690 char buf[PATH_MAX];
6099dd5a 691
da42ac7b 692 if (is_empty_string(path))
bce04069 693 return ret_errno(EINVAL);
6099dd5a 694
da42ac7b
CB
695 len = strlcpy(buf, path, sizeof(buf));
696 if (len >= sizeof(buf))
bce04069 697 return ret_errno(E2BIG);
6099dd5a 698
da42ac7b
CB
699 lxc_iterate_parts(cur, buf, "/") {
700 /*
701 * Even though we vetted the paths when we parsed the config
702 * we're paranoid here and check that the path is neither
703 * absolute nor walks upwards.
704 */
e4db08ed 705 if (abspath(cur))
da42ac7b 706 return syserrno_set(-EINVAL, "No absolute paths allowed");
6099dd5a 707
e4db08ed 708 if (strnequal(cur, "..", STRLITERALLEN("..")))
da42ac7b 709 return syserrno_set(-EINVAL, "No upward walking paths allowed");
6099dd5a 710
da42ac7b
CB
711 ret = mkdirat(dfd_cur, cur, mode);
712 if (ret < 0) {
713 if (errno != EEXIST)
714 return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);
715
716 ret = -EEXIST;
717 }
718 TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);
719
720 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
721 if (dfd_final < 0)
722 return syserrno(-errno, "Fail to open%s directory %d(%s)",
723 !ret ? " newly created" : "", dfd_base, cur);
724 if (dfd_cur != dfd_base)
725 close(dfd_cur);
033267c9
CB
726 else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
727 return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
da42ac7b 728 /*
033267c9
CB
729 * Leave dfd_final pointing to the last fd we opened so
730 * it will be automatically zapped if we return early.
da42ac7b
CB
731 */
732 dfd_cur = dfd_final;
733 }
734
735 /* The final cgroup must be successfully created by us. */
033267c9
CB
736 if (ret) {
737 if (ret != -EEXIST || !eexist_ignore)
738 return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
739 }
da42ac7b
CB
740
741 return move_fd(dfd_final);
6099dd5a
CB
742}
743
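/*
 * A minimal sketch of the walk above (hypothetical path): for
 * __cgroup_tree_create(dfd_base, "lxc.payload.c1/ns", 0755, ...) the
 * function calls mkdirat(dfd_base, "lxc.payload.c1"), then creates "ns"
 * inside it, and returns an O_PATH directory fd for the final "ns"
 * component.
 */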
432faf20 744static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
a6aeb9f1
CB
745 struct hierarchy *h, const char *cgroup_limit_dir,
746 const char *cgroup_leaf, bool payload)
72068e74 747{
da42ac7b 748 __do_close int fd_limit = -EBADF, fd_final = -EBADF;
432faf20 749 __do_free char *path = NULL, *limit_path = NULL;
033267c9 750 bool cpuset_v1 = false;
72068e74 751
033267c9
CB
752 /*
753 * The legacy cpuset controller needs massaging in case inheriting
754 * settings from its immediate ancestor cgroup hasn't been turned on.
755 */
756 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
0c3deb94 757
a6aeb9f1 758 if (payload && cgroup_leaf) {
da42ac7b 759 /* With isolation both parts need to not already exist. */
033267c9 760 fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
da42ac7b
CB
761 if (fd_limit < 0)
762 return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
432faf20 763
a6aeb9f1
CB
764 TRACE("Created limit cgroup %d->%d(%s)",
765 fd_limit, h->dfd_base, cgroup_limit_dir);
432faf20
WB
766
767 /*
768 * With isolation the devices legacy cgroup needs to be
769 * initialized early, as it typically contains an 'a' (all)
770 * line, which is not possible once a subdirectory has been
771 * created.
772 */
ec4d463d
CB
773 if (string_in_list(h->controllers, "devices") &&
774 !ops->setup_limits_legacy(ops, conf, true))
775 return log_error(false, "Failed to setup legacy device limits");
432faf20 776
44585f1a 777 limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
a6aeb9f1
CB
778 path = must_make_path(limit_path, cgroup_leaf, NULL);
779
780 /*
781 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
782 * cgroup the container actually resides in, is below fd_limit.
783 */
784 fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
e2035358
CB
785 if (fd_final < 0) {
786 /* Ensure we don't leave any garbage behind. */
787 if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
788 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
789 else
790 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
791 }
a6aeb9f1 792 } else {
44585f1a 793 path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
9981107f
CB
794
795 fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
a6aeb9f1 796 }
033267c9
CB
797 if (fd_final < 0)
798 return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
0c3deb94 799
1973b62a 800 if (payload) {
e33870e5 801 h->dfd_con = move_fd(fd_final);
67ed60ce 802 h->path_con = move_ptr(path);
da42ac7b
CB
803
804 if (fd_limit < 0)
c0af7b1c 805 h->dfd_lim = h->dfd_con;
da42ac7b 806 else
c0af7b1c 807 h->dfd_lim = move_fd(fd_limit);
da42ac7b 808
a6aeb9f1 809 if (limit_path)
b1b1a60f 810 h->path_lim = move_ptr(limit_path);
a6aeb9f1 811 else
b1b1a60f 812 h->path_lim = h->path_con;
1973b62a 813 } else {
6a32c817 814 h->dfd_mon = move_fd(fd_final);
1973b62a 815 }
fe70edee 816
c581d2a6 817 return true;
ccb4cabe
SH
818}
819
6c880cdf
CB
820static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
821 bool payload)
ccb4cabe 822{
c1ece895 823 bool prune = true;
72068e74 824
1973b62a 825 if (payload) {
c1ece895 826 /* Check whether we actually created the cgroup to prune. */
c0af7b1c 827 if (h->dfd_lim < 0)
c1ece895
CB
828 prune = false;
829
b1b1a60f 830 free_equal(h->path_con, h->path_lim);
c0af7b1c 831 close_equal(h->dfd_con, h->dfd_lim);
1973b62a 832 } else {
c1ece895 833 /* Check whether we actually created the cgroup to prune. */
6a32c817 834 if (h->dfd_mon < 0)
c1ece895
CB
835 prune = false;
836
6a32c817 837 close_prot_errno_disarm(h->dfd_mon);
1973b62a 838 }
e56639fb 839
c1ece895
CB
840 /* We didn't create this cgroup. */
841 if (!prune)
842 return;
843
844 if (cgroup_tree_prune(h->dfd_base, path_prune))
cb423bd3
CB
845 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
846 else
847 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
a900cbaf
WB
848}
849
033267c9
CB
850__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
851 struct lxc_handler *handler)
852{
853 int len;
854 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
855 const struct lxc_conf *conf;
856
857 if (!ops) {
858 ERROR("Called with uninitialized cgroup operations");
859 return;
860 }
861
862 if (!ops->hierarchies)
863 return;
864
865 if (!handler) {
866 ERROR("Called with uninitialized handler");
867 return;
868 }
869
870 if (!handler->conf) {
871 ERROR("Called with uninitialized conf");
872 return;
873 }
874 conf = handler->conf;
875
1e058855
CB
876 if (!ops->monitor_cgroup) {
877 WARN("Uninitialized monitor cgroup");
878 return;
879 }
880
033267c9
CB
881 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
882 if (len < 0)
883 return;
884
885 for (int i = 0; ops->hierarchies[i]; i++) {
886 __do_close int fd_pivot = -EBADF;
887 __do_free char *pivot_path = NULL;
888 struct hierarchy *h = ops->hierarchies[i];
889 bool cpuset_v1 = false;
890 int ret;
891
033267c9
CB
892 /* Monitor might have died before we entered the cgroup. */
893 if (handler->monitor_pid <= 0) {
894 WARN("No valid monitor process found while destroying cgroups");
c55fe36d 895 goto cgroup_prune_tree;
033267c9
CB
896 }
897
898 if (conf->cgroup_meta.monitor_pivot_dir)
899 pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
033267c9
CB
900 else if (conf->cgroup_meta.dir)
901 pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
902 else
903 pivot_path = must_make_path(CGROUP_PIVOT, NULL);
904
905 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
906
907 fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
908 if (fd_pivot < 0) {
909 SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
910 continue;
911 }
912
913 ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
914 if (ret != 0) {
915 SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
916 continue;
917 }
918
c55fe36d
CB
919cgroup_prune_tree:
920 ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
033267c9 921 if (ret < 0)
c55fe36d
CB
922 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
923 else
924 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
033267c9
CB
925 }
926}
927
a900cbaf
WB
928/*
929 * Check that lxc.cgroup.dir is not combined with the more specific
930 * lxc.cgroup.dir.payload/monitor options, and that the monitor and payload
931 * directories are either both set or both unset.
932 * Returns true if the configuration is valid, false otherwise.
933 */
934static bool check_cgroup_dir_config(struct lxc_conf *conf)
935{
936 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
937 *container_dir = conf->cgroup_meta.container_dir,
938 *namespace_dir = conf->cgroup_meta.namespace_dir;
a900cbaf
WB
939
940 /* none of the new options are set, all is fine */
941 if (!monitor_dir && !container_dir && !namespace_dir)
942 return true;
943
944 /* some are set, make sure lxc.cgroup.dir is not also set*/
945 if (conf->cgroup_meta.dir)
946 return log_error_errno(false, EINVAL,
947 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
948
949 /* make sure both monitor and payload are set */
950 if (!monitor_dir || !container_dir)
951 return log_error_errno(false, EINVAL,
952 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
953
954 /* namespace_dir may be empty */
955 return true;
72068e74
CB
956}
957
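/*
 * Accepted combinations, in terms of the cgroup_meta fields checked above
 * (hypothetical values):
 *
 *	dir = "a/b"                                    -> valid (legacy option only)
 *	monitor_dir = "mon" and container_dir = "pay"  -> valid
 *	monitor_dir + container_dir + namespace_dir    -> valid
 *	dir together with monitor_dir/container_dir    -> rejected (conflict)
 *	monitor_dir without container_dir, or reverse  -> rejected
 */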
59eac805 958__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
72068e74 959{
dcf6a5c7 960 __do_free char *monitor_cgroup = NULL;
fe70edee
CB
961 int idx = 0;
962 int i;
5ce03bc0 963 size_t len;
a900cbaf 964 char *suffix = NULL;
0d66e29a 965 struct lxc_conf *conf;
72068e74 966
0d66e29a
CB
967 if (!ops)
968 return ret_set_errno(false, ENOENT);
e56639fb 969
69b4a4bb
CB
970 if (!ops->hierarchies)
971 return true;
972
0d66e29a
CB
973 if (ops->monitor_cgroup)
974 return ret_set_errno(false, EEXIST);
975
976 if (!handler || !handler->conf)
977 return ret_set_errno(false, EINVAL);
978
979 conf = handler->conf;
980
a900cbaf
WB
981 if (!check_cgroup_dir_config(conf))
982 return false;
983
984 if (conf->cgroup_meta.monitor_dir) {
a900cbaf
WB
985 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
986 } else if (conf->cgroup_meta.dir) {
fe70edee
CB
987 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
988 DEFAULT_MONITOR_CGROUP_PREFIX,
989 handler->name,
990 CGROUP_CREATE_RETRY, NULL);
b3ed2061 991 } else if (ops->cgroup_pattern) {
dcf6a5c7
CB
992 __do_free char *cgroup_tree = NULL;
993
994 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
995 if (!cgroup_tree)
d6bdd182
CB
996 return ret_set_errno(false, ENOMEM);
997
d6bdd182
CB
998 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
999 DEFAULT_MONITOR_CGROUP,
b3ed2061
CB
1000 CGROUP_CREATE_RETRY, NULL);
1001 } else {
fe70edee
CB
1002 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1003 handler->name,
1004 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1005 }
fe70edee 1006 if (!monitor_cgroup)
0d66e29a 1007 return ret_set_errno(false, ENOMEM);
72068e74 1008
a900cbaf
WB
1009 if (!conf->cgroup_meta.monitor_dir) {
1010 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1011 *suffix = '\0';
1012 }
5ce03bc0 1013 do {
a900cbaf 1014 if (idx && suffix)
fe70edee 1015 sprintf(suffix, "-%d", idx);
72068e74 1016
ebc10afe 1017 for (i = 0; ops->hierarchies[i]; i++) {
432faf20 1018 if (cgroup_tree_create(ops, handler->conf,
dcf6a5c7 1019 ops->hierarchies[i],
6fec4327 1020 monitor_cgroup, NULL, false))
fe70edee
CB
1021 continue;
1022
7064ee3a 1023 DEBUG("Failed to create cgroup %s", monitor_cgroup);
6c880cdf
CB
1024 for (int j = 0; j <= i; j++)
1025 cgroup_tree_prune_leaf(ops->hierarchies[j],
1026 monitor_cgroup, false);
fe70edee
CB
1027
1028 idx++;
1029 break;
5ce03bc0 1030 }
a900cbaf 1031 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
5ce03bc0 1032
a900cbaf 1033 if (idx == 1000 || (!suffix && idx != 0))
04a49a14 1034 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
72068e74 1035
c581d2a6 1036 ops->monitor_cgroup = move_ptr(monitor_cgroup);
6e8703a4 1037 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
ccb4cabe
SH
1038}
1039
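/*
 * A sketch of the resulting names (assuming the default monitor prefix
 * "lxc.monitor." and a container called "c1"): the first attempt is
 * "lxc.monitor.c1"; if that already exists in some hierarchy the retry
 * suffix yields "lxc.monitor.c1-1", "lxc.monitor.c1-2", ... up to "-999"
 * before the function gives up with ERANGE. The payload variant below
 * behaves the same way with its payload prefix.
 */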
fe70edee
CB
1040/*
1041 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
cecad0c1 1042 * next cgroup_pattern-1, -2, ..., -999.
ccb4cabe 1043 */
59eac805 1044__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
ccb4cabe 1045{
a6aeb9f1
CB
1046 __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
1047 char *limit_cgroup;
f3839f12 1048 int idx = 0;
fe70edee 1049 int i;
ccb4cabe 1050 size_t len;
a900cbaf 1051 char *suffix = NULL;
f3839f12 1052 struct lxc_conf *conf;
43654d34 1053
f3839f12
CB
1054 if (!ops)
1055 return ret_set_errno(false, ENOENT);
ccb4cabe 1056
69b4a4bb
CB
1057 if (!ops->hierarchies)
1058 return true;
1059
471929c6 1060 if (ops->container_cgroup || ops->container_limit_cgroup)
f3839f12
CB
1061 return ret_set_errno(false, EEXIST);
1062
1063 if (!handler || !handler->conf)
1064 return ret_set_errno(false, EINVAL);
1065
1066 conf = handler->conf;
1067
a900cbaf
WB
1068 if (!check_cgroup_dir_config(conf))
1069 return false;
1070
1071 if (conf->cgroup_meta.container_dir) {
a6aeb9f1
CB
1072 __limit_cgroup = strdup(conf->cgroup_meta.container_dir);
1073 if (!__limit_cgroup)
a900cbaf
WB
1074 return ret_set_errno(false, ENOMEM);
1075
432faf20 1076 if (conf->cgroup_meta.namespace_dir) {
a6aeb9f1 1077 container_cgroup = must_make_path(__limit_cgroup,
432faf20
WB
1078 conf->cgroup_meta.namespace_dir,
1079 NULL);
a6aeb9f1 1080 limit_cgroup = __limit_cgroup;
432faf20
WB
1081 } else {
1082 /* explicit paths but without isolation */
a6aeb9f1
CB
1083 limit_cgroup = move_ptr(__limit_cgroup);
1084 container_cgroup = limit_cgroup;
432faf20 1085 }
a900cbaf 1086 } else if (conf->cgroup_meta.dir) {
a6aeb9f1
CB
1087 limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1088 DEFAULT_PAYLOAD_CGROUP_PREFIX,
1089 handler->name,
1090 CGROUP_CREATE_RETRY, NULL);
1091 container_cgroup = limit_cgroup;
b3ed2061 1092 } else if (ops->cgroup_pattern) {
dcf6a5c7
CB
1093 __do_free char *cgroup_tree = NULL;
1094
1095 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1096 if (!cgroup_tree)
d6bdd182
CB
1097 return ret_set_errno(false, ENOMEM);
1098
a6aeb9f1
CB
1099 limit_cgroup = must_concat(&len, cgroup_tree, "/",
1100 DEFAULT_PAYLOAD_CGROUP,
1101 CGROUP_CREATE_RETRY, NULL);
1102 container_cgroup = limit_cgroup;
b3ed2061 1103 } else {
a6aeb9f1
CB
1104 limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
1105 handler->name,
1106 CGROUP_CREATE_RETRY, NULL);
1107 container_cgroup = limit_cgroup;
b3ed2061 1108 }
a6aeb9f1 1109 if (!limit_cgroup)
fe70edee 1110 return ret_set_errno(false, ENOMEM);
ccb4cabe 1111
a900cbaf
WB
1112 if (!conf->cgroup_meta.container_dir) {
1113 suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1114 *suffix = '\0';
1115 }
d97919ab 1116 do {
a900cbaf 1117 if (idx && suffix)
fe70edee 1118 sprintf(suffix, "-%d", idx);
bb30b52a 1119
d97919ab 1120 for (i = 0; ops->hierarchies[i]; i++) {
432faf20 1121 if (cgroup_tree_create(ops, handler->conf,
a6aeb9f1
CB
1122 ops->hierarchies[i], limit_cgroup,
1123 conf->cgroup_meta.namespace_dir,
6fec4327 1124 true))
fe70edee
CB
1125 continue;
1126
67ed60ce 1127 DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
6c880cdf
CB
1128 for (int j = 0; j <= i; j++)
1129 cgroup_tree_prune_leaf(ops->hierarchies[j],
a6aeb9f1 1130 limit_cgroup, true);
fe70edee
CB
1131
1132 idx++;
1133 break;
66b66624 1134 }
a900cbaf 1135 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
cecad0c1 1136
a900cbaf 1137 if (idx == 1000 || (!suffix && idx != 0))
04a49a14 1138 return log_error_errno(false, ERANGE, "Failed to create container cgroup");
cecad0c1 1139
fe70edee 1140 ops->container_cgroup = move_ptr(container_cgroup);
a6aeb9f1
CB
1141 if (__limit_cgroup)
1142 ops->container_limit_cgroup = move_ptr(__limit_cgroup);
c55fe36d
CB
1143 else
1144 ops->container_limit_cgroup = ops->container_cgroup;
a6aeb9f1
CB
1145 INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
1146 ops->container_cgroup, ops->container_limit_cgroup);
ccb4cabe 1147 return true;
ccb4cabe
SH
1148}
1149
c581d2a6
CB
1150__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1151 struct lxc_handler *handler)
ccb4cabe 1152{
fdb0b8ab 1153 int monitor_len, transient_len = 0;
c581d2a6
CB
1154 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1155 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1156
797fa65e
CB
1157 if (!ops)
1158 return ret_set_errno(false, ENOENT);
1159
69b4a4bb
CB
1160 if (!ops->hierarchies)
1161 return true;
1162
797fa65e
CB
1163 if (!ops->monitor_cgroup)
1164 return ret_set_errno(false, ENOENT);
1165
1166 if (!handler || !handler->conf)
1167 return ret_set_errno(false, EINVAL);
1168
0bba27c1
CB
1169 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1170 if (monitor_len < 0)
1171 return false;
1172
1173 if (handler->transient_pid > 0) {
1174 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1175 if (transient_len < 0)
1176 return false;
1177 }
ccb4cabe 1178
eeef32bb 1179 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1180 struct hierarchy *h = ops->hierarchies[i];
c581d2a6 1181 int ret;
08768001 1182
6a32c817 1183 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
1973b62a 1184 if (ret)
6a32c817 1185 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
c581d2a6 1186
6a32c817 1187 TRACE("Moved monitor into cgroup %d", h->dfd_mon);
ebf88e5b 1188
34683042 1189 if (handler->transient_pid <= 0)
d1ee8719 1190 continue;
c581d2a6 1191
6a32c817 1192 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
1973b62a 1193 if (ret)
6a32c817 1194 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1973b62a 1195
6a32c817 1196 TRACE("Moved transient process into cgroup %d", h->dfd_mon);
ebf88e5b 1197
1973b62a 1198 /*
78eb6aa6 1199 * we don't keep the fds for non-unified hierarchies around
1973b62a 1200 * mainly because we don't make use of them anymore after the
78eb6aa6 1201 * core cgroup setup is done but also because there are quite a
1973b62a
CB
1202 * lot of them.
1203 */
1204 if (!is_unified_hierarchy(h))
6a32c817 1205 close_prot_errno_disarm(h->dfd_mon);
ccb4cabe 1206 }
c581d2a6 1207 handler->transient_pid = -1;
ccb4cabe
SH
1208
1209 return true;
1210}
1211
c581d2a6
CB
1212__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1213 struct lxc_handler *handler)
eeef32bb 1214{
c581d2a6
CB
1215 int len;
1216 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
eeef32bb 1217
4490328e
CB
1218 if (!ops)
1219 return ret_set_errno(false, ENOENT);
1220
c581d2a6
CB
1221 if (!ops->hierarchies)
1222 return true;
1223
4490328e
CB
1224 if (!ops->container_cgroup)
1225 return ret_set_errno(false, ENOENT);
1226
1227 if (!handler || !handler->conf)
1228 return ret_set_errno(false, EINVAL);
1229
0bba27c1
CB
1230 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1231 if (len < 0)
1232 return false;
c581d2a6
CB
1233
1234 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1235 struct hierarchy *h = ops->hierarchies[i];
c581d2a6
CB
1236 int ret;
1237
b3a42865
CB
1238 if (is_unified_hierarchy(h) &&
1239 (handler->clone_flags & CLONE_INTO_CGROUP))
f7176c3e
CB
1240 continue;
1241
e33870e5 1242 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
c581d2a6 1243 if (ret != 0)
67ed60ce 1244 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
25db3f94 1245
67ed60ce 1246 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
c581d2a6
CB
1247 }
1248
1249 return true;
eeef32bb
CB
1250}
1251
1973b62a
CB
1252static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1253 gid_t chown_gid, mode_t chmod_mode)
6efacf80
CB
1254{
1255 int ret;
1256
1973b62a
CB
1257 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1258 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1259 if (ret < 0)
1260 return log_warn_errno(-1,
1261 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1262 dirfd, path, (int)chown_uid,
1263 (int)chown_gid);
6efacf80 1264
1973b62a
CB
1265 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1266 if (ret < 0)
1267 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1268 dirfd, path, (int)chmod_mode);
6efacf80
CB
1269
1270 return 0;
1271}
1272
1273/* chgrp the container cgroups to container group. We leave
c0888dfe
SH
1274 * the container owner as cgroup owner. So we must make the
1275 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1276 *
1277 * Also chown the tasks and cgroup.procs files. Those may not
1278 * exist depending on kernel version.
c0888dfe 1279 */
ccb4cabe
SH
1280static int chown_cgroup_wrapper(void *data)
1281{
6a720d74 1282 int ret;
4160c3a0
CB
1283 uid_t destuid;
1284 struct generic_userns_exec_data *arg = data;
1285 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1286 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1287
8917c382 1288 if (!lxc_drop_groups() && errno != EPERM)
b58214ac
CB
1289 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1290
6efacf80 1291 ret = setresgid(nsgid, nsgid, nsgid);
803e4123 1292 if (ret < 0)
77c3e9a2 1293 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
803e4123 1294 (int)nsgid, (int)nsgid, (int)nsgid);
6efacf80
CB
1295
1296 ret = setresuid(nsuid, nsuid, nsuid);
803e4123 1297 if (ret < 0)
77c3e9a2 1298 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
803e4123 1299 (int)nsuid, (int)nsuid, (int)nsuid);
6efacf80 1300
ccb4cabe 1301 destuid = get_ns_uid(arg->origuid);
b962868f
CB
1302 if (destuid == LXC_INVALID_UID)
1303 destuid = 0;
ccb4cabe 1304
6a720d74 1305 for (int i = 0; arg->hierarchies[i]; i++) {
e33870e5 1306 int dirfd = arg->hierarchies[i]->dfd_con;
43647298 1307
7f02fd24
CB
1308 if (dirfd < 0)
1309 return syserrno_set(-EBADF, "Invalid cgroup file descriptor");
1310
1973b62a 1311 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
c0888dfe 1312
1973b62a
CB
1313 /*
1314 * Failures to chown() these are inconvenient but not
6efacf80
CB
1315 * detrimental. We leave these owned by the container launcher,
1316 * so that container root can write to the files to attach. We
1317 * chmod() them 664 so that container systemd can write to the
1318 * files (which systemd in wily insists on doing).
ab8f5424 1319 */
6efacf80 1320
b8572e8c 1321 if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
1973b62a 1322 (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
43647298 1323
1973b62a 1324 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
0e17357c 1325
b8572e8c 1326 if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
0e17357c
CB
1327 continue;
1328
042f9e9c 1329 for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
1973b62a 1330 (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
ccb4cabe
SH
1331 }
1332
1333 return 0;
1334}
1335
b857f4be 1336__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
c98bbf71 1337 struct lxc_conf *conf)
ccb4cabe 1338{
4160c3a0 1339 struct generic_userns_exec_data wrap;
ccb4cabe 1340
c98bbf71
CB
1341 if (!ops)
1342 return ret_set_errno(false, ENOENT);
ccb4cabe 1343
69b4a4bb
CB
1344 if (!ops->hierarchies)
1345 return true;
1346
c98bbf71
CB
1347 if (!ops->container_cgroup)
1348 return ret_set_errno(false, ENOENT);
1349
1350 if (!conf)
1351 return ret_set_errno(false, EINVAL);
1352
1353 if (lxc_list_empty(&conf->id_map))
1354 return true;
1355
ccb4cabe 1356 wrap.origuid = geteuid();
4160c3a0 1357 wrap.path = NULL;
2202afc9 1358 wrap.hierarchies = ops->hierarchies;
4160c3a0 1359 wrap.conf = conf;
ccb4cabe 1360
c98bbf71
CB
1361 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1362 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1363
1364 return true;
1365}
1366
840eec19 1367__cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
78eb6aa6
CB
1368{
1369 if (!ops)
1370 return;
1371
1372 if (!ops->hierarchies)
1373 return;
1374
840eec19
CB
1375 for (int i = 0; ops->hierarchies[i]; i++) {
1376 struct hierarchy *h = ops->hierarchies[i];
1377
1378 /* Close all monitor cgroup file descriptors. */
1379 close_prot_errno_disarm(h->dfd_mon);
1380 }
1381 /* Close the cgroup root file descriptor. */
1382 close_prot_errno_disarm(ops->dfd_mnt);
1383
6dcd6f02
CB
1384 /*
1385 * The checking for freezer support should obviously be done at cgroup
1386 * initialization time but that doesn't work reliably. The freezer
1387 * controller has been demoted (rightly so) to a simple file located in
1388 * each non-root cgroup. At the time when the container is created we
1389 * might still be located in /sys/fs/cgroup and so checking for
1390 * cgroup.freeze won't tell us anything because this file doesn't exist
1391 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1392 * find an already existing cgroup and then check within that cgroup
1393 * for the existence of cgroup.freeze but that will only work on
1394 * systemd based hosts. Other init systems might not manage cgroups and
1395 * so no cgroup will exist. So we defer until we have created cgroups
1396 * for our container which means we check here.
1397 */
1398 if (pure_unified_layout(ops) &&
e33870e5 1399 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
6dcd6f02
CB
1400 AT_SYMLINK_NOFOLLOW)) {
1401 TRACE("Unified hierarchy supports freezer");
ca72ccb5 1402 ops->unified->utilities |= FREEZER_CONTROLLER;
6dcd6f02 1403 }
78eb6aa6
CB
1404}
1405
8aa1044f 1406/* cgroup-full:* is done, no need to create subdirs */
bd09ee98 1407static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
8aa1044f 1408{
bd09ee98 1409 switch (cgroup_automount_type) {
51feb8db
CB
1410 case LXC_AUTO_CGROUP_RO:
1411 return true;
1412 case LXC_AUTO_CGROUP_RW:
1413 return true;
1414 case LXC_AUTO_CGROUP_MIXED:
1415 return true;
1416 }
1417
1418 return false;
8aa1044f
SH
1419}
1420
886cac86
CB
1421/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1422 * remount controller ro if needed and bindmount the cgroupfs onto
25fa6f8c 1423 * control/the/cg/path.
8aa1044f 1424 */
bd09ee98 1425static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
a9db9474 1426 char *hierarchy_mnt, char *cgpath,
6812d833 1427 const char *container_cgroup)
8aa1044f 1428{
d97919ab 1429 __do_free char *sourcepath = NULL;
5285689c 1430 int ret, remount_flags;
886cac86
CB
1431 int flags = MS_BIND;
1432
bd09ee98
CB
1433 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1434 (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
a9db9474 1435 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
77c3e9a2
CB
1436 if (ret < 0)
1437 return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
a9db9474 1438 hierarchy_mnt, hierarchy_mnt);
886cac86 1439
a9db9474
CB
1440 remount_flags = add_required_remount_flags(hierarchy_mnt,
1441 hierarchy_mnt,
5285689c 1442 flags | MS_REMOUNT);
a9db9474 1443 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
8186c5c7
CB
1444 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1445 NULL);
77c3e9a2 1446 if (ret < 0)
a9db9474 1447 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);
886cac86 1448
a9db9474 1449 INFO("Remounted %s read-only", hierarchy_mnt);
8aa1044f 1450 }
886cac86 1451
44585f1a 1452 sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
bd09ee98 1453 if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
8aa1044f 1454 flags |= MS_RDONLY;
886cac86
CB
1455
1456 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
77c3e9a2
CB
1457 if (ret < 0)
1458 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
1459 h->controllers[0], cgpath);
886cac86 1460 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
f8c40ffa
L
1461
1462 if (flags & MS_RDONLY) {
5285689c
CB
1463 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1464 flags | MS_REMOUNT);
1465 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
77c3e9a2
CB
1466 if (ret < 0)
1467 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
5285689c 1468 INFO("Remounted %s read-only", cgpath);
f8c40ffa
L
1469 }
1470
886cac86 1471 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
8aa1044f
SH
1472 return 0;
1473}
1474
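/*
 * Net effect of the second stage (a sketch, hypothetical paths): for
 * cgroup:mixed the controller mount under the container's /sys/fs/cgroup
 * is remounted read-only while the container's own cgroup is bind-mounted
 * read-write on top of it; for cgroup:ro both the controller mount and the
 * container's cgroup end up read-only.
 */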
44234ae1 1475/* __cgroupfs_mount
6812d833
CB
1476 *
1477 * Mount cgroup hierarchies directly without using bind-mounts. The main
1478 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1479 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1480 */
bd09ee98 1481static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
44234ae1
CB
1482 struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
1483 const char *hierarchy_mnt)
b635e92d 1484{
a099c5db
CB
1485 __do_close int fd_fs = -EBADF;
1486 unsigned int flags = 0;
02efd041
CB
1487 char *fstype;
1488 int ret;
1489
1490 if (dfd_mnt_cgroupfs < 0)
1491 return ret_errno(EINVAL);
1492
a099c5db
CB
1493 flags |= MOUNT_ATTR_NOSUID;
1494 flags |= MOUNT_ATTR_NOEXEC;
1495 flags |= MOUNT_ATTR_NODEV;
1496 flags |= MOUNT_ATTR_RELATIME;
02efd041 1497
bd09ee98
CB
1498 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1499 (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
a099c5db 1500 flags |= MOUNT_ATTR_RDONLY;
02efd041 1501
bd09ee98 1502 if (is_unified_hierarchy(h))
02efd041 1503 fstype = "cgroup2";
bd09ee98 1504 else
02efd041 1505 fstype = "cgroup";
b635e92d 1506
de7f9f33 1507 if (can_use_mount_api()) {
635e7bac
CB
1508 fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
1509 if (fd_fs < 0)
1510 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
1511
1512 if (!is_unified_hierarchy(h)) {
1513 for (const char **it = (const char **)h->controllers; it && *it; it++) {
aa72fbe7 1514 if (strnequal(*it, "name=", STRLITERALLEN("name=")))
635e7bac
CB
1515 ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
1516 else
1517 ret = fs_set_property(fd_fs, *it, "");
1518 if (ret < 0)
1519 return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
1520 }
1521 }
1522
1523 ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
1524 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
1525 flags);
1526 } else {
a099c5db
CB
1527 __do_free char *controllers = NULL, *target = NULL;
1528 unsigned int old_flags = 0;
02efd041
CB
1529 const char *rootfs_mnt;
1530
a099c5db
CB
1531 if (!is_unified_hierarchy(h)) {
1532 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1533 if (!controllers)
1534 return ret_errno(ENOMEM);
1535 }
1536
02efd041 1537 rootfs_mnt = get_rootfs_mnt(rootfs);
a099c5db
CB
1538 ret = mnt_attributes_old(flags, &old_flags);
1539 if (ret)
1540 return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");
1541
02efd041 1542 target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
a099c5db 1543 ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
02efd041 1544 }
77c3e9a2 1545 if (ret < 0)
02efd041
CB
1546 return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
1547 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
b635e92d 1548
02efd041
CB
1549 DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
1550 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
b635e92d
CB
1551 return 0;
1552}
1553
bd09ee98 1554static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
074af890
CB
1555 struct lxc_rootfs *rootfs,
1556 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
6812d833 1557{
bd09ee98
CB
1558 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1559 dfd_mnt_cgroupfs, hierarchy_mnt);
6812d833
CB
1560}
1561
bd09ee98 1562static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
14111650
CB
1563 struct lxc_rootfs *rootfs,
1564 int dfd_mnt_cgroupfs,
1565 const char *hierarchy_mnt)
6812d833 1566{
bd09ee98 1567 switch (cgroup_automount_type) {
51feb8db
CB
1568 case LXC_AUTO_CGROUP_FULL_RO:
1569 break;
1570 case LXC_AUTO_CGROUP_FULL_RW:
1571 break;
1572 case LXC_AUTO_CGROUP_FULL_MIXED:
1573 break;
1574 default:
6812d833 1575 return 0;
51feb8db 1576 }
6812d833 1577
bd09ee98
CB
1578 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1579 dfd_mnt_cgroupfs, hierarchy_mnt);
6812d833
CB
1580}
1581
b857f4be 1582__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
cdd3b77d 1583 struct lxc_handler *handler, int cg_flags)
ccb4cabe 1584{
9bca62b3 1585 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
6607d6e9 1586 __do_free char *cgroup_root = NULL;
bd09ee98 1587 int cgroup_automount_type;
937a3af9 1588 bool in_cgroup_ns = false, wants_force_mount = false;
ab8cd5d9 1589 struct lxc_conf *conf = handler->conf;
315f8a4e 1590 struct lxc_rootfs *rootfs = &conf->rootfs;
02efd041 1591 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
dfa835ac 1592 int ret;
8aa1044f 1593
9585ccb3
CB
1594 if (!ops)
1595 return ret_set_errno(false, ENOENT);
1596
69b4a4bb
CB
1597 if (!ops->hierarchies)
1598 return true;
1599
315f8a4e 1600 if (!conf)
9585ccb3
CB
1601 return ret_set_errno(false, EINVAL);
1602
cdd3b77d 1603 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
c581c8a3 1604 return log_trace(true, "No cgroup mounts requested");
8aa1044f 1605
69c29673
CB
1606 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1607 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
3f69fb12 1608 wants_force_mount = true;
69c29673
CB
1609 }
1610
1611 switch (cg_flags) {
1612 case LXC_AUTO_CGROUP_RO:
1613 TRACE("Read-only cgroup mounts requested");
1614 break;
1615 case LXC_AUTO_CGROUP_RW:
1616 TRACE("Read-write cgroup mounts requested");
1617 break;
1618 case LXC_AUTO_CGROUP_MIXED:
1619 TRACE("Mixed cgroup mounts requested");
1620 break;
1621 case LXC_AUTO_CGROUP_FULL_RO:
1622 TRACE("Full read-only cgroup mounts requested");
1623 break;
1624 case LXC_AUTO_CGROUP_FULL_RW:
1625 TRACE("Full read-write cgroup mounts requested");
1626 break;
1627 case LXC_AUTO_CGROUP_FULL_MIXED:
1628 TRACE("Full mixed cgroup mounts requested");
1629 break;
1630 default:
1631 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1632 }
bd09ee98 1633 cgroup_automount_type = cg_flags;
b635e92d 1634
4547e73e 1635 if (!wants_force_mount) {
315f8a4e 1636 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
4547e73e
CB
1637
1638 /*
 1639 * Most recent distro versions ship init systems that do support
 1640 * cgroup2 but do not mount it by default unless explicitly told
 1641 * to, even if the host is cgroup2-only. That means they often
 1642 * fail to boot. Fix this by pre-mounting cgroup2 by default. We
 1643 * will likely need to keep doing this for a few years until all
 1644 * distros have switched over to cgroup2, at which point we can
 1645 * safely assume that their init systems will mount it
 1646 * themselves.
1647 */
1648 if (pure_unified_layout(ops))
1649 wants_force_mount = true;
3f69fb12 1650 }
8aa1044f 1651
2c4348bd 1652 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
937a3af9 1653 in_cgroup_ns = true;
6768700d 1654
937a3af9 1655 if (in_cgroup_ns && !wants_force_mount)
3a86fb37 1656 return log_trace(true, "Mounting cgroups not requested or needed");
8aa1044f 1657
02efd041
CB
1658 /* This is really the codepath that we want. */
1659 if (pure_unified_layout(ops)) {
9bca62b3
CB
1660 __do_close int dfd_mnt_unified = -EBADF;
1661
1662 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1663 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1664 if (dfd_mnt_unified < 0)
1665 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1666 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
e7e45fdf
CB
1667 /*
1668 * If cgroup namespaces are supported but the container will
1669 * not have CAP_SYS_ADMIN after it has started we need to mount
1670 * the cgroups manually.
a3e5ec26
CB
1671 *
1672 * Note that here we know that wants_force_mount is true.
1673 * Otherwise we would've returned early above.
e7e45fdf 1674 */
a3e5ec26
CB
1675 if (in_cgroup_ns) {
1676 /*
1677 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
1678 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
 1679 * 3. cgroup:mixed:force -> See the comment above for why this
 1680 * does not apply here, so
 1681 * cgroup:mixed is equal to
 1682 * cgroup:rw when cgroup
 1683 * namespaces are supported.
 1684 *
1685 * 4. cgroup:rw -> No-op; init system responsible for mounting.
1686 * 5. cgroup:ro -> No-op; init system responsible for mounting.
1687 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
1688 *
1689 * 7. cgroup-full:rw -> Not supported.
1690 * 8. cgroup-full:ro -> Not supported.
1691 * 9. cgroup-full:mixed -> Not supported.
 1692 *
1693 * 10. cgroup-full:rw:force -> Not supported.
1694 * 11. cgroup-full:ro:force -> Not supported.
1695 * 12. cgroup-full:mixed:force -> Not supported.
1696 */
bd09ee98 1697 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
a3e5ec26
CB
1698 if (ret < 0)
1699 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
1700
1701 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
1702 } else {
1703 /*
 1704 * Either no cgroup namespace is supported (highly
 1705 * unlikely unless we're dealing with a Frankenkernel),
 1706 * or the user requested to keep the cgroup namespace
 1707 * of the host or another container.
1708 */
1709 if (wants_force_mount) {
1710 /*
1711 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
1712 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
 1713 * 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem
 1714 * and make the parent directory of the
1715 * container's cgroup read-only but the
1716 * container's cgroup writable.
1717 *
1718 * 10. cgroup-full:rw:force ->
1719 * 11. cgroup-full:ro:force ->
1720 * 12. cgroup-full:mixed:force ->
1721 */
1722 errno = EOPNOTSUPP;
1723 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1724 } else {
1725 errno = EOPNOTSUPP;
1726 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1727 }
1728 }
8d661d38 1729
a3e5ec26 1730 return syserrno(false, "Failed to mount cgroups");
8d661d38
CB
1731 }
1732
e6d4df78
CB
1733 /*
1734 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
1735 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
1736 * DEFAULT_CGROUP_MOUNTPOINT define.
1737 */
de7f9f33 1738 if (can_use_mount_api()) {
635e7bac
CB
1739 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1740 if (fd_fs < 0)
1741 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
1742
23a20dbe
CB
1743 ret = fs_set_property(fd_fs, "mode", "0755");
1744 if (ret < 0)
 1745 return log_error_errno(-errno, errno, "Failed to set \"mode\" property on tmpfs filesystem context %d", fd_fs);
1746
1747 ret = fs_set_property(fd_fs, "size", "10240k");
1748 if (ret < 0)
 1749 return log_error_errno(-errno, errno, "Failed to set \"size\" property on tmpfs filesystem context %d", fd_fs);
1750
1751 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1752 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
1753 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
1754 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
635e7bac
CB
1755 } else {
1756 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1757 ret = safe_mount(NULL, cgroup_root, "tmpfs",
1758 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1759 "size=10240k,mode=755", rootfs_mnt);
8b1f4dd9 1760 }
3f69fb12 1761 if (ret < 0)
02efd041
CB
1762 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
1763 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
8aa1044f 1764
9bca62b3
CB
1765 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1766 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1767 if (dfd_mnt_tmpfs < 0)
1768 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1769 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1770
dfa835ac 1771 for (int i = 0; ops->hierarchies[i]; i++) {
a9db9474 1772 __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
2202afc9 1773 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1774
a58be2ad 1775 ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
d7314671 1776 if (ret < 0)
a58be2ad 1777 return syserrno(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
b635e92d 1778
937a3af9 1779 if (in_cgroup_ns && wants_force_mount) {
02efd041
CB
1780 /*
1781 * If cgroup namespaces are supported but the container
b635e92d
CB
1782 * will not have CAP_SYS_ADMIN after it has started we
1783 * need to mount the cgroups manually.
1784 */
a9db9474 1785 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
a58be2ad 1786 dfd_mnt_tmpfs, h->at_mnt);
3f69fb12 1787 if (ret < 0)
d7314671 1788 return false;
3f69fb12 1789
b635e92d
CB
1790 continue;
1791 }
1792
02efd041 1793 /* Here is where the ancient kernel section begins. */
a9db9474 1794 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
a58be2ad 1795 dfd_mnt_tmpfs, h->at_mnt);
d97919ab 1796 if (ret < 0)
d7314671 1797 return false;
3f69fb12 1798
bd09ee98 1799 if (!cg_mount_needs_subdirs(cgroup_automount_type))
8aa1044f 1800 continue;
3f69fb12 1801
f1921f35
CB
1802 if (!cgroup_root)
1803 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1804
a58be2ad 1805 hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
44585f1a 1806 path2 = must_make_path(hierarchy_mnt, h->at_base,
a9db9474 1807 ops->container_cgroup, NULL);
3f69fb12 1808 ret = mkdir_p(path2, 0755);
77410c98 1809 if (ret < 0 && (errno != EEXIST))
d7314671 1810 return false;
2f62fb00 1811
a9db9474
CB
1812 ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
1813 hierarchy_mnt, path2,
1814 ops->container_cgroup);
3f69fb12 1815 if (ret < 0)
d7314671 1816 return false;
8aa1044f 1817 }
8aa1044f 1818
d7314671 1819 return true;
ccb4cabe
SH
1820}
1821
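/*
 * Illustration (a sketch, not part of the upstream file): the cg_flags
 * handled by cgfsng_mount() above are derived from the container's
 * lxc.mount.auto setting, e.g.:
 *
 *   lxc.mount.auto = cgroup:rw          # rely on the init system where possible
 *   lxc.mount.auto = cgroup:ro:force    # always pre-mount, read-only
 *   lxc.mount.auto = cgroup-full:mixed  # legacy full-hierarchy mounts
 *
 * The ":force" suffix maps to LXC_AUTO_CGROUP_FORCE and is what sets
 * wants_force_mount above.
 */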
11c23867 1822/* Only root needs to escape to the cgroup of its init. */
ff9edd2d
CB
1823__cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1824 struct lxc_conf *conf)
ccb4cabe 1825{
52d08ab0
CB
1826 if (!ops)
1827 return ret_set_errno(false, ENOENT);
1828
1829 if (!ops->hierarchies)
1830 return true;
1831
1832 if (!conf)
1833 return ret_set_errno(false, EINVAL);
1834
1835 if (conf->cgroup_meta.relative || geteuid())
ccb4cabe
SH
1836 return true;
1837
779b3d82 1838 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1839 __do_free char *fullpath = NULL;
52d08ab0 1840 int ret;
11c23867 1841
35ec1a38 1842 fullpath = make_cgroup_path(ops->hierarchies[i],
44585f1a 1843 ops->hierarchies[i]->at_base,
35ec1a38 1844 "cgroup.procs", NULL);
7cea5905 1845 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
52d08ab0 1846 if (ret != 0)
77c3e9a2 1847 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
ccb4cabe
SH
1848 }
1849
6df334d1 1850 return true;
ccb4cabe
SH
1851}
1852
ff9edd2d 1853__cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
36662416 1854{
69b4a4bb
CB
1855 int i = 0;
1856
e3ffb28b
CB
1857 if (!ops)
1858 return ret_set_errno(-1, ENOENT);
1859
69b4a4bb
CB
1860 if (!ops->hierarchies)
1861 return 0;
36662416 1862
69b4a4bb 1863 for (; ops->hierarchies[i]; i++)
36662416
TA
1864 ;
1865
1866 return i;
1867}
1868
ff9edd2d
CB
1869__cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1870 int n, char ***out)
36662416
TA
1871{
1872 int i;
1873
aa48a34f
CB
1874 if (!ops)
1875 return ret_set_errno(false, ENOENT);
1876
69b4a4bb 1877 if (!ops->hierarchies)
77c3e9a2 1878 return ret_set_errno(false, ENOENT);
69b4a4bb 1879
36662416 1880 /* sanity check n */
6b38e644 1881 for (i = 0; i < n; i++)
2202afc9 1882 if (!ops->hierarchies[i])
aa48a34f 1883 return ret_set_errno(false, ENOENT);
36662416 1884
2202afc9 1885 *out = ops->hierarchies[i]->controllers;
36662416
TA
1886
1887 return true;
1888}
1889
b8a4fe12 1890static int cg_legacy_freeze(struct cgroup_ops *ops)
ccb4cabe 1891{
d6337a5f 1892 struct hierarchy *h;
ccb4cabe 1893
ee3a7775
CB
1894 h = get_hierarchy(ops, "freezer");
1895 if (!h)
d2203230 1896 return ret_set_errno(-1, ENOENT);
81468ea7 1897
67ed60ce 1898 return lxc_write_openat(h->path_con, "freezer.state",
c04a6d4e 1899 "FROZEN", STRLITERALLEN("FROZEN"));
ee3a7775 1900}
942e193e 1901
018051e3
CB
1902static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1903 struct lxc_epoll_descr *descr)
ee3a7775 1904{
018051e3 1905 __do_free char *line = NULL;
ee3a7775 1906 __do_fclose FILE *f = NULL;
018051e3
CB
1907 int state = PTR_TO_INT(cbdata);
 1908 size_t len = 0;
1909 const char *state_string;
1910
c8af3332 1911 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
018051e3
CB
1912 if (!f)
1913 return LXC_MAINLOOP_ERROR;
018051e3
CB
1914
1915 if (state == 1)
1916 state_string = "frozen 1";
1917 else
1918 state_string = "frozen 0";
1919
1920 while (getline(&line, &len, f) != -1)
aa72fbe7 1921 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
018051e3
CB
1922 return LXC_MAINLOOP_CLOSE;
1923
281c3645
CB
1924 rewind(f);
1925
018051e3
CB
1926 return LXC_MAINLOOP_CONTINUE;
1927}
1928
443be565
WB
1929static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1930 const char *state_string,
1931 int state_num,
1932 const char *epoll_error,
1933 const char *wait_error)
018051e3 1934{
f62cf1d4 1935 __do_close int fd = -EBADF;
eafc1bb6 1936 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
018051e3
CB
1937 int ret;
1938 struct lxc_epoll_descr descr;
ee3a7775 1939 struct hierarchy *h;
942e193e
CB
1940
1941 h = ops->unified;
457ca9aa 1942 if (!h)
d2203230 1943 return ret_set_errno(-1, ENOENT);
d6337a5f 1944
67ed60ce 1945 if (!h->path_con)
d2203230 1946 return ret_set_errno(-1, EEXIST);
d6337a5f 1947
018051e3
CB
1948 if (timeout != 0) {
1949 __do_free char *events_file = NULL;
942e193e 1950
67ed60ce 1951 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
018051e3
CB
1952 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1953 if (fd < 0)
d2203230 1954 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
942e193e 1955
018051e3
CB
1956 ret = lxc_mainloop_open(&descr);
1957 if (ret)
443be565 1958 return log_error_errno(-1, errno, "%s", epoll_error);
942e193e 1959
018051e3
CB
1960 /* automatically cleaned up now */
1961 descr_ptr = &descr;
942e193e 1962
385e58e8 1963 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
018051e3 1964 if (ret < 0)
d2203230 1965 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
018051e3 1966 }
942e193e 1967
67ed60ce 1968 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
018051e3 1969 if (ret < 0)
d2203230 1970 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
018051e3
CB
1971
1972 if (timeout != 0 && lxc_mainloop(&descr, timeout))
443be565 1973 return log_error_errno(-1, errno, "%s", wait_error);
018051e3
CB
1974
1975 return 0;
942e193e
CB
1976}
1977
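/*
 * Rough illustration (assumes a cgroup2 kernel): cg_unified_freeze_do()
 * writes "1" or "0" to cgroup.freeze and, when a timeout is given,
 * waits for the kernel to flip the "frozen" key in cgroup.events, which
 * freezer_cgroup_events_cb() polls via EPOLLPRI. The events file looks
 * roughly like:
 *
 *   populated 1
 *   frozen 1
 */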
443be565
WB
1978static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1979{
1980 return cg_unified_freeze_do(ops, timeout, "1", 1,
1981 "Failed to create epoll instance to wait for container freeze",
1982 "Failed to wait for container to be frozen");
1983}
1984
018051e3 1985__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 1986{
81468ea7 1987 if (!ops->hierarchies)
d2203230 1988 return ret_set_errno(-1, ENOENT);
81468ea7 1989
ee3a7775
CB
1990 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1991 return cg_legacy_freeze(ops);
942e193e 1992
018051e3 1993 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
1994}
1995
018051e3 1996static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 1997{
ee3a7775
CB
1998 struct hierarchy *h;
1999
2000 h = get_hierarchy(ops, "freezer");
2001 if (!h)
d2203230 2002 return ret_set_errno(-1, ENOENT);
ee3a7775 2003
67ed60ce 2004 return lxc_write_openat(h->path_con, "freezer.state",
c04a6d4e 2005 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
2006}
2007
018051e3 2008static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775 2009{
443be565
WB
2010 return cg_unified_freeze_do(ops, timeout, "0", 0,
2011 "Failed to create epoll instance to wait for container unfreeze",
2012 "Failed to wait for container to be unfrozen");
ee3a7775
CB
2013}
2014
018051e3 2015__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2016{
2017 if (!ops->hierarchies)
d2203230 2018 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2019
2020 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2021 return cg_legacy_unfreeze(ops);
2022
018051e3 2023 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2024}
2025
a900cbaf
WB
2026static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2027 const char *controller, bool limiting)
ccb4cabe 2028{
d6337a5f 2029 struct hierarchy *h;
35ec1a38
CB
2030 size_t len;
2031 const char *path;
d6337a5f 2032
2202afc9 2033 h = get_hierarchy(ops, controller);
6bdf9691 2034 if (!h)
35ec1a38
CB
2035 return log_warn_errno(NULL, ENOENT,
2036 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
ccb4cabe 2037
a900cbaf 2038 if (limiting)
b1b1a60f 2039 path = h->path_lim;
35ec1a38 2040 else
67ed60ce 2041 path = h->path_con;
35ec1a38
CB
2042 if (!path)
2043 return NULL;
a900cbaf 2044
a58be2ad
CB
2045 len = strlen(h->at_mnt);
2046 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
35ec1a38
CB
2047 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2048 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2049 path += strspn(path, "/");
2050 }
2051 return path += len;
371f834d
SH
2052}
2053
a900cbaf
WB
2054__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2055 const char *controller)
2056{
2057 return cgfsng_get_cgroup_do(ops, controller, false);
2058}
2059
2060__cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2061 const char *controller)
2062{
2063 return cgfsng_get_cgroup_do(ops, controller, true);
2064}
2065
c40c8209
CB
2066/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2067 * which must be freed by the caller.
371f834d 2068 */
c40c8209
CB
2069static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2070 const char *inpath,
2071 const char *filename)
371f834d 2072{
35ec1a38 2073 return make_cgroup_path(h, inpath, filename, NULL);
ccb4cabe
SH
2074}
2075
4b86fefd 2076static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
c2aed66d 2077{
ad275c16 2078 int idx = 1;
c2aed66d 2079 int ret;
900b6606 2080 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
6e2078de 2081 ssize_t pidstr_len;
c2aed66d 2082
ad275c16 2083 /* Create leaf cgroup. */
275e8ef8 2084 ret = mkdirat(unified_fd, ".lxc", 0755);
ad275c16 2085 if (ret < 0 && errno != EEXIST)
6e2078de
CB
2086 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2087
0bba27c1
CB
2088 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2089 if (pidstr_len < 0)
2090 return pidstr_len;
ad275c16 2091
275e8ef8 2092 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
ad275c16
CB
2093 if (ret < 0)
2094 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
c2aed66d 2095 if (ret == 0)
6e2078de 2096 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
ad275c16 2097
bad788b0
CB
2098 /* this is a non-leaf node */
2099 if (errno != EBUSY)
6e2078de 2100 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
c2aed66d 2101
c2aed66d 2102 do {
7581a82f 2103 bool rm = false;
c80c9a70 2104 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
9fd047d1 2105 char *slash = attach_cgroup;
c2aed66d 2106
0bba27c1
CB
2107 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2108 if (ret < 0)
2109 return ret;
5045306b 2110
c80c9a70
CB
2111 /*
2112 * This shouldn't really happen but the compiler might complain
2113 * that a short write would cause a buffer overrun. So be on
2114 * the safe side.
2115 */
2116 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2117 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2118
9fd047d1 2119 slash += (ret - STRLITERALLEN("/cgroup.procs"));
bad788b0 2120 *slash = '\0';
ad275c16 2121
bad788b0 2122 ret = mkdirat(unified_fd, attach_cgroup, 0755);
c2aed66d 2123 if (ret < 0 && errno != EEXIST)
d2203230 2124 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
7581a82f
CB
2125 if (ret == 0)
2126 rm = true;
c2aed66d 2127
bad788b0 2128 *slash = '/';
ad275c16 2129
bad788b0 2130 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
c2aed66d 2131 if (ret == 0)
6e2078de 2132 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
c2aed66d 2133
7581a82f
CB
2134 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2135 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2136
c2aed66d
CB
2137 /* this is a non-leaf node */
2138 if (errno != EBUSY)
d2203230 2139 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2140
edae86e9
CB
2141 idx++;
2142 } while (idx < 1000);
c2aed66d 2143
ad275c16 2144 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d
CB
2145}
2146
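/*
 * Rough sketch of the layout cgroup_attach_leaf() works with (paths are
 * illustrative): cgroup2 refuses to host processes in a cgroup that has
 * controllers enabled for its children, so attaching can fail with
 * EBUSY and we fall back to dedicated leaf cgroups:
 *
 *   <container cgroup>/
 *       .lxc/cgroup.procs      <- tried first
 *       cgroup.procs           <- fallback, EBUSY on a non-leaf node
 *       .lxc-1/ ... .lxc-999/  <- further fallback leaves
 */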
d1783ef4
CB
2147static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2148 int unified_fd, int *sk_fd)
2149{
7d849163
CB
2150 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2151 int target_fds[2];
d1783ef4
CB
2152 ssize_t ret;
2153
2154 /* Create leaf cgroup. */
2155 ret = mkdirat(unified_fd, ".lxc", 0755);
2156 if (ret < 0 && errno != EEXIST)
2157 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2158
7043e2b4 2159 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
7d849163 2160 if (target_fd0 < 0)
d1783ef4 2161 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2162 target_fds[0] = target_fd0;
d1783ef4 2163
7043e2b4 2164 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
7d849163 2165 if (target_fd1 < 0)
49df620b 2166 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
7d849163 2167 target_fds[1] = target_fd1;
49df620b
CB
2168
2169 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
d1783ef4 2170 if (ret <= 0)
49df620b 2171 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
7d849163 2172 target_fd0, target_fd1);
d1783ef4 2173
7d849163 2174 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
d1783ef4
CB
2175}
2176
2177static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2178 int *sk_fd, pid_t pid)
2179{
7d849163
CB
2180 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2181 int target_fds[2];
d1783ef4
CB
2182 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2183 size_t pidstr_len;
2184 ssize_t ret;
2185
d17c815d
CB
2186 ret = lxc_abstract_unix_recv_two_fds(sk, target_fds);
2187 if (ret < 0)
d1783ef4 2188 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
7d849163
CB
2189 target_fd0 = target_fds[0];
2190 target_fd1 = target_fds[1];
d1783ef4
CB
2191
2192 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2193
7d849163
CB
2194 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2195 if (ret > 0 && ret == pidstr_len)
2196 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2197
49df620b 2198 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
7d849163
CB
2199 if (ret > 0 && ret == pidstr_len)
2200 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
d1783ef4 2201
7d849163
CB
2202 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2203 target_fd0, target_fd1);
d1783ef4
CB
2204}
2205
4b86fefd
CB
2206struct userns_exec_unified_attach_data {
2207 const struct lxc_conf *conf;
2208 int unified_fd;
d1783ef4 2209 int sk_pair[2];
4b86fefd
CB
2210 pid_t pid;
2211};
2212
d1783ef4
CB
2213static int cgroup_unified_attach_child_wrapper(void *data)
2214{
2215 struct userns_exec_unified_attach_data *args = data;
2216
2217 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2218 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2219 return ret_errno(EINVAL);
2220
2221 close_prot_errno_disarm(args->sk_pair[0]);
2222 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2223 &args->sk_pair[1]);
2224}
2225
2226static int cgroup_unified_attach_parent_wrapper(void *data)
4b86fefd
CB
2227{
2228 struct userns_exec_unified_attach_data *args = data;
4b86fefd 2229
d1783ef4
CB
2230 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2231 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
4b86fefd
CB
2232 return ret_errno(EINVAL);
2233
d1783ef4
CB
2234 close_prot_errno_disarm(args->sk_pair[1]);
2235 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2236 args->pid);
4b86fefd
CB
2237}
2238
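/*
 * Sketch of the handshake between the two wrappers above (assuming
 * userns_exec_minimal() runs the child wrapper in a user namespace
 * built from conf->id_map): the child opens ".lxc/cgroup.procs" and
 * "cgroup.procs" relative to unified_fd and sends both fds back over
 * the socketpair via SCM_RIGHTS; the parent then writes the pid into
 * whichever of the two fds accepts it. The open() thus happens with
 * in-namespace credentials while the parent keeps control of the pid.
 */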
900b6606
CB
 2239/* Technically, we're always at a delegation boundary here (this is especially
 2240 * true when cgroup namespaces are available). The reasoning is that in order
 2241 * for us to have been able to start a container in the first place the root
 2242 * cgroup must have been a leaf node. Now, either the container's init system
 2243 * has populated the cgroup and kept it as a leaf node or it has created
 2244 * subtrees. In the former case we simply attach to the leaf node we created
 2245 * when we started the container; in the latter case we create our own cgroup
 2246 * for the attaching process.
2247 */
7581a82f
CB
2248static int __cg_unified_attach(const struct hierarchy *h,
2249 const struct lxc_conf *conf, const char *name,
900b6606
CB
2250 const char *lxcpath, pid_t pid,
2251 const char *controller)
2252{
f62cf1d4 2253 __do_close int unified_fd = -EBADF;
32908bfd 2254 __do_free char *path = NULL, *cgroup = NULL;
900b6606
CB
2255 int ret;
2256
7581a82f
CB
2257 if (!conf || !name || !lxcpath || pid <= 0)
2258 return ret_errno(EINVAL);
2259
2260 ret = cgroup_attach(conf, name, lxcpath, pid);
32908bfd
CB
2261 if (ret == 0)
2262 return log_trace(0, "Attached to unified cgroup via command handler");
59114d80 2263 if (ret != -ENOCGROUP2)
32908bfd
CB
2264 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2265
2266 /* Fall back to retrieving the path for the unified cgroup. */
2267 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2268 /* not running */
2269 if (!cgroup)
2270 return 0;
900b6606 2271
35ec1a38 2272 path = make_cgroup_path(h, cgroup, NULL);
900b6606 2273
32908bfd 2274 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
900b6606 2275 if (unified_fd < 0)
7581a82f
CB
2276 return ret_errno(EBADF);
2277
4b86fefd
CB
2278 if (!lxc_list_empty(&conf->id_map)) {
2279 struct userns_exec_unified_attach_data args = {
2280 .conf = conf,
2281 .unified_fd = unified_fd,
2282 .pid = pid,
2283 };
2284
d1783ef4
CB
2285 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2286 if (ret < 0)
2287 return -errno;
2288
2289 ret = userns_exec_minimal(conf,
2290 cgroup_unified_attach_parent_wrapper,
2291 &args,
2292 cgroup_unified_attach_child_wrapper,
2293 &args);
4b86fefd
CB
2294 } else {
2295 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2296 }
2297
2298 return ret;
900b6606
CB
2299}
2300
7581a82f
CB
2301__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2302 const struct lxc_conf *conf,
2303 const char *name, const char *lxcpath,
2304 pid_t pid)
ccb4cabe 2305{
81b5d48a 2306 int len, ret;
a3650c0c 2307 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 2308
ab9a452d
CB
2309 if (!ops)
2310 return ret_set_errno(false, ENOENT);
2311
69b4a4bb
CB
2312 if (!ops->hierarchies)
2313 return true;
2314
0bba27c1
CB
2315 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2316 if (len < 0)
ccb4cabe
SH
2317 return false;
2318
81b5d48a 2319 for (int i = 0; ops->hierarchies[i]; i++) {
c05b17bd 2320 __do_free char *fullpath = NULL, *path = NULL;
2202afc9 2321 struct hierarchy *h = ops->hierarchies[i];
ccb4cabe 2322
b8572e8c 2323 if (h->fs_type == UNIFIED_HIERARCHY) {
7581a82f 2324 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
a3926f6a 2325 h->controllers[0]);
c2aed66d
CB
2326 if (ret < 0)
2327 return false;
2328
2329 continue;
2330 }
2331
ccb4cabe 2332 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2333 /* not running */
2334 if (!path)
e2cb2e74 2335 return false;
ccb4cabe 2336
371f834d 2337 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
7cea5905 2338 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
ab9a452d 2339 if (ret < 0)
77c3e9a2 2340 return log_error_errno(false, errno, "Failed to attach %d to %s",
ab9a452d 2341 (int)pid, fullpath);
ccb4cabe
SH
2342 }
2343
ccb4cabe
SH
2344 return true;
2345}
2346
e2bd2b13
CB
2347/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2348 * don't have a cgroup_data set up, so we ask the running container through the
2349 * commands API for the cgroup path.
ccb4cabe 2350 */
b857f4be 2351__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2352 char *value, size_t len, const char *name,
2353 const char *lxcpath)
ccb4cabe 2354{
d97919ab 2355 __do_free char *path = NULL;
88396101 2356 __do_free char *controller = NULL;
d97919ab 2357 char *p;
0069cc61 2358 struct hierarchy *h;
861cb8c2 2359 int ret = -1;
ccb4cabe 2360
a358028a
CB
2361 if (!ops)
2362 return ret_set_errno(-1, ENOENT);
2363
63ba9eaf
CB
2364 controller = strdup(filename);
2365 if (!controller)
2366 return ret_errno(ENOMEM);
2367
0069cc61
CB
2368 p = strchr(controller, '.');
2369 if (p)
ccb4cabe
SH
2370 *p = '\0';
2371
a900cbaf 2372 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
0069cc61
CB
2373 /* not running */
2374 if (!path)
ccb4cabe
SH
2375 return -1;
2376
2202afc9 2377 h = get_hierarchy(ops, controller);
ccb4cabe 2378 if (h) {
88396101 2379 __do_free char *fullpath = NULL;
0069cc61
CB
2380
2381 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2382 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2383 }
ccb4cabe
SH
2384
2385 return ret;
2386}
2387
cb3fc90c
CB
2388static int device_cgroup_parse_access(struct device_item *device, const char *val)
2389{
2390 for (int count = 0; count < 3; count++, val++) {
2391 switch (*val) {
2392 case 'r':
2393 device->access[count] = *val;
2394 break;
2395 case 'w':
2396 device->access[count] = *val;
2397 break;
2398 case 'm':
2399 device->access[count] = *val;
2400 break;
2401 case '\n':
2402 case '\0':
2403 count = 3;
2404 break;
2405 default:
2406 return ret_errno(EINVAL);
2407 }
2408 }
2409
2410 return 0;
2411}
2412
2a63b5cb
CB
2413static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2414 const char *val)
2415{
2416 int count, ret;
2417 char temp[50];
2418
8b99a20a 2419 if (strequal("devices.allow", key))
69885a76 2420 device->allow = 1; /* allow the device */
2a63b5cb 2421 else
69885a76 2422 device->allow = 0; /* deny the device */
2a63b5cb 2423
8b99a20a 2424 if (strequal(val, "a")) {
2a63b5cb
CB
2425 /* global rule */
2426 device->type = 'a';
2427 device->major = -1;
2428 device->minor = -1;
2a63b5cb 2429 return 0;
2a63b5cb
CB
2430 }
2431
2432 switch (*val) {
2433 case 'a':
2434 __fallthrough;
2435 case 'b':
2436 __fallthrough;
2437 case 'c':
2438 device->type = *val;
2439 break;
2440 default:
2441 return -1;
2442 }
2443
2444 val++;
2445 if (!isspace(*val))
2446 return -1;
2447 val++;
2448 if (*val == '*') {
2449 device->major = -1;
2450 val++;
2451 } else if (isdigit(*val)) {
2452 memset(temp, 0, sizeof(temp));
2453 for (count = 0; count < sizeof(temp) - 1; count++) {
2454 temp[count] = *val;
2455 val++;
2456 if (!isdigit(*val))
2457 break;
2458 }
2459 ret = lxc_safe_int(temp, &device->major);
2460 if (ret)
2461 return -1;
2462 } else {
2463 return -1;
2464 }
2465 if (*val != ':')
2466 return -1;
2467 val++;
2468
2469 /* read minor */
2470 if (*val == '*') {
2471 device->minor = -1;
2472 val++;
2473 } else if (isdigit(*val)) {
2474 memset(temp, 0, sizeof(temp));
2475 for (count = 0; count < sizeof(temp) - 1; count++) {
2476 temp[count] = *val;
2477 val++;
2478 if (!isdigit(*val))
2479 break;
2480 }
2481 ret = lxc_safe_int(temp, &device->minor);
2482 if (ret)
2483 return -1;
2484 } else {
2485 return -1;
2486 }
2487 if (!isspace(*val))
2488 return -1;
2a63b5cb 2489
cb3fc90c 2490 return device_cgroup_parse_access(device, ++val);
2a63b5cb
CB
2491}
2492
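/*
 * Minimal usage sketch (hypothetical values, illustrative only): parsing
 * the well-known USB-serial rule should fill the struct as annotated.
 */
#if 0
struct device_item dev = {};

if (device_cgroup_rule_parse(&dev, "devices.allow", "c 188:* rwm") == 0) {
	/* dev.allow == 1, dev.type == 'c', dev.major == 188,
	 * dev.minor == -1, dev.access is "rwm"
	 */
}
#endif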
eec533e3
CB
2493/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2494 * don't have a cgroup_data set up, so we ask the running container through the
2495 * commands API for the cgroup path.
ccb4cabe 2496 */
b857f4be 2497__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2a63b5cb 2498 const char *key, const char *value,
fb55e009 2499 const char *name, const char *lxcpath)
ccb4cabe 2500{
d97919ab 2501 __do_free char *path = NULL;
88396101 2502 __do_free char *controller = NULL;
d97919ab 2503 char *p;
87777968 2504 struct hierarchy *h;
861cb8c2 2505 int ret = -1;
ccb4cabe 2506
b7aeda96
CB
2507 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2508 is_empty_string(name) || is_empty_string(lxcpath))
2509 return ret_errno(EINVAL);
a358028a 2510
63ba9eaf
CB
2511 controller = strdup(key);
2512 if (!controller)
2513 return ret_errno(ENOMEM);
2514
87777968
CB
2515 p = strchr(controller, '.');
2516 if (p)
ccb4cabe
SH
2517 *p = '\0';
2518
8b99a20a 2519 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
50329f28 2520 struct device_item device = {};
2a63b5cb
CB
2521
2522 ret = device_cgroup_rule_parse(&device, key, value);
2523 if (ret < 0)
d2203230 2524 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2a63b5cb
CB
2525 key, value);
2526
2527 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2528 if (ret < 0)
2529 return -1;
2530
2531 return 0;
2532 }
2533
a900cbaf 2534 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
87777968
CB
2535 /* not running */
2536 if (!path)
ccb4cabe
SH
2537 return -1;
2538
2202afc9 2539 h = get_hierarchy(ops, controller);
ccb4cabe 2540 if (h) {
88396101 2541 __do_free char *fullpath = NULL;
87777968 2542
2a63b5cb 2543 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
7cea5905 2544 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe 2545 }
ccb4cabe
SH
2546
2547 return ret;
2548}
2549
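/*
 * Usage sketch (hypothetical container name): cgfsng_get() and
 * cgfsng_set() back the lxc-cgroup command line tool, which asks the
 * running container for its cgroup path over the commands API, e.g.:
 *
 *   lxc-cgroup -n web memory.limit_in_bytes          # read a value
 *   lxc-cgroup -n web memory.limit_in_bytes 512M     # write a value
 *
 * The controller is taken from the part of the key before the first '.'.
 */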
91d1a13a 2550/* Take a devices cgroup line of the form
72add155
SH
2551 * /dev/foo rwx
2552 * and convert it to a valid
2553 * type major:minor mode
91d1a13a
CB
2554 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2555 * the output.
72add155 2556 */
cb3fc90c
CB
2557static int device_cgroup_rule_parse_devpath(struct device_item *device,
2558 const char *devpath)
72add155 2559{
88396101 2560 __do_free char *path = NULL;
2a06d041 2561 char *mode = NULL;
cb3fc90c
CB
2562 int n_parts, ret;
2563 char *p;
2564 struct stat sb;
72add155 2565
63ba9eaf
CB
2566 path = strdup(devpath);
2567 if (!path)
2568 return ret_errno(ENOMEM);
72add155 2569
cb3fc90c
CB
2570 /*
2571 * Read path followed by mode. Ignore any trailing text.
91d1a13a
CB
2572 * A ' # comment' would be legal. Technically other text is not
2573 * legal, we could check for that if we cared to.
72add155 2574 */
0dbdb99e 2575 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2576 if (*p != ' ')
2577 continue;
2578 *p = '\0';
91d1a13a 2579
2c2d6c49
SH
2580 if (n_parts != 1)
2581 break;
2582 p++;
2583 n_parts++;
91d1a13a 2584
2c2d6c49
SH
2585 while (*p == ' ')
2586 p++;
91d1a13a 2587
2c2d6c49 2588 mode = p;
91d1a13a 2589
2c2d6c49 2590 if (*p == '\0')
cb3fc90c 2591 return ret_set_errno(-1, EINVAL);
72add155 2592 }
2c2d6c49 2593
83b25c4d
CB
2594 if (!mode)
2595 return ret_errno(EINVAL);
2596
cb3fc90c
CB
2597 if (device_cgroup_parse_access(device, mode) < 0)
2598 return -1;
2599
72add155
SH
2600 ret = stat(path, &sb);
2601 if (ret < 0)
cb3fc90c 2602 return ret_set_errno(-1, errno);
72add155 2603
72add155
SH
2604 mode_t m = sb.st_mode & S_IFMT;
2605 switch (m) {
2606 case S_IFBLK:
cb3fc90c 2607 device->type = 'b';
72add155
SH
2608 break;
2609 case S_IFCHR:
cb3fc90c 2610 device->type = 'c';
72add155 2611 break;
2c2d6c49 2612 default:
77c3e9a2 2613 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
72add155 2614 }
2c2d6c49 2615
cb3fc90c
CB
2616 device->major = MAJOR(sb.st_rdev);
2617 device->minor = MINOR(sb.st_rdev);
2618 device->allow = 1;
72add155 2619
cb3fc90c
CB
2620 return 0;
2621}
2622
2623static int convert_devpath(const char *invalue, char *dest)
2624{
50329f28 2625 struct device_item device = {};
cb3fc90c
CB
2626 int ret;
2627
2628 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2629 if (ret < 0)
2630 return -1;
2631
0bba27c1
CB
2632 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2633 device.minor, device.access);
2634 if (ret < 0)
2635 return log_error_errno(ret, -ret,
2636 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2637 device.type, device.major, device.minor,
2638 device.access);
cb3fc90c
CB
2639
2640 return 0;
72add155
SH
2641}
2642
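/*
 * Illustration (assumes a standard Linux /dev layout): convert_devpath()
 * turns a config value such as
 *
 *   /dev/null rwm
 *
 * into the canonical devices-cgroup form
 *
 *   c 1:3 rwm
 *
 * by stat()ing the path and taking the type and major:minor numbers
 * from st_mode and st_rdev.
 */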
90e97284
CB
2643/* Called from setup_limits - here we have the container's cgroup_data because
2644 * we created the cgroups.
ccb4cabe 2645 */
2202afc9 2646static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
a900cbaf 2647 const char *value, bool is_cpuset)
ccb4cabe 2648{
88396101 2649 __do_free char *controller = NULL;
d97919ab 2650 char *p;
1a0e70ac
CB
2651 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2652 char converted_value[50];
b3646d7e 2653 struct hierarchy *h;
64e82f8b 2654
63ba9eaf
CB
2655 controller = strdup(filename);
2656 if (!controller)
2657 return ret_errno(ENOMEM);
2658
ab1a6cac
CB
2659 p = strchr(controller, '.');
2660 if (p)
ccb4cabe
SH
2661 *p = '\0';
2662
8b99a20a 2663 if (strequal("devices.allow", filename) && value[0] == '/') {
c04a6d4e
CB
2664 int ret;
2665
72add155
SH
2666 ret = convert_devpath(value, converted_value);
2667 if (ret < 0)
c8bf519d 2668 return ret;
72add155 2669 value = converted_value;
c8bf519d 2670 }
2671
2202afc9 2672 h = get_hierarchy(ops, controller);
77c3e9a2
CB
2673 if (!h)
2674 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
b3646d7e 2675
a900cbaf 2676 if (is_cpuset) {
67ed60ce 2677 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
a900cbaf
WB
2678 if (ret)
2679 return ret;
2680 }
b1b1a60f 2681 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
ccb4cabe
SH
2682}
2683
c581d2a6
CB
2684__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2685 struct lxc_conf *conf,
2686 bool do_devices)
ccb4cabe 2687{
d97919ab 2688 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
c581d2a6 2689 struct lxc_list *cgroup_settings = &conf->cgroup;
d97919ab 2690 struct lxc_list *iterator, *next;
ccb4cabe 2691 struct lxc_cgroup *cg;
ccb4cabe
SH
2692 bool ret = false;
2693
92ca7eb5
CB
2694 if (!ops)
2695 return ret_set_errno(false, ENOENT);
2696
2697 if (!conf)
2698 return ret_set_errno(false, EINVAL);
2699
2700 cgroup_settings = &conf->cgroup;
ccb4cabe
SH
2701 if (lxc_list_empty(cgroup_settings))
2702 return true;
2703
69b4a4bb 2704 if (!ops->hierarchies)
92ca7eb5 2705 return ret_set_errno(false, EINVAL);
69b4a4bb 2706
92afbe74 2707 if (pure_unified_layout(ops))
b96aa96f
CB
2708 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2709
ccb4cabe 2710 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2711 if (!sorted_cgroup_settings)
ccb4cabe 2712 return false;
ccb4cabe 2713
ccb4cabe
SH
2714 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2715 cg = iterator->elem;
2716
aa72fbe7
CB
2717 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2718 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
fc3b9533
CB
2719 if (do_devices && (errno == EACCES || errno == EPERM)) {
2720 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2721 continue;
2722 }
2723 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2724 goto out;
ccb4cabe 2725 }
77c3e9a2 2726 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
ccb4cabe 2727 }
ccb4cabe
SH
2728 }
2729
2730 ret = true;
6b38e644 2731 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2732out:
ccb4cabe
SH
2733 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2734 lxc_list_del(iterator);
2735 free(iterator);
2736 }
d97919ab 2737
ccb4cabe
SH
2738 return ret;
2739}
2740
bf651989
CB
2741/*
2742 * Some of the parsing logic comes from the original cgroup device v1
2743 * implementation in the kernel.
2744 */
4bfb655e
CB
2745static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2746 struct lxc_conf *conf, const char *key,
bf651989
CB
2747 const char *val)
2748{
50329f28 2749 struct device_item device_item = {};
2a63b5cb 2750 int ret;
bf651989 2751
30bfbd3f 2752 if (strequal("devices.allow", key) && abspath(val))
cb3fc90c
CB
2753 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2754 else
2755 ret = device_cgroup_rule_parse(&device_item, key, val);
2a63b5cb 2756 if (ret < 0)
30bfbd3f 2757 return syserrno_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
4bfb655e 2758
60532b18 2759 /*
15970277
CB
2760 * Note that bpf_list_add_device() returns 1 if it altered the device
2761 * list and 0 if it didn't; both return values indicate success.
2762 * Only a negative return value indicates an error.
60532b18 2763 */
a134099d 2764 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2a63b5cb 2765 if (ret < 0)
4bfb655e 2766 return -1;
a134099d 2767
bf651989
CB
2768 return 0;
2769}
2770
c581d2a6
CB
2771__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2772 struct lxc_handler *handler)
6b38e644 2773{
7e31931f
CB
2774 struct lxc_list *cgroup_settings, *iterator;
2775 struct hierarchy *h;
2776 struct lxc_conf *conf;
6b38e644 2777
7e31931f
CB
2778 if (!ops)
2779 return ret_set_errno(false, ENOENT);
2780
2781 if (!ops->hierarchies)
6b38e644
CB
2782 return true;
2783
7e31931f
CB
2784 if (!ops->container_cgroup)
2785 return ret_set_errno(false, EINVAL);
2786
2787 if (!handler || !handler->conf)
2788 return ret_set_errno(false, EINVAL);
2789 conf = handler->conf;
2790
7e31931f 2791 cgroup_settings = &conf->cgroup2;
0e7a013e
CB
2792 if (lxc_list_empty(cgroup_settings))
2793 return true;
2794
2795 if (!pure_unified_layout(ops))
2796 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
7e31931f
CB
2797
2798 if (!ops->unified)
6b38e644 2799 return false;
7e31931f 2800 h = ops->unified;
6b38e644 2801
bf651989 2802 lxc_list_for_each (iterator, cgroup_settings) {
6b38e644 2803 struct lxc_cgroup *cg = iterator->elem;
c04a6d4e 2804 int ret;
6b38e644 2805
aa72fbe7 2806 if (strnequal("devices", cg->subsystem, 7))
ee9d3ef0
CB
2807 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
2808 else
b1b1a60f 2809 ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
ee9d3ef0
CB
2810 if (ret < 0)
2811 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2812
6b38e644
CB
2813 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2814 }
2815
7e31931f 2816 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
6b38e644
CB
2817}
2818
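/*
 * Illustration (example values): the unified limits applied above come
 * from lxc.cgroup2.* keys and are written relative to the container's
 * limit cgroup (h->path_lim), e.g.:
 *
 *   lxc.cgroup2.memory.max = 512M
 *   lxc.cgroup2.pids.max = 100
 *
 * "devices" keys never hit a cgroup file; they are turned into bpf
 * device rules via bpf_device_cgroup_prepare() instead.
 */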
59eac805 2819__cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
bf651989 2820{
e552bd1a
CB
2821 struct lxc_conf *conf;
2822 struct hierarchy *unified;
bf651989 2823
e552bd1a
CB
2824 if (!ops)
2825 return ret_set_errno(false, ENOENT);
2826
2827 if (!ops->hierarchies)
2828 return true;
2829
2830 if (!ops->container_cgroup)
2831 return ret_set_errno(false, EEXIST);
2832
2833 if (!handler || !handler->conf)
2834 return ret_set_errno(false, EINVAL);
2835 conf = handler->conf;
2836
2837 unified = ops->unified;
ca72ccb5 2838 if (!unified || !device_utility_controller(unified) ||
67ed60ce 2839 !unified->path_con ||
a134099d 2840 lxc_list_empty(&(conf->bpf_devices).device_item))
bf651989
CB
2841 return true;
2842
a134099d 2843 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
bf651989
CB
2844}
2845
59eac805 2846static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
6b38e644 2847{
95ab26af
CB
2848 __do_close int dfd_final = -EBADF;
2849 __do_free char *add_controllers = NULL, *copy = NULL;
c581d2a6 2850 size_t full_len = 0;
0954f6ce
CB
2851 struct hierarchy *unified;
2852 int dfd_cur, ret;
95ab26af
CB
2853 char *cur;
2854 char **it;
6b38e644 2855
0954f6ce
CB
2856 if (!ops->hierarchies || !pure_unified_layout(ops))
2857 return true;
2858
2859 unified = ops->unified;
2860 if (!unified->controllers[0])
bf651989
CB
2861 return true;
2862
c581d2a6
CB
2863 /* For now we simply enable all controllers that we have detected by
2864 * creating a string like "+memory +pids +cpu +io".
2865 * TODO: In the near future we might want to support "-<controller>"
 2866 * etc. but whether supporting semantics like this makes sense will need
2867 * some thinking.
2868 */
2869 for (it = unified->controllers; it && *it; it++) {
2870 full_len += strlen(*it) + 2;
2871 add_controllers = must_realloc(add_controllers, full_len + 1);
2872
2873 if (unified->controllers[0] == *it)
2874 add_controllers[0] = '\0';
2875
2876 (void)strlcat(add_controllers, "+", full_len + 1);
2877 (void)strlcat(add_controllers, *it, full_len + 1);
2878
2879 if ((it + 1) && *(it + 1))
2880 (void)strlcat(add_controllers, " ", full_len + 1);
2881 }
2882
95ab26af
CB
2883 copy = strdup(cgroup);
2884 if (!copy)
f761d24d 2885 return false;
c581d2a6 2886
95ab26af
CB
2887 /*
2888 * Placing the write to cgroup.subtree_control before the open() is
2889 * intentional because of the cgroup2 delegation model. It enforces
2890 * that leaf cgroups don't have any controllers enabled for delegation.
2891 */
0954f6ce 2892 dfd_cur = unified->dfd_base;
95ab26af
CB
2893 lxc_iterate_parts(cur, copy, "/") {
2894 /*
2895 * Even though we vetted the paths when we parsed the config
2896 * we're paranoid here and check that the path is neither
2897 * absolute nor walks upwards.
2898 */
2899 if (abspath(cur))
2900 return syserrno_set(-EINVAL, "No absolute paths allowed");
ac01a9b8 2901
95ab26af
CB
2902 if (strnequal(cur, "..", STRLITERALLEN("..")))
2903 return syserrno_set(-EINVAL, "No upward walking paths allowed");
ac01a9b8 2904
95ab26af 2905 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
61fbc369 2906 if (ret < 0)
95ab26af
CB
2907 return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2908
2909 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
ac01a9b8 2910
95ab26af
CB
2911 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2912 if (dfd_final < 0)
2913 return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
2914 if (dfd_cur != unified->dfd_base)
2915 close(dfd_cur);
2916 /*
2917 * Leave dfd_final pointing to the last fd we opened so
2918 * it will be automatically zapped if we return early.
2919 */
2920 dfd_cur = dfd_final;
c581d2a6
CB
2921 }
2922
f761d24d 2923 return true;
c581d2a6
CB
2924}
2925
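/*
 * Illustration (example controllers): for a unified hierarchy exposing
 * cpuset, cpu, io, memory and pids, __cgfsng_delegate_controllers()
 * builds the string "+cpuset +cpu +io +memory +pids" and writes it to
 * cgroup.subtree_control at every level above the monitor or payload
 * cgroup, roughly equivalent to
 *
 *   echo "+cpuset +cpu +io +memory +pids" > <ancestor>/cgroup.subtree_control
 *
 * so the controllers are actually delegated down to the cgroups LXC uses.
 */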
59eac805 2926__cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
c581d2a6 2927{
61fbc369
CB
2928 if (!ops)
2929 return ret_set_errno(false, ENOENT);
2930
c581d2a6
CB
2931 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2932}
2933
59eac805 2934__cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
c581d2a6 2935{
61fbc369
CB
2936 if (!ops)
2937 return ret_set_errno(false, ENOENT);
2938
c581d2a6 2939 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
2940}
2941
0da35ac7
CB
2942static inline bool unified_cgroup(const char *line)
2943{
2944 return *line == '0';
2945}
2946
2947static inline char *current_unified_cgroup(bool relative, char *line)
2948{
2949 char *current_cgroup;
2950
2951 line += STRLITERALLEN("0::");
2952
2953 if (!abspath(line))
2954 return ERR_PTR(-EINVAL);
2955
2956 /* remove init.scope */
2957 if (!relative)
2958 line = prune_init_scope(line);
2959
2960 /* create a relative path */
2961 line = deabs(line);
2962
2963 current_cgroup = strdup(line);
2964 if (!current_cgroup)
2965 return ERR_PTR(-ENOMEM);
2966
2967 return current_cgroup;
2968}
2969
2970static inline const char *unprefix(const char *controllers)
2971{
2972 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2973 return controllers + STRLITERALLEN("name=");
2974 return controllers;
2975}
2976
2977static int __list_cgroup_delegate(char ***delegate)
a6ca2ed8 2978{
63ba9eaf 2979 __do_free char **list = NULL;
d606c4e9 2980 __do_free char *buf = NULL;
35ec1a38
CB
2981 char *standard[] = {
2982 "cgroup.procs",
2983 "cgroup.threads",
2984 "cgroup.subtree_control",
2985 "memory.oom.group",
2986 NULL,
2987 };
d606c4e9 2988 char *token;
63ba9eaf 2989 int ret;
a6ca2ed8 2990
46bf13b7 2991 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
d606c4e9 2992 if (!buf) {
a6ca2ed8 2993 for (char **p = standard; p && *p; p++) {
63ba9eaf
CB
2994 ret = list_add_string(&list, *p);
2995 if (ret < 0)
2996 return ret;
a6ca2ed8 2997 }
35ec1a38 2998
63ba9eaf 2999 *delegate = move_ptr(list);
35ec1a38 3000 return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate");
d606c4e9 3001 }
a6ca2ed8 3002
257f04ec 3003 lxc_iterate_parts(token, buf, " \t\n") {
d606c4e9
CB
3004 /*
3005 * We always need to chown this for both cgroup and
3006 * cgroup2.
3007 */
8b99a20a 3008 if (strequal(token, "cgroup.procs"))
d606c4e9
CB
3009 continue;
3010
63ba9eaf
CB
3011 ret = list_add_string(&list, token);
3012 if (ret < 0)
3013 return ret;
a6ca2ed8 3014 }
2202afc9 3015
63ba9eaf 3016 *delegate = move_ptr(list);
341e6516 3017 return 0;
2202afc9
CB
3018}
3019
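/*
 * Illustration (contents vary by kernel version): on a typical cgroup2
 * kernel /sys/kernel/cgroup/delegate reads something like
 *
 *   cgroup.procs
 *   cgroup.threads
 *   cgroup.subtree_control
 *   memory.oom.group
 *
 * which matches the built-in fallback list above; these are the files
 * that must be made writable for delegation to an unprivileged owner.
 */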
0da35ac7 3020static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
0e3af26b 3021{
0da35ac7
CB
3022 __do_free_string_list char **list = NULL;
3023 int ret;
0e3af26b 3024
0da35ac7
CB
3025 ret = __list_cgroup_delegate(&list);
3026 if (ret < 0)
3027 return syserrno(ret, "Failed to determine unified cgroup delegation requirements");
0e3af26b 3028
0da35ac7
CB
3029 for (char *const *s = list; s && *s; s++) {
3030 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3031 continue;
0e3af26b 3032
0da35ac7
CB
3033 return sysinfo(false, "The %s file is not writable, skipping unified hierarchy", *s);
3034 }
0e3af26b 3035
0da35ac7
CB
3036 *ret_files = move_ptr(list);
3037 return true;
0e3af26b
CB
3038}
3039
0da35ac7 3040static bool legacy_hierarchy_delegated(int dfd_base)
35ec1a38 3041{
0da35ac7
CB
3042 if (faccessat(dfd_base, "cgroup.procs", W_OK, 0) && errno != ENOENT)
3043 return sysinfo(false, "The cgroup.procs file is not writable, skipping legacy hierarchy");
3044
3045 return true;
35ec1a38
CB
3046}
3047
3048static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3049 bool unprivileged)
2202afc9 3050{
8033666c
CB
3051 __do_free char *cgroup_info = NULL;
3052 char *it;
2202afc9 3053
35ec1a38
CB
3054 /*
3055 * Root spawned containers escape the current cgroup, so use init's
3056 * cgroups as our base in that case.
3057 */
9caee129 3058 if (!relative && (geteuid() == 0))
8033666c 3059 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
2202afc9 3060 else
8033666c
CB
3061 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3062 if (!cgroup_info)
35ec1a38 3063 return ret_errno(ENOMEM);
2202afc9 3064
8033666c 3065 lxc_iterate_parts(it, cgroup_info, "\n") {
35ec1a38
CB
3066 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3067 __do_free char *controllers = NULL, *current_cgroup = NULL;
3068 __do_free_string_list char **controller_list = NULL,
3069 **delegate = NULL;
3070 char *line;
3071 int dfd, ret, type;
3072
3073 /* Handle the unified cgroup hierarchy. */
3074 line = it;
3075 if (unified_cgroup(line)) {
3076 char *unified_mnt;
3077
b8572e8c
CB
3078 type = UNIFIED_HIERARCHY;
3079
35ec1a38
CB
3080 current_cgroup = current_unified_cgroup(relative, line);
3081 if (IS_ERR(current_cgroup))
3082 return PTR_ERR(current_cgroup);
3083
e18e9053
CB
3084 if (unified_cgroup_fd(ops->dfd_mnt)) {
3085 dfd_mnt = dup_cloexec(ops->dfd_mnt);
35ec1a38
CB
3086 unified_mnt = "";
3087 } else {
e18e9053 3088 dfd_mnt = open_at(ops->dfd_mnt,
35ec1a38
CB
3089 "unified",
3090 PROTECT_OPATH_DIRECTORY,
3091 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3092 unified_mnt = "unified";
3093 }
3094 if (dfd_mnt < 0) {
3095 if (errno != ENOENT)
e18e9053 3096 return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt);
2202afc9 3097
35ec1a38
CB
3098 SYSTRACE("Unified cgroup not mounted");
3099 continue;
3100 }
3101 dfd = dfd_mnt;
3102
3103 if (!is_empty_string(current_cgroup)) {
3104 dfd_base = open_at(dfd_mnt, current_cgroup,
3105 PROTECT_OPATH_DIRECTORY,
3106 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3107 if (dfd_base < 0)
3108 return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup);
3109 dfd = dfd_base;
3110 }
8033666c 3111
0da35ac7
CB
3112 if (!unified_hierarchy_delegated(dfd, &delegate))
3113 continue;
3114
35ec1a38
CB
3115 controller_list = unified_controllers(dfd, "cgroup.controllers");
3116 if (!controller_list) {
3117 TRACE("No controllers are enabled for delegation in the unified hierarchy");
63ba9eaf
CB
3118 controller_list = list_new();
3119 if (!controller_list)
3120 return syserrno(-ENOMEM, "Failed to create empty controller list");
35ec1a38 3121 }
8033666c 3122
35ec1a38
CB
3123 controllers = strdup(unified_mnt);
3124 if (!controllers)
3125 return ret_errno(ENOMEM);
3126 } else {
3127 char *__controllers, *__current_cgroup;
2202afc9 3128
b8572e8c
CB
3129 type = LEGACY_HIERARCHY;
3130
35ec1a38
CB
3131 __controllers = strchr(line, ':');
3132 if (!__controllers)
3133 return ret_errno(EINVAL);
3134 __controllers++;
3135
3136 __current_cgroup = strchr(__controllers, ':');
3137 if (!__current_cgroup)
3138 return ret_errno(EINVAL);
3139 *__current_cgroup = '\0';
3140 __current_cgroup++;
3141
3142 controllers = strdup(unprefix(__controllers));
3143 if (!controllers)
3144 return ret_errno(ENOMEM);
3145
e18e9053 3146 dfd_mnt = open_at(ops->dfd_mnt,
35ec1a38
CB
3147 controllers, PROTECT_OPATH_DIRECTORY,
3148 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3149 if (dfd_mnt < 0) {
3150 if (errno != ENOENT)
3151 return syserrno(-errno, "Failed to open %d/%s",
e18e9053 3152 ops->dfd_mnt, controllers);
2202afc9 3153
35ec1a38
CB
3154 SYSTRACE("%s not mounted", controllers);
3155 continue;
3156 }
3157 dfd = dfd_mnt;
3158
3159 if (!abspath(__current_cgroup))
3160 return ret_errno(EINVAL);
3161
3162 /* remove init.scope */
3163 if (!relative)
3164 __current_cgroup = prune_init_scope(__current_cgroup);
3165
3166 /* create a relative path */
3167 __current_cgroup = deabs(__current_cgroup);
6e214b74 3168
35ec1a38
CB
3169 current_cgroup = strdup(__current_cgroup);
3170 if (!current_cgroup)
3171 return ret_errno(ENOMEM);
2202afc9 3172
35ec1a38
CB
3173 if (!is_empty_string(current_cgroup)) {
3174 dfd_base = open_at(dfd_mnt, current_cgroup,
3175 PROTECT_OPATH_DIRECTORY,
3176 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3177 if (dfd_base < 0)
3178 return syserrno(-errno, "Failed to open %d/%s",
3179 dfd_mnt, current_cgroup);
3180 dfd = dfd_base;
3181 }
2a63b5cb 3182
0da35ac7
CB
3183 if (!legacy_hierarchy_delegated(dfd))
3184 continue;
35ec1a38
CB
3185
3186 /*
3187 * We intentionally pass __current_cgroup here and not
3188 * controllers because we would otherwise chop the
3189 * mountpoint.
3190 */
63ba9eaf
CB
3191 controller_list = list_add_controllers(__controllers);
3192 if (!controller_list)
3193 return syserrno(-ENOMEM, "Failed to create controller list from %s", __controllers);
35ec1a38
CB
3194
3195 if (skip_hierarchy(ops, controller_list))
3196 continue;
3197
35ec1a38
CB
3198 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3199 }
3200
179754a2
CB
3201 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3202 current_cgroup, controller_list, type);
35ec1a38
CB
3203 if (ret < 0)
3204 return syserrno(ret, "Failed to add %s hierarchy", controllers);
3205
3206 /* Transfer ownership. */
3207 move_fd(dfd_mnt);
3208 move_fd(dfd_base);
3209 move_ptr(current_cgroup);
3210 move_ptr(controllers);
3211 move_ptr(controller_list);
b8572e8c 3212 if (type == UNIFIED_HIERARCHY)
042f9e9c 3213 ops->unified->delegate = move_ptr(delegate);
35ec1a38
CB
3214 }
3215
3216 /* determine cgroup layout */
3217 if (ops->unified) {
3218 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3219 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3220 } else {
3221 if (bpf_devices_cgroup_supported())
ca72ccb5 3222 ops->unified->utilities |= DEVICES_CONTROLLER;
35ec1a38
CB
3223 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3224 }
3225 }
3226
c7a1f72a
CB
3227 if (!controllers_available(ops))
3228 return syserrno_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3229
35ec1a38 3230 return 0;
2202afc9
CB
3231}
3232
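/*
 * Illustration (example paths): __initialize_cgroups() walks
 * /proc/1/cgroup or /proc/self/cgroup, whose lines look like
 *
 *   11:memory:/user.slice
 *   5:devices:/user.slice
 *   1:name=systemd:/user.slice/user-1000.slice/session-2.scope
 *   0::/user.slice/user-1000.slice/session-2.scope
 *
 * The "0::" entry is the unified (cgroup2) hierarchy; numbered entries
 * are legacy hierarchies, with any "name=" prefix stripped by unprefix().
 */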
35ec1a38 3233static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9 3234{
d4cff352 3235 __do_close int dfd = -EBADF;
2202afc9 3236 int ret;
0fbf99d6 3237 const char *controllers_use;
d4cff352 3238
e18e9053 3239 if (ops->dfd_mnt >= 0)
a96be3c3 3240 return ret_errno(EBUSY);
d4cff352
CB
3241
3242 /*
3243 * I don't see the need for allowing symlinks here. If users want to
3244 * have their hierarchy available in different locations I strongly
3245 * suggest bind-mounts.
3246 */
3247 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3248 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3249 if (dfd < 0)
3250 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
2202afc9 3251
0fbf99d6
CB
3252 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3253 if (controllers_use) {
3254 __do_free char *dup = NULL;
3255 char *it;
b7b18fc5 3256
0fbf99d6
CB
3257 dup = strdup(controllers_use);
3258 if (!dup)
7a0c8ed3 3259 return -errno;
b7b18fc5 3260
63ba9eaf
CB
3261 lxc_iterate_parts(it, dup, ",") {
3262 ret = list_add_string(&ops->cgroup_use, it);
3263 if (ret < 0)
3264 return ret;
3265 }
b7b18fc5 3266 }
2202afc9 3267
d4cff352
CB
3268 /*
3269 * Keep dfd referenced by the cleanup function and actually move the fd
3270 * once we know the initialization succeeded. So if we fail we clean up
3271 * the dfd.
3272 */
e18e9053 3273 ops->dfd_mnt = dfd;
2202afc9 3274
35ec1a38 3275 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
d4cff352
CB
3276 if (ret < 0)
3277 return syserrno(ret, "Failed to initialize cgroups");
2202afc9 3278
d4cff352
CB
3279 /* Transfer ownership to cgroup_ops. */
3280 move_fd(dfd);
3281 return 0;
2202afc9
CB
3282}
3283
341e6516 3284__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3285{
3286 const char *cgroup_pattern;
3287
341e6516
CB
3288 if (!ops)
3289 return ret_set_errno(-1, ENOENT);
3290
2202afc9
CB
3291 /* copy system-wide cgroup information */
3292 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
63ba9eaf
CB
3293 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3294 ops->cgroup_pattern = strdup(cgroup_pattern);
3295 if (!ops->cgroup_pattern)
3296 return ret_errno(ENOMEM);
3297 }
2202afc9 3298
341e6516 3299 return 0;
2202afc9
CB
3300}
3301
35ec1a38 3302struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
2202afc9 3303{
a64edc1c 3304 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9 3305
c5d0238a 3306 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
2202afc9 3307 if (!cgfsng_ops)
341e6516 3308 return ret_set_errno(NULL, ENOMEM);
2202afc9 3309
2202afc9 3310 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
e18e9053 3311 cgfsng_ops->dfd_mnt = -EBADF;
2202afc9 3312
35ec1a38 3313 if (initialize_cgroups(cgfsng_ops, conf))
2202afc9 3314 return NULL;
2202afc9 3315
ca76baed
CB
3316 cgfsng_ops->data_init = cgfsng_data_init;
3317 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3318 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3319 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3320 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3321 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3322 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3323 cgfsng_ops->payload_create = cgfsng_payload_create;
3324 cgfsng_ops->payload_enter = cgfsng_payload_enter;
840eec19 3325 cgfsng_ops->finalize = cgfsng_finalize;
ca76baed
CB
3326 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3327 cgfsng_ops->get = cgfsng_get;
3328 cgfsng_ops->set = cgfsng_set;
3329 cgfsng_ops->freeze = cgfsng_freeze;
3330 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3331 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3332 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3333 cgfsng_ops->driver = "cgfsng";
3334 cgfsng_ops->version = "1.0.0";
3335 cgfsng_ops->attach = cgfsng_attach;
3336 cgfsng_ops->chown = cgfsng_chown;
3337 cgfsng_ops->mount = cgfsng_mount;
3338 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3339 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
2202afc9 3340
ff9edd2d
CB
3341 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3342 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3343 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3344
a64edc1c 3345 return move_ptr(cgfsng_ops);
2202afc9 3346}
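/*
 * Editor's hedged usage sketch (not part of the original file): a caller
 * holding a struct lxc_conf obtains the ops table from cgroup_ops_init()
 * and drives the driver through its function pointers, e.g. data_init()
 * as wired up above. Releasing the ops table is handled elsewhere in LXC
 * and is omitted here.
 */
static void example_use_ops(struct lxc_conf *conf)
{
        struct cgroup_ops *ops;

        ops = cgroup_ops_init(conf);
        if (!ops)
                return;

        INFO("Using cgroup driver %s with layout %d", ops->driver, ops->cgroup_layout);

        if (ops->data_init(ops) < 0)
                WARN("Failed to copy system-wide cgroup information");
}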
be835470 3347
029d8e88
CB
3348int cgroup_attach(const struct lxc_conf *conf, const char *name,
3349 const char *lxcpath, pid_t pid)
3350{
3351 __do_close int unified_fd = -EBADF;
3352 int ret;
3353
88c27c53 3354 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
029d8e88
CB
3355 return ret_errno(EINVAL);
3356
3357 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3358 if (unified_fd < 0)
6b55ce0e 3359 return ret_errno(ENOCGROUP2);
029d8e88
CB
3360
3361 if (!lxc_list_empty(&conf->id_map)) {
3362 struct userns_exec_unified_attach_data args = {
3363 .conf = conf,
3364 .unified_fd = unified_fd,
3365 .pid = pid,
3366 };
3367
3368 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3369 if (ret < 0)
3370 return -errno;
3371
3372 ret = userns_exec_minimal(conf,
3373 cgroup_unified_attach_parent_wrapper,
3374 &args,
3375 cgroup_unified_attach_child_wrapper,
3376 &args);
3377 } else {
3378 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3379 }
3380
3381 return ret;
3382}
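/*
 * Editor's hedged usage sketch (not part of the original file): moving an
 * already-running process into the payload cgroup of a running container.
 * The container name and lxcpath below are placeholders.
 */
static int example_attach_pid(const struct lxc_conf *conf, pid_t pid)
{
        int ret;

        ret = cgroup_attach(conf, "mycontainer", "/var/lib/lxc", pid);
        if (ret < 0)
                return log_error_errno(ret, errno, "Failed to attach %d to cgroup", pid);

        return 0;
}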
3383
751a624f 3384/* Connects to the command socket and therefore isn't callable from a command handler. */
bfe2971a 3385int cgroup_get(const char *name, const char *lxcpath,
be835470
CB
3386 const char *filename, char *buf, size_t len)
3387{
3388 __do_close int unified_fd = -EBADF;
3389 ssize_t ret;
3390
bfe2971a 3391 if (is_empty_string(filename) || is_empty_string(name) ||
be835470
CB
3392 is_empty_string(lxcpath))
3393 return ret_errno(EINVAL);
3394
3395 if ((buf && !len) || (len && !buf))
3396 return ret_errno(EINVAL);
3397
ae4fcc7b 3398 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
be835470
CB
3399 if (unified_fd < 0)
3400 return ret_errno(ENOCGROUP2);
3401
3402 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3403 if (ret < 0)
3404 SYSERROR("Failed to read cgroup value");
3405
3406 return ret;
3407}
3408
751a624f 3409/* Connects to the command socket and therefore isn't callable from a command handler. */
bfe2971a 3410int cgroup_set(const char *name, const char *lxcpath,
be835470
CB
3411 const char *filename, const char *value)
3412{
3413 __do_close int unified_fd = -EBADF;
3414 ssize_t ret;
3415
bfe2971a 3416 if (is_empty_string(filename) || is_empty_string(value) ||
be835470
CB
3417 is_empty_string(name) || is_empty_string(lxcpath))
3418 return ret_errno(EINVAL);
3419
ae4fcc7b 3420 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
be835470
CB
3421 if (unified_fd < 0)
3422 return ret_errno(ENOCGROUP2);
3423
aa72fbe7 3424 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
be835470
CB
3425 struct device_item device = {};
3426
3427 ret = device_cgroup_rule_parse(&device, filename, value);
3428 if (ret < 0)
3429 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3430
3431 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3432 } else {
3433 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3434 }
3435
3436 return ret;
3437}
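/*
 * Editor's hedged usage sketch (not part of the original file): writing
 * cgroup2 knobs of a running container. Regular files are written directly
 * into the unified hierarchy, while devices.* keys, which have no cgroup2
 * file, are translated into a BPF device program by the branch above. The
 * limit and device rule below are placeholders.
 */
static int example_set_limits(const char *name, const char *lxcpath)
{
        int ret;

        /* 512 MiB memory limit. */
        ret = cgroup_set(name, lxcpath, "memory.max", "536870912");
        if (ret < 0)
                return ret;

        /* Routed through lxc_cmd_add_bpf_device_cgroup() internally. */
        return cgroup_set(name, lxcpath, "devices.deny", "c 1:3 rwm");
}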
c8af3332 3438
c9c814f4
CB
3439static int do_cgroup_freeze(int unified_fd,
3440 const char *state_string,
3441 int state_num,
3442 int timeout,
3443 const char *epoll_error,
3444 const char *wait_error)
c8af3332
CB
3445{
3446 __do_close int events_fd = -EBADF;
3447 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3448 int ret;
3449 struct lxc_epoll_descr descr = {};
3450
3451 if (timeout != 0) {
3452 ret = lxc_mainloop_open(&descr);
3453 if (ret)
3454 return log_error_errno(-1, errno, "%s", epoll_error);
3455
3456 /* automatically cleaned up now */
3457 descr_ptr = &descr;
3458
3459 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3460 if (events_fd < 0)
3461 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3462
3463 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3464 if (ret < 0)
3465 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3466 }
3467
3468 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3469 if (ret < 0)
3470 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
3471
3472 if (timeout != 0) {
3473 ret = lxc_mainloop(&descr, timeout);
3474 if (ret)
3475 return log_error_errno(-1, errno, "%s", wait_error);
3476 }
3477
3478 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3479}
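/*
 * Editor's hedged illustration (not part of the original file): the helper
 * above waits for the freezer transition by polling cgroup.events with
 * EPOLLPRI, since the kernel raises a priority event whenever the "frozen"
 * key in that file changes. A minimal standalone check of the same
 * condition, without the LXC mainloop, could look like this.
 */
static bool example_cgroup_is_frozen(int unified_fd)
{
        __do_close int fd = -EBADF;
        char buf[512] = {0};
        ssize_t bytes;

        fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
        if (fd < 0)
                return false;

        bytes = read(fd, buf, sizeof(buf) - 1);
        if (bytes < 0)
                return false;

        return strstr(buf, "frozen 1") != NULL;
}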
3480
c9c814f4
CB
3481static inline int __cgroup_freeze(int unified_fd, int timeout)
3482{
3483 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3484 "Failed to create epoll instance to wait for container freeze",
3485 "Failed to wait for container to be frozen");
3486}
3487
5ef7547f 3488int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
c8af3332
CB
3489{
3490 __do_close int unified_fd = -EBADF;
3491 int ret;
3492
b57f9b13
CB
3493 if (is_empty_string(name) || is_empty_string(lxcpath))
3494 return ret_errno(EINVAL);
3495
ae4fcc7b 3496 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
c8af3332
CB
3497 if (unified_fd < 0)
3498 return ret_errno(ENOCGROUP2);
3499
3500 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
c9c814f4 3501 ret = __cgroup_freeze(unified_fd, timeout);
c8af3332 3502 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
5ef7547f 3503 return ret;
c8af3332
CB
3504}
3505
c9c814f4
CB
3506int __cgroup_unfreeze(int unified_fd, int timeout)
3507{
3508 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3509 "Failed to create epoll instance to wait for container freeze",
3510 "Failed to wait for container to be frozen");
3511}
3512
5ef7547f 3513int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
c8af3332
CB
3514{
3515 __do_close int unified_fd = -EBADF;
3516 int ret;
3517
b57f9b13
CB
3518 if (is_empty_string(name) || is_empty_string(lxcpath))
3519 return ret_errno(EINVAL);
3520
ae4fcc7b 3521 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
c8af3332
CB
3522 if (unified_fd < 0)
3523 return ret_errno(ENOCGROUP2);
3524
3525 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
c9c814f4 3526 ret = __cgroup_unfreeze(unified_fd, timeout);
c8af3332 3527 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
5ef7547f 3528 return ret;
c8af3332 3529}
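/*
 * Editor's hedged usage sketch (not part of the original file): freezing a
 * container around a short critical section and thawing it again. A timeout
 * of 0 skips waiting for the state transition (see do_cgroup_freeze()
 * above); the positive value here, which the epoll-based mainloop is
 * expected to treat as milliseconds, is illustrative.
 */
static int example_freeze_cycle(const char *name, const char *lxcpath)
{
        int ret;

        ret = cgroup_freeze(name, lxcpath, 5000);
        if (ret < 0)
                return log_error_errno(ret, errno, "Failed to freeze %s", name);

        /* ... inspect or snapshot the frozen container here ... */

        return cgroup_unfreeze(name, lxcpath, 5000);
}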