]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
build: add src/include to build and simplify header inclusions
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
646b75b5
CB
15#include "config.h"
16
d38dd64a
CB
17#ifndef _GNU_SOURCE
18#define _GNU_SOURCE 1
19#endif
646b75b5 20
a54694f8
CB
21#include <ctype.h>
22#include <dirent.h>
23#include <errno.h>
24#include <grp.h>
d38dd64a
CB
25#include <linux/kdev_t.h>
26#include <linux/types.h>
942e193e
CB
27#include <poll.h>
28#include <signal.h>
a54694f8 29#include <stdint.h>
ccb4cabe
SH
30#include <stdio.h>
31#include <stdlib.h>
a54694f8 32#include <string.h>
385e58e8 33#include <sys/epoll.h>
438c4581 34#include <sys/types.h>
d38dd64a 35#include <unistd.h>
c8bf519d 36
d1783ef4 37#include "af_unix.h"
b635e92d 38#include "caps.h"
ccb4cabe 39#include "cgroup.h"
bf651989 40#include "cgroup2_devices.h"
6328fd9c 41#include "cgroup_utils.h"
ccb4cabe 42#include "commands.h"
c8af3332 43#include "commands_utils.h"
43654d34 44#include "conf.h"
38fa7e47 45#include "error_utils.h"
a54694f8 46#include "log.h"
c19ad94b 47#include "macro.h"
018051e3 48#include "mainloop.h"
861cb8c2 49#include "memory_utils.h"
74ed30d7 50#include "mount_utils.h"
43654d34 51#include "storage/storage.h"
600a0163 52#include "string_utils.h"
315f8a4e 53#include "syscall_wrappers.h"
a54694f8 54#include "utils.h"
ccb4cabe 55
64e82f8b 56#ifndef HAVE_STRLCPY
58db1a61 57#include "strlcpy.h"
64e82f8b
DJ
58#endif
59
3ebe2fbd 60#ifndef HAVE_STRLCAT
58db1a61 61#include "strlcat.h"
3ebe2fbd
DJ
62#endif
63
ac2cecc4 64lxc_log_define(cgfsng, cgroup);
ccb4cabe 65
/*
 * Grow a NULL-terminated pointer array by one slot.
 *
 * Reallocates *list so that one more entry fits in front of the
 * terminating NULL and keeps the array NULL-terminated. Does not fail
 * destructively: on allocation error the old array stays valid and a
 * negative errno-style value is returned. On success the index of the
 * freshly usable slot (the former terminator position) is returned.
 */
static int cg_list_add(void ***list)
{
	int used = 0;
	void **grown;

	/* Count the entries currently in use. */
	if (*list)
		while ((*list)[used])
			used++;

	/* One extra slot for the new entry plus the NULL terminator. */
	grown = realloc(*list, (used + 2) * sizeof(void **));
	if (!grown)
		return ret_errno(ENOMEM);

	grown[used + 1] = NULL;
	*list = grown;

	return used;
}
90
/* Given a NULL-terminated array of strings, check whether @entry is one of
 * the strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **cur = list; *cur; cur++)
		if (strequal(*cur, entry))
			return true;

	return false;
}
105
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 *
 * A NULL @controller requests the empty unified hierarchy. On the pure
 * cgroup2 layout the legacy "devices" and "freezer" controller names are
 * mapped onto their cgroup2 utility replacements. On failure errno is set
 * to ENOENT and NULL is returned.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				/* cgroup2 replaces the devices controller with bpf programs. */
				if (device_utility_controller(ops->unified))
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				/* cgroup2 exposes freezing via cgroup.freeze. */
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
152
abb6f657
CB
153int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
154{
155 int dfd;
156 const struct hierarchy *h;
157
158 h = get_hierarchy(ops, fd->controller);
159 if (!h)
160 return ret_errno(ENOENT);
161
162 /*
163 * The client requested that the controller must be in a specific
164 * cgroup version.
165 */
166 if (fd->type != 0 && fd->type != h->fs_type)
167 return ret_errno(EINVAL);
168
169 if (limit)
170 dfd = h->dfd_con;
171 else
172 dfd = h->dfd_lim;
173 if (dfd < 0)
174 return ret_errno(EBADF);
175
176 fd->layout = ops->cgroup_layout;
177 fd->type = h->fs_type;
178 if (fd->type == UNIFIED_HIERARCHY)
179 fd->utilities = h->utilities;
180 fd->fd = dfd;
181
182 return 0;
183}
184
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in the bit array @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	/* Unsigned constant: 1 << 31 on a signed int is undefined behavior. */
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return whether bit @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
204
/* Create cpumask from cpulist aka turn:
 *
 * 0,2-3
 *
 * into bit array
 *
 * 1 0 1 1
 *
 * Parses the comma-separated list of cpu indices/ranges in @buf (which is
 * mutated by the tokenizer) into a freshly allocated bit array returned via
 * @bitarr; the highest bit that was set is returned via @last_set_bit.
 * Starts with room for 256 bits and grows on demand. Returns 0 on success,
 * negative errno-style value on failure.
 */
static int lxc_cpumask(char *buf, uint32_t **bitarr, size_t *last_set_bit)
{
	__do_free uint32_t *arr_u32 = NULL;
	size_t cur_last_set_bit = 0, nbits = 256;
	size_t nr_u32;
	char *token;

	nr_u32 = BITS_TO_LONGS(nbits);
	arr_u32 = zalloc(nr_u32 * sizeof(uint32_t));
	if (!arr_u32)
		return ret_errno(ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		unsigned last_bit, first_bit;
		char *range;

		errno = 0;
		first_bit = strtoul(token, NULL, 0);
		last_bit = first_bit;
		/* A token of the form "a-b" describes an inclusive range. */
		range = strchr(token, '-');
		if (range)
			last_bit = strtoul(range + 1, NULL, 0);

		/* Reject inverted ranges such as "3-1". */
		if (!(first_bit <= last_bit))
			return ret_errno(EINVAL);

		if (last_bit >= nbits) {
			/* Grow the bit array so @last_bit fits. */
			size_t add_bits = last_bit - nbits + 32;
			size_t new_nr_u32;
			uint32_t *p;

			new_nr_u32 = BITS_TO_LONGS(nbits + add_bits);
			p = realloc(arr_u32, new_nr_u32 * sizeof(uint32_t));
			if (!p)
				return ret_errno(ENOMEM);
			arr_u32 = move_ptr(p);

			/* Zero the newly added tail. */
			memset(arr_u32 + nr_u32, 0,
			       (new_nr_u32 - nr_u32) * sizeof(uint32_t));
			nbits += add_bits;
		}

		while (first_bit <= last_bit)
			set_bit(first_bit++, arr_u32);

		if (last_bit > cur_last_set_bit)
			cur_last_set_bit = last_bit;
	}

	*last_set_bit = cur_last_set_bit;
	*bitarr = move_ptr(arr_u32);
	return 0;
}
266
4d8f68fb
CB
267static int lxc_cpumask_update(char *buf, uint32_t *bitarr, size_t last_set_bit,
268 bool clear)
269{
270 bool flipped = false;
271 char *token;
272
273 lxc_iterate_parts(token, buf, ",") {
274 unsigned last_bit, first_bit;
275 char *range;
276
277 errno = 0;
278 first_bit = strtoul(token, NULL, 0);
279 last_bit = first_bit;
280 range = strchr(token, '-');
281 if (range)
282 last_bit = strtoul(range + 1, NULL, 0);
283
284 if (!(first_bit <= last_bit)) {
285 WARN("The cup range seems to be inverted: %u-%u", first_bit, last_bit);
286 continue;
287 }
288
289 if (last_bit > last_set_bit)
290 continue;
291
292 while (first_bit <= last_bit) {
293 if (clear && is_set(first_bit, bitarr)) {
294 flipped = true;
295 clear_bit(first_bit, bitarr);
296 } else if (!clear && !is_set(first_bit, bitarr)) {
297 flipped = true;
298 set_bit(first_bit, bitarr);
299 }
300
301 first_bit++;
302 }
303 }
304
305 if (flipped)
306 return 1;
307
308 return 0;
309}
310
/* Turn cpumask into simple, comma-separated cpulist.
 *
 * Walks bits 0..@last_set_bit of @bitarr and joins the indices of set bits
 * into a freshly allocated string such as "0,2,3" (no range compression).
 * Returns NULL (with errno set to ENOMEM for the empty/allocation cases)
 * on failure; caller frees the result.
 */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t last_set_bit)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
	int ret;

	for (size_t i = 0; i <= last_set_bit; i++) {
		if (!is_set(i, bitarr))
			continue;

		ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0)
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	/* An all-zero mask yields no entries; treat it as an error. */
	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}
336
77c3e9a2 337static inline bool is_unified_hierarchy(const struct hierarchy *h)
c04a6d4e 338{
b8572e8c 339 return h->fs_type == UNIFIED_HIERARCHY;
c04a6d4e
CB
340}
341
f57ac67f
CB
342/* Return true if the controller @entry is found in the null-terminated list of
343 * hierarchies @hlist.
ccb4cabe 344 */
c7a1f72a 345static bool controller_available(struct hierarchy **hlist, char *entry)
ccb4cabe 346{
ccb4cabe
SH
347 if (!hlist)
348 return false;
349
77c3e9a2 350 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
351 if (string_in_list(hlist[i]->controllers, entry))
352 return true;
d6337a5f 353
ccb4cabe
SH
354 return false;
355}
356
c7a1f72a 357static bool controllers_available(struct cgroup_ops *ops)
ccb4cabe 358{
77c3e9a2 359 struct hierarchy **hlist;
ccb4cabe 360
2202afc9 361 if (!ops->cgroup_use)
ccb4cabe 362 return true;
c2712f64 363
77c3e9a2
CB
364 hlist = ops->hierarchies;
365 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
c7a1f72a
CB
366 if (!controller_available(hlist, *cur))
367 return log_error(false, "The %s controller found", *cur);
c2712f64 368
ccb4cabe
SH
369 return true;
370}
371
63ba9eaf 372static char **list_new(void)
ccb4cabe 373{
63ba9eaf
CB
374 __do_free_string_list char **list = NULL;
375 int idx;
376
4780b5e7 377 idx = cg_list_add((void ***)&list);
63ba9eaf
CB
378 if (idx < 0)
379 return NULL;
a55f31bd 380
63ba9eaf
CB
381 list[idx] = NULL;
382 return move_ptr(list);
35ec1a38 383}
d6337a5f 384
63ba9eaf 385static int list_add_string(char ***list, char *entry)
35ec1a38 386{
63ba9eaf
CB
387 __do_free char *dup = NULL;
388 int idx;
389
390 dup = strdup(entry);
391 if (!dup)
392 return ret_errno(ENOMEM);
393
4780b5e7 394 idx = cg_list_add((void ***)list);
63ba9eaf
CB
395 if (idx < 0)
396 return idx;
397
398 (*list)[idx] = move_ptr(dup);
399 return 0;
400}
401
/*
 * Split the comma/whitespace separated controller string @controllers into
 * a freshly allocated NULL-terminated string list. Returns NULL on
 * allocation failure. Note: @controllers is mutated by the tokenizer.
 */
static char **list_add_controllers(char *controllers)
{
	__do_free_string_list char **list = NULL;
	char *it;

	lxc_iterate_parts(it, controllers, ", \t\n") {
		int ret;

		ret = list_add_string(&list, it);
		if (ret < 0)
			return NULL;
	}

	return move_ptr(list);
}
417
35ec1a38 418static char **unified_controllers(int dfd, const char *file)
d6337a5f 419{
d97919ab 420 __do_free char *buf = NULL;
d6337a5f 421
46bf13b7 422 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
d6337a5f 423 if (!buf)
411ac6d8 424 return NULL;
6328fd9c 425
63ba9eaf 426 return list_add_controllers(buf);
ccb4cabe
SH
427}
428
35ec1a38 429static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
060e54d6
CB
430{
431 if (!ops->cgroup_use)
35ec1a38 432 return false;
060e54d6
CB
433
434 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
435 bool found = false;
436
437 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
438 if (!strequal(*cur_use, *cur_ctrl))
439 continue;
440
441 found = true;
442 break;
443 }
444
445 if (found)
446 continue;
447
35ec1a38 448 return true;
060e54d6
CB
449 }
450
35ec1a38 451 return false;
060e54d6
CB
452}
453
179754a2
CB
454static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
455 int dfd_base, char *base_cgroup,
b8572e8c 456 char **controllers, cgroupfs_type_magic_t fs_type)
ccb4cabe 457{
600a0163 458 __do_free struct hierarchy *new = NULL;
701be30e 459 int idx;
ccb4cabe 460
35ec1a38 461 if (abspath(base_cgroup))
060aaa39 462 return syserror_set(-EINVAL, "Container base path must be relative to controller mount");
060e54d6 463
1973b62a 464 new = zalloc(sizeof(*new));
6e214b74 465 if (!new)
060e54d6 466 return ret_errno(ENOMEM);
c72e7cb5 467
e33870e5 468 new->dfd_con = -EBADF;
c0af7b1c 469 new->dfd_lim = -EBADF;
6a32c817 470 new->dfd_mon = -EBADF;
600a0163 471
44585f1a
CB
472 new->fs_type = fs_type;
473 new->controllers = controllers;
a58be2ad 474 new->at_mnt = mnt;
44585f1a 475 new->at_base = base_cgroup;
35ec1a38 476
44585f1a
CB
477 new->dfd_mnt = dfd_mnt;
478 new->dfd_base = dfd_base;
35ec1a38
CB
479
480 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
481 mnt, maybe_empty(base_cgroup));
060e54d6 482 for (char *const *it = new->controllers; it && *it; it++)
35ec1a38 483 TRACE("The hierarchy contains the %s controller", *it);
6328fd9c 484
4780b5e7 485 idx = cg_list_add((void ***)&ops->hierarchies);
63ba9eaf
CB
486 if (idx < 0)
487 return ret_errno(idx);
488
b8572e8c 489 if (fs_type == UNIFIED_HIERARCHY)
060e54d6 490 ops->unified = new;
701be30e 491 (ops->hierarchies)[idx] = move_ptr(new);
ccb4cabe 492
63ba9eaf 493 return 0;
ccb4cabe
SH
494}
495
/*
 * Prune the cgroup tree @path_prune below every hierarchy's base fd and
 * release the cached per-hierarchy cgroup paths. Failures are logged but
 * do not abort the loop. Always returns 0.
 */
static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
{
	if (!path_prune || !hierarchies)
		return 0;

	for (int i = 0; hierarchies[i]; i++) {
		struct hierarchy *h = hierarchies[i];
		int ret;

		ret = cgroup_tree_prune(h->dfd_base, path_prune);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);

		free_equal(h->path_lim, h->path_con);
	}

	return 0;
}
516
/* Argument bundle for helpers executed via userns_exec_1(). */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies;	/* NULL-terminated hierarchy list */
	const char *path_prune;		/* cgroup path to prune */
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};
d6337a5f 524
/*
 * userns_exec_1() callback: drop supplementary groups, switch to the
 * container's root (or init) ids inside the user namespace, then prune
 * the cgroup tree. Returns 0 on success, -1 on failure.
 */
static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	/* With an id mapping present, uid/gid 0 maps to the container root. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	/* EPERM is tolerated: we may lack CAP_SETGID in this namespace. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}
547
/*
 * Tear down the container's (payload) cgroups: detach the cgroup2 device
 * bpf program and prune the limit cgroup tree, re-entering the container's
 * user namespace first when an id mapping is configured.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing was ever created for this container. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!list_empty(&handler->conf->id_map)) {
		/* Id-mapped container: prune from inside the user namespace. */
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
595
033267c9
CB
596#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
597#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
598static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
599 bool am_initialized)
434c8e15 600{
033267c9
CB
601 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
602 *offlinecpus = NULL, *posscpus = NULL;
4d8f68fb 603 __do_free uint32_t *possmask = NULL;
033267c9 604 int ret;
4d8f68fb 605 size_t poss_last_set_bit = 0;
b376d3d0 606
033267c9
CB
607 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
608 if (!posscpus)
609 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
610
033267c9
CB
611 if (file_exists(__ISOL_CPUS)) {
612 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
613 if (!isolcpus)
614 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
615
f5bc57d2
CB
616 if (!isdigit(isolcpus[0]))
617 free_disarm(isolcpus);
033267c9
CB
618 } else {
619 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
fc3b9533 620 }
434c8e15 621
033267c9
CB
622 if (file_exists(__OFFLINE_CPUS)) {
623 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
624 if (!offlinecpus)
625 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
434c8e15 626
f5bc57d2
CB
627 if (!isdigit(offlinecpus[0]))
628 free_disarm(offlinecpus);
033267c9
CB
629 } else {
630 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
fc3b9533 631 }
b376d3d0 632
f5bc57d2 633 if (!isolcpus && !offlinecpus) {
033267c9
CB
634 cpulist = move_ptr(posscpus);
635 goto copy_parent;
fc3b9533 636 }
1973b62a 637
f5bc57d2
CB
638 ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit);
639 if (ret)
033267c9 640 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
434c8e15 641
4d8f68fb
CB
642 if (isolcpus)
643 ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true);
434c8e15 644
4d8f68fb
CB
645 if (offlinecpus)
646 ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true);
c468e4d4 647
4d8f68fb 648 if (!ret) {
f5bc57d2 649 cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit);
033267c9
CB
650 TRACE("No isolated or offline cpus present in cpuset");
651 } else {
652 cpulist = move_ptr(posscpus);
653 TRACE("Removed isolated or offline cpus from cpuset");
654 }
655 if (!cpulist)
656 return log_error_errno(false, errno, "Failed to create cpu list");
1973b62a 657
033267c9
CB
658copy_parent:
659 if (!am_initialized) {
660 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
661 if (ret < 0)
662 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
77ffeed2 663
033267c9
CB
664 TRACE("Copied cpu settings of parent cgroup");
665 }
77ffeed2 666
033267c9
CB
667 return true;
668}
1973b62a 669
/*
 * Prepare the legacy (v1) cpuset controller for a child cgroup @dfd_next
 * below @dfd_base: copy cpuset.cpus (minus isolated/offline cpus) and
 * cpuset.mems from the parent and turn on cgroup.clone_children so
 * deeper cgroups inherit automatically. Returns true on success.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/* Determine whether the base cgroup has cpuset inheritance turned on. */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/* Initialize cpuset.cpus removing any isolated and offline cpus. */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
702
/*
 * Create the (relative, possibly multi-component) cgroup path @path below
 * @dfd_base, component by component. With @cpuset_v1, legacy cpuset
 * initialization is performed on the first newly created level. Unless
 * @eexist_ignore is set, the final component must not pre-exist.
 * Returns an O_PATH fd to the final cgroup, or a negative value on error.
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component already existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
764
/*
 * Create the container's cgroup(s) in hierarchy @h and record the
 * resulting fds/paths on @h. With @payload and a separate @cgroup_leaf,
 * a limiting cgroup is created with the payload cgroup nested below it;
 * otherwise a single cgroup serves as both (payload) or as the monitor
 * cgroup. Returns true on success.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		h->dfd_lim = move_fd(fd_limit);

		TRACE("Created limit cgroup %d->%d(%s)",
		      h->dfd_lim, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_warn(false, "Failed to setup legacy device limits");

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
		}
		h->dfd_con = move_fd(fd_final);
		h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);

	} else {
		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_final < 0)
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

		if (payload) {
			/* No isolation: limit and payload cgroup are the same. */
			h->dfd_con = move_fd(fd_final);
			h->dfd_lim = h->dfd_con;
			h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

			h->path_lim = h->path_con;
		} else {
			h->dfd_mon = move_fd(fd_final);
		}
	}

	return true;
}
834
/*
 * Close the cached fds for the payload (@payload true) or monitor cgroup
 * of hierarchy @h and, if we actually created that cgroup, prune
 * @path_prune below the hierarchy's base. Failures are only logged.
 */
static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
				   bool payload)
{
	bool prune = true;

	if (payload) {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_lim < 0)
			prune = false;

		free_equal(h->path_con, h->path_lim);
		close_equal(h->dfd_con, h->dfd_lim);
	} else {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_mon < 0)
			prune = false;

		close_prot_errno_disarm(h->dfd_mon);
	}

	/* We didn't create this cgroup. */
	if (!prune)
		return;

	if (cgroup_tree_prune(h->dfd_base, path_prune))
		SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
	else
		TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
}
864
/*
 * Tear down the monitor cgroup: move the still-running monitor process
 * into a pivot cgroup in each hierarchy, then prune the monitor cgroup
 * tree. Failures for individual hierarchies are logged and skipped.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing was ever created. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the configured pivot location, most specific first. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		/* The pivot cgroup may already exist; that is fine here. */
		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		/* Move the monitor out so its cgroup can be removed. */
		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
942
/*
 * Check we have no lxc.cgroup.dir when the split lxc.cgroup.dir.monitor /
 * lxc.cgroup.dir.container / lxc.cgroup.dir.namespace options are used,
 * and that monitor and container dirs are both set when either is.
 *
 * Returns true when the configuration is consistent, false otherwise
 * (with errno set to EINVAL).
 */
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
	const char *monitor_dir = conf->cgroup_meta.monitor_dir,
		   *container_dir = conf->cgroup_meta.container_dir,
		   *namespace_dir = conf->cgroup_meta.namespace_dir;

	/* none of the new options are set, all is fine */
	if (!monitor_dir && !container_dir && !namespace_dir)
		return true;

	/* some are set, make sure lxc.cgroup.dir is not also set*/
	if (conf->cgroup_meta.dir)
		return log_error_errno(false, EINVAL,
			"lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

	/* make sure both monitor and payload are set */
	if (!monitor_dir || !container_dir)
		return log_error_errno(false, EINVAL,
			"lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

	/* namespace_dir may be empty */
	return true;
}
972
59eac805 973__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
72068e74 974{
dcf6a5c7 975 __do_free char *monitor_cgroup = NULL;
fe70edee
CB
976 int idx = 0;
977 int i;
5ce03bc0 978 size_t len;
a900cbaf 979 char *suffix = NULL;
0d66e29a 980 struct lxc_conf *conf;
72068e74 981
0d66e29a
CB
982 if (!ops)
983 return ret_set_errno(false, ENOENT);
e56639fb 984
69b4a4bb
CB
985 if (!ops->hierarchies)
986 return true;
987
0d66e29a
CB
988 if (ops->monitor_cgroup)
989 return ret_set_errno(false, EEXIST);
990
991 if (!handler || !handler->conf)
992 return ret_set_errno(false, EINVAL);
993
994 conf = handler->conf;
995
a900cbaf
WB
996 if (!check_cgroup_dir_config(conf))
997 return false;
998
999 if (conf->cgroup_meta.monitor_dir) {
a900cbaf
WB
1000 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
1001 } else if (conf->cgroup_meta.dir) {
fe70edee
CB
1002 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1003 DEFAULT_MONITOR_CGROUP_PREFIX,
1004 handler->name,
1005 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1006 } else if (ops->cgroup_pattern) {
dcf6a5c7
CB
1007 __do_free char *cgroup_tree = NULL;
1008
1009 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1010 if (!cgroup_tree)
d6bdd182
CB
1011 return ret_set_errno(false, ENOMEM);
1012
d6bdd182
CB
1013 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1014 DEFAULT_MONITOR_CGROUP,
b3ed2061
CB
1015 CGROUP_CREATE_RETRY, NULL);
1016 } else {
fe70edee
CB
1017 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1018 handler->name,
1019 CGROUP_CREATE_RETRY, NULL);
b3ed2061 1020 }
fe70edee 1021 if (!monitor_cgroup)
0d66e29a 1022 return ret_set_errno(false, ENOMEM);
72068e74 1023
a900cbaf
WB
1024 if (!conf->cgroup_meta.monitor_dir) {
1025 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1026 *suffix = '\0';
1027 }
5ce03bc0 1028 do {
a900cbaf 1029 if (idx && suffix)
fe70edee 1030 sprintf(suffix, "-%d", idx);
72068e74 1031
ebc10afe 1032 for (i = 0; ops->hierarchies[i]; i++) {
432faf20 1033 if (cgroup_tree_create(ops, handler->conf,
dcf6a5c7 1034 ops->hierarchies[i],
6fec4327 1035 monitor_cgroup, NULL, false))
fe70edee
CB
1036 continue;
1037
7064ee3a 1038 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
6c880cdf
CB
1039 for (int j = 0; j <= i; j++)
1040 cgroup_tree_prune_leaf(ops->hierarchies[j],
1041 monitor_cgroup, false);
fe70edee
CB
1042
1043 idx++;
1044 break;
5ce03bc0 1045 }
a900cbaf 1046 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
5ce03bc0 1047
a900cbaf 1048 if (idx == 1000 || (!suffix && idx != 0))
04a49a14 1049 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
72068e74 1050
c581d2a6 1051 ops->monitor_cgroup = move_ptr(monitor_cgroup);
6e8703a4 1052 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
ccb4cabe
SH
1053}
1054
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * Creates both the "limit" cgroup (where resource limits are applied) and the
 * inner container cgroup. Unless lxc.cgroup.dir.container +
 * lxc.cgroup.dir.namespace are configured, the two are the same cgroup and
 * the pointers alias each other.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	/*
	 * __limit_cgroup stays non-NULL only in the "separate limit cgroup"
	 * branch; elsewhere ownership is transferred so the cleanup attribute
	 * doesn't double-free.
	 */
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL; /* points at the in-place retry suffix, if any */
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Inner cgroup nested below the limit cgroup. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		/* Substitute the container name for "%n" in the pattern. */
		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.container_dir) {
		/* Chop off the CGROUP_CREATE_RETRY placeholder; retries
		 * rewrite it in place as "-<idx>". */
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			/* Roll back the cgroups created so far this round. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	/* Either retries exhausted or a fixed (non-suffixable) name collided. */
	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1164
c581d2a6
CB
/*
 * Move the monitor process (and, if present, the transient startup process)
 * into the previously created monitor cgroup of every hierarchy by writing
 * their PIDs to "cgroup.procs".
 *
 * Side effect: closes h->dfd_mon for non-unified hierarchies once both
 * processes have been moved, and resets handler->transient_pid to -1.
 */
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	/* Only render the transient PID if there actually is one. */
	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved monitor into cgroup %d", h->dfd_mon);

		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved transient process into cgroup %d", h->dfd_mon);

		/*
		 * we don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->dfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}
1226
c581d2a6
CB
/*
 * Move the container's init process into its cgroup in every hierarchy by
 * writing its PID to "cgroup.procs". On the unified hierarchy this is skipped
 * when the process was already placed via clone3(CLONE_INTO_CGROUP).
 */
__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		/* Already placed by the kernel at clone time. */
		if (is_unified_hierarchy(h) &&
		    (handler->clone_flags & CLONE_INTO_CGROUP))
			continue;

		ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);

		TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
	}

	return true;
}
1266
1973b62a
CB
1267static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1268 gid_t chown_gid, mode_t chmod_mode)
6efacf80
CB
1269{
1270 int ret;
1271
1973b62a
CB
1272 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1273 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1274 if (ret < 0)
1275 return log_warn_errno(-1,
1276 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1277 dirfd, path, (int)chown_uid,
1278 (int)chown_gid);
6efacf80 1279
1973b62a
CB
1280 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1281 if (ret < 0)
1282 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1283 dirfd, path, (int)chmod_mode);
6efacf80
CB
1284
1285 return 0;
1286}
1287
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs as the userns_exec_1() callback, i.e. inside the container's user
 * namespace after dropping to the container's ids.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root id mapping we become uid/gid 0 in the namespace,
	 * otherwise fall back to the configured init ids. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* EPERM is tolerated: dropping groups may be disallowed here. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original (host) uid into this namespace. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		/* Empty path: chown/chmod the cgroup directory itself. */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		/* "tasks" only exists on legacy (v1) hierarchies. */
		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		/* On cgroup2 also hand over the delegatable files. */
		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1350
b857f4be 1351__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
c98bbf71 1352 struct lxc_conf *conf)
ccb4cabe 1353{
4160c3a0 1354 struct generic_userns_exec_data wrap;
ccb4cabe 1355
c98bbf71
CB
1356 if (!ops)
1357 return ret_set_errno(false, ENOENT);
ccb4cabe 1358
69b4a4bb
CB
1359 if (!ops->hierarchies)
1360 return true;
1361
c98bbf71
CB
1362 if (!ops->container_cgroup)
1363 return ret_set_errno(false, ENOENT);
1364
1365 if (!conf)
1366 return ret_set_errno(false, EINVAL);
1367
0589d744 1368 if (list_empty(&conf->id_map))
c98bbf71
CB
1369 return true;
1370
ccb4cabe 1371 wrap.origuid = geteuid();
4160c3a0 1372 wrap.path = NULL;
2202afc9 1373 wrap.hierarchies = ops->hierarchies;
4160c3a0 1374 wrap.conf = conf;
ccb4cabe 1375
c98bbf71
CB
1376 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1377 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1378
1379 return true;
1380}
1381
840eec19 1382__cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
78eb6aa6
CB
1383{
1384 if (!ops)
1385 return;
1386
1387 if (!ops->hierarchies)
1388 return;
1389
840eec19
CB
1390 for (int i = 0; ops->hierarchies[i]; i++) {
1391 struct hierarchy *h = ops->hierarchies[i];
1392
1393 /* Close all monitor cgroup file descriptors. */
1394 close_prot_errno_disarm(h->dfd_mon);
1395 }
1396 /* Close the cgroup root file descriptor. */
1397 close_prot_errno_disarm(ops->dfd_mnt);
1398
6dcd6f02
CB
1399 /*
1400 * The checking for freezer support should obviously be done at cgroup
1401 * initialization time but that doesn't work reliable. The freezer
1402 * controller has been demoted (rightly so) to a simple file located in
1403 * each non-root cgroup. At the time when the container is created we
1404 * might still be located in /sys/fs/cgroup and so checking for
1405 * cgroup.freeze won't tell us anything because this file doesn't exist
1406 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1407 * find an already existing cgroup and then check within that cgroup
1408 * for the existence of cgroup.freeze but that will only work on
1409 * systemd based hosts. Other init systems might not manage cgroups and
1410 * so no cgroup will exist. So we defer until we have created cgroups
1411 * for our container which means we check here.
1412 */
1413 if (pure_unified_layout(ops) &&
e33870e5 1414 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
6dcd6f02
CB
1415 AT_SYMLINK_NOFOLLOW)) {
1416 TRACE("Unified hierarchy supports freezer");
ca72ccb5 1417 ops->unified->utilities |= FREEZER_CONTROLLER;
6dcd6f02 1418 }
78eb6aa6
CB
1419}
1420
8aa1044f 1421/* cgroup-full:* is done, no need to create subdirs */
bd09ee98 1422static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
8aa1044f 1423{
bd09ee98 1424 switch (cgroup_automount_type) {
51feb8db
CB
1425 case LXC_AUTO_CGROUP_RO:
1426 return true;
1427 case LXC_AUTO_CGROUP_RW:
1428 return true;
1429 case LXC_AUTO_CGROUP_MIXED:
1430 return true;
1431 }
1432
1433 return false;
8aa1044f
SH
1434}
1435
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * For RO/MIXED the hierarchy mount itself is first bind-remounted read-only;
 * the container's own cgroup is then bind-mounted on top (read-only only for
 * the pure RO case, leaving it writable for MIXED).
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		/* Bind the hierarchy onto itself so it can be remounted ro. */
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		/* Preserve existing mount flags the kernel requires on remount. */
		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	/* Only pure RO makes the container's own cgroup read-only too. */
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		/* MS_RDONLY on a bind mount needs an explicit remount pass. */
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1489
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 *
 * Uses the new mount API (fsopen/fsconfig/move_mount) when available and
 * falls back to classic mount(2) otherwise. Returns 0 on success, a negative
 * value on error.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		if (!is_unified_hierarchy(h)) {
			/* Request each controller on the new mount; named
			 * hierarchies ("name=...") use the "name" property. */
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		/* Legacy mount(2) fallback. */
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		if (!is_unified_hierarchy(h)) {
			/* mount(2) takes the controller list as mount data. */
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* flags to their MS_* equivalents. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1568
/* Mount a fresh cgroup filesystem; thin wrapper around __cgroupfs_mount(). */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1576
bd09ee98 1577static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
14111650
CB
1578 struct lxc_rootfs *rootfs,
1579 int dfd_mnt_cgroupfs,
1580 const char *hierarchy_mnt)
6812d833 1581{
bd09ee98 1582 switch (cgroup_automount_type) {
51feb8db
CB
1583 case LXC_AUTO_CGROUP_FULL_RO:
1584 break;
1585 case LXC_AUTO_CGROUP_FULL_RW:
1586 break;
1587 case LXC_AUTO_CGROUP_FULL_MIXED:
1588 break;
1589 default:
6812d833 1590 return 0;
51feb8db 1591 }
6812d833 1592
bd09ee98
CB
1593 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1594 dfd_mnt_cgroupfs, hierarchy_mnt);
6812d833
CB
1595}
1596
b857f4be 1597__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
cdd3b77d 1598 struct lxc_handler *handler, int cg_flags)
ccb4cabe 1599{
9bca62b3 1600 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
6607d6e9 1601 __do_free char *cgroup_root = NULL;
bd09ee98 1602 int cgroup_automount_type;
937a3af9 1603 bool in_cgroup_ns = false, wants_force_mount = false;
ab8cd5d9 1604 struct lxc_conf *conf = handler->conf;
315f8a4e 1605 struct lxc_rootfs *rootfs = &conf->rootfs;
02efd041 1606 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
dfa835ac 1607 int ret;
8aa1044f 1608
9585ccb3
CB
1609 if (!ops)
1610 return ret_set_errno(false, ENOENT);
1611
69b4a4bb
CB
1612 if (!ops->hierarchies)
1613 return true;
1614
315f8a4e 1615 if (!conf)
9585ccb3
CB
1616 return ret_set_errno(false, EINVAL);
1617
cdd3b77d 1618 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
c581c8a3 1619 return log_trace(true, "No cgroup mounts requested");
8aa1044f 1620
69c29673
CB
1621 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1622 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
3f69fb12 1623 wants_force_mount = true;
69c29673
CB
1624 }
1625
1626 switch (cg_flags) {
1627 case LXC_AUTO_CGROUP_RO:
1628 TRACE("Read-only cgroup mounts requested");
1629 break;
1630 case LXC_AUTO_CGROUP_RW:
1631 TRACE("Read-write cgroup mounts requested");
1632 break;
1633 case LXC_AUTO_CGROUP_MIXED:
1634 TRACE("Mixed cgroup mounts requested");
1635 break;
1636 case LXC_AUTO_CGROUP_FULL_RO:
1637 TRACE("Full read-only cgroup mounts requested");
1638 break;
1639 case LXC_AUTO_CGROUP_FULL_RW:
1640 TRACE("Full read-write cgroup mounts requested");
1641 break;
1642 case LXC_AUTO_CGROUP_FULL_MIXED:
1643 TRACE("Full mixed cgroup mounts requested");
1644 break;
1645 default:
1646 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1647 }
bd09ee98 1648 cgroup_automount_type = cg_flags;
b635e92d 1649
4547e73e 1650 if (!wants_force_mount) {
315f8a4e 1651 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
4547e73e
CB
1652
1653 /*
1654 * Most recent distro versions currently have init system that
1655 * do support cgroup2 but do not mount it by default unless
1656 * explicitly told so even if the host is cgroup2 only. That
1657 * means they often will fail to boot. Fix this by pre-mounting
1658 * cgroup2 by default. We will likely need to be doing this a
1659 * few years until all distros have switched over to cgroup2 at
1660 * which point we can safely assume that their init systems
1661 * will mount it themselves.
1662 */
1663 if (pure_unified_layout(ops))
1664 wants_force_mount = true;
3f69fb12 1665 }
8aa1044f 1666
2c4348bd 1667 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
937a3af9 1668 in_cgroup_ns = true;
6768700d 1669
937a3af9 1670 if (in_cgroup_ns && !wants_force_mount)
3a86fb37 1671 return log_trace(true, "Mounting cgroups not requested or needed");
8aa1044f 1672
02efd041
CB
1673 /* This is really the codepath that we want. */
1674 if (pure_unified_layout(ops)) {
9bca62b3
CB
1675 __do_close int dfd_mnt_unified = -EBADF;
1676
1677 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1678 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1679 if (dfd_mnt_unified < 0)
9fc21b2d
CB
1680 return syserror_ret(false, "Failed to open %d(%s)",
1681 rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
e7e45fdf
CB
1682 /*
1683 * If cgroup namespaces are supported but the container will
1684 * not have CAP_SYS_ADMIN after it has started we need to mount
1685 * the cgroups manually.
a3e5ec26
CB
1686 *
1687 * Note that here we know that wants_force_mount is true.
1688 * Otherwise we would've returned early above.
e7e45fdf 1689 */
a3e5ec26
CB
1690 if (in_cgroup_ns) {
1691 /*
1692 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
1693 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
1694 * 3. cgroup:mixed:force -> See comment above how this
1695 * does not apply so
1696 * cgroup:mixed is equal to
1697 * cgroup:rw when cgroup
1698 * namespaces are supported.
1699
1700 * 4. cgroup:rw -> No-op; init system responsible for mounting.
1701 * 5. cgroup:ro -> No-op; init system responsible for mounting.
1702 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
1703 *
1704 * 7. cgroup-full:rw -> Not supported.
1705 * 8. cgroup-full:ro -> Not supported.
1706 * 9. cgroup-full:mixed -> Not supported.
1707
1708 * 10. cgroup-full:rw:force -> Not supported.
1709 * 11. cgroup-full:ro:force -> Not supported.
1710 * 12. cgroup-full:mixed:force -> Not supported.
1711 */
bd09ee98 1712 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
a3e5ec26 1713 if (ret < 0)
9fc21b2d 1714 return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");
a3e5ec26
CB
1715
1716 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
1717 } else {
1718 /*
1719 * Either no cgroup namespace supported (highly
1720 * unlikely unless we're dealing with a Frankenkernel.
1721 * Or the user requested to keep the cgroup namespace
1722 * of the host or another container.
1723 */
1724 if (wants_force_mount) {
1725 /*
1726 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
1727 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
1728 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
1729 * and make the parent directory of the
1730 * container's cgroup read-only but the
1731 * container's cgroup writable.
1732 *
1733 * 10. cgroup-full:rw:force ->
1734 * 11. cgroup-full:ro:force ->
1735 * 12. cgroup-full:mixed:force ->
1736 */
1737 errno = EOPNOTSUPP;
1738 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1739 } else {
1740 errno = EOPNOTSUPP;
1741 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1742 }
1743 }
8d661d38 1744
9fc21b2d 1745 return syserror_ret(false, "Failed to mount cgroups");
8d661d38
CB
1746 }
1747
e6d4df78
CB
1748 /*
1749 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
1750 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
1751 * DEFAULT_CGROUP_MOUNTPOINT define.
1752 */
de7f9f33 1753 if (can_use_mount_api()) {
635e7bac
CB
1754 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1755 if (fd_fs < 0)
1756 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
1757
23a20dbe
CB
1758 ret = fs_set_property(fd_fs, "mode", "0755");
1759 if (ret < 0)
1760 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1761
1762 ret = fs_set_property(fd_fs, "size", "10240k");
1763 if (ret < 0)
1764 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1765
1766 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1767 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
1768 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
1769 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
635e7bac
CB
1770 } else {
1771 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1772 ret = safe_mount(NULL, cgroup_root, "tmpfs",
1773 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1774 "size=10240k,mode=755", rootfs_mnt);
8b1f4dd9 1775 }
3f69fb12 1776 if (ret < 0)
02efd041
CB
1777 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
1778 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
8aa1044f 1779
9bca62b3
CB
1780 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1781 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1782 if (dfd_mnt_tmpfs < 0)
9fc21b2d
CB
1783 return syserror_ret(false, "Failed to open %d(%s)",
1784 rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
9bca62b3 1785
dfa835ac 1786 for (int i = 0; ops->hierarchies[i]; i++) {
a9db9474 1787 __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
2202afc9 1788 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1789
a58be2ad 1790 ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
d7314671 1791 if (ret < 0)
9fc21b2d 1792 return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
b635e92d 1793
937a3af9 1794 if (in_cgroup_ns && wants_force_mount) {
02efd041
CB
1795 /*
1796 * If cgroup namespaces are supported but the container
b635e92d
CB
1797 * will not have CAP_SYS_ADMIN after it has started we
1798 * need to mount the cgroups manually.
1799 */
a9db9474 1800 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
a58be2ad 1801 dfd_mnt_tmpfs, h->at_mnt);
3f69fb12 1802 if (ret < 0)
d7314671 1803 return false;
3f69fb12 1804
b635e92d
CB
1805 continue;
1806 }
1807
02efd041 1808 /* Here is where the ancient kernel section begins. */
a9db9474 1809 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
a58be2ad 1810 dfd_mnt_tmpfs, h->at_mnt);
d97919ab 1811 if (ret < 0)
d7314671 1812 return false;
3f69fb12 1813
bd09ee98 1814 if (!cg_mount_needs_subdirs(cgroup_automount_type))
8aa1044f 1815 continue;
3f69fb12 1816
f1921f35
CB
1817 if (!cgroup_root)
1818 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1819
a58be2ad 1820 hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
44585f1a 1821 path2 = must_make_path(hierarchy_mnt, h->at_base,
a9db9474 1822 ops->container_cgroup, NULL);
3f69fb12 1823 ret = mkdir_p(path2, 0755);
77410c98 1824 if (ret < 0 && (errno != EEXIST))
d7314671 1825 return false;
2f62fb00 1826
a9db9474
CB
1827 ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
1828 hierarchy_mnt, path2,
1829 ops->container_cgroup);
3f69fb12 1830 if (ret < 0)
d7314671 1831 return false;
8aa1044f 1832 }
8aa1044f 1833
d7314671 1834 return true;
ccb4cabe
SH
1835}
1836
11c23867 1837/* Only root needs to escape to the cgroup of its init. */
ff9edd2d
CB
1838__cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1839 struct lxc_conf *conf)
ccb4cabe 1840{
52d08ab0
CB
1841 if (!ops)
1842 return ret_set_errno(false, ENOENT);
1843
1844 if (!ops->hierarchies)
1845 return true;
1846
1847 if (!conf)
1848 return ret_set_errno(false, EINVAL);
1849
1850 if (conf->cgroup_meta.relative || geteuid())
ccb4cabe
SH
1851 return true;
1852
779b3d82 1853 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1854 __do_free char *fullpath = NULL;
52d08ab0 1855 int ret;
11c23867 1856
35ec1a38 1857 fullpath = make_cgroup_path(ops->hierarchies[i],
44585f1a 1858 ops->hierarchies[i]->at_base,
35ec1a38 1859 "cgroup.procs", NULL);
7cea5905 1860 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
52d08ab0 1861 if (ret != 0)
77c3e9a2 1862 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
ccb4cabe
SH
1863 }
1864
6df334d1 1865 return true;
ccb4cabe
SH
1866}
1867
ff9edd2d 1868__cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
36662416 1869{
69b4a4bb
CB
1870 int i = 0;
1871
e3ffb28b
CB
1872 if (!ops)
1873 return ret_set_errno(-1, ENOENT);
1874
69b4a4bb
CB
1875 if (!ops->hierarchies)
1876 return 0;
36662416 1877
69b4a4bb 1878 for (; ops->hierarchies[i]; i++)
36662416
TA
1879 ;
1880
1881 return i;
1882}
1883
ff9edd2d
CB
1884__cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1885 int n, char ***out)
36662416
TA
1886{
1887 int i;
1888
aa48a34f
CB
1889 if (!ops)
1890 return ret_set_errno(false, ENOENT);
1891
69b4a4bb 1892 if (!ops->hierarchies)
77c3e9a2 1893 return ret_set_errno(false, ENOENT);
69b4a4bb 1894
b7b227cc 1895 /* consistency check n */
6b38e644 1896 for (i = 0; i < n; i++)
2202afc9 1897 if (!ops->hierarchies[i])
aa48a34f 1898 return ret_set_errno(false, ENOENT);
36662416 1899
2202afc9 1900 *out = ops->hierarchies[i]->controllers;
36662416
TA
1901
1902 return true;
1903}
1904
b8a4fe12 1905static int cg_legacy_freeze(struct cgroup_ops *ops)
ccb4cabe 1906{
d6337a5f 1907 struct hierarchy *h;
ccb4cabe 1908
ee3a7775
CB
1909 h = get_hierarchy(ops, "freezer");
1910 if (!h)
d2203230 1911 return ret_set_errno(-1, ENOENT);
81468ea7 1912
67ed60ce 1913 return lxc_write_openat(h->path_con, "freezer.state",
c04a6d4e 1914 "FROZEN", STRLITERALLEN("FROZEN"));
ee3a7775 1915}
942e193e 1916
/*
 * Mainloop callback watching a unified-hierarchy "cgroup.events" file.
 * cbdata carries the desired freezer state (1 = frozen, 0 = unfrozen);
 * the callback keeps the mainloop running until a line matching
 * "frozen 1"/"frozen 0" shows up, then closes the loop.
 */
static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
				    struct lxc_async_descr *descr)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int state = PTR_TO_INT(cbdata);
	size_t len;
	const char *state_string;

	/* Reopen the events fd as a stream so we can scan it line by line. */
	f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
	if (!f)
		return LXC_MAINLOOP_ERROR;

	if (state == 1)
		state_string = "frozen 1";
	else
		state_string = "frozen 0";

	/* STRLITERALLEN("frozen") + 2 covers the space and the 0/1 digit. */
	while (getline(&line, &len, f) != -1)
		if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
			return LXC_MAINLOOP_CLOSE;

	/* Desired state not reached yet; rewind and wait for the next event. */
	rewind(f);

	return LXC_MAINLOOP_CONTINUE;
}
1943
/*
 * Write state_string ("0"/"1") to cgroup.freeze on the unified hierarchy.
 * When timeout != 0, additionally watch cgroup.events via the mainloop and
 * wait (up to timeout) until the freezer reports the requested state_num.
 * epoll_error/wait_error are the messages logged on the respective failures.
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
	int ret;
	struct lxc_async_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->path_con)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		/* Set up a watch on cgroup.events before flipping the state. */
		events_file = must_make_path(h->path_con, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cgroup.events signals changes via EPOLLPRI, not EPOLLIN. */
		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI,
						      freezer_cgroup_events_cb,
						      default_cleanup_handler,
						      INT_TO_PTR(state_num),
						      "freezer_cgroup_events_cb");
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	/* Request the state change; the watcher above observes completion. */
	ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
1996
/* Freeze the container on the unified hierarchy (cgroup.freeze = "1"). */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
2003
018051e3 2004__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 2005{
81468ea7 2006 if (!ops->hierarchies)
d2203230 2007 return ret_set_errno(-1, ENOENT);
81468ea7 2008
ee3a7775
CB
2009 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2010 return cg_legacy_freeze(ops);
942e193e 2011
018051e3 2012 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
2013}
2014
018051e3 2015static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 2016{
ee3a7775
CB
2017 struct hierarchy *h;
2018
2019 h = get_hierarchy(ops, "freezer");
2020 if (!h)
d2203230 2021 return ret_set_errno(-1, ENOENT);
ee3a7775 2022
67ed60ce 2023 return lxc_write_openat(h->path_con, "freezer.state",
c04a6d4e 2024 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
2025}
2026
/* Thaw the container on the unified hierarchy (cgroup.freeze = "0"). */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2033
018051e3 2034__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2035{
2036 if (!ops->hierarchies)
d2203230 2037 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2038
2039 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2040 return cg_legacy_unfreeze(ops);
2041
018051e3 2042 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2043}
2044
/*
 * Return the container's cgroup path for @controller relative to the cgroup
 * mountpoint. With limiting == true the limit path (path_lim) is returned,
 * otherwise the payload path (path_con). Returns NULL (and warns) when the
 * controller has no hierarchy or the requested path is unset. The returned
 * pointer aliases the hierarchy's stored path; callers must not free it.
 */
static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
					const char *controller, bool limiting)
{
	struct hierarchy *h;
	size_t len;
	const char *path;

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_warn_errno(NULL, ENOENT,
				      "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));

	if (limiting)
		path = h->path_lim;
	else
		path = h->path_con;
	if (!path)
		return NULL;

	/*
	 * Strip the mount prefix: skip at_mnt and, when at_mnt does not start
	 * with DEFAULT_CGROUP_MOUNTPOINT, additionally skip the default
	 * mountpoint prefix plus any separating slashes.
	 */
	len = strlen(h->at_mnt);
	if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
		       STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
		path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
		path += strspn(path, "/");
	}
	return path += len;
}
2072
/* Return the payload (non-limiting) cgroup path for @controller. */
__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
						  const char *controller)
{
	return cgfsng_get_cgroup_do(ops, controller, false);
}
2078
/* Return the limiting cgroup path for @controller. */
__cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
							const char *controller)
{
	return cgfsng_get_cgroup_do(ops, controller, true);
}
2084
/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
 * which must be freed by the caller.
 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	return make_cgroup_path(h, inpath, filename, NULL);
}
2094
/*
 * Move @pid into a leaf cgroup below the unified cgroup directory
 * @unified_fd. First tries the ".lxc" leaf (creating it if needed), then the
 * cgroup itself; if both fail with EBUSY (non-leaf under the "no internal
 * processes" rule of cgroup2) it probes ".lxc-1", ".lxc-2", ... up to 999.
 * Returns 0 on success, negative errno-style value on failure.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Prefer the ".lxc" leaf, fall back to the cgroup itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily cut the string at the '/' to mkdir the directory part. */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			rm = true;	/* we created it, so clean up on failure */

		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2165
d1783ef4
CB
2166static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2167 int unified_fd, int *sk_fd)
2168{
7d849163
CB
2169 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2170 int target_fds[2];
d1783ef4
CB
2171 ssize_t ret;
2172
2173 /* Create leaf cgroup. */
2174 ret = mkdirat(unified_fd, ".lxc", 0755);
2175 if (ret < 0 && errno != EEXIST)
2176 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2177
7043e2b4 2178 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
7d849163 2179 if (target_fd0 < 0)
d1783ef4 2180 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2181 target_fds[0] = target_fd0;
d1783ef4 2182
7043e2b4 2183 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
7d849163 2184 if (target_fd1 < 0)
49df620b 2185 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2186 target_fds[1] = target_fd1;
49df620b
CB
2187
2188 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
d1783ef4 2189 if (ret <= 0)
49df620b 2190 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
7d849163 2191 target_fd0, target_fd1);
d1783ef4 2192
7d849163 2193 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
d1783ef4
CB
2194}
2195
2196static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2197 int *sk_fd, pid_t pid)
2198{
7d849163 2199 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
d1783ef4
CB
2200 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2201 size_t pidstr_len;
2202 ssize_t ret;
2203
1b82d721 2204 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
d17c815d 2205 if (ret < 0)
d1783ef4
CB
2206 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2207
2208 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2209
7d849163
CB
2210 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2211 if (ret > 0 && ret == pidstr_len)
2212 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2213
49df620b 2214 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
7d849163
CB
2215 if (ret > 0 && ret == pidstr_len)
2216 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
d1783ef4 2217
7d849163
CB
2218 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2219 target_fd0, target_fd1);
d1783ef4
CB
2220}
2221
/*
 * Arguments threaded through userns_exec_minimal() when attaching a process
 * to a unified cgroup across a user namespace.
 */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container config (id mappings) */
	int unified_fd;			/* fd of the unified cgroup directory */
	int sk_pair[2];			/* socketpair: [0] parent end, [1] child end */
	pid_t pid;			/* pid to move into the cgroup */
};
2228
d1783ef4
CB
2229static int cgroup_unified_attach_child_wrapper(void *data)
2230{
2231 struct userns_exec_unified_attach_data *args = data;
2232
2233 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2234 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2235 return ret_errno(EINVAL);
2236
2237 close_prot_errno_disarm(args->sk_pair[0]);
2238 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2239 &args->sk_pair[1]);
2240}
2241
2242static int cgroup_unified_attach_parent_wrapper(void *data)
4b86fefd
CB
2243{
2244 struct userns_exec_unified_attach_data *args = data;
4b86fefd 2245
d1783ef4
CB
2246 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2247 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
4b86fefd
CB
2248 return ret_errno(EINVAL);
2249
d1783ef4
CB
2250 close_prot_errno_disarm(args->sk_pair[1]);
2251 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2252 args->pid);
4b86fefd
CB
2253}
2254
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Fast path: ask the running container's command server to attach us. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	/* Only fall through for "not supported"/"no cgroup2" style failures. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = make_cgroup_path(h, cgroup, NULL);

	/* O_PATH is enough: helpers below operate via *at() on this fd. */
	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	if (!list_empty(&conf->id_map)) {
		/* id-mapped container: do the attach through a minimal userns helper pair */
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2316
/*
 * Attach @pid to the container's cgroup in every mounted hierarchy. Unified
 * hierarchies are handled by __cg_unified_attach(); legacy hierarchies are
 * attached by writing the pid into the monitor-reported cgroup.procs file.
 * Returns true on success, false with errno set on failure.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->fs_type == UNIFIED_HIERARCHY) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		/* Legacy hierarchy: ask the monitor for the cgroup path. */
		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		if (!path) {
			/*
			 * Someone might have created a name=<controller>
			 * controller after the container has started and so
			 * the container doesn't make use of this controller.
			 *
			 * Link: https://github.com/lxc/lxd/issues/8577
			 */
			TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
			continue;
		}

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2370
/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
				   char *value, size_t len, const char *name,
				   const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	/* The controller name is the part of @filename before the first '.'. */
	controller = strdup(filename);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
		ret = lxc_read_from_file(fullpath, value, len);
	}

	return ret;
}
2411
cb3fc90c
CB
2412static int device_cgroup_parse_access(struct device_item *device, const char *val)
2413{
2414 for (int count = 0; count < 3; count++, val++) {
2415 switch (*val) {
2416 case 'r':
2417 device->access[count] = *val;
2418 break;
2419 case 'w':
2420 device->access[count] = *val;
2421 break;
2422 case 'm':
2423 device->access[count] = *val;
2424 break;
2425 case '\n':
2426 case '\0':
2427 count = 3;
2428 break;
2429 default:
2430 return ret_errno(EINVAL);
2431 }
2432 }
2433
2434 return 0;
2435}
2436
/*
 * Parse a legacy devices-cgroup rule such as "c 1:3 rwm" (from
 * devices.allow/devices.deny) into @device. @key decides allow vs deny;
 * @val is either "a" (global rule) or "<type> <major>:<minor> <access>"
 * where major/minor may be '*' for "any". Returns 0 on success, -1 (or
 * -EINVAL from the access parser) on malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strequal("devices.allow", key))
		device->allow = 1; /* allow the device */
	else
		device->allow = 0; /* deny the device */

	if (strequal(val, "a")) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		return 0;
	}

	/* device type: 'a'll, 'b'lock or 'c'haracter */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* exactly one space between type and major */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: '*' means any, otherwise a decimal number */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* the remainder is the access string ("rwm" etc.) */
	return device_cgroup_parse_access(device, ++val);
}
2516
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops || is_empty_string(key) || is_empty_string(value) ||
	    is_empty_string(name) || is_empty_string(lxcpath))
		return ret_errno(EINVAL);

	/* The controller name is the part of @key before the first '.'. */
	controller = strdup(key);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/*
	 * On a pure cgroup2 system device limits are enforced via eBPF,
	 * not files, so route "devices" settings through the bpf command.
	 */
	if (pure_unified_layout(ops) && strequal(controller, "devices")) {
		struct device_item device = {};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2573
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
 * the output.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	path = strdup(devpath);
	if (!path)
		return ret_errno(ENOMEM);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 *
	 * The loop splits the copy in place: it NUL-terminates the path at
	 * the first space, skips further spaces, and leaves @mode pointing
	 * at the access string.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		while (*p == ' ')
			p++;

		mode = p;

		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* stat the device node to discover its type and major:minor */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;

	return 0;
}
2646
2647static int convert_devpath(const char *invalue, char *dest)
2648{
50329f28 2649 struct device_item device = {};
cb3fc90c
CB
2650 int ret;
2651
2652 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2653 if (ret < 0)
2654 return -1;
2655
0bba27c1
CB
2656 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2657 device.minor, device.access);
2658 if (ret < 0)
2659 return log_error_errno(ret, -ret,
2660 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2661 device.type, device.major, device.minor,
2662 device.access);
cb3fc90c
CB
2663
2664 return 0;
72add155
SH
2665}
2666
/* Called from setup_limits - here we have the container's cgroup_data because
 * we created the cgroups.
 */
static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
			      const char *value, bool is_cpuset)
{
	__do_free char *controller = NULL;
	char *p;
	/* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
	char converted_value[50];
	struct hierarchy *h;

	/* The controller name is the part of @filename before the first '.'. */
	controller = strdup(filename);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/* "devices.allow /dev/foo rwx" needs translating to "c x:y rwx". */
	if (strequal("devices.allow", filename) && value[0] == '/') {
		int ret;

		ret = convert_devpath(value, converted_value);
		if (ret < 0)
			return ret;
		value = converted_value;
	}

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);

	/* cpuset values are written to the payload path as well as the limit path. */
	if (is_cpuset) {
		int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
		if (ret)
			return ret;
	}
	return lxc_write_openat(h->path_lim, filename, value, strlen(value));
}
2707
/*
 * Return the list of cgroup_settings sorted according to the following rules
 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
 */
static void sort_cgroup_settings(struct lxc_conf *conf)
{
	LIST_HEAD(memsw_list);
	struct lxc_cgroup *cgroup, *ncgroup;

	/* Iterate over the cgroup settings and copy them to the output list. */
	list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
		if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes"))
			continue;

		/* Move the memsw entry from the cgroup settings list. */
		list_move_tail(&cgroup->head, &memsw_list);
	}

	/*
	 * Append all the memsw entries to the end of the cgroup settings list
	 * to make sure they are applied after all memory limit settings.
	 */
	list_splice_tail(&memsw_list, &conf->cgroup);

}
2733
c581d2a6
CB
/*
 * Apply the legacy (cgroup v1) limits from conf->cgroup. Runs in two passes
 * driven by @do_devices: one pass applies only "devices.*" entries, the other
 * everything else (the equality test below selects the matching entries).
 * Device EACCES/EPERM failures are only warned about so unprivileged setups
 * keep working; any other failure aborts. Returns true on success.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	struct list_head *cgroup_settings;
	struct lxc_cgroup *cgroup;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	/* Legacy limits are meaningless on a pure cgroup2 host. */
	if (pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");

	sort_cgroup_settings(conf);
	list_for_each_entry(cgroup, cgroup_settings, head) {
		/* Select device entries iff this is the device pass. */
		if (do_devices == strnequal("devices", cgroup->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) {
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
				return false;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value);
		}
	}

	INFO("Limits for the legacy cgroup hierarchies have been setup");
	return true;
}
2775
bf651989
CB
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 *
 * Parse a single device rule (@key is e.g. "devices.allow"/"devices.deny",
 * @val the rule text) and add it to conf->bpf_devices for the later bpf
 * program build. A "devices.allow" value that is an absolute path is parsed
 * as a device node path rather than a "type major:minor access" triple.
 * Returns 0 on success, negative value on failure.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
	struct device_item device_item = {};
	int ret;

	if (strequal("devices.allow", key) && abspath(val))
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);

	/*
	 * Note that bpf_list_add_device() returns 1 if it altered the device
	 * list and 0 if it didn't; both return values indicate success.
	 * Only a negative return value indicates an error.
	 */
	ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
	if (ret < 0)
		return -1;

	return 0;
}
2805
c581d2a6
CB
/*
 * Apply the cgroup2 limits from conf->cgroup2 on the unified hierarchy.
 * "devices.*" entries are not real cgroup2 files; they are collected into the
 * bpf device program instead of being written out. Everything else is written
 * to the limiting cgroup (path_lim). Returns true on success; silently
 * succeeds when there is nothing to do or no hierarchies are present.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct list_head *cgroup_settings;
	struct hierarchy *h;
	struct lxc_conf *conf;
	struct lxc_cgroup *cgroup;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (list_empty(cgroup_settings))
		return true;

	/* cgroup2 limits require a pure unified layout. */
	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	list_for_each_entry(cgroup, cgroup_settings, head) {
		int ret;

		if (strnequal("devices", cgroup->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cgroup->subsystem, cgroup->value);
		else
			ret = lxc_write_openat(h->path_lim, cgroup->subsystem, cgroup->value, strlen(cgroup->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);

		TRACE("Set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
2853
59eac805 2854__cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
bf651989 2855{
e552bd1a
CB
2856 struct lxc_conf *conf;
2857 struct hierarchy *unified;
bf651989 2858
e552bd1a
CB
2859 if (!ops)
2860 return ret_set_errno(false, ENOENT);
2861
2862 if (!ops->hierarchies)
2863 return true;
2864
2865 if (!ops->container_cgroup)
2866 return ret_set_errno(false, EEXIST);
2867
2868 if (!handler || !handler->conf)
2869 return ret_set_errno(false, EINVAL);
2870 conf = handler->conf;
2871
2872 unified = ops->unified;
ca72ccb5 2873 if (!unified || !device_utility_controller(unified) ||
93de768e 2874 !unified->path_con || list_empty(&(conf->bpf_devices).devices))
bf651989
CB
2875 return true;
2876
a134099d 2877 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
bf651989
CB
2878}
2879
59eac805 2880static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
6b38e644 2881{
95ab26af
CB
2882 __do_close int dfd_final = -EBADF;
2883 __do_free char *add_controllers = NULL, *copy = NULL;
c581d2a6 2884 size_t full_len = 0;
0954f6ce
CB
2885 struct hierarchy *unified;
2886 int dfd_cur, ret;
95ab26af
CB
2887 char *cur;
2888 char **it;
6b38e644 2889
0954f6ce
CB
2890 if (!ops->hierarchies || !pure_unified_layout(ops))
2891 return true;
2892
2893 unified = ops->unified;
2894 if (!unified->controllers[0])
bf651989
CB
2895 return true;
2896
c581d2a6
CB
2897 /* For now we simply enable all controllers that we have detected by
2898 * creating a string like "+memory +pids +cpu +io".
2899 * TODO: In the near future we might want to support "-<controller>"
2900 * etc. but whether supporting semantics like this make sense will need
2901 * some thinking.
2902 */
2903 for (it = unified->controllers; it && *it; it++) {
2904 full_len += strlen(*it) + 2;
2905 add_controllers = must_realloc(add_controllers, full_len + 1);
2906
2907 if (unified->controllers[0] == *it)
2908 add_controllers[0] = '\0';
2909
2910 (void)strlcat(add_controllers, "+", full_len + 1);
2911 (void)strlcat(add_controllers, *it, full_len + 1);
2912
2913 if ((it + 1) && *(it + 1))
2914 (void)strlcat(add_controllers, " ", full_len + 1);
2915 }
2916
95ab26af
CB
2917 copy = strdup(cgroup);
2918 if (!copy)
f761d24d 2919 return false;
c581d2a6 2920
95ab26af
CB
2921 /*
2922 * Placing the write to cgroup.subtree_control before the open() is
2923 * intentional because of the cgroup2 delegation model. It enforces
2924 * that leaf cgroups don't have any controllers enabled for delegation.
2925 */
0954f6ce 2926 dfd_cur = unified->dfd_base;
95ab26af
CB
2927 lxc_iterate_parts(cur, copy, "/") {
2928 /*
2929 * Even though we vetted the paths when we parsed the config
2930 * we're paranoid here and check that the path is neither
2931 * absolute nor walks upwards.
2932 */
2933 if (abspath(cur))
060aaa39 2934 return syserror_set(-EINVAL, "No absolute paths allowed");
ac01a9b8 2935
95ab26af 2936 if (strnequal(cur, "..", STRLITERALLEN("..")))
060aaa39 2937 return syserror_set(-EINVAL, "No upward walking paths allowed");
ac01a9b8 2938
95ab26af 2939 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
61fbc369 2940 if (ret < 0)
2d7b0895 2941 return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
95ab26af
CB
2942
2943 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
ac01a9b8 2944
95ab26af
CB
2945 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2946 if (dfd_final < 0)
2d7b0895 2947 return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
95ab26af
CB
2948 if (dfd_cur != unified->dfd_base)
2949 close(dfd_cur);
2950 /*
2951 * Leave dfd_final pointing to the last fd we opened so
2952 * it will be automatically zapped if we return early.
2953 */
2954 dfd_cur = dfd_final;
c581d2a6
CB
2955 }
2956
f761d24d 2957 return true;
c581d2a6
CB
2958}
2959
59eac805 2960__cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
c581d2a6 2961{
61fbc369
CB
2962 if (!ops)
2963 return ret_set_errno(false, ENOENT);
2964
c581d2a6
CB
2965 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2966}
2967
59eac805 2968__cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
c581d2a6 2969{
61fbc369
CB
2970 if (!ops)
2971 return ret_set_errno(false, ENOENT);
2972
c581d2a6 2973 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2202afc9
CB
2974}
2975
0da35ac7
CB
/*
 * A /proc/<pid>/cgroup entry for the unified (v2) hierarchy uses hierarchy
 * id 0, so the line starts with '0'.
 */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2980
/*
 * Convert a "0::<path>" line from /proc/<pid>/cgroup into a newly allocated
 * cgroup path relative to the unified mount. @line must still carry the
 * "0::" prefix. Unless @relative is set, the init.scope component is pruned
 * via prune_init_scope(). Returns the allocated string or an ERR_PTR()
 * encoded error: -EINVAL if the path isn't absolute, -ENOMEM on allocation
 * failure.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *current_cgroup;

	line += STRLITERALLEN("0::");

	if (!abspath(line))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		line = prune_init_scope(line);

	/* create a relative path */
	line = deabs(line);

	current_cgroup = strdup(line);
	if (!current_cgroup)
		return ERR_PTR(-ENOMEM);

	return current_cgroup;
}
3003
/* Strip an optional "name=" prefix from a legacy controller specification. */
static inline const char *unprefix(const char *controllers)
{
	const size_t pfx_len = sizeof("name=") - 1;

	if (strncmp(controllers, "name=", pfx_len) == 0)
		return controllers + pfx_len;

	return controllers;
}
3010
/*
 * Determine which cgroup files the kernel says must be delegated, as
 * published in /sys/kernel/cgroup/delegate. If that file can't be read, fall
 * back to a hard-coded standard set (and warn, but still succeed).
 * "cgroup.procs" is skipped when parsing the kernel list because it always
 * needs to be chowned for both cgroup and cgroup2 anyway. On success *delegate
 * receives ownership of the resulting string list; returns 0, negative errno
 * on allocation failure.
 */
static int __list_cgroup_delegate(char ***delegate)
{
	__do_free char **list = NULL;
	__do_free char *buf = NULL;
	char *standard[] = {
		"cgroup.procs",
		"cgroup.threads",
		"cgroup.subtree_control",
		"memory.oom.group",
		NULL,
	};
	char *token;
	int ret;

	buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
	if (!buf) {
		/* Fallback: use the well-known standard delegation files. */
		for (char **p = standard; p && *p; p++) {
			ret = list_add_string(&list, *p);
			if (ret < 0)
				return ret;
		}

		*delegate = move_ptr(list);
		return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
	}

	lxc_iterate_parts(token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2.
		 */
		if (strequal(token, "cgroup.procs"))
			continue;

		ret = list_add_string(&list, token);
		if (ret < 0)
			return ret;
	}

	*delegate = move_ptr(list);
	return 0;
}
3053
0da35ac7 3054static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
0e3af26b 3055{
0da35ac7
CB
3056 __do_free_string_list char **list = NULL;
3057 int ret;
0e3af26b 3058
0da35ac7
CB
3059 ret = __list_cgroup_delegate(&list);
3060 if (ret < 0)
9fc21b2d 3061 return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");
0e3af26b 3062
0da35ac7
CB
3063 for (char *const *s = list; s && *s; s++) {
3064 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3065 continue;
0e3af26b 3066
815c378b 3067 return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
0da35ac7 3068 }
0e3af26b 3069
0da35ac7
CB
3070 *ret_files = move_ptr(list);
3071 return true;
0e3af26b
CB
3072}
3073
0da35ac7 3074static bool legacy_hierarchy_delegated(int dfd_base)
35ec1a38 3075{
98db769c
CB
3076 int ret;
3077
3078 ret = faccessat(dfd_base, ".", W_OK, 0);
3079 if (ret < 0 && errno != ENOENT)
3080 return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");
0da35ac7
CB
3081
3082 return true;
35ec1a38
CB
3083}
3084
91d0151d
CB
/**
 * systemd guarantees that the order of co-mounted controllers is stable, but
 * on some systems (e.g. CentOS 7) /proc/self/cgroup lists them in the
 * opposite order of the mountpoint name under /sys/fs/cgroup:
 *
 *     8:cpuacct,cpu:/        vs.  /sys/fs/cgroup/cpu,cpuacct
 *     3:net_prio,net_cls:/   vs.  /sys/fs/cgroup/net_cls,net_prio
 *
 * Map those two known co-mounts onto the systemd-guaranteed stable order so
 * mountpoint lookups succeed; everything else is returned unchanged apart
 * from stripping an optional "name=" prefix.
 */
__attribute__((returns_nonnull)) __attribute__((nonnull))
static const char *stable_order(const char *controllers)
{
	static const struct {
		const char *from;
		const char *to;
	} swapped[] = {
		{ "cpuacct,cpu",      "cpu,cpuacct"      },
		{ "net_prio,net_cls", "net_cls,net_prio" },
	};

	for (size_t i = 0; i < sizeof(swapped) / sizeof(swapped[0]); i++)
		if (strcmp(controllers, swapped[i].from) == 0)
			return swapped[i].to;

	/* unprefix(): drop a leading "name=" if present. */
	if (strncmp(controllers, "name=", sizeof("name=") - 1) == 0)
		return controllers + sizeof("name=") - 1;

	return controllers;
}
3133
35ec1a38
CB
/*
 * Discover all mounted cgroup hierarchies and register them with @ops.
 *
 * Parses /proc/1/cgroup (for root-spawned, non-relative containers, which
 * escape their current cgroup) or /proc/self/cgroup otherwise. For each line
 * the matching mount under ops->dfd_mnt is opened, the current cgroup within
 * it located, delegation checked, and the hierarchy added via
 * cgroup_hierarchy_add(). Finally the overall layout (legacy / hybrid /
 * unified) is decided and requested-controller availability verified.
 *
 * NOTE(review): the @unprivileged parameter is currently unused in this
 * function body.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
				bool unprivileged)
{
	__do_free char *cgroup_info = NULL;
	char *it;

	/*
	 * Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!cgroup_info)
		return ret_errno(ENOMEM);

	lxc_iterate_parts(it, cgroup_info, "\n") {
		__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
		__do_free char *controllers = NULL, *current_cgroup = NULL;
		__do_free_string_list char **controller_list = NULL,
					   **delegate = NULL;
		char *line;
		int dfd, ret, type;

		/* Handle the unified cgroup hierarchy. */
		line = it;
		if (unified_cgroup(line)) {
			char *unified_mnt;

			type = UNIFIED_HIERARCHY;

			current_cgroup = current_unified_cgroup(relative, line);
			if (IS_ERR(current_cgroup))
				return PTR_ERR(current_cgroup);

			/*
			 * On a pure cgroup2 host ops->dfd_mnt already is the
			 * unified mount; on hybrid layouts it lives in the
			 * "unified" subdirectory.
			 */
			if (unified_cgroup_fd(ops->dfd_mnt)) {
				dfd_mnt = dup_cloexec(ops->dfd_mnt);
				unified_mnt = "";
			} else {
				dfd_mnt = open_at(ops->dfd_mnt,
						  "unified",
						  PROTECT_OPATH_DIRECTORY,
						  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
				unified_mnt = "unified";
			}
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/unified", ops->dfd_mnt);

				SYSTRACE("Unified cgroup not mounted");
				continue;
			}
			dfd = dfd_mnt;

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			if (!unified_hierarchy_delegated(dfd, &delegate))
				continue;

			controller_list = unified_controllers(dfd, "cgroup.controllers");
			if (!controller_list) {
				TRACE("No controllers are enabled for delegation in the unified hierarchy");
				controller_list = list_new();
				if (!controller_list)
					return syserror_set(-ENOMEM, "Failed to create empty controller list");
			}

			controllers = strdup(unified_mnt);
			if (!controllers)
				return ret_errno(ENOMEM);
		} else {
			char *__controllers, *__current_cgroup;

			type = LEGACY_HIERARCHY;

			/* Line format: "<id>:<controllers>:<cgroup-path>". */
			__controllers = strchr(line, ':');
			if (!__controllers)
				return ret_errno(EINVAL);
			__controllers++;

			__current_cgroup = strchr(__controllers, ':');
			if (!__current_cgroup)
				return ret_errno(EINVAL);
			*__current_cgroup = '\0';
			__current_cgroup++;

			controllers = strdup(stable_order(__controllers));
			if (!controllers)
				return ret_errno(ENOMEM);

			dfd_mnt = open_at(ops->dfd_mnt,
					  controllers,
					  PROTECT_OPATH_DIRECTORY,
					  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/%s",
							ops->dfd_mnt, controllers);

				SYSTRACE("%s not mounted", controllers);
				continue;
			}
			dfd = dfd_mnt;

			if (!abspath(__current_cgroup))
				return ret_errno(EINVAL);

			/* remove init.scope */
			if (!relative)
				__current_cgroup = prune_init_scope(__current_cgroup);

			/* create a relative path */
			__current_cgroup = deabs(__current_cgroup);

			current_cgroup = strdup(__current_cgroup);
			if (!current_cgroup)
				return ret_errno(ENOMEM);

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			if (!legacy_hierarchy_delegated(dfd))
				continue;

			/*
			 * We intentionally pass __current_cgroup here and not
			 * controllers because we would otherwise chop the
			 * mountpoint.
			 */
			controller_list = list_add_controllers(__controllers);
			if (!controller_list)
				return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);

			if (skip_hierarchy(ops, controller_list))
				continue;

			ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		}

		ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
					   current_cgroup, controller_list, type);
		if (ret < 0)
			return syserror_ret(ret, "Failed to add %s hierarchy", controllers);

		/* Transfer ownership. */
		move_fd(dfd_mnt);
		move_fd(dfd_base);
		move_ptr(current_cgroup);
		move_ptr(controllers);
		move_ptr(controller_list);
		if (type == UNIFIED_HIERARCHY)
			ops->unified->delegate = move_ptr(delegate);
	}

	/* determine cgroup layout */
	if (ops->unified) {
		if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else {
			if (bpf_devices_cgroup_supported())
				ops->unified->utilities |= DEVICES_CONTROLLER;
			ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
		}
	}

	if (!controllers_available(ops))
		return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");

	return 0;
}
3332
35ec1a38 3333static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9 3334{
d4cff352 3335 __do_close int dfd = -EBADF;
2202afc9 3336 int ret;
0fbf99d6 3337 const char *controllers_use;
d4cff352 3338
e18e9053 3339 if (ops->dfd_mnt >= 0)
a96be3c3 3340 return ret_errno(EBUSY);
d4cff352
CB
3341
3342 /*
3343 * I don't see the need for allowing symlinks here. If users want to
3344 * have their hierarchy available in different locations I strongly
3345 * suggest bind-mounts.
3346 */
3347 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3348 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3349 if (dfd < 0)
2d7b0895 3350 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
2202afc9 3351
0fbf99d6
CB
3352 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3353 if (controllers_use) {
3354 __do_free char *dup = NULL;
3355 char *it;
b7b18fc5 3356
0fbf99d6
CB
3357 dup = strdup(controllers_use);
3358 if (!dup)
7a0c8ed3 3359 return -errno;
b7b18fc5 3360
63ba9eaf
CB
3361 lxc_iterate_parts(it, dup, ",") {
3362 ret = list_add_string(&ops->cgroup_use, it);
3363 if (ret < 0)
3364 return ret;
3365 }
b7b18fc5 3366 }
2202afc9 3367
d4cff352
CB
3368 /*
3369 * Keep dfd referenced by the cleanup function and actually move the fd
3370 * once we know the initialization succeeded. So if we fail we clean up
3371 * the dfd.
3372 */
e18e9053 3373 ops->dfd_mnt = dfd;
2202afc9 3374
0589d744 3375 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
d4cff352 3376 if (ret < 0)
9fc21b2d 3377 return syserror_ret(ret, "Failed to initialize cgroups");
2202afc9 3378
d4cff352
CB
3379 /* Transfer ownership to cgroup_ops. */
3380 move_fd(dfd);
3381 return 0;
2202afc9
CB
3382}
3383
341e6516 3384__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3385{
3386 const char *cgroup_pattern;
3387
341e6516
CB
3388 if (!ops)
3389 return ret_set_errno(-1, ENOENT);
3390
2202afc9
CB
3391 /* copy system-wide cgroup information */
3392 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
63ba9eaf
CB
3393 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3394 ops->cgroup_pattern = strdup(cgroup_pattern);
3395 if (!ops->cgroup_pattern)
3396 return ret_errno(ENOMEM);
3397 }
2202afc9 3398
341e6516 3399 return 0;
2202afc9
CB
3400}
3401
35ec1a38 3402struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
2202afc9 3403{
e3d78fdc 3404 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
2202afc9 3405
c5d0238a 3406 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
2202afc9 3407 if (!cgfsng_ops)
341e6516 3408 return ret_set_errno(NULL, ENOMEM);
2202afc9 3409
e3d78fdc
CB
3410 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3411 cgfsng_ops->dfd_mnt = -EBADF;
2202afc9 3412
35ec1a38 3413 if (initialize_cgroups(cgfsng_ops, conf))
2202afc9 3414 return NULL;
2202afc9 3415
ca76baed
CB
3416 cgfsng_ops->data_init = cgfsng_data_init;
3417 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3418 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3419 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3420 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3421 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3422 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3423 cgfsng_ops->payload_create = cgfsng_payload_create;
3424 cgfsng_ops->payload_enter = cgfsng_payload_enter;
840eec19 3425 cgfsng_ops->finalize = cgfsng_finalize;
ca76baed
CB
3426 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3427 cgfsng_ops->get = cgfsng_get;
3428 cgfsng_ops->set = cgfsng_set;
3429 cgfsng_ops->freeze = cgfsng_freeze;
3430 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3431 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3432 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3433 cgfsng_ops->driver = "cgfsng";
3434 cgfsng_ops->version = "1.0.0";
3435 cgfsng_ops->attach = cgfsng_attach;
3436 cgfsng_ops->chown = cgfsng_chown;
3437 cgfsng_ops->mount = cgfsng_mount;
3438 cgfsng_ops->devices_activate = cgfsng_devices_activate;
a9b642ee 3439 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
2202afc9 3440
ff9edd2d
CB
3441 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3442 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3443 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3444
a64edc1c 3445 return move_ptr(cgfsng_ops);
2202afc9 3446}
be835470 3447
2092492c 3448static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
029d8e88 3449{
029d8e88
CB
3450 int ret;
3451
0589d744 3452 if (!list_empty(&conf->id_map)) {
029d8e88
CB
3453 struct userns_exec_unified_attach_data args = {
3454 .conf = conf,
2092492c 3455 .unified_fd = fd_unified,
029d8e88
CB
3456 .pid = pid,
3457 };
3458
3459 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3460 if (ret < 0)
3461 return -errno;
3462
3463 ret = userns_exec_minimal(conf,
3464 cgroup_unified_attach_parent_wrapper,
3465 &args,
3466 cgroup_unified_attach_child_wrapper,
3467 &args);
3468 } else {
2092492c
CB
3469 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3470 }
3471
3472 return ret;
3473}
3474
/*
 * Attach @pid to every cgroup of the running container @name at @lxcpath,
 * using the cgroup context (one fd per hierarchy) fetched over the command
 * socket. Unified-hierarchy fds go through __unified_attach_fd(); legacy fds
 * get @pid written into their "cgroup.procs". Returns 0 on success,
 * -ENOSYS when the command isn't supported (so the caller can fall back),
 * other negative errno on failure.
 */
static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
				const char *lxcpath, pid_t pid)
{
	call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
	int ret;
	size_t idx;
	ssize_t pidstr_len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
	if (ret < 0)
		return ret_errno(ENOSYS);

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (pidstr_len < 0)
		return pidstr_len;

	for (idx = 0; idx < ctx->fd_len; idx++) {
		int dfd_con = ctx->fd[idx];

		if (unified_cgroup_fd(dfd_con))
			ret = __unified_attach_fd(conf, dfd_con, pid);
		else
			ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
		if (ret)
			return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
		else
			TRACE("Attached to cgroup fd %d", dfd_con);
	}

	/* An empty fd list means we attached to nothing at all. */
	if (idx == 0)
		return syserror_set(-ENOENT, "Failed to attach to cgroups");

	TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
	return 0;
}
3511
/*
 * Fallback attach path: fetch only the cgroup2 fd of container @name over the
 * command socket and attach @pid to it. Returns -ENOSYS when the command
 * isn't supported by the running container, negative errno on other failure.
 */
static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
				   const char *lxcpath, pid_t pid)
{
	__do_close int dfd_unified = -EBADF;

	if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
		return ret_errno(EINVAL);

	dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
	if (dfd_unified < 0)
		return ret_errno(ENOSYS);

	return __unified_attach_fd(conf, dfd_unified, pid);
}
3526
/*
 * Attach @pid to the cgroups of container @name at @lxcpath. Tries the
 * multi-hierarchy command first and, if the running container doesn't
 * support it (ENOSYS/EOPNOTSUPP), falls back to the cgroup2-only command.
 * Returns 0 on success, negative errno on failure (ENOSYS when neither
 * command is supported).
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	int ret;

	ret = __cgroup_attach_many(conf, name, lxcpath, pid);
	if (ret < 0) {
		if (!ERRNO_IS_NOT_SUPPORTED(ret))
			return ret;

		ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
		if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
			return ret_errno(ENOSYS);
	}

	return ret;
}
3544
751a624f 3545/* Connects to command socket therefore isn't callable from command handler. */
abb6f657 3546int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
be835470 3547{
abb6f657
CB
3548 __do_close int dfd = -EBADF;
3549 struct cgroup_fd fd = {
3550 .fd = -EBADF,
3551 };
3552 size_t len_controller;
3553 int ret;
be835470 3554
abb6f657
CB
3555 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3556 is_empty_string(key))
be835470
CB
3557 return ret_errno(EINVAL);
3558
3559 if ((buf && !len) || (len && !buf))
3560 return ret_errno(EINVAL);
3561
abb6f657
CB
3562 len_controller = strcspn(key, ".");
3563 len_controller++; /* Don't forget the \0 byte. */
3564 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3565 return ret_errno(EINVAL);
3566 (void)strlcpy(fd.controller, key, len_controller);
be835470 3567
abb6f657
CB
3568 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3569 if (ret < 0) {
3570 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3571 return ret;
3572
3573 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3574 if (dfd < 0) {
3575 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3576 return ret;
3577
3578 return ret_errno(ENOSYS);
3579 }
3580 fd.type = UNIFIED_HIERARCHY;
3581 fd.fd = move_fd(dfd);
3582 }
3583 dfd = move_fd(fd.fd);
3584
3585 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3586
3587 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3588 return ret_errno(EOPNOTSUPP);
3589 else
3590 ret = lxc_read_try_buf_at(dfd, key, buf, len);
be835470
CB
3591
3592 return ret;
3593}
3594
751a624f 3595/* Connects to command socket therefore isn't callable from command handler. */
abb6f657 3596int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
be835470 3597{
abb6f657
CB
3598 __do_close int dfd = -EBADF;
3599 struct cgroup_fd fd = {
3600 .fd = -EBADF,
3601 };
3602 size_t len_controller;
3603 int ret;
be835470 3604
abb6f657
CB
3605 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3606 is_empty_string(key) || is_empty_string(value))
be835470
CB
3607 return ret_errno(EINVAL);
3608
abb6f657
CB
3609 len_controller = strcspn(key, ".");
3610 len_controller++; /* Don't forget the \0 byte. */
3611 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3612 return ret_errno(EINVAL);
3613 (void)strlcpy(fd.controller, key, len_controller);
3614
3615 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3616 if (ret < 0) {
3617 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3618 return ret;
3619
3620 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3621 if (dfd < 0) {
3622 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3623 return ret;
be835470 3624
abb6f657
CB
3625 return ret_errno(ENOSYS);
3626 }
3627 fd.type = UNIFIED_HIERARCHY;
3628 fd.fd = move_fd(dfd);
3629 }
3630 dfd = move_fd(fd.fd);
3631
3632 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3633
3634 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
be835470
CB
3635 struct device_item device = {};
3636
abb6f657 3637 ret = device_cgroup_rule_parse(&device, key, value);
be835470 3638 if (ret < 0)
abb6f657
CB
3639 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3640 key, value);
be835470
CB
3641
3642 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3643 } else {
abb6f657 3644 ret = lxc_writeat(dfd, key, value, strlen(value));
be835470
CB
3645 }
3646
3647 return ret;
3648}
c8af3332 3649
c9c814f4
CB
3650static int do_cgroup_freeze(int unified_fd,
3651 const char *state_string,
3652 int state_num,
3653 int timeout,
3654 const char *epoll_error,
3655 const char *wait_error)
c8af3332
CB
3656{
3657 __do_close int events_fd = -EBADF;
3298b37d 3658 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
c8af3332 3659 int ret;
3298b37d 3660 struct lxc_async_descr descr = {};
c8af3332
CB
3661
3662 if (timeout != 0) {
3663 ret = lxc_mainloop_open(&descr);
3664 if (ret)
3665 return log_error_errno(-1, errno, "%s", epoll_error);
3666
3667 /* automatically cleaned up now */
3668 descr_ptr = &descr;
3669
3670 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3671 if (events_fd < 0)
3672 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3673
543d2f83
CB
3674 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI,
3675 freezer_cgroup_events_cb,
3676 default_cleanup_handler,
3677 INT_TO_PTR(state_num),
3678 "freezer_cgroup_events_cb");
c8af3332
CB
3679 if (ret < 0)
3680 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3681 }
3682
3683 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3684 if (ret < 0)
3685 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3686
3687 if (timeout != 0) {
3688 ret = lxc_mainloop(&descr, timeout);
3689 if (ret)
3690 return log_error_errno(-1, errno, "%s", wait_error);
3691 }
3692
3693 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3694}
3695
/* Freeze helper: drive do_cgroup_freeze() in the "freeze" direction. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	const char *epoll_msg = "Failed to create epoll instance to wait for container freeze";
	const char *wait_msg = "Failed to wait for container to be frozen";

	return do_cgroup_freeze(unified_fd, "1", 1, timeout, epoll_msg, wait_msg);
}
3702
5ef7547f 3703int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
c8af3332
CB
3704{
3705 __do_close int unified_fd = -EBADF;
3706 int ret;
3707
b57f9b13
CB
3708 if (is_empty_string(name) || is_empty_string(lxcpath))
3709 return ret_errno(EINVAL);
3710
a9b642ee 3711 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
c8af3332
CB
3712 if (unified_fd < 0)
3713 return ret_errno(ENOCGROUP2);
3714
3715 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
c9c814f4 3716 ret = __cgroup_freeze(unified_fd, timeout);
c8af3332 3717 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
5ef7547f 3718 return ret;
c8af3332
CB
3719}
3720
/*
 * Unfreeze helper: drive do_cgroup_freeze() in the "unfreeze" direction.
 * Fixes copy-pasted log messages that previously reported "freeze"/"frozen"
 * on the unfreeze path.
 */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3727
5ef7547f 3728int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
c8af3332
CB
3729{
3730 __do_close int unified_fd = -EBADF;
3731 int ret;
3732
b57f9b13
CB
3733 if (is_empty_string(name) || is_empty_string(lxcpath))
3734 return ret_errno(EINVAL);
3735
a9b642ee 3736 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
c8af3332
CB
3737 if (unified_fd < 0)
3738 return ret_errno(ENOCGROUP2);
3739
3740 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
c9c814f4 3741 ret = __cgroup_unfreeze(unified_fd, timeout);
c8af3332 3742 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
5ef7547f 3743 return ret;
c8af3332 3744}