1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "log.h"
44 #include "macro.h"
45 #include "mainloop.h"
46 #include "memory_utils.h"
47 #include "mount_utils.h"
48 #include "storage/storage.h"
49 #include "string_utils.h"
50 #include "syscall_wrappers.h"
51 #include "utils.h"
52
53 #ifndef HAVE_STRLCPY
54 #include "include/strlcpy.h"
55 #endif
56
57 #ifndef HAVE_STRLCAT
58 #include "include/strlcat.h"
59 #endif
60
61 lxc_log_define(cgfsng, cgroup);
62
     63 /*
     64  * Given a pointer to a null-terminated array of pointers, realloc to add one
     65  * entry, and point the new entry to NULL. Return the index of the
     66  * second-to-last entry - that is, the one which is now available for use
     67  * (keeping the list null-terminated) - or a negative errno value on failure.
     68  */
69 static int list_add(void ***list)
70 {
71 int idx = 0;
72 void **p;
73
74 if (*list)
75 for (; (*list)[idx]; idx++)
76 ;
77
78 p = realloc(*list, (idx + 2) * sizeof(void **));
79 if (!p)
80 return ret_errno(ENOMEM);
81
82 p[idx + 1] = NULL;
83 *list = p;
84
85 return idx;
86 }
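
/*
 * A minimal usage sketch (hypothetical caller; list_add_string() below is
 * the real in-tree user):
 *
 *	idx = list_add((void ***)&list);	// [e0, NULL] -> [e0, ?, NULL]
 *	if (idx < 0)
 *		return idx;			// -ENOMEM
 *	list[idx] = new_entry;			// terminator stays in place
 */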
87
88 /* Given a null-terminated array of strings, check whether @entry is one of the
89 * strings.
90 */
91 static bool string_in_list(char **list, const char *entry)
92 {
93 if (!list)
94 return false;
95
96 for (int i = 0; list[i]; i++)
97 if (strequal(list[i], entry))
98 return true;
99
100 return false;
101 }
102
103 /* Given a handler's cgroup data, return the struct hierarchy for the controller
104 * @c, or NULL if there is none.
105 */
106 static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
107 {
108 if (!ops->hierarchies)
109 return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");
110
111 for (int i = 0; ops->hierarchies[i]; i++) {
112 if (!controller) {
113 /* This is the empty unified hierarchy. */
114 if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
115 return ops->hierarchies[i];
116
117 continue;
118 }
119
120 /*
121 * Handle controllers with significant implementation changes
122 * from cgroup to cgroup2.
123 */
124 if (pure_unified_layout(ops)) {
125 if (strequal(controller, "devices")) {
126 if (device_utility_controller(ops->unified))
127 return ops->unified;
128
129 break;
130 } else if (strequal(controller, "freezer")) {
131 if (freezer_utility_controller(ops->unified))
132 return ops->unified;
133
134 break;
135 }
136 }
137
138 if (string_in_list(ops->hierarchies[i]->controllers, controller))
139 return ops->hierarchies[i];
140 }
141
142 if (controller)
143 WARN("There is no useable %s controller", controller);
144 else
145 WARN("There is no empty unified cgroup hierarchy");
146
147 return ret_set_errno(NULL, ENOENT);
148 }
149
    150 /* Taken over and modified from the kernel sources. */
151 #define NBITS 32 /* bits in uint32_t */
152 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
153 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
154
155 static void set_bit(unsigned bit, uint32_t *bitarr)
156 {
157 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
158 }
159
160 static void clear_bit(unsigned bit, uint32_t *bitarr)
161 {
162 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
163 }
164
165 static bool is_set(unsigned bit, uint32_t *bitarr)
166 {
167 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
168 }
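
/*
 * Worked example (illustrative): bit 35 lives in word 35 / 32 == 1 at
 * offset 35 % 32 == 3, so set_bit(35, bitarr) performs
 * bitarr[1] |= (1 << 3) and is_set(35, bitarr) tests that same bit.
 */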
169
170 /* Create cpumask from cpulist aka turn:
171 *
172 * 0,2-3
173 *
174 * into bit array
175 *
176 * 1 0 1 1
177 */
178 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
179 {
180 __do_free uint32_t *bitarr = NULL;
181 char *token;
182 size_t arrlen;
183
184 arrlen = BITS_TO_LONGS(nbits);
185 bitarr = calloc(arrlen, sizeof(uint32_t));
186 if (!bitarr)
187 return ret_set_errno(NULL, ENOMEM);
188
189 lxc_iterate_parts(token, buf, ",") {
190 errno = 0;
191 unsigned end, start;
192 char *range;
193
194 start = strtoul(token, NULL, 0);
195 end = start;
196 range = strchr(token, '-');
197 if (range)
198 end = strtoul(range + 1, NULL, 0);
199
200 if (!(start <= end))
201 return ret_set_errno(NULL, EINVAL);
202
203 if (end >= nbits)
204 return ret_set_errno(NULL, EINVAL);
205
206 while (start <= end)
207 set_bit(start++, bitarr);
208 }
209
210 return move_ptr(bitarr);
211 }
212
213 /* Turn cpumask into simple, comma-separated cpulist. */
214 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
215 {
216 __do_free_string_list char **cpulist = NULL;
217 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
218 int ret;
219
220 for (size_t i = 0; i <= nbits; i++) {
221 if (!is_set(i, bitarr))
222 continue;
223
224 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
225 if (ret < 0)
226 return NULL;
227
228 ret = lxc_append_string(&cpulist, numstr);
229 if (ret < 0)
230 return ret_set_errno(NULL, ENOMEM);
231 }
232
233 if (!cpulist)
234 return ret_set_errno(NULL, ENOMEM);
235
236 return lxc_string_join(",", (const char **)cpulist, false);
237 }
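
#if 0	/* illustration only, never compiled */
/*
 * Hedged sketch of a round trip through the two helpers above on a
 * hypothetical 4-cpu system: "0,2-3" -> mask 1011 -> "0,2,3".
 */
static void example_cpumask_roundtrip(void)
{
	char buf[] = "0,2-3"; /* lxc_iterate_parts() tokenizes in place */
	__do_free uint32_t *mask = lxc_cpumask(buf, 4);
	__do_free char *list = mask ? lxc_cpumask_to_cpulist(mask, 4) : NULL;

	if (list)
		TRACE("Expanded cpulist: %s", list); /* prints "0,2,3" */
}
#endif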
238
239 static ssize_t get_max_cpus(char *cpulist)
240 {
241 char *c1, *c2;
242 char *maxcpus = cpulist;
243 size_t cpus = 0;
244
245 c1 = strrchr(maxcpus, ',');
246 if (c1)
247 c1++;
248
249 c2 = strrchr(maxcpus, '-');
250 if (c2)
251 c2++;
252
253 if (!c1 && !c2)
254 c1 = maxcpus;
255 else if (c1 > c2)
256 c2 = c1;
257 else if (c1 < c2)
258 c1 = c2;
259 else if (!c1 && c2)
260 c1 = c2;
261
262 errno = 0;
263 cpus = strtoul(c1, NULL, 0);
264 if (errno != 0)
265 return -1;
266
267 return cpus;
268 }
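
/*
 * Example (illustrative): for "0,2-3" the last ',' yields "2-3" and the
 * last '-' yields "3"; the pointer further into the string wins, so
 * get_max_cpus() returns 3, i.e. the highest cpu index in the list.
 */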
269
270 static inline bool is_unified_hierarchy(const struct hierarchy *h)
271 {
272 return h->fs_type == UNIFIED_HIERARCHY;
273 }
274
275 /* Return true if the controller @entry is found in the null-terminated list of
276 * hierarchies @hlist.
277 */
278 static bool controller_available(struct hierarchy **hlist, char *entry)
279 {
280 if (!hlist)
281 return false;
282
283 for (int i = 0; hlist[i]; i++)
284 if (string_in_list(hlist[i]->controllers, entry))
285 return true;
286
287 return false;
288 }
289
290 static bool controllers_available(struct cgroup_ops *ops)
291 {
292 struct hierarchy **hlist;
293
294 if (!ops->cgroup_use)
295 return true;
296
297 hlist = ops->hierarchies;
298 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
299 if (!controller_available(hlist, *cur))
    300 			return log_error(false, "The %s controller was not found", *cur);
301
302 return true;
303 }
304
305 static char **list_new(void)
306 {
307 __do_free_string_list char **list = NULL;
308 int idx;
309
310 idx = list_add((void ***)&list);
311 if (idx < 0)
312 return NULL;
313
314 list[idx] = NULL;
315 return move_ptr(list);
316 }
317
318 static int list_add_string(char ***list, char *entry)
319 {
320 __do_free char *dup = NULL;
321 int idx;
322
323 dup = strdup(entry);
324 if (!dup)
325 return ret_errno(ENOMEM);
326
327 idx = list_add((void ***)list);
328 if (idx < 0)
329 return idx;
330
331 (*list)[idx] = move_ptr(dup);
332 return 0;
333 }
334
335 static char **list_add_controllers(char *controllers)
336 {
337 __do_free_string_list char **list = NULL;
338 char *it;
339
340 lxc_iterate_parts(it, controllers, " \t\n") {
341 int ret;
342
343 ret = list_add_string(&list, it);
344 if (ret < 0)
345 return NULL;
346 }
347
348 return move_ptr(list);
349 }
350
351 static char **unified_controllers(int dfd, const char *file)
352 {
353 __do_free char *buf = NULL;
354
355 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
356 if (!buf)
357 return NULL;
358
359 return list_add_controllers(buf);
360 }
361
362 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
363 {
364 if (!ops->cgroup_use)
365 return false;
366
367 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
368 bool found = false;
369
370 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
371 if (!strequal(*cur_use, *cur_ctrl))
372 continue;
373
374 found = true;
375 break;
376 }
377
378 if (found)
379 continue;
380
381 return true;
382 }
383
384 return false;
385 }
386
387 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
388 int dfd_base, char *base_cgroup,
389 char **controllers, cgroupfs_type_magic_t fs_type)
390 {
391 __do_free struct hierarchy *new = NULL;
392 int idx;
393
394 if (abspath(base_cgroup))
395 return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");
396
397 new = zalloc(sizeof(*new));
398 if (!new)
399 return ret_errno(ENOMEM);
400
401 new->dfd_con = -EBADF;
402 new->dfd_lim = -EBADF;
403 new->dfd_mon = -EBADF;
404
405 new->fs_type = fs_type;
406 new->controllers = controllers;
407 new->at_mnt = mnt;
408 new->at_base = base_cgroup;
409
410 new->dfd_mnt = dfd_mnt;
411 new->dfd_base = dfd_base;
412
413 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
414 mnt, maybe_empty(base_cgroup));
415 for (char *const *it = new->controllers; it && *it; it++)
416 TRACE("The hierarchy contains the %s controller", *it);
417
418 idx = list_add((void ***)&ops->hierarchies);
419 if (idx < 0)
    420 		return idx;
421
422 if (fs_type == UNIFIED_HIERARCHY)
423 ops->unified = new;
424 (ops->hierarchies)[idx] = move_ptr(new);
425
426 return 0;
427 }
428
429 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
430 {
431 if (!path_prune || !hierarchies)
432 return 0;
433
434 for (int i = 0; hierarchies[i]; i++) {
435 struct hierarchy *h = hierarchies[i];
436 int ret;
437
438 ret = cgroup_tree_prune(h->dfd_base, path_prune);
439 if (ret < 0)
440 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
441 else
442 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
443
444 free_equal(h->path_lim, h->path_con);
445 }
446
447 return 0;
448 }
449
450 struct generic_userns_exec_data {
451 struct hierarchy **hierarchies;
452 const char *path_prune;
453 struct lxc_conf *conf;
454 uid_t origuid; /* target uid in parent namespace */
455 char *path;
456 };
457
458 static int cgroup_tree_remove_wrapper(void *data)
459 {
460 struct generic_userns_exec_data *arg = data;
461 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
462 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
463 int ret;
464
465 if (!lxc_drop_groups() && errno != EPERM)
466 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
467
468 ret = setresgid(nsgid, nsgid, nsgid);
469 if (ret < 0)
470 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
471 (int)nsgid, (int)nsgid, (int)nsgid);
472
473 ret = setresuid(nsuid, nsuid, nsuid);
474 if (ret < 0)
475 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
476 (int)nsuid, (int)nsuid, (int)nsuid);
477
478 return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
479 }
480
481 __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
482 struct lxc_handler *handler)
483 {
484 int ret;
485
486 if (!ops) {
487 ERROR("Called with uninitialized cgroup operations");
488 return;
489 }
490
491 if (!ops->hierarchies)
492 return;
493
494 if (!handler) {
495 ERROR("Called with uninitialized handler");
496 return;
497 }
498
499 if (!handler->conf) {
500 ERROR("Called with uninitialized conf");
501 return;
502 }
503
504 if (!ops->container_limit_cgroup) {
505 WARN("Uninitialized limit cgroup");
506 return;
507 }
508
509 ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
510 if (ret < 0)
511 WARN("Failed to detach bpf program from cgroup");
512
513 if (!lxc_list_empty(&handler->conf->id_map)) {
514 struct generic_userns_exec_data wrap = {
515 .conf = handler->conf,
516 .path_prune = ops->container_limit_cgroup,
517 .hierarchies = ops->hierarchies,
518 .origuid = 0,
519 };
520 ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
521 &wrap, "cgroup_tree_remove_wrapper");
522 } else {
523 ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
524 }
525 if (ret < 0)
526 SYSWARN("Failed to destroy cgroups");
527 }
528
529 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
530 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
531 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
532 bool am_initialized)
533 {
    534 	__do_free char *cpulist = NULL, *isolcpus = NULL,
535 *offlinecpus = NULL, *posscpus = NULL;
536 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
537 *possmask = NULL;
538 int ret;
539 ssize_t i;
540 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
541 bool flipped_bit = false;
542
543 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
544 if (!posscpus)
545 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
546
547 /* Get maximum number of cpus found in possible cpuset. */
548 maxposs = get_max_cpus(posscpus);
549 if (maxposs < 0 || maxposs >= INT_MAX - 1)
550 return false;
551
552 if (file_exists(__ISOL_CPUS)) {
553 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
554 if (!isolcpus)
555 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
556
557 if (isdigit(isolcpus[0])) {
558 /* Get maximum number of cpus found in isolated cpuset. */
559 maxisol = get_max_cpus(isolcpus);
560 if (maxisol < 0 || maxisol >= INT_MAX - 1)
561 return false;
562 }
563
564 if (maxposs < maxisol)
565 maxposs = maxisol;
566 maxposs++;
567 } else {
568 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
569 }
570
571 if (file_exists(__OFFLINE_CPUS)) {
572 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
573 if (!offlinecpus)
574 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
575
576 if (isdigit(offlinecpus[0])) {
577 /* Get maximum number of cpus found in offline cpuset. */
578 maxoffline = get_max_cpus(offlinecpus);
579 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
580 return false;
581 }
582
583 if (maxposs < maxoffline)
584 maxposs = maxoffline;
585 maxposs++;
586 } else {
587 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
588 }
589
590 if ((maxisol == 0) && (maxoffline == 0)) {
591 cpulist = move_ptr(posscpus);
592 goto copy_parent;
593 }
594
595 possmask = lxc_cpumask(posscpus, maxposs);
596 if (!possmask)
597 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
598
599 if (maxisol > 0) {
600 isolmask = lxc_cpumask(isolcpus, maxposs);
601 if (!isolmask)
602 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
603 }
604
605 if (maxoffline > 0) {
606 offlinemask = lxc_cpumask(offlinecpus, maxposs);
607 if (!offlinemask)
608 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
609 }
610
611 for (i = 0; i <= maxposs; i++) {
612 if ((isolmask && !is_set(i, isolmask)) ||
613 (offlinemask && !is_set(i, offlinemask)) ||
614 !is_set(i, possmask))
615 continue;
616
617 flipped_bit = true;
618 clear_bit(i, possmask);
619 }
620
    621 	if (!flipped_bit) {
    622 		cpulist = move_ptr(posscpus);
    623 		TRACE("No isolated or offline cpus present in cpuset");
    624 	} else {
    625 		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
    626 		TRACE("Removed isolated or offline cpus from cpuset");
    627 	}
628 if (!cpulist)
629 return log_error_errno(false, errno, "Failed to create cpu list");
630
631 copy_parent:
632 if (!am_initialized) {
633 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
634 if (ret < 0)
635 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
636
637 TRACE("Copied cpu settings of parent cgroup");
638 }
639
640 return true;
641 }
642
643 static bool cpuset1_initialize(int dfd_base, int dfd_next)
644 {
645 char mems[PATH_MAX];
646 ssize_t bytes;
647 char v;
648
649 /*
650 * Determine whether the base cgroup has cpuset
651 * inheritance turned on.
652 */
653 bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
654 if (bytes < 0)
655 return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
656
657 /*
    658 	 * Initialize cpuset.cpus and remove any isolated
659 * and offline cpus.
660 */
661 if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
662 return syserrno(false, "Failed to initialize cpuset.cpus");
663
664 /* Read cpuset.mems from parent... */
665 bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
666 if (bytes < 0)
667 return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);
668
669 /* ... and copy to first cgroup in the tree... */
670 bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
671 if (bytes < 0)
672 return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);
673
674 /* ... and finally turn on cpuset inheritance. */
675 bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
676 if (bytes < 0)
677 return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
678
679 return log_trace(true, "Initialized cpuset in the legacy hierarchy");
680 }
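
/*
 * Rough shell equivalent of the above, for illustration only (the paths
 * stand in for the parent and child legacy cpuset cgroup directories):
 *
 *	cat parent/cpuset.cpus > child/cpuset.cpus	# minus isolated/offline
 *	cat parent/cpuset.mems > child/cpuset.mems
 *	echo 1 > child/cgroup.clone_children
 */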
681
682 static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
683 bool cpuset_v1, bool eexist_ignore)
684 {
685 __do_close int dfd_final = -EBADF;
686 int dfd_cur = dfd_base;
687 int ret = 0;
688 size_t len;
689 char *cur;
690 char buf[PATH_MAX];
691
692 if (is_empty_string(path))
693 return ret_errno(EINVAL);
694
695 len = strlcpy(buf, path, sizeof(buf));
696 if (len >= sizeof(buf))
697 return ret_errno(E2BIG);
698
699 lxc_iterate_parts(cur, buf, "/") {
700 /*
701 * Even though we vetted the paths when we parsed the config
702 * we're paranoid here and check that the path is neither
703 * absolute nor walks upwards.
704 */
705 if (abspath(cur))
706 return syserrno_set(-EINVAL, "No absolute paths allowed");
707
708 if (strnequal(cur, "..", STRLITERALLEN("..")))
709 return syserrno_set(-EINVAL, "No upward walking paths allowed");
710
711 ret = mkdirat(dfd_cur, cur, mode);
712 if (ret < 0) {
713 if (errno != EEXIST)
714 return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);
715
716 ret = -EEXIST;
717 }
718 TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);
719
720 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
721 if (dfd_final < 0)
722 return syserrno(-errno, "Fail to open%s directory %d(%s)",
723 !ret ? " newly created" : "", dfd_base, cur);
724 if (dfd_cur != dfd_base)
725 close(dfd_cur);
726 else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
727 return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
728 /*
729 * Leave dfd_final pointing to the last fd we opened so
730 * it will be automatically zapped if we return early.
731 */
732 dfd_cur = dfd_final;
733 }
734
    735 	/* The final cgroup must be successfully created by us. */
736 if (ret) {
737 if (ret != -EEXIST || !eexist_ignore)
738 return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
739 }
740
741 return move_fd(dfd_final);
742 }
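
#if 0	/* illustration only, never compiled */
/*
 * Hedged sketch: create the nested cgroup "a/b/c" beneath h->dfd_base and
 * receive an O_PATH fd to the final component. Intermediate directories
 * may pre-exist; the leaf must be created by this call (last argument
 * false) or the call fails with -EEXIST.
 */
__do_close int dfd_leaf = __cgroup_tree_create(h->dfd_base, "a/b/c", 0755,
					       false, false);
if (dfd_leaf < 0)
	SYSERROR("Failed to create cgroup subtree");
#endif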
743
744 static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
745 struct hierarchy *h, const char *cgroup_limit_dir,
746 const char *cgroup_leaf, bool payload)
747 {
748 __do_close int fd_limit = -EBADF, fd_final = -EBADF;
749 __do_free char *path = NULL, *limit_path = NULL;
750 bool cpuset_v1 = false;
751
752 /*
753 * The legacy cpuset controller needs massaging in case inheriting
754 * settings from its immediate ancestor cgroup hasn't been turned on.
755 */
756 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
757
758 if (payload && cgroup_leaf) {
759 /* With isolation both parts need to not already exist. */
760 fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
761 if (fd_limit < 0)
762 return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
763
764 TRACE("Created limit cgroup %d->%d(%s)",
765 fd_limit, h->dfd_base, cgroup_limit_dir);
766
767 /*
768 * With isolation the devices legacy cgroup needs to be
    769 		 * initialized early, as it typically contains an 'a' (all)
770 * line, which is not possible once a subdirectory has been
771 * created.
772 */
773 if (string_in_list(h->controllers, "devices") &&
774 !ops->setup_limits_legacy(ops, conf, true))
775 return log_error(false, "Failed to setup legacy device limits");
776
777 limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
778 path = must_make_path(limit_path, cgroup_leaf, NULL);
779
780 /*
781 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
782 * cgroup the container actually resides in, is below fd_limit.
783 */
784 fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
785 if (fd_final < 0) {
786 /* Ensure we don't leave any garbage behind. */
787 if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
788 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
789 else
790 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
791 }
792 } else {
793 path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
794
795 fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
796 }
797 if (fd_final < 0)
798 return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
799
800 if (payload) {
801 h->dfd_con = move_fd(fd_final);
802 h->path_con = move_ptr(path);
803
804 if (fd_limit < 0)
805 h->dfd_lim = h->dfd_con;
806 else
807 h->dfd_lim = move_fd(fd_limit);
808
809 if (limit_path)
810 h->path_lim = move_ptr(limit_path);
811 else
812 h->path_lim = h->path_con;
813 } else {
814 h->dfd_mon = move_fd(fd_final);
815 }
816
817 return true;
818 }
819
820 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
821 bool payload)
822 {
823 bool prune = true;
824
825 if (payload) {
826 /* Check whether we actually created the cgroup to prune. */
827 if (h->dfd_lim < 0)
828 prune = false;
829
830 free_equal(h->path_con, h->path_lim);
831 close_equal(h->dfd_con, h->dfd_lim);
832 } else {
833 /* Check whether we actually created the cgroup to prune. */
834 if (h->dfd_mon < 0)
835 prune = false;
836
837 close_prot_errno_disarm(h->dfd_mon);
838 }
839
840 /* We didn't create this cgroup. */
841 if (!prune)
842 return;
843
844 if (cgroup_tree_prune(h->dfd_base, path_prune))
845 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
846 else
847 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
848 }
849
850 __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
851 struct lxc_handler *handler)
852 {
853 int len;
854 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
855 const struct lxc_conf *conf;
856
857 if (!ops) {
858 ERROR("Called with uninitialized cgroup operations");
859 return;
860 }
861
862 if (!ops->hierarchies)
863 return;
864
865 if (!handler) {
866 ERROR("Called with uninitialized handler");
867 return;
868 }
869
870 if (!handler->conf) {
871 ERROR("Called with uninitialized conf");
872 return;
873 }
874 conf = handler->conf;
875
876 if (!ops->monitor_cgroup) {
877 WARN("Uninitialized monitor cgroup");
878 return;
879 }
880
881 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
882 if (len < 0)
883 return;
884
885 for (int i = 0; ops->hierarchies[i]; i++) {
886 __do_close int fd_pivot = -EBADF;
887 __do_free char *pivot_path = NULL;
888 struct hierarchy *h = ops->hierarchies[i];
889 bool cpuset_v1 = false;
890 int ret;
891
892 /* Monitor might have died before we entered the cgroup. */
893 if (handler->monitor_pid <= 0) {
894 WARN("No valid monitor process found while destroying cgroups");
895 goto cgroup_prune_tree;
896 }
897
898 if (conf->cgroup_meta.monitor_pivot_dir)
899 pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
900 else if (conf->cgroup_meta.dir)
901 pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
902 else
903 pivot_path = must_make_path(CGROUP_PIVOT, NULL);
904
905 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
906
907 fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
908 if (fd_pivot < 0) {
909 SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
910 continue;
911 }
912
913 ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
914 if (ret != 0) {
915 SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
916 continue;
917 }
918
919 cgroup_prune_tree:
920 ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
921 if (ret < 0)
922 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
923 else
924 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
925 }
926 }
927
928 /*
    929  * Check that lxc.cgroup.dir is not set together with the newer
    930  * lxc.cgroup.dir.{monitor,payload} options, and that when those are
    931  * used both the monitor and the payload directory are set.
    932  * Returns true if the configuration is consistent, false otherwise.
933 */
934 static bool check_cgroup_dir_config(struct lxc_conf *conf)
935 {
936 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
937 *container_dir = conf->cgroup_meta.container_dir,
938 *namespace_dir = conf->cgroup_meta.namespace_dir;
939
940 /* none of the new options are set, all is fine */
941 if (!monitor_dir && !container_dir && !namespace_dir)
942 return true;
943
    944 	/* some are set, make sure lxc.cgroup.dir is not also set */
945 if (conf->cgroup_meta.dir)
946 return log_error_errno(false, EINVAL,
947 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
948
949 /* make sure both monitor and payload are set */
950 if (!monitor_dir || !container_dir)
951 return log_error_errno(false, EINVAL,
952 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
953
954 /* namespace_dir may be empty */
955 return true;
956 }
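
/*
 * Illustrative config (hypothetical values, key names as used in the
 * error messages above) that passes this check:
 *
 *	lxc.cgroup.dir.monitor = lxc.monitor/c1
 *	lxc.cgroup.dir.payload = lxc.payload/c1
 *
 * Adding lxc.cgroup.dir alongside either of them fails with EINVAL.
 */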
957
958 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
959 {
960 __do_free char *monitor_cgroup = NULL;
961 int idx = 0;
962 int i;
963 size_t len;
964 char *suffix = NULL;
965 struct lxc_conf *conf;
966
967 if (!ops)
968 return ret_set_errno(false, ENOENT);
969
970 if (!ops->hierarchies)
971 return true;
972
973 if (ops->monitor_cgroup)
974 return ret_set_errno(false, EEXIST);
975
976 if (!handler || !handler->conf)
977 return ret_set_errno(false, EINVAL);
978
979 conf = handler->conf;
980
981 if (!check_cgroup_dir_config(conf))
982 return false;
983
984 if (conf->cgroup_meta.monitor_dir) {
985 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
986 } else if (conf->cgroup_meta.dir) {
987 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
988 DEFAULT_MONITOR_CGROUP_PREFIX,
989 handler->name,
990 CGROUP_CREATE_RETRY, NULL);
991 } else if (ops->cgroup_pattern) {
992 __do_free char *cgroup_tree = NULL;
993
994 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
995 if (!cgroup_tree)
996 return ret_set_errno(false, ENOMEM);
997
998 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
999 DEFAULT_MONITOR_CGROUP,
1000 CGROUP_CREATE_RETRY, NULL);
1001 } else {
1002 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1003 handler->name,
1004 CGROUP_CREATE_RETRY, NULL);
1005 }
1006 if (!monitor_cgroup)
1007 return ret_set_errno(false, ENOMEM);
1008
1009 if (!conf->cgroup_meta.monitor_dir) {
1010 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1011 *suffix = '\0';
1012 }
1013 do {
1014 if (idx && suffix)
1015 sprintf(suffix, "-%d", idx);
1016
1017 for (i = 0; ops->hierarchies[i]; i++) {
1018 if (cgroup_tree_create(ops, handler->conf,
1019 ops->hierarchies[i],
1020 monitor_cgroup, NULL, false))
1021 continue;
1022
1023 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1024 for (int j = 0; j <= i; j++)
1025 cgroup_tree_prune_leaf(ops->hierarchies[j],
1026 monitor_cgroup, false);
1027
1028 idx++;
1029 break;
1030 }
1031 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1032
1033 if (idx == 1000 || (!suffix && idx != 0))
1034 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1035
1036 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1037 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1038 }
1039
1040 /*
1041 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1042 * next cgroup_pattern-1, -2, ..., -999.
1043 */
1044 __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
1045 {
1046 __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
1047 char *limit_cgroup;
1048 int idx = 0;
1049 int i;
1050 size_t len;
1051 char *suffix = NULL;
1052 struct lxc_conf *conf;
1053
1054 if (!ops)
1055 return ret_set_errno(false, ENOENT);
1056
1057 if (!ops->hierarchies)
1058 return true;
1059
1060 if (ops->container_cgroup || ops->container_limit_cgroup)
1061 return ret_set_errno(false, EEXIST);
1062
1063 if (!handler || !handler->conf)
1064 return ret_set_errno(false, EINVAL);
1065
1066 conf = handler->conf;
1067
1068 if (!check_cgroup_dir_config(conf))
1069 return false;
1070
1071 if (conf->cgroup_meta.container_dir) {
1072 __limit_cgroup = strdup(conf->cgroup_meta.container_dir);
1073 if (!__limit_cgroup)
1074 return ret_set_errno(false, ENOMEM);
1075
1076 if (conf->cgroup_meta.namespace_dir) {
1077 container_cgroup = must_make_path(__limit_cgroup,
1078 conf->cgroup_meta.namespace_dir,
1079 NULL);
1080 limit_cgroup = __limit_cgroup;
1081 } else {
1082 /* explicit paths but without isolation */
1083 limit_cgroup = move_ptr(__limit_cgroup);
1084 container_cgroup = limit_cgroup;
1085 }
1086 } else if (conf->cgroup_meta.dir) {
1087 limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1088 DEFAULT_PAYLOAD_CGROUP_PREFIX,
1089 handler->name,
1090 CGROUP_CREATE_RETRY, NULL);
1091 container_cgroup = limit_cgroup;
1092 } else if (ops->cgroup_pattern) {
1093 __do_free char *cgroup_tree = NULL;
1094
1095 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1096 if (!cgroup_tree)
1097 return ret_set_errno(false, ENOMEM);
1098
1099 limit_cgroup = must_concat(&len, cgroup_tree, "/",
1100 DEFAULT_PAYLOAD_CGROUP,
1101 CGROUP_CREATE_RETRY, NULL);
1102 container_cgroup = limit_cgroup;
1103 } else {
1104 limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
1105 handler->name,
1106 CGROUP_CREATE_RETRY, NULL);
1107 container_cgroup = limit_cgroup;
1108 }
1109 if (!limit_cgroup)
1110 return ret_set_errno(false, ENOMEM);
1111
1112 if (!conf->cgroup_meta.container_dir) {
1113 suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1114 *suffix = '\0';
1115 }
1116 do {
1117 if (idx && suffix)
1118 sprintf(suffix, "-%d", idx);
1119
1120 for (i = 0; ops->hierarchies[i]; i++) {
1121 if (cgroup_tree_create(ops, handler->conf,
1122 ops->hierarchies[i], limit_cgroup,
1123 conf->cgroup_meta.namespace_dir,
1124 true))
1125 continue;
1126
1127 DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
1128 for (int j = 0; j <= i; j++)
1129 cgroup_tree_prune_leaf(ops->hierarchies[j],
1130 limit_cgroup, true);
1131
1132 idx++;
1133 break;
1134 }
1135 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1136
1137 if (idx == 1000 || (!suffix && idx != 0))
1138 return log_error_errno(false, ERANGE, "Failed to create container cgroup");
1139
1140 ops->container_cgroup = move_ptr(container_cgroup);
1141 if (__limit_cgroup)
1142 ops->container_limit_cgroup = move_ptr(__limit_cgroup);
1143 else
1144 ops->container_limit_cgroup = ops->container_cgroup;
1145 INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
1146 ops->container_cgroup, ops->container_limit_cgroup);
1147 return true;
1148 }
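
/*
 * Example of the retry naming (illustrative, assuming the default
 * "lxc.payload." prefix): for a container named "c1" this tries
 * "lxc.payload.c1" first, then "lxc.payload.c1-1", "lxc.payload.c1-2",
 * and so on up to "-999" before giving up with ERANGE.
 */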
1149
1150 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1151 struct lxc_handler *handler)
1152 {
1153 int monitor_len, transient_len = 0;
1154 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1155 transient[INTTYPE_TO_STRLEN(pid_t)];
1156
1157 if (!ops)
1158 return ret_set_errno(false, ENOENT);
1159
1160 if (!ops->hierarchies)
1161 return true;
1162
1163 if (!ops->monitor_cgroup)
1164 return ret_set_errno(false, ENOENT);
1165
1166 if (!handler || !handler->conf)
1167 return ret_set_errno(false, EINVAL);
1168
1169 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1170 if (monitor_len < 0)
1171 return false;
1172
1173 if (handler->transient_pid > 0) {
1174 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1175 if (transient_len < 0)
1176 return false;
1177 }
1178
1179 for (int i = 0; ops->hierarchies[i]; i++) {
1180 struct hierarchy *h = ops->hierarchies[i];
1181 int ret;
1182
1183 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
1184 if (ret)
1185 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1186
1187 TRACE("Moved monitor into cgroup %d", h->dfd_mon);
1188
1189 if (handler->transient_pid <= 0)
1190 continue;
1191
1192 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
1193 if (ret)
1194 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1195
1196 TRACE("Moved transient process into cgroup %d", h->dfd_mon);
1197
1198 /*
1199 * we don't keep the fds for non-unified hierarchies around
1200 * mainly because we don't make use of them anymore after the
1201 * core cgroup setup is done but also because there are quite a
1202 * lot of them.
1203 */
1204 if (!is_unified_hierarchy(h))
1205 close_prot_errno_disarm(h->dfd_mon);
1206 }
1207 handler->transient_pid = -1;
1208
1209 return true;
1210 }
1211
1212 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1213 struct lxc_handler *handler)
1214 {
1215 int len;
1216 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1217
1218 if (!ops)
1219 return ret_set_errno(false, ENOENT);
1220
1221 if (!ops->hierarchies)
1222 return true;
1223
1224 if (!ops->container_cgroup)
1225 return ret_set_errno(false, ENOENT);
1226
1227 if (!handler || !handler->conf)
1228 return ret_set_errno(false, EINVAL);
1229
1230 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1231 if (len < 0)
1232 return false;
1233
1234 for (int i = 0; ops->hierarchies[i]; i++) {
1235 struct hierarchy *h = ops->hierarchies[i];
1236 int ret;
1237
1238 if (is_unified_hierarchy(h) &&
1239 (handler->clone_flags & CLONE_INTO_CGROUP))
1240 continue;
1241
1242 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1243 if (ret != 0)
1244 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1245
1246 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1247 }
1248
1249 return true;
1250 }
1251
1252 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1253 gid_t chown_gid, mode_t chmod_mode)
1254 {
1255 int ret;
1256
1257 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1258 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1259 if (ret < 0)
1260 return log_warn_errno(-1,
1261 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1262 dirfd, path, (int)chown_uid,
1263 (int)chown_gid);
1264
1265 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1266 if (ret < 0)
1267 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1268 dirfd, path, (int)chmod_mode);
1269
1270 return 0;
1271 }
1272
1273 /* chgrp the container cgroups to container group. We leave
1274 * the container owner as cgroup owner. So we must make the
1275 * directories 775 so that the container can create sub-cgroups.
1276 *
1277 * Also chown the tasks and cgroup.procs files. Those may not
1278 * exist depending on kernel version.
1279 */
1280 static int chown_cgroup_wrapper(void *data)
1281 {
1282 int ret;
1283 uid_t destuid;
1284 struct generic_userns_exec_data *arg = data;
1285 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1286 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1287
1288 if (!lxc_drop_groups() && errno != EPERM)
1289 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1290
1291 ret = setresgid(nsgid, nsgid, nsgid);
1292 if (ret < 0)
1293 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
1294 (int)nsgid, (int)nsgid, (int)nsgid);
1295
1296 ret = setresuid(nsuid, nsuid, nsuid);
1297 if (ret < 0)
1298 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
1299 (int)nsuid, (int)nsuid, (int)nsuid);
1300
1301 destuid = get_ns_uid(arg->origuid);
1302 if (destuid == LXC_INVALID_UID)
1303 destuid = 0;
1304
1305 for (int i = 0; arg->hierarchies[i]; i++) {
1306 int dirfd = arg->hierarchies[i]->dfd_con;
1307
1308 if (dirfd < 0)
1309 return syserrno_set(-EBADF, "Invalid cgroup file descriptor");
1310
1311 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
1312
1313 /*
1314 * Failures to chown() these are inconvenient but not
   1315 		 * detrimental. We leave these owned by the container launcher,
1316 * so that container root can write to the files to attach. We
1317 * chmod() them 664 so that container systemd can write to the
1318 * files (which systemd in wily insists on doing).
1319 */
1320
1321 if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
1322 (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
1323
1324 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
1325
1326 if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
1327 continue;
1328
1329 for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
1330 (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
1331 }
1332
1333 return 0;
1334 }
1335
1336 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1337 struct lxc_conf *conf)
1338 {
1339 struct generic_userns_exec_data wrap;
1340
1341 if (!ops)
1342 return ret_set_errno(false, ENOENT);
1343
1344 if (!ops->hierarchies)
1345 return true;
1346
1347 if (!ops->container_cgroup)
1348 return ret_set_errno(false, ENOENT);
1349
1350 if (!conf)
1351 return ret_set_errno(false, EINVAL);
1352
1353 if (lxc_list_empty(&conf->id_map))
1354 return true;
1355
1356 wrap.origuid = geteuid();
1357 wrap.path = NULL;
1358 wrap.hierarchies = ops->hierarchies;
1359 wrap.conf = conf;
1360
1361 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1362 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1363
1364 return true;
1365 }
1366
1367 __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
1368 {
1369 if (!ops)
1370 return;
1371
1372 if (!ops->hierarchies)
1373 return;
1374
1375 for (int i = 0; ops->hierarchies[i]; i++) {
1376 struct hierarchy *h = ops->hierarchies[i];
1377 /*
1378 * we don't keep the fds for non-unified hierarchies around
1379 * mainly because we don't make use of them anymore after the
1380 * core cgroup setup is done but also because there are quite a
1381 * lot of them.
1382 */
1383 if (!is_unified_hierarchy(h))
1384 close_prot_errno_disarm(h->dfd_con);
1385 }
1386
1387 /*
1388 * The checking for freezer support should obviously be done at cgroup
   1389 	 * initialization time but that doesn't work reliably. The freezer
1390 * controller has been demoted (rightly so) to a simple file located in
1391 * each non-root cgroup. At the time when the container is created we
1392 * might still be located in /sys/fs/cgroup and so checking for
1393 * cgroup.freeze won't tell us anything because this file doesn't exist
1394 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1395 * find an already existing cgroup and then check within that cgroup
1396 * for the existence of cgroup.freeze but that will only work on
1397 * systemd based hosts. Other init systems might not manage cgroups and
1398 * so no cgroup will exist. So we defer until we have created cgroups
1399 * for our container which means we check here.
1400 */
1401 if (pure_unified_layout(ops) &&
1402 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1403 AT_SYMLINK_NOFOLLOW)) {
1404 TRACE("Unified hierarchy supports freezer");
1405 ops->unified->utilities |= FREEZER_CONTROLLER;
1406 }
1407 }
1408
1409 /* cgroup-full:* is done, no need to create subdirs */
1410 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1411 {
1412 switch (cgroup_automount_type) {
1413 case LXC_AUTO_CGROUP_RO:
1414 return true;
1415 case LXC_AUTO_CGROUP_RW:
1416 return true;
1417 case LXC_AUTO_CGROUP_MIXED:
1418 return true;
1419 }
1420
1421 return false;
1422 }
1423
1424 /* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1425 * remount controller ro if needed and bindmount the cgroupfs onto
1426 * control/the/cg/path.
1427 */
1428 static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
1429 char *hierarchy_mnt, char *cgpath,
1430 const char *container_cgroup)
1431 {
1432 __do_free char *sourcepath = NULL;
1433 int ret, remount_flags;
1434 int flags = MS_BIND;
1435
1436 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1437 (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
1438 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
1439 if (ret < 0)
1440 return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
1441 hierarchy_mnt, hierarchy_mnt);
1442
1443 remount_flags = add_required_remount_flags(hierarchy_mnt,
1444 hierarchy_mnt,
1445 flags | MS_REMOUNT);
1446 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
1447 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1448 NULL);
1449 if (ret < 0)
1450 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);
1451
1452 INFO("Remounted %s read-only", hierarchy_mnt);
1453 }
1454
1455 sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
1456 if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
1457 flags |= MS_RDONLY;
1458
1459 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1460 if (ret < 0)
1461 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
1462 h->controllers[0], cgpath);
1463 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1464
1465 if (flags & MS_RDONLY) {
1466 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1467 flags | MS_REMOUNT);
1468 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1469 if (ret < 0)
1470 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
1471 INFO("Remounted %s read-only", cgpath);
1472 }
1473
1474 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1475 return 0;
1476 }
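
/*
 * Rough shell equivalent for one legacy controller with cgroup:mixed
 * (paths illustrative only; $cg stands for $rootfs/sys/fs/cgroup): the
 * hierarchy mount is made read-only while the container's own cgroup is
 * bind-mounted writable on top:
 *
 *	mount --bind $cg/memory $cg/memory
 *	mount -o remount,bind,ro $cg/memory $cg/memory
 *	mount --bind /sys/fs/cgroup/memory/$base/$ct $cg/memory/$base/$ct
 */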
1477
1478 /* __cgroupfs_mount
1479 *
1480 * Mount cgroup hierarchies directly without using bind-mounts. The main
   1481  * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1482 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1483 */
1484 static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1485 struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
1486 const char *hierarchy_mnt)
1487 {
1488 __do_close int fd_fs = -EBADF;
1489 unsigned int flags = 0;
1490 char *fstype;
1491 int ret;
1492
1493 if (dfd_mnt_cgroupfs < 0)
1494 return ret_errno(EINVAL);
1495
1496 flags |= MOUNT_ATTR_NOSUID;
1497 flags |= MOUNT_ATTR_NOEXEC;
1498 flags |= MOUNT_ATTR_NODEV;
1499 flags |= MOUNT_ATTR_RELATIME;
1500
1501 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1502 (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
1503 flags |= MOUNT_ATTR_RDONLY;
1504
1505 if (is_unified_hierarchy(h))
1506 fstype = "cgroup2";
1507 else
1508 fstype = "cgroup";
1509
1510 if (can_use_mount_api()) {
1511 fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
1512 if (fd_fs < 0)
1513 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
1514
1515 if (!is_unified_hierarchy(h)) {
1516 for (const char **it = (const char **)h->controllers; it && *it; it++) {
1517 if (strnequal(*it, "name=", STRLITERALLEN("name=")))
1518 ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
1519 else
1520 ret = fs_set_property(fd_fs, *it, "");
1521 if (ret < 0)
1522 return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
1523 }
1524 }
1525
1526 ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
1527 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
1528 flags);
1529 } else {
1530 __do_free char *controllers = NULL, *target = NULL;
1531 unsigned int old_flags = 0;
1532 const char *rootfs_mnt;
1533
1534 if (!is_unified_hierarchy(h)) {
1535 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1536 if (!controllers)
1537 return ret_errno(ENOMEM);
1538 }
1539
1540 rootfs_mnt = get_rootfs_mnt(rootfs);
1541 ret = mnt_attributes_old(flags, &old_flags);
1542 if (ret)
1543 return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");
1544
1545 target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
1546 ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
1547 }
1548 if (ret < 0)
1549 return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
1550 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
1551
1552 DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
1553 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
1554 return 0;
1555 }
1556
1557 static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1558 struct lxc_rootfs *rootfs,
1559 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
1560 {
1561 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1562 dfd_mnt_cgroupfs, hierarchy_mnt);
1563 }
1564
1565 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1566 struct lxc_rootfs *rootfs,
1567 int dfd_mnt_cgroupfs,
1568 const char *hierarchy_mnt)
1569 {
1570 switch (cgroup_automount_type) {
1571 case LXC_AUTO_CGROUP_FULL_RO:
1572 break;
1573 case LXC_AUTO_CGROUP_FULL_RW:
1574 break;
1575 case LXC_AUTO_CGROUP_FULL_MIXED:
1576 break;
1577 default:
1578 return 0;
1579 }
1580
1581 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1582 dfd_mnt_cgroupfs, hierarchy_mnt);
1583 }
1584
1585 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1586 struct lxc_handler *handler, int cg_flags)
1587 {
1588 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1589 __do_free char *cgroup_root = NULL;
1590 int cgroup_automount_type;
1591 bool in_cgroup_ns = false, wants_force_mount = false;
1592 struct lxc_conf *conf = handler->conf;
1593 struct lxc_rootfs *rootfs = &conf->rootfs;
1594 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1595 int ret;
1596
1597 if (!ops)
1598 return ret_set_errno(false, ENOENT);
1599
1600 if (!ops->hierarchies)
1601 return true;
1602
1603 if (!conf)
1604 return ret_set_errno(false, EINVAL);
1605
1606 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1607 return log_trace(true, "No cgroup mounts requested");
1608
1609 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1610 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1611 wants_force_mount = true;
1612 }
1613
1614 switch (cg_flags) {
1615 case LXC_AUTO_CGROUP_RO:
1616 TRACE("Read-only cgroup mounts requested");
1617 break;
1618 case LXC_AUTO_CGROUP_RW:
1619 TRACE("Read-write cgroup mounts requested");
1620 break;
1621 case LXC_AUTO_CGROUP_MIXED:
1622 TRACE("Mixed cgroup mounts requested");
1623 break;
1624 case LXC_AUTO_CGROUP_FULL_RO:
1625 TRACE("Full read-only cgroup mounts requested");
1626 break;
1627 case LXC_AUTO_CGROUP_FULL_RW:
1628 TRACE("Full read-write cgroup mounts requested");
1629 break;
1630 case LXC_AUTO_CGROUP_FULL_MIXED:
1631 TRACE("Full mixed cgroup mounts requested");
1632 break;
1633 default:
1634 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1635 }
1636 cgroup_automount_type = cg_flags;
1637
1638 if (!wants_force_mount) {
1639 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
1640
1641 /*
   1642 		 * Most recent distro versions currently have init systems that
1643 * do support cgroup2 but do not mount it by default unless
1644 * explicitly told so even if the host is cgroup2 only. That
1645 * means they often will fail to boot. Fix this by pre-mounting
1646 * cgroup2 by default. We will likely need to be doing this a
1647 * few years until all distros have switched over to cgroup2 at
1648 * which point we can safely assume that their init systems
1649 * will mount it themselves.
1650 */
1651 if (pure_unified_layout(ops))
1652 wants_force_mount = true;
1653 }
1654
1655 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
1656 in_cgroup_ns = true;
1657
1658 if (in_cgroup_ns && !wants_force_mount)
1659 return log_trace(true, "Mounting cgroups not requested or needed");
1660
1661 /* This is really the codepath that we want. */
1662 if (pure_unified_layout(ops)) {
1663 __do_close int dfd_mnt_unified = -EBADF;
1664
1665 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1666 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1667 if (dfd_mnt_unified < 0)
1668 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1669 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1670 /*
1671 * If cgroup namespaces are supported but the container will
1672 * not have CAP_SYS_ADMIN after it has started we need to mount
1673 * the cgroups manually.
1674 *
1675 * Note that here we know that wants_force_mount is true.
1676 * Otherwise we would've returned early above.
1677 */
1678 if (in_cgroup_ns) {
1679 /*
1680 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
1681 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
1682 * 3. cgroup:mixed:force -> See comment above how this
1683 * does not apply so
1684 * cgroup:mixed is equal to
1685 * cgroup:rw when cgroup
1686 * namespaces are supported.
1687
1688 * 4. cgroup:rw -> No-op; init system responsible for mounting.
1689 * 5. cgroup:ro -> No-op; init system responsible for mounting.
1690 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
1691 *
1692 * 7. cgroup-full:rw -> Not supported.
1693 * 8. cgroup-full:ro -> Not supported.
1694 * 9. cgroup-full:mixed -> Not supported.
1695
1696 * 10. cgroup-full:rw:force -> Not supported.
1697 * 11. cgroup-full:ro:force -> Not supported.
1698 * 12. cgroup-full:mixed:force -> Not supported.
1699 */
1700 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
1701 if (ret < 0)
1702 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
1703
1704 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
1705 } else {
1706 /*
   1707 			 * Either no cgroup namespace is supported (highly
   1708 			 * unlikely unless we're dealing with a Frankenkernel),
   1709 			 * or the user requested to keep the cgroup namespace
1710 * of the host or another container.
1711 */
1712 if (wants_force_mount) {
1713 /*
1714 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
1715 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
   1716 				 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem
   1717 				 *                          and make the parent directory of the
1718 * container's cgroup read-only but the
1719 * container's cgroup writable.
1720 *
1721 * 10. cgroup-full:rw:force ->
1722 * 11. cgroup-full:ro:force ->
1723 * 12. cgroup-full:mixed:force ->
1724 */
1725 errno = EOPNOTSUPP;
1726 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1727 } else {
1728 errno = EOPNOTSUPP;
1729 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1730 }
1731 }
1732
1733 return syserrno(false, "Failed to mount cgroups");
1734 }
1735
1736 /*
1737 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
1738 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
1739 * DEFAULT_CGROUP_MOUNTPOINT define.
1740 */
1741 if (can_use_mount_api()) {
1742 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1743 if (fd_fs < 0)
1744 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
1745
1746 ret = fs_set_property(fd_fs, "mode", "0755");
1747 if (ret < 0)
1748 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1749
1750 ret = fs_set_property(fd_fs, "size", "10240k");
1751 if (ret < 0)
1752 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1753
1754 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1755 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
1756 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
1757 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
1758 } else {
1759 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1760 ret = safe_mount(NULL, cgroup_root, "tmpfs",
1761 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1762 "size=10240k,mode=755", rootfs_mnt);
1763 }
1764 if (ret < 0)
1765 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
1766 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1767
1768 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1769 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1770 if (dfd_mnt_tmpfs < 0)
1771 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1772 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1773
1774 for (int i = 0; ops->hierarchies[i]; i++) {
1775 __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
1776 struct hierarchy *h = ops->hierarchies[i];
1777
1778 ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
1779 if (ret < 0)
1780 return syserrno(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
1781
1782 if (in_cgroup_ns && wants_force_mount) {
1783 /*
1784 * If cgroup namespaces are supported but the container
1785 * will not have CAP_SYS_ADMIN after it has started we
1786 * need to mount the cgroups manually.
1787 */
1788 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
1789 dfd_mnt_tmpfs, h->at_mnt);
1790 if (ret < 0)
1791 return false;
1792
1793 continue;
1794 }
1795
1796 /* Here is where the ancient kernel section begins. */
1797 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
1798 dfd_mnt_tmpfs, h->at_mnt);
1799 if (ret < 0)
1800 return false;
1801
1802 if (!cg_mount_needs_subdirs(cgroup_automount_type))
1803 continue;
1804
1805 if (!cgroup_root)
1806 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1807
1808 hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
1809 path2 = must_make_path(hierarchy_mnt, h->at_base,
1810 ops->container_cgroup, NULL);
1811 ret = mkdir_p(path2, 0755);
1812 if (ret < 0 && (errno != EEXIST))
1813 return false;
1814
1815 ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
1816 hierarchy_mnt, path2,
1817 ops->container_cgroup);
1818 if (ret < 0)
1819 return false;
1820 }
1821
1822 return true;
1823 }
1824
1825 /* Only root needs to escape to the cgroup of its init. */
1826 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1827 struct lxc_conf *conf)
1828 {
1829 if (!ops)
1830 return ret_set_errno(false, ENOENT);
1831
1832 if (!ops->hierarchies)
1833 return true;
1834
1835 if (!conf)
1836 return ret_set_errno(false, EINVAL);
1837
1838 if (conf->cgroup_meta.relative || geteuid())
1839 return true;
1840
1841 for (int i = 0; ops->hierarchies[i]; i++) {
1842 __do_free char *fullpath = NULL;
1843 int ret;
1844
1845 fullpath = make_cgroup_path(ops->hierarchies[i],
1846 ops->hierarchies[i]->at_base,
1847 "cgroup.procs", NULL);
1848 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1849 if (ret != 0)
1850 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1851 }
1852
1853 return true;
1854 }
1855
1856 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1857 {
1858 int i = 0;
1859
1860 if (!ops)
1861 return ret_set_errno(-1, ENOENT);
1862
1863 if (!ops->hierarchies)
1864 return 0;
1865
1866 for (; ops->hierarchies[i]; i++)
1867 ;
1868
1869 return i;
1870 }
1871
1872 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1873 int n, char ***out)
1874 {
1875 int i;
1876
1877 if (!ops)
1878 return ret_set_errno(false, ENOENT);
1879
1880 if (!ops->hierarchies)
1881 return ret_set_errno(false, ENOENT);
1882
1883 /* Sanity check that all hierarchies up to and including index n exist. */
1884 for (i = 0; i <= n; i++)
1885 if (!ops->hierarchies[i])
1886 return ret_set_errno(false, ENOENT);
1887
1888 *out = ops->hierarchies[n]->controllers;
1889
1890 return true;
1891 }
1892
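/* Freeze a container on the legacy (cgroup1) layout by writing "FROZEN" to
 * the freezer controller's freezer.state file. Unlike the unified code path
 * below, this does not wait for the freeze to take effect.
 */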
1893 static int cg_legacy_freeze(struct cgroup_ops *ops)
1894 {
1895 struct hierarchy *h;
1896
1897 h = get_hierarchy(ops, "freezer");
1898 if (!h)
1899 return ret_set_errno(-1, ENOENT);
1900
1901 return lxc_write_openat(h->path_con, "freezer.state",
1902 "FROZEN", STRLITERALLEN("FROZEN"));
1903 }
1904
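/* Mainloop callback for cgroup.events notifications: re-read the file on
 * every wakeup and close the mainloop once the "frozen 0|1" line matches the
 * state passed in @cbdata.
 */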
1905 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1906 struct lxc_epoll_descr *descr)
1907 {
1908 __do_free char *line = NULL;
1909 __do_fclose FILE *f = NULL;
1910 int state = PTR_TO_INT(cbdata);
1911 size_t len = 0;
1912 const char *state_string;
1913
1914 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1915 if (!f)
1916 return LXC_MAINLOOP_ERROR;
1917
1918 if (state == 1)
1919 state_string = "frozen 1";
1920 else
1921 state_string = "frozen 0";
1922
1923 while (getline(&line, &len, f) != -1)
1924 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1925 return LXC_MAINLOOP_CLOSE;
1926
1927 rewind(f);
1928
1929 return LXC_MAINLOOP_CONTINUE;
1930 }
1931
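/* Common helper for freezing and unfreezing on the unified layout. When a
 * timeout is given, an EPOLLPRI watch on cgroup.events is registered before
 * cgroup.freeze is written so that the resulting state transition cannot be
 * missed, and the mainloop then waits for confirmation.
 */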
1932 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1933 const char *state_string,
1934 int state_num,
1935 const char *epoll_error,
1936 const char *wait_error)
1937 {
1938 __do_close int fd = -EBADF;
1939 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
1940 int ret;
1941 struct lxc_epoll_descr descr;
1942 struct hierarchy *h;
1943
1944 h = ops->unified;
1945 if (!h)
1946 return ret_set_errno(-1, ENOENT);
1947
1948 if (!h->path_con)
1949 return ret_set_errno(-1, EEXIST);
1950
1951 if (timeout != 0) {
1952 __do_free char *events_file = NULL;
1953
1954 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
1955 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1956 if (fd < 0)
1957 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1958
1959 ret = lxc_mainloop_open(&descr);
1960 if (ret)
1961 return log_error_errno(-1, errno, "%s", epoll_error);
1962
1963 /* automatically cleaned up now */
1964 descr_ptr = &descr;
1965
1966 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
1967 if (ret < 0)
1968 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1969 }
1970
1971 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
1972 if (ret < 0)
1973 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
1974
1975 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1976 return log_error_errno(-1, errno, "%s", wait_error);
1977
1978 return 0;
1979 }
1980
1981 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1982 {
1983 return cg_unified_freeze_do(ops, timeout, "1", 1,
1984 "Failed to create epoll instance to wait for container freeze",
1985 "Failed to wait for container to be frozen");
1986 }
1987
1988 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1989 {
1990 if (!ops->hierarchies)
1991 return ret_set_errno(-1, ENOENT);
1992
1993 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1994 return cg_legacy_freeze(ops);
1995
1996 return cg_unified_freeze(ops, timeout);
1997 }
1998
1999 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2000 {
2001 struct hierarchy *h;
2002
2003 h = get_hierarchy(ops, "freezer");
2004 if (!h)
2005 return ret_set_errno(-1, ENOENT);
2006
2007 return lxc_write_openat(h->path_con, "freezer.state",
2008 "THAWED", STRLITERALLEN("THAWED"));
2009 }
2010
2011 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2012 {
2013 return cg_unified_freeze_do(ops, timeout, "0", 0,
2014 "Failed to create epoll instance to wait for container unfreeze",
2015 "Failed to wait for container to be unfrozen");
2016 }
2017
2018 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2019 {
2020 if (!ops->hierarchies)
2021 return ret_set_errno(-1, ENOENT);
2022
2023 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2024 return cg_legacy_unfreeze(ops);
2025
2026 return cg_unified_unfreeze(ops, timeout);
2027 }
2028
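/* Return the container's cgroup path relative to the mountpoint for the
 * given controller, selecting the limiting or payload cgroup depending on
 * @limiting. The result points into the hierarchy's stored path and must not
 * be freed by the caller.
 */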
2029 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2030 const char *controller, bool limiting)
2031 {
2032 struct hierarchy *h;
2033 size_t len;
2034 const char *path;
2035
2036 h = get_hierarchy(ops, controller);
2037 if (!h)
2038 return log_warn_errno(NULL, ENOENT,
2039 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
2040
2041 if (limiting)
2042 path = h->path_lim;
2043 else
2044 path = h->path_con;
2045 if (!path)
2046 return NULL;
2047
2048 len = strlen(h->at_mnt);
2049 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
2050 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2051 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2052 path += strspn(path, "/");
2053 }
2054 return path + len;
2055 }
2056
2057 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2058 const char *controller)
2059 {
2060 return cgfsng_get_cgroup_do(ops, controller, false);
2061 }
2062
2063 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2064 const char *controller)
2065 {
2066 return cgfsng_get_cgroup_do(ops, controller, true);
2067 }
2068
2069 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2070 * which must be freed by the caller.
2071 */
2072 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2073 const char *inpath,
2074 const char *filename)
2075 {
2076 return make_cgroup_path(h, inpath, filename, NULL);
2077 }
2078
2079 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2080 {
2081 int idx = 1;
2082 int ret;
2083 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2084 ssize_t pidstr_len;
2085
2086 /* Create leaf cgroup. */
2087 ret = mkdirat(unified_fd, ".lxc", 0755);
2088 if (ret < 0 && errno != EEXIST)
2089 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2090
2091 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2092 if (pidstr_len < 0)
2093 return pidstr_len;
2094
2095 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2096 if (ret < 0)
2097 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2098 if (ret == 0)
2099 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2100
2101 /* this is a non-leaf node */
2102 if (errno != EBUSY)
2103 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2104
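/*
 * The target cgroup already has children and can no longer take processes
 * directly. Probe for a usable leaf by creating numbered ".lxc-<idx>"
 * subcgroups and retrying the attach, giving up after 1000 attempts.
 */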
2105 do {
2106 bool rm = false;
2107 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2108 char *slash = attach_cgroup;
2109
2110 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2111 if (ret < 0)
2112 return ret;
2113
2114 /*
2115 * This shouldn't really happen but the compiler might complain
2116 * that a short write would cause a buffer overrun. So be on
2117 * the safe side.
2118 */
2119 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2120 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2121
2122 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2123 *slash = '\0';
2124
2125 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2126 if (ret < 0 && errno != EEXIST)
2127 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2128 if (ret == 0)
2129 rm = true;
2130
2131 *slash = '/';
2132
2133 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2134 if (ret == 0)
2135 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2136
2137 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2138 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2139
2140 /* this is a non-leaf node */
2141 if (errno != EBUSY)
2142 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2143
2144 idx++;
2145 } while (idx < 1000);
2146
2147 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2148 }
2149
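/* Child half of the namespaced attach: running inside the container's user
 * namespace, create the ".lxc" leaf cgroup and send both candidate
 * cgroup.procs file descriptors back to the parent over a unix socket.
 */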
2150 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2151 int unified_fd, int *sk_fd)
2152 {
2153 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2154 int target_fds[2];
2155 ssize_t ret;
2156
2157 /* Create leaf cgroup. */
2158 ret = mkdirat(unified_fd, ".lxc", 0755);
2159 if (ret < 0 && errno != EEXIST)
2160 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2161
2162 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2163 if (target_fd0 < 0)
2164 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2165 target_fds[0] = target_fd0;
2166
2167 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2168 if (target_fd1 < 0)
2169 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2170 target_fds[1] = target_fd1;
2171
2172 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2173 if (ret <= 0)
2174 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2175 target_fd0, target_fd1);
2176
2177 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2178 }
2179
2180 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2181 int *sk_fd, pid_t pid)
2182 {
2183 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2184 int target_fds[2];
2185 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2186 size_t pidstr_len;
2187 ssize_t ret;
2188
2189 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
2190 if (ret <= 0)
2191 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2192 target_fd0 = target_fds[0];
2193 target_fd1 = target_fds[1];
2194
2195 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2196
2197 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2198 if (ret > 0 && ret == pidstr_len)
2199 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2200
2201 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2202 if (ret > 0 && ret == pidstr_len)
2203 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2204
2205 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2206 target_fd0, target_fd1);
2207 }
2208
2209 struct userns_exec_unified_attach_data {
2210 const struct lxc_conf *conf;
2211 int unified_fd;
2212 int sk_pair[2];
2213 pid_t pid;
2214 };
2215
2216 static int cgroup_unified_attach_child_wrapper(void *data)
2217 {
2218 struct userns_exec_unified_attach_data *args = data;
2219
2220 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2221 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2222 return ret_errno(EINVAL);
2223
2224 close_prot_errno_disarm(args->sk_pair[0]);
2225 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2226 &args->sk_pair[1]);
2227 }
2228
2229 static int cgroup_unified_attach_parent_wrapper(void *data)
2230 {
2231 struct userns_exec_unified_attach_data *args = data;
2232
2233 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2234 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2235 return ret_errno(EINVAL);
2236
2237 close_prot_errno_disarm(args->sk_pair[1]);
2238 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2239 args->pid);
2240 }
2241
2242 /* Technically, we're always at a delegation boundary here (this is especially
2243 * true when cgroup namespaces are available). The reasoning is that in order
2244 * for us to have been able to start a container in the first place the root
2245 * cgroup must have been a leaf node. Now, either the container's init system
2246 * has populated the cgroup and kept it as a leaf node or it has created
2247 * subtrees. In the former case we simply attach to the leaf node we created
2248 * when we started the container; in the latter case we create our own cgroup
2249 * for the attaching process.
2250 */
2251 static int __cg_unified_attach(const struct hierarchy *h,
2252 const struct lxc_conf *conf, const char *name,
2253 const char *lxcpath, pid_t pid,
2254 const char *controller)
2255 {
2256 __do_close int unified_fd = -EBADF;
2257 __do_free char *path = NULL, *cgroup = NULL;
2258 int ret;
2259
2260 if (!conf || !name || !lxcpath || pid <= 0)
2261 return ret_errno(EINVAL);
2262
2263 ret = cgroup_attach(conf, name, lxcpath, pid);
2264 if (ret == 0)
2265 return log_trace(0, "Attached to unified cgroup via command handler");
2266 if (ret != -ENOCGROUP2)
2267 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2268
2269 /* Fall back to retrieving the path for the unified cgroup. */
2270 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2271 /* not running */
2272 if (!cgroup)
2273 return 0;
2274
2275 path = make_cgroup_path(h, cgroup, NULL);
2276
2277 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2278 if (unified_fd < 0)
2279 return -errno;
2280
2281 if (!lxc_list_empty(&conf->id_map)) {
2282 struct userns_exec_unified_attach_data args = {
2283 .conf = conf,
2284 .unified_fd = unified_fd,
2285 .pid = pid,
2286 };
2287
2288 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2289 if (ret < 0)
2290 return -errno;
2291
2292 ret = userns_exec_minimal(conf,
2293 cgroup_unified_attach_parent_wrapper,
2294 &args,
2295 cgroup_unified_attach_child_wrapper,
2296 &args);
2297 } else {
2298 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2299 }
2300
2301 return ret;
2302 }
2303
2304 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2305 const struct lxc_conf *conf,
2306 const char *name, const char *lxcpath,
2307 pid_t pid)
2308 {
2309 int len, ret;
2310 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2311
2312 if (!ops)
2313 return ret_set_errno(false, ENOENT);
2314
2315 if (!ops->hierarchies)
2316 return true;
2317
2318 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2319 if (len < 0)
2320 return false;
2321
2322 for (int i = 0; ops->hierarchies[i]; i++) {
2323 __do_free char *fullpath = NULL, *path = NULL;
2324 struct hierarchy *h = ops->hierarchies[i];
2325
2326 if (h->fs_type == UNIFIED_HIERARCHY) {
2327 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2328 h->controllers[0]);
2329 if (ret < 0)
2330 return false;
2331
2332 continue;
2333 }
2334
2335 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2336 /* not running */
2337 if (!path)
2338 return false;
2339
2340 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2341 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2342 if (ret < 0)
2343 return log_error_errno(false, errno, "Failed to attach %d to %s",
2344 (int)pid, fullpath);
2345 }
2346
2347 return true;
2348 }
2349
2350 /* Called externally (e.g. from 'lxc-cgroup') to query cgroup limits. Here we
2351 * don't have a cgroup_data set up, so we ask the running container through the
2352 * commands API for the cgroup path.
2353 */
2354 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2355 char *value, size_t len, const char *name,
2356 const char *lxcpath)
2357 {
2358 __do_free char *path = NULL;
2359 __do_free char *controller = NULL;
2360 char *p;
2361 struct hierarchy *h;
2362 int ret = -1;
2363
2364 if (!ops)
2365 return ret_set_errno(-1, ENOENT);
2366
2367 controller = strdup(filename);
2368 if (!controller)
2369 return ret_errno(ENOMEM);
2370
2371 p = strchr(controller, '.');
2372 if (p)
2373 *p = '\0';
2374
2375 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2376 /* not running */
2377 if (!path)
2378 return -1;
2379
2380 h = get_hierarchy(ops, controller);
2381 if (h) {
2382 __do_free char *fullpath = NULL;
2383
2384 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2385 ret = lxc_read_from_file(fullpath, value, len);
2386 }
2387
2388 return ret;
2389 }
2390
2391 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2392 {
2393 for (int count = 0; count < 3; count++, val++) {
2394 switch (*val) {
2395 case 'r':
2396 case 'w':
2397 case 'm':
2398 device->access[count] = *val;
2399 break;
2400 case '\n':
2401 case '\0':
2402 count = 3;
2403 break;
2404 default:
2405 return ret_errno(EINVAL);
2406 }
2407 }
2412
2413 return 0;
2414 }
2415
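/* Parse a devices cgroup rule as written to devices.allow/devices.deny, e.g.
 *	c 1:3 rwm
 * for character device 1:3 (/dev/null) with read, write, and mknod access.
 * The type is 'a', 'b', or 'c', major and minor are numeric or '*', and a
 * bare "a" is a global rule matching all devices.
 */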
2416 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2417 const char *val)
2418 {
2419 int count, ret;
2420 char temp[50];
2421
2422 if (strequal("devices.allow", key))
2423 device->allow = 1; /* allow the device */
2424 else
2425 device->allow = 0; /* deny the device */
2426
2427 if (strequal(val, "a")) {
2428 /* global rule */
2429 device->type = 'a';
2430 device->major = -1;
2431 device->minor = -1;
2432 return 0;
2433 }
2434
2435 switch (*val) {
2436 case 'a':
2437 __fallthrough;
2438 case 'b':
2439 __fallthrough;
2440 case 'c':
2441 device->type = *val;
2442 break;
2443 default:
2444 return -1;
2445 }
2446
2447 val++;
2448 if (!isspace(*val))
2449 return -1;
2450 val++;
2451 if (*val == '*') {
2452 device->major = -1;
2453 val++;
2454 } else if (isdigit(*val)) {
2455 memset(temp, 0, sizeof(temp));
2456 for (count = 0; count < sizeof(temp) - 1; count++) {
2457 temp[count] = *val;
2458 val++;
2459 if (!isdigit(*val))
2460 break;
2461 }
2462 ret = lxc_safe_int(temp, &device->major);
2463 if (ret)
2464 return -1;
2465 } else {
2466 return -1;
2467 }
2468 if (*val != ':')
2469 return -1;
2470 val++;
2471
2472 /* read minor */
2473 if (*val == '*') {
2474 device->minor = -1;
2475 val++;
2476 } else if (isdigit(*val)) {
2477 memset(temp, 0, sizeof(temp));
2478 for (count = 0; count < sizeof(temp) - 1; count++) {
2479 temp[count] = *val;
2480 val++;
2481 if (!isdigit(*val))
2482 break;
2483 }
2484 ret = lxc_safe_int(temp, &device->minor);
2485 if (ret)
2486 return -1;
2487 } else {
2488 return -1;
2489 }
2490 if (!isspace(*val))
2491 return -1;
2492
2493 return device_cgroup_parse_access(device, ++val);
2494 }
2495
2496 /* Called externally (e.g. from 'lxc-cgroup') to set new cgroup limits. Here we
2497 * don't have a cgroup_data set up, so we ask the running container through the
2498 * commands API for the cgroup path.
2499 */
2500 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2501 const char *key, const char *value,
2502 const char *name, const char *lxcpath)
2503 {
2504 __do_free char *path = NULL;
2505 __do_free char *controller = NULL;
2506 char *p;
2507 struct hierarchy *h;
2508 int ret = -1;
2509
2510 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2511 is_empty_string(name) || is_empty_string(lxcpath))
2512 return ret_errno(EINVAL);
2513
2514 controller = strdup(key);
2515 if (!controller)
2516 return ret_errno(ENOMEM);
2517
2518 p = strchr(controller, '.');
2519 if (p)
2520 *p = '\0';
2521
2522 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2523 struct device_item device = {};
2524
2525 ret = device_cgroup_rule_parse(&device, key, value);
2526 if (ret < 0)
2527 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2528 key, value);
2529
2530 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2531 if (ret < 0)
2532 return -1;
2533
2534 return 0;
2535 }
2536
2537 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2538 /* not running */
2539 if (!path)
2540 return -1;
2541
2542 h = get_hierarchy(ops, controller);
2543 if (h) {
2544 __do_free char *fullpath = NULL;
2545
2546 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2547 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2548 }
2549
2550 return ret;
2551 }
2552
2553 /* Take a devices cgroup line of the form
2554 * /dev/foo rwx
2555 * and parse it into @device so it can be rendered as a valid
2556 * type major:minor mode
2557 * line. Return <0 on error. (convert_devpath() below does the rendering;
2558 * its dest buffer must be preallocated and long enough to hold the output.)
2559 */
2560 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2561 const char *devpath)
2562 {
2563 __do_free char *path = NULL;
2564 char *mode = NULL;
2565 int n_parts, ret;
2566 char *p;
2567 struct stat sb;
2568
2569 path = strdup(devpath);
2570 if (!path)
2571 return ret_errno(ENOMEM);
2572
2573 /*
2574 * Read path followed by mode. Ignore any trailing text.
2575 * A ' # comment' would be legal. Technically other text is not
2576 * legal, we could check for that if we cared to.
2577 */
2578 for (n_parts = 1, p = path; *p; p++) {
2579 if (*p != ' ')
2580 continue;
2581 *p = '\0';
2582
2583 if (n_parts != 1)
2584 break;
2585 p++;
2586 n_parts++;
2587
2588 while (*p == ' ')
2589 p++;
2590
2591 mode = p;
2592
2593 if (*p == '\0')
2594 return ret_set_errno(-1, EINVAL);
2595 }
2596
2597 if (!mode)
2598 return ret_errno(EINVAL);
2599
2600 if (device_cgroup_parse_access(device, mode) < 0)
2601 return -1;
2602
2603 ret = stat(path, &sb);
2604 if (ret < 0)
2605 return ret_set_errno(-1, errno);
2606
2607 mode_t m = sb.st_mode & S_IFMT;
2608 switch (m) {
2609 case S_IFBLK:
2610 device->type = 'b';
2611 break;
2612 case S_IFCHR:
2613 device->type = 'c';
2614 break;
2615 default:
2616 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2617 }
2618
2619 device->major = MAJOR(sb.st_rdev);
2620 device->minor = MINOR(sb.st_rdev);
2621 device->allow = 1;
2622
2623 return 0;
2624 }
2625
2626 static int convert_devpath(const char *invalue, char *dest)
2627 {
2628 struct device_item device = {};
2629 int ret;
2630
2631 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2632 if (ret < 0)
2633 return -1;
2634
2635 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2636 device.minor, device.access);
2637 if (ret < 0)
2638 return log_error_errno(ret, -ret,
2639 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2640 device.type, device.major, device.minor,
2641 device.access);
2642
2643 return 0;
2644 }
2645
2646 /* Called from setup_limits - here we have the container's cgroup_data because
2647 * we created the cgroups.
2648 */
2649 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2650 const char *value, bool is_cpuset)
2651 {
2652 __do_free char *controller = NULL;
2653 char *p;
2654 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2655 char converted_value[50];
2656 struct hierarchy *h;
2657
2658 controller = strdup(filename);
2659 if (!controller)
2660 return ret_errno(ENOMEM);
2661
2662 p = strchr(controller, '.');
2663 if (p)
2664 *p = '\0';
2665
2666 if (strequal("devices.allow", filename) && value[0] == '/') {
2667 int ret;
2668
2669 ret = convert_devpath(value, converted_value);
2670 if (ret < 0)
2671 return ret;
2672 value = converted_value;
2673 }
2674
2675 h = get_hierarchy(ops, controller);
2676 if (!h)
2677 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2678
2679 if (is_cpuset) {
2680 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2681 if (ret)
2682 return ret;
2683 }
2684 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2685 }
2686
2687 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2688 struct lxc_conf *conf,
2689 bool do_devices)
2690 {
2691 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2692 struct lxc_list *cgroup_settings;
2693 struct lxc_list *iterator, *next;
2694 struct lxc_cgroup *cg;
2695 bool ret = false;
2696
2697 if (!ops)
2698 return ret_set_errno(false, ENOENT);
2699
2700 if (!conf)
2701 return ret_set_errno(false, EINVAL);
2702
2703 cgroup_settings = &conf->cgroup;
2704 if (lxc_list_empty(cgroup_settings))
2705 return true;
2706
2707 if (!ops->hierarchies)
2708 return ret_set_errno(false, EINVAL);
2709
2710 if (pure_unified_layout(ops))
2711 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2712
2713 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2714 if (!sorted_cgroup_settings)
2715 return false;
2716
2717 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2718 cg = iterator->elem;
2719
2720 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2721 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
2722 if (do_devices && (errno == EACCES || errno == EPERM)) {
2723 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2724 continue;
2725 }
2726 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2727 goto out;
2728 }
2729 DEBUG("Set controller \"%s\" to \"%s\"", cg->subsystem, cg->value);
2730 }
2731 }
2732
2733 ret = true;
2734 INFO("Limits for the legacy cgroup hierarchies have been set up");
2735 out:
2736 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2737 lxc_list_del(iterator);
2738 free(iterator);
2739 }
2740
2741 return ret;
2742 }
2743
2744 /*
2745 * Some of the parsing logic comes from the original cgroup device v1
2746 * implementation in the kernel.
2747 */
2748 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2749 struct lxc_conf *conf, const char *key,
2750 const char *val)
2751 {
2752 struct device_item device_item = {};
2753 int ret;
2754
2755 if (strequal("devices.allow", key) && abspath(val))
2756 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2757 else
2758 ret = device_cgroup_rule_parse(&device_item, key, val);
2759 if (ret < 0)
2760 return syserrno_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2761
2762 /*
2763 * Note that bpf_list_add_device() returns 1 if it altered the device
2764 * list and 0 if it didn't; both return values indicate success.
2765 * Only a negative return value indicates an error.
2766 */
2767 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2768 if (ret < 0)
2769 return -1;
2770
2771 return 0;
2772 }
2773
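/* Apply lxc.cgroup2.* settings on the unified hierarchy. Device rules are
 * not written to a cgroup file; they are collected into the bpf device list
 * and attached later via cgfsng_devices_activate().
 */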
2774 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2775 struct lxc_handler *handler)
2776 {
2777 struct lxc_list *cgroup_settings, *iterator;
2778 struct hierarchy *h;
2779 struct lxc_conf *conf;
2780
2781 if (!ops)
2782 return ret_set_errno(false, ENOENT);
2783
2784 if (!ops->hierarchies)
2785 return true;
2786
2787 if (!ops->container_cgroup)
2788 return ret_set_errno(false, EINVAL);
2789
2790 if (!handler || !handler->conf)
2791 return ret_set_errno(false, EINVAL);
2792 conf = handler->conf;
2793
2794 cgroup_settings = &conf->cgroup2;
2795 if (lxc_list_empty(cgroup_settings))
2796 return true;
2797
2798 if (!pure_unified_layout(ops))
2799 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
2800
2801 if (!ops->unified)
2802 return false;
2803 h = ops->unified;
2804
2805 lxc_list_for_each (iterator, cgroup_settings) {
2806 struct lxc_cgroup *cg = iterator->elem;
2807 int ret;
2808
2809 if (strnequal("devices", cg->subsystem, 7))
2810 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
2811 else
2812 ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
2813 if (ret < 0)
2814 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2815
2816 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2817 }
2818
2819 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2820 }
2821
2822 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2823 {
2824 struct lxc_conf *conf;
2825 struct hierarchy *unified;
2826
2827 if (!ops)
2828 return ret_set_errno(false, ENOENT);
2829
2830 if (!ops->hierarchies)
2831 return true;
2832
2833 if (!ops->container_cgroup)
2834 return ret_set_errno(false, EEXIST);
2835
2836 if (!handler || !handler->conf)
2837 return ret_set_errno(false, EINVAL);
2838 conf = handler->conf;
2839
2840 unified = ops->unified;
2841 if (!unified || !device_utility_controller(unified) ||
2842 !unified->path_con ||
2843 lxc_list_empty(&(conf->bpf_devices).device_item))
2844 return true;
2845
2846 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2847 }
2848
2849 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2850 {
2851 __do_close int dfd_final = -EBADF;
2852 __do_free char *add_controllers = NULL, *copy = NULL;
2853 size_t full_len = 0;
2854 struct hierarchy *unified;
2855 int dfd_cur, ret;
2856 char *cur;
2857 char **it;
2858
2859 if (!ops->hierarchies || !pure_unified_layout(ops))
2860 return true;
2861
2862 unified = ops->unified;
2863 if (!unified->controllers[0])
2864 return true;
2865
2866 /* For now we simply enable all controllers that we have detected by
2867 * creating a string like "+memory +pids +cpu +io".
2868 * TODO: In the near future we might want to support "-<controller>"
2869 * etc. but whether supporting semantics like this make sense will need
2870 * some thinking.
2871 */
2872 for (it = unified->controllers; it && *it; it++) {
2873 full_len += strlen(*it) + 2;
2874 add_controllers = must_realloc(add_controllers, full_len + 1);
2875
2876 if (unified->controllers[0] == *it)
2877 add_controllers[0] = '\0';
2878
2879 (void)strlcat(add_controllers, "+", full_len + 1);
2880 (void)strlcat(add_controllers, *it, full_len + 1);
2881
2882 if (*(it + 1))
2883 (void)strlcat(add_controllers, " ", full_len + 1);
2884 }
2885
2886 copy = strdup(cgroup);
2887 if (!copy)
2888 return false;
2889
2890 /*
2891 * Placing the write to cgroup.subtree_control before the open() is
2892 * intentional because of the cgroup2 delegation model. It enforces
2893 * that leaf cgroups don't have any controllers enabled for delegation.
2894 */
2895 dfd_cur = unified->dfd_base;
2896 lxc_iterate_parts(cur, copy, "/") {
2897 /*
2898 * Even though we vetted the paths when we parsed the config
2899 * we're paranoid here and check that the path is neither
2900 * absolute nor walks upwards.
2901 */
2902 if (abspath(cur))
2903 return log_error_errno(false, EINVAL, "No absolute paths allowed");
2904
2905 if (strnequal(cur, "..", STRLITERALLEN("..")))
2906 return log_error_errno(false, EINVAL, "No upward walking paths allowed");
2907
2908 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
2909 if (ret < 0)
2910 return syserrno(false, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2911
2912 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2913
2914 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2915 if (dfd_final < 0)
2916 return syserrno(false, "Failed to open directory %d(%s)", dfd_cur, cur);
2917 if (dfd_cur != unified->dfd_base)
2918 close(dfd_cur);
2919 /*
2920 * Leave dfd_final pointing to the last fd we opened so
2921 * it will be automatically zapped if we return early.
2922 */
2923 dfd_cur = dfd_final;
2924 }
2925
2926 return true;
2927 }
2928
2929 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2930 {
2931 if (!ops)
2932 return ret_set_errno(false, ENOENT);
2933
2934 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2935 }
2936
2937 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2938 {
2939 if (!ops)
2940 return ret_set_errno(false, ENOENT);
2941
2942 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2943 }
2944
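/* A /proc/<pid>/cgroup entry for the unified hierarchy carries hierarchy
 * ID 0, i.e. the line starts with "0::".
 */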
2945 static inline bool unified_cgroup(const char *line)
2946 {
2947 return *line == '0';
2948 }
2949
2950 static inline char *current_unified_cgroup(bool relative, char *line)
2951 {
2952 char *current_cgroup;
2953
2954 line += STRLITERALLEN("0::");
2955
2956 if (!abspath(line))
2957 return ERR_PTR(-EINVAL);
2958
2959 /* remove init.scope */
2960 if (!relative)
2961 line = prune_init_scope(line);
2962
2963 /* create a relative path */
2964 line = deabs(line);
2965
2966 current_cgroup = strdup(line);
2967 if (!current_cgroup)
2968 return ERR_PTR(-ENOMEM);
2969
2970 return current_cgroup;
2971 }
2972
2973 static inline const char *unprefix(const char *controllers)
2974 {
2975 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2976 return controllers + STRLITERALLEN("name=");
2977 return controllers;
2978 }
2979
2980 static int __list_cgroup_delegate(char ***delegate)
2981 {
2982 __do_free char **list = NULL;
2983 __do_free char *buf = NULL;
2984 char *standard[] = {
2985 "cgroup.procs",
2986 "cgroup.threads",
2987 "cgroup.subtree_control",
2988 "memory.oom.group",
2989 NULL,
2990 };
2991 char *token;
2992 int ret;
2993
2994 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
2995 if (!buf) {
2996 for (char **p = standard; p && *p; p++) {
2997 ret = list_add_string(&list, *p);
2998 if (ret < 0)
2999 return ret;
3000 }
3001
3002 *delegate = move_ptr(list);
3003 return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate");
3004 }
3005
3006 lxc_iterate_parts(token, buf, " \t\n") {
3007 /*
3008 * We always need to chown this for both cgroup and
3009 * cgroup2.
3010 */
3011 if (strequal(token, "cgroup.procs"))
3012 continue;
3013
3014 ret = list_add_string(&list, token);
3015 if (ret < 0)
3016 return ret;
3017 }
3018
3019 *delegate = move_ptr(list);
3020 return 0;
3021 }
3022
3023 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3024 {
3025 __do_free_string_list char **list = NULL;
3026 int ret;
3027
3028 ret = __list_cgroup_delegate(&list);
3029 if (ret < 0)
3030 return syserrno(false, "Failed to determine unified cgroup delegation requirements");
3031
3032 for (char *const *s = list; s && *s; s++) {
3033 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3034 continue;
3035
3036 return sysinfo(false, "The %s file is not writable, skipping unified hierarchy", *s);
3037 }
3038
3039 *ret_files = move_ptr(list);
3040 return true;
3041 }
3042
3043 static bool legacy_hierarchy_delegated(int dfd_base)
3044 {
3045 if (faccessat(dfd_base, "cgroup.procs", W_OK, 0) && errno != ENOENT)
3046 return sysinfo(false, "The cgroup.procs file is not writable, skipping legacy hierarchy");
3047
3048 return true;
3049 }
3050
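/* Detect all usable hierarchies by parsing /proc/1/cgroup (or
 * /proc/self/cgroup for relative or unprivileged configurations). Each line
 * has the form "<id>:<controller-list>:<cgroup-path>" where an ID of 0
 * denotes the unified hierarchy and anything else a legacy one. Hierarchies
 * that are not delegated to us are skipped.
 */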
3051 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3052 bool unprivileged)
3053 {
3054 __do_free char *cgroup_info = NULL;
3055 char *it;
3056
3057 /*
3058 * Root spawned containers escape the current cgroup, so use init's
3059 * cgroups as our base in that case.
3060 */
3061 if (!relative && (geteuid() == 0))
3062 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3063 else
3064 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3065 if (!cgroup_info)
3066 return ret_errno(ENOMEM);
3067
3068 lxc_iterate_parts(it, cgroup_info, "\n") {
3069 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3070 __do_free char *controllers = NULL, *current_cgroup = NULL;
3071 __do_free_string_list char **controller_list = NULL,
3072 **delegate = NULL;
3073 char *line;
3074 int dfd, ret, type;
3075
3076 /* Handle the unified cgroup hierarchy. */
3077 line = it;
3078 if (unified_cgroup(line)) {
3079 char *unified_mnt;
3080
3081 type = UNIFIED_HIERARCHY;
3082
3083 current_cgroup = current_unified_cgroup(relative, line);
3084 if (IS_ERR(current_cgroup))
3085 return PTR_ERR(current_cgroup);
3086
3087 if (unified_cgroup_fd(ops->dfd_mnt)) {
3088 dfd_mnt = dup_cloexec(ops->dfd_mnt);
3089 unified_mnt = "";
3090 } else {
3091 dfd_mnt = open_at(ops->dfd_mnt,
3092 "unified",
3093 PROTECT_OPATH_DIRECTORY,
3094 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3095 unified_mnt = "unified";
3096 }
3097 if (dfd_mnt < 0) {
3098 if (errno != ENOENT)
3099 return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt);
3100
3101 SYSTRACE("Unified cgroup not mounted");
3102 continue;
3103 }
3104 dfd = dfd_mnt;
3105
3106 if (!is_empty_string(current_cgroup)) {
3107 dfd_base = open_at(dfd_mnt, current_cgroup,
3108 PROTECT_OPATH_DIRECTORY,
3109 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3110 if (dfd_base < 0)
3111 return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup);
3112 dfd = dfd_base;
3113 }
3114
3115 if (!unified_hierarchy_delegated(dfd, &delegate))
3116 continue;
3117
3118 controller_list = unified_controllers(dfd, "cgroup.controllers");
3119 if (!controller_list) {
3120 TRACE("No controllers are enabled for delegation in the unified hierarchy");
3121 controller_list = list_new();
3122 if (!controller_list)
3123 return syserrno(-ENOMEM, "Failed to create empty controller list");
3124 }
3125
3126 controllers = strdup(unified_mnt);
3127 if (!controllers)
3128 return ret_errno(ENOMEM);
3129 } else {
3130 char *__controllers, *__current_cgroup;
3131
3132 type = LEGACY_HIERARCHY;
3133
3134 __controllers = strchr(line, ':');
3135 if (!__controllers)
3136 return ret_errno(EINVAL);
3137 __controllers++;
3138
3139 __current_cgroup = strchr(__controllers, ':');
3140 if (!__current_cgroup)
3141 return ret_errno(EINVAL);
3142 *__current_cgroup = '\0';
3143 __current_cgroup++;
3144
3145 controllers = strdup(unprefix(__controllers));
3146 if (!controllers)
3147 return ret_errno(ENOMEM);
3148
3149 dfd_mnt = open_at(ops->dfd_mnt,
3150 controllers, PROTECT_OPATH_DIRECTORY,
3151 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3152 if (dfd_mnt < 0) {
3153 if (errno != ENOENT)
3154 return syserrno(-errno, "Failed to open %d/%s",
3155 ops->dfd_mnt, controllers);
3156
3157 SYSTRACE("%s not mounted", controllers);
3158 continue;
3159 }
3160 dfd = dfd_mnt;
3161
3162 if (!abspath(__current_cgroup))
3163 return ret_errno(EINVAL);
3164
3165 /* remove init.scope */
3166 if (!relative)
3167 __current_cgroup = prune_init_scope(__current_cgroup);
3168
3169 /* create a relative path */
3170 __current_cgroup = deabs(__current_cgroup);
3171
3172 current_cgroup = strdup(__current_cgroup);
3173 if (!current_cgroup)
3174 return ret_errno(ENOMEM);
3175
3176 if (!is_empty_string(current_cgroup)) {
3177 dfd_base = open_at(dfd_mnt, current_cgroup,
3178 PROTECT_OPATH_DIRECTORY,
3179 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3180 if (dfd_base < 0)
3181 return syserrno(-errno, "Failed to open %d/%s",
3182 dfd_mnt, current_cgroup);
3183 dfd = dfd_base;
3184 }
3185
3186 if (!legacy_hierarchy_delegated(dfd))
3187 continue;
3188
3189 /*
3190 * We intentionally pass __controllers here and not controllers
3191 * because the latter is used as the mountpoint name below and
3192 * would otherwise get chopped up.
3193 */
3194 controller_list = list_add_controllers(__controllers);
3195 if (!controller_list)
3196 return syserrno(-ENOMEM, "Failed to create controller list from %s", __controllers);
3197
3198 if (skip_hierarchy(ops, controller_list))
3199 continue;
3200
3201 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3202 }
3203
3204 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3205 current_cgroup, controller_list, type);
3206 if (ret < 0)
3207 return syserrno(ret, "Failed to add %s hierarchy", controllers);
3208
3209 /* Transfer ownership. */
3210 move_fd(dfd_mnt);
3211 move_fd(dfd_base);
3212 move_ptr(current_cgroup);
3213 move_ptr(controllers);
3214 move_ptr(controller_list);
3215 if (type == UNIFIED_HIERARCHY)
3216 ops->unified->delegate = move_ptr(delegate);
3217 }
3218
3219 /* determine cgroup layout */
3220 if (ops->unified) {
3221 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3222 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3223 } else {
3224 if (bpf_devices_cgroup_supported())
3225 ops->unified->utilities |= DEVICES_CONTROLLER;
3226 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3227 }
3228 }
3229
3230 if (!controllers_available(ops))
3231 return syserrno_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3232
3233 return 0;
3234 }
3235
3236 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3237 {
3238 __do_close int dfd = -EBADF;
3239 int ret;
3240 const char *controllers_use;
3241
3242 if (ops->dfd_mnt >= 0)
3243 return ret_errno(EBUSY);
3244
3245 /*
3246 * I don't see the need for allowing symlinks here. If users want to
3247 * have their hierarchy available in different locations I strongly
3248 * suggest bind-mounts.
3249 */
3250 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3251 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3252 if (dfd < 0)
3253 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3254
3255 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3256 if (controllers_use) {
3257 __do_free char *dup = NULL;
3258 char *it;
3259
3260 dup = strdup(controllers_use);
3261 if (!dup)
3262 return -errno;
3263
3264 lxc_iterate_parts(it, dup, ",") {
3265 ret = list_add_string(&ops->cgroup_use, it);
3266 if (ret < 0)
3267 return ret;
3268 }
3269 }
3270
3271 /*
3272 * Keep dfd referenced by the cleanup function and actually move the fd
3273 * once we know the initialization succeeded. So if we fail we clean up
3274 * the dfd.
3275 */
3276 ops->dfd_mnt = dfd;
3277
3278 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
3279 if (ret < 0)
3280 return syserrno(ret, "Failed to initialize cgroups");
3281
3282 /* Transfer ownership to cgroup_ops. */
3283 move_fd(dfd);
3284 return 0;
3285 }
3286
3287 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3288 {
3289 const char *cgroup_pattern;
3290
3291 if (!ops)
3292 return ret_set_errno(-1, ENOENT);
3293
3294 /* copy system-wide cgroup information */
3295 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3296 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3297 ops->cgroup_pattern = strdup(cgroup_pattern);
3298 if (!ops->cgroup_pattern)
3299 return ret_errno(ENOMEM);
3300 }
3301
3302 return 0;
3303 }
3304
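/* Entry point for the cgfsng driver: allocate the ops struct, detect the
 * host's cgroup layout, and wire up the method table. Returns NULL on
 * failure.
 */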
3305 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3306 {
3307 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3308
3309 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3310 if (!cgfsng_ops)
3311 return ret_set_errno(NULL, ENOMEM);
3312
3313 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3314 cgfsng_ops->dfd_mnt = -EBADF;
3315
3316 if (initialize_cgroups(cgfsng_ops, conf))
3317 return NULL;
3318
3319 cgfsng_ops->data_init = cgfsng_data_init;
3320 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3321 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3322 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3323 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3324 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3325 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3326 cgfsng_ops->payload_create = cgfsng_payload_create;
3327 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3328 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
3329 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3330 cgfsng_ops->get = cgfsng_get;
3331 cgfsng_ops->set = cgfsng_set;
3332 cgfsng_ops->freeze = cgfsng_freeze;
3333 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3334 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3335 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3336 cgfsng_ops->driver = "cgfsng";
3337 cgfsng_ops->version = "1.0.0";
3338 cgfsng_ops->attach = cgfsng_attach;
3339 cgfsng_ops->chown = cgfsng_chown;
3340 cgfsng_ops->mount = cgfsng_mount;
3341 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3342 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
3343
3344 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3345 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3346 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3347
3348 return move_ptr(cgfsng_ops);
3349 }
3350
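/* Attach @pid to the container's unified cgroup using the cgroup2 fd handed
 * out over the container's command socket. Returns -ENOCGROUP2 when no
 * unified cgroup is available so that callers such as __cg_unified_attach()
 * can fall back to path-based attachment.
 */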
3351 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3352 const char *lxcpath, pid_t pid)
3353 {
3354 __do_close int unified_fd = -EBADF;
3355 int ret;
3356
3357 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3358 return ret_errno(EINVAL);
3359
3360 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3361 if (unified_fd < 0)
3362 return ret_errno(ENOCGROUP2);
3363
3364 if (!lxc_list_empty(&conf->id_map)) {
3365 struct userns_exec_unified_attach_data args = {
3366 .conf = conf,
3367 .unified_fd = unified_fd,
3368 .pid = pid,
3369 };
3370
3371 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3372 if (ret < 0)
3373 return -errno;
3374
3375 ret = userns_exec_minimal(conf,
3376 cgroup_unified_attach_parent_wrapper,
3377 &args,
3378 cgroup_unified_attach_child_wrapper,
3379 &args);
3380 } else {
3381 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3382 }
3383
3384 return ret;
3385 }
3386
3387 /* Connects to the command socket and therefore isn't callable from a command handler. */
3388 int cgroup_get(const char *name, const char *lxcpath,
3389 const char *filename, char *buf, size_t len)
3390 {
3391 __do_close int unified_fd = -EBADF;
3392 ssize_t ret;
3393
3394 if (is_empty_string(filename) || is_empty_string(name) ||
3395 is_empty_string(lxcpath))
3396 return ret_errno(EINVAL);
3397
3398 if ((buf && !len) || (len && !buf))
3399 return ret_errno(EINVAL);
3400
3401 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3402 if (unified_fd < 0)
3403 return ret_errno(ENOCGROUP2);
3404
3405 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3406 if (ret < 0)
3407 SYSERROR("Failed to read cgroup value");
3408
3409 return ret;
3410 }
3411
3412 /* Connects to the command socket and therefore isn't callable from a command handler. */
3413 int cgroup_set(const char *name, const char *lxcpath,
3414 const char *filename, const char *value)
3415 {
3416 __do_close int unified_fd = -EBADF;
3417 ssize_t ret;
3418
3419 if (is_empty_string(filename) || is_empty_string(value) ||
3420 is_empty_string(name) || is_empty_string(lxcpath))
3421 return ret_errno(EINVAL);
3422
3423 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3424 if (unified_fd < 0)
3425 return ret_errno(ENOCGROUP2);
3426
3427 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3428 struct device_item device = {};
3429
3430 ret = device_cgroup_rule_parse(&device, filename, value);
3431 if (ret < 0)
3432 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3433
3434 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3435 } else {
3436 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3437 }
3438
3439 return ret;
3440 }
3441
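/* Like cg_unified_freeze_do() above but operates on a caller-provided
 * cgroup2 directory file descriptor instead of the cgroup_ops state.
 */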
3442 static int do_cgroup_freeze(int unified_fd,
3443 const char *state_string,
3444 int state_num,
3445 int timeout,
3446 const char *epoll_error,
3447 const char *wait_error)
3448 {
3449 __do_close int events_fd = -EBADF;
3450 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3451 int ret;
3452 struct lxc_epoll_descr descr = {};
3453
3454 if (timeout != 0) {
3455 ret = lxc_mainloop_open(&descr);
3456 if (ret)
3457 return log_error_errno(-1, errno, "%s", epoll_error);
3458
3459 /* automatically cleaned up now */
3460 descr_ptr = &descr;
3461
3462 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3463 if (events_fd < 0)
3464 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3465
3466 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3467 if (ret < 0)
3468 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3469 }
3470
3471 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3472 if (ret < 0)
3473 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
3474
3475 if (timeout != 0) {
3476 ret = lxc_mainloop(&descr, timeout);
3477 if (ret)
3478 return log_error_errno(-1, errno, "%s", wait_error);
3479 }
3480
3481 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3482 }
3483
3484 static inline int __cgroup_freeze(int unified_fd, int timeout)
3485 {
3486 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3487 "Failed to create epoll instance to wait for container freeze",
3488 "Failed to wait for container to be frozen");
3489 }
3490
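/* Freeze a container by name. State listeners are notified of the FREEZING
 * attempt up front and then told the outcome: FROZEN on success, RUNNING on
 * failure.
 *
 * A minimal usage sketch (assuming a container named "c1" under the default
 * lxcpath; the timeout is forwarded to lxc_mainloop()):
 *
 *	int ret;
 *
 *	ret = cgroup_freeze("c1", lxc_global_config_value("lxc.lxcpath"), 5000);
 *	if (ret < 0)
 *		SYSERROR("Failed to freeze container \"c1\"");
 */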
3491 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3492 {
3493 __do_close int unified_fd = -EBADF;
3494 int ret;
3495
3496 if (is_empty_string(name) || is_empty_string(lxcpath))
3497 return ret_errno(EINVAL);
3498
3499 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3500 if (unified_fd < 0)
3501 return ret_errno(ENOCGROUP2);
3502
3503 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3504 ret = __cgroup_freeze(unified_fd, timeout);
3505 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3506 return ret;
3507 }
3508
3509 int __cgroup_unfreeze(int unified_fd, int timeout)
3510 {
3511 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3512 "Failed to create epoll instance to wait for container freeze",
3513 "Failed to wait for container to be frozen");
3514 }
3515
3516 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3517 {
3518 __do_close int unified_fd = -EBADF;
3519 int ret;
3520
3521 if (is_empty_string(name) || is_empty_string(lxcpath))
3522 return ret_errno(EINVAL);
3523
3524 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3525 if (unified_fd < 0)
3526 return ret_errno(ENOCGROUP2);
3527
3528 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3529 ret = __cgroup_unfreeze(unified_fd, timeout);
3530 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3531 return ret;
3532 }