/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfsng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist, where clist is either a single controller
 * or a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <unistd.h>

#include "af_unix.h"
#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "conf.h"
#include "config.h"
#include "error_utils.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "mount_utils.h"
#include "storage/storage.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"

#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif

#ifndef HAVE_STRLCAT
#include "include/strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/*
 * Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Returns the index of the entry that
 * is now available for use (keeping the list null-terminated), or a negative
 * errno value on allocation failure.
 */
static int list_add(void ***list)
{
	int idx = 0;
	void **p;

	if (*list)
		for (; (*list)[idx]; idx++)
			;

	p = realloc(*list, (idx + 2) * sizeof(void **));
	if (!p)
		return ret_errno(ENOMEM);

	p[idx + 1] = NULL;
	*list = p;

	return idx;
}
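
/*
 * Usage sketch for list_add() with a hypothetical string list (this mirrors
 * what list_add_string() below does): the returned index is the slot that is
 * now free for use, and the array stays NULL-terminated throughout:
 *
 *	char **names = NULL;
 *	int idx = list_add((void ***)&names);
 *	if (idx >= 0)
 *		names[idx] = strdup("memory");
 */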

/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (int i = 0; list[i]; i++)
		if (strequal(list[i], entry))
			return true;

	return false;
}

/* Given a handler's cgroup data, return the struct hierarchy for the
 * controller @controller, or NULL if there is none.
 */
static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no usable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no usable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}

/* Adapted, with modifications, from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1 << (bit % NBITS));
}

static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
}

static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
}

/* Create a cpumask from a cpulist, i.e. turn
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		unsigned end, start;
		char *range;

		errno = 0;
		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (start > end)
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}

/* Turn a cpumask into a simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
	int ret;

	/* Valid bit indices are 0..nbits-1; lxc_cpumask() rejects anything
	 * at or above nbits.
	 */
	for (size_t i = 0; i < nbits; i++) {
		if (!is_set(i, bitarr))
			continue;

		ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0)
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}
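
/*
 * Worked example for the two helpers above: for the cpulist "0,2-3" and
 * nbits = 4, lxc_cpumask() sets bits 0, 2 and 3 (bitarr[0] == 0xd), and
 * lxc_cpumask_to_cpulist() turns that mask back into the string "0,2,3".
 * A minimal sketch (the buffer must be writable since the parser tokenizes
 * in place; error handling omitted):
 *
 *	char buf[] = "0,2-3";
 *	__do_free uint32_t *mask = lxc_cpumask(buf, 4);
 *	__do_free char *list = lxc_cpumask_to_cpulist(mask, 4);
 */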

static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2;
	char *maxcpus = cpulist;
	size_t cpus = 0;

	c1 = strrchr(maxcpus, ',');
	if (c1)
		c1++;

	c2 = strrchr(maxcpus, '-');
	if (c2)
		c2++;

	if (!c1 && !c2)
		c1 = maxcpus;
	else if (c1 > c2)
		c2 = c1;
	else if (c1 < c2)
		c1 = c2;
	else if (!c1 && c2)
		c1 = c2;

	errno = 0;
	cpus = strtoul(c1, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
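
/*
 * Worked example: get_max_cpus() only inspects the token after the last ','
 * or '-' in the list, so "0-3,8-11" parses the trailing "11" and returns 11,
 * and a plain "2" returns 2.
 */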

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->fs_type == UNIFIED_HIERARCHY;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_available(struct hierarchy **hlist, char *entry)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (string_in_list(hlist[i]->controllers, entry))
			return true;

	return false;
}

static bool controllers_available(struct cgroup_ops *ops)
{
	struct hierarchy **hlist;

	if (!ops->cgroup_use)
		return true;

	hlist = ops->hierarchies;
	for (char **cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_available(hlist, *cur))
			return log_error(false, "The requested %s controller is not available", *cur);

	return true;
}

static char **list_new(void)
{
	__do_free_string_list char **list = NULL;
	int idx;

	idx = list_add((void ***)&list);
	if (idx < 0)
		return NULL;

	list[idx] = NULL;
	return move_ptr(list);
}

static int list_add_string(char ***list, char *entry)
{
	__do_free char *dup = NULL;
	int idx;

	dup = strdup(entry);
	if (!dup)
		return ret_errno(ENOMEM);

	idx = list_add((void ***)list);
	if (idx < 0)
		return idx;

	(*list)[idx] = move_ptr(dup);
	return 0;
}

static char **list_add_controllers(char *controllers)
{
	__do_free_string_list char **list = NULL;
	char *it;

	lxc_iterate_parts(it, controllers, ", \t\n") {
		int ret;

		ret = list_add_string(&list, it);
		if (ret < 0)
			return NULL;
	}

	return move_ptr(list);
}

static char **unified_controllers(int dfd, const char *file)
{
	__do_free char *buf = NULL;

	buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
	if (!buf)
		return NULL;

	return list_add_controllers(buf);
}

static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
{
	if (!ops->cgroup_use)
		return false;

	for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
		bool found = false;

		for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
			if (!strequal(*cur_use, *cur_ctrl))
				continue;

			found = true;
			break;
		}

		if (found)
			continue;

		return true;
	}

	return false;
}

static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
				int dfd_base, char *base_cgroup,
				char **controllers, cgroupfs_type_magic_t fs_type)
{
	__do_free struct hierarchy *new = NULL;
	int idx;

	if (abspath(base_cgroup))
		return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");

	new = zalloc(sizeof(*new));
	if (!new)
		return ret_errno(ENOMEM);

	new->dfd_con = -EBADF;
	new->dfd_lim = -EBADF;
	new->dfd_mon = -EBADF;

	new->fs_type = fs_type;
	new->controllers = controllers;
	new->at_mnt = mnt;
	new->at_base = base_cgroup;

	new->dfd_mnt = dfd_mnt;
	new->dfd_base = dfd_base;

	TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
	      mnt, maybe_empty(base_cgroup));
	for (char *const *it = new->controllers; it && *it; it++)
		TRACE("The hierarchy contains the %s controller", *it);

	idx = list_add((void ***)&ops->hierarchies);
	if (idx < 0)
		return ret_errno(idx);

	if (fs_type == UNIFIED_HIERARCHY)
		ops->unified = new;
	(ops->hierarchies)[idx] = move_ptr(new);

	return 0;
}

static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
{
	if (!path_prune || !hierarchies)
		return 0;

	for (int i = 0; hierarchies[i]; i++) {
		struct hierarchy *h = hierarchies[i];
		int ret;

		ret = cgroup_tree_prune(h->dfd_base, path_prune);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);

		free_equal(h->path_lim, h->path_con);
	}

	return 0;
}

struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *path_prune;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};

static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf		= handler->conf,
			.path_prune	= ops->container_limit_cgroup,
			.hierarchies	= ops->hierarchies,
			.origuid	= 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}

#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
				    bool am_initialized)
{
	__do_free char *cpulist = NULL, *isolcpus = NULL,
		       *offlinecpus = NULL, *posscpus = NULL;
	__do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
			   *possmask = NULL;
	int ret;
	ssize_t i;
	ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
	bool flipped_bit = false;

	posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%d/cpuset.cpus\"", dfd_parent);

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0 || maxposs >= INT_MAX - 1)
		return false;

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		if (isdigit(isolcpus[0])) {
			/* Get maximum number of cpus found in isolated cpuset. */
			maxisol = get_max_cpus(isolcpus);
			if (maxisol < 0 || maxisol >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxisol)
			maxposs = maxisol;
		maxposs++;
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		if (isdigit(offlinecpus[0])) {
			/* Get maximum number of cpus found in offline cpuset. */
			maxoffline = get_max_cpus(offlinecpus);
			if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxoffline)
			maxposs = maxoffline;
		maxposs++;
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	if ((maxisol == 0) && (maxoffline == 0)) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (maxisol > 0) {
		isolmask = lxc_cpumask(isolcpus, maxposs);
		if (!isolmask)
			return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
	}

	if (maxoffline > 0) {
		offlinemask = lxc_cpumask(offlinecpus, maxposs);
		if (!offlinemask)
			return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
	}

	for (i = 0; i < maxposs; i++) {
		if ((isolmask && !is_set(i, isolmask)) ||
		    (offlinemask && !is_set(i, offlinemask)) ||
		    !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	if (!flipped_bit) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}

static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/*
	 * Determine whether the base cgroup has cpuset
	 * inheritance turned on.
	 */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/*
	 * Initialize cpuset.cpus and remove any isolated
	 * and offline cpus.
	 */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserrno(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from the parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* ... and copy it to the first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* ... and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}

static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserrno_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserrno_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);

			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserrno(-errno, "Failed to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_cur, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must have been successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
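
/*
 * Example (sketch, with a hypothetical path): __cgroup_tree_create(dfd_base,
 * "lxc.payload.c1/cntr", 0755, false, false) walks the path one component at
 * a time, i.e. mkdirat(dfd_base, "lxc.payload.c1") followed by mkdirat() of
 * "cntr" inside it, and returns an O_PATH descriptor for the final "cntr"
 * directory. Intermediate components may already exist; the final component
 * must be created by this call unless eexist_ignore is true.
 */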

static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	__do_free char *path = NULL, *limit_path = NULL;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation, both parts must not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syserrno(false, "Failed to create limit cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		TRACE("Created limit cgroup %d->%d(%s)",
		      fd_limit, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_error(false, "Failed to setup legacy device limits");

		limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		path = must_make_path(limit_path, cgroup_leaf, NULL);

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
		}
	} else {
		path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
	}
	if (fd_final < 0)
		return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

	if (payload) {
		h->dfd_con = move_fd(fd_final);
		h->path_con = move_ptr(path);

		if (fd_limit < 0)
			h->dfd_lim = h->dfd_con;
		else
			h->dfd_lim = move_fd(fd_limit);

		if (limit_path)
			h->path_lim = move_ptr(limit_path);
		else
			h->path_lim = h->path_con;
	} else {
		h->dfd_mon = move_fd(fd_final);
	}

	return true;
}

static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
				   bool payload)
{
	bool prune = true;

	if (payload) {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_lim < 0)
			prune = false;

		free_equal(h->path_con, h->path_lim);
		close_equal(h->dfd_con, h->dfd_lim);
	} else {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_mon < 0)
			prune = false;

		close_prot_errno_disarm(h->dfd_mon);
	}

	/* We didn't create this cgroup. */
	if (!prune)
		return;

	if (cgroup_tree_prune(h->dfd_base, path_prune))
		SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
	else
		TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}

/*
 * Check that lxc.cgroup.dir is not set in combination with the split
 * lxc.cgroup.dir.{monitor,payload} options, and that the monitor and payload
 * directories are either both set or both unset (the namespace directory is
 * optional).
 *
 * Returns true if the cgroup directory configuration is valid, false
 * otherwise.
 */
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
	const char *monitor_dir = conf->cgroup_meta.monitor_dir,
		   *container_dir = conf->cgroup_meta.container_dir,
		   *namespace_dir = conf->cgroup_meta.namespace_dir;

	/* None of the new options are set, all is fine. */
	if (!monitor_dir && !container_dir && !namespace_dir)
		return true;

	/* Some are set, make sure lxc.cgroup.dir is not also set. */
	if (conf->cgroup_meta.dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

	/* Make sure both monitor and payload are set. */
	if (!monitor_dir || !container_dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

	/* namespace_dir may be empty */
	return true;
}
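
/*
 * For example, using the key names from the error messages above, a valid
 * split configuration (hypothetical values) sets both directories, while the
 * namespace directory stays optional:
 *
 *	lxc.cgroup.dir.monitor = lxc.monitor/c1
 *	lxc.cgroup.dir.payload = lxc.payload/c1
 *
 * Setting lxc.cgroup.dir at the same time would be rejected with EINVAL.
 */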

__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.monitor_dir) {
		monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
	} else if (conf->cgroup_meta.dir) {
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.monitor_dir) {
		suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i],
					       monitor_cgroup, NULL, false))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", monitor_cgroup);
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       monitor_cgroup, false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
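
/*
 * Illustration of the resulting layout, assuming the default payload prefix
 * and a container named "c1": without explicit directories the limit and
 * container cgroup are the same path, e.g. "lxc.payload.c1", retried as
 * "lxc.payload.c1-1", "lxc.payload.c1-2", ... on collision. With an explicit
 * container directory plus a namespace directory, the container cgroup is
 * nested below the limit cgroup, e.g. "limit-dir/namespace-dir".
 */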

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved monitor into cgroup %d", h->dfd_mon);

		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved transient process into cgroup %d", h->dfd_mon);

		/*
		 * We don't keep the fds for non-unified hierarchies around,
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done, but also because there are quite
		 * a lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->dfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (is_unified_hierarchy(h) &&
		    (handler->clone_flags & CLONE_INTO_CGROUP))
			continue;

		ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);

		TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
	}

	return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
		      gid_t chown_gid, mode_t chmod_mode)
{
	int ret;

	ret = fchownat(dirfd, path, chown_uid, chown_gid,
		       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
	if (ret < 0)
		return log_warn_errno(-1, errno,
				      "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chown_uid,
				      (int)chown_gid);

	ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
	if (ret < 0)
		return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, 0)",
				      dirfd, path, (int)chmod_mode);

	return 0;
}

/* chgrp the container cgroups to the container group. We leave
 * the container owner as the cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserrno_set(-EBADF, "Invalid cgroup file descriptor");

		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
				      struct lxc_conf *conf)
{
	struct generic_userns_exec_data wrap;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (lxc_list_empty(&conf->id_map))
		return true;

	wrap.origuid = geteuid();
	wrap.path = NULL;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = conf;

	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
		return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

	return true;
}

__cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
{
	if (!ops)
		return;

	if (!ops->hierarchies)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];

		/* Close all monitor cgroup file descriptors. */
		close_prot_errno_disarm(h->dfd_mon);
	}
	/* Close the cgroup root file descriptor. */
	close_prot_errno_disarm(ops->dfd_mnt);

	/*
	 * The check for freezer support should obviously be done at cgroup
	 * initialization time but that doesn't work reliably. The freezer
	 * controller has been demoted (rightly so) to a simple file located in
	 * each non-root cgroup. At the time when the container is created we
	 * might still be located in /sys/fs/cgroup and so checking for
	 * cgroup.freeze won't tell us anything because this file doesn't exist
	 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
	 * find an already existing cgroup and then check within that cgroup
	 * for the existence of cgroup.freeze but that will only work on
	 * systemd based hosts. Other init systems might not manage cgroups and
	 * so no cgroup will exist. So we defer until we have created cgroups
	 * for our container, which means we check here.
	 */
	if (pure_unified_layout(ops) &&
	    !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
		       AT_SYMLINK_NOFOLLOW)) {
		TRACE("Unified hierarchy supports freezer");
		ops->unified->utilities |= FREEZER_CONTROLLER;
	}
}

/* cgroup-full:* is done, no need to create subdirs. */
static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
{
	switch (cgroup_automount_type) {
	case LXC_AUTO_CGROUP_RO:
		return true;
	case LXC_AUTO_CGROUP_RW:
		return true;
	case LXC_AUTO_CGROUP_MIXED:
		return true;
	}

	return false;
}

/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount the controller read-only if needed and bind-mount the cgroupfs onto
 * controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}

/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL_* options.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
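
/*
 * For reference, the can_use_mount_api() branch above corresponds roughly to
 * this sequence of new-mount-API syscalls (a sketch only; error handling and
 * the per-controller fsconfig() calls for legacy hierarchies are omitted):
 *
 *	int fd_fs = fsopen("cgroup2", FSOPEN_CLOEXEC);
 *	fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int fd_mnt = fsmount(fd_fs, FSMOUNT_CLOEXEC,
 *			     MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC |
 *			     MOUNT_ATTR_NODEV | MOUNT_ATTR_RELATIME);
 *	move_mount(fd_mnt, "", dfd_mnt_cgroupfs, hierarchy_mnt,
 *		   MOVE_MOUNT_F_EMPTY_PATH);
 */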

static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}

static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
				      struct lxc_rootfs *rootfs,
				      int dfd_mnt_cgroupfs,
				      const char *hierarchy_mnt)
{
	switch (cgroup_automount_type) {
	case LXC_AUTO_CGROUP_FULL_RO:
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		break;
	default:
		return 0;
	}

	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}

__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently ship init systems
		 * that do support cgroup2 but do not mount it by default
		 * unless explicitly told to, even if the host is cgroup2
		 * only. That means they often will fail to boot. Fix this by
		 * pre-mounting cgroup2 by default. We will likely need to
		 * keep doing this for a few years until all distros have
		 * switched over to cgroup2, at which point we can safely
		 * assume that their init systems will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserrno(false, "Failed to open %d(%s)", rootfs->dfd_mnt,
					DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 *  1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 *  2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 *  3. cgroup:mixed:force -> See comment above how this
			 *                           does not apply so
			 *                           cgroup:mixed is equal to
			 *                           cgroup:rw when cgroup
			 *                           namespaces are supported.
			 *
			 *  4. cgroup:rw    -> No-op; init system responsible for mounting.
			 *  5. cgroup:ro    -> No-op; init system responsible for mounting.
			 *  6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 *  7. cgroup-full:rw    -> Not supported.
			 *  8. cgroup-full:ro    -> Not supported.
			 *  9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace is supported (highly
			 * unlikely unless we're dealing with a Frankenkernel),
			 * or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 *  1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 *  2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 *  3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem and
				 *                           make the parent directory of the
				 *                           container's cgroup read-only but the
				 *                           container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force    ->
				 * 11. cgroup-full:ro:force    ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserrno(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserrno(false, "Failed to open %d(%s)", rootfs->dfd_mnt,
				DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserrno(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}

/* Only root needs to escape to the cgroup of its init. */
__cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
					    struct lxc_conf *conf)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (conf->cgroup_meta.relative || geteuid())
		return true;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		int ret;

		fullpath = make_cgroup_path(ops->hierarchies[i],
					    ops->hierarchies[i]->at_base,
					    "cgroup.procs", NULL);
		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
	}

	return true;
}
1853
1854 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1855 {
1856 int i = 0;
1857
1858 if (!ops)
1859 return ret_set_errno(-1, ENOENT);
1860
1861 if (!ops->hierarchies)
1862 return 0;
1863
1864 for (; ops->hierarchies[i]; i++)
1865 ;
1866
1867 return i;
1868 }
1869
1870 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1871 int n, char ***out)
1872 {
1873 int i;
1874
1875 if (!ops)
1876 return ret_set_errno(false, ENOENT);
1877
1878 if (!ops->hierarchies)
1879 return ret_set_errno(false, ENOENT);
1880
1881 /* sanity check that hierarchies 0..n exist */
1882 for (i = 0; i <= n; i++)
1883 if (!ops->hierarchies[i])
1884 return ret_set_errno(false, ENOENT);
1885
1886 *out = ops->hierarchies[n]->controllers;
1887
1888 return true;
1889 }
1890
1891 static int cg_legacy_freeze(struct cgroup_ops *ops)
1892 {
1893 struct hierarchy *h;
1894
1895 h = get_hierarchy(ops, "freezer");
1896 if (!h)
1897 return ret_set_errno(-1, ENOENT);
1898
1899 return lxc_write_openat(h->path_con, "freezer.state",
1900 "FROZEN", STRLITERALLEN("FROZEN"));
1901 }
1902
1903 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1904 struct lxc_epoll_descr *descr)
1905 {
1906 __do_free char *line = NULL;
1907 __do_fclose FILE *f = NULL;
1908 int state = PTR_TO_INT(cbdata);
1909 size_t len = 0;
1910 const char *state_string;
1911
1912 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1913 if (!f)
1914 return LXC_MAINLOOP_ERROR;
1915
1916 if (state == 1)
1917 state_string = "frozen 1";
1918 else
1919 state_string = "frozen 0";
1920
1921 while (getline(&line, &len, f) != -1)
1922 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1923 return LXC_MAINLOOP_CLOSE;
1924
1925 rewind(f);
1926
1927 return LXC_MAINLOOP_CONTINUE;
1928 }
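
/*
 * Editor's note: the cgroup2 "cgroup.events" file read above consists of
 * key/value lines, e.g.
 *
 *   populated 1
 *   frozen 0
 *
 * which is why comparing the first STRLITERALLEN("frozen") + 2 bytes of a
 * line against "frozen 1" or "frozen 0" suffices to detect the state.
 */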
1929
1930 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1931 const char *state_string,
1932 int state_num,
1933 const char *epoll_error,
1934 const char *wait_error)
1935 {
1936 __do_close int fd = -EBADF;
1937 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
1938 int ret;
1939 struct lxc_epoll_descr descr;
1940 struct hierarchy *h;
1941
1942 h = ops->unified;
1943 if (!h)
1944 return ret_set_errno(-1, ENOENT);
1945
1946 if (!h->path_con)
1947 return ret_set_errno(-1, EEXIST);
1948
1949 if (timeout != 0) {
1950 __do_free char *events_file = NULL;
1951
1952 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
1953 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1954 if (fd < 0)
1955 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1956
1957 ret = lxc_mainloop_open(&descr);
1958 if (ret)
1959 return log_error_errno(-1, errno, "%s", epoll_error);
1960
1961 /* automatically cleaned up now */
1962 descr_ptr = &descr;
1963
1964 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
1965 if (ret < 0)
1966 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1967 }
1968
1969 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
1970 if (ret < 0)
1971 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
1972
1973 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1974 return log_error_errno(-1, errno, "%s", wait_error);
1975
1976 return 0;
1977 }
1978
1979 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1980 {
1981 return cg_unified_freeze_do(ops, timeout, "1", 1,
1982 "Failed to create epoll instance to wait for container freeze",
1983 "Failed to wait for container to be frozen");
1984 }
1985
1986 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1987 {
1988 if (!ops->hierarchies)
1989 return ret_set_errno(-1, ENOENT);
1990
1991 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1992 return cg_legacy_freeze(ops);
1993
1994 return cg_unified_freeze(ops, timeout);
1995 }
1996
1997 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
1998 {
1999 struct hierarchy *h;
2000
2001 h = get_hierarchy(ops, "freezer");
2002 if (!h)
2003 return ret_set_errno(-1, ENOENT);
2004
2005 return lxc_write_openat(h->path_con, "freezer.state",
2006 "THAWED", STRLITERALLEN("THAWED"));
2007 }
2008
2009 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2010 {
2011 return cg_unified_freeze_do(ops, timeout, "0", 0,
2012 "Failed to create epoll instance to wait for container unfreeze",
2013 "Failed to wait for container to be unfrozen");
2014 }
2015
2016 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2017 {
2018 if (!ops->hierarchies)
2019 return ret_set_errno(-1, ENOENT);
2020
2021 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2022 return cg_legacy_unfreeze(ops);
2023
2024 return cg_unified_unfreeze(ops, timeout);
2025 }
2026
2027 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2028 const char *controller, bool limiting)
2029 {
2030 struct hierarchy *h;
2031 size_t len;
2032 const char *path;
2033
2034 h = get_hierarchy(ops, controller);
2035 if (!h)
2036 return log_warn_errno(NULL, ENOENT,
2037 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
2038
2039 if (limiting)
2040 path = h->path_lim;
2041 else
2042 path = h->path_con;
2043 if (!path)
2044 return NULL;
2045
2046 len = strlen(h->at_mnt);
2047 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
2048 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2049 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2050 path += strspn(path, "/");
2051 }
2052 return path + len;
2053 }
2054
2055 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2056 const char *controller)
2057 {
2058 return cgfsng_get_cgroup_do(ops, controller, false);
2059 }
2060
2061 __cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
2062 const char *controller)
2063 {
2064 return cgfsng_get_cgroup_do(ops, controller, true);
2065 }
2066
2067 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2068 * which must be freed by the caller.
2069 */
2070 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2071 const char *inpath,
2072 const char *filename)
2073 {
2074 return make_cgroup_path(h, inpath, filename, NULL);
2075 }
2076
2077 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2078 {
2079 int idx = 1;
2080 int ret;
2081 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2082 ssize_t pidstr_len;
2083
2084 /* Create leaf cgroup. */
2085 ret = mkdirat(unified_fd, ".lxc", 0755);
2086 if (ret < 0 && errno != EEXIST)
2087 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2088
2089 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2090 if (pidstr_len < 0)
2091 return pidstr_len;
2092
2093 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2094 if (ret < 0)
2095 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2096 if (ret == 0)
2097 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2098
2099 /* this is a non-leaf node */
2100 if (errno != EBUSY)
2101 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2102
2103 do {
2104 bool rm = false;
2105 int saved_errno;
2106 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2107 char *slash = attach_cgroup;
2107
2108 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2109 if (ret < 0)
2110 return ret;
2111
2112 /*
2113 * This shouldn't really happen but the compiler might complain
2114 * that a short write would cause a buffer overrun. So be on
2115 * the safe side.
2116 */
2117 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2118 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2119
2120 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2121 *slash = '\0';
2122
2123 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2124 if (ret < 0 && errno != EEXIST)
2125 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2126 if (ret == 0)
2127 rm = true;
2128
2129 *slash = '/';
2130
2131 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2132 if (ret == 0)
2133 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2134 saved_errno = errno;
2135
2136 /* Re-terminate at the directory component so unlinkat() removes the
2137 * cgroup directory itself, not the "cgroup.procs" file inside it.
2138 */
2139 *slash = '\0';
2140 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2141 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2142
2143 /* this is a non-leaf node */
2144 if (saved_errno != EBUSY)
2145 return log_error_errno(-1, saved_errno, "Failed to attach to unified cgroup");
2141
2142 idx++;
2143 } while (idx < 1000);
2144
2145 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2146 }
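
/*
 * Editor's illustration: when the payload cgroup has become a non-leaf node
 * (writes to "cgroup.procs" fail with EBUSY due to the cgroup2
 * no-internal-process rule), the loop above probes
 *
 *   .lxc-1/cgroup.procs
 *   .lxc-2/cgroup.procs
 *   ...
 *
 * creating each candidate directory on demand until a writable leaf is
 * found or 1000 attempts have been made.
 */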
2147
2148 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2149 int unified_fd, int *sk_fd)
2150 {
2151 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2152 int target_fds[2];
2153 ssize_t ret;
2154
2155 /* Create leaf cgroup. */
2156 ret = mkdirat(unified_fd, ".lxc", 0755);
2157 if (ret < 0 && errno != EEXIST)
2158 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2159
2160 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2161 if (target_fd0 < 0)
2162 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2163 target_fds[0] = target_fd0;
2164
2165 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2166 if (target_fd1 < 0)
2167 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2168 target_fds[1] = target_fd1;
2169
2170 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2171 if (ret <= 0)
2172 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2173 target_fd0, target_fd1);
2174
2175 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2176 }
2177
2178 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2179 int *sk_fd, pid_t pid)
2180 {
2181 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2182 int target_fds[2];
2183 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2184 ssize_t pidstr_len;
2185 ssize_t ret;
2186
2187 ret = lxc_abstract_unix_recv_two_fds(sk, target_fds);
2188 if (ret < 0)
2189 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2190 target_fd0 = target_fds[0];
2191 target_fd1 = target_fds[1];
2192
2193 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2194 if (pidstr_len < 0)
2195 return pidstr_len;
2196
2195 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2196 if (ret > 0 && ret == pidstr_len)
2197 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2198
2199 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2200 if (ret > 0 && ret == pidstr_len)
2201 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2202
2203 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2204 target_fd0, target_fd1);
2205 }
2206
2207 struct userns_exec_unified_attach_data {
2208 const struct lxc_conf *conf;
2209 int unified_fd;
2210 int sk_pair[2];
2211 pid_t pid;
2212 };
2213
2214 static int cgroup_unified_attach_child_wrapper(void *data)
2215 {
2216 struct userns_exec_unified_attach_data *args = data;
2217
2218 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2219 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2220 return ret_errno(EINVAL);
2221
2222 close_prot_errno_disarm(args->sk_pair[0]);
2223 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2224 &args->sk_pair[1]);
2225 }
2226
2227 static int cgroup_unified_attach_parent_wrapper(void *data)
2228 {
2229 struct userns_exec_unified_attach_data *args = data;
2230
2231 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2232 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2233 return ret_errno(EINVAL);
2234
2235 close_prot_errno_disarm(args->sk_pair[1]);
2236 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2237 args->pid);
2238 }
2239
2240 /* Technically, we're always at a delegation boundary here (this is especially
2241 * true when cgroup namespaces are available). The reasoning is that in order
2242 * for us to have been able to start a container in the first place the root
2243 * cgroup must have been a leaf node. Now, either the container's init system
2244 * has populated the cgroup and kept it as a leaf node or it has created
2245 * subtrees. In the former case we simply attach to the leaf node we created
2246 * when we started the container; in the latter case we create our own
2247 * cgroup for the attaching process.
2248 */
2249 static int __cg_unified_attach(const struct hierarchy *h,
2250 const struct lxc_conf *conf, const char *name,
2251 const char *lxcpath, pid_t pid,
2252 const char *controller)
2253 {
2254 __do_close int unified_fd = -EBADF;
2255 __do_free char *path = NULL, *cgroup = NULL;
2256 int ret;
2257
2258 if (!conf || !name || !lxcpath || pid <= 0)
2259 return ret_errno(EINVAL);
2260
2261 ret = cgroup_attach(conf, name, lxcpath, pid);
2262 if (ret == 0)
2263 return log_trace(0, "Attached to unified cgroup via command handler");
2264 if (ret != -ENOCGROUP2)
2265 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2266
2267 /* Fall back to retrieving the path for the unified cgroup. */
2268 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2269 /* not running */
2270 if (!cgroup)
2271 return 0;
2272
2273 path = make_cgroup_path(h, cgroup, NULL);
2274
2275 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2276 if (unified_fd < 0)
2277 return ret_errno(EBADF);
2278
2279 if (!lxc_list_empty(&conf->id_map)) {
2280 struct userns_exec_unified_attach_data args = {
2281 .conf = conf,
2282 .unified_fd = unified_fd,
2283 .pid = pid,
2284 };
2285
2286 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2287 if (ret < 0)
2288 return -errno;
2289
2290 ret = userns_exec_minimal(conf,
2291 cgroup_unified_attach_parent_wrapper,
2292 &args,
2293 cgroup_unified_attach_child_wrapper,
2294 &args);
2295 } else {
2296 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2297 }
2298
2299 return ret;
2300 }
2301
2302 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2303 const struct lxc_conf *conf,
2304 const char *name, const char *lxcpath,
2305 pid_t pid)
2306 {
2307 int len, ret;
2308 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2309
2310 if (!ops)
2311 return ret_set_errno(false, ENOENT);
2312
2313 if (!ops->hierarchies)
2314 return true;
2315
2316 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2317 if (len < 0)
2318 return false;
2319
2320 for (int i = 0; ops->hierarchies[i]; i++) {
2321 __do_free char *fullpath = NULL, *path = NULL;
2322 struct hierarchy *h = ops->hierarchies[i];
2323
2324 if (h->fs_type == UNIFIED_HIERARCHY) {
2325 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2326 h->controllers[0]);
2327 if (ret < 0)
2328 return false;
2329
2330 continue;
2331 }
2332
2333 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2334 /* not running */
2335 if (!path)
2336 return false;
2337
2338 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2339 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2340 if (ret < 0)
2341 return log_error_errno(false, errno, "Failed to attach %d to %s",
2342 (int)pid, fullpath);
2343 }
2344
2345 return true;
2346 }
2347
2348 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2349 * don't have a cgroup_data set up, so we ask the running container through the
2350 * commands API for the cgroup path.
2351 */
2352 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2353 char *value, size_t len, const char *name,
2354 const char *lxcpath)
2355 {
2356 __do_free char *path = NULL;
2357 __do_free char *controller = NULL;
2358 char *p;
2359 struct hierarchy *h;
2360 int ret = -1;
2361
2362 if (!ops)
2363 return ret_set_errno(-1, ENOENT);
2364
2365 controller = strdup(filename);
2366 if (!controller)
2367 return ret_errno(ENOMEM);
2368
2369 p = strchr(controller, '.');
2370 if (p)
2371 *p = '\0';
2372
2373 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2374 /* not running */
2375 if (!path)
2376 return -1;
2377
2378 h = get_hierarchy(ops, controller);
2379 if (h) {
2380 __do_free char *fullpath = NULL;
2381
2382 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2383 ret = lxc_read_from_file(fullpath, value, len);
2384 }
2385
2386 return ret;
2387 }
2388
2389 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2390 {
2391 for (int count = 0; count < 3; count++, val++) {
2392 switch (*val) {
2393 case 'r':
2394 case 'w':
2395 case 'm':
2396 device->access[count] = *val;
2397 break;
2402 case '\n':
2403 case '\0':
2404 count = 3;
2405 break;
2406 default:
2407 return ret_errno(EINVAL);
2408 }
2409 }
2410
2411 return 0;
2412 }
2413
2414 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2415 const char *val)
2416 {
2417 int count, ret;
2418 char temp[50];
2419
2420 if (strequal("devices.allow", key))
2421 device->allow = 1; /* allow the device */
2422 else
2423 device->allow = 0; /* deny the device */
2424
2425 if (strequal(val, "a")) {
2426 /* global rule */
2427 device->type = 'a';
2428 device->major = -1;
2429 device->minor = -1;
2430 return 0;
2431 }
2432
2433 switch (*val) {
2434 case 'a':
2435 __fallthrough;
2436 case 'b':
2437 __fallthrough;
2438 case 'c':
2439 device->type = *val;
2440 break;
2441 default:
2442 return -1;
2443 }
2444
2445 val++;
2446 if (!isspace(*val))
2447 return -1;
2448 val++;
2449 if (*val == '*') {
2450 device->major = -1;
2451 val++;
2452 } else if (isdigit(*val)) {
2453 memset(temp, 0, sizeof(temp));
2454 for (count = 0; count < sizeof(temp) - 1; count++) {
2455 temp[count] = *val;
2456 val++;
2457 if (!isdigit(*val))
2458 break;
2459 }
2460 ret = lxc_safe_int(temp, &device->major);
2461 if (ret)
2462 return -1;
2463 } else {
2464 return -1;
2465 }
2466 if (*val != ':')
2467 return -1;
2468 val++;
2469
2470 /* read minor */
2471 if (*val == '*') {
2472 device->minor = -1;
2473 val++;
2474 } else if (isdigit(*val)) {
2475 memset(temp, 0, sizeof(temp));
2476 for (count = 0; count < sizeof(temp) - 1; count++) {
2477 temp[count] = *val;
2478 val++;
2479 if (!isdigit(*val))
2480 break;
2481 }
2482 ret = lxc_safe_int(temp, &device->minor);
2483 if (ret)
2484 return -1;
2485 } else {
2486 return -1;
2487 }
2488 if (!isspace(*val))
2489 return -1;
2490
2491 return device_cgroup_parse_access(device, ++val);
2492 }
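
/*
 * Editor's sketch (hypothetical usage, not part of the original source):
 * how a classic devices cgroup rule maps onto struct device_item.
 */
#if 0
struct device_item dev = {};

/* "devices.allow" + "c 1:3 rwm" yields:
 * dev.allow = 1, dev.type = 'c', dev.major = 1, dev.minor = 3,
 * dev.access = "rwm"
 */
if (device_cgroup_rule_parse(&dev, "devices.allow", "c 1:3 rwm") < 0)
	SYSERROR("Failed to parse device rule");
#endif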
2493
2494 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2495 * don't have a cgroup_data set up, so we ask the running container through the
2496 * commands API for the cgroup path.
2497 */
2498 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2499 const char *key, const char *value,
2500 const char *name, const char *lxcpath)
2501 {
2502 __do_free char *path = NULL;
2503 __do_free char *controller = NULL;
2504 char *p;
2505 struct hierarchy *h;
2506 int ret = -1;
2507
2508 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2509 is_empty_string(name) || is_empty_string(lxcpath))
2510 return ret_errno(EINVAL);
2511
2512 controller = strdup(key);
2513 if (!controller)
2514 return ret_errno(ENOMEM);
2515
2516 p = strchr(controller, '.');
2517 if (p)
2518 *p = '\0';
2519
2520 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2521 struct device_item device = {};
2522
2523 ret = device_cgroup_rule_parse(&device, key, value);
2524 if (ret < 0)
2525 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2526 key, value);
2527
2528 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2529 if (ret < 0)
2530 return -1;
2531
2532 return 0;
2533 }
2534
2535 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2536 /* not running */
2537 if (!path)
2538 return -1;
2539
2540 h = get_hierarchy(ops, controller);
2541 if (h) {
2542 __do_free char *fullpath = NULL;
2543
2544 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2545 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2546 }
2547
2548 return ret;
2549 }
2550
2551 /* Take a devices cgroup line of the form
2552 * /dev/foo rwx
2553 * and parse it into @device as the equivalent
2554 * type major:minor mode
2555 * rule. Return <0 on error.
2556 */
2558 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2559 const char *devpath)
2560 {
2561 __do_free char *path = NULL;
2562 char *mode = NULL;
2563 int n_parts, ret;
2564 char *p;
2565 struct stat sb;
2566
2567 path = strdup(devpath);
2568 if (!path)
2569 return ret_errno(ENOMEM);
2570
2571 /*
2572 * Read path followed by mode. Ignore any trailing text.
2573 * A ' # comment' would be legal. Technically other text is not
2574 * legal, we could check for that if we cared to.
2575 */
2576 for (n_parts = 1, p = path; *p; p++) {
2577 if (*p != ' ')
2578 continue;
2579 *p = '\0';
2580
2581 if (n_parts != 1)
2582 break;
2583 p++;
2584 n_parts++;
2585
2586 while (*p == ' ')
2587 p++;
2588
2589 mode = p;
2590
2591 if (*p == '\0')
2592 return ret_set_errno(-1, EINVAL);
2593 }
2594
2595 if (!mode)
2596 return ret_errno(EINVAL);
2597
2598 if (device_cgroup_parse_access(device, mode) < 0)
2599 return -1;
2600
2601 ret = stat(path, &sb);
2602 if (ret < 0)
2603 return ret_set_errno(-1, errno);
2604
2605 mode_t m = sb.st_mode & S_IFMT;
2606 switch (m) {
2607 case S_IFBLK:
2608 device->type = 'b';
2609 break;
2610 case S_IFCHR:
2611 device->type = 'c';
2612 break;
2613 default:
2614 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2615 }
2616
2617 device->major = MAJOR(sb.st_rdev);
2618 device->minor = MINOR(sb.st_rdev);
2619 device->allow = 1;
2620
2621 return 0;
2622 }
2623
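
/*
 * Editor's note: @dest passed to convert_devpath() below must be a
 * caller-preallocated buffer of at least 50 bytes ("b|c <2^64-1>:<2^64-1>
 * r|w|m" plus the terminating NUL); see the strnprintf() size in the body.
 */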
2624 static int convert_devpath(const char *invalue, char *dest)
2625 {
2626 struct device_item device = {};
2627 int ret;
2628
2629 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2630 if (ret < 0)
2631 return -1;
2632
2633 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2634 device.minor, device.access);
2635 if (ret < 0)
2636 return log_error_errno(ret, -ret,
2637 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2638 device.type, device.major, device.minor,
2639 device.access);
2640
2641 return 0;
2642 }
2643
2644 /* Called from setup_limits - here we have the container's cgroup_data because
2645 * we created the cgroups.
2646 */
2647 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2648 const char *value, bool is_cpuset)
2649 {
2650 __do_free char *controller = NULL;
2651 char *p;
2652 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2653 char converted_value[50];
2654 struct hierarchy *h;
2655
2656 controller = strdup(filename);
2657 if (!controller)
2658 return ret_errno(ENOMEM);
2659
2660 p = strchr(controller, '.');
2661 if (p)
2662 *p = '\0';
2663
2664 if (strequal("devices.allow", filename) && value[0] == '/') {
2665 int ret;
2666
2667 ret = convert_devpath(value, converted_value);
2668 if (ret < 0)
2669 return ret;
2670 value = converted_value;
2671 }
2672
2673 h = get_hierarchy(ops, controller);
2674 if (!h)
2675 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2676
2677 if (is_cpuset) {
2678 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2679 if (ret)
2680 return ret;
2681 }
2682 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2683 }
2684
2685 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2686 struct lxc_conf *conf,
2687 bool do_devices)
2688 {
2689 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2690 struct lxc_list *cgroup_settings;
2691 struct lxc_list *iterator, *next;
2692 struct lxc_cgroup *cg;
2693 bool ret = false;
2694
2695 if (!ops)
2696 return ret_set_errno(false, ENOENT);
2697
2698 if (!conf)
2699 return ret_set_errno(false, EINVAL);
2700
2701 cgroup_settings = &conf->cgroup;
2702 if (lxc_list_empty(cgroup_settings))
2703 return true;
2704
2705 if (!ops->hierarchies)
2706 return ret_set_errno(false, EINVAL);
2707
2708 if (pure_unified_layout(ops))
2709 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2710
2711 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2712 if (!sorted_cgroup_settings)
2713 return false;
2714
2715 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2716 cg = iterator->elem;
2717
2718 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2719 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
2720 if (do_devices && (errno == EACCES || errno == EPERM)) {
2721 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2722 continue;
2723 }
2724 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2725 goto out;
2726 }
2727 DEBUG("Set controller \"%s\" to \"%s\"", cg->subsystem, cg->value);
2728 }
2729 }
2730
2731 ret = true;
2732 INFO("Limits for the legacy cgroup hierarchies have been set up");
2733 out:
2734 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2735 lxc_list_del(iterator);
2736 free(iterator);
2737 }
2738
2739 return ret;
2740 }
2741
2742 /*
2743 * Some of the parsing logic comes from the original cgroup device v1
2744 * implementation in the kernel.
2745 */
2746 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2747 struct lxc_conf *conf, const char *key,
2748 const char *val)
2749 {
2750 struct device_item device_item = {};
2751 int ret;
2752
2753 if (strequal("devices.allow", key) && abspath(val))
2754 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2755 else
2756 ret = device_cgroup_rule_parse(&device_item, key, val);
2757 if (ret < 0)
2758 return syserrno_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2759
2760 /*
2761 * Note that bpf_list_add_device() returns 1 if it altered the device
2762 * list and 0 if it didn't; both return values indicate success.
2763 * Only a negative return value indicates an error.
2764 */
2765 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2766 if (ret < 0)
2767 return -1;
2768
2769 return 0;
2770 }
2771
2772 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2773 struct lxc_handler *handler)
2774 {
2775 struct lxc_list *cgroup_settings, *iterator;
2776 struct hierarchy *h;
2777 struct lxc_conf *conf;
2778
2779 if (!ops)
2780 return ret_set_errno(false, ENOENT);
2781
2782 if (!ops->hierarchies)
2783 return true;
2784
2785 if (!ops->container_cgroup)
2786 return ret_set_errno(false, EINVAL);
2787
2788 if (!handler || !handler->conf)
2789 return ret_set_errno(false, EINVAL);
2790 conf = handler->conf;
2791
2792 cgroup_settings = &conf->cgroup2;
2793 if (lxc_list_empty(cgroup_settings))
2794 return true;
2795
2796 if (!pure_unified_layout(ops))
2797 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
2798
2799 if (!ops->unified)
2800 return false;
2801 h = ops->unified;
2802
2803 lxc_list_for_each (iterator, cgroup_settings) {
2804 struct lxc_cgroup *cg = iterator->elem;
2805 int ret;
2806
2807 if (strnequal("devices", cg->subsystem, 7))
2808 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
2809 else
2810 ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
2811 if (ret < 0)
2812 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2813
2814 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2815 }
2816
2817 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2818 }
2819
2820 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2821 {
2822 struct lxc_conf *conf;
2823 struct hierarchy *unified;
2824
2825 if (!ops)
2826 return ret_set_errno(false, ENOENT);
2827
2828 if (!ops->hierarchies)
2829 return true;
2830
2831 if (!ops->container_cgroup)
2832 return ret_set_errno(false, EEXIST);
2833
2834 if (!handler || !handler->conf)
2835 return ret_set_errno(false, EINVAL);
2836 conf = handler->conf;
2837
2838 unified = ops->unified;
2839 if (!unified || !device_utility_controller(unified) ||
2840 !unified->path_con ||
2841 lxc_list_empty(&(conf->bpf_devices).device_item))
2842 return true;
2843
2844 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2845 }
2846
2847 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2848 {
2849 __do_close int dfd_final = -EBADF;
2850 __do_free char *add_controllers = NULL, *copy = NULL;
2851 size_t full_len = 0;
2852 struct hierarchy *unified;
2853 int dfd_cur, ret;
2854 char *cur;
2855 char **it;
2856
2857 if (!ops->hierarchies || !pure_unified_layout(ops))
2858 return true;
2859
2860 unified = ops->unified;
2861 if (!unified->controllers[0])
2862 return true;
2863
2864 /* For now we simply enable all controllers that we have detected by
2865 * creating a string like "+memory +pids +cpu +io".
2866 * TODO: In the near future we might want to support "-<controller>"
2867 * etc. but whether supporting semantics like this makes sense will need
2868 * some thinking.
2869 */
2870 for (it = unified->controllers; it && *it; it++) {
2871 full_len += strlen(*it) + 2;
2872 add_controllers = must_realloc(add_controllers, full_len + 1);
2873
2874 if (unified->controllers[0] == *it)
2875 add_controllers[0] = '\0';
2876
2877 (void)strlcat(add_controllers, "+", full_len + 1);
2878 (void)strlcat(add_controllers, *it, full_len + 1);
2879
2880 if (*(it + 1))
2881 (void)strlcat(add_controllers, " ", full_len + 1);
2882 }
2883
2884 copy = strdup(cgroup);
2885 if (!copy)
2886 return false;
2887
2888 /*
2889 * Placing the write to cgroup.subtree_control before the open() is
2890 * intentional because of the cgroup2 delegation model. It enforces
2891 * that leaf cgroups don't have any controllers enabled for delegation.
2892 */
2893 dfd_cur = unified->dfd_base;
2894 lxc_iterate_parts(cur, copy, "/") {
2895 /*
2896 * Even though we vetted the paths when we parsed the config
2897 * we're paranoid here and check that the path is neither
2898 * absolute nor walks upwards.
2899 */
2900 if (abspath(cur))
2901 return log_error_errno(false, EINVAL, "No absolute paths allowed");
2902
2903 if (strnequal(cur, "..", STRLITERALLEN("..")))
2904 return log_error_errno(false, EINVAL, "No upward walking paths allowed");
2905
2906 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
2907 if (ret < 0)
2908 return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2909
2910 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2911
2912 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2913 if (dfd_final < 0)
2914 return log_error_errno(false, errno, "Failed to open directory %d(%s)", dfd_cur, cur);
2915 if (dfd_cur != unified->dfd_base)
2916 close(dfd_cur);
2917 /*
2918 * Leave dfd_final pointing to the last fd we opened so
2919 * it will be automatically zapped if we return early.
2920 */
2921 dfd_cur = dfd_final;
2922 }
2923
2924 return true;
2925 }
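
/*
 * Editor's illustration: with detected controllers "cpu io memory pids" and
 * a cgroup "x/y", the walk above writes "+cpu +io +memory +pids" into the
 * cgroup.subtree_control file of the base directory and of "x", but never
 * into "x/y" itself: controllers are enabled in every ancestor of the
 * target cgroup while the target stays a delegatable leaf.
 */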
2926
2927 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2928 {
2929 if (!ops)
2930 return ret_set_errno(false, ENOENT);
2931
2932 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2933 }
2934
2935 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2936 {
2937 if (!ops)
2938 return ret_set_errno(false, ENOENT);
2939
2940 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2941 }
2942
2943 static inline bool unified_cgroup(const char *line)
2944 {
2945 return *line == '0';
2946 }
2947
2948 static inline char *current_unified_cgroup(bool relative, char *line)
2949 {
2950 char *current_cgroup;
2951
2952 line += STRLITERALLEN("0::");
2953
2954 if (!abspath(line))
2955 return ERR_PTR(-EINVAL);
2956
2957 /* remove init.scope */
2958 if (!relative)
2959 line = prune_init_scope(line);
2960
2961 /* create a relative path */
2962 line = deabs(line);
2963
2964 current_cgroup = strdup(line);
2965 if (!current_cgroup)
2966 return ERR_PTR(-ENOMEM);
2967
2968 return current_cgroup;
2969 }
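
/*
 * Editor's illustration: given the /proc/self/cgroup entry
 *
 *   0::/user.slice/user-1000.slice/session-2.scope
 *
 * the helper above skips the "0::" prefix, prunes a trailing init.scope for
 * non-relative containers, and returns the relative copy
 * "user.slice/user-1000.slice/session-2.scope".
 */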
2970
2971 static inline const char *unprefix(const char *controllers)
2972 {
2973 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2974 return controllers + STRLITERALLEN("name=");
2975 return controllers;
2976 }
2977
2978 static int __list_cgroup_delegate(char ***delegate)
2979 {
2980 __do_free char **list = NULL;
2981 __do_free char *buf = NULL;
2982 char *standard[] = {
2983 "cgroup.procs",
2984 "cgroup.threads",
2985 "cgroup.subtree_control",
2986 "memory.oom.group",
2987 NULL,
2988 };
2989 char *token;
2990 int ret;
2991
2992 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
2993 if (!buf) {
2994 for (char **p = standard; p && *p; p++) {
2995 ret = list_add_string(&list, *p);
2996 if (ret < 0)
2997 return ret;
2998 }
2999
3000 *delegate = move_ptr(list);
3001 return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate");
3002 }
3003
3004 lxc_iterate_parts(token, buf, " \t\n") {
3005 /*
3006 * We always need to chown this for both cgroup and
3007 * cgroup2.
3008 */
3009 if (strequal(token, "cgroup.procs"))
3010 continue;
3011
3012 ret = list_add_string(&list, token);
3013 if (ret < 0)
3014 return ret;
3015 }
3016
3017 *delegate = move_ptr(list);
3018 return 0;
3019 }
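
/*
 * Editor's note: on recent kernels /sys/kernel/cgroup/delegate typically
 * contains
 *
 *   cgroup.procs
 *   cgroup.threads
 *   cgroup.subtree_control
 *   memory.oom.group
 *
 * which matches the fallback hard-coded in standard[] above.
 */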
3020
3021 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3022 {
3023 __do_free_string_list char **list = NULL;
3024 int ret;
3025
3026 ret = __list_cgroup_delegate(&list);
3027 if (ret < 0)
3028 return syserrno(false, "Failed to determine unified cgroup delegation requirements");
3029
3030 for (char *const *s = list; s && *s; s++) {
3031 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3032 continue;
3033
3034 return sysinfo(false, "The %s file is not writable, skipping unified hierarchy", *s);
3035 }
3036
3037 *ret_files = move_ptr(list);
3038 return true;
3039 }
3040
3041 static bool legacy_hierarchy_delegated(int dfd_base)
3042 {
3043 if (faccessat(dfd_base, "cgroup.procs", W_OK, 0) && errno != ENOENT)
3044 return sysinfo(false, "The cgroup.procs file is not writable, skipping legacy hierarchy");
3045
3046 return true;
3047 }
3048
3049 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3050 bool unprivileged)
3051 {
3052 __do_free char *cgroup_info = NULL;
3053 char *it;
3054
3055 /*
3056 * Root spawned containers escape the current cgroup, so use init's
3057 * cgroups as our base in that case.
3058 */
3059 if (!relative && (geteuid() == 0))
3060 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3061 else
3062 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3063 if (!cgroup_info)
3064 return ret_errno(ENOMEM);
3065
3066 lxc_iterate_parts(it, cgroup_info, "\n") {
3067 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3068 __do_free char *controllers = NULL, *current_cgroup = NULL;
3069 __do_free_string_list char **controller_list = NULL,
3070 **delegate = NULL;
3071 char *line;
3072 int dfd, ret, type;
3073
3074 /* Handle the unified cgroup hierarchy. */
3075 line = it;
3076 if (unified_cgroup(line)) {
3077 char *unified_mnt;
3078
3079 type = UNIFIED_HIERARCHY;
3080
3081 current_cgroup = current_unified_cgroup(relative, line);
3082 if (IS_ERR(current_cgroup))
3083 return PTR_ERR(current_cgroup);
3084
3085 if (unified_cgroup_fd(ops->dfd_mnt)) {
3086 dfd_mnt = dup_cloexec(ops->dfd_mnt);
3087 unified_mnt = "";
3088 } else {
3089 dfd_mnt = open_at(ops->dfd_mnt,
3090 "unified",
3091 PROTECT_OPATH_DIRECTORY,
3092 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3093 unified_mnt = "unified";
3094 }
3095 if (dfd_mnt < 0) {
3096 if (errno != ENOENT)
3097 return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt);
3098
3099 SYSTRACE("Unified cgroup not mounted");
3100 continue;
3101 }
3102 dfd = dfd_mnt;
3103
3104 if (!is_empty_string(current_cgroup)) {
3105 dfd_base = open_at(dfd_mnt, current_cgroup,
3106 PROTECT_OPATH_DIRECTORY,
3107 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3108 if (dfd_base < 0)
3109 return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup);
3110 dfd = dfd_base;
3111 }
3112
3113 if (!unified_hierarchy_delegated(dfd, &delegate))
3114 continue;
3115
3116 controller_list = unified_controllers(dfd, "cgroup.controllers");
3117 if (!controller_list) {
3118 TRACE("No controllers are enabled for delegation in the unified hierarchy");
3119 controller_list = list_new();
3120 if (!controller_list)
3121 return syserrno(-ENOMEM, "Failed to create empty controller list");
3122 }
3123
3124 controllers = strdup(unified_mnt);
3125 if (!controllers)
3126 return ret_errno(ENOMEM);
3127 } else {
3128 char *__controllers, *__current_cgroup;
3129
3130 type = LEGACY_HIERARCHY;
3131
3132 __controllers = strchr(line, ':');
3133 if (!__controllers)
3134 return ret_errno(EINVAL);
3135 __controllers++;
3136
3137 __current_cgroup = strchr(__controllers, ':');
3138 if (!__current_cgroup)
3139 return ret_errno(EINVAL);
3140 *__current_cgroup = '\0';
3141 __current_cgroup++;
3142
3143 controllers = strdup(unprefix(__controllers));
3144 if (!controllers)
3145 return ret_errno(ENOMEM);
3146
3147 dfd_mnt = open_at(ops->dfd_mnt,
3148 controllers, PROTECT_OPATH_DIRECTORY,
3149 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3150 if (dfd_mnt < 0) {
3151 if (errno != ENOENT)
3152 return syserrno(-errno, "Failed to open %d/%s",
3153 ops->dfd_mnt, controllers);
3154
3155 SYSTRACE("%s not mounted", controllers);
3156 continue;
3157 }
3158 dfd = dfd_mnt;
3159
3160 if (!abspath(__current_cgroup))
3161 return ret_errno(EINVAL);
3162
3163 /* remove init.scope */
3164 if (!relative)
3165 __current_cgroup = prune_init_scope(__current_cgroup);
3166
3167 /* create a relative path */
3168 __current_cgroup = deabs(__current_cgroup);
3169
3170 current_cgroup = strdup(__current_cgroup);
3171 if (!current_cgroup)
3172 return ret_errno(ENOMEM);
3173
3174 if (!is_empty_string(current_cgroup)) {
3175 dfd_base = open_at(dfd_mnt, current_cgroup,
3176 PROTECT_OPATH_DIRECTORY,
3177 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3178 if (dfd_base < 0)
3179 return syserrno(-errno, "Failed to open %d/%s",
3180 dfd_mnt, current_cgroup);
3181 dfd = dfd_base;
3182 }
3183
3184 if (!legacy_hierarchy_delegated(dfd))
3185 continue;
3186
3187 /*
3188 * We intentionally pass __controllers here and not the unprefixed
3189 * copy in controllers because we would otherwise chop the
3190 * mountpoint.
3191 */
3192 controller_list = list_add_controllers(__controllers);
3193 if (!controller_list)
3194 return syserrno(-ENOMEM, "Failed to create controller list from %s", __controllers);
3195
3196 if (skip_hierarchy(ops, controller_list))
3197 continue;
3198
3199 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3200 }
3201
3202 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3203 current_cgroup, controller_list, type);
3204 if (ret < 0)
3205 return syserrno(ret, "Failed to add %s hierarchy", controllers);
3206
3207 /* Transfer ownership. */
3208 move_fd(dfd_mnt);
3209 move_fd(dfd_base);
3210 move_ptr(current_cgroup);
3211 move_ptr(controllers);
3212 move_ptr(controller_list);
3213 if (type == UNIFIED_HIERARCHY)
3214 ops->unified->delegate = move_ptr(delegate);
3215 }
3216
3217 /* determine cgroup layout */
3218 if (ops->unified) {
3219 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3220 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3221 } else {
3222 if (bpf_devices_cgroup_supported())
3223 ops->unified->utilities |= DEVICES_CONTROLLER;
3224 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3225 }
3226 }
3227
3228 if (!controllers_available(ops))
3229 return syserrno_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3230
3231 return 0;
3232 }
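
/*
 * Editor's illustration of the input parsed above: on a hybrid host
 * /proc/self/cgroup (or /proc/1/cgroup) contains lines such as
 *
 *   12:memory:/user.slice
 *   5:cpu,cpuacct:/user.slice
 *   1:name=systemd:/user.slice/user-1000.slice
 *   0::/user.slice/user-1000.slice
 *
 * "0::" selects the unified hierarchy, comma-separated controller lists
 * select legacy hierarchies, and "name=" prefixes are stripped via
 * unprefix() before the mountpoint is opened.
 */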
3233
3234 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3235 {
3236 __do_close int dfd = -EBADF;
3237 int ret;
3238 const char *controllers_use;
3239
3240 if (ops->dfd_mnt >= 0)
3241 return ret_errno(EBUSY);
3242
3243 /*
3244 * I don't see the need for allowing symlinks here. If users want to
3245 * have their hierarchy available in different locations I strongly
3246 * suggest bind-mounts.
3247 */
3248 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3249 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3250 if (dfd < 0)
3251 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3252
3253 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3254 if (controllers_use) {
3255 __do_free char *dup = NULL;
3256 char *it;
3257
3258 dup = strdup(controllers_use);
3259 if (!dup)
3260 return -errno;
3261
3262 lxc_iterate_parts(it, dup, ",") {
3263 ret = list_add_string(&ops->cgroup_use, it);
3264 if (ret < 0)
3265 return ret;
3266 }
3267 }
3268
3269 /*
3270 * Keep dfd referenced by the cleanup function and actually move the fd
3271 * once we know the initialization succeeded. So if we fail we clean up
3272 * the dfd.
3273 */
3274 ops->dfd_mnt = dfd;
3275
3276 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
3277 if (ret < 0)
3278 return syserrno(ret, "Failed to initialize cgroups");
3279
3280 /* Transfer ownership to cgroup_ops. */
3281 move_fd(dfd);
3282 return 0;
3283 }
3284
3285 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3286 {
3287 const char *cgroup_pattern;
3288
3289 if (!ops)
3290 return ret_set_errno(-1, ENOENT);
3291
3292 /* copy system-wide cgroup information */
3293 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3294 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3295 ops->cgroup_pattern = strdup(cgroup_pattern);
3296 if (!ops->cgroup_pattern)
3297 return ret_errno(ENOMEM);
3298 }
3299
3300 return 0;
3301 }
3302
3303 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3304 {
3305 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3306
3307 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3308 if (!cgfsng_ops)
3309 return ret_set_errno(NULL, ENOMEM);
3310
3311 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3312 cgfsng_ops->dfd_mnt = -EBADF;
3313
3314 if (initialize_cgroups(cgfsng_ops, conf))
3315 return NULL;
3316
3317 cgfsng_ops->data_init = cgfsng_data_init;
3318 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3319 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3320 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3321 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3322 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3323 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3324 cgfsng_ops->payload_create = cgfsng_payload_create;
3325 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3326 cgfsng_ops->finalize = cgfsng_finalize;
3327 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3328 cgfsng_ops->get = cgfsng_get;
3329 cgfsng_ops->set = cgfsng_set;
3330 cgfsng_ops->freeze = cgfsng_freeze;
3331 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3332 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3333 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3334 cgfsng_ops->driver = "cgfsng";
3335 cgfsng_ops->version = "1.0.0";
3336 cgfsng_ops->attach = cgfsng_attach;
3337 cgfsng_ops->chown = cgfsng_chown;
3338 cgfsng_ops->mount = cgfsng_mount;
3339 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3340 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3341
3342 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3343 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3344 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3345
3346 return move_ptr(cgfsng_ops);
3347 }
3348
3349 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3350 {
3351 int ret;
3352
3353 if (!lxc_list_empty(&conf->id_map)) {
3354 struct userns_exec_unified_attach_data args = {
3355 .conf = conf,
3356 .unified_fd = fd_unified,
3357 .pid = pid,
3358 };
3359
3360 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3361 if (ret < 0)
3362 return -errno;
3363
3364 ret = userns_exec_minimal(conf,
3365 cgroup_unified_attach_parent_wrapper,
3366 &args,
3367 cgroup_unified_attach_child_wrapper,
3368 &args);
3369 } else {
3370 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3371 }
3372
3373 return ret;
3374 }
3375
3376 static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
3377 const char *lxcpath, pid_t pid)
3378 {
3379 call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
3380 int ret;
3381 size_t idx;
3382 ssize_t pidstr_len;
3383 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
3384
3385 ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
3386 if (ret < 0)
3387 return ret_errno(ENOSYS);
3388
3389 pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
3390 if (pidstr_len < 0)
3391 return pidstr_len;
3392
3393 for (idx = 0; idx < ctx->fd_len; idx++) {
3394 int dfd_con = ctx->fd[idx];
3395
3396 if (unified_cgroup_fd(dfd_con))
3397 ret = __unified_attach_fd(conf, dfd_con, pid);
3398 else
3399 ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
3400 if (ret)
3401 return syserrno(ret, "Failed to attach to cgroup fd %d", dfd_con);
3402
3403 TRACE("Attached to cgroup fd %d", dfd_con);
3404 }
3405
3406 if (idx == 0)
3407 return syserrno_set(-ENOENT, "Failed to attach to cgroups");
3408
3409 TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->cgroup_layout));
3410 return 0;
3411 }
3412
3413 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3414 const char *lxcpath, pid_t pid)
3415 {
3416 __do_close int dfd_unified = -EBADF;
3417
3418 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3419 return ret_errno(EINVAL);
3420
3421 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3422 if (dfd_unified < 0)
3423 return ret_errno(ENOSYS);
3424
3425 return __unified_attach_fd(conf, dfd_unified, pid);
3426 }
3427
3428 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3429 const char *lxcpath, pid_t pid)
3430 {
3431 int ret;
3432
3433 ret = __cgroup_attach_many(conf, name, lxcpath, pid);
3434 if (ret < 0) {
3435 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3436 return ret;
3437
3438 ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
3439 if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
3440 return ret_errno(ENOSYS);
3441 }
3442
3443 return ret;
3444 }
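
/*
 * Editor's sketch (hypothetical usage): cgroup_attach() first tries the
 * multi-fd attach through the cgroup context command and only falls back
 * to the unified cgroup2 fd when the running container doesn't support the
 * newer command.
 */
#if 0
if (cgroup_attach(conf, "c1", "/var/lib/lxc", pid) < 0)
	SYSERROR("Failed to attach %d to the cgroups of \"c1\"", (int)pid);
#endif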
3445
3446 /* Connects to the command socket and therefore isn't callable from the command handler. */
3447 int cgroup_get(const char *name, const char *lxcpath,
3448 const char *filename, char *buf, size_t len)
3449 {
3450 __do_close int unified_fd = -EBADF;
3451 ssize_t ret;
3452
3453 if (is_empty_string(filename) || is_empty_string(name) ||
3454 is_empty_string(lxcpath))
3455 return ret_errno(EINVAL);
3456
3457 if ((buf && !len) || (len && !buf))
3458 return ret_errno(EINVAL);
3459
3460 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3461 if (unified_fd < 0)
3462 return ret_errno(ENOSYS);
3463
3464 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3465 if (ret < 0)
3466 SYSERROR("Failed to read cgroup value");
3467
3468 return ret;
3469 }
3470
3471 /* Connects to the command socket and therefore isn't callable from the command handler. */
3472 int cgroup_set(const char *name, const char *lxcpath,
3473 const char *filename, const char *value)
3474 {
3475 __do_close int unified_fd = -EBADF;
3476 ssize_t ret;
3477
3478 if (is_empty_string(filename) || is_empty_string(value) ||
3479 is_empty_string(name) || is_empty_string(lxcpath))
3480 return ret_errno(EINVAL);
3481
3482 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3483 if (unified_fd < 0)
3484 return ret_errno(ENOSYS);
3485
3486 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3487 struct device_item device = {};
3488
3489 ret = device_cgroup_rule_parse(&device, filename, value);
3490 if (ret < 0)
3491 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3492
3493 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3494 } else {
3495 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3496 }
3497
3498 return ret;
3499 }
3500
3501 static int do_cgroup_freeze(int unified_fd,
3502 const char *state_string,
3503 int state_num,
3504 int timeout,
3505 const char *epoll_error,
3506 const char *wait_error)
3507 {
3508 __do_close int events_fd = -EBADF;
3509 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3510 int ret;
3511 struct lxc_epoll_descr descr = {};
3512
3513 if (timeout != 0) {
3514 ret = lxc_mainloop_open(&descr);
3515 if (ret)
3516 return log_error_errno(-1, errno, "%s", epoll_error);
3517
3518 /* automatically cleaned up now */
3519 descr_ptr = &descr;
3520
3521 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3522 if (events_fd < 0)
3523 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3524
3525 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3526 if (ret < 0)
3527 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3528 }
3529
3530 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3531 if (ret < 0)
3532 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
3533
3534 if (timeout != 0) {
3535 ret = lxc_mainloop(&descr, timeout);
3536 if (ret)
3537 return log_error_errno(-1, errno, "%s", wait_error);
3538 }
3539
3540 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3541 }
3542
3543 static inline int __cgroup_freeze(int unified_fd, int timeout)
3544 {
3545 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3546 "Failed to create epoll instance to wait for container freeze",
3547 "Failed to wait for container to be frozen");
3548 }
3549
3550 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3551 {
3552 __do_close int unified_fd = -EBADF;
3553 int ret;
3554
3555 if (is_empty_string(name) || is_empty_string(lxcpath))
3556 return ret_errno(EINVAL);
3557
3558 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3559 if (unified_fd < 0)
3560 return ret_errno(ENOCGROUP2);
3561
3562 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3563 ret = __cgroup_freeze(unified_fd, timeout);
3564 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3565 return ret;
3566 }
3567
3568 int __cgroup_unfreeze(int unified_fd, int timeout)
3569 {
3570 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3571 "Failed to create epoll instance to wait for container freeze",
3572 "Failed to wait for container to be frozen");
3573 }
3574
3575 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3576 {
3577 __do_close int unified_fd = -EBADF;
3578 int ret;
3579
3580 if (is_empty_string(name) || is_empty_string(lxcpath))
3581 return ret_errno(EINVAL);
3582
3583 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3584 if (unified_fd < 0)
3585 return ret_errno(ENOCGROUP2);
3586
3587 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3588 ret = __cgroup_unfreeze(unified_fd, timeout);
3589 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3590 return ret;
3591 }
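
/*
 * Editor's sketch (hypothetical usage): freezing and thawing a container
 * through the public helpers, waiting up to five seconds for the matching
 * "frozen" state to appear in cgroup.events.
 */
#if 0
if (cgroup_freeze("c1", "/var/lib/lxc", 5000))
	SYSERROR("Failed to freeze container \"c1\"");

if (cgroup_unfreeze("c1", "/var/lib/lxc", 5000))
	SYSERROR("Failed to unfreeze container \"c1\"");
#endif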