]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
Merge pull request #3891 from brauner/2021-07-01.fixes
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "error_utils.h"
44 #include "log.h"
45 #include "macro.h"
46 #include "mainloop.h"
47 #include "memory_utils.h"
48 #include "mount_utils.h"
49 #include "storage/storage.h"
50 #include "string_utils.h"
51 #include "syscall_wrappers.h"
52 #include "utils.h"
53
54 #ifndef HAVE_STRLCPY
55 #include "include/strlcpy.h"
56 #endif
57
58 #ifndef HAVE_STRLCAT
59 #include "include/strlcat.h"
60 #endif
61
62 lxc_log_define(cgfsng, cgroup);
63
/*
 * Grow a NULL-terminated pointer array by one usable slot.
 *
 * Reallocates the array behind @list, appends a NULL terminator, and
 * returns the index of the freshly opened slot (which the caller must
 * fill in). Returns -ENOMEM (with errno set) if the reallocation fails;
 * the original list is left intact in that case.
 */
static int list_add(void ***list)
{
	void **grown;
	int entries = 0;

	/* Count existing entries, if any. */
	if (*list)
		while ((*list)[entries])
			entries++;

	/* One new usable slot plus the trailing NULL terminator. */
	grown = realloc(*list, (entries + 2) * sizeof(void *));
	if (!grown) {
		errno = ENOMEM;
		return -ENOMEM;
	}

	grown[entries + 1] = NULL;
	*list = grown;

	return entries;
}
88
/*
 * Return true if @entry matches one of the strings in the NULL-terminated
 * array @list. A NULL @list matches nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **cur = list; *cur; cur++)
		if (strcmp(*cur, entry) == 0)
			return true;

	return false;
}
103
/*
 * Given a handler's cgroup data, return the struct hierarchy for the
 * controller @controller, or NULL (with errno set to ENOENT) if there is
 * none. Passing a NULL @controller requests the empty unified hierarchy.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2: "devices" and "freezer" have no
		 * legacy-style controller on cgroup2 and are served by
		 * utility controllers on the unified hierarchy instead.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				/* No replacement available: fall through to the warning. */
				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
150
151 int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
152 {
153 int dfd;
154 const struct hierarchy *h;
155
156 h = get_hierarchy(ops, fd->controller);
157 if (!h)
158 return ret_errno(ENOENT);
159
160 /*
161 * The client requested that the controller must be in a specific
162 * cgroup version.
163 */
164 if (fd->type != 0 && fd->type != h->fs_type)
165 return ret_errno(EINVAL);
166
167 if (limit)
168 dfd = h->dfd_con;
169 else
170 dfd = h->dfd_lim;
171 if (dfd < 0)
172 return ret_errno(EBADF);
173
174 fd->layout = ops->cgroup_layout;
175 fd->type = h->fs_type;
176 if (fd->type == UNIFIED_HIERARCHY)
177 fd->utilities = h->utilities;
178 fd->fd = dfd;
179
180 return 0;
181 }
182
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * Set @bit in the bit array @bitarr.
 * The shifted constant must be unsigned: "1 << 31" left-shifts into the
 * sign bit of a 32-bit int, which is undefined behaviour in C.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return whether @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
202
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * Parses @buf in place (lxc_iterate_parts mutates the buffer). Returns a
 * heap-allocated array of BITS_TO_LONGS(@nbits) uint32_t words, or NULL
 * with errno set on failure. Cpu numbers >= @nbits are rejected with
 * EINVAL, so no bit at or above @nbits is ever set.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		/*
		 * NOTE(review): errno is zeroed but the strtoul() results
		 * are never validated (no endptr/errno check); a malformed
		 * token silently parses as 0. Presumably acceptable because
		 * callers pass kernel-generated cpulists — confirm.
		 */
		start = strtoul(token, NULL, 0);
		end = start;
		/* A token may be a single cpu ("0") or a range ("2-3"). */
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		/* Mark every cpu in the inclusive [start, end] range. */
		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}
245
246 /* Turn cpumask into simple, comma-separated cpulist. */
247 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
248 {
249 __do_free_string_list char **cpulist = NULL;
250 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
251 int ret;
252
253 for (size_t i = 0; i <= nbits; i++) {
254 if (!is_set(i, bitarr))
255 continue;
256
257 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
258 if (ret < 0)
259 return NULL;
260
261 ret = lxc_append_string(&cpulist, numstr);
262 if (ret < 0)
263 return ret_set_errno(NULL, ENOMEM);
264 }
265
266 if (!cpulist)
267 return ret_set_errno(NULL, ENOMEM);
268
269 return lxc_string_join(",", (const char **)cpulist, false);
270 }
271
/*
 * Return the highest cpu number mentioned in the cpulist @cpulist
 * (e.g. "0,2-3" -> 3), or -1 on parse failure.
 *
 * The last cpu number in the list is the text following whichever of the
 * last ',' and the last '-' occurs later in the string (or the whole
 * string if neither occurs). The previous implementation compared the two
 * pointers even when one was NULL — relational comparison of unrelated or
 * null pointers is undefined behaviour — and carried an unreachable
 * "else if (!c1 && c2)" branch; the explicit case analysis below is
 * behaviorally identical and well-defined.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	if (after_comma)
		after_comma++;

	after_dash = strrchr(cpulist, '-');
	if (after_dash)
		after_dash++;

	if (!after_comma && !after_dash)
		last = cpulist;		/* single cpu, e.g. "4" */
	else if (!after_comma)
		last = after_dash;	/* pure range, e.g. "0-7" */
	else if (!after_dash)
		last = after_comma;	/* pure list, e.g. "0,4" */
	else
		last = (after_comma > after_dash) ? after_comma : after_dash;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
302
/* Return true if hierarchy @h is the cgroup2 (unified) hierarchy. */
static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->fs_type == UNIFIED_HIERARCHY;
}
307
308 /* Return true if the controller @entry is found in the null-terminated list of
309 * hierarchies @hlist.
310 */
311 static bool controller_available(struct hierarchy **hlist, char *entry)
312 {
313 if (!hlist)
314 return false;
315
316 for (int i = 0; hlist[i]; i++)
317 if (string_in_list(hlist[i]->controllers, entry))
318 return true;
319
320 return false;
321 }
322
323 static bool controllers_available(struct cgroup_ops *ops)
324 {
325 struct hierarchy **hlist;
326
327 if (!ops->cgroup_use)
328 return true;
329
330 hlist = ops->hierarchies;
331 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
332 if (!controller_available(hlist, *cur))
333 return log_error(false, "The %s controller found", *cur);
334
335 return true;
336 }
337
338 static char **list_new(void)
339 {
340 __do_free_string_list char **list = NULL;
341 int idx;
342
343 idx = list_add((void ***)&list);
344 if (idx < 0)
345 return NULL;
346
347 list[idx] = NULL;
348 return move_ptr(list);
349 }
350
351 static int list_add_string(char ***list, char *entry)
352 {
353 __do_free char *dup = NULL;
354 int idx;
355
356 dup = strdup(entry);
357 if (!dup)
358 return ret_errno(ENOMEM);
359
360 idx = list_add((void ***)list);
361 if (idx < 0)
362 return idx;
363
364 (*list)[idx] = move_ptr(dup);
365 return 0;
366 }
367
368 static char **list_add_controllers(char *controllers)
369 {
370 __do_free_string_list char **list = NULL;
371 char *it;
372
373 lxc_iterate_parts(it, controllers, ", \t\n") {
374 int ret;
375
376 ret = list_add_string(&list, it);
377 if (ret < 0)
378 return NULL;
379 }
380
381 return move_ptr(list);
382 }
383
384 static char **unified_controllers(int dfd, const char *file)
385 {
386 __do_free char *buf = NULL;
387
388 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
389 if (!buf)
390 return NULL;
391
392 return list_add_controllers(buf);
393 }
394
395 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
396 {
397 if (!ops->cgroup_use)
398 return false;
399
400 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
401 bool found = false;
402
403 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
404 if (!strequal(*cur_use, *cur_ctrl))
405 continue;
406
407 found = true;
408 break;
409 }
410
411 if (found)
412 continue;
413
414 return true;
415 }
416
417 return false;
418 }
419
420 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
421 int dfd_base, char *base_cgroup,
422 char **controllers, cgroupfs_type_magic_t fs_type)
423 {
424 __do_free struct hierarchy *new = NULL;
425 int idx;
426
427 if (abspath(base_cgroup))
428 return syserror_set(-EINVAL, "Container base path must be relative to controller mount");
429
430 new = zalloc(sizeof(*new));
431 if (!new)
432 return ret_errno(ENOMEM);
433
434 new->dfd_con = -EBADF;
435 new->dfd_lim = -EBADF;
436 new->dfd_mon = -EBADF;
437
438 new->fs_type = fs_type;
439 new->controllers = controllers;
440 new->at_mnt = mnt;
441 new->at_base = base_cgroup;
442
443 new->dfd_mnt = dfd_mnt;
444 new->dfd_base = dfd_base;
445
446 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
447 mnt, maybe_empty(base_cgroup));
448 for (char *const *it = new->controllers; it && *it; it++)
449 TRACE("The hierarchy contains the %s controller", *it);
450
451 idx = list_add((void ***)&ops->hierarchies);
452 if (idx < 0)
453 return ret_errno(idx);
454
455 if (fs_type == UNIFIED_HIERARCHY)
456 ops->unified = new;
457 (ops->hierarchies)[idx] = move_ptr(new);
458
459 return 0;
460 }
461
462 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
463 {
464 if (!path_prune || !hierarchies)
465 return 0;
466
467 for (int i = 0; hierarchies[i]; i++) {
468 struct hierarchy *h = hierarchies[i];
469 int ret;
470
471 ret = cgroup_tree_prune(h->dfd_base, path_prune);
472 if (ret < 0)
473 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
474 else
475 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
476
477 free_equal(h->path_lim, h->path_con);
478 }
479
480 return 0;
481 }
482
/* Argument bundle for helpers executed via userns_exec_1(). */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *path_prune;         /* cgroup path to prune in each hierarchy */
	struct lxc_conf *conf;          /* container configuration (id maps etc.) */
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};
490
/*
 * Run inside the container's user namespace (via userns_exec_1) to remove
 * the container's cgroup trees with the credentials of the mapped root
 * (or the configured init ids when no root mapping exists).
 */
static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	/* EPERM is tolerated: dropping supplementary groups is best-effort. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Switch gid before uid; after setresuid() we may lack the
	 * privilege to change groups. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}
513
/*
 * Destroy the container's (payload) cgroup trees in every hierarchy. Any
 * attached cgroup2 device bpf program is detached first. When an id
 * mapping is configured the removal runs inside the user namespace so the
 * mapped root performs it.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy if no hierarchy was ever discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	/* Detach the device bpf program before pruning the cgroup tree. */
	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
561
562 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
563 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
564 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
565 bool am_initialized)
566 {
567 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
568 *offlinecpus = NULL, *posscpus = NULL;
569 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
570 *possmask = NULL;
571 int ret;
572 ssize_t i;
573 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
574 bool flipped_bit = false;
575
576 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
577 if (!posscpus)
578 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
579
580 /* Get maximum number of cpus found in possible cpuset. */
581 maxposs = get_max_cpus(posscpus);
582 if (maxposs < 0 || maxposs >= INT_MAX - 1)
583 return false;
584
585 if (file_exists(__ISOL_CPUS)) {
586 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
587 if (!isolcpus)
588 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
589
590 if (isdigit(isolcpus[0])) {
591 /* Get maximum number of cpus found in isolated cpuset. */
592 maxisol = get_max_cpus(isolcpus);
593 if (maxisol < 0 || maxisol >= INT_MAX - 1)
594 return false;
595 }
596
597 if (maxposs < maxisol)
598 maxposs = maxisol;
599 maxposs++;
600 } else {
601 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
602 }
603
604 if (file_exists(__OFFLINE_CPUS)) {
605 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
606 if (!offlinecpus)
607 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
608
609 if (isdigit(offlinecpus[0])) {
610 /* Get maximum number of cpus found in offline cpuset. */
611 maxoffline = get_max_cpus(offlinecpus);
612 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
613 return false;
614 }
615
616 if (maxposs < maxoffline)
617 maxposs = maxoffline;
618 maxposs++;
619 } else {
620 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
621 }
622
623 if ((maxisol == 0) && (maxoffline == 0)) {
624 cpulist = move_ptr(posscpus);
625 goto copy_parent;
626 }
627
628 possmask = lxc_cpumask(posscpus, maxposs);
629 if (!possmask)
630 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
631
632 if (maxisol > 0) {
633 isolmask = lxc_cpumask(isolcpus, maxposs);
634 if (!isolmask)
635 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
636 }
637
638 if (maxoffline > 0) {
639 offlinemask = lxc_cpumask(offlinecpus, maxposs);
640 if (!offlinemask)
641 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
642 }
643
644 for (i = 0; i <= maxposs; i++) {
645 if ((isolmask && !is_set(i, isolmask)) ||
646 (offlinemask && !is_set(i, offlinemask)) ||
647 !is_set(i, possmask))
648 continue;
649
650 flipped_bit = true;
651 clear_bit(i, possmask);
652 }
653
654 if (!flipped_bit) {
655 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
656 TRACE("No isolated or offline cpus present in cpuset");
657 } else {
658 cpulist = move_ptr(posscpus);
659 TRACE("Removed isolated or offline cpus from cpuset");
660 }
661 if (!cpulist)
662 return log_error_errno(false, errno, "Failed to create cpu list");
663
664 copy_parent:
665 if (!am_initialized) {
666 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
667 if (ret < 0)
668 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
669
670 TRACE("Copied cpu settings of parent cgroup");
671 }
672
673 return true;
674 }
675
/*
 * Set up the legacy cpuset controller for a new child cgroup @dfd_next
 * below the base cgroup @dfd_base: seed cpuset.cpus and cpuset.mems from
 * the parent and turn on clone_children inheritance.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/*
	 * Determine whether the base cgroup has cpuset
	 * inheritance turned on.
	 */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/*
	 * Initialize cpuset.cpus and remove any isolated
	 * and offline cpus.
	 */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* ... and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* ... and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
714
/*
 * Create the cgroup directory chain @path below @dfd_base, one component
 * at a time, and return an fd for the final directory. Intermediate
 * components may already exist; whether the FINAL component may exist is
 * controlled by @eexist_ignore. When @cpuset_v1 is true the first created
 * level is initialized for the legacy cpuset controller.
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	/* Work on a mutable copy; lxc_iterate_parts tokenizes in place. */
	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component already existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syserror_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
776
/*
 * Create the limit (and, with isolation, the separate leaf) cgroup for
 * @h and record the resulting fds/paths on the hierarchy. For @payload
 * the container cgroups are created, otherwise the monitor cgroup.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	__do_free char *path = NULL, *limit_path = NULL;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syserror_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		TRACE("Created limit cgroup %d->%d(%s)",
		      fd_limit, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_error(false, "Failed to setup legacy device limits");

		limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		path = must_make_path(limit_path, cgroup_leaf, NULL);

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
		}
	} else {
		path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
	}
	if (fd_final < 0)
		return syserror_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

	if (payload) {
		h->dfd_con = move_fd(fd_final);
		h->path_con = move_ptr(path);

		/* Without a separate limit cgroup both fds/paths coincide. */
		if (fd_limit < 0)
			h->dfd_lim = h->dfd_con;
		else
			h->dfd_lim = move_fd(fd_limit);

		if (limit_path)
			h->path_lim = move_ptr(limit_path);
		else
			h->path_lim = h->path_con;
	} else {
		h->dfd_mon = move_fd(fd_final);
	}

	return true;
}
852
853 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
854 bool payload)
855 {
856 bool prune = true;
857
858 if (payload) {
859 /* Check whether we actually created the cgroup to prune. */
860 if (h->dfd_lim < 0)
861 prune = false;
862
863 free_equal(h->path_con, h->path_lim);
864 close_equal(h->dfd_con, h->dfd_lim);
865 } else {
866 /* Check whether we actually created the cgroup to prune. */
867 if (h->dfd_mon < 0)
868 prune = false;
869
870 close_prot_errno_disarm(h->dfd_mon);
871 }
872
873 /* We didn't create this cgroup. */
874 if (!prune)
875 return;
876
877 if (cgroup_tree_prune(h->dfd_base, path_prune))
878 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
879 else
880 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
881 }
882
/*
 * Destroy the monitor cgroup in every hierarchy. Because the monitor
 * process may still sit inside that cgroup it is first moved into a pivot
 * cgroup; only then is the monitor cgroup tree pruned.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy if no hierarchy was ever discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the most specific configured pivot location. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		/* The legacy cpuset controller needs initialization on creation. */
		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		/* Move the monitor out of the cgroup we are about to prune. */
		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
960
/*
 * Validate the lxc.cgroup.dir.* configuration: the new split options
 * (monitor/payload/namespace dirs) must not be mixed with the plain
 * lxc.cgroup.dir, and monitor and payload must be set together.
 *
 * Returns true if the configuration is consistent, false (with errno set
 * to EINVAL) otherwise. (The old comment claiming a "prefix length"
 * return value was stale — the function returns bool.)
 */
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
	const char *monitor_dir = conf->cgroup_meta.monitor_dir,
		   *container_dir = conf->cgroup_meta.container_dir,
		   *namespace_dir = conf->cgroup_meta.namespace_dir;

	/* none of the new options are set, all is fine */
	if (!monitor_dir && !container_dir && !namespace_dir)
		return true;

	/* some are set, make sure lxc.cgroup.dir is not also set*/
	if (conf->cgroup_meta.dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

	/* make sure both monitor and payload are set */
	if (!monitor_dir || !container_dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

	/* namespace_dir may be empty */
	return true;
}
990
991 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
992 {
993 __do_free char *monitor_cgroup = NULL;
994 int idx = 0;
995 int i;
996 size_t len;
997 char *suffix = NULL;
998 struct lxc_conf *conf;
999
1000 if (!ops)
1001 return ret_set_errno(false, ENOENT);
1002
1003 if (!ops->hierarchies)
1004 return true;
1005
1006 if (ops->monitor_cgroup)
1007 return ret_set_errno(false, EEXIST);
1008
1009 if (!handler || !handler->conf)
1010 return ret_set_errno(false, EINVAL);
1011
1012 conf = handler->conf;
1013
1014 if (!check_cgroup_dir_config(conf))
1015 return false;
1016
1017 if (conf->cgroup_meta.monitor_dir) {
1018 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
1019 } else if (conf->cgroup_meta.dir) {
1020 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1021 DEFAULT_MONITOR_CGROUP_PREFIX,
1022 handler->name,
1023 CGROUP_CREATE_RETRY, NULL);
1024 } else if (ops->cgroup_pattern) {
1025 __do_free char *cgroup_tree = NULL;
1026
1027 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1028 if (!cgroup_tree)
1029 return ret_set_errno(false, ENOMEM);
1030
1031 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1032 DEFAULT_MONITOR_CGROUP,
1033 CGROUP_CREATE_RETRY, NULL);
1034 } else {
1035 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1036 handler->name,
1037 CGROUP_CREATE_RETRY, NULL);
1038 }
1039 if (!monitor_cgroup)
1040 return ret_set_errno(false, ENOMEM);
1041
1042 if (!conf->cgroup_meta.monitor_dir) {
1043 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1044 *suffix = '\0';
1045 }
1046 do {
1047 if (idx && suffix)
1048 sprintf(suffix, "-%d", idx);
1049
1050 for (i = 0; ops->hierarchies[i]; i++) {
1051 if (cgroup_tree_create(ops, handler->conf,
1052 ops->hierarchies[i],
1053 monitor_cgroup, NULL, false))
1054 continue;
1055
1056 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1057 for (int j = 0; j <= i; j++)
1058 cgroup_tree_prune_leaf(ops->hierarchies[j],
1059 monitor_cgroup, false);
1060
1061 idx++;
1062 break;
1063 }
1064 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1065
1066 if (idx == 1000 || (!suffix && idx != 0))
1067 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1068
1069 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1070 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1071 }
1072
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * When lxc.cgroup.dir.container is set together with a namespace_dir the
 * payload gets a separate limit cgroup above the namespace leaf; in all
 * other cases the limit and payload cgroup are the same.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Without hierarchies there is nothing to create. */
	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	/* Choose the cgroup name by decreasing specificity of configuration. */
	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Separate limit cgroup with the leaf below it. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	/*
	 * Cut off the CGROUP_CREATE_RETRY placeholder and remember where
	 * to write the "-<idx>" retry suffix. An explicitly configured
	 * container_dir is used verbatim, no retries.
	 */
	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			/* Undo the hierarchies already created this round. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1182
/*
 * Move the monitor process (and, if present, the transient process) into the
 * previously created monitor cgroup of every hierarchy.
 */
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	/* Buffers large enough for a decimal pid_t. */
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	/* A transient_pid <= 0 means there is no transient process to move. */
	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		/* Attach the monitor pid to this hierarchy's monitor cgroup. */
		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved monitor into cgroup %d", h->dfd_mon);

		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved transient process into cgroup %d", h->dfd_mon);

		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->dfd_mon);
	}
	/* Mark the transient process as handled. */
	handler->transient_pid = -1;

	return true;
}
1244
1245 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1246 struct lxc_handler *handler)
1247 {
1248 int len;
1249 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1250
1251 if (!ops)
1252 return ret_set_errno(false, ENOENT);
1253
1254 if (!ops->hierarchies)
1255 return true;
1256
1257 if (!ops->container_cgroup)
1258 return ret_set_errno(false, ENOENT);
1259
1260 if (!handler || !handler->conf)
1261 return ret_set_errno(false, EINVAL);
1262
1263 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1264 if (len < 0)
1265 return false;
1266
1267 for (int i = 0; ops->hierarchies[i]; i++) {
1268 struct hierarchy *h = ops->hierarchies[i];
1269 int ret;
1270
1271 if (is_unified_hierarchy(h) &&
1272 (handler->clone_flags & CLONE_INTO_CGROUP))
1273 continue;
1274
1275 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1276 if (ret != 0)
1277 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1278
1279 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1280 }
1281
1282 return true;
1283 }
1284
1285 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1286 gid_t chown_gid, mode_t chmod_mode)
1287 {
1288 int ret;
1289
1290 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1291 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1292 if (ret < 0)
1293 return log_warn_errno(-1,
1294 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1295 dirfd, path, (int)chown_uid,
1296 (int)chown_gid);
1297
1298 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1299 if (ret < 0)
1300 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1301 dirfd, path, (int)chmod_mode);
1302
1303 return 0;
1304 }
1305
1306 /* chgrp the container cgroups to container group. We leave
1307 * the container owner as cgroup owner. So we must make the
1308 * directories 775 so that the container can create sub-cgroups.
1309 *
1310 * Also chown the tasks and cgroup.procs files. Those may not
1311 * exist depending on kernel version.
1312 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root id-map we chown to ns-root (0); otherwise to init ids. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* Drop supplementary groups before switching ids. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Switch gid first: changing uid first could drop the right to do so. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original euid into this user namespace; fall back to 0. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		/* 0775 so the container can create sub-cgroups (see comment above). */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		/* cgroup2: also fix up the configured delegated files. */
		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1368
1369 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1370 struct lxc_conf *conf)
1371 {
1372 struct generic_userns_exec_data wrap;
1373
1374 if (!ops)
1375 return ret_set_errno(false, ENOENT);
1376
1377 if (!ops->hierarchies)
1378 return true;
1379
1380 if (!ops->container_cgroup)
1381 return ret_set_errno(false, ENOENT);
1382
1383 if (!conf)
1384 return ret_set_errno(false, EINVAL);
1385
1386 if (lxc_list_empty(&conf->id_map))
1387 return true;
1388
1389 wrap.origuid = geteuid();
1390 wrap.path = NULL;
1391 wrap.hierarchies = ops->hierarchies;
1392 wrap.conf = conf;
1393
1394 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1395 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1396
1397 return true;
1398 }
1399
1400 __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
1401 {
1402 if (!ops)
1403 return;
1404
1405 if (!ops->hierarchies)
1406 return;
1407
1408 for (int i = 0; ops->hierarchies[i]; i++) {
1409 struct hierarchy *h = ops->hierarchies[i];
1410
1411 /* Close all monitor cgroup file descriptors. */
1412 close_prot_errno_disarm(h->dfd_mon);
1413 }
1414 /* Close the cgroup root file descriptor. */
1415 close_prot_errno_disarm(ops->dfd_mnt);
1416
1417 /*
1418 * The checking for freezer support should obviously be done at cgroup
1419 * initialization time but that doesn't work reliable. The freezer
1420 * controller has been demoted (rightly so) to a simple file located in
1421 * each non-root cgroup. At the time when the container is created we
1422 * might still be located in /sys/fs/cgroup and so checking for
1423 * cgroup.freeze won't tell us anything because this file doesn't exist
1424 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1425 * find an already existing cgroup and then check within that cgroup
1426 * for the existence of cgroup.freeze but that will only work on
1427 * systemd based hosts. Other init systems might not manage cgroups and
1428 * so no cgroup will exist. So we defer until we have created cgroups
1429 * for our container which means we check here.
1430 */
1431 if (pure_unified_layout(ops) &&
1432 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1433 AT_SYMLINK_NOFOLLOW)) {
1434 TRACE("Unified hierarchy supports freezer");
1435 ops->unified->utilities |= FREEZER_CONTROLLER;
1436 }
1437 }
1438
1439 /* cgroup-full:* is done, no need to create subdirs */
1440 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1441 {
1442 switch (cgroup_automount_type) {
1443 case LXC_AUTO_CGROUP_RO:
1444 return true;
1445 case LXC_AUTO_CGROUP_RW:
1446 return true;
1447 case LXC_AUTO_CGROUP_MIXED:
1448 return true;
1449 }
1450
1451 return false;
1452 }
1453
/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount the controller read-only if needed and bind-mount the cgroupfs
 * onto controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* For ro/mixed, first make the hierarchy mountpoint itself read-only. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		/* A bind mount can only be made read-only via a remount. */
		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	/* Bind-mount the container's own cgroup over the in-container path. */
	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* In the :ro case, make the container cgroup itself read-only too. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1507
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	/* Standard hardening flags for cgroup mounts. */
	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	/* The :ro automount variants request a read-only mount. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		/* New mount API: build a filesystem context, then attach it. */
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		if (!is_unified_hierarchy(h)) {
			/* Request each controller of this legacy hierarchy. */
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		/* Legacy mount(2) fallback. */
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* flags to their MS_* equivalents. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1586
/* Thin wrapper: mount hierarchy h without any automount-type filtering. */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1594
1595 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1596 struct lxc_rootfs *rootfs,
1597 int dfd_mnt_cgroupfs,
1598 const char *hierarchy_mnt)
1599 {
1600 switch (cgroup_automount_type) {
1601 case LXC_AUTO_CGROUP_FULL_RO:
1602 break;
1603 case LXC_AUTO_CGROUP_FULL_RW:
1604 break;
1605 case LXC_AUTO_CGROUP_FULL_MIXED:
1606 break;
1607 default:
1608 return 0;
1609 }
1610
1611 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1612 dfd_mnt_cgroupfs, hierarchy_mnt);
1613 }
1614
/*
 * Handle the cgroup automount options: mount a tmpfs over the in-container
 * cgroup mountpoint and mount (or bind-mount) the cgroup hierarchies into it
 * as requested by cg_flags.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	/* Strip the force flag; remember it separately. */
	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		/* Without CAP_SYS_ADMIN the container cannot mount cgroups itself. */
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently have init system that
		 * do support cgroup2 but do not mount it by default unless
		 * explicitly told so even if the host is cgroup2 only. That
		 * means they often will fail to boot. Fix this by pre-mounting
		 * cgroup2 by default. We will likely need to be doing this a
		 * few years until all distros have switched over to cgroup2 at
		 * which point we can safely assume that their init systems
		 * will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserror_ret(false, "Failed to open %d(%s)",
					    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 * 1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 * 2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 * 3. cgroup:mixed:force -> See comment above how this
			 *                          does not apply so
			 *                          cgroup:mixed is equal to
			 *                          cgroup:rw when cgroup
			 *                          namespaces are supported.
			 *
			 * 4. cgroup:rw    -> No-op; init system responsible for mounting.
			 * 5. cgroup:ro    -> No-op; init system responsible for mounting.
			 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 * 7. cgroup-full:rw    -> Not supported.
			 * 8. cgroup-full:ro    -> Not supported.
			 * 9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace supported (highly
			 * unlikely unless we're dealing with a Frankenkernel),
			 * or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 * 1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 * 2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
				 *                          and make the parent directory of the
				 *                          container's cgroup read-only but the
				 *                          container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force    ->
				 * 11. cgroup-full:ro:force    ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserror_ret(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		/* Legacy mount(2) fallback when the new mount API is unavailable. */
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserror_ret(false, "Failed to open %d(%s)",
				    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	/* Per-hierarchy mounts under the freshly mounted tmpfs. */
	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		/* Create the container's cgroup path inside the hierarchy mount. */
		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}
1854
1855 /* Only root needs to escape to the cgroup of its init. */
1856 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1857 struct lxc_conf *conf)
1858 {
1859 if (!ops)
1860 return ret_set_errno(false, ENOENT);
1861
1862 if (!ops->hierarchies)
1863 return true;
1864
1865 if (!conf)
1866 return ret_set_errno(false, EINVAL);
1867
1868 if (conf->cgroup_meta.relative || geteuid())
1869 return true;
1870
1871 for (int i = 0; ops->hierarchies[i]; i++) {
1872 __do_free char *fullpath = NULL;
1873 int ret;
1874
1875 fullpath = make_cgroup_path(ops->hierarchies[i],
1876 ops->hierarchies[i]->at_base,
1877 "cgroup.procs", NULL);
1878 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1879 if (ret != 0)
1880 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1881 }
1882
1883 return true;
1884 }
1885
1886 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1887 {
1888 int i = 0;
1889
1890 if (!ops)
1891 return ret_set_errno(-1, ENOENT);
1892
1893 if (!ops->hierarchies)
1894 return 0;
1895
1896 for (; ops->hierarchies[i]; i++)
1897 ;
1898
1899 return i;
1900 }
1901
1902 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1903 int n, char ***out)
1904 {
1905 int i;
1906
1907 if (!ops)
1908 return ret_set_errno(false, ENOENT);
1909
1910 if (!ops->hierarchies)
1911 return ret_set_errno(false, ENOENT);
1912
1913 /* consistency check n */
1914 for (i = 0; i < n; i++)
1915 if (!ops->hierarchies[i])
1916 return ret_set_errno(false, ENOENT);
1917
1918 *out = ops->hierarchies[i]->controllers;
1919
1920 return true;
1921 }
1922
1923 static int cg_legacy_freeze(struct cgroup_ops *ops)
1924 {
1925 struct hierarchy *h;
1926
1927 h = get_hierarchy(ops, "freezer");
1928 if (!h)
1929 return ret_set_errno(-1, ENOENT);
1930
1931 return lxc_write_openat(h->path_con, "freezer.state",
1932 "FROZEN", STRLITERALLEN("FROZEN"));
1933 }
1934
/*
 * Mainloop callback watching a cgroup.events file: close the mainloop once
 * the file reports the desired "frozen" state (cbdata: 1 = frozen, 0 = thawed).
 */
static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
				    struct lxc_epoll_descr *descr)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int state = PTR_TO_INT(cbdata);
	size_t len;
	const char *state_string;

	/* Reopen the cgroup.events fd as a stream for line-wise reading. */
	f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
	if (!f)
		return LXC_MAINLOOP_ERROR;

	/* Wait for "frozen 1" when freezing, "frozen 0" when thawing. */
	if (state == 1)
		state_string = "frozen 1";
	else
		state_string = "frozen 0";

	while (getline(&line, &len, f) != -1)
		if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
			return LXC_MAINLOOP_CLOSE;

	/* Desired state not reached yet; rewind and keep waiting. */
	rewind(f);

	return LXC_MAINLOOP_CONTINUE;
}
1961
/*
 * Common implementation for freezing/unfreezing on cgroup2: write
 * state_string to cgroup.freeze and, if timeout is non-zero, wait on
 * cgroup.events until the state change is reported.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->path_con)
		return ret_set_errno(-1, EEXIST);

	/* With a timeout, set up the cgroup.events watcher first. */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->path_con, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cgroup.events state changes are signalled via EPOLLPRI. */
		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	/* Request the state change ("1" = freeze, "0" = unfreeze). */
	ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
2010
/* Freeze a cgroup2 container: write "1" to cgroup.freeze and wait. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
2017
2018 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2019 {
2020 if (!ops->hierarchies)
2021 return ret_set_errno(-1, ENOENT);
2022
2023 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2024 return cg_legacy_freeze(ops);
2025
2026 return cg_unified_freeze(ops, timeout);
2027 }
2028
2029 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2030 {
2031 struct hierarchy *h;
2032
2033 h = get_hierarchy(ops, "freezer");
2034 if (!h)
2035 return ret_set_errno(-1, ENOENT);
2036
2037 return lxc_write_openat(h->path_con, "freezer.state",
2038 "THAWED", STRLITERALLEN("THAWED"));
2039 }
2040
/* Thaw a cgroup2 container: write "0" to cgroup.freeze and wait. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2047
2048 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2049 {
2050 if (!ops->hierarchies)
2051 return ret_set_errno(-1, ENOENT);
2052
2053 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2054 return cg_legacy_unfreeze(ops);
2055
2056 return cg_unified_unfreeze(ops, timeout);
2057 }
2058
/*
 * Look up the container's cgroup path for a controller and return it with
 * the mountpoint prefix stripped. With limiting set, the limit cgroup path
 * is returned instead of the inner (container) one.
 */
static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
					const char *controller, bool limiting)
{
	struct hierarchy *h;
	size_t len;
	const char *path;

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_warn_errno(NULL, ENOENT,
				      "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));

	if (limiting)
		path = h->path_lim;
	else
		path = h->path_con;
	if (!path)
		return NULL;

	/*
	 * Strip the mount prefix from path. NOTE(review): when at_mnt does
	 * NOT start with DEFAULT_CGROUP_MOUNTPOINT, both the default
	 * mountpoint length and at_mnt's length are skipped — this assumes
	 * path then carries both prefixes; verify against how
	 * path_con/path_lim are constructed.
	 */
	len = strlen(h->at_mnt);
	if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
		       STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
		path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
		path += strspn(path, "/");
	}
	return path += len;
}
2086
/* Return the controller's inner (container) cgroup path, mount-relative. */
__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
						  const char *controller)
{
	return cgfsng_get_cgroup_do(ops, controller, false);
}
2092
/* Return the controller's limit cgroup path, mount-relative. */
__cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
							const char *controller)
{
	return cgfsng_get_cgroup_do(ops, controller, true);
}
2098
2099 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2100 * which must be freed by the caller.
2101 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	/* The returned allocated path is owned (and freed) by the caller. */
	return make_cgroup_path(h, inpath, filename, NULL);
}
2108
/*
 * Attach @pid to the unified cgroup referred to by @unified_fd.
 *
 * First try a ".lxc" leaf cgroup (creating it if needed), then the cgroup
 * itself. If both writes fail with EBUSY the existing comments indicate the
 * target is a non-leaf cgroup, so probe ".lxc-1", ".lxc-2", ... up to 1000
 * attempts until a usable leaf is found. Returns 0 on success, a negative
 * errno-style value on failure. Directories we created but could not use
 * are removed again.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Prefer the ".lxc" leaf; fall back to the cgroup itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily terminate at the directory part (".lxc-<idx>"). */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			/* We created it, so we clean it up on failure. */
			rm = true;

		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2179
2180 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2181 int unified_fd, int *sk_fd)
2182 {
2183 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2184 int target_fds[2];
2185 ssize_t ret;
2186
2187 /* Create leaf cgroup. */
2188 ret = mkdirat(unified_fd, ".lxc", 0755);
2189 if (ret < 0 && errno != EEXIST)
2190 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2191
2192 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2193 if (target_fd0 < 0)
2194 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2195 target_fds[0] = target_fd0;
2196
2197 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2198 if (target_fd1 < 0)
2199 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2200 target_fds[1] = target_fd1;
2201
2202 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2203 if (ret <= 0)
2204 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2205 target_fd0, target_fd1);
2206
2207 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2208 }
2209
2210 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2211 int *sk_fd, pid_t pid)
2212 {
2213 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2214 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2215 size_t pidstr_len;
2216 ssize_t ret;
2217
2218 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
2219 if (ret < 0)
2220 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2221
2222 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2223
2224 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2225 if (ret > 0 && ret == pidstr_len)
2226 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2227
2228 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2229 if (ret > 0 && ret == pidstr_len)
2230 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2231
2232 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2233 target_fd0, target_fd1);
2234 }
2235
/*
 * Arguments shared by the parent and child wrappers passed to
 * userns_exec_minimal() when attaching a process to the unified hierarchy
 * from inside the container's user namespace.
 */
struct userns_exec_unified_attach_data {
	/* container configuration (read-only) */
	const struct lxc_conf *conf;
	/* fd of the container's unified cgroup directory */
	int unified_fd;
	/* socketpair used to pass cgroup.procs fds from child to parent */
	int sk_pair[2];
	/* pid of the process to attach */
	pid_t pid;
};
2242
2243 static int cgroup_unified_attach_child_wrapper(void *data)
2244 {
2245 struct userns_exec_unified_attach_data *args = data;
2246
2247 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2248 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2249 return ret_errno(EINVAL);
2250
2251 close_prot_errno_disarm(args->sk_pair[0]);
2252 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2253 &args->sk_pair[1]);
2254 }
2255
2256 static int cgroup_unified_attach_parent_wrapper(void *data)
2257 {
2258 struct userns_exec_unified_attach_data *args = data;
2259
2260 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2261 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2262 return ret_errno(EINVAL);
2263
2264 close_prot_errno_disarm(args->sk_pair[1]);
2265 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2266 args->pid);
2267 }
2268
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Preferred path: ask the container's command server to attach. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	/* Only fall back for "not supported"/"no cgroup2" style failures. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = make_cgroup_path(h, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/*
	 * For id-mapped containers perform the attach from inside the
	 * container's user namespace via the parent/child wrappers;
	 * otherwise attach directly.
	 */
	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2330
/*
 * Attach @pid to every hierarchy of the running container @name.
 *
 * The unified hierarchy is handled by __cg_unified_attach(); for each legacy
 * hierarchy the container's cgroup path is queried from the command server
 * and @pid is written into that cgroup's cgroup.procs file. Returns true on
 * success (including the no-hierarchies case), false on failure.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->fs_type == UNIFIED_HIERARCHY) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		if (!path) {
			/*
			 * Someone might have created a name=<controller>
			 * controller after the container has started and so
			 * the container doesn't make use of this controller.
			 *
			 * Link: https://github.com/lxc/lxd/issues/8577
			 */
			TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
			continue;
		}

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2384
/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
				   char *value, size_t len, const char *name,
				   const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	/* The controller name is the part of @filename before the first '.'. */
	controller = strdup(filename);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	/* Read the value of @filename from the container's limit cgroup. */
	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
		ret = lxc_read_from_file(fullpath, value, len);
	}

	return ret;
}
2425
2426 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2427 {
2428 for (int count = 0; count < 3; count++, val++) {
2429 switch (*val) {
2430 case 'r':
2431 device->access[count] = *val;
2432 break;
2433 case 'w':
2434 device->access[count] = *val;
2435 break;
2436 case 'm':
2437 device->access[count] = *val;
2438 break;
2439 case '\n':
2440 case '\0':
2441 count = 3;
2442 break;
2443 default:
2444 return ret_errno(EINVAL);
2445 }
2446 }
2447
2448 return 0;
2449 }
2450
/*
 * Parse a devices-cgroup rule of the form "<type> <major>:<minor> <access>"
 * (e.g. "c 1:3 rwm", with '*' allowed for major/minor and the single value
 * "a" meaning a global rule) from @val into @device. @key selects whether
 * this is an allow ("devices.allow") or deny rule. Returns 0 on success,
 * -1 or a negative errno-style value on malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strequal("devices.allow", key))
		device->allow = 1; /* allow the device */
	else
		device->allow = 0; /* deny the device */

	if (strequal(val, "a")) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		return 0;
	}

	/* Device type: all ('a'), block ('b') or character ('c'). */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* Exactly one whitespace character must follow the type. */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: either a wildcard or a decimal number */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	/* A whitespace separator must precede the access flags. */
	if (!isspace(*val))
		return -1;

	return device_cgroup_parse_access(device, ++val);
}
2530
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops || is_empty_string(key) || is_empty_string(value) ||
	    is_empty_string(name) || is_empty_string(lxcpath))
		return ret_errno(EINVAL);

	/* The controller name is the part of @key before the first '.'. */
	controller = strdup(key);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/*
	 * On a pure cgroup2 layout there are no devices cgroup files; device
	 * rules are translated into bpf programs via the command server.
	 */
	if (pure_unified_layout(ops) && strequal(controller, "devices")) {
		struct device_item device = {};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	/* Write @value to @key in the container's limit cgroup. */
	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2587
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
 * the output.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	/* Work on a copy since the split below mutates the string. */
	path = strdup(devpath);
	if (!path)
		return ret_errno(ENOMEM);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		/* Skip any run of spaces between path and mode. */
		while (*p == ' ')
			p++;

		mode = p;

		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	/* A mode part is mandatory. */
	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* Derive type and major:minor from the actual device node. */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;

	return 0;
}
2660
2661 static int convert_devpath(const char *invalue, char *dest)
2662 {
2663 struct device_item device = {};
2664 int ret;
2665
2666 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2667 if (ret < 0)
2668 return -1;
2669
2670 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2671 device.minor, device.access);
2672 if (ret < 0)
2673 return log_error_errno(ret, -ret,
2674 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2675 device.type, device.major, device.minor,
2676 device.access);
2677
2678 return 0;
2679 }
2680
/* Called from setup_limits - here we have the container's cgroup_data because
 * we created the cgroups.
 */
static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
			      const char *value, bool is_cpuset)
{
	__do_free char *controller = NULL;
	char *p;
	/* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
	char converted_value[50];
	struct hierarchy *h;

	/* The controller name is the part of @filename before the first '.'. */
	controller = strdup(filename);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/* Path-style device rules ("/dev/foo rwm") need canonicalizing first. */
	if (strequal("devices.allow", filename) && value[0] == '/') {
		int ret;

		ret = convert_devpath(value, converted_value);
		if (ret < 0)
			return ret;
		value = converted_value;
	}

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);

	/* cpuset values are additionally written into the payload cgroup. */
	if (is_cpuset) {
		int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
		if (ret)
			return ret;
	}
	return lxc_write_openat(h->path_lim, filename, value, strlen(value));
}
2721
2722 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2723 struct lxc_conf *conf,
2724 bool do_devices)
2725 {
2726 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2727 struct lxc_list *cgroup_settings = &conf->cgroup;
2728 struct lxc_list *iterator, *next;
2729 struct lxc_cgroup *cg;
2730 bool ret = false;
2731
2732 if (!ops)
2733 return ret_set_errno(false, ENOENT);
2734
2735 if (!conf)
2736 return ret_set_errno(false, EINVAL);
2737
2738 cgroup_settings = &conf->cgroup;
2739 if (lxc_list_empty(cgroup_settings))
2740 return true;
2741
2742 if (!ops->hierarchies)
2743 return ret_set_errno(false, EINVAL);
2744
2745 if (pure_unified_layout(ops))
2746 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2747
2748 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2749 if (!sorted_cgroup_settings)
2750 return false;
2751
2752 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2753 cg = iterator->elem;
2754
2755 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2756 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
2757 if (do_devices && (errno == EACCES || errno == EPERM)) {
2758 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2759 continue;
2760 }
2761 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2762 goto out;
2763 }
2764 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
2765 }
2766 }
2767
2768 ret = true;
2769 INFO("Limits for the legacy cgroup hierarchies have been setup");
2770 out:
2771 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2772 lxc_list_del(iterator);
2773 free(iterator);
2774 }
2775
2776 return ret;
2777 }
2778
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
	struct device_item device_item = {};
	int ret;

	/*
	 * Parse @val into a device rule: path form ("/dev/foo rwm") for
	 * absolute devices.allow values, canonical "type major:minor access"
	 * form otherwise. Note: @ops is currently unused here.
	 */
	if (strequal("devices.allow", key) && abspath(val))
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);

	/*
	 * Note that bpf_list_add_device() returns 1 if it altered the device
	 * list and 0 if it didn't; both return values indicate success.
	 * Only a negative return value indicates an error.
	 */
	ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
	if (ret < 0)
		return -1;

	return 0;
}
2808
/*
 * Apply the cgroup2 limits from the container's lxc.cgroup2.* entries to
 * the unified hierarchy. Device keys are collected into the bpf device
 * rule list instead of being written to files. Returns true on success
 * (including when there is nothing to do), false on failure.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct lxc_list *cgroup_settings, *iterator;
	struct hierarchy *h;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (lxc_list_empty(cgroup_settings))
		return true;

	/* cgroup2 keys are ignored (with a warning) on legacy layouts. */
	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	lxc_list_for_each (iterator, cgroup_settings) {
		struct lxc_cgroup *cg = iterator->elem;
		int ret;

		if (strnequal("devices", cg->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
		else
			ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);

		TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
2856
/*
 * Attach the accumulated bpf device rules to the container's unified
 * cgroup. A no-op returning true when there is no unified hierarchy, no
 * device utility controller, or no device rules were configured.
 */
__cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	struct lxc_conf *conf;
	struct hierarchy *unified;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	/* NOTE(review): EEXIST looks odd for a missing cgroup (ENOENT would seem more natural) — confirm before changing. */
	if (!ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	unified = ops->unified;
	if (!unified || !device_utility_controller(unified) ||
	    !unified->path_con ||
	    lxc_list_empty(&(conf->bpf_devices).device_item))
		return true;

	return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
}
2883
/*
 * Enable all detected cgroup2 controllers for delegation along the path
 * from the unified hierarchy's base down to (but not including) @cgroup,
 * by writing "+ctrl1 +ctrl2 ..." into each level's cgroup.subtree_control.
 * Returns true on success or when there is nothing to do, false on failure.
 */
static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_close int dfd_final = -EBADF;
	__do_free char *add_controllers = NULL, *copy = NULL;
	size_t full_len = 0;
	struct hierarchy *unified;
	int dfd_cur, ret;
	char *cur;
	char **it;

	if (!ops->hierarchies || !pure_unified_layout(ops))
		return true;

	unified = ops->unified;
	if (!unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		/* Room for "+<name> " (or trailing NUL). */
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* First iteration: start from an empty string. */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	/* lxc_iterate_parts() mutates its argument, so work on a copy. */
	copy = strdup(cgroup);
	if (!copy)
		return false;

	/*
	 * Placing the write to cgroup.subtree_control before the open() is
	 * intentional because of the cgroup2 delegation model. It enforces
	 * that leaf cgroups don't have any controllers enabled for delegation.
	 */
	dfd_cur = unified->dfd_base;
	lxc_iterate_parts(cur, copy, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
		if (ret < 0)
			return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
		/* Never close the caller-owned base fd. */
		if (dfd_cur != unified->dfd_base)
			close(dfd_cur);
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	return true;
}
2963
2964 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2965 {
2966 if (!ops)
2967 return ret_set_errno(false, ENOENT);
2968
2969 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2970 }
2971
2972 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2973 {
2974 if (!ops)
2975 return ret_set_errno(false, ENOENT);
2976
2977 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2978 }
2979
/* A /proc/<pid>/cgroup entry for cgroup2 always uses hierarchy id 0 ("0::..."). */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2984
/*
 * Extract the cgroup2 path from a "0::/..." /proc/<pid>/cgroup line and
 * return it as a newly-allocated path relative to the cgroup2 mount.
 * Returns ERR_PTR(-EINVAL) for malformed lines and ERR_PTR(-ENOMEM) on
 * allocation failure; callers must check the result with IS_ERR().
 * Mutates @line in place while trimming it.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *current_cgroup;

	line += STRLITERALLEN("0::");

	/* The cgroup path must be absolute. */
	if (!abspath(line))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		line = prune_init_scope(line);

	/* create a relative path */
	line = deabs(line);

	current_cgroup = strdup(line);
	if (!current_cgroup)
		return ERR_PTR(-ENOMEM);

	return current_cgroup;
}
3007
/* Strip an optional "name=" prefix from a named-hierarchy controller spec. */
static inline const char *unprefix(const char *controllers)
{
	static const char prefix[] = "name=";

	if (strncmp(controllers, prefix, sizeof(prefix) - 1) == 0)
		return controllers + sizeof(prefix) - 1;

	return controllers;
}
3014
/*
 * Build the list of cgroup files whose ownership must be delegated to the
 * container. Prefer the kernel-provided list in /sys/kernel/cgroup/delegate
 * and fall back to a hard-coded default set when that file cannot be read.
 * On success *@delegate owns the list (caller frees); returns 0, or a
 * negative value on allocation failure.
 */
static int __list_cgroup_delegate(char ***delegate)
{
	__do_free char **list = NULL;
	__do_free char *buf = NULL;
	/* Fallback set used when the kernel does not expose a delegate list. */
	char *standard[] = {
		"cgroup.procs",
		"cgroup.threads",
		"cgroup.subtree_control",
		"memory.oom.group",
		NULL,
	};
	char *token;
	int ret;

	buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
	if (!buf) {
		for (char **p = standard; p && *p; p++) {
			ret = list_add_string(&list, *p);
			if (ret < 0)
				return ret;
		}

		/* Still report success, but leave a warning in the logs. */
		*delegate = move_ptr(list);
		return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
	}

	lxc_iterate_parts(token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2.
		 */
		if (strequal(token, "cgroup.procs"))
			continue;

		ret = list_add_string(&list, token);
		if (ret < 0)
			return ret;
	}

	*delegate = move_ptr(list);
	return 0;
}
3057
/*
 * Check whether the unified hierarchy rooted at @dfd_base is delegated to
 * us, i.e. every required delegation file is writable (or absent). On
 * success the list of delegation files is returned through @ret_files
 * (ownership transfers to the caller); returns false when any file is not
 * writable or the delegation list cannot be determined.
 */
static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
{
	__do_free_string_list char **list = NULL;
	int ret;

	ret = __list_cgroup_delegate(&list);
	if (ret < 0)
		return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");

	for (char *const *s = list; s && *s; s++) {
		/* Missing files are fine; only unwritable ones disqualify. */
		if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
			continue;

		return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
	}

	*ret_files = move_ptr(list);
	return true;
}
3077
/*
 * Check whether the legacy hierarchy rooted at @dfd_base appears delegated
 * to us, i.e. its directory is writable. A missing directory (ENOENT) is
 * not treated as a failure here.
 */
static bool legacy_hierarchy_delegated(int dfd_base)
{
	if (faccessat(dfd_base, ".", W_OK, 0) < 0 && errno != ENOENT)
		return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");

	return true;
}
3088
/**
 * systemd guarantees that the order of co-mounted controllers is stable. On
 * some systems the order of the controllers might be reversed though.
 *
 * For example, this is how the order is mismatched on CentOS 7:
 *
 *          [root@localhost ~]# cat /proc/self/cgroup
 *          11:perf_event:/
 *          10:pids:/
 *          9:freezer:/
 * >>>>     8:cpuacct,cpu:/
 *          7:memory:/
 *          6:blkio:/
 *          5:devices:/
 *          4:hugetlb:/
 * >>>>     3:net_prio,net_cls:/
 *          2:cpuset:/
 *          1:name=systemd:/user.slice/user-0.slice/session-c1.scope
 *
 * whereas the mountpoint:
 *
 *          | |-/sys/fs/cgroup                    tmpfs  tmpfs  ro,nosuid,nodev,noexec,mode=755
 *          | | |-/sys/fs/cgroup/systemd          cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
 *          | | |-/sys/fs/cgroup/cpuset           cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
 * >>>>     | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
 *          | | |-/sys/fs/cgroup/hugetlb          cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
 *          | | |-/sys/fs/cgroup/devices          cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
 *          | | |-/sys/fs/cgroup/blkio            cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
 *          | | |-/sys/fs/cgroup/memory           cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
 * >>>>     | | |-/sys/fs/cgroup/cpu,cpuacct      cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
 *          | | |-/sys/fs/cgroup/freezer          cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
 *          | | |-/sys/fs/cgroup/pids             cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
 *          | | `-/sys/fs/cgroup/perf_event       cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
 *
 * Ensure that we always use the systemd-guaranteed stable order when checking
 * for the mountpoint.
 */
__attribute__((returns_nonnull)) __attribute__((nonnull))
static const char *stable_order(const char *controllers)
{
	/* Normalize the known co-mounted pairs to systemd's stable order. */
	if (strequal(controllers, "cpuacct,cpu"))
		return "cpu,cpuacct";

	if (strequal(controllers, "net_prio,net_cls"))
		return "net_cls,net_prio";

	/* Otherwise just drop any "name=" prefix. */
	return unprefix(controllers);
}
3137
/*
 * Parse /proc/<pid>/cgroup line by line and register every delegated cgroup
 * hierarchy (legacy and unified) with @ops, then derive the overall cgroup
 * layout (legacy, hybrid, or unified).
 *
 * @ops:          cgroup operations context; receives the hierarchies, the
 *                delegate list for the unified hierarchy, and the layout.
 * @relative:     when set, stay relative to our own cgroup instead of
 *                escaping to PID 1's cgroup.
 * @unprivileged: whether the container uses an id mapping.
 *                NOTE(review): currently unused in this body — confirm
 *                whether a delegation check was meant to consult it.
 *
 * Returns 0 on success, a negative errno-style value on failure. Hierarchies
 * that are not mounted, not delegated, or whose cgroup path does not exist
 * are skipped, not treated as errors.
 */
static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
				bool unprivileged)
{
	__do_free char *cgroup_info = NULL;
	char *it;

	/*
	 * Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!cgroup_info)
		return ret_errno(ENOMEM);

	/* One line per hierarchy: "<id>:<controllers>:<cgroup-path>". */
	lxc_iterate_parts(it, cgroup_info, "\n") {
		/* Per-hierarchy resources, auto-cleaned unless ownership is
		 * transferred at the end of the iteration. */
		__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
		__do_free char *controllers = NULL, *current_cgroup = NULL;
		__do_free_string_list char **controller_list = NULL,
					   **delegate = NULL;
		char *line;
		int dfd, ret, type;

		/* Handle the unified cgroup hierarchy. */
		line = it;
		if (unified_cgroup(line)) {
			char *unified_mnt;

			type = UNIFIED_HIERARCHY;

			current_cgroup = current_unified_cgroup(relative, line);
			if (IS_ERR(current_cgroup))
				return PTR_ERR(current_cgroup);

			/* cgroup2-only host: the mount fd itself is the
			 * unified hierarchy; otherwise look for the "unified"
			 * subdirectory of a hybrid layout. */
			if (unified_cgroup_fd(ops->dfd_mnt)) {
				dfd_mnt = dup_cloexec(ops->dfd_mnt);
				unified_mnt = "";
			} else {
				dfd_mnt = open_at(ops->dfd_mnt,
						  "unified",
						  PROTECT_OPATH_DIRECTORY,
						  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
				unified_mnt = "unified";
			}
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/unified", ops->dfd_mnt);

				SYSTRACE("Unified cgroup not mounted");
				continue;
			}
			dfd = dfd_mnt;

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			/* Skip the hierarchy entirely if it isn't delegated
			 * to us. */
			if (!unified_hierarchy_delegated(dfd, &delegate))
				continue;

			controller_list = unified_controllers(dfd, "cgroup.controllers");
			if (!controller_list) {
				TRACE("No controllers are enabled for delegation in the unified hierarchy");
				controller_list = list_new();
				if (!controller_list)
					return syserror_set(-ENOMEM, "Failed to create empty controller list");
			}

			controllers = strdup(unified_mnt);
			if (!controllers)
				return ret_errno(ENOMEM);
		} else {
			char *__controllers, *__current_cgroup;

			type = LEGACY_HIERARCHY;

			/* Skip past the hierarchy id to the controller list. */
			__controllers = strchr(line, ':');
			if (!__controllers)
				return ret_errno(EINVAL);
			__controllers++;

			/* Split controller list from the cgroup path. */
			__current_cgroup = strchr(__controllers, ':');
			if (!__current_cgroup)
				return ret_errno(EINVAL);
			*__current_cgroup = '\0';
			__current_cgroup++;

			/* Use the systemd-stable mountpoint name. */
			controllers = strdup(stable_order(__controllers));
			if (!controllers)
				return ret_errno(ENOMEM);

			dfd_mnt = open_at(ops->dfd_mnt,
					  controllers,
					  PROTECT_OPATH_DIRECTORY,
					  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/%s",
							ops->dfd_mnt, controllers);

				SYSTRACE("%s not mounted", controllers);
				continue;
			}
			dfd = dfd_mnt;

			/* /proc/<pid>/cgroup paths are always absolute. */
			if (!abspath(__current_cgroup))
				return ret_errno(EINVAL);

			/* remove init.scope */
			if (!relative)
				__current_cgroup = prune_init_scope(__current_cgroup);

			/* create a relative path */
			__current_cgroup = deabs(__current_cgroup);

			current_cgroup = strdup(__current_cgroup);
			if (!current_cgroup)
				return ret_errno(ENOMEM);

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			if (!legacy_hierarchy_delegated(dfd))
				continue;

			/*
			 * We intentionally pass the raw __controllers string
			 * here and not the remapped `controllers`, since the
			 * latter was rewritten (e.g. "name=" stripped) to
			 * match the mountpoint name and would otherwise chop
			 * the controller list.
			 */
			controller_list = list_add_controllers(__controllers);
			if (!controller_list)
				return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);

			if (skip_hierarchy(ops, controller_list))
				continue;

			ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		}

		ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
					   current_cgroup, controller_list, type);
		if (ret < 0)
			return syserror_ret(ret, "Failed to add %s hierarchy", controllers);

		/* Transfer ownership. */
		move_fd(dfd_mnt);
		move_fd(dfd_base);
		move_ptr(current_cgroup);
		move_ptr(controllers);
		move_ptr(controller_list);
		if (type == UNIFIED_HIERARCHY)
			ops->unified->delegate = move_ptr(delegate);
	}

	/* determine cgroup layout */
	if (ops->unified) {
		if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			/* Both legacy and unified hierarchies found. */
			ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else {
			if (bpf_devices_cgroup_supported())
				ops->unified->utilities |= DEVICES_CONTROLLER;
			ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
		}
	}

	if (!controllers_available(ops))
		return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");

	return 0;
}
3336
3337 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3338 {
3339 __do_close int dfd = -EBADF;
3340 int ret;
3341 const char *controllers_use;
3342
3343 if (ops->dfd_mnt >= 0)
3344 return ret_errno(EBUSY);
3345
3346 /*
3347 * I don't see the need for allowing symlinks here. If users want to
3348 * have their hierarchy available in different locations I strongly
3349 * suggest bind-mounts.
3350 */
3351 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3352 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3353 if (dfd < 0)
3354 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3355
3356 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3357 if (controllers_use) {
3358 __do_free char *dup = NULL;
3359 char *it;
3360
3361 dup = strdup(controllers_use);
3362 if (!dup)
3363 return -errno;
3364
3365 lxc_iterate_parts(it, dup, ",") {
3366 ret = list_add_string(&ops->cgroup_use, it);
3367 if (ret < 0)
3368 return ret;
3369 }
3370 }
3371
3372 /*
3373 * Keep dfd referenced by the cleanup function and actually move the fd
3374 * once we know the initialization succeeded. So if we fail we clean up
3375 * the dfd.
3376 */
3377 ops->dfd_mnt = dfd;
3378
3379 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
3380 if (ret < 0)
3381 return syserror_ret(ret, "Failed to initialize cgroups");
3382
3383 /* Transfer ownership to cgroup_ops. */
3384 move_fd(dfd);
3385 return 0;
3386 }
3387
3388 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3389 {
3390 const char *cgroup_pattern;
3391
3392 if (!ops)
3393 return ret_set_errno(-1, ENOENT);
3394
3395 /* copy system-wide cgroup information */
3396 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3397 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3398 ops->cgroup_pattern = strdup(cgroup_pattern);
3399 if (!ops->cgroup_pattern)
3400 return ret_errno(ENOMEM);
3401 }
3402
3403 return 0;
3404 }
3405
3406 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3407 {
3408 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
3409
3410 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3411 if (!cgfsng_ops)
3412 return ret_set_errno(NULL, ENOMEM);
3413
3414 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3415 cgfsng_ops->dfd_mnt = -EBADF;
3416
3417 if (initialize_cgroups(cgfsng_ops, conf))
3418 return NULL;
3419
3420 cgfsng_ops->data_init = cgfsng_data_init;
3421 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3422 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3423 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3424 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3425 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3426 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3427 cgfsng_ops->payload_create = cgfsng_payload_create;
3428 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3429 cgfsng_ops->finalize = cgfsng_finalize;
3430 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3431 cgfsng_ops->get = cgfsng_get;
3432 cgfsng_ops->set = cgfsng_set;
3433 cgfsng_ops->freeze = cgfsng_freeze;
3434 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3435 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3436 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3437 cgfsng_ops->driver = "cgfsng";
3438 cgfsng_ops->version = "1.0.0";
3439 cgfsng_ops->attach = cgfsng_attach;
3440 cgfsng_ops->chown = cgfsng_chown;
3441 cgfsng_ops->mount = cgfsng_mount;
3442 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3443 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3444
3445 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3446 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3447 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3448
3449 return move_ptr(cgfsng_ops);
3450 }
3451
3452 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3453 {
3454 int ret;
3455
3456 if (!lxc_list_empty(&conf->id_map)) {
3457 struct userns_exec_unified_attach_data args = {
3458 .conf = conf,
3459 .unified_fd = fd_unified,
3460 .pid = pid,
3461 };
3462
3463 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3464 if (ret < 0)
3465 return -errno;
3466
3467 ret = userns_exec_minimal(conf,
3468 cgroup_unified_attach_parent_wrapper,
3469 &args,
3470 cgroup_unified_attach_child_wrapper,
3471 &args);
3472 } else {
3473 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3474 }
3475
3476 return ret;
3477 }
3478
3479 static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
3480 const char *lxcpath, pid_t pid)
3481 {
3482 call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
3483 int ret;
3484 size_t idx;
3485 ssize_t pidstr_len;
3486 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
3487
3488 ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
3489 if (ret < 0)
3490 return ret_errno(ENOSYS);
3491
3492 pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
3493 if (pidstr_len < 0)
3494 return pidstr_len;
3495
3496 for (idx = 0; idx < ctx->fd_len; idx++) {
3497 int dfd_con = ctx->fd[idx];
3498
3499 if (unified_cgroup_fd(dfd_con))
3500 ret = __unified_attach_fd(conf, dfd_con, pid);
3501 else
3502 ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
3503 if (ret)
3504 return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
3505 else
3506 TRACE("Attached to cgroup fd %d", dfd_con);
3507 }
3508
3509 if (idx == 0)
3510 return syserror_set(-ENOENT, "Failed to attach to cgroups");
3511
3512 TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
3513 return 0;
3514 }
3515
3516 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3517 const char *lxcpath, pid_t pid)
3518 {
3519 __do_close int dfd_unified = -EBADF;
3520
3521 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3522 return ret_errno(EINVAL);
3523
3524 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3525 if (dfd_unified < 0)
3526 return ret_errno(ENOSYS);
3527
3528 return __unified_attach_fd(conf, dfd_unified, pid);
3529 }
3530
/*
 * Attach @pid to the container's cgroups, preferring the multi-hierarchy
 * path and falling back to a pure unified-hierarchy attach.
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	int ret;

	ret = __cgroup_attach_many(conf, name, lxcpath, pid);
	if (ret >= 0)
		return ret;

	/* Only "not supported" triggers the fallback. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret))
		return ret;

	ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
		return ret_errno(ENOSYS);

	return ret;
}
3548
3549 /* Connects to command socket therefore isn't callable from command handler. */
3550 int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
3551 {
3552 __do_close int dfd = -EBADF;
3553 struct cgroup_fd fd = {
3554 .fd = -EBADF,
3555 };
3556 size_t len_controller;
3557 int ret;
3558
3559 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3560 is_empty_string(key))
3561 return ret_errno(EINVAL);
3562
3563 if ((buf && !len) || (len && !buf))
3564 return ret_errno(EINVAL);
3565
3566 len_controller = strcspn(key, ".");
3567 len_controller++; /* Don't forget the \0 byte. */
3568 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3569 return ret_errno(EINVAL);
3570 (void)strlcpy(fd.controller, key, len_controller);
3571
3572 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3573 if (ret < 0) {
3574 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3575 return ret;
3576
3577 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3578 if (dfd < 0) {
3579 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3580 return ret;
3581
3582 return ret_errno(ENOSYS);
3583 }
3584 fd.type = UNIFIED_HIERARCHY;
3585 fd.fd = move_fd(dfd);
3586 }
3587 dfd = move_fd(fd.fd);
3588
3589 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3590
3591 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3592 return ret_errno(EOPNOTSUPP);
3593 else
3594 ret = lxc_read_try_buf_at(dfd, key, buf, len);
3595
3596 return ret;
3597 }
3598
3599 /* Connects to command socket therefore isn't callable from command handler. */
3600 int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
3601 {
3602 __do_close int dfd = -EBADF;
3603 struct cgroup_fd fd = {
3604 .fd = -EBADF,
3605 };
3606 size_t len_controller;
3607 int ret;
3608
3609 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3610 is_empty_string(key) || is_empty_string(value))
3611 return ret_errno(EINVAL);
3612
3613 len_controller = strcspn(key, ".");
3614 len_controller++; /* Don't forget the \0 byte. */
3615 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3616 return ret_errno(EINVAL);
3617 (void)strlcpy(fd.controller, key, len_controller);
3618
3619 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3620 if (ret < 0) {
3621 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3622 return ret;
3623
3624 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3625 if (dfd < 0) {
3626 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3627 return ret;
3628
3629 return ret_errno(ENOSYS);
3630 }
3631 fd.type = UNIFIED_HIERARCHY;
3632 fd.fd = move_fd(dfd);
3633 }
3634 dfd = move_fd(fd.fd);
3635
3636 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3637
3638 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
3639 struct device_item device = {};
3640
3641 ret = device_cgroup_rule_parse(&device, key, value);
3642 if (ret < 0)
3643 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3644 key, value);
3645
3646 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3647 } else {
3648 ret = lxc_writeat(dfd, key, value, strlen(value));
3649 }
3650
3651 return ret;
3652 }
3653
3654 static int do_cgroup_freeze(int unified_fd,
3655 const char *state_string,
3656 int state_num,
3657 int timeout,
3658 const char *epoll_error,
3659 const char *wait_error)
3660 {
3661 __do_close int events_fd = -EBADF;
3662 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3663 int ret;
3664 struct lxc_epoll_descr descr = {};
3665
3666 if (timeout != 0) {
3667 ret = lxc_mainloop_open(&descr);
3668 if (ret)
3669 return log_error_errno(-1, errno, "%s", epoll_error);
3670
3671 /* automatically cleaned up now */
3672 descr_ptr = &descr;
3673
3674 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3675 if (events_fd < 0)
3676 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3677
3678 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3679 if (ret < 0)
3680 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3681 }
3682
3683 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3684 if (ret < 0)
3685 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3686
3687 if (timeout != 0) {
3688 ret = lxc_mainloop(&descr, timeout);
3689 if (ret)
3690 return log_error_errno(-1, errno, "%s", wait_error);
3691 }
3692
3693 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3694 }
3695
/* Freeze the cgroup behind @unified_fd, waiting up to @timeout when set. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	static const char *const epoll_err =
		"Failed to create epoll instance to wait for container freeze";
	static const char *const wait_err =
		"Failed to wait for container to be frozen";

	return do_cgroup_freeze(unified_fd, "1", 1, timeout, epoll_err, wait_err);
}
3702
3703 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3704 {
3705 __do_close int unified_fd = -EBADF;
3706 int ret;
3707
3708 if (is_empty_string(name) || is_empty_string(lxcpath))
3709 return ret_errno(EINVAL);
3710
3711 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3712 if (unified_fd < 0)
3713 return ret_errno(ENOCGROUP2);
3714
3715 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3716 ret = __cgroup_freeze(unified_fd, timeout);
3717 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3718 return ret;
3719 }
3720
/* Thaw the cgroup behind @unified_fd, waiting up to @timeout when set. */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	/*
	 * Bugfix: the error strings were copy-pasted from the freeze path;
	 * this is the thaw path, so talk about unfreezing.
	 */
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3727
3728 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3729 {
3730 __do_close int unified_fd = -EBADF;
3731 int ret;
3732
3733 if (is_empty_string(name) || is_empty_string(lxcpath))
3734 return ret_errno(EINVAL);
3735
3736 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3737 if (unified_fd < 0)
3738 return ret_errno(ENOCGROUP2);
3739
3740 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3741 ret = __cgroup_unfreeze(unified_fd, timeout);
3742 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3743 return ret;
3744 }