]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
cgroups: fix declarations and headers
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #include "config.h"
16
17 #include <ctype.h>
18 #include <dirent.h>
19 #include <errno.h>
20 #include <grp.h>
21 #include <linux/kdev_t.h>
22 #include <linux/types.h>
23 #include <poll.h>
24 #include <signal.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/epoll.h>
30 #include <sys/types.h>
31 #include <unistd.h>
32
33 #include "cgroup.h"
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup2_devices.h"
37 #include "cgroup_utils.h"
38 #include "commands.h"
39 #include "commands_utils.h"
40 #include "conf.h"
41 #include "error_utils.h"
42 #include "log.h"
43 #include "macro.h"
44 #include "mainloop.h"
45 #include "memory_utils.h"
46 #include "mount_utils.h"
47 #include "storage/storage.h"
48 #include "string_utils.h"
49 #include "syscall_wrappers.h"
50 #include "utils.h"
51
52 #ifndef HAVE_STRLCPY
53 #include "strlcpy.h"
54 #endif
55
56 #ifndef HAVE_STRLCAT
57 #include "strlcat.h"
58 #endif
59
60 lxc_log_define(cgfsng, cgroup);
61
/*
 * Grow a NULL-terminated pointer array by one slot. Cannot shrink. On
 * success the list stays NULL-terminated and the index of the freshly
 * usable slot (the second-to-last entry) is returned; the caller is
 * expected to fill it in. Returns negative errno on allocation failure.
 */
static int cg_list_add(void ***list)
{
	void **grown;
	int idx = 0;

	/* Count the current entries (the array is NULL-terminated). */
	if (*list) {
		while ((*list)[idx])
			idx++;
	}

	/* One new usable slot plus the terminating NULL entry. */
	grown = realloc(*list, (idx + 2) * sizeof(void *));
	if (!grown)
		return ret_errno(ENOMEM);

	grown[idx + 1] = NULL;
	*list = grown;

	return idx;
}
86
/*
 * Check whether @entry occurs in the NULL-terminated string array @list.
 * A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **it = list; *it; it++) {
		if (strequal(*it, entry))
			return true;
	}

	return false;
}
101
/* Given a handler's cgroup data, return the struct hierarchy for the
 * controller @controller, or NULL (with errno set to ENOENT) if there is
 * none. A NULL @controller requests the empty unified hierarchy.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2: "devices" and "freezer" only exist
		 * as utility features of the unified hierarchy there.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
148
149 int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
150 {
151 int dfd;
152 const struct hierarchy *h;
153
154 h = get_hierarchy(ops, fd->controller);
155 if (!h)
156 return ret_errno(ENOENT);
157
158 /*
159 * The client requested that the controller must be in a specific
160 * cgroup version.
161 */
162 if (fd->type != 0 && (cgroupfs_type_magic_t)fd->type != h->fs_type)
163 return ret_errno(EINVAL);
164
165 if (limit)
166 dfd = h->dfd_con;
167 else
168 dfd = h->dfd_lim;
169 if (dfd < 0)
170 return ret_errno(EBADF);
171
172 fd->layout = ops->cgroup_layout;
173 fd->type = h->fs_type;
174 if (fd->type == UNIFIED_HIERARCHY)
175 fd->utilities = h->utilities;
176 fd->fd = dfd;
177
178 return 0;
179 }
180
181 /* Taken over modified from the kernel sources. */
182 #define NBITS 32 /* bits in uint32_t */
183 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
184 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
185
186 static void set_bit(unsigned bit, uint32_t *bitarr)
187 {
188 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
189 }
190
191 static void clear_bit(unsigned bit, uint32_t *bitarr)
192 {
193 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
194 }
195
196 static bool is_set(unsigned bit, uint32_t *bitarr)
197 {
198 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
199 }
200
201 /* Create cpumask from cpulist aka turn:
202 *
203 * 0,2-3
204 *
205 * into bit array
206 *
207 * 1 0 1 1
208 */
209 static int lxc_cpumask(char *buf, uint32_t **bitarr, size_t *last_set_bit)
210 {
211 __do_free uint32_t *arr_u32 = NULL;
212 size_t cur_last_set_bit = 0, nbits = 256;
213 size_t nr_u32;
214 char *token;
215
216 nr_u32 = BITS_TO_LONGS(nbits);
217 arr_u32 = zalloc(nr_u32 * sizeof(uint32_t));
218 if (!arr_u32)
219 return ret_errno(ENOMEM);
220
221 lxc_iterate_parts(token, buf, ",") {
222 unsigned last_bit, first_bit;
223 char *range;
224
225 errno = 0;
226 first_bit = strtoul(token, NULL, 0);
227 last_bit = first_bit;
228 range = strchr(token, '-');
229 if (range)
230 last_bit = strtoul(range + 1, NULL, 0);
231
232 if (!(first_bit <= last_bit))
233 return ret_errno(EINVAL);
234
235 if (last_bit >= nbits) {
236 size_t add_bits = last_bit - nbits + 32;
237 size_t new_nr_u32;
238 uint32_t *p;
239
240 new_nr_u32 = BITS_TO_LONGS(nbits + add_bits);
241 p = realloc(arr_u32, new_nr_u32 * sizeof(uint32_t));
242 if (!p)
243 return ret_errno(ENOMEM);
244 arr_u32 = move_ptr(p);
245
246 memset(arr_u32 + nr_u32, 0,
247 (new_nr_u32 - nr_u32) * sizeof(uint32_t));
248 nbits += add_bits;
249 }
250
251 while (first_bit <= last_bit)
252 set_bit(first_bit++, arr_u32);
253
254 if (last_bit > cur_last_set_bit)
255 cur_last_set_bit = last_bit;
256 }
257
258 *last_set_bit = cur_last_set_bit;
259 *bitarr = move_ptr(arr_u32);
260 return 0;
261 }
262
263 static int lxc_cpumask_update(char *buf, uint32_t *bitarr, size_t last_set_bit,
264 bool clear)
265 {
266 bool flipped = false;
267 char *token;
268
269 lxc_iterate_parts(token, buf, ",") {
270 unsigned last_bit, first_bit;
271 char *range;
272
273 errno = 0;
274 first_bit = strtoul(token, NULL, 0);
275 last_bit = first_bit;
276 range = strchr(token, '-');
277 if (range)
278 last_bit = strtoul(range + 1, NULL, 0);
279
280 if (!(first_bit <= last_bit)) {
281 WARN("The cup range seems to be inverted: %u-%u", first_bit, last_bit);
282 continue;
283 }
284
285 if (last_bit > last_set_bit)
286 continue;
287
288 while (first_bit <= last_bit) {
289 if (clear && is_set(first_bit, bitarr)) {
290 flipped = true;
291 clear_bit(first_bit, bitarr);
292 } else if (!clear && !is_set(first_bit, bitarr)) {
293 flipped = true;
294 set_bit(first_bit, bitarr);
295 }
296
297 first_bit++;
298 }
299 }
300
301 if (flipped)
302 return 1;
303
304 return 0;
305 }
306
/* Turn cpumask into simple, comma-separated cpulist, e.g. "0,2,3".
 * Scans @bitarr up to and including @last_set_bit. Returns a heap-allocated
 * string on success; returns NULL on error or when no bit in the scanned
 * range is set (errno is set to ENOMEM in that case).
 */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t last_set_bit)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
	int ret;

	for (size_t i = 0; i <= last_set_bit; i++) {
		if (!is_set(i, bitarr))
			continue;

		/* Render the cpu number and append it to the list. */
		ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0)
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	/* No bit was set at all. */
	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}
332
333 static inline bool is_unified_hierarchy(const struct hierarchy *h)
334 {
335 return h->fs_type == UNIFIED_HIERARCHY;
336 }
337
338 /* Return true if the controller @entry is found in the null-terminated list of
339 * hierarchies @hlist.
340 */
341 static bool controller_available(struct hierarchy **hlist, char *entry)
342 {
343 if (!hlist)
344 return false;
345
346 for (int i = 0; hlist[i]; i++)
347 if (string_in_list(hlist[i]->controllers, entry))
348 return true;
349
350 return false;
351 }
352
353 static bool controllers_available(struct cgroup_ops *ops)
354 {
355 struct hierarchy **hlist;
356
357 if (!ops->cgroup_use)
358 return true;
359
360 hlist = ops->hierarchies;
361 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
362 if (!controller_available(hlist, *cur))
363 return log_error(false, "The %s controller found", *cur);
364
365 return true;
366 }
367
368 static char **list_new(void)
369 {
370 __do_free_string_list char **list = NULL;
371 int idx;
372
373 idx = cg_list_add((void ***)&list);
374 if (idx < 0)
375 return NULL;
376
377 list[idx] = NULL;
378 return move_ptr(list);
379 }
380
381 static int list_add_string(char ***list, char *entry)
382 {
383 __do_free char *dup = NULL;
384 int idx;
385
386 dup = strdup(entry);
387 if (!dup)
388 return ret_errno(ENOMEM);
389
390 idx = cg_list_add((void ***)list);
391 if (idx < 0)
392 return idx;
393
394 (*list)[idx] = move_ptr(dup);
395 return 0;
396 }
397
398 static char **list_add_controllers(char *controllers)
399 {
400 __do_free_string_list char **list = NULL;
401 char *it;
402
403 lxc_iterate_parts(it, controllers, ", \t\n") {
404 int ret;
405
406 ret = list_add_string(&list, it);
407 if (ret < 0)
408 return NULL;
409 }
410
411 return move_ptr(list);
412 }
413
414 static char **unified_controllers(int dfd, const char *file)
415 {
416 __do_free char *buf = NULL;
417
418 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
419 if (!buf)
420 return NULL;
421
422 return list_add_controllers(buf);
423 }
424
425 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
426 {
427 if (!ops->cgroup_use)
428 return false;
429
430 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
431 bool found = false;
432
433 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
434 if (!strequal(*cur_use, *cur_ctrl))
435 continue;
436
437 found = true;
438 break;
439 }
440
441 if (found)
442 continue;
443
444 return true;
445 }
446
447 return false;
448 }
449
450 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
451 int dfd_base, char *base_cgroup,
452 char **controllers, cgroupfs_type_magic_t fs_type)
453 {
454 __do_free struct hierarchy *new = NULL;
455 int idx;
456
457 if (abspath(base_cgroup))
458 return syserror_set(-EINVAL, "Container base path must be relative to controller mount");
459
460 new = zalloc(sizeof(*new));
461 if (!new)
462 return ret_errno(ENOMEM);
463
464 new->dfd_con = -EBADF;
465 new->dfd_lim = -EBADF;
466 new->dfd_mon = -EBADF;
467
468 new->fs_type = fs_type;
469 new->controllers = controllers;
470 new->at_mnt = mnt;
471 new->at_base = base_cgroup;
472
473 new->dfd_mnt = dfd_mnt;
474 new->dfd_base = dfd_base;
475
476 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
477 mnt, maybe_empty(base_cgroup));
478 for (char *const *it = new->controllers; it && *it; it++)
479 TRACE("The hierarchy contains the %s controller", *it);
480
481 idx = cg_list_add((void ***)&ops->hierarchies);
482 if (idx < 0)
483 return ret_errno(idx);
484
485 if (fs_type == UNIFIED_HIERARCHY)
486 ops->unified = new;
487 (ops->hierarchies)[idx] = move_ptr(new);
488
489 return 0;
490 }
491
492 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
493 {
494 if (!path_prune || !hierarchies)
495 return 0;
496
497 for (int i = 0; hierarchies[i]; i++) {
498 struct hierarchy *h = hierarchies[i];
499 int ret;
500
501 ret = cgroup_tree_prune(h->dfd_base, path_prune);
502 if (ret < 0)
503 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
504 else
505 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
506
507 free_equal(h->path_lim, h->path_con);
508 }
509
510 return 0;
511 }
512
/* Argument bundle for helpers executed via userns_exec_1(). */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *path_prune; /* cgroup path to remove */
	struct lxc_conf *conf; /* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path; /* NOTE(review): not referenced by the wrappers visible here — confirm use */
};
520
521 static int cgroup_tree_remove_wrapper(void *data)
522 {
523 struct generic_userns_exec_data *arg = data;
524 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
525 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
526 int ret;
527
528 if (!lxc_drop_groups() && errno != EPERM)
529 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
530
531 ret = setresgid(nsgid, nsgid, nsgid);
532 if (ret < 0)
533 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
534 (int)nsgid, (int)nsgid, (int)nsgid);
535
536 ret = setresuid(nsuid, nsuid, nsuid);
537 if (ret < 0)
538 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
539 (int)nsuid, (int)nsuid, (int)nsuid);
540
541 return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
542 }
543
/*
 * Destroy the container's payload cgroups in all hierarchies. Any attached
 * cgroup2 device bpf program is detached first. When the container uses an
 * id mapping the removal runs from inside the user namespace so it happens
 * with the mapped privileges.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy when no hierarchies were discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	/* Detach any cgroup2 device program before pruning the tree. */
	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!list_empty(&handler->conf->id_map)) {
		/* Remove from within the user namespace. */
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
591
592 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
593 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
594 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
595 bool am_initialized)
596 {
597 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
598 *offlinecpus = NULL, *posscpus = NULL;
599 __do_free uint32_t *possmask = NULL;
600 int ret;
601 size_t poss_last_set_bit = 0;
602
603 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
604 if (!posscpus)
605 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
606
607 if (file_exists(__ISOL_CPUS)) {
608 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
609 if (!isolcpus)
610 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
611
612 if (!isdigit(isolcpus[0]))
613 free_disarm(isolcpus);
614 } else {
615 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
616 }
617
618 if (file_exists(__OFFLINE_CPUS)) {
619 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
620 if (!offlinecpus)
621 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
622
623 if (!isdigit(offlinecpus[0]))
624 free_disarm(offlinecpus);
625 } else {
626 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
627 }
628
629 if (!isolcpus && !offlinecpus) {
630 cpulist = move_ptr(posscpus);
631 goto copy_parent;
632 }
633
634 ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit);
635 if (ret)
636 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
637
638 if (isolcpus)
639 ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true);
640
641 if (offlinecpus)
642 ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true);
643
644 if (!ret) {
645 cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit);
646 TRACE("No isolated or offline cpus present in cpuset");
647 } else {
648 cpulist = move_ptr(posscpus);
649 TRACE("Removed isolated or offline cpus from cpuset");
650 }
651 if (!cpulist)
652 return log_error_errno(false, errno, "Failed to create cpu list");
653
654 copy_parent:
655 if (!am_initialized) {
656 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
657 if (ret < 0)
658 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
659
660 TRACE("Copied cpu settings of parent cgroup");
661 }
662
663 return true;
664 }
665
/*
 * Prepare a new child cgroup in the legacy cpuset hierarchy: copy
 * cpuset.cpus (minus isolated/offline cpus) and cpuset.mems from the base
 * cgroup @dfd_base into @dfd_next and turn on clone_children inheritance.
 * Returns true on success.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/* Determine whether the base cgroup has cpuset inheritance turned on. */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/* Initialize cpuset.cpus removing any isolated and offline cpus. */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
698
/*
 * Create the (relative) cgroup path @path component by component below
 * @dfd_base with mode @mode. Intermediate components may already exist;
 * whether the final one may is governed by @eexist_ignore. For the legacy
 * cpuset hierarchy (@cpuset_v1) the first created component is initialized
 * via cpuset1_initialize(). Returns an O_PATH fd to the final cgroup
 * directory on success, negative errno on failure.
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	/* Work on a copy since lxc_iterate_parts() mutates the buffer. */
	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component pre-existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
760
/*
 * Create the container's (payload) or monitor's cgroup in hierarchy @h and
 * record the resulting paths and fds in @h. With @payload and a separate
 * @cgroup_leaf (isolation), a limit cgroup is created at @cgroup_limit_dir
 * and the container's leaf cgroup below it; otherwise a single cgroup
 * serves both purposes. Returns true on success.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		h->dfd_lim = move_fd(fd_limit);

		TRACE("Created limit cgroup %d->%d(%s)",
		      h->dfd_lim, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * iinitialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_warn(false, "Failed to setup legacy device limits");

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
		}
		h->dfd_con = move_fd(fd_final);
		h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);

	} else {
		/* No isolation: one cgroup for limits and residence. */
		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_final < 0)
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

		if (payload) {
			/* Limit and container cgroup alias each other. */
			h->dfd_con = move_fd(fd_final);
			h->dfd_lim = h->dfd_con;
			h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

			h->path_lim = h->path_con;
		} else {
			h->dfd_mon = move_fd(fd_final);
		}
	}

	return true;
}
830
831 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
832 bool payload)
833 {
834 bool prune = true;
835
836 if (payload) {
837 /* Check whether we actually created the cgroup to prune. */
838 if (h->dfd_lim < 0)
839 prune = false;
840
841 free_equal(h->path_con, h->path_lim);
842 close_equal(h->dfd_con, h->dfd_lim);
843 } else {
844 /* Check whether we actually created the cgroup to prune. */
845 if (h->dfd_mon < 0)
846 prune = false;
847
848 close_prot_errno_disarm(h->dfd_mon);
849 }
850
851 /* We didn't create this cgroup. */
852 if (!prune)
853 return;
854
855 if (cgroup_tree_prune(h->dfd_base, path_prune))
856 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
857 else
858 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
859 }
860
/*
 * Destroy the monitor cgroup in every hierarchy. Since a cgroup cannot be
 * removed while it still has member processes, the monitor process is first
 * moved into a transient "pivot" cgroup before its old cgroup is pruned.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy when no hierarchies were discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the configured pivot location, most specific first. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		/* The pivot cgroup may already exist; that's fine. */
		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		/* Move the monitor out of the cgroup we're about to prune. */
		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
938
939 /*
940 * Check we have no lxc.cgroup.dir, and that lxc.cgroup.dir.limit_prefix is a
941 * proper prefix directory of lxc.cgroup.dir.payload.
942 *
 * Returns true if the configuration is consistent, false otherwise.
944 */
945 static bool check_cgroup_dir_config(struct lxc_conf *conf)
946 {
947 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
948 *container_dir = conf->cgroup_meta.container_dir,
949 *namespace_dir = conf->cgroup_meta.namespace_dir;
950
951 /* none of the new options are set, all is fine */
952 if (!monitor_dir && !container_dir && !namespace_dir)
953 return true;
954
955 /* some are set, make sure lxc.cgroup.dir is not also set*/
956 if (conf->cgroup_meta.dir)
957 return log_error_errno(false, EINVAL,
958 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
959
960 /* make sure both monitor and payload are set */
961 if (!monitor_dir || !container_dir)
962 return log_error_errno(false, EINVAL,
963 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
964
965 /* namespace_dir may be empty */
966 return true;
967 }
968
969 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
970 {
971 __do_free char *monitor_cgroup = NULL;
972 int idx = 0;
973 int i;
974 size_t len;
975 char *suffix = NULL;
976 struct lxc_conf *conf;
977
978 if (!ops)
979 return ret_set_errno(false, ENOENT);
980
981 if (!ops->hierarchies)
982 return true;
983
984 if (ops->monitor_cgroup)
985 return ret_set_errno(false, EEXIST);
986
987 if (!handler || !handler->conf)
988 return ret_set_errno(false, EINVAL);
989
990 conf = handler->conf;
991
992 if (!check_cgroup_dir_config(conf))
993 return false;
994
995 if (conf->cgroup_meta.monitor_dir) {
996 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
997 } else if (conf->cgroup_meta.dir) {
998 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
999 DEFAULT_MONITOR_CGROUP_PREFIX,
1000 handler->name,
1001 CGROUP_CREATE_RETRY, NULL);
1002 } else if (ops->cgroup_pattern) {
1003 __do_free char *cgroup_tree = NULL;
1004
1005 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1006 if (!cgroup_tree)
1007 return ret_set_errno(false, ENOMEM);
1008
1009 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1010 DEFAULT_MONITOR_CGROUP,
1011 CGROUP_CREATE_RETRY, NULL);
1012 } else {
1013 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1014 handler->name,
1015 CGROUP_CREATE_RETRY, NULL);
1016 }
1017 if (!monitor_cgroup)
1018 return ret_set_errno(false, ENOMEM);
1019
1020 if (!conf->cgroup_meta.monitor_dir) {
1021 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1022 *suffix = '\0';
1023 }
1024 do {
1025 if (idx && suffix)
1026 sprintf(suffix, "-%d", idx);
1027
1028 for (i = 0; ops->hierarchies[i]; i++) {
1029 if (cgroup_tree_create(ops, handler->conf,
1030 ops->hierarchies[i],
1031 monitor_cgroup, NULL, false))
1032 continue;
1033
1034 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1035 for (int j = 0; j <= i; j++)
1036 cgroup_tree_prune_leaf(ops->hierarchies[j],
1037 monitor_cgroup, false);
1038
1039 idx++;
1040 break;
1041 }
1042 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1043
1044 if (idx == 1000 || (!suffix && idx != 0))
1045 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1046
1047 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1048 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1049 }
1050
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * On success ops->container_cgroup holds the cgroup the container resides
 * in and ops->container_limit_cgroup the limiting cgroup; the two alias
 * each other unless lxc.cgroup.dir.container together with
 * lxc.cgroup.dir.container.inner requested isolation.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Nothing to create when no hierarchies were discovered. */
	if (!ops->hierarchies)
		return true;

	/* The payload cgroups must not have been created already. */
	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Isolation: the container lives below the limit cgroup. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	/*
	 * Cut off the retry suffix placeholder so it can be rewritten as
	 * "-<idx>" below; explicitly configured dirs are never retried.
	 */
	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			/* Undo the cgroups already created in prior hierarchies. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1160
1161 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1162 struct lxc_handler *handler)
1163 {
1164 int monitor_len, transient_len = 0;
1165 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1166 transient[INTTYPE_TO_STRLEN(pid_t)];
1167
1168 if (!ops)
1169 return ret_set_errno(false, ENOENT);
1170
1171 if (!ops->hierarchies)
1172 return true;
1173
1174 if (!ops->monitor_cgroup)
1175 return ret_set_errno(false, ENOENT);
1176
1177 if (!handler || !handler->conf)
1178 return ret_set_errno(false, EINVAL);
1179
1180 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1181 if (monitor_len < 0)
1182 return false;
1183
1184 if (handler->transient_pid > 0) {
1185 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1186 if (transient_len < 0)
1187 return false;
1188 }
1189
1190 for (int i = 0; ops->hierarchies[i]; i++) {
1191 struct hierarchy *h = ops->hierarchies[i];
1192 int ret;
1193
1194 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
1195 if (ret)
1196 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1197
1198 TRACE("Moved monitor into cgroup %d", h->dfd_mon);
1199
1200 if (handler->transient_pid <= 0)
1201 continue;
1202
1203 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
1204 if (ret)
1205 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1206
1207 TRACE("Moved transient process into cgroup %d", h->dfd_mon);
1208
1209 /*
1210 * we don't keep the fds for non-unified hierarchies around
1211 * mainly because we don't make use of them anymore after the
1212 * core cgroup setup is done but also because there are quite a
1213 * lot of them.
1214 */
1215 if (!is_unified_hierarchy(h))
1216 close_prot_errno_disarm(h->dfd_mon);
1217 }
1218 handler->transient_pid = -1;
1219
1220 return true;
1221 }
1222
1223 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1224 struct lxc_handler *handler)
1225 {
1226 int len;
1227 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1228
1229 if (!ops)
1230 return ret_set_errno(false, ENOENT);
1231
1232 if (!ops->hierarchies)
1233 return true;
1234
1235 if (!ops->container_cgroup)
1236 return ret_set_errno(false, ENOENT);
1237
1238 if (!handler || !handler->conf)
1239 return ret_set_errno(false, EINVAL);
1240
1241 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1242 if (len < 0)
1243 return false;
1244
1245 for (int i = 0; ops->hierarchies[i]; i++) {
1246 struct hierarchy *h = ops->hierarchies[i];
1247 int ret;
1248
1249 if (is_unified_hierarchy(h) &&
1250 (handler->clone_flags & CLONE_INTO_CGROUP))
1251 continue;
1252
1253 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1254 if (ret != 0)
1255 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1256
1257 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1258 }
1259
1260 return true;
1261 }
1262
1263 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1264 gid_t chown_gid, mode_t chmod_mode)
1265 {
1266 int ret;
1267
1268 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1269 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1270 if (ret < 0)
1271 return log_warn_errno(-1,
1272 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1273 dirfd, path, (int)chown_uid,
1274 (int)chown_gid);
1275
1276 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1277 if (ret < 0)
1278 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1279 dirfd, path, (int)chmod_mode);
1280
1281 return 0;
1282 }
1283
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs as a userns_exec_1() callback inside the container's user
 * namespace. Returns 0 on success, negative on failure.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root idmap the container's root maps to in-namespace id 0;
	 * otherwise fall back to the configured init uid/gid.
	 */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Order matters: switch gid first while we still hold the
	 * privilege to do so, then drop uid.
	 */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original host euid into this user namespace; fall back to
	 * in-namespace root if it is unmapped.
	 */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		/* Empty path: chown/chmod the cgroup directory itself. */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		/* cgroup2 only: also hand over the delegatable files. */
		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1346
1347 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1348 struct lxc_conf *conf)
1349 {
1350 struct generic_userns_exec_data wrap;
1351
1352 if (!ops)
1353 return ret_set_errno(false, ENOENT);
1354
1355 if (!ops->hierarchies)
1356 return true;
1357
1358 if (!ops->container_cgroup)
1359 return ret_set_errno(false, ENOENT);
1360
1361 if (!conf)
1362 return ret_set_errno(false, EINVAL);
1363
1364 if (list_empty(&conf->id_map))
1365 return true;
1366
1367 wrap.origuid = geteuid();
1368 wrap.path = NULL;
1369 wrap.hierarchies = ops->hierarchies;
1370 wrap.conf = conf;
1371
1372 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1373 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1374
1375 return true;
1376 }
1377
1378 __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
1379 {
1380 if (!ops)
1381 return;
1382
1383 if (!ops->hierarchies)
1384 return;
1385
1386 for (int i = 0; ops->hierarchies[i]; i++) {
1387 struct hierarchy *h = ops->hierarchies[i];
1388
1389 /* Close all monitor cgroup file descriptors. */
1390 close_prot_errno_disarm(h->dfd_mon);
1391 }
1392 /* Close the cgroup root file descriptor. */
1393 close_prot_errno_disarm(ops->dfd_mnt);
1394
1395 /*
1396 * The checking for freezer support should obviously be done at cgroup
1397 * initialization time but that doesn't work reliable. The freezer
1398 * controller has been demoted (rightly so) to a simple file located in
1399 * each non-root cgroup. At the time when the container is created we
1400 * might still be located in /sys/fs/cgroup and so checking for
1401 * cgroup.freeze won't tell us anything because this file doesn't exist
1402 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1403 * find an already existing cgroup and then check within that cgroup
1404 * for the existence of cgroup.freeze but that will only work on
1405 * systemd based hosts. Other init systems might not manage cgroups and
1406 * so no cgroup will exist. So we defer until we have created cgroups
1407 * for our container which means we check here.
1408 */
1409 if (pure_unified_layout(ops) &&
1410 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1411 AT_SYMLINK_NOFOLLOW)) {
1412 TRACE("Unified hierarchy supports freezer");
1413 ops->unified->utilities |= FREEZER_CONTROLLER;
1414 }
1415 }
1416
1417 /* cgroup-full:* is done, no need to create subdirs */
1418 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1419 {
1420 switch (cgroup_automount_type) {
1421 case LXC_AUTO_CGROUP_RO:
1422 return true;
1423 case LXC_AUTO_CGROUP_RW:
1424 return true;
1425 case LXC_AUTO_CGROUP_MIXED:
1426 return true;
1427 }
1428
1429 return false;
1430 }
1431
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * @cgroup_automount_type selects the read-only treatment:
 *  - RO: hierarchy mountpoint and the container's cgroup are both ro.
 *  - MIXED: hierarchy is ro but the container's own cgroup stays rw.
 *  - RW: everything stays writable.
 * Returns 0 on success, -1 on error.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* Read-only mountpoints need the bind + remount-ro dance: a bind
	 * mount cannot be created read-only in one step.
	 */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		/* Preserve mount flags (nosuid etc.) the kernel would
		 * otherwise refuse to drop on remount.
		 */
		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	/* Bind mount the container's own cgroup over the full path; keep it
	 * writable for the MIXED type so the container can manage itself.
	 */
	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1485
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 *
 * Uses the new mount API (fsopen/fsconfig/move_mount) when available and
 * falls back to classic mount(2) otherwise. @hierarchy_mnt is the mount
 * target relative to @dfd_mnt_cgroupfs. Returns 0 on success, negative
 * errno-style value on failure.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	/* The *_RO automount variants mount the hierarchy read-only. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		/* Legacy hierarchies need each controller (or the named
		 * hierarchy) configured on the filesystem context.
		 */
		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		/* Classic mount(2): controllers are passed as a comma list
		 * in the data argument.
		 */
		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* into legacy MS_* flags. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1564
/* Unconditional convenience wrapper around __cgroupfs_mount(). */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1572
1573 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1574 struct lxc_rootfs *rootfs,
1575 int dfd_mnt_cgroupfs,
1576 const char *hierarchy_mnt)
1577 {
1578 switch (cgroup_automount_type) {
1579 case LXC_AUTO_CGROUP_FULL_RO:
1580 break;
1581 case LXC_AUTO_CGROUP_FULL_RW:
1582 break;
1583 case LXC_AUTO_CGROUP_FULL_MIXED:
1584 break;
1585 default:
1586 return 0;
1587 }
1588
1589 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1590 dfd_mnt_cgroupfs, hierarchy_mnt);
1591 }
1592
/* Set up cgroup mounts inside the container's rootfs according to the
 * lxc.mount.auto cgroup flags in @cg_flags.
 *
 * Pure cgroup2 hosts get the unified hierarchy force-mounted (when a
 * cgroup namespace is in use); legacy/hybrid layouts get a tmpfs at
 * /sys/fs/cgroup with one mount per hierarchy. Returns true on success or
 * when no mounting is required.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	/* Consume the force flag; what remains selects the automount type. */
	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		/* A container without CAP_SYS_ADMIN cannot mount cgroups
		 * itself, so we must do it for it.
		 */
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently have init system that
		 * do support cgroup2 but do not mount it by default unless
		 * explicitly told so even if the host is cgroup2 only. That
		 * means they often will fail to boot. Fix this by pre-mounting
		 * cgroup2 by default. We will likely need to be doing this a
		 * few years until all distros have switched over to cgroup2 at
		 * which point we can safely assume that their init systems
		 * will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserror_ret(false, "Failed to open %d(%s)",
					    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 * 1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 * 2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 * 3. cgroup:mixed:force -> See comment above how this
			 *                          does not apply so
			 *                          cgroup:mixed is equal to
			 *                          cgroup:rw when cgroup
			 *                          namespaces are supported.
			 *
			 * 4. cgroup:rw    -> No-op; init system responsible for mounting.
			 * 5. cgroup:ro    -> No-op; init system responsible for mounting.
			 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 * 7. cgroup-full:rw    -> Not supported.
			 * 8. cgroup-full:ro    -> Not supported.
			 * 9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace supported (highly
			 * unlikely unless we're dealing with a Frankenkernel.
			 * Or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 * 1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 * 2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
				 *                          and make the parent directory of the
				 *                          container's cgroup read-only but the
				 *                          container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force ->
				 * 11. cgroup-full:ro:force ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserror_ret(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserror_ret(false, "Failed to open %d(%s)",
				    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	/* One mountpoint per hierarchy beneath the fresh tmpfs. */
	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		/* Pre-create the container's cgroup path beneath the mount
		 * so it can be bind-mounted in the second stage.
		 */
		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}
1832
1833 /* Only root needs to escape to the cgroup of its init. */
1834 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1835 struct lxc_conf *conf)
1836 {
1837 if (!ops)
1838 return ret_set_errno(false, ENOENT);
1839
1840 if (!ops->hierarchies)
1841 return true;
1842
1843 if (!conf)
1844 return ret_set_errno(false, EINVAL);
1845
1846 if (conf->cgroup_meta.relative || geteuid())
1847 return true;
1848
1849 for (int i = 0; ops->hierarchies[i]; i++) {
1850 __do_free char *fullpath = NULL;
1851 int ret;
1852
1853 fullpath = make_cgroup_path(ops->hierarchies[i],
1854 ops->hierarchies[i]->at_base,
1855 "cgroup.procs", NULL);
1856 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1857 if (ret != 0)
1858 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1859 }
1860
1861 return true;
1862 }
1863
1864 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1865 {
1866 int i = 0;
1867
1868 if (!ops)
1869 return ret_set_errno(-1, ENOENT);
1870
1871 if (!ops->hierarchies)
1872 return 0;
1873
1874 for (; ops->hierarchies[i]; i++)
1875 ;
1876
1877 return i;
1878 }
1879
1880 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1881 int n, char ***out)
1882 {
1883 int i;
1884
1885 if (!ops)
1886 return ret_set_errno(false, ENOENT);
1887
1888 if (!ops->hierarchies)
1889 return ret_set_errno(false, ENOENT);
1890
1891 /* consistency check n */
1892 for (i = 0; i < n; i++)
1893 if (!ops->hierarchies[i])
1894 return ret_set_errno(false, ENOENT);
1895
1896 *out = ops->hierarchies[i]->controllers;
1897
1898 return true;
1899 }
1900
1901 static int cg_legacy_freeze(struct cgroup_ops *ops)
1902 {
1903 struct hierarchy *h;
1904
1905 h = get_hierarchy(ops, "freezer");
1906 if (!h)
1907 return ret_set_errno(-1, ENOENT);
1908
1909 return lxc_write_openat(h->path_con, "freezer.state",
1910 "FROZEN", STRLITERALLEN("FROZEN"));
1911 }
1912
1913 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1914 struct lxc_async_descr *descr)
1915 {
1916 __do_free char *line = NULL;
1917 __do_fclose FILE *f = NULL;
1918 int state = PTR_TO_INT(cbdata);
1919 size_t len;
1920 const char *state_string;
1921
1922 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1923 if (!f)
1924 return LXC_MAINLOOP_ERROR;
1925
1926 if (state == 1)
1927 state_string = "frozen 1";
1928 else
1929 state_string = "frozen 0";
1930
1931 while (getline(&line, &len, f) != -1)
1932 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1933 return LXC_MAINLOOP_CLOSE;
1934
1935 rewind(f);
1936
1937 return LXC_MAINLOOP_CONTINUE;
1938 }
1939
/* Write @state_string into cgroup.freeze and, when @timeout is non-zero,
 * wait for cgroup.events to report "frozen @state_num" via an
 * EPOLLPRI-driven mainloop. @epoll_error and @wait_error customize the
 * log messages for the freeze vs. unfreeze callers.
 * Returns 0 on success, -1 on error.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
	int ret;
	struct lxc_async_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->path_con)
		return ret_set_errno(-1, EEXIST);

	/* Arm the waiter before flipping the switch so the state change
	 * cannot be missed.
	 */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->path_con, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI,
						      freezer_cgroup_events_cb,
						      default_cleanup_handler,
						      INT_TO_PTR(state_num),
						      "freezer_cgroup_events_cb");
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	/* Block until the callback observes the target state or @timeout
	 * expires.
	 */
	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
1992
/* Freeze a unified-layout container through cgroup.freeze. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
1999
2000 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2001 {
2002 if (!ops->hierarchies)
2003 return ret_set_errno(-1, ENOENT);
2004
2005 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2006 return cg_legacy_freeze(ops);
2007
2008 return cg_unified_freeze(ops, timeout);
2009 }
2010
2011 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2012 {
2013 struct hierarchy *h;
2014
2015 h = get_hierarchy(ops, "freezer");
2016 if (!h)
2017 return ret_set_errno(-1, ENOENT);
2018
2019 return lxc_write_openat(h->path_con, "freezer.state",
2020 "THAWED", STRLITERALLEN("THAWED"));
2021 }
2022
/* Thaw a unified-layout container through cgroup.freeze. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2029
2030 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2031 {
2032 if (!ops->hierarchies)
2033 return ret_set_errno(-1, ENOENT);
2034
2035 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2036 return cg_legacy_unfreeze(ops);
2037
2038 return cg_unified_unfreeze(ops, timeout);
2039 }
2040
/* Return the cgroup path for @controller relative to the hierarchy's
 * mountpoint: the limiting path when @limiting is true, otherwise the
 * inner (container) path. Returns NULL when the hierarchy exists but no
 * path is set; logs and returns NULL when the controller is unknown.
 */
static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
					const char *controller, bool limiting)
{
	struct hierarchy *h;
	size_t len;
	const char *path;

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_warn_errno(NULL, ENOENT,
				      "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));

	if (limiting)
		path = h->path_lim;
	else
		path = h->path_con;
	if (!path)
		return NULL;

	/* Strip the mount prefix from the stored absolute path. When at_mnt
	 * is not rooted at DEFAULT_CGROUP_MOUNTPOINT, the default mountpoint
	 * component is skipped first, then at_mnt's own length.
	 * NOTE(review): this assumes path_{lim,con} was built as
	 * "<mountpoint>/<at_mnt>/..." - confirm against make_cgroup_path()
	 * before changing.
	 */
	len = strlen(h->at_mnt);
	if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
		       STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
		path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
		path += strspn(path, "/");
	}
	return path += len;
}
2068
2069 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2070 const char *controller)
2071 {
2072 return cgfsng_get_cgroup_do(ops, controller, false);
2073 }
2074
2075 __cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
2076 const char *controller)
2077 {
2078 return cgfsng_get_cgroup_do(ops, controller, true);
2079 }
2080
2081 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2082 * which must be freed by the caller.
2083 */
2084 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2085 const char *inpath,
2086 const char *filename)
2087 {
2088 return make_cgroup_path(h, inpath, filename, NULL);
2089 }
2090
/*
 * Attach @pid to the unified cgroup referred to by @unified_fd.
 *
 * First try to create and join a ".lxc" leaf cgroup (falling back to the
 * cgroup itself). If the write to cgroup.procs fails with EBUSY the target
 * is a non-leaf cgroup with controllers enabled, so retry with numbered
 * ".lxc-<n>" leaf cgroups instead.
 *
 * Returns 0 on success, a negative value on error.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Prefer the ".lxc" leaf; fall back to the cgroup itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if ((size_t)ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily terminate the string after ".lxc-<idx>". */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			rm = true;

		/* Restore the full ".lxc-<idx>/cgroup.procs" path. */
		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		/* Clean up a cgroup we created but failed to join. */
		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2161
2162 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2163 int unified_fd, int *sk_fd)
2164 {
2165 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2166 int target_fds[2];
2167 ssize_t ret;
2168
2169 /* Create leaf cgroup. */
2170 ret = mkdirat(unified_fd, ".lxc", 0755);
2171 if (ret < 0 && errno != EEXIST)
2172 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2173
2174 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2175 if (target_fd0 < 0)
2176 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2177 target_fds[0] = target_fd0;
2178
2179 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2180 if (target_fd1 < 0)
2181 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2182 target_fds[1] = target_fd1;
2183
2184 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2185 if (ret <= 0)
2186 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2187 target_fd0, target_fd1);
2188
2189 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2190 }
2191
2192 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2193 int *sk_fd, pid_t pid)
2194 {
2195 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2196 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2197 size_t pidstr_len;
2198 ssize_t ret;
2199
2200 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
2201 if (ret < 0)
2202 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2203
2204 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2205
2206 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2207 if (ret > 0 && (size_t)ret == pidstr_len)
2208 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2209
2210 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2211 if (ret > 0 && (size_t)ret == pidstr_len)
2212 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2213
2214 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2215 target_fd0, target_fd1);
2216 }
2217
/* Arguments shared by the unified cgroup attach wrappers below. */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container configuration */
	int unified_fd;			/* fd of the container's unified cgroup */
	int sk_pair[2];			/* socketpair: [0] parent end, [1] child end */
	pid_t pid;			/* process to attach */
};
2224
2225 static int cgroup_unified_attach_child_wrapper(void *data)
2226 {
2227 struct userns_exec_unified_attach_data *args = data;
2228
2229 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2230 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2231 return ret_errno(EINVAL);
2232
2233 close_prot_errno_disarm(args->sk_pair[0]);
2234 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2235 &args->sk_pair[1]);
2236 }
2237
2238 static int cgroup_unified_attach_parent_wrapper(void *data)
2239 {
2240 struct userns_exec_unified_attach_data *args = data;
2241
2242 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2243 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2244 return ret_errno(EINVAL);
2245
2246 close_prot_errno_disarm(args->sk_pair[1]);
2247 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2248 args->pid);
2249 }
2250
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 *
 * Returns 0 on success (including "container not running"), a negative
 * value on error.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* First try letting the running container's command server attach us. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = make_cgroup_path(h, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/* With an id mapping the attach must run in the container's userns. */
	if (!list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2312
/*
 * Attach @pid to the container's cgroup in every hierarchy this driver
 * manages. Returns true on success, false on error.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Nothing to do when no hierarchies were detected. */
	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		/* cgroup2 needs special leaf-cgroup handling. */
		if (h->fs_type == UNIFIED_HIERARCHY) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		if (!path) {
			/*
			 * Someone might have created a name=<controller>
			 * controller after the container has started and so
			 * the container doesn't make use of this controller.
			 *
			 * Link: https://github.com/lxc/lxd/issues/8577
			 */
			TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
			continue;
		}

		/* Legacy hierarchies: write the pid into cgroup.procs. */
		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2366
2367 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2368 * don't have a cgroup_data set up, so we ask the running container through the
2369 * commands API for the cgroup path.
2370 */
2371 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2372 char *value, size_t len, const char *name,
2373 const char *lxcpath)
2374 {
2375 __do_free char *path = NULL;
2376 __do_free char *controller = NULL;
2377 char *p;
2378 struct hierarchy *h;
2379 int ret = -1;
2380
2381 if (!ops)
2382 return ret_set_errno(-1, ENOENT);
2383
2384 controller = strdup(filename);
2385 if (!controller)
2386 return ret_errno(ENOMEM);
2387
2388 p = strchr(controller, '.');
2389 if (p)
2390 *p = '\0';
2391
2392 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2393 /* not running */
2394 if (!path)
2395 return -1;
2396
2397 h = get_hierarchy(ops, controller);
2398 if (h) {
2399 __do_free char *fullpath = NULL;
2400
2401 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2402 ret = lxc_read_from_file(fullpath, value, len);
2403 }
2404
2405 return ret;
2406 }
2407
2408 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2409 {
2410 for (int count = 0; count < 3; count++, val++) {
2411 switch (*val) {
2412 case 'r':
2413 device->access[count] = *val;
2414 break;
2415 case 'w':
2416 device->access[count] = *val;
2417 break;
2418 case 'm':
2419 device->access[count] = *val;
2420 break;
2421 case '\n':
2422 case '\0':
2423 count = 3;
2424 break;
2425 default:
2426 return ret_errno(EINVAL);
2427 }
2428 }
2429
2430 return 0;
2431 }
2432
2433 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2434 const char *val)
2435 {
2436 size_t count;
2437 int ret;
2438 char temp[50];
2439
2440 if (strequal("devices.allow", key))
2441 device->allow = 1; /* allow the device */
2442 else
2443 device->allow = 0; /* deny the device */
2444
2445 if (strequal(val, "a")) {
2446 /* global rule */
2447 device->type = 'a';
2448 device->major = -1;
2449 device->minor = -1;
2450 return 0;
2451 }
2452
2453 switch (*val) {
2454 case 'a':
2455 __fallthrough;
2456 case 'b':
2457 __fallthrough;
2458 case 'c':
2459 device->type = *val;
2460 break;
2461 default:
2462 return -1;
2463 }
2464
2465 val++;
2466 if (!isspace(*val))
2467 return -1;
2468 val++;
2469 if (*val == '*') {
2470 device->major = -1;
2471 val++;
2472 } else if (isdigit(*val)) {
2473 memset(temp, 0, sizeof(temp));
2474 for (count = 0; count < sizeof(temp) - 1; count++) {
2475 temp[count] = *val;
2476 val++;
2477 if (!isdigit(*val))
2478 break;
2479 }
2480 ret = lxc_safe_int(temp, &device->major);
2481 if (ret)
2482 return -1;
2483 } else {
2484 return -1;
2485 }
2486 if (*val != ':')
2487 return -1;
2488 val++;
2489
2490 /* read minor */
2491 if (*val == '*') {
2492 device->minor = -1;
2493 val++;
2494 } else if (isdigit(*val)) {
2495 memset(temp, 0, sizeof(temp));
2496 for (count = 0; count < sizeof(temp) - 1; count++) {
2497 temp[count] = *val;
2498 val++;
2499 if (!isdigit(*val))
2500 break;
2501 }
2502 ret = lxc_safe_int(temp, &device->minor);
2503 if (ret)
2504 return -1;
2505 } else {
2506 return -1;
2507 }
2508 if (!isspace(*val))
2509 return -1;
2510
2511 return device_cgroup_parse_access(device, ++val);
2512 }
2513
2514 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2515 * don't have a cgroup_data set up, so we ask the running container through the
2516 * commands API for the cgroup path.
2517 */
2518 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2519 const char *key, const char *value,
2520 const char *name, const char *lxcpath)
2521 {
2522 __do_free char *path = NULL;
2523 __do_free char *controller = NULL;
2524 char *p;
2525 struct hierarchy *h;
2526 int ret = -1;
2527
2528 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2529 is_empty_string(name) || is_empty_string(lxcpath))
2530 return ret_errno(EINVAL);
2531
2532 controller = strdup(key);
2533 if (!controller)
2534 return ret_errno(ENOMEM);
2535
2536 p = strchr(controller, '.');
2537 if (p)
2538 *p = '\0';
2539
2540 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2541 struct device_item device = {};
2542
2543 ret = device_cgroup_rule_parse(&device, key, value);
2544 if (ret < 0)
2545 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2546 key, value);
2547
2548 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2549 if (ret < 0)
2550 return -1;
2551
2552 return 0;
2553 }
2554
2555 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2556 /* not running */
2557 if (!path)
2558 return -1;
2559
2560 h = get_hierarchy(ops, controller);
2561 if (h) {
2562 __do_free char *fullpath = NULL;
2563
2564 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2565 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2566 }
2567
2568 return ret;
2569 }
2570
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line by stat(2)-ing the path, filling in @device. Return <0 on error.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	path = strdup(devpath);
	if (!path)
		return ret_errno(ENOMEM);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		/* Terminate the current token in place. */
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		/* Skip any additional separating spaces. */
		while (*p == ' ')
			p++;

		mode = p;

		/* A trailing space with no mode after it is invalid. */
		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* Determine type and device numbers from the filesystem node. */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;

	return 0;
}
2643
2644 static int convert_devpath(const char *invalue, char *dest)
2645 {
2646 struct device_item device = {};
2647 int ret;
2648
2649 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2650 if (ret < 0)
2651 return -1;
2652
2653 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2654 device.minor, device.access);
2655 if (ret < 0)
2656 return log_error_errno(ret, -ret,
2657 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2658 device.type, device.major, device.minor,
2659 device.access);
2660
2661 return 0;
2662 }
2663
2664 /* Called from setup_limits - here we have the container's cgroup_data because
2665 * we created the cgroups.
2666 */
2667 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2668 const char *value, bool is_cpuset)
2669 {
2670 __do_free char *controller = NULL;
2671 char *p;
2672 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2673 char converted_value[50];
2674 struct hierarchy *h;
2675
2676 controller = strdup(filename);
2677 if (!controller)
2678 return ret_errno(ENOMEM);
2679
2680 p = strchr(controller, '.');
2681 if (p)
2682 *p = '\0';
2683
2684 if (strequal("devices.allow", filename) && value[0] == '/') {
2685 int ret;
2686
2687 ret = convert_devpath(value, converted_value);
2688 if (ret < 0)
2689 return ret;
2690 value = converted_value;
2691 }
2692
2693 h = get_hierarchy(ops, controller);
2694 if (!h)
2695 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2696
2697 if (is_cpuset) {
2698 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2699 if (ret)
2700 return ret;
2701 }
2702 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2703 }
2704
2705 /*
2706 * Return the list of cgroup_settings sorted according to the following rules
2707 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
2708 */
2709 static void sort_cgroup_settings(struct lxc_conf *conf)
2710 {
2711 LIST_HEAD(memsw_list);
2712 struct lxc_cgroup *cgroup, *ncgroup;
2713
2714 /* Iterate over the cgroup settings and copy them to the output list. */
2715 list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
2716 if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes"))
2717 continue;
2718
2719 /* Move the memsw entry from the cgroup settings list. */
2720 list_move_tail(&cgroup->head, &memsw_list);
2721 }
2722
2723 /*
2724 * Append all the memsw entries to the end of the cgroup settings list
2725 * to make sure they are applied after all memory limit settings.
2726 */
2727 list_splice_tail(&memsw_list, &conf->cgroup);
2728
2729 }
2730
/*
 * Apply the legacy (cgroup v1) limits from @conf to the container's
 * cgroups. When @do_devices is true only "devices" settings are applied,
 * otherwise only non-devices settings are. Returns true on success.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	struct list_head *cgroup_settings;
	struct lxc_cgroup *cgroup;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	/* Legacy settings are meaningless on a cgroup2-only host. */
	if (pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");

	/* Ensure memory.memsw.* is applied after memory.limit_in_bytes. */
	sort_cgroup_settings(conf);
	list_for_each_entry(cgroup, cgroup_settings, head) {
		/* Select only devices or only non-devices entries per pass. */
		if (do_devices == strnequal("devices", cgroup->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) {
				/* Refused device rules are warnings, not errors. */
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
				return false;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value);
		}
	}

	INFO("Limits for the legacy cgroup hierarchies have been setup");
	return true;
}
2772
2773 /*
2774 * Some of the parsing logic comes from the original cgroup device v1
2775 * implementation in the kernel.
2776 */
2777 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2778 struct lxc_conf *conf, const char *key,
2779 const char *val)
2780 {
2781 struct device_item device_item = {};
2782 int ret;
2783
2784 if (strequal("devices.allow", key) && abspath(val))
2785 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2786 else
2787 ret = device_cgroup_rule_parse(&device_item, key, val);
2788 if (ret < 0)
2789 return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2790
2791 /*
2792 * Note that bpf_list_add_device() returns 1 if it altered the device
2793 * list and 0 if it didn't; both return values indicate success.
2794 * Only a negative return value indicates an error.
2795 */
2796 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2797 if (ret < 0)
2798 return -1;
2799
2800 return 0;
2801 }
2802
/*
 * Apply the cgroup2 limits from the handler's config to the container's
 * unified cgroup. Device rules are prepared for bpf enforcement; all other
 * keys are written directly into the limiting cgroup. Returns true on
 * success.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct list_head *cgroup_settings;
	struct hierarchy *h;
	struct lxc_conf *conf;
	struct lxc_cgroup *cgroup;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (list_empty(cgroup_settings))
		return true;

	/* cgroup2 limits only make sense on a pure cgroup2 host. */
	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	list_for_each_entry(cgroup, cgroup_settings, head) {
		int ret;

		/* "devices" keys are handled via bpf, not cgroup files. */
		if (strnequal("devices", cgroup->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cgroup->subsystem, cgroup->value);
		else
			ret = lxc_write_openat(h->path_lim, cgroup->subsystem, cgroup->value, strlen(cgroup->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);

		TRACE("Set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
2850
2851 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2852 {
2853 struct lxc_conf *conf;
2854 struct hierarchy *unified;
2855
2856 if (!ops)
2857 return ret_set_errno(false, ENOENT);
2858
2859 if (!ops->hierarchies)
2860 return true;
2861
2862 if (!ops->container_cgroup)
2863 return ret_set_errno(false, EEXIST);
2864
2865 if (!handler || !handler->conf)
2866 return ret_set_errno(false, EINVAL);
2867 conf = handler->conf;
2868
2869 unified = ops->unified;
2870 if (!unified || !device_utility_controller(unified) ||
2871 !unified->path_con || list_empty(&(conf->bpf_devices).devices))
2872 return true;
2873
2874 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2875 }
2876
/*
 * Enable all detected controllers for delegation in every
 * "cgroup.subtree_control" file on the path from the cgroup2 mount down to
 * @cgroup, writing a string like "+memory +pids +cpu +io" at each level.
 * Returns true on success (or when there is nothing to do).
 */
static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_close int dfd_final = -EBADF;
	__do_free char *add_controllers = NULL, *copy = NULL;
	size_t full_len = 0;
	struct hierarchy *unified;
	int dfd_cur, ret;
	char *cur;
	char **it;

	if (!ops->hierarchies || !pure_unified_layout(ops))
		return true;

	/* No controllers to delegate. */
	unified = ops->unified;
	if (!unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* First iteration: start from an empty string. */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		/* Separate entries with a space except after the last one. */
		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	/* lxc_iterate_parts() mutates its argument, so work on a copy. */
	copy = strdup(cgroup);
	if (!copy)
		return false;

	/*
	 * Placing the write to cgroup.subtree_control before the open() is
	 * intentional because of the cgroup2 delegation model. It enforces
	 * that leaf cgroups don't have any controllers enabled for delegation.
	 */
	dfd_cur = unified->dfd_base;
	lxc_iterate_parts(cur, copy, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
		if (ret < 0)
			return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		/* Descend one level; keep the base fd open for the caller. */
		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
		if (dfd_cur != unified->dfd_base)
			close(dfd_cur);
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	return true;
}
2956
/* Enable delegated controllers along the monitor cgroup's path. */
__cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
}
2964
/* Enable delegated controllers along the container cgroup's path. */
__cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
}
2972
/* A /proc/<pid>/cgroup entry for cgroup2 carries hierarchy ID 0. */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2977
/*
 * Extract the current cgroup from a "0::<path>" line of /proc/<pid>/cgroup
 * and return it as a newly-allocated path relative to the cgroup2 mount.
 * Returns an ERR_PTR()-encoded negative errno on failure; caller frees.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *current_cgroup;

	line += STRLITERALLEN("0::");

	/* The path reported after "0::" must be absolute. */
	if (!abspath(line))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		line = prune_init_scope(line);

	/* create a relative path */
	line = deabs(line);

	current_cgroup = strdup(line);
	if (!current_cgroup)
		return ERR_PTR(-ENOMEM);

	return current_cgroup;
}
3000
/* Strip a leading "name=" prefix from a controller spec, if present. */
static inline const char *unprefix(const char *controllers)
{
	if (!strnequal(controllers, "name=", STRLITERALLEN("name=")))
		return controllers;

	return controllers + STRLITERALLEN("name=");
}
3007
/*
 * Build the list of cgroup files whose ownership must be delegated,
 * reading /sys/kernel/cgroup/delegate when available and falling back to a
 * hard-coded standard set otherwise. Returns 0 on success (the list is
 * stored in *@delegate), <0 on error.
 */
static int __list_cgroup_delegate(char ***delegate)
{
	__do_free char **list = NULL;
	__do_free char *buf = NULL;
	/* Fallback when the kernel doesn't expose the delegate file. */
	char *standard[] = {
		"cgroup.procs",
		"cgroup.threads",
		"cgroup.subtree_control",
		"memory.oom.group",
		NULL,
	};
	char *token;
	int ret;

	buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
	if (!buf) {
		for (char **p = standard; p && *p; p++) {
			ret = list_add_string(&list, *p);
			if (ret < 0)
				return ret;
		}

		*delegate = move_ptr(list);
		return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
	}

	lxc_iterate_parts(token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2.
		 */
		if (strequal(token, "cgroup.procs"))
			continue;

		ret = list_add_string(&list, token);
		if (ret < 0)
			return ret;
	}

	*delegate = move_ptr(list);
	return 0;
}
3050
3051 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3052 {
3053 __do_free_string_list char **list = NULL;
3054 int ret;
3055
3056 ret = __list_cgroup_delegate(&list);
3057 if (ret < 0)
3058 return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");
3059
3060 for (char *const *s = list; s && *s; s++) {
3061 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3062 continue;
3063
3064 return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
3065 }
3066
3067 *ret_files = move_ptr(list);
3068 return true;
3069 }
3070
/* A legacy hierarchy counts as delegated when its directory is writable. */
static bool legacy_hierarchy_delegated(int dfd_base)
{
	/* A missing directory (ENOENT) is tolerated. */
	if (faccessat(dfd_base, ".", W_OK, 0) < 0 && errno != ENOENT)
		return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");

	return true;
}
3081
3082 /**
3083 * systemd guarantees that the order of co-mounted controllers is stable. On
3084 * some systems the order of the controllers might be reversed though.
3085 *
3086 * For example, this is how the order is mismatched on CentOS 7:
3087 *
3088 * [root@localhost ~]# cat /proc/self/cgroup
3089 * 11:perf_event:/
3090 * 10:pids:/
3091 * 9:freezer:/
3092 * >>>> 8:cpuacct,cpu:/
3093 * 7:memory:/
3094 * 6:blkio:/
3095 * 5:devices:/
3096 * 4:hugetlb:/
3097 * >>>> 3:net_prio,net_cls:/
3098 * 2:cpuset:/
3099 * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope
3100 *
3101 * whereas the mountpoint:
3102 *
3103 * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755
3104 * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
3105 * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
3106 * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
3107 * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
3108 * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
3109 * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
3110 * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
3111 * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
3112 * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
3113 * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
3114 * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
3115 *
3116 * Ensure that we always use the systemd-guaranteed stable order when checking
3117 * for the mountpoint.
3118 */
__attribute__((returns_nonnull)) __attribute__((nonnull))
static const char *stable_order(const char *controllers)
{
	/* Map known-reversed co-mount orders back to the systemd order. */
	if (strequal(controllers, "cpuacct,cpu"))
		return "cpu,cpuacct";
	if (strequal(controllers, "net_prio,net_cls"))
		return "net_cls,net_prio";

	/* Everything else only needs a possible "name=" prefix stripped. */
	return unprefix(controllers);
}
3130
/*
 * Parse /proc/<pid>/cgroup and register every delegated hierarchy with @ops.
 *
 * @ops:          cgroup operations context; ops->dfd_mnt must already refer
 *                to the cgroup mount root (e.g. /sys/fs/cgroup).
 * @relative:     when true, use our own cgroup as base instead of escaping
 *                to PID 1's cgroup.
 * @unprivileged: whether the container uses an id mapping.
 *                NOTE(review): this parameter appears unused in the visible
 *                body — confirm against callers/history whether it can go.
 *
 * Returns 0 on success, a negative errno-style value on error. Hierarchies
 * that are unmounted, not delegated, or not requested are silently skipped.
 */
static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
				bool unprivileged)
{
	__do_free char *cgroup_info = NULL;
	char *it;

	/*
	 * Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!cgroup_info)
		return ret_errno(ENOMEM);

	/* One line per hierarchy: "<id>:<controllers>:<cgroup-path>". */
	lxc_iterate_parts(it, cgroup_info, "\n") {
		__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
		__do_free char *controllers = NULL, *current_cgroup = NULL;
		__do_free_string_list char **controller_list = NULL,
					   **delegate = NULL;
		char *line;
		int dfd, ret, type;

		/* Handle the unified cgroup hierarchy. */
		line = it;
		if (unified_cgroup(line)) {
			char *unified_mnt;

			type = UNIFIED_HIERARCHY;

			current_cgroup = current_unified_cgroup(relative, line);
			if (IS_ERR(current_cgroup))
				return PTR_ERR(current_cgroup);

			/*
			 * Pure cgroup2 system: ops->dfd_mnt is the unified
			 * mount itself. Hybrid system: cgroup2 lives in a
			 * "unified" subdirectory of the cgroup1 tmpfs.
			 */
			if (unified_cgroup_fd(ops->dfd_mnt)) {
				dfd_mnt = dup_cloexec(ops->dfd_mnt);
				unified_mnt = "";
			} else {
				dfd_mnt = open_at(ops->dfd_mnt,
						  "unified",
						  PROTECT_OPATH_DIRECTORY,
						  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
				unified_mnt = "unified";
			}
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/unified", ops->dfd_mnt);

				SYSTRACE("Unified cgroup not mounted");
				continue;
			}
			dfd = dfd_mnt;

			/* Open our current cgroup beneath the mount, if any. */
			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			/* Skip the unified hierarchy unless it was delegated to us. */
			if (!unified_hierarchy_delegated(dfd, &delegate))
				continue;

			controller_list = unified_controllers(dfd, "cgroup.controllers");
			if (!controller_list) {
				TRACE("No controllers are enabled for delegation in the unified hierarchy");
				controller_list = list_new();
				if (!controller_list)
					return syserror_set(-ENOMEM, "Failed to create empty controller list");
			}

			controllers = strdup(unified_mnt);
			if (!controllers)
				return ret_errno(ENOMEM);
		} else {
			char *__controllers, *__current_cgroup;

			type = LEGACY_HIERARCHY;

			/* Split "<id>:<controllers>:<path>" in place. */
			__controllers = strchr(line, ':');
			if (!__controllers)
				return ret_errno(EINVAL);
			__controllers++;

			__current_cgroup = strchr(__controllers, ':');
			if (!__current_cgroup)
				return ret_errno(EINVAL);
			*__current_cgroup = '\0';
			__current_cgroup++;

			/* Normalize to the systemd-stable mountpoint name. */
			controllers = strdup(stable_order(__controllers));
			if (!controllers)
				return ret_errno(ENOMEM);

			dfd_mnt = open_at(ops->dfd_mnt,
					  controllers,
					  PROTECT_OPATH_DIRECTORY,
					  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/%s",
							ops->dfd_mnt, controllers);

				SYSTRACE("%s not mounted", controllers);
				continue;
			}
			dfd = dfd_mnt;

			/* /proc/<pid>/cgroup paths must be absolute. */
			if (!abspath(__current_cgroup))
				return ret_errno(EINVAL);

			/* remove init.scope */
			if (!relative)
				__current_cgroup = prune_init_scope(__current_cgroup);

			/* create a relative path */
			__current_cgroup = deabs(__current_cgroup);

			current_cgroup = strdup(__current_cgroup);
			if (!current_cgroup)
				return ret_errno(ENOMEM);

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			/* Skip hierarchies that were not delegated to us. */
			if (!legacy_hierarchy_delegated(dfd))
				continue;

			/*
			 * We intentionally pass __current_cgroup here and not
			 * controllers because we would otherwise chop the
			 * mountpoint.
			 */
			controller_list = list_add_controllers(__controllers);
			if (!controller_list)
				return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);

			/* Honor lxc.cgroup.use restrictions. */
			if (skip_hierarchy(ops, controller_list))
				continue;

			ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		}

		ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
					   current_cgroup, controller_list, type);
		if (ret < 0)
			return syserror_ret(ret, "Failed to add %s hierarchy", controllers);

		/* Transfer ownership. */
		move_fd(dfd_mnt);
		move_fd(dfd_base);
		move_ptr(current_cgroup);
		move_ptr(controllers);
		move_ptr(controller_list);
		if (type == UNIFIED_HIERARCHY)
			ops->unified->delegate = move_ptr(delegate);
	}

	/* determine cgroup layout */
	if (ops->unified) {
		if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			/* Both cgroup1 and cgroup2 hierarchies were found. */
			ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else {
			if (bpf_devices_cgroup_supported())
				ops->unified->utilities |= DEVICES_CONTROLLER;
			ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
		}
	}

	if (!controllers_available(ops))
		return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");

	return 0;
}
3329
3330 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3331 {
3332 __do_close int dfd = -EBADF;
3333 int ret;
3334 const char *controllers_use;
3335
3336 if (ops->dfd_mnt >= 0)
3337 return ret_errno(EBUSY);
3338
3339 /*
3340 * I don't see the need for allowing symlinks here. If users want to
3341 * have their hierarchy available in different locations I strongly
3342 * suggest bind-mounts.
3343 */
3344 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3345 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3346 if (dfd < 0)
3347 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3348
3349 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3350 if (controllers_use) {
3351 __do_free char *dup = NULL;
3352 char *it;
3353
3354 dup = strdup(controllers_use);
3355 if (!dup)
3356 return -errno;
3357
3358 lxc_iterate_parts(it, dup, ",") {
3359 ret = list_add_string(&ops->cgroup_use, it);
3360 if (ret < 0)
3361 return ret;
3362 }
3363 }
3364
3365 /*
3366 * Keep dfd referenced by the cleanup function and actually move the fd
3367 * once we know the initialization succeeded. So if we fail we clean up
3368 * the dfd.
3369 */
3370 ops->dfd_mnt = dfd;
3371
3372 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
3373 if (ret < 0)
3374 return syserror_ret(ret, "Failed to initialize cgroups");
3375
3376 /* Transfer ownership to cgroup_ops. */
3377 move_fd(dfd);
3378 return 0;
3379 }
3380
3381 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3382 {
3383 const char *cgroup_pattern;
3384
3385 if (!ops)
3386 return ret_set_errno(-1, ENOENT);
3387
3388 /* copy system-wide cgroup information */
3389 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3390 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3391 ops->cgroup_pattern = strdup(cgroup_pattern);
3392 if (!ops->cgroup_pattern)
3393 return ret_errno(ENOMEM);
3394 }
3395
3396 return 0;
3397 }
3398
3399 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3400 {
3401 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
3402
3403 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3404 if (!cgfsng_ops)
3405 return ret_set_errno(NULL, ENOMEM);
3406
3407 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3408 cgfsng_ops->dfd_mnt = -EBADF;
3409
3410 if (initialize_cgroups(cgfsng_ops, conf))
3411 return NULL;
3412
3413 cgfsng_ops->data_init = cgfsng_data_init;
3414 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3415 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3416 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3417 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3418 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3419 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3420 cgfsng_ops->payload_create = cgfsng_payload_create;
3421 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3422 cgfsng_ops->finalize = cgfsng_finalize;
3423 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3424 cgfsng_ops->get = cgfsng_get;
3425 cgfsng_ops->set = cgfsng_set;
3426 cgfsng_ops->freeze = cgfsng_freeze;
3427 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3428 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3429 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3430 cgfsng_ops->driver = "cgfsng";
3431 cgfsng_ops->version = "1.0.0";
3432 cgfsng_ops->attach = cgfsng_attach;
3433 cgfsng_ops->chown = cgfsng_chown;
3434 cgfsng_ops->mount = cgfsng_mount;
3435 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3436 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3437
3438 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3439 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3440 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3441
3442 return move_ptr(cgfsng_ops);
3443 }
3444
3445 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3446 {
3447 int ret;
3448
3449 if (!list_empty(&conf->id_map)) {
3450 struct userns_exec_unified_attach_data args = {
3451 .conf = conf,
3452 .unified_fd = fd_unified,
3453 .pid = pid,
3454 };
3455
3456 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3457 if (ret < 0)
3458 return -errno;
3459
3460 ret = userns_exec_minimal(conf,
3461 cgroup_unified_attach_parent_wrapper,
3462 &args,
3463 cgroup_unified_attach_child_wrapper,
3464 &args);
3465 } else {
3466 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3467 }
3468
3469 return ret;
3470 }
3471
/*
 * Attach @pid to every cgroup hierarchy of the running container @name by
 * fetching the container's cgroup fds over the command socket.
 *
 * Returns 0 on success; -ENOSYS when the command is unsupported (callers
 * fall back to __cgroup_attach_unified()); other negative values on error.
 */
static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
				const char *lxcpath, pid_t pid)
{
	/* ctx points at stack storage; put_cgroup_ctx() only releases its fds. */
	call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
	int ret;
	size_t idx;
	ssize_t pidstr_len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
	if (ret < 0)
		return ret_errno(ENOSYS);

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Attach to each hierarchy fd the container handed us. */
	for (idx = 0; idx < ctx->fd_len; idx++) {
		int dfd_con = ctx->fd[idx];

		/* cgroup2 may need a userns helper; cgroup1 is a plain write. */
		if (unified_cgroup_fd(dfd_con))
			ret = __unified_attach_fd(conf, dfd_con, pid);
		else
			ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
		if (ret)
			return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
		else
			TRACE("Attached to cgroup fd %d", dfd_con);
	}

	/* An empty fd list means there was nothing to attach to. */
	if (idx == 0)
		return syserror_set(-ENOENT, "Failed to attach to cgroups");

	TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
	return 0;
}
3508
3509 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3510 const char *lxcpath, pid_t pid)
3511 {
3512 __do_close int dfd_unified = -EBADF;
3513
3514 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3515 return ret_errno(EINVAL);
3516
3517 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3518 if (dfd_unified < 0)
3519 return ret_errno(ENOSYS);
3520
3521 return __unified_attach_fd(conf, dfd_unified, pid);
3522 }
3523
/*
 * Attach @pid to the cgroups of the running container @name. Tries all
 * hierarchies first and falls back to the unified hierarchy when the
 * many-hierarchy command is unsupported.
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	int ret;

	ret = __cgroup_attach_many(conf, name, lxcpath, pid);
	if (ret >= 0)
		return ret;

	/* Propagate real errors; only "not supported" triggers the fallback. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret))
		return ret;

	ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
		return ret_errno(ENOSYS);

	return ret;
}
3541
3542 /* Connects to command socket therefore isn't callable from command handler. */
3543 int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
3544 {
3545 __do_close int dfd = -EBADF;
3546 struct cgroup_fd fd = {
3547 .fd = -EBADF,
3548 };
3549 size_t len_controller;
3550 int ret;
3551
3552 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3553 is_empty_string(key))
3554 return ret_errno(EINVAL);
3555
3556 if ((buf && !len) || (len && !buf))
3557 return ret_errno(EINVAL);
3558
3559 len_controller = strcspn(key, ".");
3560 len_controller++; /* Don't forget the \0 byte. */
3561 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3562 return ret_errno(EINVAL);
3563 (void)strlcpy(fd.controller, key, len_controller);
3564
3565 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3566 if (ret < 0) {
3567 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3568 return ret;
3569
3570 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3571 if (dfd < 0) {
3572 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3573 return ret;
3574
3575 return ret_errno(ENOSYS);
3576 }
3577 fd.type = UNIFIED_HIERARCHY;
3578 fd.fd = move_fd(dfd);
3579 }
3580 dfd = move_fd(fd.fd);
3581
3582 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3583
3584 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3585 return ret_errno(EOPNOTSUPP);
3586 else
3587 ret = lxc_read_try_buf_at(dfd, key, buf, len);
3588
3589 return ret;
3590 }
3591
3592 /* Connects to command socket therefore isn't callable from command handler. */
3593 int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
3594 {
3595 __do_close int dfd = -EBADF;
3596 struct cgroup_fd fd = {
3597 .fd = -EBADF,
3598 };
3599 size_t len_controller;
3600 int ret;
3601
3602 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3603 is_empty_string(key) || is_empty_string(value))
3604 return ret_errno(EINVAL);
3605
3606 len_controller = strcspn(key, ".");
3607 len_controller++; /* Don't forget the \0 byte. */
3608 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3609 return ret_errno(EINVAL);
3610 (void)strlcpy(fd.controller, key, len_controller);
3611
3612 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3613 if (ret < 0) {
3614 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3615 return ret;
3616
3617 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3618 if (dfd < 0) {
3619 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3620 return ret;
3621
3622 return ret_errno(ENOSYS);
3623 }
3624 fd.type = UNIFIED_HIERARCHY;
3625 fd.fd = move_fd(dfd);
3626 }
3627 dfd = move_fd(fd.fd);
3628
3629 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3630
3631 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
3632 struct device_item device = {};
3633
3634 ret = device_cgroup_rule_parse(&device, key, value);
3635 if (ret < 0)
3636 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3637 key, value);
3638
3639 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3640 } else {
3641 ret = lxc_writeat(dfd, key, value, strlen(value));
3642 }
3643
3644 return ret;
3645 }
3646
3647 static int do_cgroup_freeze(int unified_fd,
3648 const char *state_string,
3649 int state_num,
3650 int timeout,
3651 const char *epoll_error,
3652 const char *wait_error)
3653 {
3654 __do_close int events_fd = -EBADF;
3655 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
3656 int ret;
3657 struct lxc_async_descr descr = {};
3658
3659 if (timeout != 0) {
3660 ret = lxc_mainloop_open(&descr);
3661 if (ret)
3662 return log_error_errno(-1, errno, "%s", epoll_error);
3663
3664 /* automatically cleaned up now */
3665 descr_ptr = &descr;
3666
3667 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3668 if (events_fd < 0)
3669 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3670
3671 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI,
3672 freezer_cgroup_events_cb,
3673 default_cleanup_handler,
3674 INT_TO_PTR(state_num),
3675 "freezer_cgroup_events_cb");
3676 if (ret < 0)
3677 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3678 }
3679
3680 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3681 if (ret < 0)
3682 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3683
3684 if (timeout != 0) {
3685 ret = lxc_mainloop(&descr, timeout);
3686 if (ret)
3687 return log_error_errno(-1, errno, "%s", wait_error);
3688 }
3689
3690 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3691 }
3692
/* Freeze the cgroup behind @unified_fd, waiting up to @timeout if non-zero. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "1", 1, timeout,
				"Failed to create epoll instance to wait for container freeze",
				"Failed to wait for container to be frozen");
}
3699
3700 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3701 {
3702 __do_close int unified_fd = -EBADF;
3703 int ret;
3704
3705 if (is_empty_string(name) || is_empty_string(lxcpath))
3706 return ret_errno(EINVAL);
3707
3708 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3709 if (unified_fd < 0)
3710 return ret_errno(ENOCGROUP2);
3711
3712 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3713 ret = __cgroup_freeze(unified_fd, timeout);
3714 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3715 return ret;
3716 }
3717
/* Unfreeze the cgroup behind @unified_fd, waiting up to @timeout if non-zero. */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	/* Bugfix: log messages were copy-pasted from the freeze path. */
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3724
3725 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3726 {
3727 __do_close int unified_fd = -EBADF;
3728 int ret;
3729
3730 if (is_empty_string(name) || is_empty_string(lxcpath))
3731 return ret_errno(EINVAL);
3732
3733 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3734 if (unified_fd < 0)
3735 return ret_errno(ENOCGROUP2);
3736
3737 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3738 ret = __cgroup_unfreeze(unified_fd, timeout);
3739 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3740 return ret;
3741 }