src/lxc/cgroups/cgfsng.c (cgroups: use __u32 for cpumasks)
/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#include "config.h"

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <unistd.h>

#include "cgroup.h"
#include "af_unix.h"
#include "caps.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "conf.h"
#include "error_utils.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "mount_utils.h"
#include "storage/storage.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"

#if !HAVE_STRLCPY
#include "strlcpy.h"
#endif

#if !HAVE_STRLCAT
#include "strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/*
 * Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Return the index of the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated) - or a negative errno value on
 * allocation failure.
 */
static int cg_list_add(void ***list)
{
	int idx = 0;
	void **p;

	if (*list)
		for (; (*list)[idx]; idx++)
			;

	p = realloc(*list, (idx + 2) * sizeof(void **));
	if (!p)
		return ret_errno(ENOMEM);

	p[idx + 1] = NULL;
	*list = p;

	return idx;
}
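
/*
 * A minimal usage sketch (illustrative only, not part of the build),
 * appending one entry to a NULL-terminated string list:
 *
 *	char **list = NULL;
 *	int idx;
 *
 *	idx = cg_list_add((void ***)&list);
 *	if (idx < 0)
 *		return idx;
 *	list[idx] = strdup("cpuset");	(list stays NULL-terminated)
 */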

/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (int i = 0; list[i]; i++)
		if (strequal(list[i], entry))
			return true;

	return false;
}

/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}

int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
{
	int dfd;
	const struct hierarchy *h;

	h = get_hierarchy(ops, fd->controller);
	if (!h)
		return ret_errno(ENOENT);

	/*
	 * The client requested that the controller must be in a specific
	 * cgroup version.
	 */
	if (fd->type != 0 && (cgroupfs_type_magic_t)fd->type != h->fs_type)
		return ret_errno(EINVAL);

	if (limit)
		dfd = h->dfd_lim;
	else
		dfd = h->dfd_con;
	if (dfd < 0)
		return ret_errno(EBADF);

	fd->layout = ops->cgroup_layout;
	fd->type = h->fs_type;
	if (fd->type == UNIFIED_HIERARCHY)
		fd->utilities = h->utilities;
	fd->fd = dfd;

	return 0;
}

/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 */
static int lxc_cpumask(char *buf, __u32 **bitarr, __u32 *last_set_bit)
{
	__do_free __u32 *arr_u32 = NULL;
	__u32 cur_last_set_bit = 0, nbits = 256;
	__u32 nr_u32;
	char *token;

	/* Size the array in 32-bit words, not longs, since we store __u32s. */
	nr_u32 = (nbits + 31) / 32;
	arr_u32 = zalloc(nr_u32 * sizeof(__u32));
	if (!arr_u32)
		return ret_errno(ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		__u32 last_bit, first_bit;
		char *range;

		errno = 0;
		first_bit = strtoul(token, NULL, 0);
		last_bit = first_bit;
		range = strchr(token, '-');
		if (range)
			last_bit = strtoul(range + 1, NULL, 0);

		if (!(first_bit <= last_bit))
			return ret_errno(EINVAL);

		if (last_bit >= nbits) {
			__u32 add_bits = last_bit - nbits + 32;
			__u32 new_nr_u32;
			__u32 *p;

			new_nr_u32 = (nbits + add_bits + 31) / 32;
			p = realloc(arr_u32, new_nr_u32 * sizeof(__u32));
			if (!p)
				return ret_errno(ENOMEM);
			arr_u32 = move_ptr(p);

			memset(arr_u32 + nr_u32, 0,
			       (new_nr_u32 - nr_u32) * sizeof(__u32));
			nbits += add_bits;
			nr_u32 = new_nr_u32;
		}

		while (first_bit <= last_bit)
			set_bit(first_bit++, arr_u32);

		if (last_bit > cur_last_set_bit)
			cur_last_set_bit = last_bit;
	}

	*last_set_bit = cur_last_set_bit;
	*bitarr = move_ptr(arr_u32);
	return 0;
}
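
/*
 * Worked example for lxc_cpumask() (values illustrative): parsing "0,2-3"
 * sets bits 0, 2, and 3 in the first 32-bit word, i.e. arr_u32[0] == 0xd
 * (binary 1101, with bit 0 the least significant bit), and *last_set_bit
 * ends up as 3. A cpulist such as "0-300" exceeds the initial 256 bits and
 * grows the array through the realloc path above before any bit is set.
 */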

static int lxc_cpumask_update(char *buf, __u32 *bitarr, __u32 last_set_bit,
			      bool clear)
{
	bool flipped = false;
	char *token;

	lxc_iterate_parts(token, buf, ",") {
		__u32 last_bit, first_bit;
		char *range;

		errno = 0;
		first_bit = strtoul(token, NULL, 0);
		last_bit = first_bit;
		range = strchr(token, '-');
		if (range)
			last_bit = strtoul(range + 1, NULL, 0);

		if (!(first_bit <= last_bit)) {
			WARN("The cpu range seems to be inverted: %u-%u", first_bit, last_bit);
			continue;
		}

		if (last_bit > last_set_bit)
			continue;

		while (first_bit <= last_bit) {
			if (clear && is_set(first_bit, bitarr)) {
				flipped = true;
				clear_bit(first_bit, bitarr);
			} else if (!clear && !is_set(first_bit, bitarr)) {
				flipped = true;
				set_bit(first_bit, bitarr);
			}

			first_bit++;
		}
	}

	if (flipped)
		return 1;

	return 0;
}

/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(__u32 *bitarr, __u32 last_set_bit)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(__u32)] = {0};
	int ret;

	for (__u32 bit = 0; bit <= last_set_bit; bit++) {
		if (!is_set(bit, bitarr))
			continue;

		ret = strnprintf(numstr, sizeof(numstr), "%u", bit);
		if (ret < 0)
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}
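
/*
 * A round-trip sketch tying the two helpers above together (illustrative
 * only; lxc_iterate_parts() tokenizes its buffer in place, hence the
 * writable array):
 *
 *	__do_free __u32 *mask = NULL;
 *	__do_free char *list = NULL;
 *	__u32 last = 0;
 *	char buf[] = "0,2-3";
 *
 *	if (!lxc_cpumask(buf, &mask, &last))
 *		list = lxc_cpumask_to_cpulist(mask, last);
 *	(list is now "0,2,3")
 */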

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->fs_type == UNIFIED_HIERARCHY;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_available(struct hierarchy **hlist, char *entry)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (string_in_list(hlist[i]->controllers, entry))
			return true;

	return false;
}

static bool controllers_available(struct cgroup_ops *ops)
{
	struct hierarchy **hlist;

	if (!ops->cgroup_use)
		return true;

	hlist = ops->hierarchies;
	for (char **cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_available(hlist, *cur))
			return log_error(false, "The %s controller was not found", *cur);

	return true;
}

static char **list_new(void)
{
	__do_free_string_list char **list = NULL;
	int idx;

	idx = cg_list_add((void ***)&list);
	if (idx < 0)
		return NULL;

	list[idx] = NULL;
	return move_ptr(list);
}

static int list_add_string(char ***list, char *entry)
{
	__do_free char *dup = NULL;
	int idx;

	dup = strdup(entry);
	if (!dup)
		return ret_errno(ENOMEM);

	idx = cg_list_add((void ***)list);
	if (idx < 0)
		return idx;

	(*list)[idx] = move_ptr(dup);
	return 0;
}

static char **list_add_controllers(char *controllers)
{
	__do_free_string_list char **list = NULL;
	char *it;

	lxc_iterate_parts(it, controllers, ", \t\n") {
		int ret;

		ret = list_add_string(&list, it);
		if (ret < 0)
			return NULL;
	}

	return move_ptr(list);
}

static char **unified_controllers(int dfd, const char *file)
{
	__do_free char *buf = NULL;

	buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
	if (!buf)
		return NULL;

	return list_add_controllers(buf);
}

static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
{
	if (!ops->cgroup_use)
		return false;

	for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
		bool found = false;

		for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
			if (!strequal(*cur_use, *cur_ctrl))
				continue;

			found = true;
			break;
		}

		if (found)
			continue;

		return true;
	}

	return false;
}

static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
				int dfd_base, char *base_cgroup,
				char **controllers, cgroupfs_type_magic_t fs_type)
{
	__do_free struct hierarchy *new = NULL;
	int idx;

	if (abspath(base_cgroup))
		return syserror_set(-EINVAL, "Container base path must be relative to controller mount");

	new = zalloc(sizeof(*new));
	if (!new)
		return ret_errno(ENOMEM);

	new->dfd_con = -EBADF;
	new->dfd_lim = -EBADF;
	new->dfd_mon = -EBADF;

	new->fs_type = fs_type;
	new->controllers = controllers;
	new->at_mnt = mnt;
	new->at_base = base_cgroup;

	new->dfd_mnt = dfd_mnt;
	new->dfd_base = dfd_base;

	TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
	      mnt, maybe_empty(base_cgroup));
	for (char *const *it = new->controllers; it && *it; it++)
		TRACE("The hierarchy contains the %s controller", *it);

	idx = cg_list_add((void ***)&ops->hierarchies);
	if (idx < 0)
		return idx;

	if (fs_type == UNIFIED_HIERARCHY)
		ops->unified = new;
	(ops->hierarchies)[idx] = move_ptr(new);

	return 0;
}

static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
{
	if (!path_prune || !hierarchies)
		return 0;

	for (int i = 0; hierarchies[i]; i++) {
		struct hierarchy *h = hierarchies[i];
		int ret;

		ret = cgroup_tree_prune(h->dfd_base, path_prune);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);

		free_equal(h->path_lim, h->path_con);
	}

	return 0;
}

struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *path_prune;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};

static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}

#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
				    bool am_initialized)
{
	__do_free char *cpulist = NULL, *isolcpus = NULL, *offlinecpus = NULL,
		       *posscpus = NULL;
	__do_free __u32 *possmask = NULL;
	int ret;
	__u32 poss_last_set_bit = 0;

	posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%d/cpuset.cpus\"", dfd_parent);

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		if (!isdigit(isolcpus[0]))
			free_disarm(isolcpus);
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		if (!isdigit(offlinecpus[0]))
			free_disarm(offlinecpus);
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	if (!isolcpus && !offlinecpus) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit);
	if (ret)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (isolcpus)
		ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true);

	if (offlinecpus)
		ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true);

	if (!ret) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}
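
/*
 * Example of the flow above (hypothetical host): with the parent's
 * cpuset.cpus being "0-7", __ISOL_CPUS containing "3-4", and no offline
 * cpus, the child's cpuset.cpus is initialized to "0,1,2,5,6,7". If
 * neither isolated nor offline cpus exist, the parent's value is copied
 * verbatim.
 */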

static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/* Determine whether the base cgroup has cpuset inheritance turned on. */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/* Initialize cpuset.cpus removing any isolated and offline cpus. */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}

static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Failed to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
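
/*
 * Example (path illustrative): __cgroup_tree_create(dfd, "lxc.payload.c1/cg",
 * 0755, false, false) walks the path one component at a time, mkdirat()ing
 * "lxc.payload.c1" (reusing it if it already exists) and then "cg" beneath
 * it, and returns an O_PATH fd for the final component. Only the final
 * component is required to be newly created, unless @eexist_ignore is set.
 */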

static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		h->dfd_lim = move_fd(fd_limit);

		TRACE("Created limit cgroup %d->%d(%s)",
		      h->dfd_lim, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_warn(false, "Failed to setup legacy device limits");

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
		}
		h->dfd_con = move_fd(fd_final);
		h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);

	} else {
		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_final < 0)
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

		if (payload) {
			h->dfd_con = move_fd(fd_final);
			h->dfd_lim = h->dfd_con;
			h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

			h->path_lim = h->path_con;
		} else {
			h->dfd_mon = move_fd(fd_final);
		}
	}

	return true;
}

static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
				   bool payload)
{
	bool prune = true;

	if (payload) {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_lim < 0)
			prune = false;

		free_equal(h->path_con, h->path_lim);
		close_equal(h->dfd_con, h->dfd_lim);
	} else {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_mon < 0)
			prune = false;

		close_prot_errno_disarm(h->dfd_mon);
	}

	/* We didn't create this cgroup. */
	if (!prune)
		return;

	if (cgroup_tree_prune(h->dfd_base, path_prune))
		SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
	else
		TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}

/*
 * Check that no conflicting cgroup directories are configured: either the
 * legacy lxc.cgroup.dir is set, or the newer monitor/payload directory
 * options are, but not both, and a payload directory must always come with
 * a monitor directory (and vice versa).
 *
 * Returns true if the configuration is consistent, false otherwise.
 */
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
	const char *monitor_dir = conf->cgroup_meta.monitor_dir,
		   *container_dir = conf->cgroup_meta.container_dir,
		   *namespace_dir = conf->cgroup_meta.namespace_dir;

	/* none of the new options are set, all is fine */
	if (!monitor_dir && !container_dir && !namespace_dir)
		return true;

	/* some are set, make sure lxc.cgroup.dir is not also set */
	if (conf->cgroup_meta.dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

	/* make sure both monitor and payload are set */
	if (!monitor_dir || !container_dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

	/* namespace_dir may be empty */
	return true;
}
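
/*
 * Example configuration exercising the checks above. The exact key
 * spellings are an assumption here; see lxc.container.conf(5) for the
 * authoritative names:
 *
 *	lxc.cgroup.dir.monitor = mymonitor
 *	lxc.cgroup.dir.container = mypayload
 *	lxc.cgroup.dir.container.inner = inner
 *
 * Setting lxc.cgroup.dir together with any of these is rejected.
 */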

__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.monitor_dir) {
		monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
	} else if (conf->cgroup_meta.dir) {
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.monitor_dir) {
		suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i],
					       monitor_cgroup, NULL, false))
				continue;

			DEBUG("Failed to create cgroup %s", monitor_cgroup);
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       monitor_cgroup, false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}
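
/*
 * Example of the resulting names (assuming the default "lxc.monitor."
 * prefix): for a container named "c1" the monitor cgroup is
 * "lxc.monitor.c1"; if that already exists the CGROUP_CREATE_RETRY suffix
 * is rewritten in place, yielding "lxc.monitor.c1-1", "lxc.monitor.c1-2",
 * and so on up to -999.
 */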

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved monitor into cgroup %d", h->dfd_mon);

		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved transient process into cgroup %d", h->dfd_mon);

		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->dfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (is_unified_hierarchy(h) &&
		    (handler->clone_flags & CLONE_INTO_CGROUP))
			continue;

		ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);

		TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
	}

	return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
		      gid_t chown_gid, mode_t chmod_mode)
{
	int ret;

	ret = fchownat(dirfd, path, chown_uid, chown_gid,
		       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
	if (ret < 0)
		return log_warn_errno(-1,
				      errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chown_uid,
				      (int)chown_gid);

	ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
	if (ret < 0)
		return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, 0)",
				      dirfd, path, (int)chmod_mode);

	return 0;
}

/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
				      struct lxc_conf *conf)
{
	struct generic_userns_exec_data wrap;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (list_empty(&conf->id_map))
		return true;

	wrap.origuid = geteuid();
	wrap.path = NULL;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = conf;

	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
		return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

	return true;
}

__cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
{
	if (!ops)
		return;

	if (!ops->hierarchies)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];

		/* Close all monitor cgroup file descriptors. */
		close_prot_errno_disarm(h->dfd_mon);
	}
	/* Close the cgroup root file descriptor. */
	close_prot_errno_disarm(ops->dfd_mnt);

	/*
	 * The checking for freezer support should obviously be done at cgroup
	 * initialization time but that doesn't work reliably. The freezer
	 * controller has been demoted (rightly so) to a simple file located in
	 * each non-root cgroup. At the time when the container is created we
	 * might still be located in /sys/fs/cgroup and so checking for
	 * cgroup.freeze won't tell us anything because this file doesn't exist
	 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
	 * find an already existing cgroup and then check within that cgroup
	 * for the existence of cgroup.freeze but that will only work on
	 * systemd based hosts. Other init systems might not manage cgroups and
	 * so no cgroup will exist. So we defer until we have created cgroups
	 * for our container which means we check here.
	 */
	if (pure_unified_layout(ops) &&
	    !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
		       AT_SYMLINK_NOFOLLOW)) {
		TRACE("Unified hierarchy supports freezer");
		ops->unified->utilities |= FREEZER_CONTROLLER;
	}
}

/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
{
	switch (cgroup_automount_type) {
	case LXC_AUTO_CGROUP_RO:
		return true;
	case LXC_AUTO_CGROUP_RW:
		return true;
	case LXC_AUTO_CGROUP_MIXED:
		return true;
	}

	return false;
}

/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
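
/*
 * Example of the resulting layout (paths illustrative) for the cpuset
 * hierarchy with cgroup:mixed: the tmpfs under the container's
 * /sys/fs/cgroup holds a "cpuset" directory that is bind-mounted over
 * itself and remounted read-only, and the container's own cgroup path
 * inside it is then bind-mounted writable on top, so only the container's
 * subtree can be modified from inside.
 */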

/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
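
/*
 * A sketch of what the fs_prepare()/fs_set_property()/fs_attach() helpers
 * boil down to in terms of the raw new-mount-API syscalls (Linux 5.2+;
 * simplified, with error handling and the beneath-lookup protection
 * omitted):
 *
 *	int fd_fs = fsopen("cgroup2", FSOPEN_CLOEXEC);
 *	fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int fd_mnt = fsmount(fd_fs, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOSUID |
 *			     MOUNT_ATTR_NODEV | MOUNT_ATTR_NOEXEC);
 *	move_mount(fd_mnt, "", dfd_target, "cgroup",
 *		   MOVE_MOUNT_F_EMPTY_PATH);
 */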

static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}

static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
				      struct lxc_rootfs *rootfs,
				      int dfd_mnt_cgroupfs,
				      const char *hierarchy_mnt)
{
	switch (cgroup_automount_type) {
	case LXC_AUTO_CGROUP_FULL_RO:
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		break;
	default:
		return 0;
	}

	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}

__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently ship init systems
		 * that do support cgroup2 but do not mount it by default
		 * unless explicitly told to, even if the host is cgroup2
		 * only. That means they often will fail to boot. Fix this by
		 * pre-mounting cgroup2 by default. We will likely need to be
		 * doing this a few years until all distros have switched over
		 * to cgroup2 at which point we can safely assume that their
		 * init systems will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserror_ret(false, "Failed to open %d(%s)",
					    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 * 1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 * 2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 * 3. cgroup:mixed:force -> See comment above how this
			 *                          does not apply so
			 *                          cgroup:mixed is equal to
			 *                          cgroup:rw when cgroup
			 *                          namespaces are supported.
			 *
			 * 4. cgroup:rw    -> No-op; init system responsible for mounting.
			 * 5. cgroup:ro    -> No-op; init system responsible for mounting.
			 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 * 7. cgroup-full:rw    -> Not supported.
			 * 8. cgroup-full:ro    -> Not supported.
			 * 9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace is supported (highly
			 * unlikely unless we're dealing with a Frankenkernel)
			 * or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 * 1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 * 2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 * 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem and
				 *                          make the parent directory of the
				 *                          container's cgroup read-only but the
				 *                          container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force    ->
				 * 11. cgroup-full:ro:force    ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserror_ret(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set mode property on tmpfs context %d", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set size property on tmpfs context %d", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserror_ret(false, "Failed to open %d(%s)",
				    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}

/* Only root needs to escape to the cgroup of its init. */
__cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
					    struct lxc_conf *conf)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (conf->cgroup_meta.relative || geteuid())
		return true;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		int ret;

		fullpath = make_cgroup_path(ops->hierarchies[i],
					    ops->hierarchies[i]->at_base,
					    "cgroup.procs", NULL);
		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
	}

	return true;
}
1843
1844 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1845 {
1846 int i = 0;
1847
1848 if (!ops)
1849 return ret_set_errno(-1, ENOENT);
1850
1851 if (!ops->hierarchies)
1852 return 0;
1853
1854 for (; ops->hierarchies[i]; i++)
1855 ;
1856
1857 return i;
1858 }
1859
1860 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1861 int n, char ***out)
1862 {
1863 int i;
1864
1865 if (!ops)
1866 return ret_set_errno(false, ENOENT);
1867
1868 if (!ops->hierarchies)
1869 return ret_set_errno(false, ENOENT);
1870
1871 /* consistency check n: entries 0 through n must all exist */
1872 for (i = 0; i <= n; i++)
1873 if (!ops->hierarchies[i])
1874 return ret_set_errno(false, ENOENT);
1875
1876 *out = ops->hierarchies[n]->controllers;
1877
1878 return true;
1879 }
1880
1881 static int cg_legacy_freeze(struct cgroup_ops *ops)
1882 {
1883 struct hierarchy *h;
1884
1885 h = get_hierarchy(ops, "freezer");
1886 if (!h)
1887 return ret_set_errno(-1, ENOENT);
1888
1889 return lxc_write_openat(h->path_con, "freezer.state",
1890 "FROZEN", STRLITERALLEN("FROZEN"));
1891 }
1892
1893 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1894 struct lxc_async_descr *descr)
1895 {
1896 __do_free char *line = NULL;
1897 __do_fclose FILE *f = NULL;
1898 int state = PTR_TO_INT(cbdata);
1899 size_t len = 0;
1900 const char *state_string;
1901
1902 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1903 if (!f)
1904 return LXC_MAINLOOP_ERROR;
1905
1906 if (state == 1)
1907 state_string = "frozen 1";
1908 else
1909 state_string = "frozen 0";
1910
1911 while (getline(&line, &len, f) != -1)
1912 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1913 return LXC_MAINLOOP_CLOSE;
1914
1917 return LXC_MAINLOOP_CONTINUE;
1918 }
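
/*
 * Background for the callback above: on cgroup2 the kernel exposes a
 * read-only cgroup.events file containing flat "key value" lines such as
 * "populated 1" and "frozen 0", and it signals changes via (E)POLLPRI.
 * A minimal standalone sketch of the same wait, using plain poll(2)
 * instead of the lxc mainloop (path and function name are hypothetical):
 *
 *	#include <poll.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static int wait_frozen(const char *events_path)
 *	{
 *		char buf[128];
 *		struct pollfd pfd;
 *		FILE *f = fopen(events_path, "re");
 *
 *		if (!f)
 *			return -1;
 *		pfd.fd = fileno(f);
 *		pfd.events = POLLPRI;
 *
 *		for (;;) {
 *			rewind(f);
 *			while (fgets(buf, sizeof(buf), f)) {
 *				if (strncmp(buf, "frozen 1", 8) == 0) {
 *					fclose(f);
 *					return 0;
 *				}
 *			}
 *			if (poll(&pfd, 1, -1) < 0) {
 *				fclose(f);
 *				return -1;
 *			}
 *		}
 *	}
 */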
1919
1920 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1921 const char *state_string,
1922 int state_num,
1923 const char *epoll_error,
1924 const char *wait_error)
1925 {
1926 __do_close int fd = -EBADF;
1927 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
1928 int ret;
1929 struct lxc_async_descr descr;
1930 struct hierarchy *h;
1931
1932 h = ops->unified;
1933 if (!h)
1934 return ret_set_errno(-1, ENOENT);
1935
1936 if (!h->path_con)
1937 return ret_set_errno(-1, EEXIST);
1938
1939 if (timeout != 0) {
1940 __do_free char *events_file = NULL;
1941
1942 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
1943 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1944 if (fd < 0)
1945 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1946
1947 ret = lxc_mainloop_open(&descr);
1948 if (ret)
1949 return log_error_errno(-1, errno, "%s", epoll_error);
1950
1951 /* automatically cleaned up now */
1952 descr_ptr = &descr;
1953
1954 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI,
1955 freezer_cgroup_events_cb,
1956 default_cleanup_handler,
1957 INT_TO_PTR(state_num),
1958 "freezer_cgroup_events_cb");
1959 if (ret < 0)
1960 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1961 }
1962
1963 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
1964 if (ret < 0)
1965 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
1966
1967 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1968 return log_error_errno(-1, errno, "%s", wait_error);
1969
1970 return 0;
1971 }
1972
1973 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1974 {
1975 return cg_unified_freeze_do(ops, timeout, "1", 1,
1976 "Failed to create epoll instance to wait for container freeze",
1977 "Failed to wait for container to be frozen");
1978 }
1979
1980 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1981 {
1982 if (!ops->hierarchies)
1983 return ret_set_errno(-1, ENOENT);
1984
1985 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1986 return cg_legacy_freeze(ops);
1987
1988 return cg_unified_freeze(ops, timeout);
1989 }
1990
1991 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
1992 {
1993 struct hierarchy *h;
1994
1995 h = get_hierarchy(ops, "freezer");
1996 if (!h)
1997 return ret_set_errno(-1, ENOENT);
1998
1999 return lxc_write_openat(h->path_con, "freezer.state",
2000 "THAWED", STRLITERALLEN("THAWED"));
2001 }
2002
2003 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2004 {
2005 return cg_unified_freeze_do(ops, timeout, "0", 0,
2006 "Failed to create epoll instance to wait for container unfreeze",
2007 "Failed to wait for container to be unfrozen");
2008 }
2009
2010 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2011 {
2012 if (!ops->hierarchies)
2013 return ret_set_errno(-1, ENOENT);
2014
2015 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2016 return cg_legacy_unfreeze(ops);
2017
2018 return cg_unified_unfreeze(ops, timeout);
2019 }
2020
2021 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2022 const char *controller, bool limiting)
2023 {
2024 struct hierarchy *h;
2025 size_t len;
2026 const char *path;
2027
2028 h = get_hierarchy(ops, controller);
2029 if (!h)
2030 return log_warn_errno(NULL, ENOENT,
2031 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
2032
2033 if (limiting)
2034 path = h->path_lim;
2035 else
2036 path = h->path_con;
2037 if (!path)
2038 return NULL;
2039
2040 len = strlen(h->at_mnt);
2041 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
2042 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2043 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2044 path += strspn(path, "/");
2045 }
2046 return path + len;
2047 }
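
/*
 * Example of the pointer arithmetic above (hypothetical values): with
 * h->at_mnt = "memory" and h->path_con =
 * "/sys/fs/cgroup/memory/lxc.payload.c1", at_mnt does not start with
 * DEFAULT_CGROUP_MOUNTPOINT, so the branch strips "/sys/fs/cgroup" plus
 * the following slashes from path, leaving "memory/lxc.payload.c1";
 * "path + len" then skips the mount directory and returns
 * "/lxc.payload.c1", i.e. the cgroup relative to the hierarchy mount.
 */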
2048
2049 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2050 const char *controller)
2051 {
2052 return cgfsng_get_cgroup_do(ops, controller, false);
2053 }
2054
2055 __cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
2056 const char *controller)
2057 {
2058 return cgfsng_get_cgroup_do(ops, controller, true);
2059 }
2060
2061 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2062 * which must be freed by the caller.
2063 */
2064 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2065 const char *inpath,
2066 const char *filename)
2067 {
2068 return make_cgroup_path(h, inpath, filename, NULL);
2069 }
2070
2071 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2072 {
2073 int idx = 1;
2074 int ret;
2075 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2076 ssize_t pidstr_len;
2077
2078 /* Create leaf cgroup. */
2079 ret = mkdirat(unified_fd, ".lxc", 0755);
2080 if (ret < 0 && errno != EEXIST)
2081 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2082
2083 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2084 if (pidstr_len < 0)
2085 return pidstr_len;
2086
2087 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2088 if (ret < 0)
2089 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2090 if (ret == 0)
2091 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2092
2093 /* this is a non-leaf node */
2094 if (errno != EBUSY)
2095 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2096
2097 do {
2098 bool rm = false;
2099 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2100 char *slash = attach_cgroup;
2101
2102 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2103 if (ret < 0)
2104 return ret;
2105
2106 /*
2107 * This shouldn't really happen but the compiler might complain
2108 * that a short write would cause a buffer overrun. So be on
2109 * the safe side.
2110 */
2111 if ((size_t)ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2112 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2113
2114 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2115 *slash = '\0';
2116
2117 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2118 if (ret < 0 && errno != EEXIST)
2119 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2120 if (ret == 0)
2121 rm = true;
2122
2123 *slash = '/';
2124
2125 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2126 if (ret == 0)
2127 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2128
2128 /* lxc_writeat() failed; save errno since unlinkat() below may clobber it. */
2128 int saved_errno = errno;
2128
2129 /* Truncate ".lxc-N/cgroup.procs" back to the directory itself. */
2129 *slash = '\0';
2129 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2130 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2131
2132 /* this is a non-leaf node */
2133 if (saved_errno != EBUSY)
2134 return log_error_errno(-1, saved_errno, "Failed to attach to unified cgroup");
2135
2136 idx++;
2137 } while (idx < 1000);
2138
2139 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2140 }
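
/*
 * Why the EBUSY dance above is needed: cgroup2 enforces the "no internal
 * processes" rule, i.e. a cgroup that has controllers enabled in
 * cgroup.subtree_control may not contain processes itself; only its
 * leaves may. A sketch of the layout this function produces:
 *
 *	lxc.payload.c1/		container cgroup; attaching here fails
 *				with EBUSY once controllers are delegated
 *	lxc.payload.c1/.lxc	preferred leaf for attached processes
 *	lxc.payload.c1/.lxc-1	fallback leaves, tried in order up to
 *	lxc.payload.c1/.lxc-2	.lxc-999
 */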
2141
2142 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2143 int unified_fd, int *sk_fd)
2144 {
2145 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2146 int target_fds[2];
2147 ssize_t ret;
2148
2149 /* Create leaf cgroup. */
2150 ret = mkdirat(unified_fd, ".lxc", 0755);
2151 if (ret < 0 && errno != EEXIST)
2152 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2153
2154 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2155 if (target_fd0 < 0)
2156 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2157 target_fds[0] = target_fd0;
2158
2159 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2160 if (target_fd1 < 0)
2161 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2162 target_fds[1] = target_fd1;
2163
2164 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2165 if (ret <= 0)
2166 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2167 target_fd0, target_fd1);
2168
2169 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2170 }
2171
2172 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2173 int *sk_fd, pid_t pid)
2174 {
2175 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2176 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2177 size_t pidstr_len;
2178 ssize_t ret;
2179
2180 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
2181 if (ret < 0)
2182 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2183
2184 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2185
2186 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2187 if (ret > 0 && (size_t)ret == pidstr_len)
2188 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2189
2190 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2191 if (ret > 0 && (size_t)ret == pidstr_len)
2192 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2193
2194 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2195 target_fd0, target_fd1);
2196 }
2197
2198 struct userns_exec_unified_attach_data {
2199 const struct lxc_conf *conf;
2200 int unified_fd;
2201 int sk_pair[2];
2202 pid_t pid;
2203 };
2204
2205 static int cgroup_unified_attach_child_wrapper(void *data)
2206 {
2207 struct userns_exec_unified_attach_data *args = data;
2208
2209 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2210 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2211 return ret_errno(EINVAL);
2212
2213 close_prot_errno_disarm(args->sk_pair[0]);
2214 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2215 &args->sk_pair[1]);
2216 }
2217
2218 static int cgroup_unified_attach_parent_wrapper(void *data)
2219 {
2220 struct userns_exec_unified_attach_data *args = data;
2221
2222 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2223 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2224 return ret_errno(EINVAL);
2225
2226 close_prot_errno_disarm(args->sk_pair[1]);
2227 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2228 args->pid);
2229 }
2230
2231 /* Technically, we're always at a delegation boundary here (this is especially
2232 * true when cgroup namespaces are available). The reasoning is that in order
2233 * for us to have been able to start a container in the first place, the root
2234 * cgroup must have been a leaf node. Now, either the container's init system
2235 * has populated the cgroup and kept it as a leaf node, or it has created
2236 * subtrees. In the former case we simply attach to the leaf node we created
2237 * when we started the container; in the latter case we create our own cgroup
2238 * for the attaching process.
2239 */
2240 static int __cg_unified_attach(const struct hierarchy *h,
2241 const struct lxc_conf *conf, const char *name,
2242 const char *lxcpath, pid_t pid,
2243 const char *controller)
2244 {
2245 __do_close int unified_fd = -EBADF;
2246 __do_free char *path = NULL, *cgroup = NULL;
2247 int ret;
2248
2249 if (!conf || !name || !lxcpath || pid <= 0)
2250 return ret_errno(EINVAL);
2251
2252 ret = cgroup_attach(conf, name, lxcpath, pid);
2253 if (ret == 0)
2254 return log_trace(0, "Attached to unified cgroup via command handler");
2255 if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
2256 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2257
2258 /* Fall back to retrieving the path for the unified cgroup. */
2259 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2260 /* not running */
2261 if (!cgroup)
2262 return 0;
2263
2264 path = make_cgroup_path(h, cgroup, NULL);
2265
2266 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2267 if (unified_fd < 0)
2268 return ret_errno(EBADF);
2269
2270 if (!list_empty(&conf->id_map)) {
2271 struct userns_exec_unified_attach_data args = {
2272 .conf = conf,
2273 .unified_fd = unified_fd,
2274 .pid = pid,
2275 };
2276
2277 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2278 if (ret < 0)
2279 return -errno;
2280
2281 ret = userns_exec_minimal(conf,
2282 cgroup_unified_attach_parent_wrapper,
2283 &args,
2284 cgroup_unified_attach_child_wrapper,
2285 &args);
2286 } else {
2287 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2288 }
2289
2290 return ret;
2291 }
2292
2293 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2294 const struct lxc_conf *conf,
2295 const char *name, const char *lxcpath,
2296 pid_t pid)
2297 {
2298 int len, ret;
2299 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2300
2301 if (!ops)
2302 return ret_set_errno(false, ENOENT);
2303
2304 if (!ops->hierarchies)
2305 return true;
2306
2307 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2308 if (len < 0)
2309 return false;
2310
2311 for (int i = 0; ops->hierarchies[i]; i++) {
2312 __do_free char *fullpath = NULL, *path = NULL;
2313 struct hierarchy *h = ops->hierarchies[i];
2314
2315 if (h->fs_type == UNIFIED_HIERARCHY) {
2316 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2317 h->controllers[0]);
2318 if (ret < 0)
2319 return false;
2320
2321 continue;
2322 }
2323
2324 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2325 if (!path) {
2326 /*
2327 * Someone might have created a name=<controller>
2328 * controller after the container has started and so
2329 * the container doesn't make use of this controller.
2330 *
2331 * Link: https://github.com/lxc/lxd/issues/8577
2332 */
2333 TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
2334 continue;
2335 }
2336
2337 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2338 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2339 if (ret < 0)
2340 return log_error_errno(false, errno, "Failed to attach %d to %s",
2341 (int)pid, fullpath);
2342 }
2343
2344 return true;
2345 }
2346
2347 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2348 * don't have a cgroup_data set up, so we ask the running container through the
2349 * commands API for the cgroup path.
2350 */
2351 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2352 char *value, size_t len, const char *name,
2353 const char *lxcpath)
2354 {
2355 __do_free char *path = NULL;
2356 __do_free char *controller = NULL;
2357 char *p;
2358 struct hierarchy *h;
2359 int ret = -1;
2360
2361 if (!ops)
2362 return ret_set_errno(-1, ENOENT);
2363
2364 controller = strdup(filename);
2365 if (!controller)
2366 return ret_errno(ENOMEM);
2367
2368 p = strchr(controller, '.');
2369 if (p)
2370 *p = '\0';
2371
2372 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2373 /* not running */
2374 if (!path)
2375 return -1;
2376
2377 h = get_hierarchy(ops, controller);
2378 if (h) {
2379 __do_free char *fullpath = NULL;
2380
2381 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2382 ret = lxc_read_from_file(fullpath, value, len);
2383 }
2384
2385 return ret;
2386 }
2387
2388 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2389 {
2390 for (int count = 0; count < 3; count++, val++) {
2391 switch (*val) {
2392 case 'r':
2393 case 'w':
2394 case 'm':
2395 device->access[count] = *val;
2396 break;
2401 case '\n':
2402 case '\0':
2403 count = 3;
2404 break;
2405 default:
2406 return ret_errno(EINVAL);
2407 }
2408 }
2409
2410 return 0;
2411 }
2412
2413 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2414 const char *val)
2415 {
2416 size_t count;
2417 int ret;
2418 char temp[50];
2419
2420 if (strequal("devices.allow", key))
2421 device->allow = 1; /* allow the device */
2422 else
2423 device->allow = 0; /* deny the device */
2424
2425 if (strequal(val, "a")) {
2426 /* global rule */
2427 device->type = 'a';
2428 device->major = -1;
2429 device->minor = -1;
2430 return 0;
2431 }
2432
2433 switch (*val) {
2434 case 'a':
2435 __fallthrough;
2436 case 'b':
2437 __fallthrough;
2438 case 'c':
2439 device->type = *val;
2440 break;
2441 default:
2442 return -1;
2443 }
2444
2445 val++;
2446 if (!isspace(*val))
2447 return -1;
2448 val++;
2449 if (*val == '*') {
2450 device->major = -1;
2451 val++;
2452 } else if (isdigit(*val)) {
2453 memset(temp, 0, sizeof(temp));
2454 for (count = 0; count < sizeof(temp) - 1; count++) {
2455 temp[count] = *val;
2456 val++;
2457 if (!isdigit(*val))
2458 break;
2459 }
2460 ret = lxc_safe_int(temp, &device->major);
2461 if (ret)
2462 return -1;
2463 } else {
2464 return -1;
2465 }
2466 if (*val != ':')
2467 return -1;
2468 val++;
2469
2470 /* read minor */
2471 if (*val == '*') {
2472 device->minor = -1;
2473 val++;
2474 } else if (isdigit(*val)) {
2475 memset(temp, 0, sizeof(temp));
2476 for (count = 0; count < sizeof(temp) - 1; count++) {
2477 temp[count] = *val;
2478 val++;
2479 if (!isdigit(*val))
2480 break;
2481 }
2482 ret = lxc_safe_int(temp, &device->minor);
2483 if (ret)
2484 return -1;
2485 } else {
2486 return -1;
2487 }
2488 if (!isspace(*val))
2489 return -1;
2490
2491 return device_cgroup_parse_access(device, ++val);
2492 }
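
/*
 * Example (sketch): parsing native cgroup1-style rules. For these inputs
 * the struct device_item fields end up as:
 *
 *	struct device_item dev = {};
 *
 *	device_cgroup_rule_parse(&dev, "devices.allow", "c 1:3 rwm");
 *	// dev.allow = 1, dev.type = 'c', dev.major = 1, dev.minor = 3,
 *	// dev.access = "rwm"
 *
 *	device_cgroup_rule_parse(&dev, "devices.deny", "b *:* rw");
 *	// dev.allow = 0, dev.type = 'b', dev.major = -1, dev.minor = -1,
 *	// dev.access = "rw"
 */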
2493
2494 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2495 * don't have a cgroup_data set up, so we ask the running container through the
2496 * commands API for the cgroup path.
2497 */
2498 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2499 const char *key, const char *value,
2500 const char *name, const char *lxcpath)
2501 {
2502 __do_free char *path = NULL;
2503 __do_free char *controller = NULL;
2504 char *p;
2505 struct hierarchy *h;
2506 int ret = -1;
2507
2508 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2509 is_empty_string(name) || is_empty_string(lxcpath))
2510 return ret_errno(EINVAL);
2511
2512 controller = strdup(key);
2513 if (!controller)
2514 return ret_errno(ENOMEM);
2515
2516 p = strchr(controller, '.');
2517 if (p)
2518 *p = '\0';
2519
2520 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2521 struct device_item device = {};
2522
2523 ret = device_cgroup_rule_parse(&device, key, value);
2524 if (ret < 0)
2525 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2526 key, value);
2527
2528 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2529 if (ret < 0)
2530 return -1;
2531
2532 return 0;
2533 }
2534
2535 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2536 /* not running */
2537 if (!path)
2538 return -1;
2539
2540 h = get_hierarchy(ops, controller);
2541 if (h) {
2542 __do_free char *fullpath = NULL;
2543
2544 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2545 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2546 }
2547
2548 return ret;
2549 }
2550
2551 /* Take a devices cgroup line of the form
2552 * /dev/foo rwx
2553 * and convert it to a valid
2554 * type major:minor mode
2555 * line. Return <0 on error. The preallocated destination buffer that the
2556 * result is written to belongs to the convert_devpath() wrapper below.
2557 */
2558 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2559 const char *devpath)
2560 {
2561 __do_free char *path = NULL;
2562 char *mode = NULL;
2563 int n_parts, ret;
2564 char *p;
2565 struct stat sb;
2566
2567 path = strdup(devpath);
2568 if (!path)
2569 return ret_errno(ENOMEM);
2570
2571 /*
2572 * Read path followed by mode. Ignore any trailing text.
2573 * A ' # comment' would be legal. Technically other text is not
2574 * legal, we could check for that if we cared to.
2575 */
2576 for (n_parts = 1, p = path; *p; p++) {
2577 if (*p != ' ')
2578 continue;
2579 *p = '\0';
2580
2581 if (n_parts != 1)
2582 break;
2583 p++;
2584 n_parts++;
2585
2586 while (*p == ' ')
2587 p++;
2588
2589 mode = p;
2590
2591 if (*p == '\0')
2592 return ret_set_errno(-1, EINVAL);
2593 }
2594
2595 if (!mode)
2596 return ret_errno(EINVAL);
2597
2598 if (device_cgroup_parse_access(device, mode) < 0)
2599 return -1;
2600
2601 ret = stat(path, &sb);
2602 if (ret < 0)
2603 return ret_set_errno(-1, errno);
2604
2605 mode_t m = sb.st_mode & S_IFMT;
2606 switch (m) {
2607 case S_IFBLK:
2608 device->type = 'b';
2609 break;
2610 case S_IFCHR:
2611 device->type = 'c';
2612 break;
2613 default:
2614 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2615 }
2616
2617 device->major = MAJOR(sb.st_rdev);
2618 device->minor = MINOR(sb.st_rdev);
2619 device->allow = 1;
2620
2621 return 0;
2622 }
2623
2624 static int convert_devpath(const char *invalue, char *dest)
2625 {
2626 struct device_item device = {};
2627 int ret;
2628
2629 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2630 if (ret < 0)
2631 return -1;
2632
2633 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2634 device.minor, device.access);
2635 if (ret < 0)
2636 return log_error_errno(ret, -ret,
2637 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2638 device.type, device.major, device.minor,
2639 device.access);
2640
2641 return 0;
2642 }
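
/*
 * Example (sketch): /dev/null is the character device 1:3, so a
 * configuration value "/dev/null rwm" is converted as follows:
 *
 *	char buf[50];
 *	int ret = convert_devpath("/dev/null rwm", buf);
 *	// on success buf = "c 1:3 rwm", the "type major:minor mode"
 *	// form the devices controller expects
 */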
2643
2644 /* Called from setup_limits - here we have the container's cgroup_data because
2645 * we created the cgroups.
2646 */
2647 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2648 const char *value, bool is_cpuset)
2649 {
2650 __do_free char *controller = NULL;
2651 char *p;
2652 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2653 char converted_value[50];
2654 struct hierarchy *h;
2655
2656 controller = strdup(filename);
2657 if (!controller)
2658 return ret_errno(ENOMEM);
2659
2660 p = strchr(controller, '.');
2661 if (p)
2662 *p = '\0';
2663
2664 if (strequal("devices.allow", filename) && value[0] == '/') {
2665 int ret;
2666
2667 ret = convert_devpath(value, converted_value);
2668 if (ret < 0)
2669 return ret;
2670 value = converted_value;
2671 }
2672
2673 h = get_hierarchy(ops, controller);
2674 if (!h)
2675 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2676
2677 if (is_cpuset) {
2678 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2679 if (ret)
2680 return ret;
2681 }
2682 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2683 }
2684
2685 /*
2686 * Sort the list of cgroup settings in place according to the following rule:
2687 * 1. Apply memory.limit_in_bytes before memory.memsw.limit_in_bytes.
2688 */
2689 static void sort_cgroup_settings(struct lxc_conf *conf)
2690 {
2691 LIST_HEAD(memsw_list);
2692 struct lxc_cgroup *cgroup, *ncgroup;
2693
2694 /* Iterate over the cgroup settings and copy them to the output list. */
2695 list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
2696 if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes"))
2697 continue;
2698
2699 /* Move the memsw entry from the cgroup settings list. */
2700 list_move_tail(&cgroup->head, &memsw_list);
2701 }
2702
2703 /*
2704 * Append all the memsw entries to the end of the cgroup settings list
2705 * to make sure they are applied after all memory limit settings.
2706 */
2707 list_splice_tail(&memsw_list, &conf->cgroup);
2708 }
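
/*
 * Why the ordering matters: the kernel requires
 * memory.memsw.limit_in_bytes (memory + swap) to be >=
 * memory.limit_in_bytes at all times, and both start out unlimited in a
 * fresh cgroup, so the plain memory limit must be lowered first. For
 * example, the configuration
 *
 *	lxc.cgroup.memory.memsw.limit_in_bytes = 2G
 *	lxc.cgroup.memory.limit_in_bytes = 1G
 *
 * is applied as memory.limit_in_bytes=1G followed by
 * memory.memsw.limit_in_bytes=2G; in the written order the memsw write
 * would be rejected because 2G is below the still-unlimited memory limit.
 */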
2710
2711 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2712 struct lxc_conf *conf,
2713 bool do_devices)
2714 {
2715 struct list_head *cgroup_settings;
2716 struct lxc_cgroup *cgroup;
2717
2718 if (!ops)
2719 return ret_set_errno(false, ENOENT);
2720
2721 if (!conf)
2722 return ret_set_errno(false, EINVAL);
2723
2724 cgroup_settings = &conf->cgroup;
2725 if (list_empty(cgroup_settings))
2726 return true;
2727
2728 if (!ops->hierarchies)
2729 return ret_set_errno(false, EINVAL);
2730
2731 if (pure_unified_layout(ops))
2732 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2733
2734 sort_cgroup_settings(conf);
2735 list_for_each_entry(cgroup, cgroup_settings, head) {
2736 if (do_devices == strnequal("devices", cgroup->subsystem, 7)) {
2737 if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) {
2738 if (do_devices && (errno == EACCES || errno == EPERM)) {
2739 SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2740 continue;
2741 }
2742 SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2743 return false;
2744 }
2745 DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value);
2746 }
2747 }
2748
2749 INFO("Limits for the legacy cgroup hierarchies have been set up");
2750 return true;
2751 }
2752
2753 /*
2754 * Some of the parsing logic comes from the original cgroup device v1
2755 * implementation in the kernel.
2756 */
2757 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2758 struct lxc_conf *conf, const char *key,
2759 const char *val)
2760 {
2761 struct device_item device_item = {};
2762 int ret;
2763
2764 if (strequal("devices.allow", key) && abspath(val))
2765 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2766 else
2767 ret = device_cgroup_rule_parse(&device_item, key, val);
2768 if (ret < 0)
2769 return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2770
2771 /*
2772 * Note that bpf_list_add_device() returns 1 if it altered the device
2773 * list and 0 if it didn't; both return values indicate success.
2774 * Only a negative return value indicates an error.
2775 */
2776 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2777 if (ret < 0)
2778 return -1;
2779
2780 return 0;
2781 }
2782
2783 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2784 struct lxc_handler *handler)
2785 {
2786 struct list_head *cgroup_settings;
2787 struct hierarchy *h;
2788 struct lxc_conf *conf;
2789 struct lxc_cgroup *cgroup;
2790
2791 if (!ops)
2792 return ret_set_errno(false, ENOENT);
2793
2794 if (!ops->hierarchies)
2795 return true;
2796
2797 if (!ops->container_cgroup)
2798 return ret_set_errno(false, EINVAL);
2799
2800 if (!handler || !handler->conf)
2801 return ret_set_errno(false, EINVAL);
2802 conf = handler->conf;
2803
2804 cgroup_settings = &conf->cgroup2;
2805 if (list_empty(cgroup_settings))
2806 return true;
2807
2808 if (!pure_unified_layout(ops))
2809 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
2810
2811 if (!ops->unified)
2812 return false;
2813 h = ops->unified;
2814
2815 list_for_each_entry(cgroup, cgroup_settings, head) {
2816 int ret;
2817
2818 if (strnequal("devices", cgroup->subsystem, 7))
2819 ret = bpf_device_cgroup_prepare(ops, conf, cgroup->subsystem, cgroup->value);
2820 else
2821 ret = lxc_write_openat(h->path_lim, cgroup->subsystem, cgroup->value, strlen(cgroup->value));
2822 if (ret < 0)
2823 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2824
2825 TRACE("Set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2826 }
2827
2828 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2829 }
2830
2831 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2832 {
2833 struct lxc_conf *conf;
2834 struct hierarchy *unified;
2835
2836 if (!ops)
2837 return ret_set_errno(false, ENOENT);
2838
2839 if (!ops->hierarchies)
2840 return true;
2841
2842 if (!ops->container_cgroup)
2843 return ret_set_errno(false, EEXIST);
2844
2845 if (!handler || !handler->conf)
2846 return ret_set_errno(false, EINVAL);
2847 conf = handler->conf;
2848
2849 unified = ops->unified;
2850 if (!unified || !device_utility_controller(unified) ||
2851 !unified->path_con || list_empty(&(conf->bpf_devices).devices))
2852 return true;
2853
2854 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2855 }
2856
2857 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2858 {
2859 __do_close int dfd_final = -EBADF;
2860 __do_free char *add_controllers = NULL, *copy = NULL;
2861 size_t full_len = 0;
2862 struct hierarchy *unified;
2863 int dfd_cur, ret;
2864 char *cur;
2865 char **it;
2866
2867 if (!ops->hierarchies || !pure_unified_layout(ops))
2868 return true;
2869
2870 unified = ops->unified;
2871 if (!unified->controllers[0])
2872 return true;
2873
2874 /* For now we simply enable all controllers that we have detected by
2875 * creating a string like "+memory +pids +cpu +io".
2876 * TODO: In the near future we might want to support "-<controller>"
2877 * etc. but whether supporting semantics like this make sense will need
2878 * some thinking.
2879 */
2880 for (it = unified->controllers; it && *it; it++) {
2881 full_len += strlen(*it) + 2;
2882 add_controllers = must_realloc(add_controllers, full_len + 1);
2883
2884 if (unified->controllers[0] == *it)
2885 add_controllers[0] = '\0';
2886
2887 (void)strlcat(add_controllers, "+", full_len + 1);
2888 (void)strlcat(add_controllers, *it, full_len + 1);
2889
2890 if (*(it + 1))
2891 (void)strlcat(add_controllers, " ", full_len + 1);
2892 }
2893
2894 copy = strdup(cgroup);
2895 if (!copy)
2896 return false;
2897
2898 /*
2899 * Placing the write to cgroup.subtree_control before the open() is
2900 * intentional because of the cgroup2 delegation model. It enforces
2901 * that leaf cgroups don't have any controllers enabled for delegation.
2902 */
2903 dfd_cur = unified->dfd_base;
2904 lxc_iterate_parts(cur, copy, "/") {
2905 /*
2906 * Even though we vetted the paths when we parsed the config
2907 * we're paranoid here and check that the path is neither
2908 * absolute nor walks upwards.
2909 */
2910 if (abspath(cur))
2911 return syserror_set(-EINVAL, "No absolute paths allowed");
2912
2913 if (strnequal(cur, "..", STRLITERALLEN("..")))
2914 return syserror_set(-EINVAL, "No upward walking paths allowed");
2915
2916 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
2917 if (ret < 0)
2918 return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2919
2920 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2921
2922 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2923 if (dfd_final < 0)
2924 return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
2925 if (dfd_cur != unified->dfd_base)
2926 close(dfd_cur);
2927 /*
2928 * Leave dfd_final pointing to the last fd we opened so
2929 * it will be automatically zapped if we return early.
2930 */
2931 dfd_cur = dfd_final;
2932 }
2933
2934 return true;
2935 }
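
/*
 * Example (sketch, hypothetical path): delegating cpu, memory and pids
 * for the cgroup "a/b" relative to unified->dfd_base results in
 *
 *	write(<dfd_base>/cgroup.subtree_control, "+cpu +memory +pids")
 *	write(<dfd_base>/a/cgroup.subtree_control, "+cpu +memory +pids")
 *
 * i.e. the controllers are enabled in every ancestor of "a/b" but not in
 * "a/b" itself, keeping the final cgroup a valid leaf.
 */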
2936
2937 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2938 {
2939 if (!ops)
2940 return ret_set_errno(false, ENOENT);
2941
2942 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2943 }
2944
2945 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2946 {
2947 if (!ops)
2948 return ret_set_errno(false, ENOENT);
2949
2950 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2951 }
2952
2953 static inline bool unified_cgroup(const char *line)
2954 {
2955 return *line == '0';
2956 }
2957
2958 static inline char *current_unified_cgroup(bool relative, char *line)
2959 {
2960 char *current_cgroup;
2961
2962 line += STRLITERALLEN("0::");
2963
2964 if (!abspath(line))
2965 return ERR_PTR(-EINVAL);
2966
2967 /* remove init.scope */
2968 if (!relative)
2969 line = prune_init_scope(line);
2970
2971 /* create a relative path */
2972 line = deabs(line);
2973
2974 current_cgroup = strdup(line);
2975 if (!current_cgroup)
2976 return ERR_PTR(-ENOMEM);
2977
2978 return current_cgroup;
2979 }
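
/*
 * Example (sketch): the unified entry in /proc/self/cgroup looks like
 *
 *	0::/user.slice/user-1000.slice/session-1.scope
 *
 * current_unified_cgroup(false, line) skips the "0::" prefix, would
 * prune a trailing "/init.scope", and strips the leading slash,
 * returning "user.slice/user-1000.slice/session-1.scope", i.e. a path
 * relative to the cgroup2 mount.
 */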
2980
2981 static inline const char *unprefix(const char *controllers)
2982 {
2983 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2984 return controllers + STRLITERALLEN("name=");
2985 return controllers;
2986 }
2987
2988 static int __list_cgroup_delegate(char ***delegate)
2989 {
2990 __do_free char **list = NULL;
2991 __do_free char *buf = NULL;
2992 char *standard[] = {
2993 "cgroup.procs",
2994 "cgroup.threads",
2995 "cgroup.subtree_control",
2996 "memory.oom.group",
2997 NULL,
2998 };
2999 char *token;
3000 int ret;
3001
3002 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
3003 if (!buf) {
3004 for (char **p = standard; p && *p; p++) {
3005 ret = list_add_string(&list, *p);
3006 if (ret < 0)
3007 return ret;
3008 }
3009
3010 *delegate = move_ptr(list);
3011 return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
3012 }
3013
3014 lxc_iterate_parts(token, buf, " \t\n") {
3015 /*
3016 * We always need to chown this for both cgroup and
3017 * cgroup2.
3018 */
3019 if (strequal(token, "cgroup.procs"))
3020 continue;
3021
3022 ret = list_add_string(&list, token);
3023 if (ret < 0)
3024 return ret;
3025 }
3026
3027 *delegate = move_ptr(list);
3028 return 0;
3029 }
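
/*
 * For reference, /sys/kernel/cgroup/delegate is a whitespace-separated
 * list of the cgroup2 files whose ownership must be handed over when
 * delegating a subtree. On one recent kernel it reads
 *
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *	memory.oom.group
 *
 * which matches the standard[] fallback above; newer kernels may list
 * additional entries.
 */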
3030
3031 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3032 {
3033 __do_free_string_list char **list = NULL;
3034 int ret;
3035
3036 ret = __list_cgroup_delegate(&list);
3037 if (ret < 0)
3038 return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");
3039
3040 for (char *const *s = list; s && *s; s++) {
3041 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3042 continue;
3043
3044 return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
3045 }
3046
3047 *ret_files = move_ptr(list);
3048 return true;
3049 }
3050
3051 static bool legacy_hierarchy_delegated(int dfd_base)
3052 {
3053 int ret;
3054
3055 ret = faccessat(dfd_base, ".", W_OK, 0);
3056 if (ret < 0 && errno != ENOENT)
3057 return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");
3058
3059 return true;
3060 }
3061
3062 /**
3063 * systemd guarantees that the order of co-mounted controllers is stable. On
3064 * some systems the order of the controllers might be reversed though.
3065 *
3066 * For example, this is how the order is mismatched on CentOS 7:
3067 *
3068 * [root@localhost ~]# cat /proc/self/cgroup
3069 * 11:perf_event:/
3070 * 10:pids:/
3071 * 9:freezer:/
3072 * >>>> 8:cpuacct,cpu:/
3073 * 7:memory:/
3074 * 6:blkio:/
3075 * 5:devices:/
3076 * 4:hugetlb:/
3077 * >>>> 3:net_prio,net_cls:/
3078 * 2:cpuset:/
3079 * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope
3080 *
3081 * whereas the mountpoint:
3082 *
3083 * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755
3084 * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
3085 * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
3086 * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
3087 * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
3088 * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
3089 * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
3090 * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
3091 * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
3092 * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
3093 * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
3094 * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
3095 *
3096 * Ensure that we always use the systemd-guaranteed stable order when checking
3097 * for the mountpoint.
3098 */
3099 #if HAVE_COMPILER_ATTR_NONNULL
3100 __attribute__((nonnull))
3101 #endif
3102 #if HAVE_COMPILER_ATTR_RETURNS_NONNULL
3103 __attribute__((returns_nonnull))
3104 #endif
3105 static const char *stable_order(const char *controllers)
3106 {
3107 if (strequal(controllers, "cpuacct,cpu"))
3108 return "cpu,cpuacct";
3109
3110 if (strequal(controllers, "net_prio,net_cls"))
3111 return "net_cls,net_prio";
3112
3113 return unprefix(controllers);
3114 }
3115
3116 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3117 bool unprivileged)
3118 {
3119 __do_free char *cgroup_info = NULL;
3120 char *it;
3121
3122 /*
3123 * Root spawned containers escape the current cgroup, so use init's
3124 * cgroups as our base in that case.
3125 */
3126 if (!relative && (geteuid() == 0))
3127 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3128 else
3129 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3130 if (!cgroup_info)
3131 return ret_errno(ENOMEM);
3132
3133 lxc_iterate_parts(it, cgroup_info, "\n") {
3134 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3135 __do_free char *controllers = NULL, *current_cgroup = NULL;
3136 __do_free_string_list char **controller_list = NULL,
3137 **delegate = NULL;
3138 char *line;
3139 int dfd, ret, type;
3140
3141 /* Handle the unified cgroup hierarchy. */
3142 line = it;
3143 if (unified_cgroup(line)) {
3144 char *unified_mnt;
3145
3146 type = UNIFIED_HIERARCHY;
3147
3148 current_cgroup = current_unified_cgroup(relative, line);
3149 if (IS_ERR(current_cgroup))
3150 return PTR_ERR(current_cgroup);
3151
3152 if (unified_cgroup_fd(ops->dfd_mnt)) {
3153 dfd_mnt = dup_cloexec(ops->dfd_mnt);
3154 unified_mnt = "";
3155 } else {
3156 dfd_mnt = open_at(ops->dfd_mnt,
3157 "unified",
3158 PROTECT_OPATH_DIRECTORY,
3159 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3160 unified_mnt = "unified";
3161 }
3162 if (dfd_mnt < 0) {
3163 if (errno != ENOENT)
3164 return syserror("Failed to open %d/unified", ops->dfd_mnt);
3165
3166 SYSTRACE("Unified cgroup not mounted");
3167 continue;
3168 }
3169 dfd = dfd_mnt;
3170
3171 if (!is_empty_string(current_cgroup)) {
3172 dfd_base = open_at(dfd_mnt, current_cgroup,
3173 PROTECT_OPATH_DIRECTORY,
3174 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3175 if (dfd_base < 0) {
3176 if (errno != ENOENT)
3177 return syserror("Failed to open %d/%s",
3178 dfd_mnt, current_cgroup);
3179
3180 SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
3181 dfd_mnt, current_cgroup);
3182 continue;
3183 }
3184 dfd = dfd_base;
3185 }
3186
3187 if (!unified_hierarchy_delegated(dfd, &delegate))
3188 continue;
3189
3190 controller_list = unified_controllers(dfd, "cgroup.controllers");
3191 if (!controller_list) {
3192 TRACE("No controllers are enabled for delegation in the unified hierarchy");
3193 controller_list = list_new();
3194 if (!controller_list)
3195 return syserror_set(-ENOMEM, "Failed to create empty controller list");
3196 }
3197
3198 controllers = strdup(unified_mnt);
3199 if (!controllers)
3200 return ret_errno(ENOMEM);
3201 } else {
3202 char *__controllers, *__current_cgroup;
3203
3204 type = LEGACY_HIERARCHY;
3205
3206 __controllers = strchr(line, ':');
3207 if (!__controllers)
3208 return ret_errno(EINVAL);
3209 __controllers++;
3210
3211 __current_cgroup = strchr(__controllers, ':');
3212 if (!__current_cgroup)
3213 return ret_errno(EINVAL);
3214 *__current_cgroup = '\0';
3215 __current_cgroup++;
3216
3217 controllers = strdup(stable_order(__controllers));
3218 if (!controllers)
3219 return ret_errno(ENOMEM);
3220
3221 dfd_mnt = open_at(ops->dfd_mnt,
3222 controllers,
3223 PROTECT_OPATH_DIRECTORY,
3224 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3225 if (dfd_mnt < 0) {
3226 if (errno != ENOENT)
3227 return syserror("Failed to open %d/%s",
3228 ops->dfd_mnt, controllers);
3229
3230 SYSTRACE("%s not mounted", controllers);
3231 continue;
3232 }
3233 dfd = dfd_mnt;
3234
3235 if (!abspath(__current_cgroup))
3236 return ret_errno(EINVAL);
3237
3238 /* remove init.scope */
3239 if (!relative)
3240 __current_cgroup = prune_init_scope(__current_cgroup);
3241
3242 /* create a relative path */
3243 __current_cgroup = deabs(__current_cgroup);
3244
3245 current_cgroup = strdup(__current_cgroup);
3246 if (!current_cgroup)
3247 return ret_errno(ENOMEM);
3248
3249 if (!is_empty_string(current_cgroup)) {
3250 dfd_base = open_at(dfd_mnt, current_cgroup,
3251 PROTECT_OPATH_DIRECTORY,
3252 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3253 if (dfd_base < 0) {
3254 if (errno != ENOENT)
3255 return syserror("Failed to open %d/%s",
3256 dfd_mnt, current_cgroup);
3257
3258 SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
3259 dfd_mnt, current_cgroup);
3260 continue;
3261 }
3262 dfd = dfd_base;
3263 }
3264
3265 if (!legacy_hierarchy_delegated(dfd))
3266 continue;
3267
3268 /*
3269 * We intentionally pass the raw __controllers string here and not the
3270 * massaged controllers variable: the latter went through stable_order()
3271 * to match the mountpoint and would yield the wrong controller names.
3272 */
3273 controller_list = list_add_controllers(__controllers);
3274 if (!controller_list)
3275 return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);
3276
3277 if (skip_hierarchy(ops, controller_list))
3278 continue;
3279
3280 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3281 }
3282
3283 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3284 current_cgroup, controller_list, type);
3285 if (ret < 0)
3286 return syserror_ret(ret, "Failed to add %s hierarchy", controllers);
3287
3288 /* Transfer ownership. */
3289 move_fd(dfd_mnt);
3290 move_fd(dfd_base);
3291 move_ptr(current_cgroup);
3292 move_ptr(controllers);
3293 move_ptr(controller_list);
3294 if (type == UNIFIED_HIERARCHY)
3295 ops->unified->delegate = move_ptr(delegate);
3296 }
3297
3298 /* determine cgroup layout */
3299 if (ops->unified) {
3300 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3301 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3302 } else {
3303 if (bpf_devices_cgroup_supported())
3304 ops->unified->utilities |= DEVICES_CONTROLLER;
3305 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3306 }
3307 }
3308
3309 if (!controllers_available(ops))
3310 return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3311
3312 return 0;
3313 }
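
/*
 * Example (sketch): on a hybrid host /proc/1/cgroup might contain
 *
 *	12:pids:/
 *	...
 *	2:cpu,cpuacct:/
 *	1:name=systemd:/init.scope
 *	0::/init.scope
 *
 * Every legacy "N:controllers:path" line adds a LEGACY_HIERARCHY rooted
 * at /sys/fs/cgroup/<controllers>, the "0::" line adds the
 * UNIFIED_HIERARCHY, and the combination yields CGROUP_LAYOUT_HYBRID.
 */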
3314
3315 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3316 {
3317 __do_close int dfd = -EBADF;
3318 int ret;
3319 const char *controllers_use;
3320
3321 if (ops->dfd_mnt >= 0)
3322 return ret_errno(EBUSY);
3323
3324 /*
3325 * I don't see the need for allowing symlinks here. If users want to
3326 * have their hierarchy available in different locations I strongly
3327 * suggest bind-mounts.
3328 */
3329 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3330 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3331 if (dfd < 0)
3332 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3333
3334 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3335 if (controllers_use) {
3336 __do_free char *dup = NULL;
3337 char *it;
3338
3339 dup = strdup(controllers_use);
3340 if (!dup)
3341 return -errno;
3342
3343 lxc_iterate_parts(it, dup, ",") {
3344 ret = list_add_string(&ops->cgroup_use, it);
3345 if (ret < 0)
3346 return ret;
3347 }
3348 }
3349
3350 /*
3351 * Keep dfd referenced by the cleanup function and actually move the fd
3352 * once we know the initialization succeeded. So if we fail we clean up
3353 * the dfd.
3354 */
3355 ops->dfd_mnt = dfd;
3356
3357 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
3358 if (ret < 0)
3359 return syserror_ret(ret, "Failed to initialize cgroups");
3360
3361 /* Transfer ownership to cgroup_ops. */
3362 move_fd(dfd);
3363 return 0;
3364 }
3365
3366 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3367 {
3368 const char *cgroup_pattern;
3369
3370 if (!ops)
3371 return ret_set_errno(-1, ENOENT);
3372
3373 /* copy system-wide cgroup information */
3374 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3375 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3376 ops->cgroup_pattern = strdup(cgroup_pattern);
3377 if (!ops->cgroup_pattern)
3378 return ret_errno(ENOMEM);
3379 }
3380
3381 return 0;
3382 }
3383
3384 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3385 {
3386 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
3387
3388 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3389 if (!cgfsng_ops)
3390 return ret_set_errno(NULL, ENOMEM);
3391
3392 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3393 cgfsng_ops->dfd_mnt = -EBADF;
3394
3395 if (initialize_cgroups(cgfsng_ops, conf))
3396 return NULL;
3397
3398 cgfsng_ops->data_init = cgfsng_data_init;
3399 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3400 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3401 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3402 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3403 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3404 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3405 cgfsng_ops->payload_create = cgfsng_payload_create;
3406 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3407 cgfsng_ops->finalize = cgfsng_finalize;
3408 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3409 cgfsng_ops->get = cgfsng_get;
3410 cgfsng_ops->set = cgfsng_set;
3411 cgfsng_ops->freeze = cgfsng_freeze;
3412 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3413 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3414 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3415 cgfsng_ops->driver = "cgfsng";
3416 cgfsng_ops->version = "1.0.0";
3417 cgfsng_ops->attach = cgfsng_attach;
3418 cgfsng_ops->chown = cgfsng_chown;
3419 cgfsng_ops->mount = cgfsng_mount;
3420 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3421 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3422
3423 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3424 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3425 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3426
3427 return move_ptr(cgfsng_ops);
3428 }
3429
3430 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3431 {
3432 int ret;
3433
3434 if (!list_empty(&conf->id_map)) {
3435 struct userns_exec_unified_attach_data args = {
3436 .conf = conf,
3437 .unified_fd = fd_unified,
3438 .pid = pid,
3439 };
3440
3441 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3442 if (ret < 0)
3443 return -errno;
3444
3445 ret = userns_exec_minimal(conf,
3446 cgroup_unified_attach_parent_wrapper,
3447 &args,
3448 cgroup_unified_attach_child_wrapper,
3449 &args);
3450 } else {
3451 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3452 }
3453
3454 return ret;
3455 }
3456
3457 static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
3458 const char *lxcpath, pid_t pid)
3459 {
3460 call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
3461 int ret;
3462 size_t idx;
3463 ssize_t pidstr_len;
3464 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
3465
3466 ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
3467 if (ret < 0)
3468 return ret_errno(ENOSYS);
3469
3470 pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
3471 if (pidstr_len < 0)
3472 return pidstr_len;
3473
3474 for (idx = 0; idx < ctx->fd_len; idx++) {
3475 int dfd_con = ctx->fd[idx];
3476
3477 if (unified_cgroup_fd(dfd_con))
3478 ret = __unified_attach_fd(conf, dfd_con, pid);
3479 else
3480 ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
3481 if (ret)
3482 return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
3483 else
3484 TRACE("Attached to cgroup fd %d", dfd_con);
3485 }
3486
3487 if (idx == 0)
3488 return syserror_set(-ENOENT, "Failed to attach to cgroups");
3489
3490 TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
3491 return 0;
3492 }
3493
3494 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3495 const char *lxcpath, pid_t pid)
3496 {
3497 __do_close int dfd_unified = -EBADF;
3498
3499 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3500 return ret_errno(EINVAL);
3501
3502 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3503 if (dfd_unified < 0)
3504 return ret_errno(ENOSYS);
3505
3506 return __unified_attach_fd(conf, dfd_unified, pid);
3507 }
3508
3509 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3510 const char *lxcpath, pid_t pid)
3511 {
3512 int ret;
3513
3514 ret = __cgroup_attach_many(conf, name, lxcpath, pid);
3515 if (ret < 0) {
3516 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3517 return ret;
3518
3519 ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
3520 if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
3521 return ret_errno(ENOSYS);
3522 }
3523
3524 return ret;
3525 }
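
/*
 * Usage (sketch, hypothetical container name, default lxcpath): attach
 * an existing process to a running container's cgroups:
 *
 *	int ret = cgroup_attach(conf, "c1", "/var/lib/lxc", pid);
 *	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
 *		; // the container's command server supports neither
 *		  // request; callers such as __cg_unified_attach() then
 *		  // fall back to attaching by cgroup path
 */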
3526
3527 /* Connects to command socket therefore isn't callable from command handler. */
3528 int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
3529 {
3530 __do_close int dfd = -EBADF;
3531 struct cgroup_fd fd = {
3532 .fd = -EBADF,
3533 };
3534 size_t len_controller;
3535 int ret;
3536
3537 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3538 is_empty_string(key))
3539 return ret_errno(EINVAL);
3540
3541 if ((buf && !len) || (len && !buf))
3542 return ret_errno(EINVAL);
3543
3544 len_controller = strcspn(key, ".");
3545 len_controller++; /* Don't forget the \0 byte. */
3546 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3547 return ret_errno(EINVAL);
3548 (void)strlcpy(fd.controller, key, len_controller);
3549
3550 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3551 if (ret < 0) {
3552 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3553 return ret;
3554
3555 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3556 if (dfd < 0) {
3557 if (!ERRNO_IS_NOT_SUPPORTED(dfd))
3558 return dfd;
3559
3560 return ret_errno(ENOSYS);
3561 }
3562 fd.type = UNIFIED_HIERARCHY;
3563 fd.fd = move_fd(dfd);
3564 }
3565 dfd = move_fd(fd.fd);
3566
3567 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3568
3569 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3570 return ret_errno(EOPNOTSUPP);
3571
3572 ret = lxc_read_try_buf_at(dfd, key, buf, len);
3573
3574 return ret;
3575 }
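
/*
 * Usage (sketch, hypothetical names): the controller is taken from the
 * part of the key before the first '.', so
 *
 *	char buf[4096];
 *	int ret = cgroup_get("c1", "/var/lib/lxc", "memory.max", buf, sizeof(buf));
 *
 * asks the running container's command server for a "memory" hierarchy
 * fd (falling back to the cgroup2 fd on older command servers) and reads
 * "memory.max" relative to it into buf.
 */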
3576
3577 /* Connects to command socket therefore isn't callable from command handler. */
3578 int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
3579 {
3580 __do_close int dfd = -EBADF;
3581 struct cgroup_fd fd = {
3582 .fd = -EBADF,
3583 };
3584 size_t len_controller;
3585 int ret;
3586
3587 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3588 is_empty_string(key) || is_empty_string(value))
3589 return ret_errno(EINVAL);
3590
3591 len_controller = strcspn(key, ".");
3592 len_controller++; /* Don't forget the \0 byte. */
3593 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3594 return ret_errno(EINVAL);
3595 (void)strlcpy(fd.controller, key, len_controller);
3596
3597 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3598 if (ret < 0) {
3599 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3600 return ret;
3601
3602 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3603 if (dfd < 0) {
3604 if (!ERRNO_IS_NOT_SUPPORTED(dfd))
3605 return dfd;
3606
3607 return ret_errno(ENOSYS);
3608 }
3609 fd.type = UNIFIED_HIERARCHY;
3610 fd.fd = move_fd(dfd);
3611 }
3612 dfd = move_fd(fd.fd);
3613
3614 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3615
3616 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
3617 struct device_item device = {};
3618
3619 ret = device_cgroup_rule_parse(&device, key, value);
3620 if (ret < 0)
3621 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3622 key, value);
3623
3624 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3625 } else {
3626 ret = lxc_writeat(dfd, key, value, strlen(value));
3627 }
3628
3629 return ret;
3630 }
3631
3632 static int do_cgroup_freeze(int unified_fd,
3633 const char *state_string,
3634 int state_num,
3635 int timeout,
3636 const char *epoll_error,
3637 const char *wait_error)
3638 {
3639 __do_close int events_fd = -EBADF;
3640 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
3641 int ret;
3642 struct lxc_async_descr descr = {};
3643
3644 if (timeout != 0) {
3645 ret = lxc_mainloop_open(&descr);
3646 if (ret)
3647 return log_error_errno(-1, errno, "%s", epoll_error);
3648
3649 /* automatically cleaned up now */
3650 descr_ptr = &descr;
3651
3652 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3653 if (events_fd < 0)
3654 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3655
3656 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI,
3657 freezer_cgroup_events_cb,
3658 default_cleanup_handler,
3659 INT_TO_PTR(state_num),
3660 "freezer_cgroup_events_cb");
3661 if (ret < 0)
3662 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3663 }
3664
3665 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3666 if (ret < 0)
3667 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3668
3669 if (timeout != 0) {
3670 ret = lxc_mainloop(&descr, timeout);
3671 if (ret)
3672 return log_error_errno(-1, errno, "%s", wait_error);
3673 }
3674
3675 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3676 }
3677
3678 static inline int __cgroup_freeze(int unified_fd, int timeout)
3679 {
3680 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3681 "Failed to create epoll instance to wait for container freeze",
3682 "Failed to wait for container to be frozen");
3683 }
3684
3685 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3686 {
3687 __do_close int unified_fd = -EBADF;
3688 int ret;
3689
3690 if (is_empty_string(name) || is_empty_string(lxcpath))
3691 return ret_errno(EINVAL);
3692
3693 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3694 if (unified_fd < 0)
3695 return ret_errno(ENOCGROUP2);
3696
3697 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3698 ret = __cgroup_freeze(unified_fd, timeout);
3699 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3700 return ret;
3701 }
3702
3703 int __cgroup_unfreeze(int unified_fd, int timeout)
3704 {
3705 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3706 "Failed to create epoll instance to wait for container freeze",
3707 "Failed to wait for container to be frozen");
3708 }
3709
3710 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3711 {
3712 __do_close int unified_fd = -EBADF;
3713 int ret;
3714
3715 if (is_empty_string(name) || is_empty_string(lxcpath))
3716 return ret_errno(EINVAL);
3717
3718 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3719 if (unified_fd < 0)
3720 return ret_errno(ENOCGROUP2);
3721
3722 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3723 ret = __cgroup_unfreeze(unified_fd, timeout);
3724 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3725 return ret;
3726 }