]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
bpf: let bpf_list_add_device() take the device list directly
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "log.h"
44 #include "macro.h"
45 #include "mainloop.h"
46 #include "memory_utils.h"
47 #include "mount_utils.h"
48 #include "storage/storage.h"
49 #include "string_utils.h"
50 #include "syscall_wrappers.h"
51 #include "utils.h"
52
53 #ifndef HAVE_STRLCPY
54 #include "include/strlcpy.h"
55 #endif
56
57 #ifndef HAVE_STRLCAT
58 #include "include/strlcat.h"
59 #endif
60
61 lxc_log_define(cgfsng, cgroup);
62
/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 */
static int append_null_to_list(void ***list)
{
	int used = 0;

	/* Count how many entries are currently in the array (if any). */
	if (*list)
		while ((*list)[used])
			used++;

	/* Reserve room for one more entry plus the NULL sentinel. */
	*list = must_realloc(*list, (used + 2) * sizeof(void **));
	(*list)[used + 1] = NULL;

	return used;
}
80
/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **it = list; *it; it++) {
		if (strequal(*it, entry))
			return true;
	}

	return false;
}
95
/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t entry_len = strlen(entry);
	char *result;

	/* "name=" + entry + terminating NUL byte. */
	result = must_realloc(NULL, STRLITERALLEN("name=") + entry_len + 1);

	memcpy(result, "name=", STRLITERALLEN("name="));
	memcpy(result + STRLITERALLEN("name="), entry, entry_len);
	result[STRLITERALLEN("name=") + entry_len] = '\0';

	return result;
}
113
/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	char *copy;
	int slot;

	/* An entry that is both a kernel and a named subsystem is ambiguous. */
	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	/* Kernel subsystems and entries already carrying a "name=" prefix are
	 * copied verbatim; everything else is treated as a named subsystem
	 * and gets the "name=" prefix added.
	 */
	if (strnequal(entry, "name=", 5) || string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[slot] = copy;
}
148
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 *
 * A NULL @controller requests the empty unified hierarchy. On a pure
 * cgroup2 layout the legacy "devices" and "freezer" controller names map
 * onto the unified hierarchy, but only when the corresponding support flag
 * (bpf_device_controller / freezer_controller) is set.
 *
 * Returns NULL with errno set to ENOENT when no matching hierarchy exists.
 */
static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (ops->unified->bpf_device_controller)
					return ops->unified;

				/* No bpf device support: give up early. */
				break;
			} else if (strequal(controller, "freezer")) {
				if (ops->unified->freezer_controller)
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
195
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in the bit array @bitarr.
 *
 * Use an unsigned constant for the shift: "1 << 31" shifts into the sign
 * bit of a signed int, which is undefined behaviour.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return whether bit @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
215
/* Create cpumask from cpulist aka turn:
 *
 * 0,2-3
 *
 * into bit array
 *
 * 1 0 1 1
 *
 * The returned array has room for @nbits bits (0 .. nbits - 1); any cpu
 * number >= @nbits in @buf is rejected with EINVAL, as are inverted
 * ranges. Returns NULL with errno set on failure. The caller owns the
 * returned allocation. Note that @buf is mangled by lxc_iterate_parts().
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	/* Each comma-separated token is either a single cpu ("0") or an
	 * inclusive range ("2-3").
	 */
	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		/* Reject inverted ranges... */
		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		/* ...and bits beyond the capacity of the array. */
		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}
258
259 /* Turn cpumask into simple, comma-separated cpulist. */
260 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
261 {
262 __do_free_string_list char **cpulist = NULL;
263 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
264 int ret;
265
266 for (size_t i = 0; i <= nbits; i++) {
267 if (!is_set(i, bitarr))
268 continue;
269
270 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
271 if (ret < 0)
272 return NULL;
273
274 ret = lxc_append_string(&cpulist, numstr);
275 if (ret < 0)
276 return ret_set_errno(NULL, ENOMEM);
277 }
278
279 if (!cpulist)
280 return ret_set_errno(NULL, ENOMEM);
281
282 return lxc_string_join(",", (const char **)cpulist, false);
283 }
284
/* Return the largest cpu number mentioned in a cpulist string such as
 * "0,3-5" or "0-7". The maximum is the number after whichever of the last
 * ',' or last '-' occurs later in the string, or the whole string when
 * neither separator is present.
 *
 * The old version compared the two candidate pointers even when one of
 * them was NULL; relational comparison of unrelated/NULL pointers is
 * undefined behaviour, and its final "!c1 && c2" branch was unreachable.
 * Only compare when both pointers are valid.
 *
 * Returns the cpu number on success, -1 on parse failure.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	if (after_comma)
		after_comma++;

	after_dash = strrchr(cpulist, '-');
	if (after_dash)
		after_dash++;

	if (after_comma && after_dash)
		last = (after_comma > after_dash) ? after_comma : after_dash;
	else if (after_comma)
		last = after_comma;
	else if (after_dash)
		last = after_dash;
	else
		last = cpulist;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
315
/* Return true if @h is the cgroup2 (unified) hierarchy. */
static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->version == CGROUP2_SUPER_MAGIC;
}
320
/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (char **it = l1; *it; it++) {
		if (string_in_list(l2, *it))
			return true;
	}

	return false;
}
335
336 /* For a null-terminated list of controllers @clist, return true if any of those
337 * controllers is already listed the null-terminated list of hierarchies @hlist.
338 * Realistically, if one is present, all must be present.
339 */
340 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
341 {
342 if (!hlist)
343 return false;
344
345 for (int i = 0; hlist[i]; i++)
346 if (controller_lists_intersect(hlist[i]->controllers, clist))
347 return true;
348
349 return false;
350 }
351
352 /* Return true if the controller @entry is found in the null-terminated list of
353 * hierarchies @hlist.
354 */
355 static bool controller_found(struct hierarchy **hlist, char *entry)
356 {
357 if (!hlist)
358 return false;
359
360 for (int i = 0; hlist[i]; i++)
361 if (string_in_list(hlist[i]->controllers, entry))
362 return true;
363
364 return false;
365 }
366
367 /* Return true if all of the controllers which we require have been found. The
368 * required list is freezer and anything in lxc.cgroup.use.
369 */
370 static bool all_controllers_found(struct cgroup_ops *ops)
371 {
372 struct hierarchy **hlist;
373
374 if (!ops->cgroup_use)
375 return true;
376
377 hlist = ops->hierarchies;
378 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
379 if (!controller_found(hlist, *cur))
380 return log_error(false, "No %s controller mountpoint found", *cur);
381
382 return true;
383 }
384
/* Get the controllers from a mountinfo line There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list
 *
 * @line is modified temporarily (a space is NUL'ed and restored before
 * returning). Returns a NULL-terminated list of controller names, or NULL
 * when the line does not describe a hierarchy under the default mountpoint.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	__do_free_string_list char **aret = NULL;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";

	/* Skip the first four space-separated mountinfo fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	/* 15 == STRLITERALLEN("/sys/fs/cgroup/"). */
	if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
		return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2)
		return log_error(NULL, "Corrupt mountinfo");
	/* Temporarily terminate the mountpoint so it can be parsed. */
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	/* Restore the space we NUL'ed above: @line belongs to the caller. */
	*p2 = ' ';

	return move_ptr(aret);
}
438
439 static char **cg_unified_make_empty_controller(void)
440 {
441 __do_free_string_list char **aret = NULL;
442 int newentry;
443
444 newentry = append_null_to_list((void ***)&aret);
445 aret[newentry] = NULL;
446 return move_ptr(aret);
447 }
448
/* Read @file (typically "cgroup.controllers") relative to @dfd and split
 * its space/tab/newline-separated contents into a NULL-terminated list of
 * controller names. Returns NULL when the file cannot be read; the caller
 * owns the returned list.
 */
static char **cg_unified_get_controllers(int dfd, const char *file)
{
	__do_free char *buf = NULL;
	__do_free_string_list char **aret = NULL;
	char *sep = " \t\n";
	char *tok;

	buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, sep) {
		int newentry;
		char *copy;

		newentry = append_null_to_list((void ***)&aret);
		copy = must_copy_string(tok);
		aret[newentry] = copy;
	}

	return move_ptr(aret);
}
471
472 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
473 char **controllers)
474 {
475 if (!ops->cgroup_use)
476 return true;
477
478 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
479 bool found = false;
480
481 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
482 if (!strequal(*cur_use, *cur_ctrl))
483 continue;
484
485 found = true;
486 break;
487 }
488
489 if (found)
490 continue;
491
492 return false;
493 }
494
495 return true;
496 }
497
/* Register a new cgroup hierarchy with @ops.
 *
 * @clist              - controller list; ownership is taken (freed on error
 *                       paths via __do_free_string_list). May be NULL only
 *                       for the unified hierarchy, in which case the list is
 *                       read from cgroup.controllers.
 * @mountpoint         - absolute path of the hierarchy mount; stored as-is.
 * @container_base_path - cgroup path relative to @mountpoint; must not be
 *                       absolute.
 * @type               - CGROUP_SUPER_MAGIC or CGROUP2_SUPER_MAGIC.
 *
 * Returns 0 on success (also when the hierarchy is skipped because
 * lxc.cgroup.use excludes its controllers), negative errno on failure.
 */
static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
			 char *container_base_path, int type)
{
	__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
	__do_free struct hierarchy *new = NULL;
	__do_free_string_list char **controllers = clist;
	int idx;

	if (abspath(container_base_path))
		return syserrno(-errno, "Container base path must be relative to controller mount");

	if (!controllers && type != CGROUP2_SUPER_MAGIC)
		return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");

	dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
			  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
	if (dfd_mnt < 0)
		return syserrno(-errno, "Failed to open %s", mountpoint);

	if (!is_empty_string(container_base_path)) {
		dfd_base = open_at(dfd_mnt, container_base_path,
				   PROTECT_OPATH_DIRECTORY,
				   PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_base < 0)
			return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
	}

	if (!controllers) {
		/*
		 * We assume that the cgroup we're currently in has been delegated to
		 * us and we are free to further delege all of the controllers listed
		 * in cgroup.controllers further down the hierarchy.
		 */
		if (dfd_base < 0)
			controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers");
		else
			controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
		if (!controllers)
			controllers = cg_unified_make_empty_controller();
		if (!controllers[0])
			TRACE("No controllers are enabled for delegation");
	}

	/* Exclude all controllers that cgroup use does not want. */
	if (!cgroup_use_wants_controllers(ops, controllers))
		return log_trace(0, "Skipping cgroup hiearchy with non-requested controllers");

	new = zalloc(sizeof(*new));
	if (!new)
		return ret_errno(ENOMEM);

	new->version = type;
	new->controllers = move_ptr(controllers);
	new->mountpoint = mountpoint;
	new->container_base_path = container_base_path;
	/* All cgroup fds start out closed. */
	new->cgfd_con = -EBADF;
	new->cgfd_limit = -EBADF;
	new->cgfd_mon = -EBADF;

	TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
	      mountpoint, container_base_path);
	for (char *const *it = new->controllers; it && *it; it++)
		TRACE("The detected hierarchy contains the %s controller", *it);

	idx = append_null_to_list((void ***)&ops->hierarchies);
	/* Without a base cgroup the mount fd doubles as the base fd. */
	if (dfd_base < 0)
		new->dfd_base = dfd_mnt;
	else
		new->dfd_base = move_fd(dfd_base);
	new->dfd_mnt = move_fd(dfd_mnt);
	if (type == CGROUP2_SUPER_MAGIC)
		ops->unified = new;
	(ops->hierarchies)[idx] = move_ptr(new);
	return 0;
}
573
574 /* Get a copy of the mountpoint from @line, which is a line from
575 * /proc/self/mountinfo.
576 */
577 static char *cg_hybrid_get_mountpoint(char *line)
578 {
579 char *p = line, *sret = NULL;
580 size_t len;
581 char *p2;
582
583 for (int i = 0; i < 4; i++) {
584 p = strchr(p, ' ');
585 if (!p)
586 return NULL;
587 p++;
588 }
589
590 if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
591 return NULL;
592
593 p2 = strchr(p + 15, ' ');
594 if (!p2)
595 return NULL;
596 *p2 = '\0';
597
598 len = strlen(p);
599 sret = must_realloc(NULL, len + 1);
600 memcpy(sret, p, len);
601 sret[len] = '\0';
602
603 return sret;
604 }
605
/* Given a multi-line string, return a null-terminated copy of the current line.
 * Returns NULL if no newline terminates the current line.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *line;
	size_t line_len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	line_len = newline - p;
	line = must_realloc(NULL, line_len + 1);
	memcpy(line, p, line_len);
	line[line_len] = '\0';

	return line;
}
623
/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 *
 * The controller field runs up to the next ':'; it is copied so that
 * lxc_iterate_parts() can split it on ',' without mangling @cgline.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	__do_free char *tmp = NULL;
	char *tok, *eol;
	size_t len;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	/* Copy just the comma-separated controller list. */
	len = eol - cgline;
	tmp = must_realloc(NULL, len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	lxc_iterate_parts(tok, tmp, ",")
		if (strequal(tok, c))
			return true;

	return false;
}
648
/* Strip all trailing newlines from @s in place and return @s.
 *
 * The previous condition used "len > 1", which left a string consisting of
 * a single newline untouched; "len > 0" trims that case as well.
 */
static inline char *trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 0) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}
659
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 *
 * @type selects v1 vs v2 parsing: for CGROUP2_SUPER_MAGIC the entry of the
 * form "0::/some/path" is matched, otherwise the line whose controller
 * field contains @controller. Unless @relative, the init scope is pruned
 * from the result, and a leading '/' is stripped. Returns a strdup'd
 * (possibly empty) cgroup path, or NULL if no matching line is found.
 */
static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
					  char *controller, int type)
{
	char *base_cgroup = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
			is_cgv2_base_cgroup = true;

		/* Move past the hierarchy-id field. */
		base_cgroup = strchr(base_cgroup, ':');
		if (!base_cgroup)
			return NULL;
		base_cgroup++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
			__do_free char *copy = NULL;

			/* Move past the controller-list field to the path. */
			base_cgroup = strchr(base_cgroup, ':');
			if (!base_cgroup)
				return NULL;
			base_cgroup++;

			copy = copy_to_eol(base_cgroup);
			if (!copy)
				return NULL;
			trim(copy);

			if (!relative) {
				base_cgroup = prune_init_scope(copy);
				if (!base_cgroup)
					return NULL;
			} else {
				base_cgroup = copy;
			}

			if (abspath(base_cgroup))
				base_cgroup = deabs(base_cgroup);

			/* We're allowing base_cgroup to be "". */
			return strdup(base_cgroup);
		}

		/* No match: advance to the next line. */
		base_cgroup = strchr(base_cgroup, '\n');
		if (!base_cgroup)
			return NULL;
		base_cgroup++;
	}
}
714
/* Append a copy of @entry to the NULL-terminated string list @list.
 * Does not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
724
/* Parse /proc/self/cgroup and append every kernel subsystem to @klist and
 * every named ("name=...") subsystem to @nlist. A cgroup v2 entry
 * ("0::/path") is recorded in @klist as "cgroup2".
 *
 * Returns 0 on success, -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "re");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *p, *p2, *tok;
		/* Skip the hierarchy-id field. */
		p = strchr(line, ':');
		if (!p)
			continue;
		p++;
		p2 = strchr(p, ':');
		if (!p2)
			continue;
		/* Terminate the controller-list field for parsing. */
		*p2 = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if ((p2 - p) == 0) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		lxc_iterate_parts(tok, p, ",") {
			if (strnequal(tok, "name=", 5))
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	return 0;
}
768
/* Dump the raw /proc/$$/cgroup contents plus the detected kernel and named
 * subsystem lists at TRACE level.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (int i = 0; klist && klist[i]; i++)
		TRACE("kernel subsystem %d: %s", i, klist[i]);

	for (int i = 0; nlist && nlist[i]; i++)
		TRACE("named subsystem %d: %s", i, nlist[i]);
}
784
785 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
786 {
787 if (!path_prune || !hierarchies)
788 return 0;
789
790 for (int i = 0; hierarchies[i]; i++) {
791 struct hierarchy *h = hierarchies[i];
792 int ret;
793
794 ret = cgroup_tree_prune(h->dfd_base, path_prune);
795 if (ret < 0)
796 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
797 else
798 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
799
800 if (h->container_limit_path != h->container_full_path)
801 free_disarm(h->container_limit_path);
802 free_disarm(h->container_full_path);
803 }
804
805 return 0;
806 }
807
/* Arguments passed into helpers run via userns_exec_1() inside the
 * container's user namespace.
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *path_prune;         /* cgroup path to prune */
	struct lxc_conf *conf;          /* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path;    /* generic path argument for the wrapped helper */
};
815
/* Wrapper executed in the container's user namespace: drop supplementary
 * groups, switch to the container's root (or init) ids, then prune the
 * cgroup tree. Returns -1 on failure to change identity, otherwise the
 * result of cgroup_tree_remove().
 */
static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	/* With a root id mapping, id 0 maps to the container's root;
	 * otherwise fall back to the configured init ids.
	 */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	/* EPERM is tolerated: we may lack CAP_SETGID in this namespace. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}
838
/* Tear down the container's (payload) cgroups: detach the cgroup2 bpf
 * device program, then prune the limit cgroup tree — inside the
 * container's user namespace when an id mapping is configured, directly
 * otherwise. All failures are logged, none are fatal.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	/* With an id mapping the cgroup directories are owned by the mapped
	 * ids, so prune them from inside the user namespace.
	 */
	if (!lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
886
887 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
888 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
889 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
890 bool am_initialized)
891 {
892 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
893 *offlinecpus = NULL, *posscpus = NULL;
894 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
895 *possmask = NULL;
896 int ret;
897 ssize_t i;
898 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
899 bool flipped_bit = false;
900
901 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
902 if (!posscpus)
903 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
904
905 /* Get maximum number of cpus found in possible cpuset. */
906 maxposs = get_max_cpus(posscpus);
907 if (maxposs < 0 || maxposs >= INT_MAX - 1)
908 return false;
909
910 if (file_exists(__ISOL_CPUS)) {
911 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
912 if (!isolcpus)
913 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
914
915 if (isdigit(isolcpus[0])) {
916 /* Get maximum number of cpus found in isolated cpuset. */
917 maxisol = get_max_cpus(isolcpus);
918 if (maxisol < 0 || maxisol >= INT_MAX - 1)
919 return false;
920 }
921
922 if (maxposs < maxisol)
923 maxposs = maxisol;
924 maxposs++;
925 } else {
926 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
927 }
928
929 if (file_exists(__OFFLINE_CPUS)) {
930 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
931 if (!offlinecpus)
932 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
933
934 if (isdigit(offlinecpus[0])) {
935 /* Get maximum number of cpus found in offline cpuset. */
936 maxoffline = get_max_cpus(offlinecpus);
937 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
938 return false;
939 }
940
941 if (maxposs < maxoffline)
942 maxposs = maxoffline;
943 maxposs++;
944 } else {
945 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
946 }
947
948 if ((maxisol == 0) && (maxoffline == 0)) {
949 cpulist = move_ptr(posscpus);
950 goto copy_parent;
951 }
952
953 possmask = lxc_cpumask(posscpus, maxposs);
954 if (!possmask)
955 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
956
957 if (maxisol > 0) {
958 isolmask = lxc_cpumask(isolcpus, maxposs);
959 if (!isolmask)
960 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
961 }
962
963 if (maxoffline > 0) {
964 offlinemask = lxc_cpumask(offlinecpus, maxposs);
965 if (!offlinemask)
966 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
967 }
968
969 for (i = 0; i <= maxposs; i++) {
970 if ((isolmask && !is_set(i, isolmask)) ||
971 (offlinemask && !is_set(i, offlinemask)) ||
972 !is_set(i, possmask))
973 continue;
974
975 flipped_bit = true;
976 clear_bit(i, possmask);
977 }
978
979 if (!flipped_bit) {
980 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
981 TRACE("No isolated or offline cpus present in cpuset");
982 } else {
983 cpulist = move_ptr(posscpus);
984 TRACE("Removed isolated or offline cpus from cpuset");
985 }
986 if (!cpulist)
987 return log_error_errno(false, errno, "Failed to create cpu list");
988
989 copy_parent:
990 if (!am_initialized) {
991 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
992 if (ret < 0)
993 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
994
995 TRACE("Copied cpu settings of parent cgroup");
996 }
997
998 return true;
999 }
1000
/* Prepare the legacy (v1) cpuset controller for a child cgroup: sanitize
 * cpuset.cpus, copy cpuset.mems from the parent, and enable
 * cgroup.clone_children so deeper cgroups inherit automatically.
 *
 * Returns true on success, false (with errno preserved by syserrno) on
 * any read/write failure.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/*
	 * Determine whether the base cgroup has cpuset
	 * inheritance turned on.
	 */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/*
	 * Initialize cpuset.cpus and remove any isolated
	 * and offline cpus.
	 */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserrno(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* ... and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* ... and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
1039
/* Create the cgroup directory chain @path below @dfd_base, component by
 * component, returning an O_PATH fd to the final directory.
 *
 * @cpuset_v1      - run cpuset v1 initialization on the first component.
 * @eexist_ignore  - tolerate the final component already existing;
 *                   intermediate components may always pre-exist.
 *
 * Returns the fd to the final directory on success, negative errno on
 * failure (including -EEXIST when the final component pre-exists and
 * @eexist_ignore is false).
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserrno_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserrno_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component already existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserrno(-errno, "Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		/* Close intermediate fds; never close the caller's dfd_base. */
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
1101
/* Create the cgroup tree for either the monitor or the payload.
 *
 * For the payload with isolation (@cgroup_leaf set) a separate limit
 * cgroup @cgroup_limit_dir is created with the leaf nested inside it;
 * otherwise a single cgroup serves both roles. On success the relevant
 * fds and cached paths are stored in @h. Returns true on success.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	__do_free char *path = NULL, *limit_path = NULL;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		TRACE("Created limit cgroup %d->%d(%s)",
		      fd_limit, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_error(false, "Failed to setup legacy device limits");

		limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
		path = must_make_path(limit_path, cgroup_leaf, NULL);

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
		}
	} else {
		path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);

		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
	}
	if (fd_final < 0)
		return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

	if (payload) {
		h->cgfd_con = move_fd(fd_final);
		h->container_full_path = move_ptr(path);

		/* Without isolation the container cgroup doubles as limit. */
		if (fd_limit < 0)
			h->cgfd_limit = h->cgfd_con;
		else
			h->cgfd_limit = move_fd(fd_limit);

		if (limit_path)
			h->container_limit_path = move_ptr(limit_path);
		else
			h->container_limit_path = h->container_full_path;
	} else {
		h->cgfd_mon = move_fd(fd_final);
	}

	return true;
}
1177
1178 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
1179 bool payload)
1180 {
1181 bool prune = true;
1182
1183 if (payload) {
1184 /* Check whether we actually created the cgroup to prune. */
1185 if (h->cgfd_limit < 0)
1186 prune = false;
1187
1188 if (h->container_full_path != h->container_limit_path)
1189 free_disarm(h->container_limit_path);
1190 free_disarm(h->container_full_path);
1191
1192 close_prot_errno_disarm(h->cgfd_con);
1193 close_prot_errno_disarm(h->cgfd_limit);
1194 } else {
1195 /* Check whether we actually created the cgroup to prune. */
1196 if (h->cgfd_mon < 0)
1197 prune = false;
1198
1199 close_prot_errno_disarm(h->cgfd_mon);
1200 }
1201
1202 /* We didn't create this cgroup. */
1203 if (!prune)
1204 return;
1205
1206 if (cgroup_tree_prune(h->dfd_base, path_prune))
1207 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
1208 else
1209 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
1210 }
1211
/*
 * Destroy the monitor cgroup in every hierarchy.
 *
 * Before a monitor cgroup can be removed it must be empty, so the monitor
 * process is first moved out into a pivot cgroup (derived from
 * lxc.cgroup.meta settings or CGROUP_PIVOT). Failures are logged but do not
 * abort processing of the remaining hierarchies.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the pivot cgroup the monitor is parked in while we
		 * remove its old cgroup: explicit pivot dir, the generic
		 * cgroup dir, or the global default. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		/* The pivot cgroup may already exist; that's fine (eexist_ignore). */
		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
1289
/*
 * Check that the new-style lxc.cgroup.dir.{monitor,container,namespace}
 * options are not combined with lxc.cgroup.dir and that, when they are used,
 * both the monitor and payload directories are set.
 *
 * Returns true when the cgroup directory configuration is valid, false
 * otherwise (with errno set to EINVAL).
 */
1296 static bool check_cgroup_dir_config(struct lxc_conf *conf)
1297 {
1298 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
1299 *container_dir = conf->cgroup_meta.container_dir,
1300 *namespace_dir = conf->cgroup_meta.namespace_dir;
1301
1302 /* none of the new options are set, all is fine */
1303 if (!monitor_dir && !container_dir && !namespace_dir)
1304 return true;
1305
1306 /* some are set, make sure lxc.cgroup.dir is not also set*/
1307 if (conf->cgroup_meta.dir)
1308 return log_error_errno(false, EINVAL,
1309 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
1310
1311 /* make sure both monitor and payload are set */
1312 if (!monitor_dir || !container_dir)
1313 return log_error_errno(false, EINVAL,
1314 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
1315
1316 /* namespace_dir may be empty */
1317 return true;
1318 }
1319
1320 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
1321 {
1322 __do_free char *monitor_cgroup = NULL;
1323 int idx = 0;
1324 int i;
1325 size_t len;
1326 char *suffix = NULL;
1327 struct lxc_conf *conf;
1328
1329 if (!ops)
1330 return ret_set_errno(false, ENOENT);
1331
1332 if (!ops->hierarchies)
1333 return true;
1334
1335 if (ops->monitor_cgroup)
1336 return ret_set_errno(false, EEXIST);
1337
1338 if (!handler || !handler->conf)
1339 return ret_set_errno(false, EINVAL);
1340
1341 conf = handler->conf;
1342
1343 if (!check_cgroup_dir_config(conf))
1344 return false;
1345
1346 if (conf->cgroup_meta.monitor_dir) {
1347 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
1348 } else if (conf->cgroup_meta.dir) {
1349 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1350 DEFAULT_MONITOR_CGROUP_PREFIX,
1351 handler->name,
1352 CGROUP_CREATE_RETRY, NULL);
1353 } else if (ops->cgroup_pattern) {
1354 __do_free char *cgroup_tree = NULL;
1355
1356 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1357 if (!cgroup_tree)
1358 return ret_set_errno(false, ENOMEM);
1359
1360 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1361 DEFAULT_MONITOR_CGROUP,
1362 CGROUP_CREATE_RETRY, NULL);
1363 } else {
1364 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1365 handler->name,
1366 CGROUP_CREATE_RETRY, NULL);
1367 }
1368 if (!monitor_cgroup)
1369 return ret_set_errno(false, ENOMEM);
1370
1371 if (!conf->cgroup_meta.monitor_dir) {
1372 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1373 *suffix = '\0';
1374 }
1375 do {
1376 if (idx && suffix)
1377 sprintf(suffix, "-%d", idx);
1378
1379 for (i = 0; ops->hierarchies[i]; i++) {
1380 if (cgroup_tree_create(ops, handler->conf,
1381 ops->hierarchies[i],
1382 monitor_cgroup, NULL, false))
1383 continue;
1384
1385 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1386 for (int j = 0; j <= i; j++)
1387 cgroup_tree_prune_leaf(ops->hierarchies[j],
1388 monitor_cgroup, false);
1389
1390 idx++;
1391 break;
1392 }
1393 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1394
1395 if (idx == 1000 || (!suffix && idx != 0))
1396 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1397
1398 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1399 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1400 }
1401
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * Note the aliasing at play: container_cgroup and __limit_cgroup are the
 * owning (__do_free) pointers while limit_cgroup is a non-owning alias. They
 * only differ when lxc.cgroup.dir.container together with
 * lxc.cgroup.dir.namespace requests an isolated (limit + leaf) layout.
 * On success ops->container_cgroup and ops->container_limit_cgroup are set
 * (aliasing each other when no isolation is used).
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Isolation: the container's leaf cgroup lives in a
			 * namespace directory below the limit cgroup. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* Cut off the retry-suffix placeholder; each retry rewrites it with
	 * "-<idx>". Explicitly configured directories are used verbatim. */
	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			/* Roll back all hierarchies created so far before retrying. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1511
/*
 * Move the monitor process — and, when set, the transient startup process —
 * into the previously created monitor cgroup of every hierarchy.
 *
 * Returns true on success, false otherwise (with errno set).
 */
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

		TRACE("Moved monitor into cgroup %d", h->cgfd_mon);

		/* Without a transient process there is nothing more to move;
		 * keep cgfd_mon open for later use. */
		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

		TRACE("Moved transient process into cgroup %d", h->cgfd_mon);

		/*
		 * we don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->cgfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}
1573
1574 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1575 struct lxc_handler *handler)
1576 {
1577 int len;
1578 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1579
1580 if (!ops)
1581 return ret_set_errno(false, ENOENT);
1582
1583 if (!ops->hierarchies)
1584 return true;
1585
1586 if (!ops->container_cgroup)
1587 return ret_set_errno(false, ENOENT);
1588
1589 if (!handler || !handler->conf)
1590 return ret_set_errno(false, EINVAL);
1591
1592 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1593 if (len < 0)
1594 return false;
1595
1596 for (int i = 0; ops->hierarchies[i]; i++) {
1597 struct hierarchy *h = ops->hierarchies[i];
1598 int ret;
1599
1600 if (is_unified_hierarchy(h) &&
1601 (handler->clone_flags & CLONE_INTO_CGROUP))
1602 continue;
1603
1604 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
1605 if (ret != 0)
1606 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
1607
1608 TRACE("Moved container into %s cgroup via %d", h->container_full_path, h->cgfd_con);
1609 }
1610
1611 return true;
1612 }
1613
1614 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1615 gid_t chown_gid, mode_t chmod_mode)
1616 {
1617 int ret;
1618
1619 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1620 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1621 if (ret < 0)
1622 return log_warn_errno(-1,
1623 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1624 dirfd, path, (int)chown_uid,
1625 (int)chown_gid);
1626
1627 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1628 if (ret < 0)
1629 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1630 dirfd, path, (int)chmod_mode);
1631
1632 return 0;
1633 }
1634
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs inside the container's user namespace via userns_exec_1(); @data is
 * a struct generic_userns_exec_data. Returns 0 on success, -1 on failure.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root id mapping, uid/gid 0 inside the namespace is the
	 * container owner; otherwise fall back to the configured init ids. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* EPERM is tolerated: dropping groups can fail in unprivileged setups. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Drop group ids before user ids; the reverse order would lose the
	 * privilege needed for setresgid(). */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original (host) euid into this namespace; fall back to
	 * root if it is unmapped. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->cgfd_con;

		/* Empty path: chown/chmod the cgroup directory itself. */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		/* Additional cgroup2 files the container must be able to write. */
		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1694
1695 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1696 struct lxc_conf *conf)
1697 {
1698 struct generic_userns_exec_data wrap;
1699
1700 if (!ops)
1701 return ret_set_errno(false, ENOENT);
1702
1703 if (!ops->hierarchies)
1704 return true;
1705
1706 if (!ops->container_cgroup)
1707 return ret_set_errno(false, ENOENT);
1708
1709 if (!conf)
1710 return ret_set_errno(false, EINVAL);
1711
1712 if (lxc_list_empty(&conf->id_map))
1713 return true;
1714
1715 wrap.origuid = geteuid();
1716 wrap.path = NULL;
1717 wrap.hierarchies = ops->hierarchies;
1718 wrap.conf = conf;
1719
1720 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1721 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1722
1723 return true;
1724 }
1725
1726 __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
1727 {
1728 if (!ops)
1729 return;
1730
1731 if (!ops->hierarchies)
1732 return;
1733
1734 for (int i = 0; ops->hierarchies[i]; i++) {
1735 struct hierarchy *h = ops->hierarchies[i];
1736 /*
1737 * we don't keep the fds for non-unified hierarchies around
1738 * mainly because we don't make use of them anymore after the
1739 * core cgroup setup is done but also because there are quite a
1740 * lot of them.
1741 */
1742 if (!is_unified_hierarchy(h))
1743 close_prot_errno_disarm(h->cgfd_con);
1744 }
1745
1746 /*
1747 * The checking for freezer support should obviously be done at cgroup
1748 * initialization time but that doesn't work reliable. The freezer
1749 * controller has been demoted (rightly so) to a simple file located in
1750 * each non-root cgroup. At the time when the container is created we
1751 * might still be located in /sys/fs/cgroup and so checking for
1752 * cgroup.freeze won't tell us anything because this file doesn't exist
1753 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1754 * find an already existing cgroup and then check within that cgroup
1755 * for the existence of cgroup.freeze but that will only work on
1756 * systemd based hosts. Other init systems might not manage cgroups and
1757 * so no cgroup will exist. So we defer until we have created cgroups
1758 * for our container which means we check here.
1759 */
1760 if (pure_unified_layout(ops) &&
1761 !faccessat(ops->unified->cgfd_con, "cgroup.freeze", F_OK,
1762 AT_SYMLINK_NOFOLLOW)) {
1763 TRACE("Unified hierarchy supports freezer");
1764 ops->unified->freezer_controller = 1;
1765 }
1766 }
1767
1768 /* cgroup-full:* is done, no need to create subdirs */
1769 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1770 {
1771 switch (cgroup_automount_type) {
1772 case LXC_AUTO_CGROUP_RO:
1773 return true;
1774 case LXC_AUTO_CGROUP_RW:
1775 return true;
1776 case LXC_AUTO_CGROUP_MIXED:
1777 return true;
1778 }
1779
1780 return false;
1781 }
1782
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * Returns 0 on success, -1 on failure (with errno left from the failing
 * mount(2) call).
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* For ro/mixed mounts the controller mountpoint itself becomes
	 * read-only; a bind mount is required first so it can be remounted. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind the container's own cgroup directory onto the target path;
	 * with mixed mounts it stays writable even though the parent is ro. */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* MS_RDONLY only takes effect on a remount of the bind mount. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1837
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 *
 * Uses the new mount API (fsopen/fsconfig/move_mount) when available and
 * falls back to classic mount(2) otherwise. Returns 0 on success, a negative
 * errno-style value on failure.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	/* Both the plain and the "full" read-only variants mount ro. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		/* Legacy hierarchies need each controller (or the named
		 * hierarchy) configured on the filesystem context. */
		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		/* Legacy mount(2) path: controllers are passed as mount data. */
		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* flags into their MS_* equivalents. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1916
/* Mount the cgroup hierarchy @h fresh (non-bind) for any automount type. */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1924
1925 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1926 struct lxc_rootfs *rootfs,
1927 int dfd_mnt_cgroupfs,
1928 const char *hierarchy_mnt)
1929 {
1930 switch (cgroup_automount_type) {
1931 case LXC_AUTO_CGROUP_FULL_RO:
1932 break;
1933 case LXC_AUTO_CGROUP_FULL_RW:
1934 break;
1935 case LXC_AUTO_CGROUP_FULL_MIXED:
1936 break;
1937 default:
1938 return 0;
1939 }
1940
1941 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1942 dfd_mnt_cgroupfs, hierarchy_mnt);
1943 }
1944
1945 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1946 struct lxc_handler *handler, int cg_flags)
1947 {
1948 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1949 __do_free char *cgroup_root = NULL;
1950 int cgroup_automount_type;
1951 bool in_cgroup_ns = false, wants_force_mount = false;
1952 struct lxc_conf *conf = handler->conf;
1953 struct lxc_rootfs *rootfs = &conf->rootfs;
1954 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1955 int ret;
1956
1957 if (!ops)
1958 return ret_set_errno(false, ENOENT);
1959
1960 if (!ops->hierarchies)
1961 return true;
1962
1963 if (!conf)
1964 return ret_set_errno(false, EINVAL);
1965
1966 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1967 return log_trace(true, "No cgroup mounts requested");
1968
1969 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1970 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1971 wants_force_mount = true;
1972 }
1973
1974 switch (cg_flags) {
1975 case LXC_AUTO_CGROUP_RO:
1976 TRACE("Read-only cgroup mounts requested");
1977 break;
1978 case LXC_AUTO_CGROUP_RW:
1979 TRACE("Read-write cgroup mounts requested");
1980 break;
1981 case LXC_AUTO_CGROUP_MIXED:
1982 TRACE("Mixed cgroup mounts requested");
1983 break;
1984 case LXC_AUTO_CGROUP_FULL_RO:
1985 TRACE("Full read-only cgroup mounts requested");
1986 break;
1987 case LXC_AUTO_CGROUP_FULL_RW:
1988 TRACE("Full read-write cgroup mounts requested");
1989 break;
1990 case LXC_AUTO_CGROUP_FULL_MIXED:
1991 TRACE("Full mixed cgroup mounts requested");
1992 break;
1993 default:
1994 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1995 }
1996 cgroup_automount_type = cg_flags;
1997
1998 if (!wants_force_mount) {
1999 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
2000
2001 /*
2002 * Most recent distro versions currently have init system that
2003 * do support cgroup2 but do not mount it by default unless
2004 * explicitly told so even if the host is cgroup2 only. That
2005 * means they often will fail to boot. Fix this by pre-mounting
2006 * cgroup2 by default. We will likely need to be doing this a
2007 * few years until all distros have switched over to cgroup2 at
2008 * which point we can safely assume that their init systems
2009 * will mount it themselves.
2010 */
2011 if (pure_unified_layout(ops))
2012 wants_force_mount = true;
2013 }
2014
2015 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
2016 in_cgroup_ns = true;
2017
2018 if (in_cgroup_ns && !wants_force_mount)
2019 return log_trace(true, "Mounting cgroups not requested or needed");
2020
2021 /* This is really the codepath that we want. */
2022 if (pure_unified_layout(ops)) {
2023 __do_close int dfd_mnt_unified = -EBADF;
2024
2025 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2026 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2027 if (dfd_mnt_unified < 0)
2028 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
2029 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2030 /*
2031 * If cgroup namespaces are supported but the container will
2032 * not have CAP_SYS_ADMIN after it has started we need to mount
2033 * the cgroups manually.
2034 *
2035 * Note that here we know that wants_force_mount is true.
2036 * Otherwise we would've returned early above.
2037 */
2038 if (in_cgroup_ns) {
2039 /*
2040 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
2041 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
2042 * 3. cgroup:mixed:force -> See comment above how this
2043 * does not apply so
2044 * cgroup:mixed is equal to
2045 * cgroup:rw when cgroup
2046 * namespaces are supported.
2047
2048 * 4. cgroup:rw -> No-op; init system responsible for mounting.
2049 * 5. cgroup:ro -> No-op; init system responsible for mounting.
2050 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
2051 *
2052 * 7. cgroup-full:rw -> Not supported.
2053 * 8. cgroup-full:ro -> Not supported.
2054 * 9. cgroup-full:mixed -> Not supported.
2055
2056 * 10. cgroup-full:rw:force -> Not supported.
2057 * 11. cgroup-full:ro:force -> Not supported.
2058 * 12. cgroup-full:mixed:force -> Not supported.
2059 */
2060 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
2061 if (ret < 0)
2062 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
2063
2064 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
2065 } else {
2066 /*
2067 * Either no cgroup namespace supported (highly
2068 * unlikely unless we're dealing with a Frankenkernel.
2069 * Or the user requested to keep the cgroup namespace
2070 * of the host or another container.
2071 */
2072 if (wants_force_mount) {
2073 /*
2074 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
2075 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
2076 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
2077 * and make the parent directory of the
2078 * container's cgroup read-only but the
2079 * container's cgroup writable.
2080 *
2081 * 10. cgroup-full:rw:force ->
2082 * 11. cgroup-full:ro:force ->
2083 * 12. cgroup-full:mixed:force ->
2084 */
2085 errno = EOPNOTSUPP;
2086 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2087 } else {
2088 errno = EOPNOTSUPP;
2089 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2090 }
2091 }
2092
2093 return syserrno(false, "Failed to mount cgroups");
2094 }
2095
2096 /*
2097 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
2098 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
2099 * DEFAULT_CGROUP_MOUNTPOINT define.
2100 */
2101 if (can_use_mount_api()) {
2102 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
2103 if (fd_fs < 0)
2104 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
2105
2106 ret = fs_set_property(fd_fs, "mode", "0755");
2107 if (ret < 0)
2108 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
2109
2110 ret = fs_set_property(fd_fs, "size", "10240k");
2111 if (ret < 0)
2112 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
2113
2114 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2115 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
2116 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
2117 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
2118 } else {
2119 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2120 ret = safe_mount(NULL, cgroup_root, "tmpfs",
2121 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2122 "size=10240k,mode=755", rootfs_mnt);
2123 }
2124 if (ret < 0)
2125 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
2126 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2127
2128 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2129 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2130 if (dfd_mnt_tmpfs < 0)
2131 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
2132 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2133
2134 for (int i = 0; ops->hierarchies[i]; i++) {
2135 __do_free char *controllerpath = NULL, *path2 = NULL;
2136 struct hierarchy *h = ops->hierarchies[i];
2137 char *controller = strrchr(h->mountpoint, '/');
2138
2139 if (!controller)
2140 continue;
2141 controller++;
2142
2143 ret = mkdirat(dfd_mnt_tmpfs, controller, 0000);
2144 if (ret < 0)
2145 return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_tmpfs, controller);
2146
2147 if (in_cgroup_ns && wants_force_mount) {
2148 /*
2149 * If cgroup namespaces are supported but the container
2150 * will not have CAP_SYS_ADMIN after it has started we
2151 * need to mount the cgroups manually.
2152 */
2153 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2154 if (ret < 0)
2155 return false;
2156
2157 continue;
2158 }
2159
2160 /* Here is where the ancient kernel section begins. */
2161 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2162 if (ret < 0)
2163 return false;
2164
2165 if (!cg_mount_needs_subdirs(cgroup_automount_type))
2166 continue;
2167
2168 if (!cgroup_root)
2169 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2170
2171 controllerpath = must_make_path(cgroup_root, controller, NULL);
2172 path2 = must_make_path(controllerpath, h->container_base_path, ops->container_cgroup, NULL);
2173 ret = mkdir_p(path2, 0755);
2174 if (ret < 0 && (errno != EEXIST))
2175 return false;
2176
2177 ret = cg_legacy_mount_controllers(cgroup_automount_type, h, controllerpath, path2, ops->container_cgroup);
2178 if (ret < 0)
2179 return false;
2180 }
2181
2182 return true;
2183 }
2184
2185 /* Only root needs to escape to the cgroup of its init. */
2186 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
2187 struct lxc_conf *conf)
2188 {
2189 if (!ops)
2190 return ret_set_errno(false, ENOENT);
2191
2192 if (!ops->hierarchies)
2193 return true;
2194
2195 if (!conf)
2196 return ret_set_errno(false, EINVAL);
2197
2198 if (conf->cgroup_meta.relative || geteuid())
2199 return true;
2200
2201 for (int i = 0; ops->hierarchies[i]; i++) {
2202 __do_free char *fullpath = NULL;
2203 int ret;
2204
2205 fullpath =
2206 must_make_path(ops->hierarchies[i]->mountpoint,
2207 ops->hierarchies[i]->container_base_path,
2208 "cgroup.procs", NULL);
2209 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
2210 if (ret != 0)
2211 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
2212 }
2213
2214 return true;
2215 }
2216
2217 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
2218 {
2219 int i = 0;
2220
2221 if (!ops)
2222 return ret_set_errno(-1, ENOENT);
2223
2224 if (!ops->hierarchies)
2225 return 0;
2226
2227 for (; ops->hierarchies[i]; i++)
2228 ;
2229
2230 return i;
2231 }
2232
2233 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
2234 int n, char ***out)
2235 {
2236 int i;
2237
2238 if (!ops)
2239 return ret_set_errno(false, ENOENT);
2240
2241 if (!ops->hierarchies)
2242 return ret_set_errno(false, ENOENT);
2243
2244 /* sanity check n */
2245 for (i = 0; i < n; i++)
2246 if (!ops->hierarchies[i])
2247 return ret_set_errno(false, ENOENT);
2248
2249 *out = ops->hierarchies[i]->controllers;
2250
2251 return true;
2252 }
2253
2254 static bool cg_legacy_freeze(struct cgroup_ops *ops)
2255 {
2256 struct hierarchy *h;
2257
2258 h = get_hierarchy(ops, "freezer");
2259 if (!h)
2260 return ret_set_errno(-1, ENOENT);
2261
2262 return lxc_write_openat(h->container_full_path, "freezer.state",
2263 "FROZEN", STRLITERALLEN("FROZEN"));
2264 }
2265
2266 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
2267 struct lxc_epoll_descr *descr)
2268 {
2269 __do_free char *line = NULL;
2270 __do_fclose FILE *f = NULL;
2271 int state = PTR_TO_INT(cbdata);
2272 size_t len;
2273 const char *state_string;
2274
2275 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
2276 if (!f)
2277 return LXC_MAINLOOP_ERROR;
2278
2279 if (state == 1)
2280 state_string = "frozen 1";
2281 else
2282 state_string = "frozen 0";
2283
2284 while (getline(&line, &len, f) != -1)
2285 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
2286 return LXC_MAINLOOP_CLOSE;
2287
2288 rewind(f);
2289
2290 return LXC_MAINLOOP_CONTINUE;
2291 }
2292
/*
 * Write @state_string ("0" or "1") to cgroup.freeze on the unified
 * hierarchy and, unless @timeout is 0, wait on an epoll mainloop for
 * cgroup.events (EPOLLPRI) to report the requested state.
 *
 * @timeout:      0 means fire-and-forget; otherwise passed to
 *                lxc_mainloop() as the wait timeout.
 * @state_num:    1 for freeze, 0 for unfreeze; forwarded to the
 *                events callback.
 * @epoll_error:  message logged if the mainloop cannot be created.
 * @wait_error:   message logged if waiting fails or times out.
 * Returns 0 on success, -1 (with errno set) on error.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	/* Set up the waiter before triggering the state change so no
	 * notification can be missed. */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
2341
/* Freeze via cgroup2's cgroup.freeze, optionally waiting @timeout. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
2348
2349 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2350 {
2351 if (!ops->hierarchies)
2352 return ret_set_errno(-1, ENOENT);
2353
2354 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2355 return cg_legacy_freeze(ops);
2356
2357 return cg_unified_freeze(ops, timeout);
2358 }
2359
2360 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2361 {
2362 struct hierarchy *h;
2363
2364 h = get_hierarchy(ops, "freezer");
2365 if (!h)
2366 return ret_set_errno(-1, ENOENT);
2367
2368 return lxc_write_openat(h->container_full_path, "freezer.state",
2369 "THAWED", STRLITERALLEN("THAWED"));
2370 }
2371
/* Thaw via cgroup2's cgroup.freeze, optionally waiting @timeout. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2378
2379 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2380 {
2381 if (!ops->hierarchies)
2382 return ret_set_errno(-1, ENOENT);
2383
2384 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2385 return cg_legacy_unfreeze(ops);
2386
2387 return cg_unified_unfreeze(ops, timeout);
2388 }
2389
2390 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2391 const char *controller, bool limiting)
2392 {
2393 struct hierarchy *h;
2394
2395 h = get_hierarchy(ops, controller);
2396 if (!h)
2397 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
2398 controller ? controller : "(null)");
2399
2400 if (limiting)
2401 return h->container_limit_path
2402 ? h->container_limit_path + strlen(h->mountpoint)
2403 : NULL;
2404
2405 return h->container_full_path
2406 ? h->container_full_path + strlen(h->mountpoint)
2407 : NULL;
2408 }
2409
2410 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2411 const char *controller)
2412 {
2413 return cgfsng_get_cgroup_do(ops, controller, false);
2414 }
2415
2416 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2417 const char *controller)
2418 {
2419 return cgfsng_get_cgroup_do(ops, controller, true);
2420 }
2421
2422 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2423 * which must be freed by the caller.
2424 */
2425 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2426 const char *inpath,
2427 const char *filename)
2428 {
2429 return must_make_path(h->mountpoint, inpath, filename, NULL);
2430 }
2431
/*
 * Attach @pid to the unified cgroup referred to by @unified_fd.
 *
 * First tries the ".lxc" leaf cgroup, then the cgroup itself. If the
 * write fails with EBUSY the cgroup is a non-leaf node (cgroup2 "no
 * internal processes" rule), so fall back to creating numbered
 * ".lxc-<idx>" leaf cgroups until one accepts the pid (up to 1000
 * attempts).
 *
 * Returns 0 on success, a negative value (with errno set) on error.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Try ".lxc" first, then the cgroup directory itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily truncate at the '/' so only ".lxc-<idx>" is created. */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			rm = true;

		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		/* Only remove directories we created ourselves. */
		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2502
2503 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2504 int unified_fd, int *sk_fd)
2505 {
2506 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2507 int target_fds[2];
2508 ssize_t ret;
2509
2510 /* Create leaf cgroup. */
2511 ret = mkdirat(unified_fd, ".lxc", 0755);
2512 if (ret < 0 && errno != EEXIST)
2513 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2514
2515 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2516 if (target_fd0 < 0)
2517 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2518 target_fds[0] = target_fd0;
2519
2520 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2521 if (target_fd1 < 0)
2522 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2523 target_fds[1] = target_fd1;
2524
2525 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2526 if (ret <= 0)
2527 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2528 target_fd0, target_fd1);
2529
2530 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2531 }
2532
2533 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2534 int *sk_fd, pid_t pid)
2535 {
2536 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2537 int target_fds[2];
2538 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2539 size_t pidstr_len;
2540 ssize_t ret;
2541
2542 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
2543 if (ret <= 0)
2544 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2545 target_fd0 = target_fds[0];
2546 target_fd1 = target_fds[1];
2547
2548 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2549
2550 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2551 if (ret > 0 && ret == pidstr_len)
2552 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2553
2554 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2555 if (ret > 0 && ret == pidstr_len)
2556 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2557
2558 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2559 target_fd0, target_fd1);
2560 }
2561
/* Argument bundle shared by the userns attach parent/child wrappers. */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container configuration */
	int unified_fd;			/* fd of the unified cgroup directory */
	int sk_pair[2];			/* socketpair: [0] used by parent wrapper, [1] by child wrapper */
	pid_t pid;			/* pid to move into the target cgroup */
};
2568
2569 static int cgroup_unified_attach_child_wrapper(void *data)
2570 {
2571 struct userns_exec_unified_attach_data *args = data;
2572
2573 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2574 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2575 return ret_errno(EINVAL);
2576
2577 close_prot_errno_disarm(args->sk_pair[0]);
2578 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2579 &args->sk_pair[1]);
2580 }
2581
2582 static int cgroup_unified_attach_parent_wrapper(void *data)
2583 {
2584 struct userns_exec_unified_attach_data *args = data;
2585
2586 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2587 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2588 return ret_errno(EINVAL);
2589
2590 close_prot_errno_disarm(args->sk_pair[1]);
2591 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2592 args->pid);
2593 }
2594
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Prefer letting the container's command server attach for us. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	if (ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = must_make_path(h->mountpoint, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/* With an id map we must attach from inside the user namespace. */
	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2656
/*
 * Attach @pid to the container's cgroup in every mounted hierarchy.
 *
 * cgroup2 hierarchies go through __cg_unified_attach() (command API
 * first, then manual leaf attach); legacy hierarchies get the pid
 * written directly into their cgroup.procs file.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* No hierarchies means there is nothing to attach to. */
	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->version == CGROUP2_SUPER_MAGIC) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		/* Legacy hierarchy: ask the running container for its path. */
		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		/* not running */
		if (!path)
			return false;

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2702
2703 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2704 * don't have a cgroup_data set up, so we ask the running container through the
2705 * commands API for the cgroup path.
2706 */
2707 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2708 char *value, size_t len, const char *name,
2709 const char *lxcpath)
2710 {
2711 __do_free char *path = NULL;
2712 __do_free char *controller = NULL;
2713 char *p;
2714 struct hierarchy *h;
2715 int ret = -1;
2716
2717 if (!ops)
2718 return ret_set_errno(-1, ENOENT);
2719
2720 controller = must_copy_string(filename);
2721 p = strchr(controller, '.');
2722 if (p)
2723 *p = '\0';
2724
2725 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2726 /* not running */
2727 if (!path)
2728 return -1;
2729
2730 h = get_hierarchy(ops, controller);
2731 if (h) {
2732 __do_free char *fullpath = NULL;
2733
2734 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2735 ret = lxc_read_from_file(fullpath, value, len);
2736 }
2737
2738 return ret;
2739 }
2740
2741 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2742 {
2743 for (int count = 0; count < 3; count++, val++) {
2744 switch (*val) {
2745 case 'r':
2746 device->access[count] = *val;
2747 break;
2748 case 'w':
2749 device->access[count] = *val;
2750 break;
2751 case 'm':
2752 device->access[count] = *val;
2753 break;
2754 case '\n':
2755 case '\0':
2756 count = 3;
2757 break;
2758 default:
2759 return ret_errno(EINVAL);
2760 }
2761 }
2762
2763 return 0;
2764 }
2765
/*
 * Parse a devices cgroup rule of the form "<type> <major>:<minor> <access>"
 * (e.g. "c 1:3 rwm") or the global rule "a" into @device.
 *
 * @key selects allow vs. deny ("devices.allow" sets device->allow = 1).
 * A bare "a" value becomes a global allow-all/deny-all rule; anything
 * else is a local rule with type 'a', 'b' or 'c', a numeric or '*'
 * major and minor, and an access string parsed by
 * device_cgroup_parse_access().
 *
 * Returns 0 on success, -1 (or -EINVAL from the access parser) on
 * malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strequal("devices.allow", key))
		device->allow = 1; /* allow the device */
	else
		device->allow = 0; /* deny the device */

	if (strequal(val, "a")) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;

		if (device->allow) /* allow all devices */
			device->global_rule = LXC_BPF_DEVICE_CGROUP_DENYLIST;
		else /* deny all devices */
			device->global_rule = LXC_BPF_DEVICE_CGROUP_ALLOWLIST;

		/* -1 marks this as a global rule, not a per-device one. */
		device->allow = -1;
		return 0;
	}

	/* local rule */
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* A single space must separate type and major. */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: '*' for any, else a decimal number */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* The rest (past the separating space) is the access string. */
	return device_cgroup_parse_access(device, ++val);
}
2855
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 *
 * On a pure cgroup2 layout, "devices.*" keys are translated into bpf
 * device rules and forwarded to the container's command server instead
 * of being written to a file.
 *
 * Returns 0 on success, negative on error.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops || is_empty_string(key) || is_empty_string(value) ||
	    is_empty_string(name) || is_empty_string(lxcpath))
		return ret_errno(EINVAL);

	/* The controller name is everything before the first '.'. */
	controller = must_copy_string(key);
	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	if (pure_unified_layout(ops) && strequal(controller, "devices")) {
		struct device_item device = {};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2909
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
 * the output.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	path = must_copy_string(devpath);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		while (*p == ' ')
			p++;

		mode = p;

		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* Look up the device node to learn its type and major:minor. */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		/* Only block and character devices can be delegated. */
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	return 0;
}
2981
2982 static int convert_devpath(const char *invalue, char *dest)
2983 {
2984 struct device_item device = {};
2985 int ret;
2986
2987 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2988 if (ret < 0)
2989 return -1;
2990
2991 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2992 device.minor, device.access);
2993 if (ret < 0)
2994 return log_error_errno(ret, -ret,
2995 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2996 device.type, device.major, device.minor,
2997 device.access);
2998
2999 return 0;
3000 }
3001
3002 /* Called from setup_limits - here we have the container's cgroup_data because
3003 * we created the cgroups.
3004 */
3005 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
3006 const char *value, bool is_cpuset)
3007 {
3008 __do_free char *controller = NULL;
3009 char *p;
3010 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
3011 char converted_value[50];
3012 struct hierarchy *h;
3013
3014 controller = must_copy_string(filename);
3015 p = strchr(controller, '.');
3016 if (p)
3017 *p = '\0';
3018
3019 if (strequal("devices.allow", filename) && value[0] == '/') {
3020 int ret;
3021
3022 ret = convert_devpath(value, converted_value);
3023 if (ret < 0)
3024 return ret;
3025 value = converted_value;
3026 }
3027
3028 h = get_hierarchy(ops, controller);
3029 if (!h)
3030 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
3031
3032 if (is_cpuset) {
3033 int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
3034 if (ret)
3035 return ret;
3036 }
3037 return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
3038 }
3039
/*
 * Apply the lxc.cgroup.* (legacy/v1) limits from @conf.
 *
 * @do_devices selects whether this pass applies only the devices
 * controller settings (true) or only the non-devices ones (false);
 * setup runs in two passes. Device-setting failures with EACCES/EPERM
 * are downgraded to warnings since they are expected in some
 * unprivileged setups. On pure cgroup2 systems legacy limits are
 * ignored with a warning.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	__do_free struct lxc_list *sorted_cgroup_settings = NULL;
	struct lxc_list *cgroup_settings = &conf->cgroup;
	struct lxc_list *iterator, *next;
	struct lxc_cgroup *cg;
	bool ret = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	if (pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");

	/* Sorted copy; the list nodes are freed in the cleanup loop below. */
	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
	if (!sorted_cgroup_settings)
		return false;

	lxc_list_for_each(iterator, sorted_cgroup_settings) {
		cg = iterator->elem;

		/* Apply only devices settings or only non-devices ones,
		 * depending on which pass this is. */
		if (do_devices == strnequal("devices", cg->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
				goto out;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
		}
	}

	ret = true;
	INFO("Limits for the legacy cgroup hierarchies have been setup");
out:
	/* Free the sorted list's nodes (elements are owned by @conf). */
	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
		lxc_list_del(iterator);
		free(iterator);
	}

	return ret;
}
3096
3097 /*
3098 * Some of the parsing logic comes from the original cgroup device v1
3099 * implementation in the kernel.
3100 */
3101 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
3102 struct lxc_conf *conf, const char *key,
3103 const char *val)
3104 {
3105 struct device_item device_item = {};
3106 int ret;
3107
3108 if (strequal("devices.allow", key) && *val == '/')
3109 ret = device_cgroup_rule_parse_devpath(&device_item, val);
3110 else
3111 ret = device_cgroup_rule_parse(&device_item, key, val);
3112 if (ret < 0)
3113 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
3114
3115 ret = bpf_list_add_device(&conf->devices, &device_item);
3116 if (ret < 0)
3117 return -1;
3118 return 0;
3119 }
3120
/*
 * Apply the lxc.cgroup2.* limits from the handler's config to the
 * unified hierarchy. "devices" keys are collected as bpf device rules
 * via bpf_device_cgroup_prepare(); everything else is written directly
 * into the container's limit cgroup. On non-unified layouts cgroup2
 * limits are ignored with a warning.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct lxc_list *cgroup_settings, *iterator;
	struct hierarchy *h;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	lxc_list_for_each (iterator, cgroup_settings) {
		struct lxc_cgroup *cg = iterator->elem;
		int ret;

		if (strnequal("devices", cg->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
		else
			ret = lxc_write_openat(h->container_limit_path, cg->subsystem, cg->value, strlen(cg->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);

		TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
3168
3169 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
3170 {
3171 struct lxc_conf *conf;
3172 struct hierarchy *unified;
3173
3174 if (!ops)
3175 return ret_set_errno(false, ENOENT);
3176
3177 if (!ops->hierarchies)
3178 return true;
3179
3180 if (!ops->container_cgroup)
3181 return ret_set_errno(false, EEXIST);
3182
3183 if (!handler || !handler->conf)
3184 return ret_set_errno(false, EINVAL);
3185 conf = handler->conf;
3186
3187 unified = ops->unified;
3188 if (!unified || !unified->bpf_device_controller ||
3189 !unified->container_full_path || lxc_list_empty(&conf->devices))
3190 return true;
3191
3192 return bpf_cgroup_devices_attach(ops, &conf->devices);
3193 }
3194
3195 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
3196 {
3197 __do_close int dfd_final = -EBADF;
3198 __do_free char *add_controllers = NULL, *copy = NULL;
3199 size_t full_len = 0;
3200 struct hierarchy *unified;
3201 int dfd_cur, ret;
3202 char *cur;
3203 char **it;
3204
3205 if (!ops->hierarchies || !pure_unified_layout(ops))
3206 return true;
3207
3208 unified = ops->unified;
3209 if (!unified->controllers[0])
3210 return true;
3211
3212 /* For now we simply enable all controllers that we have detected by
3213 * creating a string like "+memory +pids +cpu +io".
3214 * TODO: In the near future we might want to support "-<controller>"
3215 * etc. but whether supporting semantics like this make sense will need
3216 * some thinking.
3217 */
3218 for (it = unified->controllers; it && *it; it++) {
3219 full_len += strlen(*it) + 2;
3220 add_controllers = must_realloc(add_controllers, full_len + 1);
3221
3222 if (unified->controllers[0] == *it)
3223 add_controllers[0] = '\0';
3224
3225 (void)strlcat(add_controllers, "+", full_len + 1);
3226 (void)strlcat(add_controllers, *it, full_len + 1);
3227
3228 if ((it + 1) && *(it + 1))
3229 (void)strlcat(add_controllers, " ", full_len + 1);
3230 }
3231
3232 copy = strdup(cgroup);
3233 if (!copy)
3234 return false;
3235
3236 /*
3237 * Placing the write to cgroup.subtree_control before the open() is
3238 * intentional because of the cgroup2 delegation model. It enforces
3239 * that leaf cgroups don't have any controllers enabled for delegation.
3240 */
3241 dfd_cur = unified->dfd_base;
3242 lxc_iterate_parts(cur, copy, "/") {
3243 /*
3244 * Even though we vetted the paths when we parsed the config
3245 * we're paranoid here and check that the path is neither
3246 * absolute nor walks upwards.
3247 */
3248 if (abspath(cur))
3249 return syserrno_set(-EINVAL, "No absolute paths allowed");
3250
3251 if (strnequal(cur, "..", STRLITERALLEN("..")))
3252 return syserrno_set(-EINVAL, "No upward walking paths allowed");
3253
3254 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
3255 if (ret < 0)
3256 return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3257
3258 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3259
3260 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
3261 if (dfd_final < 0)
3262 return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
3263 if (dfd_cur != unified->dfd_base)
3264 close(dfd_cur);
3265 /*
3266 * Leave dfd_final pointing to the last fd we opened so
3267 * it will be automatically zapped if we return early.
3268 */
3269 dfd_cur = dfd_final;
3270 }
3271
3272 return true;
3273 }
3274
3275 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
3276 {
3277 if (!ops)
3278 return ret_set_errno(false, ENOENT);
3279
3280 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
3281 }
3282
3283 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
3284 {
3285 if (!ops)
3286 return ret_set_errno(false, ENOENT);
3287
3288 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
3289 }
3290
3291 static void cg_unified_delegate(char ***delegate)
3292 {
3293 __do_free char *buf = NULL;
3294 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
3295 char *token;
3296 int idx;
3297
3298 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
3299 if (!buf) {
3300 for (char **p = standard; p && *p; p++) {
3301 idx = append_null_to_list((void ***)delegate);
3302 (*delegate)[idx] = must_copy_string(*p);
3303 }
3304 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
3305 return;
3306 }
3307
3308 lxc_iterate_parts(token, buf, " \t\n") {
3309 /*
3310 * We always need to chown this for both cgroup and
3311 * cgroup2.
3312 */
3313 if (strequal(token, "cgroup.procs"))
3314 continue;
3315
3316 idx = append_null_to_list((void ***)delegate);
3317 (*delegate)[idx] = must_copy_string(token);
3318 }
3319 }
3320
/* At startup, parse_hierarchies finds all the info we need about cgroup
 * mountpoints and current cgroups, and stores it in @d.
 */
/*
 * Initialize the cgroup hierarchies for a legacy (v1) or hybrid (v1+v2)
 * layout by walking /proc/self/mountinfo. For every cgroup mount found we
 * determine its controllers, mountpoint, the caller's current cgroup on it,
 * and whether that cgroup is writeable, then register it via add_hierarchy().
 * Also classifies ops->cgroup_layout (LEGACY/UNIFIED/HYBRID) as mounts are
 * discovered.
 *
 * @relative:     do not escape to PID 1's cgroup as base.
 * @unprivileged: container runs with an id mapping; request delegation files.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
{
	__do_free char *basecginfo = NULL, *line = NULL;
	__do_free_string_list char **klist = NULL, **nlist = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	/* Kernel-known (klist) and named-only (nlist) v1 controllers. */
	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);

	while (getline(&line, &len, f) != -1) {
		__do_free char *base_cgroup = NULL, *mountpoint = NULL;
		__do_free_string_list char **controller_list = NULL;
		int type;
		bool writeable;

		/* 0 means "not a cgroup mount"; otherwise the magic number. */
		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only a single unified hierarchy can exist. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		/* State machine tracking what mix of v1/v2 mounts we've seen. */
		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		/* For v2 this may legitimately be NULL (no per-mount list). */
		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list)) {
				TRACE("Skipping duplicating controller");
				continue;
			}

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint) {
			WARN("Failed parsing mountpoint from \"%s\"", line);
			continue;
		}

		if (type == CGROUP_SUPER_MAGIC)
			base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		else
			base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		if (!base_cgroup) {
			WARN("Failed to find current cgroup");
			continue;
		}

		if (type == CGROUP2_SUPER_MAGIC)
			writeable = test_writeable_v2(mountpoint, base_cgroup);
		else
			writeable = test_writeable_v1(mountpoint, base_cgroup);
		if (!writeable) {
			TRACE("The %s group is not writeable", base_cgroup);
			continue;
		}

		/* add_hierarchy() takes ownership of the moved pointers. */
		if (type == CGROUP2_SUPER_MAGIC)
			ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
		else
			ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
		if (ret)
			return syserrno(ret, "Failed to add cgroup hierarchy");
		if (ops->unified && unprivileged)
			cg_unified_delegate(&(ops->unified)->cgroup2_chown);
	}

	/* verify that all controllers in cgroup.use and all crucial
	 * controllers are accounted for
	 */
	if (!all_controllers_found(ops))
		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");

	return 0;
}
3430
3431 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
3432 static char *cg_unified_get_current_cgroup(bool relative)
3433 {
3434 __do_free char *basecginfo = NULL, *copy = NULL;
3435 char *base_cgroup;
3436
3437 if (!relative && (geteuid() == 0))
3438 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3439 else
3440 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3441 if (!basecginfo)
3442 return NULL;
3443
3444 base_cgroup = strstr(basecginfo, "0::/");
3445 if (!base_cgroup)
3446 return NULL;
3447
3448 base_cgroup = base_cgroup + 3;
3449 copy = copy_to_eol(base_cgroup);
3450 if (!copy)
3451 return NULL;
3452 trim(copy);
3453
3454 if (!relative) {
3455 base_cgroup = prune_init_scope(copy);
3456 if (!base_cgroup)
3457 return NULL;
3458 } else {
3459 base_cgroup = copy;
3460 }
3461
3462 if (abspath(base_cgroup))
3463 base_cgroup = deabs(base_cgroup);
3464
3465 /* We're allowing base_cgroup to be "". */
3466 return strdup(base_cgroup);
3467 }
3468
3469 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3470 bool unprivileged)
3471 {
3472 __do_free char *base_cgroup = NULL;
3473 int ret;
3474
3475 base_cgroup = cg_unified_get_current_cgroup(relative);
3476 if (!base_cgroup)
3477 return ret_errno(EINVAL);
3478
3479 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3480 * we should verify here. The reason I'm not doing it right is that I'm
3481 * not convinced that lxc.cgroup.use will be the future since it is a
3482 * global property. I much rather have an option that lets you request
3483 * controllers per container.
3484 */
3485
3486 ret = add_hierarchy(ops, NULL,
3487 must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
3488 move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
3489 if (ret)
3490 return syserrno(ret, "Failed to add unified cgroup hierarchy");
3491
3492 if (unprivileged)
3493 cg_unified_delegate(&(ops->unified)->cgroup2_chown);
3494
3495 if (bpf_devices_cgroup_supported())
3496 ops->unified->bpf_device_controller = 1;
3497
3498 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3499 return CGROUP2_SUPER_MAGIC;
3500 }
3501
/*
 * Common cgroup driver initialization: open the host cgroupfs mountpoint,
 * record requested controllers from lxc.cgroup.use, and dispatch to the
 * unified or hybrid init path depending on what is mounted.
 *
 * Returns 0 on success, negative errno-style value on failure.
 */
static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
{
	__do_close int dfd = -EBADF;
	bool relative = conf->cgroup_meta.relative;
	int ret;
	const char *tmp;

	/* Guard against double initialization. */
	if (ops->dfd_mnt_cgroupfs_host >= 0)
		return ret_errno(EINVAL);

	/*
	 * I don't see the need for allowing symlinks here. If users want to
	 * have their hierarchy available in different locations I strongly
	 * suggest bind-mounts.
	 */
	dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
		      PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
	if (dfd < 0)
		return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);

	/* Record the comma-separated controllers the user asked for. */
	tmp = lxc_global_config_value("lxc.cgroup.use");
	if (tmp) {
		__do_free char *pin = NULL;
		char *chop, *cur;

		pin = must_copy_string(tmp);
		chop = pin;

		lxc_iterate_parts(cur, chop, ",")
			must_append_string(&ops->cgroup_use, cur);
	}

	/*
	 * Keep dfd referenced by the cleanup function and actually move the fd
	 * once we know the initialization succeeded. So if we fail we clean up
	 * the dfd.
	 */
	ops->dfd_mnt_cgroupfs_host = dfd;

	if (unified_cgroup_fd(dfd))
		ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
	else
		ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
	if (ret < 0)
		return syserrno(ret, "Failed to initialize cgroups");

	/* Transfer ownership to cgroup_ops. */
	move_fd(dfd);
	return 0;
}
3552
3553 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3554 {
3555 const char *cgroup_pattern;
3556
3557 if (!ops)
3558 return ret_set_errno(-1, ENOENT);
3559
3560 /* copy system-wide cgroup information */
3561 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3562 if (cgroup_pattern && !strequal(cgroup_pattern, ""))
3563 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3564
3565 return 0;
3566 }
3567
/*
 * Allocate and initialize the cgfsng cgroup driver: detect the host cgroup
 * layout via __cgroup_init() and wire up the driver's operations table.
 *
 * Returns a heap-allocated struct cgroup_ops owned by the caller, or NULL on
 * failure (errno set to ENOMEM on allocation failure; otherwise whatever
 * __cgroup_init() set).
 */
struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
{
	__do_free struct cgroup_ops *cgfsng_ops = NULL;

	cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
	if (!cgfsng_ops)
		return ret_set_errno(NULL, ENOMEM);

	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
	/* Marks the host cgroupfs fd as "not yet opened" for __cgroup_init(). */
	cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;

	if (__cgroup_init(cgfsng_ops, conf))
		return NULL;

	/* Operations table: lifecycle, limits, state, and criu hooks. */
	cgfsng_ops->data_init = cgfsng_data_init;
	cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
	cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
	cgfsng_ops->monitor_create = cgfsng_monitor_create;
	cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
	cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
	cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
	cgfsng_ops->payload_create = cgfsng_payload_create;
	cgfsng_ops->payload_enter = cgfsng_payload_enter;
	cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
	cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
	cgfsng_ops->get = cgfsng_get;
	cgfsng_ops->set = cgfsng_set;
	cgfsng_ops->freeze = cgfsng_freeze;
	cgfsng_ops->unfreeze = cgfsng_unfreeze;
	cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
	cgfsng_ops->setup_limits = cgfsng_setup_limits;
	cgfsng_ops->driver = "cgfsng";
	cgfsng_ops->version = "1.0.0";
	cgfsng_ops->attach = cgfsng_attach;
	cgfsng_ops->chown = cgfsng_chown;
	cgfsng_ops->mount = cgfsng_mount;
	cgfsng_ops->devices_activate = cgfsng_devices_activate;
	cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;

	cgfsng_ops->criu_escape = cgfsng_criu_escape;
	cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
	cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;

	/* Release __do_free ownership; the caller now owns the struct. */
	return move_ptr(cgfsng_ops);
}
3613
/*
 * Attach process @pid to the running container's unified cgroup, obtained via
 * the container's command socket. For id-mapped containers the attach is
 * performed from inside a minimal user namespace via userns_exec_minimal();
 * otherwise cgroup_attach_leaf() is used directly.
 *
 * Returns 0 on success, negative errno-style value on failure
 * (-ENOCGROUP2 when the container exposes no cgroup2 fd).
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	__do_close int unified_fd = -EBADF;
	int ret;

	if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
		return ret_errno(EINVAL);

	unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
	if (unified_fd < 0)
		return ret_errno(ENOCGROUP2);

	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		/* NOTE(review): args.sk_pair appears to be consumed (and
		 * presumably closed) by the parent/child wrappers — confirm
		 * against their implementations. */
		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
3649
3650 /* Connects to command socket therefore isn't callable from command handler. */
3651 int cgroup_get(const char *name, const char *lxcpath,
3652 const char *filename, char *buf, size_t len)
3653 {
3654 __do_close int unified_fd = -EBADF;
3655 ssize_t ret;
3656
3657 if (is_empty_string(filename) || is_empty_string(name) ||
3658 is_empty_string(lxcpath))
3659 return ret_errno(EINVAL);
3660
3661 if ((buf && !len) || (len && !buf))
3662 return ret_errno(EINVAL);
3663
3664 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3665 if (unified_fd < 0)
3666 return ret_errno(ENOCGROUP2);
3667
3668 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3669 if (ret < 0)
3670 SYSERROR("Failed to read cgroup value");
3671
3672 return ret;
3673 }
3674
3675 /* Connects to command socket therefore isn't callable from command handler. */
3676 int cgroup_set(const char *name, const char *lxcpath,
3677 const char *filename, const char *value)
3678 {
3679 __do_close int unified_fd = -EBADF;
3680 ssize_t ret;
3681
3682 if (is_empty_string(filename) || is_empty_string(value) ||
3683 is_empty_string(name) || is_empty_string(lxcpath))
3684 return ret_errno(EINVAL);
3685
3686 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3687 if (unified_fd < 0)
3688 return ret_errno(ENOCGROUP2);
3689
3690 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3691 struct device_item device = {};
3692
3693 ret = device_cgroup_rule_parse(&device, filename, value);
3694 if (ret < 0)
3695 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3696
3697 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3698 } else {
3699 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3700 }
3701
3702 return ret;
3703 }
3704
3705 static int do_cgroup_freeze(int unified_fd,
3706 const char *state_string,
3707 int state_num,
3708 int timeout,
3709 const char *epoll_error,
3710 const char *wait_error)
3711 {
3712 __do_close int events_fd = -EBADF;
3713 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3714 int ret;
3715 struct lxc_epoll_descr descr = {};
3716
3717 if (timeout != 0) {
3718 ret = lxc_mainloop_open(&descr);
3719 if (ret)
3720 return log_error_errno(-1, errno, "%s", epoll_error);
3721
3722 /* automatically cleaned up now */
3723 descr_ptr = &descr;
3724
3725 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3726 if (events_fd < 0)
3727 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3728
3729 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3730 if (ret < 0)
3731 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3732 }
3733
3734 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3735 if (ret < 0)
3736 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3737
3738 if (timeout != 0) {
3739 ret = lxc_mainloop(&descr, timeout);
3740 if (ret)
3741 return log_error_errno(-1, errno, "%s", wait_error);
3742 }
3743
3744 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3745 }
3746
/* Freeze the cgroup behind @unified_fd; "1" requests the frozen state. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "1", 1, timeout,
				"Failed to create epoll instance to wait for container freeze",
				"Failed to wait for container to be frozen");
}
3753
3754 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3755 {
3756 __do_close int unified_fd = -EBADF;
3757 int ret;
3758
3759 if (is_empty_string(name) || is_empty_string(lxcpath))
3760 return ret_errno(EINVAL);
3761
3762 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3763 if (unified_fd < 0)
3764 return ret_errno(ENOCGROUP2);
3765
3766 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3767 ret = __cgroup_freeze(unified_fd, timeout);
3768 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3769 return ret;
3770 }
3771
/* Unfreeze the cgroup behind @unified_fd; "0" requests the thawed state.
 *
 * Fixed copy-pasted log messages: they previously said "freeze"/"frozen"
 * even though this is the unfreeze path.
 */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3778
3779 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3780 {
3781 __do_close int unified_fd = -EBADF;
3782 int ret;
3783
3784 if (is_empty_string(name) || is_empty_string(lxcpath))
3785 return ret_errno(EINVAL);
3786
3787 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3788 if (unified_fd < 0)
3789 return ret_errno(ENOCGROUP2);
3790
3791 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3792 ret = __cgroup_unfreeze(unified_fd, timeout);
3793 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3794 return ret;
3795 }