/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <unistd.h>

#include "af_unix.h"
#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "conf.h"
#include "config.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "mount_utils.h"
#include "storage/storage.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"

#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif

#ifndef HAVE_STRLCAT
#include "include/strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 */
static int append_null_to_list(void ***list)
{
        int newentry = 0;

        if (*list)
                for (; (*list)[newentry]; newentry++)
                        ;

        *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
        (*list)[newentry + 1] = NULL;
        return newentry;
}

/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
        if (!list)
                return false;

        for (int i = 0; list[i]; i++)
                if (strequal(list[i], entry))
                        return true;

        return false;
}

/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
        size_t len;
        char *prefixed;

        len = strlen(entry);
        prefixed = must_realloc(NULL, len + 6);

        memcpy(prefixed, "name=", STRLITERALLEN("name="));
        memcpy(prefixed + STRLITERALLEN("name="), entry, len);
        prefixed[len + 5] = '\0';

        return prefixed;
}

/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
                                   char *entry)
{
        int newentry;
        char *copy;

        if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
                ERROR("Refusing to use ambiguous controller \"%s\"", entry);
                ERROR("It is both a named and kernel subsystem");
                return;
        }

        newentry = append_null_to_list((void ***)clist);

        if (strnequal(entry, "name=", 5))
                copy = must_copy_string(entry);
        else if (string_in_list(klist, entry))
                copy = must_copy_string(entry);
        else
                copy = cg_legacy_must_prefix_named(entry);

        (*clist)[newentry] = copy;
}

/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
        if (!ops->hierarchies)
                return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

        for (int i = 0; ops->hierarchies[i]; i++) {
                if (!controller) {
                        /* This is the empty unified hierarchy. */
                        if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
                                return ops->hierarchies[i];

                        continue;
                }

                /*
                 * Handle controllers with significant implementation changes
                 * from cgroup to cgroup2.
                 */
                if (pure_unified_layout(ops)) {
                        if (strequal(controller, "devices")) {
                                if (ops->unified->bpf_device_controller)
                                        return ops->unified;

                                break;
                        } else if (strequal(controller, "freezer")) {
                                if (ops->unified->freezer_controller)
                                        return ops->unified;

                                break;
                        }
                }

                if (string_in_list(ops->hierarchies[i]->controllers, controller))
                        return ops->hierarchies[i];
        }

        if (controller)
                WARN("There is no useable %s controller", controller);
        else
                WARN("There is no empty unified cgroup hierarchy");

        return ret_set_errno(NULL, ENOENT);
}

/* Taken over and modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

static void set_bit(unsigned bit, uint32_t *bitarr)
{
        bitarr[bit / NBITS] |= (1 << (bit % NBITS));
}

static void clear_bit(unsigned bit, uint32_t *bitarr)
{
        bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
}

static bool is_set(unsigned bit, uint32_t *bitarr)
{
        return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
}
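
/*
 * Illustrative example (not part of the original source): with NBITS == 32,
 * set_bit(33, bitarr) sets bit 1 of bitarr[1] (33 / 32 == 1, 33 % 32 == 1),
 * i.e. bitarr[1] |= 0x2, and is_set(33, bitarr) then returns true.
 */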

/* Create cpumask from cpulist aka turn:
 *
 *      0,2-3
 *
 * into bit array
 *
 *      1 0 1 1
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
        __do_free uint32_t *bitarr = NULL;
        char *token;
        size_t arrlen;

        arrlen = BITS_TO_LONGS(nbits);
        bitarr = calloc(arrlen, sizeof(uint32_t));
        if (!bitarr)
                return ret_set_errno(NULL, ENOMEM);

        lxc_iterate_parts(token, buf, ",") {
                errno = 0;
                unsigned end, start;
                char *range;

                start = strtoul(token, NULL, 0);
                end = start;
                range = strchr(token, '-');
                if (range)
                        end = strtoul(range + 1, NULL, 0);

                if (!(start <= end))
                        return ret_set_errno(NULL, EINVAL);

                if (end >= nbits)
                        return ret_set_errno(NULL, EINVAL);

                while (start <= end)
                        set_bit(start++, bitarr);
        }

        return move_ptr(bitarr);
}
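
/*
 * Illustrative example (not part of the original source): for the cpulist
 * "0,2-3" with nbits == 4, lxc_cpumask() sets bits 0, 2 and 3, i.e.
 * bitarr[0] == 0xD. Feeding that mask through lxc_cpumask_to_cpulist()
 * below yields the expanded list "0,2,3".
 */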

/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
        __do_free_string_list char **cpulist = NULL;
        char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
        int ret;

        for (size_t i = 0; i <= nbits; i++) {
                if (!is_set(i, bitarr))
                        continue;

                ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
                if (ret < 0)
                        return NULL;

                ret = lxc_append_string(&cpulist, numstr);
                if (ret < 0)
                        return ret_set_errno(NULL, ENOMEM);
        }

        if (!cpulist)
                return ret_set_errno(NULL, ENOMEM);

        return lxc_string_join(",", (const char **)cpulist, false);
}

static ssize_t get_max_cpus(char *cpulist)
{
        char *c1, *c2;
        char *maxcpus = cpulist;
        size_t cpus = 0;

        c1 = strrchr(maxcpus, ',');
        if (c1)
                c1++;

        c2 = strrchr(maxcpus, '-');
        if (c2)
                c2++;

        if (!c1 && !c2)
                c1 = maxcpus;
        else if (c1 > c2)
                c2 = c1;
        else if (c1 < c2)
                c1 = c2;
        else if (!c1 && c2)
                c1 = c2;

        errno = 0;
        cpus = strtoul(c1, NULL, 0);
        if (errno != 0)
                return -1;

        return cpus;
}
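
/*
 * Illustrative example (not part of the original source): for the cpulist
 * "0-3,8-11", c1 ends up pointing at "8-11" (after the last ',') and c2 at
 * "11" (after the last '-'); c2 points further into the string, so c1 is
 * advanced to c2 and strtoul() parses "11", i.e. get_max_cpus() returns the
 * highest cpu index in the list.
 */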

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
        return h->version == CGROUP2_SUPER_MAGIC;
}

/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
        if (!l1 || !l2)
                return false;

        for (int i = 0; l1[i]; i++)
                if (string_in_list(l2, l1[i]))
                        return true;

        return false;
}

/* For a null-terminated list of controllers @clist, return true if any of those
 * controllers is already listed in the null-terminated list of hierarchies
 * @hlist. Realistically, if one is present, all must be present.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
        if (!hlist)
                return false;

        for (int i = 0; hlist[i]; i++)
                if (controller_lists_intersect(hlist[i]->controllers, clist))
                        return true;

        return false;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_found(struct hierarchy **hlist, char *entry)
{
        if (!hlist)
                return false;

        for (int i = 0; hlist[i]; i++)
                if (string_in_list(hlist[i]->controllers, entry))
                        return true;

        return false;
}

/* Return true if all of the controllers which we require have been found. The
 * required list is freezer and anything in lxc.cgroup.use.
 */
static bool all_controllers_found(struct cgroup_ops *ops)
{
        struct hierarchy **hlist;

        if (!ops->cgroup_use)
                return true;

        hlist = ops->hierarchies;
        for (char **cur = ops->cgroup_use; cur && *cur; cur++)
                if (!controller_found(hlist, *cur))
                        return log_error(false, "No %s controller mountpoint found", *cur);

        return true;
}
/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list.
 */
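/*
 * Illustrative mountinfo line for a legacy hierarchy (field values are made
 * up for the example):
 *
 *      32 25 0:28 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid - cgroup cgroup rw,cpu,cpuacct
 *
 * Skipping four spaces lands on the mountpoint field, from which the
 * controller list "cpu,cpuacct" is derived.
 */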
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
                                        int type)
{
        /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
         * for legacy hierarchies.
         */
        __do_free_string_list char **aret = NULL;
        int i;
        char *p2, *tok;
        char *p = line, *sep = ",";

        for (i = 0; i < 4; i++) {
                p = strchr(p, ' ');
                if (!p)
                        return NULL;
                p++;
        }

        /* Note, if we change how mountinfo works, then our caller will need to
         * verify /sys/fs/cgroup/ in this field.
         */
        if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
                return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

        p += 15;
        p2 = strchr(p, ' ');
        if (!p2)
                return log_error(NULL, "Corrupt mountinfo");
        *p2 = '\0';

        if (type == CGROUP_SUPER_MAGIC) {
                __do_free char *dup = NULL;

                /* strdup() here for v1 hierarchies. Otherwise
                 * lxc_iterate_parts() will destroy mountpoints such as
                 * "/sys/fs/cgroup/cpu,cpuacct".
                 */
                dup = must_copy_string(p);
                if (!dup)
                        return NULL;

                lxc_iterate_parts(tok, dup, sep)
                        must_append_controller(klist, nlist, &aret, tok);
        }
        *p2 = ' ';

        return move_ptr(aret);
}

static char **cg_unified_make_empty_controller(void)
{
        __do_free_string_list char **aret = NULL;
        int newentry;

        newentry = append_null_to_list((void ***)&aret);
        aret[newentry] = NULL;
        return move_ptr(aret);
}

static char **cg_unified_get_controllers(int dfd, const char *file)
{
        __do_free char *buf = NULL;
        __do_free_string_list char **aret = NULL;
        char *sep = " \t\n";
        char *tok;

        buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
        if (!buf)
                return NULL;

        lxc_iterate_parts(tok, buf, sep) {
                int newentry;
                char *copy;

                newentry = append_null_to_list((void ***)&aret);
                copy = must_copy_string(tok);
                aret[newentry] = copy;
        }

        return move_ptr(aret);
}
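
/*
 * Illustrative example (not part of the original source): if
 * cgroup.controllers contains "cpuset cpu io memory pids",
 * cg_unified_get_controllers() returns the NULL-terminated list
 * {"cpuset", "cpu", "io", "memory", "pids", NULL}.
 */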

static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
                                         char **controllers)
{
        if (!ops->cgroup_use)
                return true;

        for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
                bool found = false;

                for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
                        if (!strequal(*cur_use, *cur_ctrl))
                                continue;

                        found = true;
                        break;
                }

                if (found)
                        continue;

                return false;
        }

        return true;
}

static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
                         char *container_base_path, int type)
{
        __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
        __do_free struct hierarchy *new = NULL;
        __do_free_string_list char **controllers = clist;
        int idx;

        if (abspath(container_base_path))
                return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");

        if (!controllers && type != CGROUP2_SUPER_MAGIC)
                return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");

        dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
                          PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
        if (dfd_mnt < 0)
                return syserrno(-errno, "Failed to open %s", mountpoint);

        if (!is_empty_string(container_base_path)) {
                dfd_base = open_at(dfd_mnt, container_base_path,
                                   PROTECT_OPATH_DIRECTORY,
                                   PROTECT_LOOKUP_BENEATH_XDEV, 0);
                if (dfd_base < 0)
                        return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
        }

        if (!controllers) {
                /*
                 * We assume that the cgroup we're currently in has been
                 * delegated to us and we are free to delegate all of the
                 * controllers listed in cgroup.controllers further down the
                 * hierarchy.
                 */
                if (dfd_base < 0)
                        controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers");
                else
                        controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
                if (!controllers)
                        controllers = cg_unified_make_empty_controller();
                if (!controllers[0])
                        TRACE("No controllers are enabled for delegation");
        }

        /* Exclude all controllers that cgroup use does not want. */
        if (!cgroup_use_wants_controllers(ops, controllers))
                return log_trace(0, "Skipping cgroup hierarchy with non-requested controllers");

        new = zalloc(sizeof(*new));
        if (!new)
                return ret_errno(ENOMEM);

        new->version = type;
        new->controllers = move_ptr(controllers);
        new->mountpoint = mountpoint;
        new->container_base_path = container_base_path;
        new->cgfd_con = -EBADF;
        new->cgfd_limit = -EBADF;
        new->cgfd_mon = -EBADF;

        TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
              mountpoint, container_base_path);
        for (char *const *it = new->controllers; it && *it; it++)
                TRACE("The detected hierarchy contains the %s controller", *it);

        idx = append_null_to_list((void ***)&ops->hierarchies);
        if (dfd_base < 0)
                new->dfd_base = dfd_mnt;
        else
                new->dfd_base = move_fd(dfd_base);
        new->dfd_mnt = move_fd(dfd_mnt);
        if (type == CGROUP2_SUPER_MAGIC)
                ops->unified = new;
        (ops->hierarchies)[idx] = move_ptr(new);
        return 0;
}

/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
        char *p = line, *sret = NULL;
        size_t len;
        char *p2;

        for (int i = 0; i < 4; i++) {
                p = strchr(p, ' ');
                if (!p)
                        return NULL;
                p++;
        }

        if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
                return NULL;

        p2 = strchr(p + 15, ' ');
        if (!p2)
                return NULL;
        *p2 = '\0';

        len = strlen(p);
        sret = must_realloc(NULL, len + 1);
        memcpy(sret, p, len);
        sret[len] = '\0';

        return sret;
}

/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
        char *p2, *sret;
        size_t len;

        p2 = strchr(p, '\n');
        if (!p2)
                return NULL;

        len = p2 - p;
        sret = must_realloc(NULL, len + 1);
        memcpy(sret, p, len);
        sret[len] = '\0';

        return sret;
}

/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 */
static bool controller_in_clist(char *cgline, char *c)
{
        __do_free char *tmp = NULL;
        char *tok, *eol;
        size_t len;

        eol = strchr(cgline, ':');
        if (!eol)
                return false;

        len = eol - cgline;
        tmp = must_realloc(NULL, len + 1);
        memcpy(tmp, cgline, len);
        tmp[len] = '\0';

        lxc_iterate_parts(tok, tmp, ",")
                if (strequal(tok, c))
                        return true;

        return false;
}
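
/*
 * Illustrative example (not part of the original source): for the
 * /proc/self/cgroup line "4:cpu,cpuacct:/lxc/c1", @cgline points at
 * "cpu,cpuacct:/lxc/c1" and controller_in_clist(cgline, "cpuacct") returns
 * true.
 */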

static inline char *trim(char *s)
{
        size_t len;

        len = strlen(s);
        while ((len > 1) && (s[len - 1] == '\n'))
                s[--len] = '\0';

        return s;
}

/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 */
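/*
 * Illustrative /proc/self/cgroup excerpt (paths are made up for the
 * example):
 *
 *      12:cpu,cpuacct:/lxc/c1
 *      0::/lxc/c1
 *
 * Asking for controller "cpu" with a legacy type returns "lxc/c1" (the
 * leading '/' is stripped via deabs()), while the "0::" line is matched
 * when type is CGROUP2_SUPER_MAGIC.
 */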
static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
                                          char *controller, int type)
{
        char *base_cgroup = basecginfo;

        for (;;) {
                bool is_cgv2_base_cgroup = false;

                /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
                if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
                        is_cgv2_base_cgroup = true;

                base_cgroup = strchr(base_cgroup, ':');
                if (!base_cgroup)
                        return NULL;
                base_cgroup++;

                if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
                        __do_free char *copy = NULL;

                        base_cgroup = strchr(base_cgroup, ':');
                        if (!base_cgroup)
                                return NULL;
                        base_cgroup++;

                        copy = copy_to_eol(base_cgroup);
                        if (!copy)
                                return NULL;
                        trim(copy);

                        if (!relative) {
                                base_cgroup = prune_init_scope(copy);
                                if (!base_cgroup)
                                        return NULL;
                        } else {
                                base_cgroup = copy;
                        }

                        if (abspath(base_cgroup))
                                base_cgroup = deabs(base_cgroup);

                        /* We're allowing base_cgroup to be "". */
                        return strdup(base_cgroup);
                }

                base_cgroup = strchr(base_cgroup, '\n');
                if (!base_cgroup)
                        return NULL;
                base_cgroup++;
        }
}

static void must_append_string(char ***list, char *entry)
{
        int newentry;
        char *copy;

        newentry = append_null_to_list((void ***)list);
        copy = must_copy_string(entry);
        (*list)[newentry] = copy;
}

static int get_existing_subsystems(char ***klist, char ***nlist)
{
        __do_free char *line = NULL;
        __do_fclose FILE *f = NULL;
        size_t len = 0;

        f = fopen("/proc/self/cgroup", "re");
        if (!f)
                return -1;

        while (getline(&line, &len, f) != -1) {
                char *p, *p2, *tok;
                p = strchr(line, ':');
                if (!p)
                        continue;
                p++;
                p2 = strchr(p, ':');
                if (!p2)
                        continue;
                *p2 = '\0';

                /* If the kernel has cgroup v2 support, then /proc/self/cgroup
                 * contains an entry of the form:
                 *
                 *      0::/some/path
                 *
                 * In this case we use "cgroup2" as controller name.
                 */
                if ((p2 - p) == 0) {
                        must_append_string(klist, "cgroup2");
                        continue;
                }

                lxc_iterate_parts(tok, p, ",") {
                        if (strnequal(tok, "name=", 5))
                                must_append_string(nlist, tok);
                        else
                                must_append_string(klist, tok);
                }
        }

        return 0;
}

static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
                                              char **nlist)
{
        int k;
        char **it;

        TRACE("basecginfo is:");
        TRACE("%s", basecginfo);

        for (k = 0, it = klist; it && *it; it++, k++)
                TRACE("kernel subsystem %d: %s", k, *it);

        for (k = 0, it = nlist; it && *it; it++, k++)
                TRACE("named subsystem %d: %s", k, *it);
}

static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
{
        if (!path_prune || !hierarchies)
                return 0;

        for (int i = 0; hierarchies[i]; i++) {
                struct hierarchy *h = hierarchies[i];
                int ret;

                ret = cgroup_tree_prune(h->dfd_base, path_prune);
                if (ret < 0)
                        SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
                else
                        TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);

                free_equal(h->container_limit_path, h->container_full_path);
        }

        return 0;
}

struct generic_userns_exec_data {
        struct hierarchy **hierarchies;
        const char *path_prune;
        struct lxc_conf *conf;
        uid_t origuid; /* target uid in parent namespace */
        char *path;
};

static int cgroup_tree_remove_wrapper(void *data)
{
        struct generic_userns_exec_data *arg = data;
        uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
        gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
        int ret;

        if (!lxc_drop_groups() && errno != EPERM)
                return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

        ret = setresgid(nsgid, nsgid, nsgid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
                                       (int)nsgid, (int)nsgid, (int)nsgid);

        ret = setresuid(nsuid, nsuid, nsuid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
                                       (int)nsuid, (int)nsuid, (int)nsuid);

        return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
                                                struct lxc_handler *handler)
{
        int ret;

        if (!ops) {
                ERROR("Called with uninitialized cgroup operations");
                return;
        }

        if (!ops->hierarchies)
                return;

        if (!handler) {
                ERROR("Called with uninitialized handler");
                return;
        }

        if (!handler->conf) {
                ERROR("Called with uninitialized conf");
                return;
        }

        if (!ops->container_limit_cgroup) {
                WARN("Uninitialized limit cgroup");
                return;
        }

        ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
        if (ret < 0)
                WARN("Failed to detach bpf program from cgroup");

        if (!lxc_list_empty(&handler->conf->id_map)) {
                struct generic_userns_exec_data wrap = {
                        .conf = handler->conf,
                        .path_prune = ops->container_limit_cgroup,
                        .hierarchies = ops->hierarchies,
                        .origuid = 0,
                };
                ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
                                    &wrap, "cgroup_tree_remove_wrapper");
        } else {
                ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
        }
        if (ret < 0)
                SYSWARN("Failed to destroy cgroups");
}

#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
                                    bool am_initialized)
{
        __do_free char *cpulist = NULL, *isolcpus = NULL,
                       *offlinecpus = NULL, *posscpus = NULL;
        __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
                           *possmask = NULL;
        int ret;
        ssize_t i;
        ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
        bool flipped_bit = false;

        posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
        if (!posscpus)
                return log_error_errno(false, errno, "Failed to read file \"cpuset.cpus\"");

        /* Get maximum number of cpus found in possible cpuset. */
        maxposs = get_max_cpus(posscpus);
        if (maxposs < 0 || maxposs >= INT_MAX - 1)
                return false;

        if (file_exists(__ISOL_CPUS)) {
                isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
                if (!isolcpus)
                        return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

                if (isdigit(isolcpus[0])) {
                        /* Get maximum number of cpus found in isolated cpuset. */
                        maxisol = get_max_cpus(isolcpus);
                        if (maxisol < 0 || maxisol >= INT_MAX - 1)
                                return false;
                }

                if (maxposs < maxisol)
                        maxposs = maxisol;
                maxposs++;
        } else {
                TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
        }

        if (file_exists(__OFFLINE_CPUS)) {
                offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
                if (!offlinecpus)
                        return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

                if (isdigit(offlinecpus[0])) {
                        /* Get maximum number of cpus found in offline cpuset. */
                        maxoffline = get_max_cpus(offlinecpus);
                        if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
                                return false;
                }

                if (maxposs < maxoffline)
                        maxposs = maxoffline;
                maxposs++;
        } else {
                TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
        }

        if ((maxisol == 0) && (maxoffline == 0)) {
                cpulist = move_ptr(posscpus);
                goto copy_parent;
        }

        possmask = lxc_cpumask(posscpus, maxposs);
        if (!possmask)
                return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

        if (maxisol > 0) {
                isolmask = lxc_cpumask(isolcpus, maxposs);
                if (!isolmask)
                        return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
        }

        if (maxoffline > 0) {
                offlinemask = lxc_cpumask(offlinecpus, maxposs);
                if (!offlinemask)
                        return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
        }

        for (i = 0; i <= maxposs; i++) {
                if ((isolmask && !is_set(i, isolmask)) ||
                    (offlinemask && !is_set(i, offlinemask)) ||
                    !is_set(i, possmask))
                        continue;

                flipped_bit = true;
                clear_bit(i, possmask);
        }

        if (!flipped_bit) {
                cpulist = move_ptr(posscpus);
                TRACE("No isolated or offline cpus present in cpuset");
        } else {
                cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
                TRACE("Removed isolated or offline cpus from cpuset");
        }
        if (!cpulist)
                return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
        if (!am_initialized) {
                ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
                if (ret < 0)
                        return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);

                TRACE("Copied cpu settings of parent cgroup");
        }

        return true;
}

static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
        char mems[PATH_MAX];
        ssize_t bytes;
        char v;

        /*
         * Determine whether the base cgroup has cpuset
         * inheritance turned on.
         */
        bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
        if (bytes < 0)
                return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

        /*
         * Initialize cpuset.cpus and remove any isolated
         * and offline cpus.
         */
        if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
                return syserrno(false, "Failed to initialize cpuset.cpus");

        /* Read cpuset.mems from parent... */
        bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
        if (bytes < 0)
                return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);

        /* ... and copy to first cgroup in the tree... */
        bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
        if (bytes < 0)
                return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);

        /* ... and finally turn on cpuset inheritance. */
        bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
        if (bytes < 0)
                return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

        return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}

static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
                                bool cpuset_v1, bool eexist_ignore)
{
        __do_close int dfd_final = -EBADF;
        int dfd_cur = dfd_base;
        int ret = 0;
        size_t len;
        char *cur;
        char buf[PATH_MAX];

        if (is_empty_string(path))
                return ret_errno(EINVAL);

        len = strlcpy(buf, path, sizeof(buf));
        if (len >= sizeof(buf))
                return ret_errno(E2BIG);

        lxc_iterate_parts(cur, buf, "/") {
                /*
                 * Even though we vetted the paths when we parsed the config
                 * we're paranoid here and check that the path is neither
                 * absolute nor walks upwards.
                 */
                if (abspath(cur))
                        return syserrno_set(-EINVAL, "No absolute paths allowed");

                if (strnequal(cur, "..", STRLITERALLEN("..")))
                        return syserrno_set(-EINVAL, "No upward walking paths allowed");

                ret = mkdirat(dfd_cur, cur, mode);
                if (ret < 0) {
                        if (errno != EEXIST)
                                return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);

                        ret = -EEXIST;
                }
                TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

                dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
                if (dfd_final < 0)
                        return syserrno(-errno, "Failed to open%s directory %d(%s)",
                                        !ret ? " newly created" : "", dfd_base, cur);
                if (dfd_cur != dfd_base)
                        close(dfd_cur);
                else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
                        return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
                /*
                 * Leave dfd_final pointing to the last fd we opened so
                 * it will be automatically zapped if we return early.
                 */
                dfd_cur = dfd_final;
        }

        /* The final cgroup must be successfully created by us. */
        if (ret) {
                if (ret != -EEXIST || !eexist_ignore)
                        return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
        }

        return move_fd(dfd_final);
}
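
/*
 * Illustrative example (not part of the original source): called with
 * path = "lxc.payload.c1/inner", __cgroup_tree_create() mkdirat()s
 * "lxc.payload.c1" and then "lxc.payload.c1/inner" beneath @dfd_base and
 * returns an O_PATH fd to the final "inner" directory. Pre-existing
 * intermediate directories are tolerated; a pre-existing final directory
 * is an error unless @eexist_ignore is set.
 */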

static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
                               struct hierarchy *h, const char *cgroup_limit_dir,
                               const char *cgroup_leaf, bool payload)
{
        __do_close int fd_limit = -EBADF, fd_final = -EBADF;
        __do_free char *path = NULL, *limit_path = NULL;
        bool cpuset_v1 = false;

        /*
         * The legacy cpuset controller needs massaging in case inheriting
         * settings from its immediate ancestor cgroup hasn't been turned on.
         */
        cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

        if (payload && cgroup_leaf) {
                /* With isolation both parts need to not already exist. */
                fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
                if (fd_limit < 0)
                        return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

                TRACE("Created limit cgroup %d->%d(%s)",
                      fd_limit, h->dfd_base, cgroup_limit_dir);

                /*
                 * With isolation the devices legacy cgroup needs to be
                 * initialized early, as it typically contains an 'a' (all)
                 * line, which is not possible once a subdirectory has been
                 * created.
                 */
                if (string_in_list(h->controllers, "devices") &&
                    !ops->setup_limits_legacy(ops, conf, true))
                        return log_error(false, "Failed to setup legacy device limits");

                limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
                path = must_make_path(limit_path, cgroup_leaf, NULL);

                /*
                 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
                 * cgroup the container actually resides in, is below fd_limit.
                 */
                fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
                if (fd_final < 0) {
                        /* Ensure we don't leave any garbage behind. */
                        if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
                                SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
                        else
                                TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
                }
        } else {
                path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);

                fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
        }
        if (fd_final < 0)
                return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

        if (payload) {
                h->cgfd_con = move_fd(fd_final);
                h->container_full_path = move_ptr(path);

                if (fd_limit < 0)
                        h->cgfd_limit = h->cgfd_con;
                else
                        h->cgfd_limit = move_fd(fd_limit);

                if (limit_path)
                        h->container_limit_path = move_ptr(limit_path);
                else
                        h->container_limit_path = h->container_full_path;
        } else {
                h->cgfd_mon = move_fd(fd_final);
        }

        return true;
}

static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
                                   bool payload)
{
        bool prune = true;

        if (payload) {
                /* Check whether we actually created the cgroup to prune. */
                if (h->cgfd_limit < 0)
                        prune = false;

                free_equal(h->container_full_path, h->container_limit_path);
                close_equal(h->cgfd_con, h->cgfd_limit);
        } else {
                /* Check whether we actually created the cgroup to prune. */
                if (h->cgfd_mon < 0)
                        prune = false;

                close_prot_errno_disarm(h->cgfd_mon);
        }

        /* We didn't create this cgroup. */
        if (!prune)
                return;

        if (cgroup_tree_prune(h->dfd_base, path_prune))
                SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
        else
                TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
                                                struct lxc_handler *handler)
{
        int len;
        char pidstr[INTTYPE_TO_STRLEN(pid_t)];
        const struct lxc_conf *conf;

        if (!ops) {
                ERROR("Called with uninitialized cgroup operations");
                return;
        }

        if (!ops->hierarchies)
                return;

        if (!handler) {
                ERROR("Called with uninitialized handler");
                return;
        }

        if (!handler->conf) {
                ERROR("Called with uninitialized conf");
                return;
        }
        conf = handler->conf;

        if (!ops->monitor_cgroup) {
                WARN("Uninitialized monitor cgroup");
                return;
        }

        len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
        if (len < 0)
                return;

        for (int i = 0; ops->hierarchies[i]; i++) {
                __do_close int fd_pivot = -EBADF;
                __do_free char *pivot_path = NULL;
                struct hierarchy *h = ops->hierarchies[i];
                bool cpuset_v1 = false;
                int ret;

                /* Monitor might have died before we entered the cgroup. */
                if (handler->monitor_pid <= 0) {
                        WARN("No valid monitor process found while destroying cgroups");
                        goto cgroup_prune_tree;
                }

                if (conf->cgroup_meta.monitor_pivot_dir)
                        pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
                else if (conf->cgroup_meta.dir)
                        pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
                else
                        pivot_path = must_make_path(CGROUP_PIVOT, NULL);

                cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

                fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
                if (fd_pivot < 0) {
                        SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
                        continue;
                }

                ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
                if (ret != 0) {
                        SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
                        continue;
                }

cgroup_prune_tree:
                ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
                if (ret < 0)
                        SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
                else
                        TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
        }
}

/*
 * Check that lxc.cgroup.dir is not set when any of the split monitor,
 * container or namespace directory options is used, and that monitor and
 * container directories are always set together.
 *
 * Returns true if the configuration is valid, false otherwise.
 */
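/*
 * Illustrative configuration (key names taken from the error messages
 * below; values are made up):
 *
 *      lxc.cgroup.dir.monitor = lxc.monitor/c1
 *      lxc.cgroup.dir.payload = lxc.payload/c1
 *
 * Setting lxc.cgroup.dir alongside either of these is rejected.
 */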
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
        const char *monitor_dir = conf->cgroup_meta.monitor_dir,
                   *container_dir = conf->cgroup_meta.container_dir,
                   *namespace_dir = conf->cgroup_meta.namespace_dir;

        /* none of the new options are set, all is fine */
        if (!monitor_dir && !container_dir && !namespace_dir)
                return true;

        /* some are set, make sure lxc.cgroup.dir is not also set */
        if (conf->cgroup_meta.dir)
                return log_error_errno(false, EINVAL,
                                       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

        /* make sure both monitor and payload are set */
        if (!monitor_dir || !container_dir)
                return log_error_errno(false, EINVAL,
                                       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

        /* namespace_dir may be empty */
        return true;
}

__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
        __do_free char *monitor_cgroup = NULL;
        int idx = 0;
        int i;
        size_t len;
        char *suffix = NULL;
        struct lxc_conf *conf;

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (ops->monitor_cgroup)
                return ret_set_errno(false, EEXIST);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        conf = handler->conf;

        if (!check_cgroup_dir_config(conf))
                return false;

        if (conf->cgroup_meta.monitor_dir) {
                monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
        } else if (conf->cgroup_meta.dir) {
                monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
                                             DEFAULT_MONITOR_CGROUP_PREFIX,
                                             handler->name,
                                             CGROUP_CREATE_RETRY, NULL);
        } else if (ops->cgroup_pattern) {
                __do_free char *cgroup_tree = NULL;

                cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
                if (!cgroup_tree)
                        return ret_set_errno(false, ENOMEM);

                monitor_cgroup = must_concat(&len, cgroup_tree, "/",
                                             DEFAULT_MONITOR_CGROUP,
                                             CGROUP_CREATE_RETRY, NULL);
        } else {
                monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
                                             handler->name,
                                             CGROUP_CREATE_RETRY, NULL);
        }
        if (!monitor_cgroup)
                return ret_set_errno(false, ENOMEM);

        if (!conf->cgroup_meta.monitor_dir) {
                suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
                *suffix = '\0';
        }
        do {
                if (idx && suffix)
                        sprintf(suffix, "-%d", idx);

                for (i = 0; ops->hierarchies[i]; i++) {
                        if (cgroup_tree_create(ops, handler->conf,
                                               ops->hierarchies[i],
                                               monitor_cgroup, NULL, false))
                                continue;

                        DEBUG("Failed to create cgroup \"%s\"", monitor_cgroup);
                        for (int j = 0; j <= i; j++)
                                cgroup_tree_prune_leaf(ops->hierarchies[j],
                                                       monitor_cgroup, false);

                        idx++;
                        break;
                }
        } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

        if (idx == 1000 || (!suffix && idx != 0))
                return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");

        ops->monitor_cgroup = move_ptr(monitor_cgroup);
        return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
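/*
 * Illustrative example (assuming DEFAULT_PAYLOAD_CGROUP_PREFIX expands to
 * "lxc.payload."): for a container named "c1" without explicit cgroup
 * configuration, the payload cgroup tried first is "lxc.payload.c1", then
 * "lxc.payload.c1-1", "lxc.payload.c1-2", ... up to "lxc.payload.c1-999"
 * before giving up with ERANGE.
 */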
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
        __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
        char *limit_cgroup;
        int idx = 0;
        int i;
        size_t len;
        char *suffix = NULL;
        struct lxc_conf *conf;

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (ops->container_cgroup || ops->container_limit_cgroup)
                return ret_set_errno(false, EEXIST);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        conf = handler->conf;

        if (!check_cgroup_dir_config(conf))
                return false;

        if (conf->cgroup_meta.container_dir) {
                __limit_cgroup = strdup(conf->cgroup_meta.container_dir);
                if (!__limit_cgroup)
                        return ret_set_errno(false, ENOMEM);

                if (conf->cgroup_meta.namespace_dir) {
                        container_cgroup = must_make_path(__limit_cgroup,
                                                          conf->cgroup_meta.namespace_dir,
                                                          NULL);
                        limit_cgroup = __limit_cgroup;
                } else {
                        /* explicit paths but without isolation */
                        limit_cgroup = move_ptr(__limit_cgroup);
                        container_cgroup = limit_cgroup;
                }
        } else if (conf->cgroup_meta.dir) {
                limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
                                           DEFAULT_PAYLOAD_CGROUP_PREFIX,
                                           handler->name,
                                           CGROUP_CREATE_RETRY, NULL);
                container_cgroup = limit_cgroup;
        } else if (ops->cgroup_pattern) {
                __do_free char *cgroup_tree = NULL;

                cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
                if (!cgroup_tree)
                        return ret_set_errno(false, ENOMEM);

                limit_cgroup = must_concat(&len, cgroup_tree, "/",
                                           DEFAULT_PAYLOAD_CGROUP,
                                           CGROUP_CREATE_RETRY, NULL);
                container_cgroup = limit_cgroup;
        } else {
                limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
                                           handler->name,
                                           CGROUP_CREATE_RETRY, NULL);
                container_cgroup = limit_cgroup;
        }
        if (!limit_cgroup)
                return ret_set_errno(false, ENOMEM);

        if (!conf->cgroup_meta.container_dir) {
                suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
                *suffix = '\0';
        }
        do {
                if (idx && suffix)
                        sprintf(suffix, "-%d", idx);

                for (i = 0; ops->hierarchies[i]; i++) {
                        if (cgroup_tree_create(ops, handler->conf,
                                               ops->hierarchies[i], limit_cgroup,
                                               conf->cgroup_meta.namespace_dir,
                                               true))
                                continue;

                        DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
                        for (int j = 0; j <= i; j++)
                                cgroup_tree_prune_leaf(ops->hierarchies[j],
                                                       limit_cgroup, true);

                        idx++;
                        break;
                }
        } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

        if (idx == 1000 || (!suffix && idx != 0))
                return log_error_errno(false, ERANGE, "Failed to create container cgroup");

        ops->container_cgroup = move_ptr(container_cgroup);
        if (__limit_cgroup)
                ops->container_limit_cgroup = move_ptr(__limit_cgroup);
        else
                ops->container_limit_cgroup = ops->container_cgroup;
        INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
             ops->container_cgroup, ops->container_limit_cgroup);
        return true;
}

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
                                              struct lxc_handler *handler)
{
        int monitor_len, transient_len = 0;
        char monitor[INTTYPE_TO_STRLEN(pid_t)],
             transient[INTTYPE_TO_STRLEN(pid_t)];

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (!ops->monitor_cgroup)
                return ret_set_errno(false, ENOENT);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
        if (monitor_len < 0)
                return false;

        if (handler->transient_pid > 0) {
                transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
                if (transient_len < 0)
                        return false;
        }

        for (int i = 0; ops->hierarchies[i]; i++) {
                struct hierarchy *h = ops->hierarchies[i];
                int ret;

                ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
                if (ret)
                        return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

                TRACE("Moved monitor into cgroup %d", h->cgfd_mon);

                if (handler->transient_pid <= 0)
                        continue;

                ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
                if (ret)
                        return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

                TRACE("Moved transient process into cgroup %d", h->cgfd_mon);

                /*
                 * We don't keep the fds for non-unified hierarchies around
                 * mainly because we don't make use of them anymore after the
                 * core cgroup setup is done but also because there are quite a
                 * lot of them.
                 */
                if (!is_unified_hierarchy(h))
                        close_prot_errno_disarm(h->cgfd_mon);
        }
        handler->transient_pid = -1;

        return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
                                              struct lxc_handler *handler)
{
        int len;
        char pidstr[INTTYPE_TO_STRLEN(pid_t)];

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (!ops->container_cgroup)
                return ret_set_errno(false, ENOENT);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
        if (len < 0)
                return false;

        for (int i = 0; ops->hierarchies[i]; i++) {
                struct hierarchy *h = ops->hierarchies[i];
                int ret;

                if (is_unified_hierarchy(h) &&
                    (handler->clone_flags & CLONE_INTO_CGROUP))
                        continue;

                ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
                if (ret != 0)
                        return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);

                TRACE("Moved container into %s cgroup via %d", h->container_full_path, h->cgfd_con);
        }

        return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
                      gid_t chown_gid, mode_t chmod_mode)
{
        int ret;

        ret = fchownat(dirfd, path, chown_uid, chown_gid,
                       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
        if (ret < 0)
                return log_warn_errno(-1,
                                      errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
                                      dirfd, path, (int)chown_uid,
                                      (int)chown_gid);

        ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
        if (ret < 0)
                return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
                                      dirfd, path, (int)chmod_mode);

        return 0;
}

/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
        int ret;
        uid_t destuid;
        struct generic_userns_exec_data *arg = data;
        uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
        gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

        if (!lxc_drop_groups() && errno != EPERM)
                return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

        ret = setresgid(nsgid, nsgid, nsgid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
                                       (int)nsgid, (int)nsgid, (int)nsgid);

        ret = setresuid(nsuid, nsuid, nsuid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
                                       (int)nsuid, (int)nsuid, (int)nsuid);

        destuid = get_ns_uid(arg->origuid);
        if (destuid == LXC_INVALID_UID)
                destuid = 0;

        for (int i = 0; arg->hierarchies[i]; i++) {
                int dirfd = arg->hierarchies[i]->cgfd_con;

                (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

                /*
                 * Failures to chown() these are inconvenient but not
                 * detrimental. We leave these owned by the container launcher,
                 * so that container root can write to the files to attach. We
                 * chmod() them 664 so that container systemd can write to the
                 * files (which systemd in wily insists on doing).
                 */

                if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
                        (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

                (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

                if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
                        continue;

                for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
                        (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
        }

        return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
                                      struct lxc_conf *conf)
{
        struct generic_userns_exec_data wrap;

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (!ops->container_cgroup)
                return ret_set_errno(false, ENOENT);

        if (!conf)
                return ret_set_errno(false, EINVAL);

        if (lxc_list_empty(&conf->id_map))
                return true;

        wrap.origuid = geteuid();
        wrap.path = NULL;
        wrap.hierarchies = ops->hierarchies;
        wrap.conf = conf;

        if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
                return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

        return true;
}

__cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
{
        if (!ops)
                return;

        if (!ops->hierarchies)
                return;

        for (int i = 0; ops->hierarchies[i]; i++) {
                struct hierarchy *h = ops->hierarchies[i];
                /*
                 * We don't keep the fds for non-unified hierarchies around
                 * mainly because we don't make use of them anymore after the
                 * core cgroup setup is done but also because there are quite a
                 * lot of them.
                 */
                if (!is_unified_hierarchy(h))
                        close_prot_errno_disarm(h->cgfd_con);
        }

        /*
         * The checking for freezer support should obviously be done at cgroup
         * initialization time but that doesn't work reliably. The freezer
         * controller has been demoted (rightly so) to a simple file located in
         * each non-root cgroup. At the time when the container is created we
         * might still be located in /sys/fs/cgroup and so checking for
         * cgroup.freeze won't tell us anything because this file doesn't exist
         * in the root cgroup. We could then iterate through /sys/fs/cgroup and
         * find an already existing cgroup and then check within that cgroup
         * for the existence of cgroup.freeze but that will only work on
         * systemd based hosts. Other init systems might not manage cgroups and
         * so no cgroup will exist. So we defer until we have created cgroups
         * for our container which means we check here.
         */
        if (pure_unified_layout(ops) &&
            !faccessat(ops->unified->cgfd_con, "cgroup.freeze", F_OK,
                       AT_SYMLINK_NOFOLLOW)) {
                TRACE("Unified hierarchy supports freezer");
                ops->unified->freezer_controller = 1;
        }
}

/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
{
        switch (cgroup_automount_type) {
        case LXC_AUTO_CGROUP_RO:
                return true;
        case LXC_AUTO_CGROUP_RW:
                return true;
        case LXC_AUTO_CGROUP_MIXED:
                return true;
        }

        return false;
}

/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
                                       char *controllerpath, char *cgpath,
                                       const char *container_cgroup)
{
        __do_free char *sourcepath = NULL;
        int ret, remount_flags;
        int flags = MS_BIND;

        if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
            (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
                ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
                                               controllerpath, controllerpath);

                remount_flags = add_required_remount_flags(controllerpath,
                                                           controllerpath,
                                                           flags | MS_REMOUNT);
                ret = mount(controllerpath, controllerpath, "cgroup",
                            remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
                            NULL);
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

                INFO("Remounted %s read-only", controllerpath);
        }

        sourcepath = must_make_path(h->mountpoint, h->container_base_path,
                                    container_cgroup, NULL);
        if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
                flags |= MS_RDONLY;

        ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
                                       h->controllers[0], cgpath);
        INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

        if (flags & MS_RDONLY) {
                remount_flags = add_required_remount_flags(sourcepath, cgpath,
                                                           flags | MS_REMOUNT);
                ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
                INFO("Remounted %s read-only", cgpath);
        }

        INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
        return 0;
}

/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
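/*
 * Editorial note: fs_prepare(), fs_set_property() and fs_attach() used below
 * are LXC's wrappers around the new mount API
 * (fsopen()/fsconfig()/fsmount()/move_mount()); when that API is unavailable
 * the code falls back to classic mount(2) via safe_mount().
 */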
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
                            struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
                            const char *hierarchy_mnt)
{
        __do_close int fd_fs = -EBADF;
        unsigned int flags = 0;
        char *fstype;
        int ret;

        if (dfd_mnt_cgroupfs < 0)
                return ret_errno(EINVAL);

        flags |= MOUNT_ATTR_NOSUID;
        flags |= MOUNT_ATTR_NOEXEC;
        flags |= MOUNT_ATTR_NODEV;
        flags |= MOUNT_ATTR_RELATIME;

        if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
            (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
                flags |= MOUNT_ATTR_RDONLY;

        if (is_unified_hierarchy(h))
                fstype = "cgroup2";
        else
                fstype = "cgroup";

        if (can_use_mount_api()) {
                fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
                if (fd_fs < 0)
                        return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

                if (!is_unified_hierarchy(h)) {
                        for (const char **it = (const char **)h->controllers; it && *it; it++) {
                                if (strnequal(*it, "name=", STRLITERALLEN("name=")))
                                        ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
                                else
                                        ret = fs_set_property(fd_fs, *it, "");
                                if (ret < 0)
                                        return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
                        }
                }

                ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
                                PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
                                flags);
        } else {
                __do_free char *controllers = NULL, *target = NULL;
                unsigned int old_flags = 0;
                const char *rootfs_mnt;

                if (!is_unified_hierarchy(h)) {
                        controllers = lxc_string_join(",", (const char **)h->controllers, false);
                        if (!controllers)
                                return ret_errno(ENOMEM);
                }

                rootfs_mnt = get_rootfs_mnt(rootfs);
                ret = mnt_attributes_old(flags, &old_flags);
                if (ret)
                        return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

                target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
                ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
        }
        if (ret < 0)
                return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
                                       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

        DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
              fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
        return 0;
}
1910
1911 static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1912 struct lxc_rootfs *rootfs,
1913 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
1914 {
1915 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1916 dfd_mnt_cgroupfs, hierarchy_mnt);
1917 }
1918
1919 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1920 struct lxc_rootfs *rootfs,
1921 int dfd_mnt_cgroupfs,
1922 const char *hierarchy_mnt)
1923 {
1924 switch (cgroup_automount_type) {
1925 case LXC_AUTO_CGROUP_FULL_RO:
1926 break;
1927 case LXC_AUTO_CGROUP_FULL_RW:
1928 break;
1929 case LXC_AUTO_CGROUP_FULL_MIXED:
1930 break;
1931 default:
1932 return 0;
1933 }
1934
1935 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1936 dfd_mnt_cgroupfs, hierarchy_mnt);
1937 }
1938
1939 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1940 struct lxc_handler *handler, int cg_flags)
1941 {
1942 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1943 __do_free char *cgroup_root = NULL;
1944 int cgroup_automount_type;
1945 bool in_cgroup_ns = false, wants_force_mount = false;
1946 struct lxc_conf *conf = handler->conf;
1947 struct lxc_rootfs *rootfs = &conf->rootfs;
1948 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1949 int ret;
1950
1951 if (!ops)
1952 return ret_set_errno(false, ENOENT);
1953
1954 if (!ops->hierarchies)
1955 return true;
1956
1957 if (!conf)
1958 return ret_set_errno(false, EINVAL);
1959
1960 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1961 return log_trace(true, "No cgroup mounts requested");
1962
1963 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1964 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1965 wants_force_mount = true;
1966 }
1967
1968 switch (cg_flags) {
1969 case LXC_AUTO_CGROUP_RO:
1970 TRACE("Read-only cgroup mounts requested");
1971 break;
1972 case LXC_AUTO_CGROUP_RW:
1973 TRACE("Read-write cgroup mounts requested");
1974 break;
1975 case LXC_AUTO_CGROUP_MIXED:
1976 TRACE("Mixed cgroup mounts requested");
1977 break;
1978 case LXC_AUTO_CGROUP_FULL_RO:
1979 TRACE("Full read-only cgroup mounts requested");
1980 break;
1981 case LXC_AUTO_CGROUP_FULL_RW:
1982 TRACE("Full read-write cgroup mounts requested");
1983 break;
1984 case LXC_AUTO_CGROUP_FULL_MIXED:
1985 TRACE("Full mixed cgroup mounts requested");
1986 break;
1987 default:
1988 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1989 }
1990 cgroup_automount_type = cg_flags;
1991
1992 if (!wants_force_mount) {
1993 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
1994
1995 /*
1996 * Most recent distro versions currently ship init systems that
1997 * support cgroup2 but do not mount it by default unless
1998 * explicitly told to, even if the host is cgroup2-only. That
1999 * means such containers will often fail to boot. Fix this by
2000 * pre-mounting cgroup2 by default. We will likely need to keep
2001 * doing this for a few years until all distros have switched
2002 * over to cgroup2, at which point we can safely assume that
2003 * their init systems will mount it themselves.
2004 */
2005 if (pure_unified_layout(ops))
2006 wants_force_mount = true;
2007 }
2008
2009 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
2010 in_cgroup_ns = true;
2011
2012 if (in_cgroup_ns && !wants_force_mount)
2013 return log_trace(true, "Mounting cgroups not requested or needed");
2014
2015 /* This is really the codepath that we want. */
2016 if (pure_unified_layout(ops)) {
2017 __do_close int dfd_mnt_unified = -EBADF;
2018
2019 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2020 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2021 if (dfd_mnt_unified < 0)
2022 return syserrno(false, "Failed to open %d(%s)", rootfs->dfd_mnt,
2023 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2024 /*
2025 * If cgroup namespaces are supported but the container will
2026 * not have CAP_SYS_ADMIN after it has started we need to mount
2027 * the cgroups manually.
2028 *
2029 * Note that here we know that wants_force_mount is true.
2030 * Otherwise we would've returned early above.
2031 */
2032 if (in_cgroup_ns) {
2033 /*
2034 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
2035 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
2036 * 3. cgroup:mixed:force -> See comment above how this
2037 * does not apply so
2038 * cgroup:mixed is equal to
2039 * cgroup:rw when cgroup
2040 * namespaces are supported.
2041 *
2042 * 4. cgroup:rw -> No-op; init system responsible for mounting.
2043 * 5. cgroup:ro -> No-op; init system responsible for mounting.
2044 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
2045 *
2046 * 7. cgroup-full:rw -> Not supported.
2047 * 8. cgroup-full:ro -> Not supported.
2048 * 9. cgroup-full:mixed -> Not supported.
2049 *
2050 * 10. cgroup-full:rw:force -> Not supported.
2051 * 11. cgroup-full:ro:force -> Not supported.
2052 * 12. cgroup-full:mixed:force -> Not supported.
2053 */
2054 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
2055 if (ret < 0)
2056 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
2057
2058 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
2059 } else {
2060 /*
2061 * Either no cgroup namespace is supported (highly
2062 * unlikely unless we're dealing with a Frankenkernel),
2063 * or the user requested to keep the cgroup namespace
2064 * of the host or another container.
2065 */
2066 if (wants_force_mount) {
2067 /*
2068 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
2069 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
2070 * 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem
2071 * and make the parent directory of the
2072 * container's cgroup read-only but the
2073 * container's cgroup writable.
2074 *
2075 * 10. cgroup-full:rw:force -> Not supported.
2076 * 11. cgroup-full:ro:force -> Not supported.
2077 * 12. cgroup-full:mixed:force -> Not supported.
2078 */
2079 errno = EOPNOTSUPP;
2080 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2081 } else {
2082 errno = EOPNOTSUPP;
2083 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2084 }
2085 }
2086
2087 return syserrno(false, "Failed to mount cgroups");
2088 }
2089
2090 /*
2091 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
2092 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
2093 * DEFAULT_CGROUP_MOUNTPOINT define.
2094 */
2095 if (can_use_mount_api()) {
2096 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
2097 if (fd_fs < 0)
2098 return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs");
2099
2100 ret = fs_set_property(fd_fs, "mode", "0755");
2101 if (ret < 0)
2102 return log_error_errno(false, errno, "Failed to set \"mode\" property on tmpfs filesystem context %d", fd_fs);
2103
2104 ret = fs_set_property(fd_fs, "size", "10240k");
2105 if (ret < 0)
2106 return log_error_errno(false, errno, "Failed to set \"size\" property on tmpfs filesystem context %d", fd_fs);
2107
2108 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2109 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
2110 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
2111 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
2112 } else {
2113 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2114 ret = safe_mount(NULL, cgroup_root, "tmpfs",
2115 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2116 "size=10240k,mode=755", rootfs_mnt);
2117 }
2118 if (ret < 0)
2119 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
2120 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2121
2122 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2123 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2124 if (dfd_mnt_tmpfs < 0)
2125 return syserrno(false, "Failed to open %d(%s)", rootfs->dfd_mnt,
2126 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2127
2128 for (int i = 0; ops->hierarchies[i]; i++) {
2129 __do_free char *controllerpath = NULL, *path2 = NULL;
2130 struct hierarchy *h = ops->hierarchies[i];
2131 char *controller = strrchr(h->mountpoint, '/');
2132
2133 if (!controller)
2134 continue;
2135 controller++;
2136
2137 ret = mkdirat(dfd_mnt_tmpfs, controller, 0000);
2138 if (ret < 0)
2139 return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_tmpfs, controller);
2140
2141 if (in_cgroup_ns && wants_force_mount) {
2142 /*
2143 * If cgroup namespaces are supported but the container
2144 * will not have CAP_SYS_ADMIN after it has started we
2145 * need to mount the cgroups manually.
2146 */
2147 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2148 if (ret < 0)
2149 return false;
2150
2151 continue;
2152 }
2153
2154 /* Here is where the ancient kernel section begins. */
2155 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2156 if (ret < 0)
2157 return false;
2158
2159 if (!cg_mount_needs_subdirs(cgroup_automount_type))
2160 continue;
2161
2162 if (!cgroup_root)
2163 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2164
2165 controllerpath = must_make_path(cgroup_root, controller, NULL);
2166 path2 = must_make_path(controllerpath, h->container_base_path, ops->container_cgroup, NULL);
2167 ret = mkdir_p(path2, 0755);
2168 if (ret < 0 && (errno != EEXIST))
2169 return false;
2170
2171 ret = cg_legacy_mount_controllers(cgroup_automount_type, h, controllerpath, path2, ops->container_cgroup);
2172 if (ret < 0)
2173 return false;
2174 }
2175
2176 return true;
2177 }
2178
2179 /* Only root needs to escape to the cgroup of its init. */
2180 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
2181 struct lxc_conf *conf)
2182 {
2183 if (!ops)
2184 return ret_set_errno(false, ENOENT);
2185
2186 if (!ops->hierarchies)
2187 return true;
2188
2189 if (!conf)
2190 return ret_set_errno(false, EINVAL);
2191
2192 if (conf->cgroup_meta.relative || geteuid())
2193 return true;
2194
2195 for (int i = 0; ops->hierarchies[i]; i++) {
2196 __do_free char *fullpath = NULL;
2197 int ret;
2198
2199 fullpath =
2200 must_make_path(ops->hierarchies[i]->mountpoint,
2201 ops->hierarchies[i]->container_base_path,
2202 "cgroup.procs", NULL);
2203 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
2204 if (ret != 0)
2205 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
2206 }
2207
2208 return true;
2209 }
2210
2211 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
2212 {
2213 int i = 0;
2214
2215 if (!ops)
2216 return ret_set_errno(-1, ENOENT);
2217
2218 if (!ops->hierarchies)
2219 return 0;
2220
2221 for (; ops->hierarchies[i]; i++)
2222 ;
2223
2224 return i;
2225 }
2226
2227 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
2228 int n, char ***out)
2229 {
2230 int i;
2231
2232 if (!ops)
2233 return ret_set_errno(false, ENOENT);
2234
2235 if (!ops->hierarchies)
2236 return ret_set_errno(false, ENOENT);
2237
2238 /* sanity check n: every entry up to and including index n must exist */
2239 for (i = 0; i <= n; i++)
2240 if (!ops->hierarchies[i])
2241 return ret_set_errno(false, ENOENT);
2242
2243 *out = ops->hierarchies[n]->controllers;
2244
2245 return true;
2246 }
2247
2248 static int cg_legacy_freeze(struct cgroup_ops *ops)
2249 {
2250 struct hierarchy *h;
2251
2252 h = get_hierarchy(ops, "freezer");
2253 if (!h)
2254 return ret_set_errno(-1, ENOENT);
2255
2256 return lxc_write_openat(h->container_full_path, "freezer.state",
2257 "FROZEN", STRLITERALLEN("FROZEN"));
2258 }
2259
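/*
 * Mainloop callback watching a cgroup.events file. On cgroup2 that file
 * contains "key value" lines such as "frozen 0" or "frozen 1"; we compare
 * each line against the state we are waiting for (passed via @cbdata) and
 * close the mainloop once it shows up.
 */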
2260 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
2261 struct lxc_epoll_descr *descr)
2262 {
2263 __do_free char *line = NULL;
2264 __do_fclose FILE *f = NULL;
2265 int state = PTR_TO_INT(cbdata);
2266 size_t len = 0;
2267 const char *state_string;
2268
2269 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
2270 if (!f)
2271 return LXC_MAINLOOP_ERROR;
2272
2273 if (state == 1)
2274 state_string = "frozen 1";
2275 else
2276 state_string = "frozen 0";
2277
2278 while (getline(&line, &len, f) != -1)
2279 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
2280 return LXC_MAINLOOP_CLOSE;
2281
2282 rewind(f);
2283
2284 return LXC_MAINLOOP_CONTINUE;
2285 }
2286
2287 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
2288 const char *state_string,
2289 int state_num,
2290 const char *epoll_error,
2291 const char *wait_error)
2292 {
2293 __do_close int fd = -EBADF;
2294 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
2295 int ret;
2296 struct lxc_epoll_descr descr;
2297 struct hierarchy *h;
2298
2299 h = ops->unified;
2300 if (!h)
2301 return ret_set_errno(-1, ENOENT);
2302
2303 if (!h->container_full_path)
2304 return ret_set_errno(-1, ENOENT);
2305
2306 if (timeout != 0) {
2307 __do_free char *events_file = NULL;
2308
2309 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
2310 fd = open(events_file, O_RDONLY | O_CLOEXEC);
2311 if (fd < 0)
2312 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
2313
2314 ret = lxc_mainloop_open(&descr);
2315 if (ret)
2316 return log_error_errno(-1, errno, "%s", epoll_error);
2317
2318 /* automatically cleaned up now */
2319 descr_ptr = &descr;
2320
2321 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
2322 if (ret < 0)
2323 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
2324 }
2325
2326 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", state_string, 1);
2327 if (ret < 0)
2328 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
2329
2330 if (timeout != 0 && lxc_mainloop(&descr, timeout))
2331 return log_error_errno(-1, errno, "%s", wait_error);
2332
2333 return 0;
2334 }
2335
2336 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
2337 {
2338 return cg_unified_freeze_do(ops, timeout, "1", 1,
2339 "Failed to create epoll instance to wait for container freeze",
2340 "Failed to wait for container to be frozen");
2341 }
2342
2343 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2344 {
2345 if (!ops->hierarchies)
2346 return ret_set_errno(-1, ENOENT);
2347
2348 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2349 return cg_legacy_freeze(ops);
2350
2351 return cg_unified_freeze(ops, timeout);
2352 }
2353
2354 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2355 {
2356 struct hierarchy *h;
2357
2358 h = get_hierarchy(ops, "freezer");
2359 if (!h)
2360 return ret_set_errno(-1, ENOENT);
2361
2362 return lxc_write_openat(h->container_full_path, "freezer.state",
2363 "THAWED", STRLITERALLEN("THAWED"));
2364 }
2365
2366 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2367 {
2368 return cg_unified_freeze_do(ops, timeout, "0", 0,
2369 "Failed to create epoll instance to wait for container unfreeze",
2370 "Failed to wait for container to be unfrozen");
2371 }
2372
2373 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2374 {
2375 if (!ops->hierarchies)
2376 return ret_set_errno(-1, ENOENT);
2377
2378 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2379 return cg_legacy_unfreeze(ops);
2380
2381 return cg_unified_unfreeze(ops, timeout);
2382 }
2383
2384 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2385 const char *controller, bool limiting)
2386 {
2387 struct hierarchy *h;
2388
2389 h = get_hierarchy(ops, controller);
2390 if (!h)
2391 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
2392 controller ? controller : "(null)");
2393
2394 if (limiting)
2395 return h->container_limit_path
2396 ? h->container_limit_path + strlen(h->mountpoint)
2397 : NULL;
2398
2399 return h->container_full_path
2400 ? h->container_full_path + strlen(h->mountpoint)
2401 : NULL;
2402 }
2403
2404 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2405 const char *controller)
2406 {
2407 return cgfsng_get_cgroup_do(ops, controller, false);
2408 }
2409
2410 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2411 const char *controller)
2412 {
2413 return cgfsng_get_cgroup_do(ops, controller, true);
2414 }
2415
2416 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2417 * which must be freed by the caller.
2418 */
2419 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2420 const char *inpath,
2421 const char *filename)
2422 {
2423 return must_make_path(h->mountpoint, inpath, filename, NULL);
2424 }
2425
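/*
 * Attach @pid to the container's unified cgroup. Because of cgroup2's
 * "no internal processes" rule, a cgroup that has controllers enabled in
 * cgroup.subtree_control may not contain processes itself, so writing to
 * cgroup.procs can fail with EBUSY. We therefore first try the ".lxc"
 * leaf created at container start, then the cgroup itself, and on EBUSY
 * keep probing ".lxc-1", ".lxc-2", ... until an attach succeeds or we
 * give up after 1000 attempts.
 */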
2426 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2427 {
2428 int idx = 1;
2429 int ret;
2430 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2431 ssize_t pidstr_len;
2432
2433 /* Create leaf cgroup. */
2434 ret = mkdirat(unified_fd, ".lxc", 0755);
2435 if (ret < 0 && errno != EEXIST)
2436 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2437
2438 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2439 if (pidstr_len < 0)
2440 return pidstr_len;
2441
2442 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2443 if (ret < 0)
2444 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2445 if (ret == 0)
2446 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2447
2448 /* this is a non-leaf node */
2449 if (errno != EBUSY)
2450 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2451
2452 do {
2453 bool rm = false;
2454 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2455 char *slash = attach_cgroup;
2456
2457 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2458 if (ret < 0)
2459 return ret;
2460
2461 /*
2462 * This shouldn't really happen but the compiler might complain
2463 * that a short write would cause a buffer overrun. So be on
2464 * the safe side.
2465 */
2466 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2467 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2468
2469 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2470 *slash = '\0';
2471
2472 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2473 if (ret < 0 && errno != EEXIST)
2474 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2475 if (ret == 0)
2476 rm = true;
2477
2478 *slash = '/';
2479
2480 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2481 if (ret == 0)
2482 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2483
2484 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2485 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2486
2487 /* this is a non-leaf node */
2488 if (errno != EBUSY)
2489 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2490
2491 idx++;
2492 } while (idx < 1000);
2493
2494 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2495 }
2496
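/*
 * Child half of the user-namespace attach dance (see the wrapper functions
 * further below): this runs inside the container's user namespace, opens
 * write-only fds for ".lxc/cgroup.procs" and "cgroup.procs", and ships both
 * back to the parent over the socketpair so the parent can perform the
 * actual move.
 */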
2497 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2498 int unified_fd, int *sk_fd)
2499 {
2500 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2501 int target_fds[2];
2502 ssize_t ret;
2503
2504 /* Create leaf cgroup. */
2505 ret = mkdirat(unified_fd, ".lxc", 0755);
2506 if (ret < 0 && errno != EEXIST)
2507 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2508
2509 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2510 if (target_fd0 < 0)
2511 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2512 target_fds[0] = target_fd0;
2513
2514 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2515 if (target_fd1 < 0)
2516 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2517 target_fds[1] = target_fd1;
2518
2519 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2520 if (ret <= 0)
2521 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2522 target_fd0, target_fd1);
2523
2524 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2525 }
2526
2527 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2528 int *sk_fd, pid_t pid)
2529 {
2530 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2531 int target_fds[2];
2532 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2533 size_t pidstr_len;
2534 ssize_t ret;
2535
2536 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
2537 if (ret <= 0)
2538 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2539 target_fd0 = target_fds[0];
2540 target_fd1 = target_fds[1];
2541
2542 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2543
2544 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2545 if (ret > 0 && ret == pidstr_len)
2546 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2547
2548 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2549 if (ret > 0 && ret == pidstr_len)
2550 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2551
2552 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2553 target_fd0, target_fd1);
2554 }
2555
2556 struct userns_exec_unified_attach_data {
2557 const struct lxc_conf *conf;
2558 int unified_fd;
2559 int sk_pair[2];
2560 pid_t pid;
2561 };
2562
2563 static int cgroup_unified_attach_child_wrapper(void *data)
2564 {
2565 struct userns_exec_unified_attach_data *args = data;
2566
2567 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2568 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2569 return ret_errno(EINVAL);
2570
2571 close_prot_errno_disarm(args->sk_pair[0]);
2572 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2573 &args->sk_pair[1]);
2574 }
2575
2576 static int cgroup_unified_attach_parent_wrapper(void *data)
2577 {
2578 struct userns_exec_unified_attach_data *args = data;
2579
2580 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2581 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2582 return ret_errno(EINVAL);
2583
2584 close_prot_errno_disarm(args->sk_pair[1]);
2585 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2586 args->pid);
2587 }
2588
2589 /* Technically, we're always at a delegation boundary here (this is especially
2590 * true when cgroup namespaces are available). The reasoning is that in order
2591 * for us to have been able to start a container in the first place the root
2592 * cgroup must have been a leaf node. Now, either the container's init system
2593 * has populated the cgroup and kept it as a leaf node or it has created
2594 * subtrees. In the former case we simply attach to the leaf node we created
2595 * when we started the container; in the latter case we create our own
2596 * cgroup for the attaching process.
2597 */
2598 static int __cg_unified_attach(const struct hierarchy *h,
2599 const struct lxc_conf *conf, const char *name,
2600 const char *lxcpath, pid_t pid,
2601 const char *controller)
2602 {
2603 __do_close int unified_fd = -EBADF;
2604 __do_free char *path = NULL, *cgroup = NULL;
2605 int ret;
2606
2607 if (!conf || !name || !lxcpath || pid <= 0)
2608 return ret_errno(EINVAL);
2609
2610 ret = cgroup_attach(conf, name, lxcpath, pid);
2611 if (ret == 0)
2612 return log_trace(0, "Attached to unified cgroup via command handler");
2613 if (ret != -ENOCGROUP2)
2614 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2615
2616 /* Fall back to retrieving the path for the unified cgroup. */
2617 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2618 /* not running */
2619 if (!cgroup)
2620 return 0;
2621
2622 path = must_make_path(h->mountpoint, cgroup, NULL);
2623
2624 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2625 if (unified_fd < 0)
2626 return ret_errno(EBADF);
2627
2628 if (!lxc_list_empty(&conf->id_map)) {
2629 struct userns_exec_unified_attach_data args = {
2630 .conf = conf,
2631 .unified_fd = unified_fd,
2632 .pid = pid,
2633 };
2634
2635 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2636 if (ret < 0)
2637 return -errno;
2638
2639 ret = userns_exec_minimal(conf,
2640 cgroup_unified_attach_parent_wrapper,
2641 &args,
2642 cgroup_unified_attach_child_wrapper,
2643 &args);
2644 } else {
2645 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2646 }
2647
2648 return ret;
2649 }
2650
2651 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2652 const struct lxc_conf *conf,
2653 const char *name, const char *lxcpath,
2654 pid_t pid)
2655 {
2656 int len, ret;
2657 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2658
2659 if (!ops)
2660 return ret_set_errno(false, ENOENT);
2661
2662 if (!ops->hierarchies)
2663 return true;
2664
2665 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2666 if (len < 0)
2667 return false;
2668
2669 for (int i = 0; ops->hierarchies[i]; i++) {
2670 __do_free char *fullpath = NULL, *path = NULL;
2671 struct hierarchy *h = ops->hierarchies[i];
2672
2673 if (h->version == CGROUP2_SUPER_MAGIC) {
2674 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2675 h->controllers[0]);
2676 if (ret < 0)
2677 return false;
2678
2679 continue;
2680 }
2681
2682 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2683 /* not running */
2684 if (!path)
2685 return false;
2686
2687 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2688 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2689 if (ret < 0)
2690 return log_error_errno(false, errno, "Failed to attach %d to %s",
2691 (int)pid, fullpath);
2692 }
2693
2694 return true;
2695 }
2696
2697 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2698 * don't have a cgroup_data set up, so we ask the running container through the
2699 * commands API for the cgroup path.
2700 */
2701 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2702 char *value, size_t len, const char *name,
2703 const char *lxcpath)
2704 {
2705 __do_free char *path = NULL;
2706 __do_free char *controller = NULL;
2707 char *p;
2708 struct hierarchy *h;
2709 int ret = -1;
2710
2711 if (!ops)
2712 return ret_set_errno(-1, ENOENT);
2713
2714 controller = must_copy_string(filename);
2715 p = strchr(controller, '.');
2716 if (p)
2717 *p = '\0';
2718
2719 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2720 /* not running */
2721 if (!path)
2722 return -1;
2723
2724 h = get_hierarchy(ops, controller);
2725 if (h) {
2726 __do_free char *fullpath = NULL;
2727
2728 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2729 ret = lxc_read_from_file(fullpath, value, len);
2730 }
2731
2732 return ret;
2733 }
2734
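/*
 * Parse the access part of a devices cgroup rule, i.e. up to three
 * characters out of "rwm" (read, write, mknod), e.g. the "rwm" in
 * "c 1:3 rwm". Any other character is rejected with EINVAL.
 */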
2735 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2736 {
2737 for (int count = 0; count < 3; count++, val++) {
2738 switch (*val) {
2739 case 'r':
2740 device->access[count] = *val;
2741 break;
2742 case 'w':
2743 device->access[count] = *val;
2744 break;
2745 case 'm':
2746 device->access[count] = *val;
2747 break;
2748 case '\n':
2749 case '\0':
2750 count = 3;
2751 break;
2752 default:
2753 return ret_errno(EINVAL);
2754 }
2755 }
2756
2757 return 0;
2758 }
2759
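/*
 * Parse a full devices cgroup rule of the form
 *
 *     <type> <major>:<minor> <access>
 *
 * where <type> is 'a', 'b', or 'c', major and minor are numbers or '*'
 * (meaning "any", stored as -1), and <access> is handled by
 * device_cgroup_parse_access() above. The bare value "a" is the global
 * allow-/deny-all rule. Whether this is an allow or a deny rule is derived
 * from @key ("devices.allow" vs. "devices.deny").
 */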
2760 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2761 const char *val)
2762 {
2763 int count, ret;
2764 char temp[50];
2765
2766 if (strequal("devices.allow", key))
2767 device->allow = 1; /* allow the device */
2768 else
2769 device->allow = 0; /* deny the device */
2770
2771 if (strequal(val, "a")) {
2772 /* global rule */
2773 device->type = 'a';
2774 device->major = -1;
2775 device->minor = -1;
2776 return 0;
2777 }
2778
2779 switch (*val) {
2780 case 'a':
2781 __fallthrough;
2782 case 'b':
2783 __fallthrough;
2784 case 'c':
2785 device->type = *val;
2786 break;
2787 default:
2788 return -1;
2789 }
2790
2791 val++;
2792 if (!isspace(*val))
2793 return -1;
2794 val++;
2795 if (*val == '*') {
2796 device->major = -1;
2797 val++;
2798 } else if (isdigit(*val)) {
2799 memset(temp, 0, sizeof(temp));
2800 for (count = 0; count < sizeof(temp) - 1; count++) {
2801 temp[count] = *val;
2802 val++;
2803 if (!isdigit(*val))
2804 break;
2805 }
2806 ret = lxc_safe_int(temp, &device->major);
2807 if (ret)
2808 return -1;
2809 } else {
2810 return -1;
2811 }
2812 if (*val != ':')
2813 return -1;
2814 val++;
2815
2816 /* read minor */
2817 if (*val == '*') {
2818 device->minor = -1;
2819 val++;
2820 } else if (isdigit(*val)) {
2821 memset(temp, 0, sizeof(temp));
2822 for (count = 0; count < sizeof(temp) - 1; count++) {
2823 temp[count] = *val;
2824 val++;
2825 if (!isdigit(*val))
2826 break;
2827 }
2828 ret = lxc_safe_int(temp, &device->minor);
2829 if (ret)
2830 return -1;
2831 } else {
2832 return -1;
2833 }
2834 if (!isspace(*val))
2835 return -1;
2836
2837 return device_cgroup_parse_access(device, ++val);
2838 }
2839
2840 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2841 * don't have a cgroup_data set up, so we ask the running container through the
2842 * commands API for the cgroup path.
2843 */
2844 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2845 const char *key, const char *value,
2846 const char *name, const char *lxcpath)
2847 {
2848 __do_free char *path = NULL;
2849 __do_free char *controller = NULL;
2850 char *p;
2851 struct hierarchy *h;
2852 int ret = -1;
2853
2854 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2855 is_empty_string(name) || is_empty_string(lxcpath))
2856 return ret_errno(EINVAL);
2857
2858 controller = must_copy_string(key);
2859 p = strchr(controller, '.');
2860 if (p)
2861 *p = '\0';
2862
2863 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2864 struct device_item device = {};
2865
2866 ret = device_cgroup_rule_parse(&device, key, value);
2867 if (ret < 0)
2868 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2869 key, value);
2870
2871 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2872 if (ret < 0)
2873 return -1;
2874
2875 return 0;
2876 }
2877
2878 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2879 /* not running */
2880 if (!path)
2881 return -1;
2882
2883 h = get_hierarchy(ops, controller);
2884 if (h) {
2885 __do_free char *fullpath = NULL;
2886
2887 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2888 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2889 }
2890
2891 return ret;
2892 }
2893
2894 /* take devices cgroup line
2895 * /dev/foo rwx
2896 * and convert it to a valid
2897 * type major:minor mode
2898 * line. Return <0 on error. The dest buffer passed to convert_devpath()
2899 * below is preallocated and long enough to hold the output.
2900 */
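/*
 * For example, "/dev/null rwm" stats /dev/null (a character device with
 * major 1, minor 3) and yields the rule "c 1:3 rwm" once convert_devpath()
 * has formatted it.
 */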
2901 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2902 const char *devpath)
2903 {
2904 __do_free char *path = NULL;
2905 char *mode = NULL;
2906 int n_parts, ret;
2907 char *p;
2908 struct stat sb;
2909
2910 path = must_copy_string(devpath);
2911
2912 /*
2913 * Read path followed by mode. Ignore any trailing text.
2914 * A ' # comment' would be legal. Technically other text is not
2915 * legal, we could check for that if we cared to.
2916 */
2917 for (n_parts = 1, p = path; *p; p++) {
2918 if (*p != ' ')
2919 continue;
2920 *p = '\0';
2921
2922 if (n_parts != 1)
2923 break;
2924 p++;
2925 n_parts++;
2926
2927 while (*p == ' ')
2928 p++;
2929
2930 mode = p;
2931
2932 if (*p == '\0')
2933 return ret_set_errno(-1, EINVAL);
2934 }
2935
2936 if (!mode)
2937 return ret_errno(EINVAL);
2938
2939 if (device_cgroup_parse_access(device, mode) < 0)
2940 return -1;
2941
2942 ret = stat(path, &sb);
2943 if (ret < 0)
2944 return ret_set_errno(-1, errno);
2945
2946 mode_t m = sb.st_mode & S_IFMT;
2947 switch (m) {
2948 case S_IFBLK:
2949 device->type = 'b';
2950 break;
2951 case S_IFCHR:
2952 device->type = 'c';
2953 break;
2954 default:
2955 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2956 }
2957
2958 device->major = MAJOR(sb.st_rdev);
2959 device->minor = MINOR(sb.st_rdev);
2960 device->allow = 1;
2961
2962 return 0;
2963 }
2964
2965 static int convert_devpath(const char *invalue, char *dest)
2966 {
2967 struct device_item device = {};
2968 int ret;
2969
2970 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2971 if (ret < 0)
2972 return -1;
2973
2974 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2975 device.minor, device.access);
2976 if (ret < 0)
2977 return log_error_errno(ret, -ret,
2978 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2979 device.type, device.major, device.minor,
2980 device.access);
2981
2982 return 0;
2983 }
2984
2985 /* Called from setup_limits - here we have the container's cgroup_data because
2986 * we created the cgroups.
2987 */
2988 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2989 const char *value, bool is_cpuset)
2990 {
2991 __do_free char *controller = NULL;
2992 char *p;
2993 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2994 char converted_value[50];
2995 struct hierarchy *h;
2996
2997 controller = must_copy_string(filename);
2998 p = strchr(controller, '.');
2999 if (p)
3000 *p = '\0';
3001
3002 if (strequal("devices.allow", filename) && value[0] == '/') {
3003 int ret;
3004
3005 ret = convert_devpath(value, converted_value);
3006 if (ret < 0)
3007 return ret;
3008 value = converted_value;
3009 }
3010
3011 h = get_hierarchy(ops, controller);
3012 if (!h)
3013 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
3014
3015 if (is_cpuset) {
3016 int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
3017 if (ret)
3018 return ret;
3019 }
3020 return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
3021 }
3022
3023 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
3024 struct lxc_conf *conf,
3025 bool do_devices)
3026 {
3027 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
3028 struct lxc_list *cgroup_settings = &conf->cgroup;
3029 struct lxc_list *iterator, *next;
3030 struct lxc_cgroup *cg;
3031 bool ret = false;
3032
3033 if (!ops)
3034 return ret_set_errno(false, ENOENT);
3035
3036 if (!conf)
3037 return ret_set_errno(false, EINVAL);
3038
3039 cgroup_settings = &conf->cgroup;
3040 if (lxc_list_empty(cgroup_settings))
3041 return true;
3042
3043 if (!ops->hierarchies)
3044 return ret_set_errno(false, EINVAL);
3045
3046 if (pure_unified_layout(ops))
3047 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
3048
3049 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
3050 if (!sorted_cgroup_settings)
3051 return false;
3052
3053 lxc_list_for_each(iterator, sorted_cgroup_settings) {
3054 cg = iterator->elem;
3055
3056 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
3057 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
3058 if (do_devices && (errno == EACCES || errno == EPERM)) {
3059 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3060 continue;
3061 }
3062 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3063 goto out;
3064 }
3065 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
3066 }
3067 }
3068
3069 ret = true;
3070 INFO("Limits for the legacy cgroup hierarchies have been setup");
3071 out:
3072 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
3073 lxc_list_del(iterator);
3074 free(iterator);
3075 }
3076
3077 return ret;
3078 }
3079
3080 /*
3081 * Some of the parsing logic comes from the original cgroup device v1
3082 * implementation in the kernel.
3083 */
3084 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
3085 struct lxc_conf *conf, const char *key,
3086 const char *val)
3087 {
3088 struct device_item device_item = {};
3089 int ret;
3090
3091 if (strequal("devices.allow", key) && *val == '/')
3092 ret = device_cgroup_rule_parse_devpath(&device_item, val);
3093 else
3094 ret = device_cgroup_rule_parse(&device_item, key, val);
3095 if (ret < 0)
3096 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
3097
3098 /*
3099 * Note that bpf_list_add_device() indicates whether or not it had to
3100 * alter the current device list by returning 1 or 0; both indicate
3101 * success. A negative return value indicates an error.
3102 */
3103 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
3104 if (ret < 0)
3105 return -1;
3106
3107 return 0;
3108 }
3109
3110 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
3111 struct lxc_handler *handler)
3112 {
3113 struct lxc_list *cgroup_settings, *iterator;
3114 struct hierarchy *h;
3115 struct lxc_conf *conf;
3116
3117 if (!ops)
3118 return ret_set_errno(false, ENOENT);
3119
3120 if (!ops->hierarchies)
3121 return true;
3122
3123 if (!ops->container_cgroup)
3124 return ret_set_errno(false, EINVAL);
3125
3126 if (!handler || !handler->conf)
3127 return ret_set_errno(false, EINVAL);
3128 conf = handler->conf;
3129
3130 cgroup_settings = &conf->cgroup2;
3131 if (lxc_list_empty(cgroup_settings))
3132 return true;
3133
3134 if (!pure_unified_layout(ops))
3135 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
3136
3137 if (!ops->unified)
3138 return false;
3139 h = ops->unified;
3140
3141 lxc_list_for_each (iterator, cgroup_settings) {
3142 struct lxc_cgroup *cg = iterator->elem;
3143 int ret;
3144
3145 if (strnequal("devices", cg->subsystem, 7))
3146 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
3147 else
3148 ret = lxc_write_openat(h->container_limit_path, cg->subsystem, cg->value, strlen(cg->value));
3149 if (ret < 0)
3150 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3151
3152 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3153 }
3154
3155 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
3156 }
3157
3158 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
3159 {
3160 struct lxc_conf *conf;
3161 struct hierarchy *unified;
3162
3163 if (!ops)
3164 return ret_set_errno(false, ENOENT);
3165
3166 if (!ops->hierarchies)
3167 return true;
3168
3169 if (!ops->container_cgroup)
3170 return ret_set_errno(false, EEXIST);
3171
3172 if (!handler || !handler->conf)
3173 return ret_set_errno(false, EINVAL);
3174 conf = handler->conf;
3175
3176 unified = ops->unified;
3177 if (!unified || !unified->bpf_device_controller ||
3178 !unified->container_full_path ||
3179 lxc_list_empty(&(conf->bpf_devices).device_item))
3180 return true;
3181
3182 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
3183 }
3184
3185 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
3186 {
3187 __do_close int dfd_final = -EBADF;
3188 __do_free char *add_controllers = NULL, *copy = NULL;
3189 size_t full_len = 0;
3190 struct hierarchy *unified;
3191 int dfd_cur, ret;
3192 char *cur;
3193 char **it;
3194
3195 if (!ops->hierarchies || !pure_unified_layout(ops))
3196 return true;
3197
3198 unified = ops->unified;
3199 if (!unified->controllers[0])
3200 return true;
3201
3202 /* For now we simply enable all controllers that we have detected by
3203 * creating a string like "+memory +pids +cpu +io".
3204 * TODO: In the near future we might want to support "-<controller>"
3205 * etc. but whether supporting semantics like this makes sense will
3206 * need some thinking.
3207 */
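/*
 * The string built below is what gets written to each cgroup.subtree_control
 * file along the path, equivalent to something like:
 *
 *     echo "+cpu +io +memory +pids" > .../cgroup.subtree_control
 *
 * (the exact controller set depends on what was detected above).
 */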
3208 for (it = unified->controllers; it && *it; it++) {
3209 full_len += strlen(*it) + 2;
3210 add_controllers = must_realloc(add_controllers, full_len + 1);
3211
3212 if (unified->controllers[0] == *it)
3213 add_controllers[0] = '\0';
3214
3215 (void)strlcat(add_controllers, "+", full_len + 1);
3216 (void)strlcat(add_controllers, *it, full_len + 1);
3217
3218 if (*(it + 1))
3219 (void)strlcat(add_controllers, " ", full_len + 1);
3220 }
3221
3222 copy = strdup(cgroup);
3223 if (!copy)
3224 return false;
3225
3226 /*
3227 * Placing the write to cgroup.subtree_control before the open() is
3228 * intentional because of the cgroup2 delegation model. It enforces
3229 * that leaf cgroups don't have any controllers enabled for delegation.
3230 */
3231 dfd_cur = unified->dfd_base;
3232 lxc_iterate_parts(cur, copy, "/") {
3233 /*
3234 * Even though we vetted the paths when we parsed the config
3235 * we're paranoid here and check that the path is neither
3236 * absolute nor walks upwards.
3237 */
3238 if (abspath(cur))
3239 return syserrno_set(-EINVAL, "No absolute paths allowed");
3240
3241 if (strnequal(cur, "..", STRLITERALLEN("..")))
3242 return syserrno_set(-EINVAL, "No upward walking paths allowed");
3243
3244 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
3245 if (ret < 0)
3246 return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3247
3248 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3249
3250 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
3251 if (dfd_final < 0)
3252 return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
3253 if (dfd_cur != unified->dfd_base)
3254 close(dfd_cur);
3255 /*
3256 * Leave dfd_final pointing to the last fd we opened so
3257 * it will be automatically zapped if we return early.
3258 */
3259 dfd_cur = dfd_final;
3260 }
3261
3262 return true;
3263 }
3264
3265 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
3266 {
3267 if (!ops)
3268 return ret_set_errno(false, ENOENT);
3269
3270 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
3271 }
3272
3273 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
3274 {
3275 if (!ops)
3276 return ret_set_errno(false, ENOENT);
3277
3278 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
3279 }
3280
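/*
 * Build the list of cgroup files whose ownership must be handed to the
 * container. The kernel publishes the set of safely delegatable files in
 * /sys/kernel/cgroup/delegate (one name per line, e.g. "cgroup.procs",
 * "cgroup.threads", "cgroup.subtree_control"); if that file cannot be read
 * we fall back to a conservative hardcoded set. "cgroup.procs" is excluded
 * from the list here because it is chowned unconditionally anyway.
 */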
3281 static void cg_unified_delegate(char ***delegate)
3282 {
3283 __do_free char *buf = NULL;
3284 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
3285 char *token;
3286 int idx;
3287
3288 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
3289 if (!buf) {
3290 for (char **p = standard; p && *p; p++) {
3291 idx = append_null_to_list((void ***)delegate);
3292 (*delegate)[idx] = must_copy_string(*p);
3293 }
3294 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
3295 return;
3296 }
3297
3298 lxc_iterate_parts(token, buf, " \t\n") {
3299 /*
3300 * We always need to chown this for both cgroup and
3301 * cgroup2.
3302 */
3303 if (strequal(token, "cgroup.procs"))
3304 continue;
3305
3306 idx = append_null_to_list((void ***)delegate);
3307 (*delegate)[idx] = must_copy_string(token);
3308 }
3309 }
3310
3311 /* At startup, parse_hierarchies finds all the info we need about cgroup
3312 * mountpoints and current cgroups, and stores it in @d.
3313 */
3314 static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
3315 {
3316 __do_free char *basecginfo = NULL, *line = NULL;
3317 __do_free_string_list char **klist = NULL, **nlist = NULL;
3318 __do_fclose FILE *f = NULL;
3319 int ret;
3320 size_t len = 0;
3321
3322 /* Root spawned containers escape the current cgroup, so use init's
3323 * cgroups as our base in that case.
3324 */
3325 if (!relative && (geteuid() == 0))
3326 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3327 else
3328 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3329 if (!basecginfo)
3330 return ret_set_errno(-1, ENOMEM);
3331
3332 ret = get_existing_subsystems(&klist, &nlist);
3333 if (ret < 0)
3334 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
3335
3336 f = fopen("/proc/self/mountinfo", "re");
3337 if (!f)
3338 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
3339
3340 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
3341
3342 while (getline(&line, &len, f) != -1) {
3343 __do_free char *base_cgroup = NULL, *mountpoint = NULL;
3344 __do_free_string_list char **controller_list = NULL;
3345 int type;
3346 bool writeable;
3347
3348 type = get_cgroup_version(line);
3349 if (type == 0)
3350 continue;
3351
3352 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
3353 continue;
3354
3355 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
3356 if (type == CGROUP2_SUPER_MAGIC)
3357 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3358 else if (type == CGROUP_SUPER_MAGIC)
3359 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3360 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
3361 if (type == CGROUP_SUPER_MAGIC)
3362 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3363 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3364 if (type == CGROUP2_SUPER_MAGIC)
3365 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3366 }
3367
3368 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
3369 if (!controller_list && type == CGROUP_SUPER_MAGIC)
3370 continue;
3371
3372 if (type == CGROUP_SUPER_MAGIC)
3373 if (controller_list_is_dup(ops->hierarchies, controller_list)) {
3374 TRACE("Skipping duplicating controller");
3375 continue;
3376 }
3377
3378 mountpoint = cg_hybrid_get_mountpoint(line);
3379 if (!mountpoint) {
3380 WARN("Failed parsing mountpoint from \"%s\"", line);
3381 continue;
3382 }
3383
3384 if (type == CGROUP_SUPER_MAGIC)
3385 base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3386 else
3387 base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, NULL, CGROUP2_SUPER_MAGIC);
3388 if (!base_cgroup) {
3389 WARN("Failed to find current cgroup");
3390 continue;
3391 }
3392
3393 if (type == CGROUP2_SUPER_MAGIC)
3394 writeable = test_writeable_v2(mountpoint, base_cgroup);
3395 else
3396 writeable = test_writeable_v1(mountpoint, base_cgroup);
3397 if (!writeable) {
3398 TRACE("The %s group is not writeable", base_cgroup);
3399 continue;
3400 }
3401
3402 if (type == CGROUP2_SUPER_MAGIC)
3403 ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
3404 else
3405 ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
3406 if (ret)
3407 return syserrno(ret, "Failed to add cgroup hierarchy");
3408 if (ops->unified && unprivileged)
3409 cg_unified_delegate(&(ops->unified)->cgroup2_chown);
3410 }
3411
3412 /* Verify that all controllers listed in lxc.cgroup.use and all crucial
3413 * controllers are accounted for.
3414 */
3415 if (!all_controllers_found(ops))
3416 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
3417
3418 return 0;
3419 }
3420
3421 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
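/*
 * On a cgroup2 (or hybrid) system /proc/<pid>/cgroup contains a line of
 * the form "0::/some/cgroup/path", e.g.:
 *
 *     0::/user.slice/user-1000.slice/session-2.scope
 *
 * We locate that entry, strip the "0::" prefix plus any leading "/", and,
 * for non-relative setups, prune a trailing init.scope.
 */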
3422 static char *cg_unified_get_current_cgroup(bool relative)
3423 {
3424 __do_free char *basecginfo = NULL, *copy = NULL;
3425 char *base_cgroup;
3426
3427 if (!relative && (geteuid() == 0))
3428 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3429 else
3430 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3431 if (!basecginfo)
3432 return NULL;
3433
3434 base_cgroup = strstr(basecginfo, "0::/");
3435 if (!base_cgroup)
3436 return NULL;
3437
3438 base_cgroup = base_cgroup + 3;
3439 copy = copy_to_eol(base_cgroup);
3440 if (!copy)
3441 return NULL;
3442 trim(copy);
3443
3444 if (!relative) {
3445 base_cgroup = prune_init_scope(copy);
3446 if (!base_cgroup)
3447 return NULL;
3448 } else {
3449 base_cgroup = copy;
3450 }
3451
3452 if (abspath(base_cgroup))
3453 base_cgroup = deabs(base_cgroup);
3454
3455 /* We're allowing base_cgroup to be "". */
3456 return strdup(base_cgroup);
3457 }
3458
3459 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3460 bool unprivileged)
3461 {
3462 __do_free char *base_cgroup = NULL;
3463 int ret;
3464
3465 base_cgroup = cg_unified_get_current_cgroup(relative);
3466 if (!base_cgroup)
3467 return ret_errno(EINVAL);
3468
3469 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3470 * we should verify that here. The reason I'm not doing it right away is
3471 * that I'm not convinced that lxc.cgroup.use will be the future since it is
3472 * a global property. I'd much rather have an option that lets you request
3473 * controllers per container.
3474 */
3475
3476 ret = add_hierarchy(ops, NULL,
3477 must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
3478 move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
3479 if (ret)
3480 return syserrno(ret, "Failed to add unified cgroup hierarchy");
3481
3482 if (unprivileged)
3483 cg_unified_delegate(&(ops->unified)->cgroup2_chown);
3484
3485 if (bpf_devices_cgroup_supported())
3486 ops->unified->bpf_device_controller = 1;
3487
3488 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3489 return CGROUP2_SUPER_MAGIC;
3490 }
3491
3492 static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
3493 {
3494 __do_close int dfd = -EBADF;
3495 bool relative = conf->cgroup_meta.relative;
3496 int ret;
3497 const char *tmp;
3498
3499 if (ops->dfd_mnt_cgroupfs_host >= 0)
3500 return ret_errno(EINVAL);
3501
3502 /*
3503 * I don't see the need for allowing symlinks here. If users want to
3504 * have their hierarchy available in different locations I strongly
3505 * suggest bind-mounts.
3506 */
3507 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3508 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3509 if (dfd < 0)
3510 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3511
3512 tmp = lxc_global_config_value("lxc.cgroup.use");
3513 if (tmp) {
3514 __do_free char *pin = NULL;
3515 char *chop, *cur;
3516
3517 pin = must_copy_string(tmp);
3518 chop = pin;
3519
3520 lxc_iterate_parts(cur, chop, ",")
3521 must_append_string(&ops->cgroup_use, cur);
3522 }
3523
3524 /*
3525 * Keep dfd referenced by the cleanup function and actually move the fd
3526 * once we know the initialization succeeded. So if we fail we clean up
3527 * the dfd.
3528 */
3529 ops->dfd_mnt_cgroupfs_host = dfd;
3530
3531 if (unified_cgroup_fd(dfd))
3532 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
3533 else
3534 ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
3535 if (ret < 0)
3536 return syserrno(ret, "Failed to initialize cgroups");
3537
3538 /* Transfer ownership to cgroup_ops. */
3539 move_fd(dfd);
3540 return 0;
3541 }
3542
3543 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3544 {
3545 const char *cgroup_pattern;
3546
3547 if (!ops)
3548 return ret_set_errno(-1, ENOENT);
3549
3550 /* copy system-wide cgroup information */
3551 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3552 if (cgroup_pattern && !strequal(cgroup_pattern, ""))
3553 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3554
3555 return 0;
3556 }
3557
3558 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
3559 {
3560 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3561
3562 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3563 if (!cgfsng_ops)
3564 return ret_set_errno(NULL, ENOMEM);
3565
3566 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3567 cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;
3568
3569 if (__cgroup_init(cgfsng_ops, conf))
3570 return NULL;
3571
3572 cgfsng_ops->data_init = cgfsng_data_init;
3573 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3574 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3575 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3576 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3577 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3578 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3579 cgfsng_ops->payload_create = cgfsng_payload_create;
3580 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3581 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
3582 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3583 cgfsng_ops->get = cgfsng_get;
3584 cgfsng_ops->set = cgfsng_set;
3585 cgfsng_ops->freeze = cgfsng_freeze;
3586 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3587 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3588 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3589 cgfsng_ops->driver = "cgfsng";
3590 cgfsng_ops->version = "1.0.0";
3591 cgfsng_ops->attach = cgfsng_attach;
3592 cgfsng_ops->chown = cgfsng_chown;
3593 cgfsng_ops->mount = cgfsng_mount;
3594 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3595 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
3596
3597 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3598 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3599 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3600
3601 return move_ptr(cgfsng_ops);
3602 }
3603
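/*
 * Public attach helper, called from __cg_unified_attach() above as the
 * preferred attach path: it fetches the container's cgroup2 fd over the
 * command socket and moves @pid into a leaf cgroup, re-executing the move
 * inside the container's user namespace when an id mapping is in use.
 * Returns -ENOCGROUP2 when no unified cgroup fd is available so callers
 * can fall back to path-based attaching.
 */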
3604 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3605 const char *lxcpath, pid_t pid)
3606 {
3607 __do_close int unified_fd = -EBADF;
3608 int ret;
3609
3610 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3611 return ret_errno(EINVAL);
3612
3613 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3614 if (unified_fd < 0)
3615 return ret_errno(ENOCGROUP2);
3616
3617 if (!lxc_list_empty(&conf->id_map)) {
3618 struct userns_exec_unified_attach_data args = {
3619 .conf = conf,
3620 .unified_fd = unified_fd,
3621 .pid = pid,
3622 };
3623
3624 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3625 if (ret < 0)
3626 return -errno;
3627
3628 ret = userns_exec_minimal(conf,
3629 cgroup_unified_attach_parent_wrapper,
3630 &args,
3631 cgroup_unified_attach_child_wrapper,
3632 &args);
3633 } else {
3634 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3635 }
3636
3637 return ret;
3638 }
3639
3640 /* Connects to the command socket and therefore isn't callable from a command handler. */
3641 int cgroup_get(const char *name, const char *lxcpath,
3642 const char *filename, char *buf, size_t len)
3643 {
3644 __do_close int unified_fd = -EBADF;
3645 ssize_t ret;
3646
3647 if (is_empty_string(filename) || is_empty_string(name) ||
3648 is_empty_string(lxcpath))
3649 return ret_errno(EINVAL);
3650
3651 if ((buf && !len) || (len && !buf))
3652 return ret_errno(EINVAL);
3653
3654 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3655 if (unified_fd < 0)
3656 return ret_errno(ENOCGROUP2);
3657
3658 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3659 if (ret < 0)
3660 SYSERROR("Failed to read cgroup value");
3661
3662 return ret;
3663 }
3664
3665 /* Connects to the command socket and therefore isn't callable from a command handler. */
3666 int cgroup_set(const char *name, const char *lxcpath,
3667 const char *filename, const char *value)
3668 {
3669 __do_close int unified_fd = -EBADF;
3670 ssize_t ret;
3671
3672 if (is_empty_string(filename) || is_empty_string(value) ||
3673 is_empty_string(name) || is_empty_string(lxcpath))
3674 return ret_errno(EINVAL);
3675
3676 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3677 if (unified_fd < 0)
3678 return ret_errno(ENOCGROUP2);
3679
3680 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3681 struct device_item device = {};
3682
3683 ret = device_cgroup_rule_parse(&device, filename, value);
3684 if (ret < 0)
3685 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3686
3687 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3688 } else {
3689 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3690 }
3691
3692 return ret;
3693 }
3694
3695 static int do_cgroup_freeze(int unified_fd,
3696 const char *state_string,
3697 int state_num,
3698 int timeout,
3699 const char *epoll_error,
3700 const char *wait_error)
3701 {
3702 __do_close int events_fd = -EBADF;
3703 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3704 int ret;
3705 struct lxc_epoll_descr descr = {};
3706
3707 if (timeout != 0) {
3708 ret = lxc_mainloop_open(&descr);
3709 if (ret)
3710 return log_error_errno(-1, errno, "%s", epoll_error);
3711
3712 /* automatically cleaned up now */
3713 descr_ptr = &descr;
3714
3715 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3716 if (events_fd < 0)
3717 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3718
3719 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3720 if (ret < 0)
3721 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3722 }
3723
3724 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3725 if (ret < 0)
3726 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3727
3728 if (timeout != 0) {
3729 ret = lxc_mainloop(&descr, timeout);
3730 if (ret)
3731 return log_error_errno(-1, errno, "%s", wait_error);
3732 }
3733
3734 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3735 }
3736
3737 static inline int __cgroup_freeze(int unified_fd, int timeout)
3738 {
3739 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3740 "Failed to create epoll instance to wait for container freeze",
3741 "Failed to wait for container to be frozen");
3742 }
3743
3744 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3745 {
3746 __do_close int unified_fd = -EBADF;
3747 int ret;
3748
3749 if (is_empty_string(name) || is_empty_string(lxcpath))
3750 return ret_errno(EINVAL);
3751
3752 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3753 if (unified_fd < 0)
3754 return ret_errno(ENOCGROUP2);
3755
3756 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3757 ret = __cgroup_freeze(unified_fd, timeout);
3758 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3759 return ret;
3760 }
3761
3762 int __cgroup_unfreeze(int unified_fd, int timeout)
3763 {
3764 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3765 "Failed to create epoll instance to wait for container freeze",
3766 "Failed to wait for container to be frozen");
3767 }
3768
3769 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3770 {
3771 __do_close int unified_fd = -EBADF;
3772 int ret;
3773
3774 if (is_empty_string(name) || is_empty_string(lxcpath))
3775 return ret_errno(EINVAL);
3776
3777 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3778 if (unified_fd < 0)
3779 return ret_errno(ENOCGROUP2);
3780
3781 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3782 ret = __cgroup_unfreeze(unified_fd, timeout);
3783 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3784 return ret;
3785 }