/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "conf.h"
#include "config.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "storage/storage.h"
#include "utils.h"

#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif

#ifndef HAVE_STRLCAT
#include "include/strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/* Given a pointer to a NULL-terminated array of pointers, realloc to make room
 * for one more entry and keep the array NULL-terminated. Do not fail. Return
 * the index of the slot just before the terminating NULL - that is, the one
 * which is now available for use.
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	if (*list)
		for (; (*list)[newentry]; newentry++)
			;

	*list = must_realloc(*list, (newentry + 2) * sizeof(void **));
	(*list)[newentry + 1] = NULL;
	return newentry;
}

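/*
 * Illustrative sketch (not part of the build): how append_null_to_list() is
 * meant to be used to grow a NULL-terminated string list, using the
 * must_copy_string() helper this file already relies on.
 */
#if 0
static void example_append(void)
{
	char **list = NULL;
	int idx;

	/* list == NULL is fine; the first call allocates a two-slot array. */
	idx = append_null_to_list((void ***)&list);
	list[idx] = must_copy_string("cpu");

	idx = append_null_to_list((void ***)&list);
	list[idx] = must_copy_string("memory");

	/* list is now { "cpu", "memory", NULL }. */
}
#endif
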
/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (int i = 0; list[i]; i++)
		if (strcmp(list[i], entry) == 0)
			return true;

	return false;
}

/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t len;
	char *prefixed;

	len = strlen(entry);
	prefixed = must_realloc(NULL, len + 6);

	memcpy(prefixed, "name=", STRLITERALLEN("name="));
	memcpy(prefixed + STRLITERALLEN("name="), entry, len);
	prefixed[len + 5] = '\0';

	return prefixed;
}

/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem gets prefixed with "name=". Any controller which is both a kernel
 * and a named subsystem we refuse to use, because we cannot be sure which of
 * the two we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int newentry;
	char *copy;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	newentry = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0)
		copy = must_copy_string(entry);
	else if (string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[newentry] = copy;
}

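/*
 * Illustrative sketch (not part of the build): what must_append_controller()
 * produces for a couple of inputs, assuming "cpu" is in the kernel subsystem
 * list and "systemd" is not.
 */
#if 0
static void example_append_controller(char **klist, char **nlist)
{
	char **clist = NULL;

	/* Kernel subsystem: copied verbatim. */
	must_append_controller(klist, nlist, &clist, "cpu");

	/* Not a kernel subsystem: stored as "name=systemd". */
	must_append_controller(klist, nlist, &clist, "systemd");

	/* clist is now { "cpu", "name=systemd", NULL }. */
}
#endif
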
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers &&
			    !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];
			continue;
		} else if (pure_unified_layout(ops) &&
			   strcmp(controller, "devices") == 0) {
			if (ops->unified->bpf_device_controller)
				return ops->unified;
			break;
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}

#define BATCH_SIZE 50
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}

static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t full = oldlen + newlen;

	batch_realloc(dest, oldlen, full + 1);

	memcpy(*dest + oldlen, new, newlen + 1);
}

/* Slurp in a whole file */
static char *read_file(const char *fnam)
{
	__do_free char *buf = NULL, *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0, fulllen = 0;
	int linelen;

	f = fopen(fnam, "re");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}

	return move_ptr(buf);
}

/* Adapted, with modifications, from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1 << (bit % NBITS));
}

static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
}

static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
}

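/*
 * Illustrative sketch (not part of the build): the bit helpers operate on an
 * array of uint32_t words, so bit 34 lives in word 1, position 2.
 */
#if 0
static void example_bitops(void)
{
	uint32_t bitarr[BITS_TO_LONGS(64)] = {0}; /* two 32-bit words */

	set_bit(0, bitarr);   /* bitarr[0] == 0x1 */
	set_bit(34, bitarr);  /* bitarr[1] == 0x4 */
	clear_bit(0, bitarr); /* bitarr[0] == 0x0 */

	/* is_set(34, bitarr) is now true, is_set(0, bitarr) false. */
}
#endif
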
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}

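/*
 * Illustrative sketch (not part of the build): parsing the cpulist "0,2-3"
 * with lxc_cpumask() sets bits 0, 2 and 3, matching the comment above.
 */
#if 0
static void example_cpumask(void)
{
	__do_free uint32_t *mask = NULL;
	char buf[] = "0,2-3"; /* lxc_iterate_parts() modifies its argument */

	mask = lxc_cpumask(buf, 4);
	if (!mask)
		return;

	/* is_set(0), is_set(2) and is_set(3) are true; is_set(1) is false. */
}
#endif
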
/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
	int ret;

	for (size_t i = 0; i <= nbits; i++) {
		if (!is_set(i, bitarr))
			continue;

		ret = snprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0 || (size_t)ret >= sizeof(numstr))
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}

static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2;
	char *maxcpus = cpulist;
	size_t cpus = 0;

	c1 = strrchr(maxcpus, ',');
	if (c1)
		c1++;

	c2 = strrchr(maxcpus, '-');
	if (c2)
		c2++;

	if (!c1 && !c2)
		c1 = maxcpus;
	else if (c1 > c2)
		c2 = c1;
	else if (c1 < c2)
		c1 = c2;
	else if (!c1 && c2)
		c1 = c2;

	errno = 0;
	cpus = strtoul(c1, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}

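/*
 * Illustrative sketch (not part of the build): get_max_cpus() returns the
 * highest cpu number mentioned in a cpulist by parsing whatever follows the
 * last ',' or '-'.
 */
#if 0
static void example_get_max_cpus(void)
{
	char a[] = "0-3,7";
	char b[] = "0-15";

	/* get_max_cpus(a) == 7, get_max_cpus(b) == 15 */
	(void)get_max_cpus(a);
	(void)get_max_cpus(b);
}
#endif
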
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
					  char *child_cgroup, bool am_initialized)
{
	__do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
		       *offlinecpus = NULL, *posscpus = NULL;
	__do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
			   *possmask = NULL;
	int ret;
	ssize_t i;
	ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
	bool flipped_bit = false;

	fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0 || maxposs >= INT_MAX - 1)
		return false;

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file(__ISOL_CPUS);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		if (isdigit(isolcpus[0])) {
			/* Get maximum number of cpus found in isolated cpuset. */
			maxisol = get_max_cpus(isolcpus);
			if (maxisol < 0 || maxisol >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxisol)
			maxposs = maxisol;
		maxposs++;
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file(__OFFLINE_CPUS);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		if (isdigit(offlinecpus[0])) {
			/* Get maximum number of cpus found in offline cpuset. */
			maxoffline = get_max_cpus(offlinecpus);
			if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxoffline)
			maxposs = maxoffline;
		maxposs++;
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	if ((maxisol == 0) && (maxoffline == 0)) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (maxisol > 0) {
		isolmask = lxc_cpumask(isolcpus, maxposs);
		if (!isolmask)
			return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
	}

	if (maxoffline > 0) {
		offlinemask = lxc_cpumask(offlinecpus, maxposs);
		if (!offlinemask)
			return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
	}

	for (i = 0; i <= maxposs; i++) {
		if ((isolmask && !is_set(i, isolmask)) ||
		    (offlinemask && !is_set(i, offlinemask)) ||
		    !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	if (!flipped_bit) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false,
					       errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
					       child_cgroup);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}

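/*
 * Illustrative sketch (not part of the build): the filtering above computes
 * final = possible minus (isolated plus offline). With possible = "0-3" and
 * isolated = "2", the child cgroup would get "0,1,3".
 */
#if 0
static void example_filter(void)
{
	__do_free uint32_t *poss = NULL, *isol = NULL;
	__do_free char *result = NULL;
	char posslist[] = "0-3", isollist[] = "2";

	poss = lxc_cpumask(posslist, 5);
	isol = lxc_cpumask(isollist, 5);

	for (size_t i = 0; i < 5; i++)
		if (is_set(i, isol) && is_set(i, poss))
			clear_bit(i, poss);

	result = lxc_cpumask_to_cpulist(poss, 4); /* "0,1,3" */
}
#endif
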
/* Copy contents of parent(@path)/@file to @path/@file */
static bool copy_parent_file(const char *parent_cgroup,
			     const char *child_cgroup, const char *file)
{
	__do_free char *parent_file = NULL, *value = NULL;
	int len = 0;
	int ret;

	parent_file = must_make_path(parent_cgroup, file, NULL);
	len = lxc_read_from_file(parent_file, NULL, 0);
	if (len <= 0)
		return log_error_errno(false, errno, "Failed to determine buffer size");

	value = must_realloc(NULL, len + 1);
	value[len] = '\0';
	ret = lxc_read_from_file(parent_file, value, len);
	if (ret != len)
		return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);

	ret = lxc_write_openat(child_cgroup, file, value, len);
	if (ret < 0 && errno != EACCES)
		return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
				       value, child_cgroup, file);
	return true;
}

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->version == CGROUP2_SUPER_MAGIC;
}

/*
 * Initialize the cpuset hierarchy in the first directory of @cgroup_leaf and
 * set cgroup.clone_children so that children inherit settings. Since the
 * h->base_path is populated by init or ourselves, we know it is already
 * initialized.
 *
 * Returns -1 on error, 0 if we didn't create a cgroup, and 1 if we did.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
					     const char *cgroup_leaf)
{
	__do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
	__do_close int cgroup_fd = -EBADF;
	int fret = -1;
	int ret;
	char v;
	char *leaf, *slash;

	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	if (!cgroup_leaf)
		return ret_set_errno(-1, EINVAL);

	dup = strdup(cgroup_leaf);
	if (!dup)
		return ret_set_errno(-1, ENOMEM);

	parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);

	leaf = dup;
	leaf += strspn(leaf, "/");
	slash = strchr(leaf, '/');
	if (slash)
		*slash = '\0';
	child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
	if (slash)
		*slash = '/';

	fret = 1;
	ret = mkdir(child_cgroup, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);

		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(child_cgroup);
	if (cgroup_fd < 0)
		return -1;

	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* Copy parent's settings. */
	if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings. */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);

	return fret;
}

/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (int i = 0; l1[i]; i++)
		if (string_in_list(l2, l1[i]))
			return true;

	return false;
}

/* For a null-terminated list of controllers @clist, return true if any of
 * those controllers is already listed in the null-terminated list of
 * hierarchies @hlist. Realistically, if one is present, all must be present.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (controller_lists_intersect(hlist[i]->controllers, clist))
			return true;

	return false;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_found(struct hierarchy **hlist, char *entry)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (string_in_list(hlist[i]->controllers, entry))
			return true;

	return false;
}

/* Return true if all of the controllers which we require have been found. The
 * required list is freezer and anything in lxc.cgroup.use.
 */
static bool all_controllers_found(struct cgroup_ops *ops)
{
	struct hierarchy **hlist;

	if (!ops->cgroup_use)
		return true;

	hlist = ops->hierarchies;
	for (char **cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_found(hlist, *cur))
			return log_error(false, "No %s controller mountpoint found", *cur);

	return true;
}

/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	__do_free_string_list char **aret = NULL;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";

	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return log_error(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2)
		return log_error(NULL, "Corrupt mountinfo");
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	*p2 = ' ';

	return move_ptr(aret);
}

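/*
 * Illustrative sketch (not part of the build): the mountinfo line below is
 * made up but typical. Its fifth field (after four space-separated fields) is
 * the mountpoint, from which cg_hybrid_get_controllers() extracts
 * "cpu,cpuacct" and appends both controllers to the returned list.
 */
#if 0
static void example_hybrid_controllers(char **klist, char **nlist)
{
	__do_free_string_list char **clist = NULL;
	char line[] = "32 24 0:28 / /sys/fs/cgroup/cpu,cpuacct "
		      "rw,nosuid,nodev,noexec,relatime shared:9 - cgroup "
		      "cgroup rw,cpu,cpuacct";

	clist = cg_hybrid_get_controllers(klist, nlist, line, CGROUP_SUPER_MAGIC);
	/* clist is now { "cpu", "cpuacct", NULL } (given a suitable klist). */
}
#endif
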
static char **cg_unified_make_empty_controller(void)
{
	__do_free_string_list char **aret = NULL;
	int newentry;

	newentry = append_null_to_list((void ***)&aret);
	aret[newentry] = NULL;
	return move_ptr(aret);
}

static char **cg_unified_get_controllers(const char *file)
{
	__do_free char *buf = NULL;
	__do_free_string_list char **aret = NULL;
	char *sep = " \t\n";
	char *tok;

	buf = read_file(file);
	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, sep) {
		int newentry;
		char *copy;

		newentry = append_null_to_list((void ***)&aret);
		copy = must_copy_string(tok);
		aret[newentry] = copy;
	}

	return move_ptr(aret);
}

static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
				       char *container_base_path, int type)
{
	struct hierarchy *new;
	int newentry;

	new = zalloc(sizeof(*new));
	new->controllers = clist;
	new->mountpoint = mountpoint;
	new->container_base_path = container_base_path;
	new->version = type;
	new->cgfd_con = -EBADF;
	new->cgfd_mon = -EBADF;

	newentry = append_null_to_list((void ***)h);
	(*h)[newentry] = new;
	return new;
}

/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	char *p = line, *sret = NULL;
	size_t len;
	char *p2;

	for (int i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;
	*p2 = '\0';

	len = strlen(p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}

/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
	char *p2, *sret;
	size_t len;

	p2 = strchr(p, '\n');
	if (!p2)
		return NULL;

	len = p2 - p;
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}

/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	__do_free char *tmp = NULL;
	char *tok, *eol;
	size_t len;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	len = eol - cgline;
	tmp = must_realloc(NULL, len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	lxc_iterate_parts(tok, tmp, ",")
		if (strcmp(tok, c) == 0)
			return true;

	return false;
}

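/*
 * Illustrative sketch (not part of the build): for the /proc/self/cgroup line
 * "2:cpu,cpuacct:/lxc/foo\n", @cgline points just past the first ':'.
 */
#if 0
static void example_controller_in_clist(void)
{
	char line[] = "cpu,cpuacct:/lxc/foo\n";

	/* true: */
	(void)controller_in_clist(line, "cpuacct");
	/* false: */
	(void)controller_in_clist(line, "memory");
}
#endif
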
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
					  int type)
{
	char *p = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
			is_cgv2_base_cgroup = true;

		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}

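/*
 * Illustrative sketch (not part of the build): given the two-line
 * /proc/self/cgroup snippet below, the v1 lookup for "memory" returns
 * "/lxc/foo" and the v2 lookup returns "/init.scope".
 */
#if 0
static void example_current_cgroup(void)
{
	char info[] = "4:memory:/lxc/foo\n0::/init.scope\n";
	__do_free char *v1 = NULL, *v2 = NULL;

	v1 = cg_hybrid_get_current_cgroup(info, "memory", CGROUP_SUPER_MAGIC);
	v2 = cg_hybrid_get_current_cgroup(info, NULL, CGROUP2_SUPER_MAGIC);
}
#endif
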
static void must_append_string(char ***list, char *entry)
{
	int newentry;
	char *copy;

	newentry = append_null_to_list((void ***)list);
	copy = must_copy_string(entry);
	(*list)[newentry] = copy;
}

static int get_existing_subsystems(char ***klist, char ***nlist)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "re");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *p, *p2, *tok;
		p = strchr(line, ':');
		if (!p)
			continue;
		p++;
		p2 = strchr(p, ':');
		if (!p2)
			continue;
		*p2 = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if ((p2 - p) == 0) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		lxc_iterate_parts(tok, p, ",") {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	return 0;
}

static char *trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 1) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}

static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
{
	int i;
	struct hierarchy **it;

	if (!ops->hierarchies) {
		TRACE("  No hierarchies found");
		return;
	}

	TRACE("  Hierarchies:");
	for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
		int j;
		char **cit;

		TRACE("  %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
		TRACE("      mountpoint:  %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
		TRACE("      controllers:");
		for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
			TRACE("      %d: %s", j, *cit);
	}
}

static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}

static int cgroup_rmdir(struct hierarchy **hierarchies,
			const char *container_cgroup)
{
	if (!container_cgroup || !hierarchies)
		return 0;

	for (int i = 0; hierarchies[i]; i++) {
		struct hierarchy *h = hierarchies[i];
		int ret;

		if (!h->container_full_path)
			continue;

		ret = recursive_destroy(h->container_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->container_full_path);

		free_disarm(h->container_full_path);
	}

	return 0;
}

struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *container_cgroup;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};

static int cgroup_rmdir_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");
#endif

	if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.container_cgroup = ops->container_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	} else {
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (!h->monitor_full_path)
			continue;

		if (conf && conf->cgroup_meta.dir)
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    conf->cgroup_meta.dir,
						    CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    CGROUP_PIVOT, NULL);

		ret = mkdir_p(pivot_path, 0755);
		if (ret < 0 && errno != EEXIST) {
			ERROR("Failed to create %s", pivot_path);
			goto try_recursive_destroy;
		}

		ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

try_recursive_destroy:
		ret = recursive_destroy(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}

static int mkdir_eexist_on_last(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	size_t orig_len;

	orig_len = strlen(dir);
	do {
		__do_free char *makeme = NULL;
		int ret;
		size_t cur_len;

		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		cur_len = dir - orig;
		makeme = strndup(orig, cur_len);
		if (!makeme)
			return ret_set_errno(-1, ENOMEM);

		ret = mkdir(makeme, mode);
		if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
	} while (tmp != dir);

	return 0;
}

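/*
 * Illustrative sketch (not part of the build): mkdir_eexist_on_last() creates
 * every component of an absolute path, tolerating EEXIST everywhere except
 * for the final component.
 */
#if 0
static void example_mkdir_eexist_on_last(void)
{
	/* Succeeds, creating intermediate directories as needed. */
	(void)mkdir_eexist_on_last("/sys/fs/cgroup/memory/lxc.payload.c1", 0755);

	/* Fails with EEXIST, because the last component now exists. */
	(void)mkdir_eexist_on_last("/sys/fs/cgroup/memory/lxc.payload.c1", 0755);
}
#endif
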
static bool create_cgroup_tree(struct hierarchy *h, const char *cgroup_tree,
			       const char *cgroup_leaf, bool payload)
{
	__do_free char *path = NULL;
	int ret, ret_cpuset;

	path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
	if (dir_exists(path))
		return log_warn_errno(false, errno, "The %s cgroup already existed", path);

	ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
	if (ret_cpuset < 0)
		return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");

	ret = mkdir_eexist_on_last(path, 0755);
	if (ret < 0) {
		/*
		 * This is the cpuset controller and
		 * cg_legacy_handle_cpuset_hierarchy() has created our target
		 * directory for us to ensure correct initialization.
		 */
		if (ret_cpuset != 1 || cgroup_tree)
			return log_error_errno(false, errno, "Failed to create %s cgroup", path);
	}

	if (payload) {
		h->cgfd_con = lxc_open_dirfd(path);
		if (h->cgfd_con < 0)
			return log_error_errno(false, errno, "Failed to open %s", path);
		h->container_full_path = move_ptr(path);
	} else {
		h->cgfd_mon = lxc_open_dirfd(path);
		if (h->cgfd_mon < 0)
			return log_error_errno(false, errno, "Failed to open %s", path);
		h->monitor_full_path = move_ptr(path);
	}

	return true;
}

static void cgroup_remove_leaf(struct hierarchy *h, bool payload)
{
	__do_free char *full_path = NULL;

	if (payload) {
		__lxc_unused __do_close int fd = move_fd(h->cgfd_con);
		full_path = move_ptr(h->container_full_path);
	} else {
		__lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
		full_path = move_ptr(h->monitor_full_path);
	}

	if (full_path && rmdir(full_path))
		SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
}

__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP,
					       CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	}
	if (!container_cgroup)
		return ret_set_errno(false, ENOMEM);

	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->container_cgroup = move_ptr(container_cgroup);
	INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
	return true;
}

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (handler->transient_pid > 0)
		transient_len = snprintf(transient, sizeof(transient), "%d", handler->transient_pid);

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);

		if (handler->transient_pid < 0)
			return true;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);

		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->cgfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
	}

	return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
		      gid_t chown_gid, mode_t chmod_mode)
{
	int ret;

	ret = fchownat(dirfd, path, chown_uid, chown_gid,
		       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
	if (ret < 0)
		return log_warn_errno(-1,
				      errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chown_uid,
				      (int)chown_gid);

	ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
	if (ret < 0)
		return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chmod_mode);

	return 0;
}

/* chgrp the container cgroups to the container group. We leave the container
 * owner as cgroup owner. So we must make the directories 775 so that the
 * container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not exist depending
 * on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->cgfd_con;

		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
				      struct lxc_conf *conf)
{
	struct generic_userns_exec_data wrap;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (lxc_list_empty(&conf->id_map))
		return true;

	wrap.origuid = geteuid();
	wrap.path = NULL;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = conf;

	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
		return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

	return true;
}

__cgfsng_ops void cgfsng_payload_finalize(struct cgroup_ops *ops)
{
	if (!ops)
		return;

	if (!ops->hierarchies)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->cgfd_con);
	}
}

/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int type)
{
	return !(type >= LXC_AUTO_CGROUP_FULL_RO);
}

/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}

/* __cg_mount_direct
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cg_mount_direct(int type, struct hierarchy *h,
			     const char *controllerpath)
{
	__do_free char *controllers = NULL;
	char *fstype = "cgroup2";
	unsigned long flags = 0;
	int ret;

	flags |= MS_NOSUID;
	flags |= MS_NOEXEC;
	flags |= MS_NODEV;
	flags |= MS_RELATIME;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
		flags |= MS_RDONLY;

	if (h->version != CGROUP2_SUPER_MAGIC) {
		controllers = lxc_string_join(",", (const char **)h->controllers, false);
		if (!controllers)
			return -ENOMEM;
		fstype = "cgroup";
	}

	ret = mount("cgroup", controllerpath, fstype, flags, controllers);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s",
				       controllerpath, fstype);

	DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
	return 0;
}

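/*
 * Illustrative sketch (not part of the build): the two kinds of mount(2)
 * calls that __cg_mount_direct() ends up issuing, for the unified hierarchy
 * and for a made-up v1 hierarchy carrying the cpu and cpuacct controllers.
 */
#if 0
static void example_mount_direct(void)
{
	unsigned long flags = MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RELATIME;

	/* Unified hierarchy: no controller list is needed. */
	(void)mount("cgroup", "/sys/fs/cgroup", "cgroup2", flags, NULL);

	/* Legacy hierarchy: controllers are passed as mount options. */
	(void)mount("cgroup", "/sys/fs/cgroup/cpu,cpuacct", "cgroup", flags,
		    "cpu,cpuacct");
}
#endif
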
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}

static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
				       const char *controllerpath)
{
	if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
		return 0;

	return __cg_mount_direct(type, h, controllerpath);
}

__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler,
				      const char *root, int type)
{
	__do_free char *cgroup_root = NULL;
	bool has_cgns = false, wants_force_mount = false;
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
		return true;

	if (type & LXC_AUTO_CGROUP_FORCE) {
		type &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	if (!wants_force_mount) {
		if (!lxc_list_empty(&handler->conf->keepcaps))
			wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
		else
			wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
	}

	has_cgns = cgns_supported();
	if (has_cgns && !wants_force_mount)
		return true;

	if (type == LXC_AUTO_CGROUP_NOSPEC)
		type = LXC_AUTO_CGROUP_MIXED;
	else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
		type = LXC_AUTO_CGROUP_FULL_MIXED;

	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
		if (has_cgns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			return cg_mount_in_cgroup_namespace(type, ops->unified, cgroup_root) == 0;
		}

		return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
	}

	/* Mount tmpfs */
	ret = safe_mount(NULL, cgroup_root, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *controllerpath = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(cgroup_root, controller, NULL);
		if (dir_exists(controllerpath))
			continue;

		ret = mkdir(controllerpath, 0755);
		if (ret < 0)
			return log_error_errno(false, errno, "Error creating cgroup path: %s", controllerpath);

		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
			if (ret < 0)
				return false;

			continue;
		}

		ret = cg_mount_cgroup_full(type, h, controllerpath);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(type))
			continue;

		path2 = must_make_path(controllerpath, h->container_base_path,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0)
			return false;

		ret = cg_legacy_mount_controllers(type, h, controllerpath,
						  path2, ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}

/* Only root needs to escape to the cgroup of its init. */
__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
				       struct lxc_conf *conf)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (conf->cgroup_meta.relative || geteuid())
		return true;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		int ret;

		fullpath =
			must_make_path(ops->hierarchies[i]->mountpoint,
				       ops->hierarchies[i]->container_base_path,
				       "cgroup.procs", NULL);
		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
	}

	return true;
}

__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
{
	int i = 0;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	if (!ops->hierarchies)
		return 0;

	for (; ops->hierarchies[i]; i++)
		;

	return i;
}

__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
						char ***out)
{
	int i;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return ret_set_errno(false, ENOENT);

	/* Sanity check n */
	for (i = 0; i < n; i++)
		if (!ops->hierarchies[i])
			return ret_set_errno(false, ENOENT);

	*out = ops->hierarchies[i]->controllers;

	return true;
}

static bool cg_legacy_freeze(struct cgroup_ops *ops)
{
	struct hierarchy *h;

	h = get_hierarchy(ops, "freezer");
	if (!h)
		return ret_set_errno(-1, ENOENT);

	return lxc_write_openat(h->container_full_path, "freezer.state",
				"FROZEN", STRLITERALLEN("FROZEN"));
}

static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
				    struct lxc_epoll_descr *descr)
{
	__do_close int duped_fd = -EBADF;
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int state = PTR_TO_INT(cbdata);
	size_t len;
	const char *state_string;

	duped_fd = dup(fd);
	if (duped_fd < 0)
		return LXC_MAINLOOP_ERROR;

	if (lseek(duped_fd, 0, SEEK_SET) < 0)
		return LXC_MAINLOOP_ERROR;

1894 f = fdopen(duped_fd, "re");
1895 if (!f)
1896 return LXC_MAINLOOP_ERROR;
1897 move_fd(duped_fd);
1898
1899 if (state == 1)
1900 state_string = "frozen 1";
1901 else
1902 state_string = "frozen 0";
1903
1904 while (getline(&line, &len, f) != -1)
1905 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
1906 return LXC_MAINLOOP_CLOSE;
1907
1908 return LXC_MAINLOOP_CONTINUE;
1909 }
1910
1911 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1912 {
1913 __do_close int fd = -EBADF;
1914 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
1915 int ret;
1916 struct lxc_epoll_descr descr;
1917 struct hierarchy *h;
1918
1919 h = ops->unified;
1920 if (!h)
1921 return ret_set_errno(-1, ENOENT);
1922
1923 if (!h->container_full_path)
1924 return ret_set_errno(-1, EEXIST);
1925
1926 if (timeout != 0) {
1927 __do_free char *events_file = NULL;
1928
1929 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
1930 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1931 if (fd < 0)
1932 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1933
1934 ret = lxc_mainloop_open(&descr);
1935 if (ret)
1936 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");
1937
1938 /* automatically cleaned up now */
1939 descr_ptr = &descr;
1940
1941 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
1942 if (ret < 0)
1943 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1944 }
1945
1946 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
1947 if (ret < 0)
1948 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
1949
1950 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1951 return log_error_errno(-1, errno, "Failed to wait for container to be frozen");
1952
1953 return 0;
1954 }
1955
1956 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1957 {
1958 if (!ops->hierarchies)
1959 return ret_set_errno(-1, ENOENT);
1960
1961 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1962 return cg_legacy_freeze(ops);
1963
1964 return cg_unified_freeze(ops, timeout);
1965 }
1966
1967 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
1968 {
1969 struct hierarchy *h;
1970
1971 h = get_hierarchy(ops, "freezer");
1972 if (!h)
1973 return ret_set_errno(-1, ENOENT);
1974
1975 return lxc_write_openat(h->container_full_path, "freezer.state",
1976 "THAWED", STRLITERALLEN("THAWED"));
1977 }
1978
1979 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
1980 {
1981 __do_close int fd = -EBADF;
1982 call_cleaner(lxc_mainloop_close)struct lxc_epoll_descr *descr_ptr = NULL;
1983 int ret;
1984 struct lxc_epoll_descr descr;
1985 struct hierarchy *h;
1986
1987 h = ops->unified;
1988 if (!h)
1989 return ret_set_errno(-1, ENOENT);
1990
1991 if (!h->container_full_path)
1992 return ret_set_errno(-1, EEXIST);
1993
1994 if (timeout != 0) {
1995 __do_free char *events_file = NULL;
1996
1997 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
1998 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1999 if (fd < 0)
2000 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
2001
2002 ret = lxc_mainloop_open(&descr);
2003 if (ret)
2004 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");
2005
2006 /* automatically cleaned up now */
2007 descr_ptr = &descr;
2008
2009 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
2010 if (ret < 0)
2011 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
2012 }
2013
2014 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
2015 if (ret < 0)
2016 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
2017
2018 if (timeout != 0 && lxc_mainloop(&descr, timeout))
2019 return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");
2020
2021 return 0;
2022 }
2023
2024 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2025 {
2026 if (!ops->hierarchies)
2027 return ret_set_errno(-1, ENOENT);
2028
2029 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2030 return cg_legacy_unfreeze(ops);
2031
2032 return cg_unified_unfreeze(ops, timeout);
2033 }
2034
2035 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2036 const char *controller)
2037 {
2038 struct hierarchy *h;
2039
2040 h = get_hierarchy(ops, controller);
2041 if (!h)
2042 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
2043 controller ? controller : "(null)");
2044
2045 return h->container_full_path
2046 ? h->container_full_path + strlen(h->mountpoint)
2047 : NULL;
2048 }
2049
2050 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2051 * which must be freed by the caller.
2052 */
2053 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2054 const char *inpath,
2055 const char *filename)
2056 {
2057 return must_make_path(h->mountpoint, inpath, filename, NULL);
2058 }
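/* Example (hypothetical values): with h->mountpoint "/sys/fs/cgroup/memory",
 * inpath "/lxc/c1" and filename "memory.limit_in_bytes" this yields
 * "/sys/fs/cgroup/memory/lxc/c1/memory.limit_in_bytes".
 */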
2059
2060 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2061 {
2062 int idx = 1;
2063 int ret;
2064 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2065 size_t pidstr_len;
2066
2067 /* Create leaf cgroup. */
2068 ret = mkdirat(unified_fd, "lxc", 0755);
2069 if (ret < 0 && errno != EEXIST)
2070 return log_error_errno(-1, errno, "Failed to create leaf cgroup \"lxc\"");
2071
2072 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2073 ret = lxc_writeat(unified_fd, "lxc/cgroup.procs", pidstr, pidstr_len);
2074 if (ret < 0)
2075 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2076 if (ret == 0)
2077 return 0;
2078
2079 /* this is a non-leaf node */
2080 if (errno != EBUSY)
2081 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2082
2083 do {
2084 bool rm = false;
2085 char attach_cgroup[STRLITERALLEN("lxc-1000/cgroup.procs") + 1];
2086 char *slash;
2087
2088 ret = snprintf(attach_cgroup, sizeof(attach_cgroup), "lxc-%d/cgroup.procs", idx);
2089 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs"); /* points at the '/' */
2090 *slash = '\0';
2091
2092 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2093 if (ret < 0 && errno != EEXIST)
2094 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2095 if (ret == 0)
2096 rm = true;
2097
2098 *slash = '/';
2099
2100 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2101 if (ret == 0)
2102 return 0;
2103
2104 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2105 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2106
2107 /* this is a non-leaf node */
2108 if (errno != EBUSY)
2109 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2110
2111 idx++;
2112 } while (idx < 1000);
2113
2114 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2115 }
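/* Illustrative sketch (not part of the build): relative to @unified_fd the
 * function above tries, in order,
 *
 *	lxc/cgroup.procs	(preferred leaf cgroup)
 *	cgroup.procs		(fallback)
 *	lxc-1/cgroup.procs, lxc-2/cgroup.procs, ..., lxc-999/cgroup.procs
 *
 * where EBUSY signals that the target cgroup is not a usable leaf, so a
 * fresh "lxc-<n>" cgroup is created on demand and removed again whenever the
 * attach through it still fails.
 */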
2116
2117 struct userns_exec_unified_attach_data {
2118 const struct lxc_conf *conf;
2119 int unified_fd;
2120 pid_t pid;
2121 };
2122
2123 static int cgroup_unified_attach_wrapper(void *data)
2124 {
2125 struct userns_exec_unified_attach_data *args = data;
2126 uid_t nsuid;
2127 gid_t nsgid;
2128 int ret;
2129
2130 if (!args->conf || args->unified_fd < 0 || args->pid <= 0)
2131 return ret_errno(EINVAL);
2132
2133 if (!lxc_setgroups(0, NULL) && errno != EPERM)
2134 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
2135
2136 nsuid = (args->conf->root_nsuid_map != NULL) ? 0 : args->conf->init_uid;
2137 nsgid = (args->conf->root_nsgid_map != NULL) ? 0 : args->conf->init_gid;
2138
2139 ret = setresgid(nsgid, nsgid, nsgid);
2140 if (ret < 0)
2141 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
2142 (int)nsgid, (int)nsgid, (int)nsgid);
2143
2144 ret = setresuid(nsuid, nsuid, nsuid);
2145 if (ret < 0)
2146 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
2147 (int)nsuid, (int)nsuid, (int)nsuid);
2148
2149 return cgroup_attach_leaf(args->conf, args->unified_fd, args->pid);
2150 }
2151
2152 int cgroup_attach(const struct lxc_conf *conf, const char *name,
2153 const char *lxcpath, pid_t pid)
2154 {
2155 __do_close int unified_fd = -EBADF;
2156 int ret;
2157
2158 if (!conf || !name || !lxcpath || pid <= 0)
2159 return ret_errno(EINVAL);
2160
2161 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
2162 if (unified_fd < 0)
2163 return ret_errno(EBADF);
2164
2165 if (!lxc_list_empty(&conf->id_map)) {
2166 struct userns_exec_unified_attach_data args = {
2167 .conf = conf,
2168 .unified_fd = unified_fd,
2169 .pid = pid,
2170 };
2171
2172 ret = userns_exec_minimal(conf, cgroup_unified_attach_wrapper, &args);
2173 } else {
2174 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2175 }
2176
2177 return ret;
2178 }
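/* Usage sketch (hypothetical names): attaching pid 1234 to a running
 * container "c1" under "/var/lib/lxc" boils down to
 *
 *	ret = cgroup_attach(conf, "c1", "/var/lib/lxc", 1234);
 *
 * where a return value of -EBADF means no cgroup2 fd could be retrieved over
 * the command socket; __cg_unified_attach() below treats exactly that value
 * as its cue to fall back to path-based attachment.
 */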
2179
2180 /* Technically, we're always at a delegation boundary here (this is especially
2181 * true when cgroup namespaces are available). The reasoning is that in order
2182 * for us to have been able to start a container in the first place the root
2183 * cgroup must have been a leaf node. Now, either the container's init system
2184 * has populated the cgroup and kept it as a leaf node or it has created
2185 * subtrees. In the former case we simply attach to the leaf node we created
2186 * when we started the container; in the latter case we create our own cgroup
2187 * for the attaching process.
2188 */
2189 static int __cg_unified_attach(const struct hierarchy *h,
2190 const struct lxc_conf *conf, const char *name,
2191 const char *lxcpath, pid_t pid,
2192 const char *controller)
2193 {
2194 __do_close int unified_fd = -EBADF;
2195 __do_free char *path = NULL, *cgroup = NULL;
2196 int ret;
2197
2198 if (!conf || !name || !lxcpath || pid <= 0)
2199 return ret_errno(EINVAL);
2200
2201 ret = cgroup_attach(conf, name, lxcpath, pid);
2202 if (ret == 0)
2203 return log_trace(0, "Attached to unified cgroup via command handler");
2204 if (ret != -EBADF)
2205 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2206
2207 /* Fall back to retrieving the path for the unified cgroup. */
2208 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2209 /* not running */
2210 if (!cgroup)
2211 return 0;
2212
2213 path = must_make_path(h->mountpoint, cgroup, NULL);
2214
2215 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2216 if (unified_fd < 0)
2217 return ret_errno(EBADF);
2218
2219 if (!lxc_list_empty(&conf->id_map)) {
2220 struct userns_exec_unified_attach_data args = {
2221 .conf = conf,
2222 .unified_fd = unified_fd,
2223 .pid = pid,
2224 };
2225
2226 ret = userns_exec_minimal(conf, cgroup_unified_attach_wrapper, &args);
2227 } else {
2228 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2229 }
2230
2231 return ret;
2232 }
2233
2234 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2235 const struct lxc_conf *conf,
2236 const char *name, const char *lxcpath,
2237 pid_t pid)
2238 {
2239 int len, ret;
2240 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2241
2242 if (!ops)
2243 return ret_set_errno(false, ENOENT);
2244
2245 if (!ops->hierarchies)
2246 return true;
2247
2248 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2249 if (len < 0 || (size_t)len >= sizeof(pidstr))
2250 return false;
2251
2252 for (int i = 0; ops->hierarchies[i]; i++) {
2253 __do_free char *fullpath = NULL, *path = NULL;
2254 struct hierarchy *h = ops->hierarchies[i];
2255
2256 if (h->version == CGROUP2_SUPER_MAGIC) {
2257 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2258 h->controllers[0]);
2259 if (ret < 0)
2260 return false;
2261
2262 continue;
2263 }
2264
2265 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2266 /* not running */
2267 if (!path)
2268 return false;
2269
2270 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2271 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2272 if (ret < 0)
2273 return log_error_errno(false, errno, "Failed to attach %d to %s",
2274 (int)pid, fullpath);
2275 }
2276
2277 return true;
2278 }
2279
2280 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2281 * don't have a cgroup_data set up, so we ask the running container through the
2282 * commands API for the cgroup path.
2283 */
2284 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2285 char *value, size_t len, const char *name,
2286 const char *lxcpath)
2287 {
2288 __do_free char *path = NULL;
2289 __do_free char *controller = NULL;
2290 char *p;
2291 struct hierarchy *h;
2292 int ret = -1;
2293
2294 if (!ops)
2295 return ret_set_errno(-1, ENOENT);
2296
2297 controller = must_copy_string(filename);
2298 p = strchr(controller, '.');
2299 if (p)
2300 *p = '\0';
2301
2302 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2303 /* not running */
2304 if (!path)
2305 return -1;
2306
2307 h = get_hierarchy(ops, controller);
2308 if (h) {
2309 __do_free char *fullpath = NULL;
2310
2311 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2312 ret = lxc_read_from_file(fullpath, value, len);
2313 }
2314
2315 return ret;
2316 }
2317
2318 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2319 {
2320 for (int count = 0; count < 3; count++, val++) {
2321 switch (*val) {
2322 case 'r':
2323 device->access[count] = *val;
2324 break;
2325 case 'w':
2326 device->access[count] = *val;
2327 break;
2328 case 'm':
2329 device->access[count] = *val;
2330 break;
2331 case '\n':
2332 case '\0':
2333 count = 3; /* end of input: terminate the loop */
2334 break;
2335 default:
2336 return ret_errno(EINVAL);
2337 }
2338 }
2339
2340 return 0;
2341 }
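/* Example: for val = "rwm" the loop above stores access = {'r','w','m'};
 * for val = "rw\n" it stops at the newline, leaving access = {'r','w'}; any
 * other character, e.g. the 'x' in "rwx", fails with EINVAL.
 */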
2342
2343 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2344 const char *val)
2345 {
2346 int count, ret;
2347 char temp[50];
2348
2349 if (strcmp("devices.allow", key) == 0)
2350 device->allow = 1;
2351 else
2352 device->allow = 0;
2353
2354 if (strcmp(val, "a") == 0) {
2355 /* global rule */
2356 device->type = 'a';
2357 device->major = -1;
2358 device->minor = -1;
2359 device->global_rule = device->allow
2360 ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
2361 : LXC_BPF_DEVICE_CGROUP_WHITELIST;
2362 device->allow = -1;
2363 return 0;
2364 }
2365
2366 /* local rule */
2367 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2368
2369 switch (*val) {
2370 case 'a':
2371 __fallthrough;
2372 case 'b':
2373 __fallthrough;
2374 case 'c':
2375 device->type = *val;
2376 break;
2377 default:
2378 return -1;
2379 }
2380
2381 val++;
2382 if (!isspace(*val))
2383 return -1;
2384 val++;
2385 if (*val == '*') {
2386 device->major = -1;
2387 val++;
2388 } else if (isdigit(*val)) {
2389 memset(temp, 0, sizeof(temp));
2390 for (count = 0; count < sizeof(temp) - 1; count++) {
2391 temp[count] = *val;
2392 val++;
2393 if (!isdigit(*val))
2394 break;
2395 }
2396 ret = lxc_safe_int(temp, &device->major);
2397 if (ret)
2398 return -1;
2399 } else {
2400 return -1;
2401 }
2402 if (*val != ':')
2403 return -1;
2404 val++;
2405
2406 /* read minor */
2407 if (*val == '*') {
2408 device->minor = -1;
2409 val++;
2410 } else if (isdigit(*val)) {
2411 memset(temp, 0, sizeof(temp));
2412 for (count = 0; count < sizeof(temp) - 1; count++) {
2413 temp[count] = *val;
2414 val++;
2415 if (!isdigit(*val))
2416 break;
2417 }
2418 ret = lxc_safe_int(temp, &device->minor);
2419 if (ret)
2420 return -1;
2421 } else {
2422 return -1;
2423 }
2424 if (!isspace(*val))
2425 return -1;
2426
2427 return device_cgroup_parse_access(device, ++val);
2428 }
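/* Worked example (sketch): a rule such as
 *
 *	lxc.cgroup2.devices.allow = c 1:3 rwm
 *
 * reaches this parser as key "devices.allow", val "c 1:3 rwm" and produces
 *
 *	device->allow	    = 1
 *	device->type	    = 'c'
 *	device->major	    = 1
 *	device->minor	    = 3
 *	device->access	    = "rwm"
 *	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE
 *
 * while the special value "a" instead yields a global rule with type 'a',
 * major/minor set to -1, and allow reset to -1.
 */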
2429
2430 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2431 * don't have a cgroup_data set up, so we ask the running container through the
2432 * commands API for the cgroup path.
2433 */
2434 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2435 const char *key, const char *value,
2436 const char *name, const char *lxcpath)
2437 {
2438 __do_free char *path = NULL;
2439 __do_free char *controller = NULL;
2440 char *p;
2441 struct hierarchy *h;
2442 int ret = -1;
2443
2444 if (!ops)
2445 return ret_set_errno(-1, ENOENT);
2446
2447 controller = must_copy_string(key);
2448 p = strchr(controller, '.');
2449 if (p)
2450 *p = '\0';
2451
2452 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2453 struct device_item device = {0};
2454
2455 ret = device_cgroup_rule_parse(&device, key, value);
2456 if (ret < 0)
2457 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2458 key, value);
2459
2460 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2461 if (ret < 0)
2462 return -1;
2463
2464 return 0;
2465 }
2466
2467 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2468 /* not running */
2469 if (!path)
2470 return -1;
2471
2472 h = get_hierarchy(ops, controller);
2473 if (h) {
2474 __do_free char *fullpath = NULL;
2475
2476 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2477 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2478 }
2479
2480 return ret;
2481 }
2482
2483 /* Take a devices cgroup line such as
2484 * /dev/foo rwx
2485 * and convert it to a valid
2486 * type major:minor mode
2487 * line. Return <0 on error. The dest buffer handed to convert_devpath()
2488 * below is preallocated and must be long enough to hold the output.
2489 */
2490 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2491 const char *devpath)
2492 {
2493 __do_free char *path = NULL;
2494 char *mode = NULL;
2495 int n_parts, ret;
2496 char *p;
2497 struct stat sb;
2498
2499 path = must_copy_string(devpath);
2500
2501 /*
2502 * Read path followed by mode. Ignore any trailing text.
2503 * A ' # comment' would be legal. Technically other text is not
2504 * legal; we could check for that if we cared to.
2505 */
2506 for (n_parts = 1, p = path; *p; p++) {
2507 if (*p != ' ')
2508 continue;
2509 *p = '\0';
2510
2511 if (n_parts != 1)
2512 break;
2513 p++;
2514 n_parts++;
2515
2516 while (*p == ' ')
2517 p++;
2518
2519 mode = p;
2520
2521 if (*p == '\0')
2522 return ret_set_errno(-1, EINVAL);
2523 }
2524
2525 if (n_parts == 1)
2526 return ret_set_errno(-1, EINVAL);
2527 
2528 if (device_cgroup_parse_access(device, mode) < 0)
2529 return -1;
2530
2531 ret = stat(path, &sb);
2532 if (ret < 0)
2533 return ret_set_errno(-1, errno);
2534
2535 mode_t m = sb.st_mode & S_IFMT;
2536 switch (m) {
2537 case S_IFBLK:
2538 device->type = 'b';
2539 break;
2540 case S_IFCHR:
2541 device->type = 'c';
2542 break;
2543 default:
2544 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2545 }
2546
2547 device->major = MAJOR(sb.st_rdev);
2548 device->minor = MINOR(sb.st_rdev);
2549 device->allow = 1;
2550 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2551
2552 return 0;
2553 }
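/* Worked example (sketch): for devpath "/dev/null rwm" the loop splits off
 * mode "rwm", stat("/dev/null") reports a character device, and on a typical
 * system (major 1, minor 3) the result is
 *
 *	device->type = 'c', device->major = 1, device->minor = 3,
 *	device->access = "rwm", device->allow = 1
 */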
2554
2555 static int convert_devpath(const char *invalue, char *dest)
2556 {
2557 struct device_item device = {0};
2558 int ret;
2559
2560 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2561 if (ret < 0)
2562 return -1;
2563
2564 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2565 device.minor, device.access);
2566 if (ret < 0 || ret >= 50)
2567 return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2568 device.type, device.major, device.minor, device.access);
2569
2570 return 0;
2571 }
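/* Usage sketch, continuing the example above:
 *
 *	char buf[50];
 *
 *	if (convert_devpath("/dev/null rwm", buf) == 0)
 *		use_rule(buf);	// buf now holds "c 1:3 rwm"
 *
 * with use_rule() standing in for whatever consumes the converted line.
 */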
2572
2573 /* Called from setup_limits - here we have the container's cgroup_data because
2574 * we created the cgroups.
2575 */
2576 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2577 const char *value)
2578 {
2579 __do_free char *controller = NULL;
2580 char *p;
2581 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2582 char converted_value[50];
2583 struct hierarchy *h;
2584
2585 controller = must_copy_string(filename);
2586 p = strchr(controller, '.');
2587 if (p)
2588 *p = '\0';
2589
2590 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2591 int ret;
2592
2593 ret = convert_devpath(value, converted_value);
2594 if (ret < 0)
2595 return ret;
2596 value = converted_value;
2597 }
2598
2599 h = get_hierarchy(ops, controller);
2600 if (!h)
2601 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2602
2603 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
2604 }
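/* Example (hypothetical values): a legacy config line such as
 *
 *	lxc.cgroup.memory.limit_in_bytes = 256M
 *
 * arrives here with filename "memory.limit_in_bytes" and value "256M"; the
 * controller name "memory" is split off at the first '.' and the value is
 * written into the container's cgroup on the matching hierarchy.
 */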
2605
2606 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2607 struct lxc_conf *conf,
2608 bool do_devices)
2609 {
2610 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2611 struct lxc_list *cgroup_settings;
2612 struct lxc_list *iterator, *next;
2613 struct lxc_cgroup *cg;
2614 bool ret = false;
2615
2616 if (!ops)
2617 return ret_set_errno(false, ENOENT);
2618
2619 if (!conf)
2620 return ret_set_errno(false, EINVAL);
2621
2622 cgroup_settings = &conf->cgroup;
2623 if (lxc_list_empty(cgroup_settings))
2624 return true;
2625
2626 if (!ops->hierarchies)
2627 return ret_set_errno(false, EINVAL);
2628
2629 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2630 if (!sorted_cgroup_settings)
2631 return false;
2632
2633 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2634 cg = iterator->elem;
2635
2636 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2637 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2638 if (do_devices && (errno == EACCES || errno == EPERM)) {
2639 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2640 continue;
2641 }
2642 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2643 goto out;
2644 }
2645 DEBUG("Set controller \"%s\" to \"%s\"", cg->subsystem, cg->value);
2646 }
2647 }
2648
2649 ret = true;
2650 INFO("Limits for the legacy cgroup hierarchies have been set up");
2651 out:
2652 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2653 lxc_list_del(iterator);
2654 free(iterator);
2655 }
2656
2657 return ret;
2658 }
2659
2660 /*
2661 * Some of the parsing logic comes from the original cgroup device v1
2662 * implementation in the kernel.
2663 */
2664 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2665 struct lxc_conf *conf, const char *key,
2666 const char *val)
2667 {
2668 #ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
2669 struct device_item device_item = {0};
2670 int ret;
2671
2672 if (strcmp("devices.allow", key) == 0 && *val == '/')
2673 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2674 else
2675 ret = device_cgroup_rule_parse(&device_item, key, val);
2676 if (ret < 0)
2677 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
2678
2679 ret = bpf_list_add_device(conf, &device_item);
2680 if (ret < 0)
2681 return -1;
2682 #endif
2683 return 0;
2684 }
2685
2686 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2687 struct lxc_handler *handler)
2688 {
2689 struct lxc_list *cgroup_settings, *iterator;
2690 struct hierarchy *h;
2691 struct lxc_conf *conf;
2692
2693 if (!ops)
2694 return ret_set_errno(false, ENOENT);
2695
2696 if (!ops->hierarchies)
2697 return true;
2698
2699 if (!ops->container_cgroup)
2700 return ret_set_errno(false, EINVAL);
2701
2702 if (!handler || !handler->conf)
2703 return ret_set_errno(false, EINVAL);
2704 conf = handler->conf;
2705
2706 if (lxc_list_empty(&conf->cgroup2))
2707 return true;
2708 cgroup_settings = &conf->cgroup2;
2709
2710 if (!ops->unified)
2711 return false;
2712 h = ops->unified;
2713
2714 lxc_list_for_each(iterator, cgroup_settings) {
2715 struct lxc_cgroup *cg = iterator->elem;
2716 int ret;
2717
2718 if (strncmp("devices", cg->subsystem, 7) == 0) {
2719 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
2720 cg->value);
2721 } else {
2722 ret = lxc_write_openat(h->container_full_path,
2723 cg->subsystem, cg->value,
2724 strlen(cg->value));
2725 if (ret < 0)
2726 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"",
2727 cg->subsystem, cg->value);
2728 }
2729 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2730 }
2731
2732 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2733 }
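/* Example (hypothetical values): on a pure cgroup2 host a line such as
 *
 *	lxc.cgroup2.memory.max = 256M
 *
 * ends up in conf->cgroup2 and is written verbatim to
 * <container_full_path>/memory.max above, while anything starting with
 * "devices" is routed into the bpf device program instead.
 */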
2734
2735 __cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
2736 struct lxc_handler *handler)
2737 {
2738 #ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
2739 __do_bpf_program_free struct bpf_program *devices = NULL;
2740 int ret;
2741 struct lxc_conf *conf;
2742 struct hierarchy *unified;
2743 struct lxc_list *it;
2744 struct bpf_program *devices_old;
2745
2746 if (!ops)
2747 return ret_set_errno(false, ENOENT);
2748
2749 if (!ops->hierarchies)
2750 return true;
2751
2752 if (!ops->container_cgroup)
2753 return ret_set_errno(false, EEXIST);
2754
2755 if (!handler || !handler->conf)
2756 return ret_set_errno(false, EINVAL);
2757 conf = handler->conf;
2758
2759 unified = ops->unified;
2760 if (!unified || !unified->bpf_device_controller ||
2761 !unified->container_full_path || lxc_list_empty(&conf->devices))
2762 return true;
2763
2764 devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
2765 if (!devices)
2766 return log_error_errno(false, ENOMEM, "Failed to create new bpf program");
2767
2768 ret = bpf_program_init(devices);
2769 if (ret)
2770 return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");
2771
2772 lxc_list_for_each(it, &conf->devices) {
2773 struct device_item *cur = it->elem;
2774
2775 ret = bpf_program_append_device(devices, cur);
2776 if (ret)
2777 return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2778 cur->type,
2779 cur->major,
2780 cur->minor,
2781 cur->access,
2782 cur->allow,
2783 cur->global_rule);
2784 TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2785 cur->type,
2786 cur->major,
2787 cur->minor,
2788 cur->access,
2789 cur->allow,
2790 cur->global_rule);
2791 }
2792
2793 ret = bpf_program_finalize(devices);
2794 if (ret)
2795 return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");
2796
2797 ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
2798 unified->container_full_path,
2799 BPF_F_ALLOW_MULTI);
2800 if (ret)
2801 return log_error_errno(false, ENOMEM, "Failed to attach bpf program");
2802
2803 /* Replace old bpf program. */
2804 devices_old = move_ptr(conf->cgroup2_devices);
2805 conf->cgroup2_devices = move_ptr(devices);
2806 devices = move_ptr(devices_old);
2807 #endif
2808 return true;
2809 }
2810
2811 bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2812 {
2813 __do_free char *add_controllers = NULL, *base_path = NULL;
2814 __do_free_string_list char **parts = NULL;
2815 struct hierarchy *unified = ops->unified;
2816 ssize_t parts_len;
2817 char **it;
2818 size_t full_len = 0;
2819
2820 if (!ops->hierarchies || !pure_unified_layout(ops) ||
2821 !unified->controllers[0])
2822 return true;
2823
2824 /* For now we simply enable all controllers that we have detected by
2825 * creating a string like "+memory +pids +cpu +io".
2826 * TODO: In the near future we might want to support "-<controller>"
2827 * etc. but whether supporting semantics like this makes sense will need
2828 * some thinking.
2829 */
2830 for (it = unified->controllers; it && *it; it++) {
2831 full_len += strlen(*it) + 2;
2832 add_controllers = must_realloc(add_controllers, full_len + 1);
2833
2834 if (unified->controllers[0] == *it)
2835 add_controllers[0] = '\0';
2836
2837 (void)strlcat(add_controllers, "+", full_len + 1);
2838 (void)strlcat(add_controllers, *it, full_len + 1);
2839
2840 if ((it + 1) && *(it + 1))
2841 (void)strlcat(add_controllers, " ", full_len + 1);
2842 }
2843
2844 parts = lxc_string_split(cgroup, '/');
2845 if (!parts)
2846 return false;
2847
2848 parts_len = lxc_array_len((void **)parts);
2849 if (parts_len > 0)
2850 parts_len--;
2851
2852 base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
2853 for (ssize_t i = -1; i < parts_len; i++) {
2854 int ret;
2855 __do_free char *target = NULL;
2856
2857 if (i >= 0)
2858 base_path = must_append_path(base_path, parts[i], NULL);
2859 target = must_make_path(base_path, "cgroup.subtree_control", NULL);
2860 ret = lxc_writeat(-1, target, add_controllers, full_len);
2861 if (ret < 0)
2862 return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
2863 add_controllers, target);
2864 TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
2865 }
2866
2867 return true;
2868 }
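/* Illustrative sketch: with detected controllers "memory", "pids" and "cpu"
 * the string built above is
 *
 *	+memory +pids +cpu
 *
 * and it is written to cgroup.subtree_control at every level from the
 * container base path down to (but excluding) the final cgroup, e.g.
 * (hypothetical layout)
 *
 *	<mountpoint>/<base>/cgroup.subtree_control
 *	<mountpoint>/<base>/nested/cgroup.subtree_control
 */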
2869
2870 __cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2871 {
2872 if (!ops)
2873 return ret_set_errno(false, ENOENT);
2874
2875 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2876 }
2877
2878 __cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2879 {
2880 if (!ops)
2881 return ret_set_errno(false, ENOENT);
2882
2883 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2884 }
2885
2886 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2887 char **controllers)
2888 {
2889 if (!ops->cgroup_use)
2890 return true;
2891
2892 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2893 bool found = false;
2894
2895 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2896 if (strcmp(*cur_use, *cur_ctrl) != 0)
2897 continue;
2898
2899 found = true;
2900 break;
2901 }
2902
2903 if (found)
2904 continue;
2905
2906 return false;
2907 }
2908
2909 return true;
2910 }
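/* Example: with lxc.cgroup.use = memory,pids a hierarchy offering only
 * "memory" is kept, while one offering "cpuset" is skipped because "cpuset"
 * never matches an entry in ops->cgroup_use.
 */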
2911
2912 static void cg_unified_delegate(char ***delegate)
2913 {
2914 __do_free char *buf = NULL;
2915 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
2916 char *token;
2917 int idx;
2918
2919 buf = read_file("/sys/kernel/cgroup/delegate");
2920 if (!buf) {
2921 for (char **p = standard; p && *p; p++) {
2922 idx = append_null_to_list((void ***)delegate);
2923 (*delegate)[idx] = must_copy_string(*p);
2924 }
2925 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
2926 return;
2927 }
2928
2929 lxc_iterate_parts (token, buf, " \t\n") {
2930 /*
2931 * We always need to chown this for both cgroup and
2932 * cgroup2.
2933 */
2934 if (strcmp(token, "cgroup.procs") == 0)
2935 continue;
2936
2937 idx = append_null_to_list((void ***)delegate);
2938 (*delegate)[idx] = must_copy_string(token);
2939 }
2940 }
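/* Example: on current kernels /sys/kernel/cgroup/delegate typically lists
 *
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *
 * (newer kernels may add further entries such as memory.oom.group);
 * cgroup.procs is filtered out above because it is chowned unconditionally
 * elsewhere, and the hardcoded fallback mirrors this set minus cgroup.procs.
 */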
2941
2942 /* At startup, cg_hybrid_init() finds all the info we need about cgroup
2943 * mountpoints and current cgroups, and stores it in @ops.
2944 */
2945 static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
2946 {
2947 __do_free char *basecginfo = NULL, *line = NULL;
2948 __do_free_string_list char **klist = NULL, **nlist = NULL;
2949 __do_fclose FILE *f = NULL;
2950 int ret;
2951 size_t len = 0;
2952
2953 /* Root spawned containers escape the current cgroup, so use init's
2954 * cgroups as our base in that case.
2955 */
2956 if (!relative && (geteuid() == 0))
2957 basecginfo = read_file("/proc/1/cgroup");
2958 else
2959 basecginfo = read_file("/proc/self/cgroup");
2960 if (!basecginfo)
2961 return ret_set_errno(-1, ENOMEM);
2962
2963 ret = get_existing_subsystems(&klist, &nlist);
2964 if (ret < 0)
2965 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
2966
2967 f = fopen("/proc/self/mountinfo", "re");
2968 if (!f)
2969 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
2970
2971 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2972
2973 while (getline(&line, &len, f) != -1) {
2974 __do_free char *base_cgroup = NULL, *mountpoint = NULL;
2975 __do_free_string_list char **controller_list = NULL;
2976 int type;
2977 bool writeable;
2978 struct hierarchy *new;
2979
2980 type = get_cgroup_version(line);
2981 if (type == 0)
2982 continue;
2983
2984 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2985 continue;
2986
2987 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2988 if (type == CGROUP2_SUPER_MAGIC)
2989 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2990 else if (type == CGROUP_SUPER_MAGIC)
2991 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2992 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2993 if (type == CGROUP_SUPER_MAGIC)
2994 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2995 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2996 if (type == CGROUP2_SUPER_MAGIC)
2997 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2998 }
2999
3000 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
3001 if (!controller_list && type == CGROUP_SUPER_MAGIC)
3002 continue;
3003
3004 if (type == CGROUP_SUPER_MAGIC)
3005 if (controller_list_is_dup(ops->hierarchies, controller_list)) {
3006 TRACE("Skipping duplicating controller");
3007 continue;
3008 }
3009
3010 mountpoint = cg_hybrid_get_mountpoint(line);
3011 if (!mountpoint) {
3012 ERROR("Failed parsing mountpoint from \"%s\"", line);
3013 continue;
3014 }
3015
3016 if (type == CGROUP_SUPER_MAGIC)
3017 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3018 else
3019 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
3020 if (!base_cgroup) {
3021 ERROR("Failed to find current cgroup");
3022 continue;
3023 }
3024
3025 trim(base_cgroup);
3026 prune_init_scope(base_cgroup);
3027 if (type == CGROUP2_SUPER_MAGIC)
3028 writeable = test_writeable_v2(mountpoint, base_cgroup);
3029 else
3030 writeable = test_writeable_v1(mountpoint, base_cgroup);
3031 if (!writeable) {
3032 TRACE("The %s group is not writeable", base_cgroup);
3033 continue;
3034 }
3035
3036 if (type == CGROUP2_SUPER_MAGIC) {
3037 char *cgv2_ctrl_path;
3038
3039 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
3040 "cgroup.controllers",
3041 NULL);
3042
3043 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
3044 free(cgv2_ctrl_path);
3045 if (!controller_list) {
3046 controller_list = cg_unified_make_empty_controller();
3047 TRACE("No controllers are enabled for "
3048 "delegation in the unified hierarchy");
3049 }
3050 }
3051
3052 /* Exclude all controllers that cgroup use does not want. */
3053 if (!cgroup_use_wants_controllers(ops, controller_list)) {
3054 TRACE("Skipping controller");
3055 continue;
3056 }
3057
3058 new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
3059 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
3060 if (unprivileged)
3061 cg_unified_delegate(&new->cgroup2_chown);
3062 ops->unified = new;
3063 }
3064 }
3065
3066 TRACE("Writable cgroup hierarchies:");
3067 lxc_cgfsng_print_hierarchies(ops);
3068
3069 /* verify that all controllers in cgroup.use and all crucial
3070 * controllers are accounted for
3071 */
3072 if (!all_controllers_found(ops))
3073 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
3074
3075 return 0;
3076 }
3077
3078 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
3079 static char *cg_unified_get_current_cgroup(bool relative)
3080 {
3081 __do_free char *basecginfo = NULL;
3082 char *copy;
3083 char *base_cgroup;
3084
3085 if (!relative && (geteuid() == 0))
3086 basecginfo = read_file("/proc/1/cgroup");
3087 else
3088 basecginfo = read_file("/proc/self/cgroup");
3089 if (!basecginfo)
3090 return NULL;
3091
3092 base_cgroup = strstr(basecginfo, "0::/");
3093 if (!base_cgroup)
3094 return NULL;
3095
3096 base_cgroup = base_cgroup + 3;
3097 copy = copy_to_eol(base_cgroup);
3098 if (!copy)
3099 return NULL;
3100
3101 return trim(copy);
3102 }
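/* Example: on a pure cgroup2 host /proc/self/cgroup contains a single line
 * like
 *
 *	0::/user.slice/user-1000.slice/session-1.scope
 *
 * strstr() locates "0::/", the "+ 3" skips past "0::", and copy_to_eol()
 * hands back "/user.slice/user-1000.slice/session-1.scope" for trimming.
 */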
3103
3104 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3105 bool unprivileged)
3106 {
3107 __do_free char *subtree_path = NULL;
3108 int ret;
3109 char *mountpoint;
3110 char **delegatable;
3111 struct hierarchy *new;
3112 char *base_cgroup = NULL;
3113
3114 ret = unified_cgroup_hierarchy();
3115 if (ret == -ENOMEDIUM)
3116 return ret_errno(ENOMEDIUM);
3117
3118 if (ret != CGROUP2_SUPER_MAGIC)
3119 return 0;
3120
3121 base_cgroup = cg_unified_get_current_cgroup(relative);
3122 if (!base_cgroup)
3123 return ret_errno(EINVAL);
3124 if (!relative)
3125 prune_init_scope(base_cgroup);
3126
3127 /*
3128 * We assume that the cgroup we're currently in has been delegated to
3129 * us and we are free to delegate all of the controllers listed
3130 * in cgroup.controllers further down the hierarchy.
3131 */
3132 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
3133 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
3134 delegatable = cg_unified_get_controllers(subtree_path);
3135 if (!delegatable)
3136 delegatable = cg_unified_make_empty_controller();
3137 if (!delegatable[0])
3138 TRACE("No controllers are enabled for delegation");
3139
3140 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3141 * we should verify that here. The reason I'm not doing it right now is
3142 * that I'm not convinced that lxc.cgroup.use will be the future since it
3143 * is a global property. I'd much rather have an option that lets you
3144 * request controllers per container.
3145 */
3146
3147 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
3148 if (unprivileged)
3149 cg_unified_delegate(&new->cgroup2_chown);
3150
3151 if (bpf_devices_cgroup_supported())
3152 new->bpf_device_controller = 1;
3153
3154 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3155 ops->unified = new;
3156
3157 return CGROUP2_SUPER_MAGIC;
3158 }
3159
3160 static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
3161 {
3162 int ret;
3163 const char *tmp;
3164 bool relative = conf->cgroup_meta.relative;
3165
3166 tmp = lxc_global_config_value("lxc.cgroup.use");
3167 if (tmp) {
3168 __do_free char *pin = NULL;
3169 char *chop, *cur;
3170
3171 pin = must_copy_string(tmp);
3172 chop = pin;
3173
3174 lxc_iterate_parts(cur, chop, ",")
3175 must_append_string(&ops->cgroup_use, cur);
3176 }
3177
3178 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
3179 if (ret < 0)
3180 return -1;
3181
3182 if (ret == CGROUP2_SUPER_MAGIC)
3183 return 0;
3184
3185 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
3186 }
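/* Example: lxc.cgroup.use = memory,pids is split on ',' above, leaving
 * ops->cgroup_use = {"memory", "pids", NULL}; cg_unified_init() then decides
 * whether the host is pure cgroup2 (CGROUP2_SUPER_MAGIC) or whether
 * cg_hybrid_init() needs to walk the legacy hierarchies as well.
 */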
3187
3188 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3189 {
3190 const char *cgroup_pattern;
3191
3192 if (!ops)
3193 return ret_set_errno(-1, ENOENT);
3194
3195 /* copy system-wide cgroup information */
3196 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3197 if (cgroup_pattern && strcmp(cgroup_pattern, "") != 0)
3198 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3199
3200 return 0;
3201 }
3202
3203 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
3204 {
3205 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3206
3207 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3208 if (!cgfsng_ops)
3209 return ret_set_errno(NULL, ENOMEM);
3210
3211 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3212 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3213
3214 if (cg_init(cgfsng_ops, conf))
3215 return NULL;
3216
3217 cgfsng_ops->data_init = cgfsng_data_init;
3218 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3219 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3220 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3221 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3222 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3223 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3224 cgfsng_ops->payload_create = cgfsng_payload_create;
3225 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3226 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
3227 cgfsng_ops->escape = cgfsng_escape;
3228 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3229 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3230 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3231 cgfsng_ops->get = cgfsng_get;
3232 cgfsng_ops->set = cgfsng_set;
3233 cgfsng_ops->freeze = cgfsng_freeze;
3234 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3235 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3236 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3237 cgfsng_ops->driver = "cgfsng";
3238 cgfsng_ops->version = "1.0.0";
3239 cgfsng_ops->attach = cgfsng_attach;
3240 cgfsng_ops->chown = cgfsng_chown;
3241 cgfsng_ops->mount = cgfsng_mount;
3242 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3243
3244 return move_ptr(cgfsng_ops);
3245 }
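/* Usage sketch (assuming a populated struct lxc_conf *conf):
 *
 *	struct cgroup_ops *ops = cgfsng_ops_init(conf);
 *	if (!ops)
 *		return log_error_errno(NULL, errno, "Failed to initialize cgroup driver");
 *	ret = ops->data_init(ops);
 *
 * mirroring how the generic cgroup layer is expected to bootstrap the driver
 * before calling into the ops table filled in above.
 */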