1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 * Christian Brauner <christian.brauner@ubuntu.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /*
26 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
30 * each controller.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
36
37 #include "config.h"
38
39 #include <ctype.h>
40 #include <dirent.h>
41 #include <errno.h>
42 #include <grp.h>
43 #include <stdint.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <linux/kdev_t.h>
49 #include <linux/types.h>
50 #include <sys/types.h>
51
52 #include "caps.h"
53 #include "cgroup.h"
54 #include "cgroup_utils.h"
55 #include "commands.h"
56 #include "conf.h"
57 #include "log.h"
58 #include "storage/storage.h"
59 #include "utils.h"
60
61 #ifndef HAVE_STRLCPY
62 #include "include/strlcpy.h"
63 #endif
64
65 #ifndef HAVE_STRLCAT
66 #include "include/strlcat.h"
67 #endif
68
69 lxc_log_define(cgfsng, cgroup);
70
71 static void free_string_list(char **clist)
72 {
73 int i;
74
75 if (!clist)
76 return;
77
78 for (i = 0; clist[i]; i++)
79 free(clist[i]);
80
81 free(clist);
82 }
83
84 /* Allocate a pointer, do not fail. */
85 static void *must_alloc(size_t sz)
86 {
87 return must_realloc(NULL, sz);
88 }
89
90 /* Given a pointer to a null-terminated array of pointers, realloc to add one
91 * entry, and point the new entry to NULL. Do not fail. Return the index to the
92 * second-to-last entry - that is, the one which is now available for use
93 * (keeping the list null-terminated).
94 */
95 static int append_null_to_list(void ***list)
96 {
97 int newentry = 0;
98
99 if (*list)
100 for (; (*list)[newentry]; newentry++)
101 ;
102
103 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
104 (*list)[newentry + 1] = NULL;
105 return newentry;
106 }
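/* Worked example (illustrative): given char **list = {"cpu", NULL},
 * append_null_to_list((void ***)&list) reallocates the array to three slots,
 * sets list[2] = NULL and returns index 1, which the caller can now fill while
 * keeping the array NULL-terminated, e.g.:
 *
 *	int idx = append_null_to_list((void ***)&list);
 *	list[idx] = must_copy_string("memory");
 */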
107
108 /* Given a null-terminated array of strings, check whether @entry is one of the
109 * strings.
110 */
111 static bool string_in_list(char **list, const char *entry)
112 {
113 int i;
114
115 if (!list)
116 return false;
117
118 for (i = 0; list[i]; i++)
119 if (strcmp(list[i], entry) == 0)
120 return true;
121
122 return false;
123 }
124
125 /* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
126 * "name=systemd". Do not fail.
127 */
128 static char *cg_legacy_must_prefix_named(char *entry)
129 {
130 size_t len;
131 char *prefixed;
132
133 len = strlen(entry);
134 prefixed = must_alloc(len + 6);
135
136 memcpy(prefixed, "name=", sizeof("name=") - 1);
137 memcpy(prefixed + sizeof("name=") - 1, entry, len);
138 prefixed[len + 5] = '\0';
139 return prefixed;
140 }
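/* Worked example (illustrative): cg_legacy_must_prefix_named("systemd")
 * allocates strlen("systemd") + 6 = 13 bytes and returns the string
 * "name=systemd".
 */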
141
142 /* Append an entry to the clist. Do not fail. @clist must be NULL the first time
143 * we are called.
144 *
145 * We also handle named subsystems here. Any controller which is not a kernel
146 * subsystem is prefixed with "name=". Any controller which is both a kernel and
147 * a named subsystem is refused, because we cannot tell which of the two we have.
148 * (TODO: We could work around this in some cases by just remounting to be
149 * unambiguous, or by comparing mountpoint contents with current cgroup.)
150 *
151 * The last entry will always be NULL.
152 */
153 static void must_append_controller(char **klist, char **nlist, char ***clist,
154 char *entry)
155 {
156 int newentry;
157 char *copy;
158
159 if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
160 ERROR("Refusing to use ambiguous controller \"%s\"", entry);
161 ERROR("It is both a named and kernel subsystem");
162 return;
163 }
164
165 newentry = append_null_to_list((void ***)clist);
166
167 if (strncmp(entry, "name=", 5) == 0)
168 copy = must_copy_string(entry);
169 else if (string_in_list(klist, entry))
170 copy = must_copy_string(entry);
171 else
172 copy = cg_legacy_must_prefix_named(entry);
173
174 (*clist)[newentry] = copy;
175 }
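/* Worked example (illustrative): with "memory" present in @klist,
 * must_append_controller(klist, nlist, &clist, "memory") appends "memory"
 * verbatim, while a controller that is in neither list, e.g. "systemd", is
 * appended as "name=systemd". An entry found in both @klist and @nlist is
 * rejected with an error and not appended.
 */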
176
177 /* Given a handler's cgroup data, return the struct hierarchy for the controller
178 * @c, or NULL if there is none.
179 */
180 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *c)
181 {
182 int i;
183
184 if (!ops->hierarchies)
185 return NULL;
186
187 for (i = 0; ops->hierarchies[i]; i++) {
188 if (!c) {
189 /* This is the empty unified hierarchy. */
190 if (ops->hierarchies[i]->controllers &&
191 !ops->hierarchies[i]->controllers[0])
192 return ops->hierarchies[i];
193
194 continue;
195 }
196
197 if (string_in_list(ops->hierarchies[i]->controllers, c))
198 return ops->hierarchies[i];
199 }
200
201 return NULL;
202 }
203
204 #define BATCH_SIZE 50
205 static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
206 {
207 int newbatches = (newlen / BATCH_SIZE) + 1;
208 int oldbatches = (oldlen / BATCH_SIZE) + 1;
209
210 if (!*mem || newbatches > oldbatches) {
211 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
212 }
213 }
214
215 static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
216 {
217 size_t full = oldlen + newlen;
218
219 batch_realloc(dest, oldlen, full + 1);
220
221 memcpy(*dest + oldlen, new, newlen + 1);
222 }
223
224 /* Slurp in a whole file */
225 static char *read_file(const char *fnam)
226 {
227 FILE *f;
228 char *line = NULL, *buf = NULL;
229 size_t len = 0, fulllen = 0;
230 int linelen;
231
232 f = fopen(fnam, "r");
233 if (!f)
234 return NULL;
235 while ((linelen = getline(&line, &len, f)) != -1) {
236 append_line(&buf, fulllen, line, linelen);
237 fulllen += linelen;
238 }
239 fclose(f);
240 free(line);
241 return buf;
242 }
243
244 /* Taken and modified from the kernel sources. */
245 #define NBITS 32 /* bits in uint32_t */
246 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
247 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
248
249 static void set_bit(unsigned bit, uint32_t *bitarr)
250 {
251 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
252 }
253
254 static void clear_bit(unsigned bit, uint32_t *bitarr)
255 {
256 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
257 }
258
259 static bool is_set(unsigned bit, uint32_t *bitarr)
260 {
261 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
262 }
263
264 /* Create a cpumask from a cpulist, i.e. turn:
265 *
266 * 0,2-3
267 *
268 * into bit array
269 *
270 * 1 0 1 1
271 */
272 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
273 {
274 char *token;
275 size_t arrlen;
276 uint32_t *bitarr;
277
278 arrlen = BITS_TO_LONGS(nbits);
279 bitarr = calloc(arrlen, sizeof(uint32_t));
280 if (!bitarr)
281 return NULL;
282
283 lxc_iterate_parts(token, buf, ",") {
284 errno = 0;
285 unsigned end, start;
286 char *range;
287
288 start = strtoul(token, NULL, 0);
289 end = start;
290 range = strchr(token, '-');
291 if (range)
292 end = strtoul(range + 1, NULL, 0);
293
294 if (!(start <= end)) {
295 free(bitarr);
296 return NULL;
297 }
298
299 if (end >= nbits) {
300 free(bitarr);
301 return NULL;
302 }
303
304 while (start <= end)
305 set_bit(start++, bitarr);
306 }
307
308 return bitarr;
309 }
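/* Worked example (illustrative): lxc_cpumask("0,2-3", 4) allocates
 * BITS_TO_LONGS(4) = 1 uint32_t and sets bits 0, 2 and 3, i.e. the word holds
 * 0xd (binary 1101, lowest bit = cpu 0).
 */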
310
311 /* Turn cpumask into simple, comma-separated cpulist. */
312 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
313 {
314 int ret;
315 size_t i;
316 char **cpulist = NULL;
317 char numstr[LXC_NUMSTRLEN64] = {0};
318
319 for (i = 0; i <= nbits; i++) {
320 if (!is_set(i, bitarr))
321 continue;
322
323 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
324 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
325 lxc_free_array((void **)cpulist, free);
326 return NULL;
327 }
328
329 ret = lxc_append_string(&cpulist, numstr);
330 if (ret < 0) {
331 lxc_free_array((void **)cpulist, free);
332 return NULL;
333 }
334 }
335
336 if (!cpulist)
337 return NULL;
338
339 return lxc_string_join(",", (const char **)cpulist, false);
340 }
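/* Worked example (illustrative): feeding the mask from the previous example
 * back in, lxc_cpumask_to_cpulist(bitarr, 4) returns "0,2,3". Note that each
 * cpu is emitted individually; ranges such as "2-3" are not re-created.
 */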
341
342 static ssize_t get_max_cpus(char *cpulist)
343 {
344 char *c1, *c2;
345 char *maxcpus = cpulist;
346 size_t cpus = 0;
347
348 c1 = strrchr(maxcpus, ',');
349 if (c1)
350 c1++;
351
352 c2 = strrchr(maxcpus, '-');
353 if (c2)
354 c2++;
355
356 if (!c1 && !c2)
357 c1 = maxcpus;
358 else if (c1 > c2)
359 c2 = c1;
360 else if (c1 < c2)
361 c1 = c2;
362 else if (!c1 && c2)
363 c1 = c2;
364
365 errno = 0;
366 cpus = strtoul(c1, NULL, 0);
367 if (errno != 0)
368 return -1;
369
370 return cpus;
371 }
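/* Worked example (illustrative): get_max_cpus("0-3,7") returns 7 and
 * get_max_cpus("0-15") returns 15, i.e. the highest cpu number mentioned in
 * the list, not the number of cpus.
 */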
372
373 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
374 static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
375 {
376 int ret;
377 ssize_t i;
378 char *lastslash, *fpath, oldv;
379 ssize_t maxisol = 0, maxposs = 0;
380 char *cpulist = NULL, *isolcpus = NULL, *posscpus = NULL;
381 uint32_t *isolmask = NULL, *possmask = NULL;
382 bool bret = false, flipped_bit = false;
383
384 lastslash = strrchr(path, '/');
385 if (!lastslash) {
386 ERROR("Failed to detect \"/\" in \"%s\"", path);
387 return bret;
388 }
389 oldv = *lastslash;
390 *lastslash = '\0';
391 fpath = must_make_path(path, "cpuset.cpus", NULL);
392 posscpus = read_file(fpath);
393 if (!posscpus) {
394 SYSERROR("Failed to read file \"%s\"", fpath);
395 goto on_error;
396 }
397
398 /* Get maximum number of cpus found in possible cpuset. */
399 maxposs = get_max_cpus(posscpus);
400 if (maxposs < 0 || maxposs >= INT_MAX - 1)
401 goto on_error;
402
403 if (!file_exists(__ISOL_CPUS)) {
404 /* This system doesn't expose isolated cpus. */
405 DEBUG("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
406 cpulist = posscpus;
407 /* No isolated cpus and we were not already initialized by
408 * someone. Simply copy the parent's cpuset.cpus
409 * values.
410 */
411 if (!am_initialized) {
412 DEBUG("Copying cpu settings of parent cgroup");
413 goto copy_parent;
414 }
415 /* No isolated cpus and we were already initialized by someone.
416 * Nothing more to do for us.
417 */
418 goto on_success;
419 }
420
421 isolcpus = read_file(__ISOL_CPUS);
422 if (!isolcpus) {
423 SYSERROR("Failed to read file \""__ISOL_CPUS"\"");
424 goto on_error;
425 }
426 if (!isdigit(isolcpus[0])) {
427 TRACE("No isolated cpus detected");
428 cpulist = posscpus;
429 /* No isolated cpus and we were not already initialized by
430 * someone. Simply copy the parent's cpuset.cpus
431 * values.
432 */
433 if (!am_initialized) {
434 DEBUG("Copying cpu settings of parent cgroup");
435 goto copy_parent;
436 }
437 /* No isolated cpus and we were already initialized by someone.
438 * Nothing more to do for us.
439 */
440 goto on_success;
441 }
442
443 /* Get maximum number of cpus found in isolated cpuset. */
444 maxisol = get_max_cpus(isolcpus);
445 if (maxisol < 0 || maxisol >= INT_MAX - 1)
446 goto on_error;
447
448 if (maxposs < maxisol)
449 maxposs = maxisol;
450 maxposs++;
451
452 possmask = lxc_cpumask(posscpus, maxposs);
453 if (!possmask) {
454 ERROR("Failed to create cpumask for possible cpus");
455 goto on_error;
456 }
457
458 isolmask = lxc_cpumask(isolcpus, maxposs);
459 if (!isolmask) {
460 ERROR("Failed to create cpumask for isolated cpus");
461 goto on_error;
462 }
463
464 for (i = 0; i <= maxposs; i++) {
465 if (!is_set(i, isolmask) || !is_set(i, possmask))
466 continue;
467
468 flipped_bit = true;
469 clear_bit(i, possmask);
470 }
471
472 if (!flipped_bit) {
473 DEBUG("No isolated cpus present in cpuset");
474 goto on_success;
475 }
476 DEBUG("Removed isolated cpus from cpuset");
477
478 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
479 if (!cpulist) {
480 ERROR("Failed to create cpu list");
481 goto on_error;
482 }
483
484 copy_parent:
485 *lastslash = oldv;
486 free(fpath);
487 fpath = must_make_path(path, "cpuset.cpus", NULL);
488 ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false, 0666);
489 if (ret < 0) {
490 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
491 goto on_error;
492 }
493
494 on_success:
495 bret = true;
496
497 on_error:
498 free(fpath);
499
500 free(isolcpus);
501 free(isolmask);
502
503 if (posscpus != cpulist)
504 free(posscpus);
505 free(possmask);
506
507 free(cpulist);
508 return bret;
509 }
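/* Worked example (illustrative): if the parent's cpuset.cpus contains "0-7"
 * and /sys/devices/system/cpu/isolated contains "3,5", the function writes
 * "0,1,2,4,6,7" to the new cgroup's cpuset.cpus, i.e. the possible cpus with
 * the isolated ones removed.
 */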
510
511 /* Copy contents of parent(@path)/@file to @path/@file */
512 static bool copy_parent_file(char *path, char *file)
513 {
514 int ret;
515 char *fpath, *lastslash, oldv;
516 int len = 0;
517 char *value = NULL;
518
519 lastslash = strrchr(path, '/');
520 if (!lastslash) {
521 ERROR("Failed to detect \"/\" in \"%s\"", path);
522 return false;
523 }
524 oldv = *lastslash;
525 *lastslash = '\0';
526 fpath = must_make_path(path, file, NULL);
527 len = lxc_read_from_file(fpath, NULL, 0);
528 if (len <= 0)
529 goto on_error;
530
531 value = must_alloc(len + 1);
532 ret = lxc_read_from_file(fpath, value, len);
533 if (ret != len)
534 goto on_error;
535 free(fpath);
536
537 *lastslash = oldv;
538 fpath = must_make_path(path, file, NULL);
539 ret = lxc_write_to_file(fpath, value, len, false, 0666);
540 if (ret < 0)
541 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath);
542 free(fpath);
543 free(value);
544 return ret >= 0;
545
546 on_error:
547 SYSERROR("Failed to read file \"%s\"", fpath);
548 free(fpath);
549 free(value);
550 return false;
551 }
552
553 /* Initialize the cpuset hierarchy in the first directory of @cgname and set
554 * cgroup.clone_children so that children inherit settings. Since
555 * h->base_cgroup is populated by init or ourselves, we know it is already
556 * initialized.
557 */
558 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
559 {
560 int ret;
561 char v;
562 char *cgpath, *clonechildrenpath, *slash;
563
564 if (!string_in_list(h->controllers, "cpuset"))
565 return true;
566
567 if (*cgname == '/')
568 cgname++;
569 slash = strchr(cgname, '/');
570 if (slash)
571 *slash = '\0';
572
573 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
574 if (slash)
575 *slash = '/';
576
577 ret = mkdir(cgpath, 0755);
578 if (ret < 0) {
579 if (errno != EEXIST) {
580 SYSERROR("Failed to create directory \"%s\"", cgpath);
581 free(cgpath);
582 return false;
583 }
584 }
585
586 clonechildrenpath =
587 must_make_path(cgpath, "cgroup.clone_children", NULL);
588 /* unified hierarchy doesn't have clone_children */
589 if (!file_exists(clonechildrenpath)) {
590 free(clonechildrenpath);
591 free(cgpath);
592 return true;
593 }
594
595 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
596 if (ret < 0) {
597 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
598 free(clonechildrenpath);
599 free(cgpath);
600 return false;
601 }
602
603 /* Make sure any isolated cpus are removed from cpuset.cpus. */
604 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
605 SYSERROR("Failed to remove isolated cpus");
606 free(clonechildrenpath);
607 free(cgpath);
608 return false;
609 }
610
611 /* Already set for us by someone else. */
612 if (v == '1') {
613 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
614 free(clonechildrenpath);
615 free(cgpath);
616 return true;
617 }
618
619 /* copy parent's settings */
620 if (!copy_parent_file(cgpath, "cpuset.mems")) {
621 SYSERROR("Failed to copy \"cpuset.mems\" settings");
622 free(cgpath);
623 free(clonechildrenpath);
624 return false;
625 }
626 free(cgpath);
627
628 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
629 if (ret < 0) {
630 /* Set clone_children so children inherit our settings */
631 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
632 free(clonechildrenpath);
633 return false;
634 }
635 free(clonechildrenpath);
636 return true;
637 }
638
639 /* Given two null-terminated lists of strings, return true if any string is in
640 * both.
641 */
642 static bool controller_lists_intersect(char **l1, char **l2)
643 {
644 int i;
645
646 if (!l1 || !l2)
647 return false;
648
649 for (i = 0; l1[i]; i++) {
650 if (string_in_list(l2, l1[i]))
651 return true;
652 }
653
654 return false;
655 }
656
657 /* For a null-terminated list of controllers @clist, return true if any of those
658 * controllers is already listed in the null-terminated list of hierarchies @hlist.
659 * Realistically, if one is present, all must be present.
660 */
661 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
662 {
663 int i;
664
665 if (!hlist)
666 return false;
667
668 for (i = 0; hlist[i]; i++)
669 if (controller_lists_intersect(hlist[i]->controllers, clist))
670 return true;
671
672 return false;
673 }
674
675 /* Return true if the controller @entry is found in the null-terminated list of
676 * hierarchies @hlist.
677 */
678 static bool controller_found(struct hierarchy **hlist, char *entry)
679 {
680 int i;
681
682 if (!hlist)
683 return false;
684
685 for (i = 0; hlist[i]; i++)
686 if (string_in_list(hlist[i]->controllers, entry))
687 return true;
688
689 return false;
690 }
691
692 /* Return true if all of the controllers which we require have been found. The
693 * required list is freezer and anything in lxc.cgroup.use.
694 */
695 static bool all_controllers_found(struct cgroup_ops *ops)
696 {
697 char **cur;
698 struct hierarchy **hlist = ops->hierarchies;
699
700 if (!controller_found(hlist, "freezer")) {
701 ERROR("No freezer controller mountpoint found");
702 return false;
703 }
704
705 if (!ops->cgroup_use)
706 return true;
707
708 for (cur = ops->cgroup_use; cur && *cur; cur++)
709 if (!controller_found(hlist, *cur)) {
710 ERROR("No %s controller mountpoint found", *cur);
711 return false;
712 }
713
714 return true;
715 }
716
717 /* Get the controllers from a mountinfo line. There are other ways we could get
718 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
719 * could parse the mount options. But we simply assume that the mountpoint must
720 * be /sys/fs/cgroup/controller-list
721 */
722 static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
723 int type)
724 {
725 /* The fifth field (index 4) is /sys/fs/cgroup/comma-delimited-controller-list
726 * for legacy hierarchies.
727 */
728 int i;
729 char *dup, *p2, *tok;
730 char *p = line, *sep = ",";
731 char **aret = NULL;
732
733 for (i = 0; i < 4; i++) {
734 p = strchr(p, ' ');
735 if (!p)
736 return NULL;
737 p++;
738 }
739
740 /* Note, if we change how mountinfo works, then our caller will need to
741 * verify /sys/fs/cgroup/ in this field.
742 */
743 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
744 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
745 return NULL;
746 }
747
748 p += 15;
749 p2 = strchr(p, ' ');
750 if (!p2) {
751 ERROR("Corrupt mountinfo");
752 return NULL;
753 }
754 *p2 = '\0';
755
756 if (type == CGROUP_SUPER_MAGIC) {
757 /* strdup() here for v1 hierarchies. Otherwise
758 * lxc_iterate_parts() will destroy mountpoints such as
759 * "/sys/fs/cgroup/cpu,cpuacct".
760 */
761 dup = strdup(p);
762 if (!dup)
763 return NULL;
764
765 lxc_iterate_parts(tok, dup, sep) {
766 must_append_controller(klist, nlist, &aret, tok);
767 }
768
769 free(dup);
770 }
771 *p2 = ' ';
772
773 return aret;
774 }
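/* Worked example (illustrative): for the mountinfo line
 *
 *	34 25 0:29 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct
 *
 * and type == CGROUP_SUPER_MAGIC, the mountpoint suffix "cpu,cpuacct" is
 * split on ',' and, assuming both names appear in @klist, the returned list
 * is {"cpu", "cpuacct", NULL}.
 */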
775
776 static char **cg_unified_make_empty_controller(void)
777 {
778 int newentry;
779 char **aret = NULL;
780
781 newentry = append_null_to_list((void ***)&aret);
782 aret[newentry] = NULL;
783 return aret;
784 }
785
786 static char **cg_unified_get_controllers(const char *file)
787 {
788 char *buf, *tok;
789 char *sep = " \t\n";
790 char **aret = NULL;
791
792 buf = read_file(file);
793 if (!buf)
794 return NULL;
795
796 lxc_iterate_parts(tok, buf, sep) {
797 int newentry;
798 char *copy;
799
800 newentry = append_null_to_list((void ***)&aret);
801 copy = must_copy_string(tok);
802 aret[newentry] = copy;
803 }
804
805 free(buf);
806 return aret;
807 }
808
809 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
810 char *base_cgroup, int type)
811 {
812 struct hierarchy *new;
813 int newentry;
814
815 new = must_alloc(sizeof(*new));
816 new->controllers = clist;
817 new->mountpoint = mountpoint;
818 new->base_cgroup = base_cgroup;
819 new->fullcgpath = NULL;
820 new->version = type;
821
822 newentry = append_null_to_list((void ***)h);
823 (*h)[newentry] = new;
824 return new;
825 }
826
827 /* Get a copy of the mountpoint from @line, which is a line from
828 * /proc/self/mountinfo.
829 */
830 static char *cg_hybrid_get_mountpoint(char *line)
831 {
832 int i;
833 size_t len;
834 char *p2;
835 char *p = line, *sret = NULL;
836
837 for (i = 0; i < 4; i++) {
838 p = strchr(p, ' ');
839 if (!p)
840 return NULL;
841 p++;
842 }
843
844 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
845 return NULL;
846
847 p2 = strchr(p + 15, ' ');
848 if (!p2)
849 return NULL;
850 *p2 = '\0';
851
852 len = strlen(p);
853 sret = must_alloc(len + 1);
854 memcpy(sret, p, len);
855 sret[len] = '\0';
856 return sret;
857 }
858
859 /* Given a multi-line string, return a null-terminated copy of the current line. */
860 static char *copy_to_eol(char *p)
861 {
862 char *p2 = strchr(p, '\n'), *sret;
863 size_t len;
864
865 if (!p2)
866 return NULL;
867
868 len = p2 - p;
869 sret = must_alloc(len + 1);
870 memcpy(sret, p, len);
871 sret[len] = '\0';
872 return sret;
873 }
874
875 /* cgline: pointer to character after the first ':' in a line in a \n-terminated
876 * /proc/self/cgroup file. Check whether controller c is present.
877 */
878 static bool controller_in_clist(char *cgline, char *c)
879 {
880 char *tok, *eol, *tmp;
881 size_t len;
882
883 eol = strchr(cgline, ':');
884 if (!eol)
885 return false;
886
887 len = eol - cgline;
888 tmp = alloca(len + 1);
889 memcpy(tmp, cgline, len);
890 tmp[len] = '\0';
891
892 lxc_iterate_parts(tok, tmp, ",") {
893 if (strcmp(tok, c) == 0)
894 return true;
895 }
896
897 return false;
898 }
899
900 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
901 * @controller.
902 */
903 static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
904 int type)
905 {
906 char *p = basecginfo;
907
908 for (;;) {
909 bool is_cgv2_base_cgroup = false;
910
911 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
912 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
913 is_cgv2_base_cgroup = true;
914
915 p = strchr(p, ':');
916 if (!p)
917 return NULL;
918 p++;
919
920 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
921 p = strchr(p, ':');
922 if (!p)
923 return NULL;
924 p++;
925 return copy_to_eol(p);
926 }
927
928 p = strchr(p, '\n');
929 if (!p)
930 return NULL;
931 p++;
932 }
933 }
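/* Worked example (illustrative): given a @basecginfo of
 *
 *	12:cpu,cpuacct:/lxc/c1
 *	1:name=systemd:/user.slice
 *	0::/user.slice/session-2.scope
 *
 * a lookup of controller "cpu" with a legacy type returns "/lxc/c1", while a
 * lookup with type CGROUP2_SUPER_MAGIC matches the "0::" line and returns
 * "/user.slice/session-2.scope".
 */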
934
935 static void must_append_string(char ***list, char *entry)
936 {
937 int newentry;
938 char *copy;
939
940 newentry = append_null_to_list((void ***)list);
941 copy = must_copy_string(entry);
942 (*list)[newentry] = copy;
943 }
944
945 static int get_existing_subsystems(char ***klist, char ***nlist)
946 {
947 FILE *f;
948 char *line = NULL;
949 size_t len = 0;
950
951 f = fopen("/proc/self/cgroup", "r");
952 if (!f)
953 return -1;
954
955 while (getline(&line, &len, f) != -1) {
956 char *p, *p2, *tok;
957 p = strchr(line, ':');
958 if (!p)
959 continue;
960 p++;
961 p2 = strchr(p, ':');
962 if (!p2)
963 continue;
964 *p2 = '\0';
965
966 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
967 * contains an entry of the form:
968 *
969 * 0::/some/path
970 *
971 * In this case we use "cgroup2" as controller name.
972 */
973 if ((p2 - p) == 0) {
974 must_append_string(klist, "cgroup2");
975 continue;
976 }
977
978 lxc_iterate_parts(tok, p, ",") {
979 if (strncmp(tok, "name=", 5) == 0)
980 must_append_string(nlist, tok);
981 else
982 must_append_string(klist, tok);
983 }
984 }
985
986 free(line);
987 fclose(f);
988 return 0;
989 }
990
991 static void trim(char *s)
992 {
993 size_t len;
994
995 len = strlen(s);
996 while ((len > 1) && (s[len - 1] == '\n'))
997 s[--len] = '\0';
998 }
999
1000 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
1001 {
1002 int i;
1003 struct hierarchy **it;
1004
1005 if (!ops->hierarchies) {
1006 TRACE(" No hierarchies found");
1007 return;
1008 }
1009
1010 TRACE(" Hierarchies:");
1011 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
1012 int j;
1013 char **cit;
1014
1015 TRACE(" %d: base_cgroup: %s", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1016 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1017 TRACE(" controllers:");
1018 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1019 TRACE(" %d: %s", j, *cit);
1020 }
1021 }
1022
1023 static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
1024 char **nlist)
1025 {
1026 int k;
1027 char **it;
1028
1029 TRACE("basecginfo is:");
1030 TRACE("%s", basecginfo);
1031
1032 for (k = 0, it = klist; it && *it; it++, k++)
1033 TRACE("kernel subsystem %d: %s", k, *it);
1034
1035 for (k = 0, it = nlist; it && *it; it++, k++)
1036 TRACE("named subsystem %d: %s", k, *it);
1037 }
1038
1039 static int cgroup_rmdir(struct hierarchy **hierarchies,
1040 const char *container_cgroup)
1041 {
1042 int i;
1043
1044 if (!container_cgroup || !hierarchies)
1045 return 0;
1046
1047 for (i = 0; hierarchies[i]; i++) {
1048 int ret;
1049 struct hierarchy *h = hierarchies[i];
1050
1051 if (!h->fullcgpath)
1052 continue;
1053
1054 ret = recursive_destroy(h->fullcgpath);
1055 if (ret < 0)
1056 WARN("Failed to destroy \"%s\"", h->fullcgpath);
1057
1058 free(h->fullcgpath);
1059 h->fullcgpath = NULL;
1060 }
1061
1062 return 0;
1063 }
1064
1065 struct generic_userns_exec_data {
1066 struct hierarchy **hierarchies;
1067 const char *container_cgroup;
1068 struct lxc_conf *conf;
1069 uid_t origuid; /* target uid in parent namespace */
1070 char *path;
1071 };
1072
1073 static int cgroup_rmdir_wrapper(void *data)
1074 {
1075 int ret;
1076 struct generic_userns_exec_data *arg = data;
1077 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1078 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1079
1080 ret = setresgid(nsgid, nsgid, nsgid);
1081 if (ret < 0) {
1082 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1083 (int)nsgid, (int)nsgid);
1084 return -1;
1085 }
1086
1087 ret = setresuid(nsuid, nsuid, nsuid);
1088 if (ret < 0) {
1089 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1090 (int)nsuid, (int)nsuid);
1091 return -1;
1092 }
1093
1094 ret = setgroups(0, NULL);
1095 if (ret < 0 && errno != EPERM) {
1096 SYSERROR("Failed to setgroups(0, NULL)");
1097 return -1;
1098 }
1099
1100 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1101 }
1102
1103 static void cgfsng_destroy(struct cgroup_ops *ops, struct lxc_handler *handler)
1104 {
1105 int ret;
1106 struct generic_userns_exec_data wrap;
1107
1108 wrap.origuid = 0;
1109 wrap.container_cgroup = ops->container_cgroup;
1110 wrap.hierarchies = ops->hierarchies;
1111 wrap.conf = handler->conf;
1112
1113 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1114 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
1115 "cgroup_rmdir_wrapper");
1116 else
1117 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
1118 if (ret < 0) {
1119 WARN("Failed to destroy cgroups");
1120 return;
1121 }
1122 }
1123
1124 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1125 {
1126 size_t i, parts_len;
1127 char **it;
1128 size_t full_len = 0;
1129 char *add_controllers = NULL, *cgroup = NULL;
1130 char **parts = NULL;
1131 bool bret = false;
1132
1133 if (h->version != CGROUP2_SUPER_MAGIC)
1134 return true;
1135
1136 if (!h->controllers)
1137 return true;
1138
1139 /* For now we simply enable all controllers that we have detected by
1140 * creating a string like "+memory +pids +cpu +io".
1141 * TODO: In the near future we might want to support "-<controller>"
1142 * etc. but whether supporting semantics like this makes sense will need
1143 * some thinking.
1144 */
1145 for (it = h->controllers; it && *it; it++) {
1146 full_len += strlen(*it) + 2;
1147 add_controllers = must_realloc(add_controllers, full_len + 1);
1148
1149 if (h->controllers[0] == *it)
1150 add_controllers[0] = '\0';
1151
1152 (void)strlcat(add_controllers, "+", full_len + 1);
1153 (void)strlcat(add_controllers, *it, full_len + 1);
1154
1155 if ((it + 1) && *(it + 1))
1156 (void)strlcat(add_controllers, " ", full_len + 1);
1157 }
1158
1159 parts = lxc_string_split(cgname, '/');
1160 if (!parts)
1161 goto on_error;
1162
1163 parts_len = lxc_array_len((void **)parts);
1164 if (parts_len > 0)
1165 parts_len--;
1166
1167 cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
1168 for (i = 0; i < parts_len; i++) {
1169 int ret;
1170 char *target;
1171
1172 cgroup = must_append_path(cgroup, parts[i], NULL);
1173 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1174 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1175 free(target);
1176 if (ret < 0) {
1177 SYSERROR("Could not enable \"%s\" controllers in the "
1178 "unified cgroup \"%s\"", add_controllers, cgroup);
1179 goto on_error;
1180 }
1181 }
1182
1183 bret = true;
1184
1185 on_error:
1186 lxc_free_array((void **)parts, free);
1187 free(add_controllers);
1188 free(cgroup);
1189 return bret;
1190 }
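/* Worked example (illustrative): for a unified hierarchy with
 * h->controllers = {"memory", "pids", NULL} and cgname "lxc/c1", the string
 * "+memory +pids" is written to
 * <mountpoint>/<base_cgroup>/lxc/cgroup.subtree_control, enabling those
 * controllers for the leaf cgroup "c1".
 */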
1191
1192 static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1193 {
1194 int ret;
1195
1196 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1197 if (dir_exists(h->fullcgpath)) {
1198 ERROR("The cgroup \"%s\" already existed", h->fullcgpath);
1199 return false;
1200 }
1201
1202 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1203 ERROR("Failed to handle legacy cpuset controller");
1204 return false;
1205 }
1206
1207 ret = mkdir_p(h->fullcgpath, 0755);
1208 if (ret < 0) {
1209 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
1210 return false;
1211 }
1212
1213 return cg_unified_create_cgroup(h, cgname);
1214 }
1215
1216 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1217 {
1218 int ret;
1219
1220 ret = rmdir(h->fullcgpath);
1221 if (ret < 0)
1222 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", h->fullcgpath);
1223
1224 free(h->fullcgpath);
1225 h->fullcgpath = NULL;
1226 }
1227
1228 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1229 * next cgroup_pattern-1, -2, ..., -999.
1230 */
1231 static inline bool cgfsng_create(struct cgroup_ops *ops,
1232 struct lxc_handler *handler)
1233 {
1234 int i;
1235 size_t len;
1236 char *container_cgroup, *offset, *tmp;
1237 int idx = 0;
1238 struct lxc_conf *conf = handler->conf;
1239
1240 if (ops->container_cgroup) {
1241 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
1242 return false;
1243 }
1244
1245 if (!conf)
1246 return false;
1247
1248 if (conf->cgroup_meta.dir)
1249 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
1250 else
1251 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1252 if (!tmp) {
1253 ERROR("Failed expanding cgroup name pattern");
1254 return false;
1255 }
1256
1257 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1258 container_cgroup = must_alloc(len);
1259 (void)strlcpy(container_cgroup, tmp, len);
1260 free(tmp);
1261 offset = container_cgroup + len - 5;
1262
1263 again:
1264 if (idx == 1000) {
1265 ERROR("Too many conflicting cgroup names");
1266 goto out_free;
1267 }
1268
1269 if (idx) {
1270 int ret;
1271
1272 ret = snprintf(offset, 5, "-%d", idx);
1273 if (ret < 0 || (size_t)ret >= 5) {
1274 FILE *f = fopen("/dev/null", "w");
1275 if (f) {
1276 fprintf(f, "Workaround for GCC7 bug: "
1277 "https://gcc.gnu.org/bugzilla/"
1278 "show_bug.cgi?id=78969");
1279 fclose(f);
1280 }
1281 }
1282 }
1283
1284 for (i = 0; ops->hierarchies[i]; i++) {
1285 if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
1286 int j;
1287 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->fullcgpath);
1288 free(ops->hierarchies[i]->fullcgpath);
1289 ops->hierarchies[i]->fullcgpath = NULL;
1290 for (j = 0; j < i; j++)
1291 remove_path_for_hierarchy(ops->hierarchies[j], container_cgroup);
1292 idx++;
1293 goto again;
1294 }
1295 }
1296
1297 ops->container_cgroup = container_cgroup;
1298
1299 return true;
1300
1301 out_free:
1302 free(container_cgroup);
1303
1304 return false;
1305 }
1306
1307 static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid)
1308 {
1309 int i, len;
1310 char pidstr[25];
1311
1312 len = snprintf(pidstr, 25, "%d", pid);
1313 if (len < 0 || len >= 25)
1314 return false;
1315
1316 for (i = 0; ops->hierarchies[i]; i++) {
1317 int ret;
1318 char *fullpath;
1319
1320 fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
1321 "cgroup.procs", NULL);
1322 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
1323 if (ret != 0) {
1324 SYSERROR("Failed to enter cgroup \"%s\"", fullpath);
1325 free(fullpath);
1326 return false;
1327 }
1328 free(fullpath);
1329 }
1330
1331 return true;
1332 }
1333
1334 static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
1335 mode_t chmod_mode)
1336 {
1337 int ret;
1338
1339 ret = chown(path, chown_uid, chown_gid);
1340 if (ret < 0) {
1341 SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
1342 return -1;
1343 }
1344
1345 ret = chmod(path, chmod_mode);
1346 if (ret < 0) {
1347 SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
1348 return -1;
1349 }
1350
1351 return 0;
1352 }
1353
1354 /* chgrp the container cgroups to the container group. We leave
1355 * the container owner as the cgroup owner, so we must make the
1356 * directories 0775 so that the container can create sub-cgroups.
1357 *
1358 * Also chown the tasks and cgroup.procs files. Those may not
1359 * exist depending on kernel version.
1360 */
1361 static int chown_cgroup_wrapper(void *data)
1362 {
1363 int i, ret;
1364 uid_t destuid;
1365 struct generic_userns_exec_data *arg = data;
1366 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1367 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1368
1369 ret = setresgid(nsgid, nsgid, nsgid);
1370 if (ret < 0) {
1371 SYSERROR("Failed to setresgid(%d, %d, %d)",
1372 (int)nsgid, (int)nsgid, (int)nsgid);
1373 return -1;
1374 }
1375
1376 ret = setresuid(nsuid, nsuid, nsuid);
1377 if (ret < 0) {
1378 SYSERROR("Failed to setresuid(%d, %d, %d)",
1379 (int)nsuid, (int)nsuid, (int)nsuid);
1380 return -1;
1381 }
1382
1383 ret = setgroups(0, NULL);
1384 if (ret < 0 && errno != EPERM) {
1385 SYSERROR("Failed to setgroups(0, NULL)");
1386 return -1;
1387 }
1388
1389 destuid = get_ns_uid(arg->origuid);
1390
1391 for (i = 0; arg->hierarchies[i]; i++) {
1392 char *fullpath;
1393 char *path = arg->hierarchies[i]->fullcgpath;
1394
1395 ret = chowmod(path, destuid, nsgid, 0775);
1396 if (ret < 0)
1397 return -1;
1398
1399 /* Failures to chown() these are inconvenient but not
1400 * detrimental. We leave these owned by the container launcher,
1401 * so that container root can write to the files to attach. We
1402 * chmod() them 664 so that container systemd can write to the
1403 * files (which systemd in wily insists on doing).
1404 */
1405
1406 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1407 fullpath = must_make_path(path, "tasks", NULL);
1408 (void)chowmod(fullpath, destuid, nsgid, 0664);
1409 free(fullpath);
1410 }
1411
1412 fullpath = must_make_path(path, "cgroup.procs", NULL);
1413 (void)chowmod(fullpath, destuid, nsgid, 0664);
1414 free(fullpath);
1415
1416 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1417 continue;
1418
1419 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
1420 (void)chowmod(fullpath, destuid, nsgid, 0664);
1421 free(fullpath);
1422
1423 fullpath = must_make_path(path, "cgroup.threads", NULL);
1424 (void)chowmod(fullpath, destuid, nsgid, 0664);
1425 free(fullpath);
1426 }
1427
1428 return 0;
1429 }
1430
1431 static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf)
1432 {
1433 struct generic_userns_exec_data wrap;
1434
1435 if (lxc_list_empty(&conf->id_map))
1436 return true;
1437
1438 wrap.origuid = geteuid();
1439 wrap.path = NULL;
1440 wrap.hierarchies = ops->hierarchies;
1441 wrap.conf = conf;
1442
1443 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1444 "chown_cgroup_wrapper") < 0) {
1445 ERROR("Error requesting cgroup chown in new user namespace");
1446 return false;
1447 }
1448
1449 return true;
1450 }
1451
1452 /* cgroup-full:* is done, no need to create subdirs */
1453 static bool cg_mount_needs_subdirs(int type)
1454 {
1455 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1456 return false;
1457
1458 return true;
1459 }
1460
1461 /* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
1462 * remount the controller read-only if needed and bind mount the cgroupfs onto
1463 * controller/the/cg/path.
1464 */
1465 static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1466 char *controllerpath, char *cgpath,
1467 const char *container_cgroup)
1468 {
1469 int ret, remount_flags;
1470 char *sourcepath;
1471 int flags = MS_BIND;
1472
1473 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1474 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1475 if (ret < 0) {
1476 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1477 controllerpath, controllerpath);
1478 return -1;
1479 }
1480
1481 remount_flags = add_required_remount_flags(controllerpath,
1482 controllerpath,
1483 flags | MS_REMOUNT);
1484 ret = mount(controllerpath, controllerpath, "cgroup",
1485 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1486 NULL);
1487 if (ret < 0) {
1488 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
1489 return -1;
1490 }
1491
1492 INFO("Remounted %s read-only", controllerpath);
1493 }
1494
1495 sourcepath = must_make_path(h->mountpoint, h->base_cgroup,
1496 container_cgroup, NULL);
1497 if (type == LXC_AUTO_CGROUP_RO)
1498 flags |= MS_RDONLY;
1499
1500 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1501 if (ret < 0) {
1502 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1503 free(sourcepath);
1504 return -1;
1505 }
1506 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1507
1508 if (flags & MS_RDONLY) {
1509 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1510 flags | MS_REMOUNT);
1511 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1512 if (ret < 0) {
1513 SYSERROR("Failed to remount \"%s\" ro", cgpath);
1514 free(sourcepath);
1515 return -1;
1516 }
1517 INFO("Remounted %s read-only", cgpath);
1518 }
1519
1520 free(sourcepath);
1521 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1522 return 0;
1523 }
1524
1525 /* __cg_mount_direct
1526 *
1527 * Mount cgroup hierarchies directly without using bind-mounts. The main
1528 * use cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1529 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1530 */
1531 static int __cg_mount_direct(int type, struct hierarchy *h,
1532 const char *controllerpath)
1533 {
1534 int ret;
1535 char *controllers = NULL;
1536 char *fstype = "cgroup2";
1537 unsigned long flags = 0;
1538
1539 flags |= MS_NOSUID;
1540 flags |= MS_NOEXEC;
1541 flags |= MS_NODEV;
1542 flags |= MS_RELATIME;
1543
1544 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1545 flags |= MS_RDONLY;
1546
1547 if (h->version != CGROUP2_SUPER_MAGIC) {
1548 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1549 if (!controllers)
1550 return -ENOMEM;
1551 fstype = "cgroup";
1552 }
1553
1554 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1555 free(controllers);
1556 if (ret < 0) {
1557 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1558 return -1;
1559 }
1560
1561 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1562 return 0;
1563 }
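/* Illustrative sketch: for a legacy hierarchy with controllers
 * {"cpu", "cpuacct"} the mount performed above is roughly equivalent to
 *
 *	mount("cgroup", controllerpath, "cgroup",
 *	      MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RELATIME, "cpu,cpuacct");
 *
 * while for the unified hierarchy the filesystem type is "cgroup2" and no
 * controller list is passed. LXC_AUTO_CGROUP_RO/_FULL_RO additionally set
 * MS_RDONLY.
 */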
1564
1565 static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
1566 const char *controllerpath)
1567 {
1568 return __cg_mount_direct(type, h, controllerpath);
1569 }
1570
1571 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1572 const char *controllerpath)
1573 {
1574 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1575 return 0;
1576
1577 return __cg_mount_direct(type, h, controllerpath);
1578 }
1579
1580 static bool cgfsng_mount(struct cgroup_ops *ops, struct lxc_handler *handler,
1581 const char *root, int type)
1582 {
1583 int i, ret;
1584 char *tmpfspath = NULL;
1585 bool has_cgns = false, retval = false, wants_force_mount = false;
1586
1587 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1588 return true;
1589
1590 if (type & LXC_AUTO_CGROUP_FORCE) {
1591 type &= ~LXC_AUTO_CGROUP_FORCE;
1592 wants_force_mount = true;
1593 }
1594
1595 if (!wants_force_mount){
1596 if (!lxc_list_empty(&handler->conf->keepcaps))
1597 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1598 else
1599 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1600 }
1601
1602 has_cgns = cgns_supported();
1603 if (has_cgns && !wants_force_mount)
1604 return true;
1605
1606 if (type == LXC_AUTO_CGROUP_NOSPEC)
1607 type = LXC_AUTO_CGROUP_MIXED;
1608 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1609 type = LXC_AUTO_CGROUP_FULL_MIXED;
1610
1611 /* Mount tmpfs */
1612 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1613 ret = safe_mount(NULL, tmpfspath, "tmpfs",
1614 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1615 "size=10240k,mode=755", root);
1616 if (ret < 0)
1617 goto on_error;
1618
1619 for (i = 0; ops->hierarchies[i]; i++) {
1620 char *controllerpath, *path2;
1621 struct hierarchy *h = ops->hierarchies[i];
1622 char *controller = strrchr(h->mountpoint, '/');
1623
1624 if (!controller)
1625 continue;
1626 controller++;
1627
1628 controllerpath = must_make_path(tmpfspath, controller, NULL);
1629 if (dir_exists(controllerpath)) {
1630 free(controllerpath);
1631 continue;
1632 }
1633
1634 ret = mkdir(controllerpath, 0755);
1635 if (ret < 0) {
1636 SYSERROR("Error creating cgroup path: %s", controllerpath);
1637 free(controllerpath);
1638 goto on_error;
1639 }
1640
1641 if (has_cgns && wants_force_mount) {
1642 /* If cgroup namespaces are supported but the container
1643 * will not have CAP_SYS_ADMIN after it has started we
1644 * need to mount the cgroups manually.
1645 */
1646 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
1647 free(controllerpath);
1648 if (ret < 0)
1649 goto on_error;
1650
1651 continue;
1652 }
1653
1654 ret = cg_mount_cgroup_full(type, h, controllerpath);
1655 if (ret < 0) {
1656 free(controllerpath);
1657 goto on_error;
1658 }
1659
1660 if (!cg_mount_needs_subdirs(type)) {
1661 free(controllerpath);
1662 continue;
1663 }
1664
1665 path2 = must_make_path(controllerpath, h->base_cgroup,
1666 ops->container_cgroup, NULL);
1667 ret = mkdir_p(path2, 0755);
1668 if (ret < 0) {
1669 free(controllerpath);
1670 free(path2);
1671 goto on_error;
1672 }
1673
1674 ret = cg_legacy_mount_controllers(type, h, controllerpath,
1675 path2, ops->container_cgroup);
1676 free(controllerpath);
1677 free(path2);
1678 if (ret < 0)
1679 goto on_error;
1680 }
1681 retval = true;
1682
1683 on_error:
1684 free(tmpfspath);
1685 return retval;
1686 }
1687
1688 static int recursive_count_nrtasks(char *dirname)
1689 {
1690 struct dirent *direntp;
1691 DIR *dir;
1692 int count = 0, ret;
1693 char *path;
1694
1695 dir = opendir(dirname);
1696 if (!dir)
1697 return 0;
1698
1699 while ((direntp = readdir(dir))) {
1700 struct stat mystat;
1701
1702 if (!strcmp(direntp->d_name, ".") ||
1703 !strcmp(direntp->d_name, ".."))
1704 continue;
1705
1706 path = must_make_path(dirname, direntp->d_name, NULL);
1707
1708 if (lstat(path, &mystat))
1709 goto next;
1710
1711 if (!S_ISDIR(mystat.st_mode))
1712 goto next;
1713
1714 count += recursive_count_nrtasks(path);
1715 next:
1716 free(path);
1717 }
1718
1719 path = must_make_path(dirname, "cgroup.procs", NULL);
1720 ret = lxc_count_file_lines(path);
1721 if (ret != -1)
1722 count += ret;
1723 free(path);
1724
1725 (void)closedir(dir);
1726
1727 return count;
1728 }
1729
1730 static int cgfsng_nrtasks(struct cgroup_ops *ops)
1731 {
1732 int count;
1733 char *path;
1734
1735 if (!ops->container_cgroup || !ops->hierarchies)
1736 return -1;
1737
1738 path = must_make_path(ops->hierarchies[0]->fullcgpath, NULL);
1739 count = recursive_count_nrtasks(path);
1740 free(path);
1741 return count;
1742 }
1743
1744 /* Only root needs to escape to the cgroup of its init. */
1745 static bool cgfsng_escape(const struct cgroup_ops *ops)
1746 {
1747 int i;
1748
1749 if (geteuid())
1750 return true;
1751
1752 for (i = 0; ops->hierarchies[i]; i++) {
1753 int ret;
1754 char *fullpath;
1755
1756 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1757 ops->hierarchies[i]->base_cgroup,
1758 "cgroup.procs", NULL);
1759 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1760 if (ret != 0) {
1761 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1762 free(fullpath);
1763 return false;
1764 }
1765 free(fullpath);
1766 }
1767
1768 return true;
1769 }
1770
1771 static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1772 {
1773 int i;
1774
1775 for (i = 0; ops->hierarchies[i]; i++)
1776 ;
1777
1778 return i;
1779 }
1780
1781 static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1782 {
1783 int i;
1784
1785 /* sanity check n */
1786 for (i = 0; i < n; i++)
1787 if (!ops->hierarchies[i])
1788 return false;
1789
1790 *out = ops->hierarchies[i]->controllers;
1791
1792 return true;
1793 }
1794
1795 #define THAWED "THAWED"
1796 #define THAWED_LEN (strlen(THAWED))
1797
1798 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1799 * to be adapted.
1800 */
1801 static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1802 {
1803 int ret;
1804 char *fullpath;
1805 struct hierarchy *h;
1806
1807 h = get_hierarchy(ops, "freezer");
1808 if (!h)
1809 return false;
1810
1811 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1812 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1813 free(fullpath);
1814 if (ret < 0)
1815 return false;
1816
1817 return true;
1818 }
1819
1820 static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1821 const char *controller)
1822 {
1823 struct hierarchy *h;
1824
1825 h = get_hierarchy(ops, controller);
1826 if (!h) {
1827 WARN("Failed to find hierarchy for controller \"%s\"",
1828 controller ? controller : "(null)");
1829 return NULL;
1830 }
1831
1832 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1833 }
1834
1835 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
1836 * which must be freed by the caller.
1837 */
1838 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1839 const char *inpath,
1840 const char *filename)
1841 {
1842 return must_make_path(h->mountpoint, inpath, filename, NULL);
1843 }
1844
1845 /* Technically, we're always at a delegation boundary here (this is especially
1846 * true when cgroup namespaces are available). The reasoning is that in order
1847 * for us to have been able to start a container in the first place, the root
1848 * cgroup must have been a leaf node. Now, either the container's init system
1849 * has populated the cgroup and kept it as a leaf node, or it has created
1850 * subtrees. In the former case we simply attach to the leaf node we created
1851 * when we started the container; in the latter case we create our own cgroup
1852 * for the attaching process.
1853 */
1854 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
1855 const char *lxcpath, const char *pidstr,
1856 size_t pidstr_len, const char *controller)
1857 {
1858 int ret;
1859 size_t len;
1860 int fret = -1, idx = 0;
1861 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
1862
1863 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
1864 /* not running */
1865 if (!container_cgroup)
1866 return 0;
1867
1868 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
1869 full_path = must_make_path(base_path, "cgroup.procs", NULL);
1870 /* cgroup is populated */
1871 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
1872 if (ret < 0 && errno != EBUSY)
1873 goto on_error;
1874
1875 if (ret == 0)
1876 goto on_success;
1877
1878 free(full_path);
1879
1880 len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
1881 sizeof("/cgroup.procs") - 1;
1882 full_path = must_alloc(len + 1);
1883 do {
1884 if (idx)
1885 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
1886 base_path, idx);
1887 else
1888 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
1889 if (ret < 0 || (size_t)ret >= len + 1)
1890 goto on_error;
1891
1892 ret = mkdir_p(full_path, 0755);
1893 if (ret < 0 && errno != EEXIST)
1894 goto on_error;
1895
1896 (void)strlcat(full_path, "/cgroup.procs", len + 1);
1897 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
1898 if (ret == 0)
1899 goto on_success;
1900
1901 /* this is a non-leaf node */
1902 if (errno != EBUSY)
1903 goto on_error;
1904
1905 } while (++idx > 0 && idx < 1000);
1906
1907 on_success:
1908 if (idx < 1000)
1909 fret = 0;
1910
1911 on_error:
1912 free(base_path);
1913 free(container_cgroup);
1914 free(full_path);
1915
1916 return fret;
1917 }
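/* Illustrative sketch of the fallback above: if writing the pid to
 * <cgroup>/cgroup.procs fails with EBUSY (the cgroup is a non-leaf node), the
 * function creates <cgroup>/lxc, then <cgroup>/lxc-1, <cgroup>/lxc-2, ... up
 * to lxc-999, and attaches the pid to the first of these whose cgroup.procs
 * write succeeds.
 */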
1918
1919 static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
1920 const char *lxcpath, pid_t pid)
1921 {
1922 int i, len, ret;
1923 char pidstr[25];
1924
1925 len = snprintf(pidstr, 25, "%d", pid);
1926 if (len < 0 || len >= 25)
1927 return false;
1928
1929 for (i = 0; ops->hierarchies[i]; i++) {
1930 char *path;
1931 char *fullpath = NULL;
1932 struct hierarchy *h = ops->hierarchies[i];
1933
1934 if (h->version == CGROUP2_SUPER_MAGIC) {
1935 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
1936 h->controllers[0]);
1937 if (ret < 0)
1938 return false;
1939
1940 continue;
1941 }
1942
1943 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1944 /* not running */
1945 if (!path)
1946 continue;
1947
1948 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1949 free(path);
1950 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
1951 if (ret < 0) {
1952 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1953 free(fullpath);
1954 return false;
1955 }
1956 free(fullpath);
1957 }
1958
1959 return true;
1960 }
1961
1962 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
1963 * don't have a cgroup_data set up, so we ask the running container through the
1964 * commands API for the cgroup path.
1965 */
1966 static int cgfsng_get(struct cgroup_ops *ops, const char *filename, char *value,
1967 size_t len, const char *name, const char *lxcpath)
1968 {
1969 int ret = -1;
1970 size_t controller_len;
1971 char *controller, *p, *path;
1972 struct hierarchy *h;
1973
1974 controller_len = strlen(filename);
1975 controller = alloca(controller_len + 1);
1976 (void)strlcpy(controller, filename, controller_len + 1);
1977
1978 p = strchr(controller, '.');
1979 if (p)
1980 *p = '\0';
1981
1982 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
1983 /* not running */
1984 if (!path)
1985 return -1;
1986
1987 h = get_hierarchy(ops, controller);
1988 if (h) {
1989 char *fullpath;
1990
1991 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
1992 ret = lxc_read_from_file(fullpath, value, len);
1993 free(fullpath);
1994 }
1995 free(path);
1996
1997 return ret;
1998 }
1999
2000 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2001 * don't have a cgroup_data set up, so we ask the running container through the
2002 * commands API for the cgroup path.
2003 */
2004 static int cgfsng_set(struct cgroup_ops *ops, const char *filename,
2005 const char *value, const char *name, const char *lxcpath)
2006 {
2007 int ret = -1;
2008 size_t controller_len;
2009 char *controller, *p, *path;
2010 struct hierarchy *h;
2011
2012 controller_len = strlen(filename);
2013 controller = alloca(controller_len + 1);
2014 (void)strlcpy(controller, filename, controller_len + 1);
2015
2016 p = strchr(controller, '.');
2017 if (p)
2018 *p = '\0';
2019
2020 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2021 /* not running */
2022 if (!path)
2023 return -1;
2024
2025 h = get_hierarchy(ops, controller);
2026 if (h) {
2027 char *fullpath;
2028
2029 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2030 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2031 free(fullpath);
2032 }
2033 free(path);
2034
2035 return ret;
2036 }
2037
2038 /* Take a devices cgroup line
2039 * /dev/foo rwx
2040 * and convert it to a valid
2041 * type major:minor mode
2042 * line. Return <0 on error. @dest is a preallocated buffer long enough to hold
2043 * the output.
2044 */
2045 static int convert_devpath(const char *invalue, char *dest)
2046 {
2047 int n_parts;
2048 char *p, *path, type;
2049 unsigned long minor, major;
2050 struct stat sb;
2051 int ret = -EINVAL;
2052 char *mode = NULL;
2053
2054 path = must_copy_string(invalue);
2055
2056 /* Read path followed by mode. Ignore any trailing text.
2057 * A ' # comment' would be legal. Technically other text is not
2058 * legal, we could check for that if we cared to.
2059 */
2060 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2061 if (*p != ' ')
2062 continue;
2063 *p = '\0';
2064
2065 if (n_parts != 1)
2066 break;
2067 p++;
2068 n_parts++;
2069
2070 while (*p == ' ')
2071 p++;
2072
2073 mode = p;
2074
2075 if (*p == '\0')
2076 goto out;
2077 }
2078
2079 if (n_parts == 1)
2080 goto out;
2081
2082 ret = stat(path, &sb);
2083 if (ret < 0)
2084 goto out;
2085
2086 mode_t m = sb.st_mode & S_IFMT;
2087 switch (m) {
2088 case S_IFBLK:
2089 type = 'b';
2090 break;
2091 case S_IFCHR:
2092 type = 'c';
2093 break;
2094 default:
2095 ERROR("Unsupported device type %i for \"%s\"", m, path);
2096 ret = -EINVAL;
2097 goto out;
2098 }
2099
2100 major = MAJOR(sb.st_rdev);
2101 minor = MINOR(sb.st_rdev);
2102 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
2103 if (ret < 0 || ret >= 50) {
2104 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2105 "chars)", type, major, minor, mode);
2106 ret = -ENAMETOOLONG;
2107 goto out;
2108 }
2109 ret = 0;
2110
2111 out:
2112 free(path);
2113 return ret;
2114 }
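/* Worked example (illustrative): on a typical system where /dev/null is the
 * character device 1:3, convert_devpath("/dev/null rwm", dest) writes
 * "c 1:3 rwm" into @dest.
 */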
2115
2116 /* Called from setup_limits - here we have the container's cgroup_data because
2117 * we created the cgroups.
2118 */
2119 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2120 const char *value)
2121 {
2122 size_t len;
2123 char *fullpath, *p;
2124 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2125 char converted_value[50];
2126 struct hierarchy *h;
2127 int ret = 0;
2128 char *controller = NULL;
2129
2130 len = strlen(filename);
2131 controller = alloca(len + 1);
2132 (void)strlcpy(controller, filename, len + 1);
2133
2134 p = strchr(controller, '.');
2135 if (p)
2136 *p = '\0';
2137
2138 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2139 ret = convert_devpath(value, converted_value);
2140 if (ret < 0)
2141 return ret;
2142 value = converted_value;
2143 }
2144
2145 h = get_hierarchy(ops, controller);
2146 if (!h) {
2147 ERROR("Failed to setup limits for the \"%s\" controller. "
2148 "The controller seems to be unused by \"cgfsng\" cgroup "
2149 "driver or not enabled on the cgroup hierarchy",
2150 controller);
2151 errno = ENOENT;
2152 return -ENOENT;
2153 }
2154
2155 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2156 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2157 free(fullpath);
2158 return ret;
2159 }
2160
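/* Apply the legacy (cgroup v1) settings in @cgroup_settings. Depending on
 * @do_devices, either only the devices controller entries or all other
 * entries are written. The list is ordered via sort_cgroup_settings()
 * before being applied, and permission errors (EACCES/EPERM) on devices
 * entries are only warned about rather than treated as fatal.
 */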
2161 static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
2162 struct lxc_list *cgroup_settings,
2163 bool do_devices)
2164 {
2165 struct lxc_list *iterator, *next, *sorted_cgroup_settings;
2166 struct lxc_cgroup *cg;
2167 bool ret = false;
2168
2169 if (lxc_list_empty(cgroup_settings))
2170 return true;
2171
2172 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2173 if (!sorted_cgroup_settings)
2174 return false;
2175
2176 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2177 cg = iterator->elem;
2178
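/* Only apply the entries that belong to the requested pass: devices
 * controller settings when @do_devices is true, all other settings
 * otherwise.
 */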
2179 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2180 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2181 if (do_devices && (errno == EACCES || errno == EPERM)) {
2182 WARN("Failed to set \"%s\" to \"%s\"",
2183 cg->subsystem, cg->value);
2184 continue;
2185 }
2186 WARN("Failed to set \"%s\" to \"%s\"",
2187 cg->subsystem, cg->value);
2188 goto out;
2189 }
2190 DEBUG("Set controller \"%s\" set to \"%s\"",
2191 cg->subsystem, cg->value);
2192 }
2193 }
2194
2195 ret = true;
2196 INFO("Limits for the legacy cgroup hierarchies have been setup");
2197 out:
2198 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2199 lxc_list_del(iterator);
2200 free(iterator);
2201 }
2202 free(sorted_cgroup_settings);
2203 return ret;
2204 }
2205
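/* Apply the cgroup v2 settings in @cgroup_settings by writing each value to
 * the corresponding file below the container's unified cgroup. Returns false
 * if no unified hierarchy is available or if any write fails.
 */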
2206 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2207 struct lxc_list *cgroup_settings)
2208 {
2209 struct lxc_list *iterator;
2210 struct hierarchy *h = ops->unified;
2211
2212 if (lxc_list_empty(cgroup_settings))
2213 return true;
2214
2215 if (!h)
2216 return false;
2217
2218 lxc_list_for_each(iterator, cgroup_settings) {
2219 int ret;
2220 char *fullpath;
2221 struct lxc_cgroup *cg = iterator->elem;
2222
2223 fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
2224 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2225 free(fullpath);
2226 if (ret < 0) {
2227 SYSERROR("Failed to set \"%s\" to \"%s\"",
2228 cg->subsystem, cg->value);
2229 return false;
2230 }
2231 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2232 }
2233
2234 INFO("Limits for the unified cgroup hierarchy have been setup");
2235 return true;
2236 }
2237
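/* Apply all configured cgroup limits for the container: first the legacy
 * (cgroup v1) settings, then the unified (cgroup v2) settings.
 */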
2238 static bool cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_conf *conf,
2239 bool do_devices)
2240 {
2241 bool bret;
2242
2243 bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
2244 if (!bret)
2245 return false;
2246
2247 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2248 }
2249
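/* Return true if every controller in @controllers is also listed in
 * lxc.cgroup.use (ops->cgroup_use). If lxc.cgroup.use is not set, all
 * controllers are accepted.
 */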
2250 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2251 char **controllers)
2252 {
2253 char **cur_ctrl, **cur_use;
2254
2255 if (!ops->cgroup_use)
2256 return true;
2257
2258 for (cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2259 bool found = false;
2260
2261 for (cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2262 if (strcmp(*cur_use, *cur_ctrl) != 0)
2263 continue;
2264
2265 found = true;
2266 break;
2267 }
2268
2269 if (found)
2270 continue;
2271
2272 return false;
2273 }
2274
2275 return true;
2276 }
2277
2278 /* At startup, cg_hybrid_init() finds all the info we need about cgroup
2279 * mountpoints and current cgroups, and stores it in @ops.
2280 */
2281 static bool cg_hybrid_init(struct cgroup_ops *ops)
2282 {
2283 int ret;
2284 char *basecginfo;
2285 bool will_escape;
2286 FILE *f;
2287 size_t len = 0;
2288 char *line = NULL;
2289 char **klist = NULL, **nlist = NULL;
2290
2291 /* Root-spawned containers escape the current cgroup, so use init's
2292 * cgroups as our base in that case.
2293 */
2294 will_escape = (geteuid() == 0);
2295 if (will_escape)
2296 basecginfo = read_file("/proc/1/cgroup");
2297 else
2298 basecginfo = read_file("/proc/self/cgroup");
2299 if (!basecginfo)
2300 return false;
2301
2302 ret = get_existing_subsystems(&klist, &nlist);
2303 if (ret < 0) {
2304 ERROR("Failed to retrieve available legacy cgroup controllers");
2305 free(basecginfo);
2306 return false;
2307 }
2308
2309 f = fopen("/proc/self/mountinfo", "r");
2310 if (!f) {
2311 ERROR("Failed to open \"/proc/self/mountinfo\"");
2312 free(basecginfo);
2313 return false;
2314 }
2315
2316 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2317
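/* Each mountinfo line is inspected to decide whether it describes a cgroup
 * v1 or cgroup v2 mount. A legacy line typically looks something like
 *	35 25 0:30 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 * (field values are illustrative), from which the controller list, the
 * mountpoint and the current cgroup are extracted below.
 */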
2318 while (getline(&line, &len, f) != -1) {
2319 int type;
2320 bool writeable;
2321 struct hierarchy *new;
2322 char *base_cgroup = NULL, *mountpoint = NULL;
2323 char **controller_list = NULL;
2324
2325 type = get_cgroup_version(line);
2326 if (type == 0)
2327 continue;
2328
2329 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2330 continue;
2331
2332 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2333 if (type == CGROUP2_SUPER_MAGIC)
2334 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2335 else if (type == CGROUP_SUPER_MAGIC)
2336 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2337 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2338 if (type == CGROUP_SUPER_MAGIC)
2339 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2340 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2341 if (type == CGROUP2_SUPER_MAGIC)
2342 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2343 }
2344
2345 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2346 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2347 continue;
2348
2349 if (type == CGROUP_SUPER_MAGIC)
2350 if (controller_list_is_dup(ops->hierarchies, controller_list))
2351 goto next;
2352
2353 mountpoint = cg_hybrid_get_mountpoint(line);
2354 if (!mountpoint) {
2355 ERROR("Failed parsing mountpoint from \"%s\"", line);
2356 goto next;
2357 }
2358
2359 if (type == CGROUP_SUPER_MAGIC)
2360 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2361 else
2362 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2363 if (!base_cgroup) {
2364 ERROR("Failed to find current cgroup");
2365 goto next;
2366 }
2367
2368 trim(base_cgroup);
2369 prune_init_scope(base_cgroup);
2370 if (type == CGROUP2_SUPER_MAGIC)
2371 writeable = test_writeable_v2(mountpoint, base_cgroup);
2372 else
2373 writeable = test_writeable_v1(mountpoint, base_cgroup);
2374 if (!writeable)
2375 goto next;
2376
2377 if (type == CGROUP2_SUPER_MAGIC) {
2378 char *cgv2_ctrl_path;
2379
2380 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2381 "cgroup.controllers",
2382 NULL);
2383
2384 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2385 free(cgv2_ctrl_path);
2386 if (!controller_list) {
2387 controller_list = cg_unified_make_empty_controller();
2388 TRACE("No controllers are enabled for "
2389 "delegation in the unified hierarchy");
2390 }
2391 }
2392
2393 /* Exclude all controllers that cgroup use does not want. */
2394 if (!cgroup_use_wants_controllers(ops, controller_list))
2395 goto next;
2396
2397 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2398 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
2399 ops->unified = new;
2400
2401 continue;
2402
2403 next:
2404 free_string_list(controller_list);
2405 free(mountpoint);
2406 free(base_cgroup);
2407 }
2408
2409 free_string_list(klist);
2410 free_string_list(nlist);
2411
2412 free(basecginfo);
2413
2414 fclose(f);
2415 free(line);
2416
2417 TRACE("Writable cgroup hierarchies:");
2418 lxc_cgfsng_print_hierarchies(ops);
2419
2420 /* Verify that all controllers listed in lxc.cgroup.use and all crucial
2421 * controllers are accounted for.
2422 */
2423 if (!all_controllers_found(ops))
2424 return false;
2425
2426 return true;
2427 }
2428
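/* Check whether /sys/fs/cgroup is itself a cgroup2 mount, i.e. whether the
 * host uses a pure unified cgroup layout. Returns CGROUP2_SUPER_MAGIC if it
 * is, 0 if it is not, and -ENOMEDIUM if /sys/fs/cgroup cannot be inspected.
 */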
2429 static int cg_is_pure_unified(void)
2430 {
2432 int ret;
2433 struct statfs fs;
2434
2435 ret = statfs("/sys/fs/cgroup", &fs);
2436 if (ret < 0)
2437 return -ENOMEDIUM;
2438
2439 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2440 return CGROUP2_SUPER_MAGIC;
2441
2442 return 0;
2443 }
2444
2445 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
2446 static char *cg_unified_get_current_cgroup(void)
2447 {
2448 char *basecginfo, *base_cgroup;
2449 bool will_escape;
2450 char *copy = NULL;
2451
2452 will_escape = (geteuid() == 0);
2453 if (will_escape)
2454 basecginfo = read_file("/proc/1/cgroup");
2455 else
2456 basecginfo = read_file("/proc/self/cgroup");
2457 if (!basecginfo)
2458 return NULL;
2459
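/* The unified hierarchy entry in /proc/1/cgroup or /proc/self/cgroup has
 * the form "0::/some/path"; skip the "0::" prefix but keep the leading "/".
 */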
2460 base_cgroup = strstr(basecginfo, "0::/");
2461 if (!base_cgroup)
2462 goto cleanup_on_err;
2463
2464 base_cgroup = base_cgroup + 3;
2465 copy = copy_to_eol(base_cgroup);
2466 if (!copy)
2467 goto cleanup_on_err;
2468
2469 cleanup_on_err:
2470 free(basecginfo);
2471 if (copy)
2472 trim(copy);
2473
2474 return copy;
2475 }
2476
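/* Initialize the cgroup driver for a pure unified (cgroup2-only) layout.
 * Returns CGROUP2_SUPER_MAGIC if the unified hierarchy was set up, 0 if the
 * host is not running a pure unified layout, and a negative errno on error.
 */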
2477 static int cg_unified_init(struct cgroup_ops *ops)
2478 {
2479 int ret;
2480 char *mountpoint, *subtree_path;
2481 char **delegatable;
2482 char *base_cgroup = NULL;
2483
2484 ret = cg_is_pure_unified();
2485 if (ret == -ENOMEDIUM)
2486 return -ENOMEDIUM;
2487
2488 if (ret != CGROUP2_SUPER_MAGIC)
2489 return 0;
2490
2491 base_cgroup = cg_unified_get_current_cgroup();
2492 if (!base_cgroup)
2493 return -EINVAL;
2494 prune_init_scope(base_cgroup);
2495
2496 /* We assume that we have already been given controllers to delegate
2497 * further down the hierarchy. If not, it is up to the user to delegate
2498 * them to us.
2499 */
2500 mountpoint = must_copy_string("/sys/fs/cgroup");
2501 subtree_path = must_make_path(mountpoint, base_cgroup,
2502 "cgroup.subtree_control", NULL);
2503 delegatable = cg_unified_get_controllers(subtree_path);
2504 free(subtree_path);
2505 if (!delegatable)
2506 delegatable = cg_unified_make_empty_controller();
2507 if (!delegatable[0])
2508 TRACE("No controllers are enabled for delegation");
2509
2510 /* TODO: If the user requested specific controllers via lxc.cgroup.use
2511 * we should verify them here. The reason I'm not doing it right now is
2512 * that I'm not convinced that lxc.cgroup.use is the way forward, since it
2513 * is a global property. I would much rather have an option that lets you
2514 * request controllers per container.
2515 */
2516
2517 add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2518
2519 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2520 return CGROUP2_SUPER_MAGIC;
2521 }
2522
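/* Top-level cgroup driver initialization: parse lxc.cgroup.use into
 * ops->cgroup_use, then try a pure unified setup and fall back to
 * hybrid/legacy hierarchy detection.
 */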
2523 static bool cg_init(struct cgroup_ops *ops)
2524 {
2525 int ret;
2526 const char *tmp;
2527
2528 tmp = lxc_global_config_value("lxc.cgroup.use");
2529 if (tmp) {
2530 char *chop, *cur, *pin;
2531
2532 pin = must_copy_string(tmp);
2533 chop = pin;
2534
2535 lxc_iterate_parts(cur, chop, ",") {
2536 must_append_string(&ops->cgroup_use, cur);
2537 }
2538
2539 free(pin);
2540 }
2541
2542 ret = cg_unified_init(ops);
2543 if (ret < 0)
2544 return false;
2545
2546 if (ret == CGROUP2_SUPER_MAGIC)
2547 return true;
2548
2549 return cg_hybrid_init(ops);
2550 }
2551
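/* Record the configured cgroup path pattern (lxc.cgroup.pattern) for later
 * use by the driver.
 */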
2552 static bool cgfsng_data_init(struct cgroup_ops *ops)
2553 {
2554 const char *cgroup_pattern;
2555
2556 /* copy system-wide cgroup information */
2557 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2558 if (!cgroup_pattern) {
2559 /* lxc.cgroup.pattern is only NULL on error. */
2560 ERROR("Failed to retrieve cgroup pattern");
2561 return false;
2562 }
2563 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2564
2565 return true;
2566 }
2567
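/* Allocate and initialize the cgroup_ops structure for the "cgfsng" driver,
 * detect the host's cgroup layout via cg_init(), and wire up the operation
 * callbacks. Returns NULL on failure.
 */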
2568 struct cgroup_ops *cgfsng_ops_init(void)
2569 {
2570 struct cgroup_ops *cgfsng_ops;
2571
2572 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2573 if (!cgfsng_ops)
2574 return NULL;
2575
2576 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2577 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2578
2579 if (!cg_init(cgfsng_ops)) {
2580 free(cgfsng_ops);
2581 return NULL;
2582 }
2583
2584 cgfsng_ops->data_init = cgfsng_data_init;
2585 cgfsng_ops->destroy = cgfsng_destroy;
2586 cgfsng_ops->create = cgfsng_create;
2587 cgfsng_ops->enter = cgfsng_enter;
2588 cgfsng_ops->escape = cgfsng_escape;
2589 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2590 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2591 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2592 cgfsng_ops->get = cgfsng_get;
2593 cgfsng_ops->set = cgfsng_set;
2594 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2595 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2596 cgfsng_ops->driver = "cgfsng";
2597 cgfsng_ops->version = "1.0.0";
2598 cgfsng_ops->attach = cgfsng_attach;
2599 cgfsng_ops->chown = cgfsng_chown;
2600 cgfsng_ops->mount = cgfsng_mount;
2601 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2602
2603 return cgfsng_ops;
2604 }