]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
secure coding: cgfsng: strncat, strlcpy
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 * Christian Brauner <christian.brauner@ubuntu.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /*
26 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
30 * each controller.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
36
37 #include "config.h"
38
39 #include <ctype.h>
40 #include <dirent.h>
41 #include <errno.h>
42 #include <grp.h>
43 #include <stdint.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <linux/kdev_t.h>
49 #include <linux/types.h>
50 #include <sys/types.h>
51
52 #include "caps.h"
53 #include "cgroup.h"
54 #include "cgroup_utils.h"
55 #include "commands.h"
56 #include "conf.h"
57 #include "log.h"
58 #include "storage/storage.h"
59 #include "utils.h"
60
61 #ifndef HAVE_STRLCPY
62 #include "include/strlcpy.h"
63 #endif
64
65 lxc_log_define(lxc_cgfsng, lxc);
66
/* Release every string stored in the NULL-terminated array @clist and then the
 * array itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (clist == NULL)
		return;

	cur = clist;
	while (*cur != NULL) {
		free(*cur);
		cur++;
	}

	free(clist);
}
79
/* Allocate @sz bytes by delegating to must_realloc() with no previous buffer.
 * Do not fail.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
85
/* Grow the NULL-terminated pointer array *@list by one slot (a NULL *@list is
 * treated as an empty array). The new last slot is set to NULL so the array
 * stays terminated. Do not fail.
 *
 * Returns the index of the slot just before the new terminator, i.e. the slot
 * the caller is expected to fill in.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list != NULL) {
		while ((*list)[count] != NULL)
			count++;
	}

	/* One slot for the new entry, one for the terminator. */
	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
103
/* Report whether @entry compares equal (strcmp) to any string in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (list == NULL)
		return false;

	for (cur = list; *cur != NULL; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
120
/* Build a freshly allocated string consisting of "name=" followed by @entry,
 * e.g. "systemd" becomes "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	static const char prefix[] = "name=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t entry_len = strlen(entry);
	char *result;

	/* prefix + entry + terminating NUL */
	result = must_alloc(prefix_len + entry_len + 1);

	memcpy(result, prefix, prefix_len);
	memcpy(result + prefix_len, entry, entry_len);
	result[prefix_len + entry_len] = '\0';

	return result;
}
137
/* Append a copy of controller @entry to the NULL-terminated list *@clist.
 * *@clist must be NULL on the first call. Do not fail.
 *
 * @klist holds the kernel subsystems, @nlist the named subsystems:
 * - an entry present in both lists is ambiguous and is refused (logged, not
 *   appended);
 * - an entry already starting with "name=" or found in @klist is copied as is;
 * - anything else is treated as a named subsystem and gets "name=" prepended.
 *
 * (TODO: the ambiguous case could sometimes be worked around by remounting to
 * be unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry of *@clist is always NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	bool is_kernel;
	int slot;

	is_kernel = string_in_list(klist, entry);
	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}
172
173 /* Given a handler's cgroup data, return the struct hierarchy for the controller
174 * @c, or NULL if there is none.
175 */
176 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *c)
177 {
178 int i;
179
180 if (!ops->hierarchies)
181 return NULL;
182
183 for (i = 0; ops->hierarchies[i]; i++) {
184 if (!c) {
185 /* This is the empty unified hierarchy. */
186 if (ops->hierarchies[i]->controllers &&
187 !ops->hierarchies[i]->controllers[0])
188 return ops->hierarchies[i];
189
190 continue;
191 }
192
193 if (string_in_list(ops->hierarchies[i]->controllers, c))
194 return ops->hierarchies[i];
195 }
196
197 return NULL;
198 }
199
#define BATCH_SIZE 50
/* Make sure *@mem can hold @newlen bytes, growing it in multiples of
 * BATCH_SIZE. Nothing is reallocated when a buffer already exists and
 * @newlen falls into a batch count no larger than that of @oldlen.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
210
/* Append the @newlen bytes of @new plus its terminating NUL to *@dest, which
 * currently holds @oldlen bytes. *@dest is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	char *tail;

	/* Room for the existing data, the new data and the NUL byte. */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	tail = *dest + oldlen;
	memcpy(tail, new, newlen + 1);
}
219
/* Read the whole file @fnam line by line into one heap-allocated,
 * NUL-terminated buffer.
 *
 * Returns NULL when the file cannot be opened or when no line could be read.
 * The caller frees the returned buffer.
 */
static char *read_file(const char *fnam)
{
	FILE *f;
	char *content = NULL, *line = NULL;
	size_t linecap = 0, total = 0;
	int nread;

	f = fopen(fnam, "r");
	if (f == NULL)
		return NULL;

	for (;;) {
		nread = getline(&line, &linecap, f);
		if (nread == -1)
			break;

		append_line(&content, total, line, nread);
		total += nread;
	}

	fclose(f);
	free(line);

	return content;
}
239
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* The helpers below address bit @bit of a bitmap stored as an array of
 * uint32_t words. The mask is built from an unsigned 32-bit one: shifting a
 * plain (signed) int 1 left by 31 is undefined behaviour.
 */

/* Set bit @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
259
/* Build a cpumask from a cpulist, i.e. turn
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenized in place on ','. @nbits is the number of bits the mask
 * must be able to hold. Returns a calloc()ed array the caller must free, or
 * NULL on allocation failure, on an inverted range, or when a cpu number does
 * not fit into @nbits bits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *state = NULL;
	char *tok;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (mask == NULL)
		return NULL;

	tok = strtok_r(buf, ",", &state);
	while (tok != NULL) {
		unsigned first, last;
		char *dash;

		errno = 0;

		/* A token is either a single cpu "N" or a range "N-M". */
		first = strtoul(tok, NULL, 0);
		dash = strchr(tok, '-');
		if (dash != NULL)
			last = strtoul(dash + 1, NULL, 0);
		else
			last = first;

		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, mask);

		tok = strtok_r(NULL, ",", &state);
	}

	return mask;
}
307
308 /* Turn cpumask into simple, comma-separated cpulist. */
309 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
310 {
311 int ret;
312 size_t i;
313 char **cpulist = NULL;
314 char numstr[LXC_NUMSTRLEN64] = {0};
315
316 for (i = 0; i <= nbits; i++) {
317 if (!is_set(i, bitarr))
318 continue;
319
320 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
321 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
322 lxc_free_array((void **)cpulist, free);
323 return NULL;
324 }
325
326 ret = lxc_append_string(&cpulist, numstr);
327 if (ret < 0) {
328 lxc_free_array((void **)cpulist, free);
329 return NULL;
330 }
331 }
332
333 if (!cpulist)
334 return NULL;
335
336 return lxc_string_join(",", (const char **)cpulist, false);
337 }
338
/* Return the number that follows the last ',' or '-' in @cpulist (whichever
 * comes later), or the number at the start of @cpulist when it contains
 * neither separator. For a cpulist such as "0-3,8-15" this is 15.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last_comma, *last_dash;
	char *last_num = cpulist;
	size_t cpus = 0;

	last_comma = strrchr(cpulist, ',');
	last_dash = strrchr(cpulist, '-');

	/* Only compare the two pointers when both exist: ordering a pointer
	 * against NULL is not defined.
	 */
	if (last_comma && last_dash)
		last_num = (last_comma > last_dash ? last_comma : last_dash) + 1;
	else if (last_comma)
		last_num = last_comma + 1;
	else if (last_dash)
		last_num = last_dash + 1;

	errno = 0;
	cpus = strtoul(last_num, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
369
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/* Write a "cpuset.cpus" value for the cgroup directory @path that is derived
 * from the parent directory's "cpuset.cpus" with any cpus listed in
 * __ISOL_CPUS removed.
 *
 * - If no isolated cpus are exposed or listed: the parent's value is copied
 *   verbatim when @am_initialized is false, nothing is written otherwise.
 * - If isolated cpus exist but none of them is in the parent's set, nothing
 *   is written.
 * - Otherwise the filtered list is written to @path/cpuset.cpus.
 *
 * @path is cut at its last '/' while the parent is read; it is always restored
 * before returning so the caller sees its string unchanged.
 *
 * Returns true on success, false on failure.
 */
static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
{
	int ret;
	ssize_t i;
	char *lastslash, *fpath, oldv;
	ssize_t maxisol = 0, maxposs = 0;
	char *cpulist = NULL, *isolcpus = NULL, *posscpus = NULL;
	uint32_t *isolmask = NULL, *possmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) {
		ERROR("Failed to detect \"/\" in \"%s\"", path);
		return bret;
	}

	/* Temporarily turn @path into its parent directory. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Failed to read file \"%s\"", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpu settings of parent cgroup");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Failed to read file \""__ISOL_CPUS"\"");
		goto on_error;
	}
	/* <ctype.h> functions require a value representable as unsigned char. */
	if (!isdigit((unsigned char)isolcpus[0])) {
		TRACE("No isolated cpus detected");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpu settings of parent cgroup");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is a bit count: highest cpu number + 1. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Failed to create cpumask for possible cpus");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Failed to create cpumask for isolated cpus");
		goto on_error;
	}

	/* Both masks hold bits 0 .. maxposs - 1; bit maxposs may lie outside
	 * the allocation and must not be touched.
	 */
	for (i = 0; i < maxposs; i++) {
		if (!is_set(i, isolmask) || !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	if (!flipped_bit) {
		/* NOTE(review): nothing is written to cpuset.cpus on this path,
		 * even when @am_initialized is false - confirm this is intended.
		 */
		DEBUG("No isolated cpus present in cpuset");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Failed to create cpu list");
		goto on_error;
	}

copy_parent:
	/* Switch back from the parent directory to @path itself. */
	*lastslash = oldv;
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false, 0666);
	if (ret < 0) {
		SYSERROR("Failed to write cpu list to \"%s\"", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	/* Never hand a truncated @path back to the caller. */
	*lastslash = oldv;

	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once. */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
507
/* Copy contents of parent(@path)/@file to @path/@file.
 *
 * The parent directory is obtained by cutting @path at its last '/'. @path is
 * restored as soon as the parent's file name has been built, so the caller's
 * string is intact on every return path.
 *
 * Returns true on success, false if @path contains no '/', if the parent file
 * cannot be read (or is empty), or if the write fails.
 */
static bool copy_parent_file(char *path, char *file)
{
	int ret;
	char *fpath, *lastslash, oldv;
	int len = 0;
	char *value = NULL;

	lastslash = strrchr(path, '/');
	if (!lastslash) {
		ERROR("Failed to detect \"/\" in \"%s\"", path);
		return false;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* First call only determines the length of the content. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto on_error;

	value = must_alloc(len + 1);
	ret = lxc_read_from_file(fpath, value, len);
	if (ret != len)
		goto on_error;
	/* Terminate the buffer: it is printed with "%s" below. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false, 0666);
	if (ret < 0)
		SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

on_error:
	SYSERROR("Failed to read file \"%s\"", fpath);
	free(fpath);
	free(value);
	return false;
}
549
550 /* Initialize the cpuset hierarchy in first directory of @gname and set
551 * cgroup.clone_children so that children inherit settings. Since the
552 * h->base_path is populated by init or ourselves, we know it is already
553 * initialized.
554 */
555 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
556 {
557 int ret;
558 char v;
559 char *cgpath, *clonechildrenpath, *slash;
560
561 if (!string_in_list(h->controllers, "cpuset"))
562 return true;
563
564 if (*cgname == '/')
565 cgname++;
566 slash = strchr(cgname, '/');
567 if (slash)
568 *slash = '\0';
569
570 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
571 if (slash)
572 *slash = '/';
573
574 ret = mkdir(cgpath, 0755);
575 if (ret < 0) {
576 if (errno != EEXIST) {
577 SYSERROR("Failed to create directory \"%s\"", cgpath);
578 free(cgpath);
579 return false;
580 }
581 }
582
583 clonechildrenpath =
584 must_make_path(cgpath, "cgroup.clone_children", NULL);
585 /* unified hierarchy doesn't have clone_children */
586 if (!file_exists(clonechildrenpath)) {
587 free(clonechildrenpath);
588 free(cgpath);
589 return true;
590 }
591
592 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
593 if (ret < 0) {
594 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
595 free(clonechildrenpath);
596 free(cgpath);
597 return false;
598 }
599
600 /* Make sure any isolated cpus are removed from cpuset.cpus. */
601 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
602 SYSERROR("Failed to remove isolated cpus");
603 free(clonechildrenpath);
604 free(cgpath);
605 return false;
606 }
607
608 /* Already set for us by someone else. */
609 if (v == '1') {
610 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
611 free(clonechildrenpath);
612 free(cgpath);
613 return true;
614 }
615
616 /* copy parent's settings */
617 if (!copy_parent_file(cgpath, "cpuset.mems")) {
618 SYSERROR("Failed to copy \"cpuset.mems\" settings");
619 free(cgpath);
620 free(clonechildrenpath);
621 return false;
622 }
623 free(cgpath);
624
625 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
626 if (ret < 0) {
627 /* Set clone_children so children inherit our settings */
628 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
629 free(clonechildrenpath);
630 return false;
631 }
632 free(clonechildrenpath);
633 return true;
634 }
635
/* Report whether the NULL-terminated string lists @l1 and @l2 share at least
 * one string. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (l1 == NULL || l2 == NULL)
		return false;

	for (cur = l1; *cur != NULL; cur++)
		if (string_in_list(l2, *cur))
			return true;

	return false;
}
653
654 /* For a null-terminated list of controllers @clist, return true if any of those
655 * controllers is already listed the null-terminated list of hierarchies @hlist.
656 * Realistically, if one is present, all must be present.
657 */
658 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
659 {
660 int i;
661
662 if (!hlist)
663 return false;
664
665 for (i = 0; hlist[i]; i++)
666 if (controller_lists_intersect(hlist[i]->controllers, clist))
667 return true;
668
669 return false;
670 }
671
672 /* Return true if the controller @entry is found in the null-terminated list of
673 * hierarchies @hlist.
674 */
675 static bool controller_found(struct hierarchy **hlist, char *entry)
676 {
677 int i;
678
679 if (!hlist)
680 return false;
681
682 for (i = 0; hlist[i]; i++)
683 if (string_in_list(hlist[i]->controllers, entry))
684 return true;
685
686 return false;
687 }
688
689 /* Return true if all of the controllers which we require have been found. The
690 * required list is freezer and anything in lxc.cgroup.use.
691 */
692 static bool all_controllers_found(struct cgroup_ops *ops)
693 {
694 char *p;
695 char *saveptr = NULL;
696 struct hierarchy **hlist = ops->hierarchies;
697
698 if (!controller_found(hlist, "freezer")) {
699 ERROR("No freezer controller mountpoint found");
700 return false;
701 }
702
703 if (!ops->cgroup_use)
704 return true;
705
706 for (; (p = strtok_r(ops->cgroup_use, ",", &saveptr)); ops->cgroup_use = NULL)
707 if (!controller_found(hlist, p)) {
708 ERROR("No %s controller mountpoint found", p);
709 return false;
710 }
711
712 return true;
713 }
714
715 /* Get the controllers from a mountinfo line There are other ways we could get
716 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
717 * could parse the mount options. But we simply assume that the mountpoint must
718 * be /sys/fs/cgroup/controller-list
719 */
720 static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
721 int type)
722 {
723 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
724 * for legacy hierarchies.
725 */
726 int i;
727 char *dup, *p2, *tok;
728 char *p = line, *saveptr = NULL, *sep = ",";
729 char **aret = NULL;
730
731 for (i = 0; i < 4; i++) {
732 p = strchr(p, ' ');
733 if (!p)
734 return NULL;
735 p++;
736 }
737
738 /* Note, if we change how mountinfo works, then our caller will need to
739 * verify /sys/fs/cgroup/ in this field.
740 */
741 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
742 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
743 return NULL;
744 }
745
746 p += 15;
747 p2 = strchr(p, ' ');
748 if (!p2) {
749 ERROR("Corrupt mountinfo");
750 return NULL;
751 }
752 *p2 = '\0';
753
754 if (type == CGROUP_SUPER_MAGIC) {
755 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will
756 * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
757 */
758 dup = strdup(p);
759 if (!dup)
760 return NULL;
761
762 for (tok = strtok_r(dup, sep, &saveptr); tok;
763 tok = strtok_r(NULL, sep, &saveptr))
764 must_append_controller(klist, nlist, &aret, tok);
765
766 free(dup);
767 }
768 *p2 = ' ';
769
770 return aret;
771 }
772
/* Return a newly allocated controller list that contains no controllers: both
 * the usable slot and the terminator are NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **list = NULL;
	int slot = append_null_to_list((void ***)&list);

	list[slot] = NULL;

	return list;
}
782
/* Read @file and return its whitespace-separated words (space, tab, newline)
 * as a NULL-terminated list of newly allocated strings.
 *
 * Returns NULL if the file cannot be read or contains no words.
 */
static char **cg_unified_get_controllers(const char *file)
{
	char *content, *name;
	char *state = NULL, *sep = " \t\n";
	char **list = NULL;

	content = read_file(file);
	if (content == NULL)
		return NULL;

	name = strtok_r(content, sep, &state);
	while (name != NULL) {
		int slot = append_null_to_list((void ***)&list);

		list[slot] = must_copy_string(name);
		name = strtok_r(NULL, sep, &state);
	}

	free(content);

	return list;
}
806
807 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
808 char *base_cgroup, int type)
809 {
810 struct hierarchy *new;
811 int newentry;
812
813 new = must_alloc(sizeof(*new));
814 new->controllers = clist;
815 new->mountpoint = mountpoint;
816 new->base_cgroup = base_cgroup;
817 new->fullcgpath = NULL;
818 new->version = type;
819
820 newentry = append_null_to_list((void ***)h);
821 (*h)[newentry] = new;
822 return new;
823 }
824
/* Return a newly allocated copy of the mountpoint found in @line, which is a
 * line from /proc/self/mountinfo. The mountpoint is the field after the first
 * four space-separated fields and must start with "/sys/fs/cgroup/".
 *
 * Returns NULL if the line does not have that shape. Note that on success the
 * space terminating the mountpoint in @line is overwritten with a NUL byte and
 * not restored.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int skipped = 0;
	size_t len;
	char *end, *copy;
	char *field = line;

	while (skipped < 4) {
		field = strchr(field, ' ');
		if (field == NULL)
			return NULL;
		field++;
		skipped++;
	}

	if (strncmp(field, "/sys/fs/cgroup/", 15))
		return NULL;

	end = strchr(field + 15, ' ');
	if (end == NULL)
		return NULL;
	*end = '\0';

	len = (size_t)(end - field);
	copy = must_alloc(len + 1);
	memcpy(copy, field, len);
	copy[len] = '\0';

	return copy;
}
856
/* Given a multi-line string @p, return a newly allocated, NUL-terminated copy
 * of everything up to (not including) the next '\n'. Returns NULL if @p
 * contains no '\n'.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t n;

	newline = strchr(p, '\n');
	if (newline == NULL)
		return NULL;

	n = (size_t)(newline - p);
	copy = must_alloc(n + 1);
	memcpy(copy, p, n);
	copy[n] = '\0';

	return copy;
}
872
/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present in the
 * comma-separated list that runs from @cgline up to the next ':'.
 *
 * The list is scanned in place, so no temporary copy (and no stack allocation
 * whose size depends on the input) is needed. Empty list items never match.
 *
 * Returns false if @cgline contains no ':'.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *seg, *end, *comma;
	size_t want;

	end = strchr(cgline, ':');
	if (!end)
		return false;

	want = strlen(c);
	for (seg = cgline; seg < end; seg = comma + 1) {
		size_t seglen;

		/* The current item ends at the next ',' or at the ':'. */
		comma = memchr(seg, ',', (size_t)(end - seg));
		if (!comma)
			comma = end;

		seglen = (size_t)(comma - seg);
		if (seglen > 0 && seglen == want && memcmp(seg, c, want) == 0)
			return true;
	}

	return false;
}
898
899 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
900 * @controller.
901 */
902 static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
903 int type)
904 {
905 char *p = basecginfo;
906
907 for (;;) {
908 bool is_cgv2_base_cgroup = false;
909
910 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
911 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
912 is_cgv2_base_cgroup = true;
913
914 p = strchr(p, ':');
915 if (!p)
916 return NULL;
917 p++;
918
919 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
920 p = strchr(p, ':');
921 if (!p)
922 return NULL;
923 p++;
924 return copy_to_eol(p);
925 }
926
927 p = strchr(p, '\n');
928 if (!p)
929 return NULL;
930 p++;
931 }
932 }
933
/* Append a copy of @entry to the NULL-terminated string list *@list, growing
 * the list by one slot.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
943
/* Parse /proc/self/cgroup and collect the subsystems listed there: entries
 * starting with "name=" are appended to *@nlist, everything else to *@klist.
 * A line with an empty controller field is recorded as "cgroup2" in *@klist.
 * Lines without two ':' separators are skipped.
 *
 * Returns 0 on success, -1 if the file cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t cap = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (f == NULL)
		return -1;

	while (getline(&line, &cap, f) != -1) {
		char *ctrl, *ctrl_end, *name, *state = NULL;

		ctrl = strchr(line, ':');
		if (ctrl == NULL)
			continue;
		ctrl++;

		ctrl_end = strchr(ctrl, ':');
		if (ctrl_end == NULL)
			continue;
		*ctrl_end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (ctrl_end == ctrl) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		name = strtok_r(ctrl, ",", &state);
		while (name != NULL) {
			bool named = strncmp(name, "name=", 5) == 0;

			must_append_string(named ? nlist : klist, name);
			name = strtok_r(NULL, ",", &state);
		}
	}

	free(line);
	fclose(f);

	return 0;
}
990
/* Strip trailing '\n' characters from @s in place. The first character of the
 * string is never removed, so a string consisting only of newlines is reduced
 * to a single "\n".
 */
static void trim(char *s)
{
	char *end;
	size_t n = strlen(s);

	if (n < 2)
		return;

	for (end = s + n - 1; end > s && *end == '\n'; end--)
		*end = '\0';
}
999
1000 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
1001 {
1002 int i;
1003 struct hierarchy **it;
1004
1005 if (!ops->hierarchies) {
1006 TRACE(" No hierarchies found");
1007 return;
1008 }
1009
1010 TRACE(" Hierarchies:");
1011 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
1012 int j;
1013 char **cit;
1014
1015 TRACE(" %d: base_cgroup: %s", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1016 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1017 TRACE(" controllers:");
1018 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1019 TRACE(" %d: %s", j, *cit);
1020 }
1021 }
1022
/* Emit a TRACE-level dump of @basecginfo followed by the kernel subsystems in
 * @klist and the named subsystems in @nlist. Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					       char **nlist)
{
	int idx;
	char **cur;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	idx = 0;
	cur = klist;
	while (cur && *cur) {
		TRACE("kernel subsystem %d: %s", idx, *cur);
		cur++;
		idx++;
	}

	idx = 0;
	cur = nlist;
	while (cur && *cur) {
		TRACE("named subsystem %d: %s", idx, *cur);
		cur++;
		idx++;
	}
}
1038
/* Remove the directory @dirname and all directories below it. Entries that
 * are not directories are skipped (not unlinked).
 *
 * Processing continues after a failure so that as much as possible is removed;
 * only the first failure at this level is logged.
 *
 * Returns 0 if everything succeeded, -1 if @dirname could not be opened or any
 * step failed.
 */
static int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat \"%s\"", pathname);
			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;
	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("%s - Failed to delete \"%s\"", strerror(errno), dirname);
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* This is a close failure, not a delete failure. */
		if (!r)
			WARN("%s - Failed to close directory \"%s\"", strerror(errno), dirname);
		r = -1;
	}

	return r;
}
1094
1095 static int cgroup_rmdir(struct hierarchy **hierarchies,
1096 const char *container_cgroup)
1097 {
1098 int i;
1099
1100 if (!container_cgroup || !hierarchies)
1101 return 0;
1102
1103 for (i = 0; hierarchies[i]; i++) {
1104 int ret;
1105 struct hierarchy *h = hierarchies[i];
1106
1107 if (!h->fullcgpath)
1108 continue;
1109
1110 ret = recursive_destroy(h->fullcgpath);
1111 if (ret < 0)
1112 WARN("Failed to destroy \"%s\"", h->fullcgpath);
1113
1114 free(h->fullcgpath);
1115 h->fullcgpath = NULL;
1116 }
1117
1118 return 0;
1119 }
1120
/* Argument bundle for callbacks that receive a single void pointer, such as
 * cgroup_rmdir_wrapper().
 */
struct generic_userns_exec_data {
	/* NULL-terminated list of hierarchies to operate on */
	struct hierarchy **hierarchies;
	/* cgroup of the container */
	const char *container_cgroup;
	/* container configuration */
	struct lxc_conf *conf;
	/* target uid in parent namespace */
	uid_t origuid;
	char *path;
};
1128
1129 static int cgroup_rmdir_wrapper(void *data)
1130 {
1131 int ret;
1132 struct generic_userns_exec_data *arg = data;
1133 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1134 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1135
1136 ret = setresgid(nsgid, nsgid, nsgid);
1137 if (ret < 0) {
1138 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1139 (int)nsgid, (int)nsgid);
1140 return -1;
1141 }
1142
1143 ret = setresuid(nsuid, nsuid, nsuid);
1144 if (ret < 0) {
1145 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1146 (int)nsuid, (int)nsuid);
1147 return -1;
1148 }
1149
1150 ret = setgroups(0, NULL);
1151 if (ret < 0 && errno != EPERM) {
1152 SYSERROR("Failed to setgroups(0, NULL)");
1153 return -1;
1154 }
1155
1156 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1157 }
1158
1159 static void cgfsng_destroy(struct cgroup_ops *ops, struct lxc_handler *handler)
1160 {
1161 int ret;
1162 struct generic_userns_exec_data wrap;
1163
1164 wrap.origuid = 0;
1165 wrap.container_cgroup = ops->container_cgroup;
1166 wrap.hierarchies = ops->hierarchies;
1167 wrap.conf = handler->conf;
1168
1169 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1170 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
1171 "cgroup_rmdir_wrapper");
1172 else
1173 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
1174 if (ret < 0) {
1175 WARN("Failed to destroy cgroups");
1176 return;
1177 }
1178 }
1179
1180 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1181 {
1182 size_t i, parts_len;
1183 char **it;
1184 size_t full_len = 0;
1185 char *add_controllers = NULL, *cgroup = NULL;
1186 char **parts = NULL;
1187 bool bret = false;
1188
1189 if (h->version != CGROUP2_SUPER_MAGIC)
1190 return true;
1191
1192 if (!h->controllers)
1193 return true;
1194
1195 /* For now we simply enable all controllers that we have detected by
1196 * creating a string like "+memory +pids +cpu +io".
1197 * TODO: In the near future we might want to support "-<controller>"
1198 * etc. but whether supporting semantics like this make sense will need
1199 * some thinking.
1200 */
1201 for (it = h->controllers; it && *it; it++) {
1202 full_len += strlen(*it) + 2;
1203 add_controllers = must_realloc(add_controllers, full_len + 1);
1204
1205 if (h->controllers[0] == *it)
1206 add_controllers[0] = '\0';
1207
1208 strncat(add_controllers, "+", 1);
1209 strncat(add_controllers, *it, strlen(*it));
1210
1211 if ((it + 1) && *(it + 1))
1212 strncat(add_controllers, " ", 1);
1213 }
1214
1215 parts = lxc_string_split(cgname, '/');
1216 if (!parts)
1217 goto on_error;
1218
1219 parts_len = lxc_array_len((void **)parts);
1220 if (parts_len > 0)
1221 parts_len--;
1222
1223 cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
1224 for (i = 0; i < parts_len; i++) {
1225 int ret;
1226 char *target;
1227
1228 cgroup = must_append_path(cgroup, parts[i], NULL);
1229 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1230 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1231 free(target);
1232 if (ret < 0) {
1233 SYSERROR("Could not enable \"%s\" controllers in the "
1234 "unified cgroup \"%s\"", add_controllers, cgroup);
1235 goto on_error;
1236 }
1237 }
1238
1239 bret = true;
1240
1241 on_error:
1242 lxc_free_array((void **)parts, free);
1243 free(add_controllers);
1244 free(cgroup);
1245 return bret;
1246 }
1247
1248 static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1249 {
1250 int ret;
1251
1252 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1253 if (dir_exists(h->fullcgpath)) {
1254 ERROR("The cgroup \"%s\" already existed", h->fullcgpath);
1255 return false;
1256 }
1257
1258 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1259 ERROR("Failed to handle legacy cpuset controller");
1260 return false;
1261 }
1262
1263 ret = mkdir_p(h->fullcgpath, 0755);
1264 if (ret < 0) {
1265 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
1266 return false;
1267 }
1268
1269 return cg_unified_create_cgroup(h, cgname);
1270 }
1271
1272 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1273 {
1274 int ret;
1275
1276 ret = rmdir(h->fullcgpath);
1277 if (ret < 0)
1278 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", h->fullcgpath);
1279
1280 free(h->fullcgpath);
1281 h->fullcgpath = NULL;
1282 }
1283
1284 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1285 * next cgroup_pattern-1, -2, ..., -999.
1286 */
1287 static inline bool cgfsng_create(struct cgroup_ops *ops,
1288 struct lxc_handler *handler)
1289 {
1290 int i;
1291 size_t len;
1292 char *container_cgroup, *offset, *tmp;
1293 int idx = 0;
1294 struct lxc_conf *conf = handler->conf;
1295
1296 if (ops->container_cgroup) {
1297 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
1298 return false;
1299 }
1300
1301 if (!conf)
1302 return false;
1303
1304 if (conf->cgroup_meta.dir)
1305 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
1306 else
1307 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1308 if (!tmp) {
1309 ERROR("Failed expanding cgroup name pattern");
1310 return false;
1311 }
1312
1313 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1314 container_cgroup = must_alloc(len);
1315 (void)strlcpy(container_cgroup, tmp, len);
1316 free(tmp);
1317 offset = container_cgroup + len - 5;
1318
1319 again:
1320 if (idx == 1000) {
1321 ERROR("Too many conflicting cgroup names");
1322 goto out_free;
1323 }
1324
1325 if (idx) {
1326 int ret;
1327
1328 ret = snprintf(offset, 5, "-%d", idx);
1329 if (ret < 0 || (size_t)ret >= 5) {
1330 FILE *f = fopen("/dev/null", "w");
1331 if (f) {
1332 fprintf(f, "Workaround for GCC7 bug: "
1333 "https://gcc.gnu.org/bugzilla/"
1334 "show_bug.cgi?id=78969");
1335 fclose(f);
1336 }
1337 }
1338 }
1339
1340 for (i = 0; ops->hierarchies[i]; i++) {
1341 if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
1342 int j;
1343 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->fullcgpath);
1344 free(ops->hierarchies[i]->fullcgpath);
1345 ops->hierarchies[i]->fullcgpath = NULL;
1346 for (j = 0; j < i; j++)
1347 remove_path_for_hierarchy(ops->hierarchies[j], container_cgroup);
1348 idx++;
1349 goto again;
1350 }
1351 }
1352
1353 ops->container_cgroup = container_cgroup;
1354
1355 return true;
1356
1357 out_free:
1358 free(container_cgroup);
1359
1360 return false;
1361 }
1362
1363 static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid)
1364 {
1365 int i, len;
1366 char pidstr[25];
1367
1368 len = snprintf(pidstr, 25, "%d", pid);
1369 if (len < 0 || len >= 25)
1370 return false;
1371
1372 for (i = 0; ops->hierarchies[i]; i++) {
1373 int ret;
1374 char *fullpath;
1375
1376 fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
1377 "cgroup.procs", NULL);
1378 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
1379 if (ret != 0) {
1380 SYSERROR("Failed to enter cgroup \"%s\"", fullpath);
1381 free(fullpath);
1382 return false;
1383 }
1384 free(fullpath);
1385 }
1386
1387 return true;
1388 }
1389
/* chown() @path to @chown_uid:@chown_gid and then chmod() it to @chmod_mode.
 * Returns 0 if both calls succeed and -1 as soon as one of them fails; a
 * failure is logged as a warning.
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	if (chown(path, chown_uid, chown_gid) < 0) {
		WARN("%s - Failed to chown(%s, %d, %d)", strerror(errno), path,
		     (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		WARN("%s - Failed to chmod(%s, %d)", strerror(errno), path,
		     (int)chmod_mode);
		return -1;
	}

	return 0;
}
1411
1412 /* chgrp the container cgroups to container group. We leave
1413 * the container owner as cgroup owner. So we must make the
1414 * directories 775 so that the container can create sub-cgroups.
1415 *
1416 * Also chown the tasks and cgroup.procs files. Those may not
1417 * exist depending on kernel version.
1418 */
1419 static int chown_cgroup_wrapper(void *data)
1420 {
1421 int i, ret;
1422 uid_t destuid;
1423 struct generic_userns_exec_data *arg = data;
1424 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1425 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1426
1427 ret = setresgid(nsgid, nsgid, nsgid);
1428 if (ret < 0) {
1429 SYSERROR("Failed to setresgid(%d, %d, %d)",
1430 (int)nsgid, (int)nsgid, (int)nsgid);
1431 return -1;
1432 }
1433
1434 ret = setresuid(nsuid, nsuid, nsuid);
1435 if (ret < 0) {
1436 SYSERROR("Failed to setresuid(%d, %d, %d)",
1437 (int)nsuid, (int)nsuid, (int)nsuid);
1438 return -1;
1439 }
1440
1441 ret = setgroups(0, NULL);
1442 if (ret < 0 && errno != EPERM) {
1443 SYSERROR("Failed to setgroups(0, NULL)");
1444 return -1;
1445 }
1446
1447 destuid = get_ns_uid(arg->origuid);
1448
1449 for (i = 0; arg->hierarchies[i]; i++) {
1450 char *fullpath;
1451 char *path = arg->hierarchies[i]->fullcgpath;
1452
1453 ret = chowmod(path, destuid, nsgid, 0775);
1454 if (ret < 0)
1455 return -1;
1456
1457 /* Failures to chown() these are inconvenient but not
1458 * detrimental We leave these owned by the container launcher,
1459 * so that container root can write to the files to attach. We
1460 * chmod() them 664 so that container systemd can write to the
1461 * files (which systemd in wily insists on doing).
1462 */
1463
1464 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1465 fullpath = must_make_path(path, "tasks", NULL);
1466 (void)chowmod(fullpath, destuid, nsgid, 0664);
1467 free(fullpath);
1468 }
1469
1470 fullpath = must_make_path(path, "cgroup.procs", NULL);
1471 (void)chowmod(fullpath, destuid, nsgid, 0664);
1472 free(fullpath);
1473
1474 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1475 continue;
1476
1477 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
1478 (void)chowmod(fullpath, destuid, nsgid, 0664);
1479 free(fullpath);
1480
1481 fullpath = must_make_path(path, "cgroup.threads", NULL);
1482 (void)chowmod(fullpath, destuid, nsgid, 0664);
1483 free(fullpath);
1484 }
1485
1486 return 0;
1487 }
1488
1489 static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf)
1490 {
1491 struct generic_userns_exec_data wrap;
1492
1493 if (lxc_list_empty(&conf->id_map))
1494 return true;
1495
1496 wrap.origuid = geteuid();
1497 wrap.path = NULL;
1498 wrap.hierarchies = ops->hierarchies;
1499 wrap.conf = conf;
1500
1501 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1502 "chown_cgroup_wrapper") < 0) {
1503 ERROR("Error requesting cgroup chown in new user namespace");
1504 return false;
1505 }
1506
1507 return true;
1508 }
1509
1510 /* cgroup-full:* is done, no need to create subdirs */
1511 static bool cg_mount_needs_subdirs(int type)
1512 {
1513 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1514 return false;
1515
1516 return true;
1517 }
1518
1519 /* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1520 * remount controller ro if needed and bindmount the cgroupfs onto
1521 * controll/the/cg/path.
1522 */
1523 static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1524 char *controllerpath, char *cgpath,
1525 const char *container_cgroup)
1526 {
1527 int ret, remount_flags;
1528 char *sourcepath;
1529 int flags = MS_BIND;
1530
1531 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1532 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1533 if (ret < 0) {
1534 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1535 controllerpath, controllerpath);
1536 return -1;
1537 }
1538
1539 remount_flags = add_required_remount_flags(controllerpath,
1540 controllerpath,
1541 flags | MS_REMOUNT);
1542 ret = mount(controllerpath, controllerpath, "cgroup",
1543 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1544 NULL);
1545 if (ret < 0) {
1546 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
1547 return -1;
1548 }
1549
1550 INFO("Remounted %s read-only", controllerpath);
1551 }
1552
1553 sourcepath = must_make_path(h->mountpoint, h->base_cgroup,
1554 container_cgroup, NULL);
1555 if (type == LXC_AUTO_CGROUP_RO)
1556 flags |= MS_RDONLY;
1557
1558 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1559 if (ret < 0) {
1560 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1561 free(sourcepath);
1562 return -1;
1563 }
1564 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1565
1566 if (flags & MS_RDONLY) {
1567 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1568 flags | MS_REMOUNT);
1569 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1570 if (ret < 0) {
1571 SYSERROR("Failed to remount \"%s\" ro", cgpath);
1572 free(sourcepath);
1573 return -1;
1574 }
1575 INFO("Remounted %s read-only", cgpath);
1576 }
1577
1578 free(sourcepath);
1579 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1580 return 0;
1581 }
1582
1583 /* __cg_mount_direct
1584 *
1585 * Mount cgroup hierarchies directly without using bind-mounts. The main
1586 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1587 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1588 */
1589 static int __cg_mount_direct(int type, struct hierarchy *h,
1590 const char *controllerpath)
1591 {
1592 int ret;
1593 char *controllers = NULL;
1594 char *fstype = "cgroup2";
1595 unsigned long flags = 0;
1596
1597 flags |= MS_NOSUID;
1598 flags |= MS_NOEXEC;
1599 flags |= MS_NODEV;
1600 flags |= MS_RELATIME;
1601
1602 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1603 flags |= MS_RDONLY;
1604
1605 if (h->version != CGROUP2_SUPER_MAGIC) {
1606 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1607 if (!controllers)
1608 return -ENOMEM;
1609 fstype = "cgroup";
1610 }
1611
1612 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1613 free(controllers);
1614 if (ret < 0) {
1615 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1616 return -1;
1617 }
1618
1619 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1620 return 0;
1621 }
1622
/* Mount hierarchy @h at @controllerpath for a container that uses a cgroup
 * namespace. This is a plain direct mount; see __cg_mount_direct() for the
 * return values.
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	int ret;

	ret = __cg_mount_direct(type, h, controllerpath);
	return ret;
}
1628
1629 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1630 const char *controllerpath)
1631 {
1632 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1633 return 0;
1634
1635 return __cg_mount_direct(type, h, controllerpath);
1636 }
1637
1638 static bool cgfsng_mount(struct cgroup_ops *ops, struct lxc_handler *handler,
1639 const char *root, int type)
1640 {
1641 int i, ret;
1642 char *tmpfspath = NULL;
1643 bool has_cgns = false, retval = false, wants_force_mount = false;
1644
1645 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1646 return true;
1647
1648 if (type & LXC_AUTO_CGROUP_FORCE) {
1649 type &= ~LXC_AUTO_CGROUP_FORCE;
1650 wants_force_mount = true;
1651 }
1652
1653 if (!wants_force_mount){
1654 if (!lxc_list_empty(&handler->conf->keepcaps))
1655 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1656 else
1657 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1658 }
1659
1660 has_cgns = cgns_supported();
1661 if (has_cgns && !wants_force_mount)
1662 return true;
1663
1664 if (type == LXC_AUTO_CGROUP_NOSPEC)
1665 type = LXC_AUTO_CGROUP_MIXED;
1666 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1667 type = LXC_AUTO_CGROUP_FULL_MIXED;
1668
1669 /* Mount tmpfs */
1670 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1671 ret = safe_mount(NULL, tmpfspath, "tmpfs",
1672 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1673 "size=10240k,mode=755", root);
1674 if (ret < 0)
1675 goto on_error;
1676
1677 for (i = 0; ops->hierarchies[i]; i++) {
1678 char *controllerpath, *path2;
1679 struct hierarchy *h = ops->hierarchies[i];
1680 char *controller = strrchr(h->mountpoint, '/');
1681
1682 if (!controller)
1683 continue;
1684 controller++;
1685
1686 controllerpath = must_make_path(tmpfspath, controller, NULL);
1687 if (dir_exists(controllerpath)) {
1688 free(controllerpath);
1689 continue;
1690 }
1691
1692 ret = mkdir(controllerpath, 0755);
1693 if (ret < 0) {
1694 SYSERROR("Error creating cgroup path: %s", controllerpath);
1695 free(controllerpath);
1696 goto on_error;
1697 }
1698
1699 if (has_cgns && wants_force_mount) {
1700 /* If cgroup namespaces are supported but the container
1701 * will not have CAP_SYS_ADMIN after it has started we
1702 * need to mount the cgroups manually.
1703 */
1704 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
1705 free(controllerpath);
1706 if (ret < 0)
1707 goto on_error;
1708
1709 continue;
1710 }
1711
1712 ret = cg_mount_cgroup_full(type, h, controllerpath);
1713 if (ret < 0) {
1714 free(controllerpath);
1715 goto on_error;
1716 }
1717
1718 if (!cg_mount_needs_subdirs(type)) {
1719 free(controllerpath);
1720 continue;
1721 }
1722
1723 path2 = must_make_path(controllerpath, h->base_cgroup,
1724 ops->container_cgroup, NULL);
1725 ret = mkdir_p(path2, 0755);
1726 if (ret < 0) {
1727 free(controllerpath);
1728 free(path2);
1729 goto on_error;
1730 }
1731
1732 ret = cg_legacy_mount_controllers(type, h, controllerpath,
1733 path2, ops->container_cgroup);
1734 free(controllerpath);
1735 free(path2);
1736 if (ret < 0)
1737 goto on_error;
1738 }
1739 retval = true;
1740
1741 on_error:
1742 free(tmpfspath);
1743 return retval;
1744 }
1745
/* Sum the number of lines of the "cgroup.procs" files found in @dirname and
 * in every directory below it. A directory that cannot be opened counts as
 * 0, and so does a cgroup.procs file for which lxc_count_file_lines()
 * reports -1.
 */
static int recursive_count_nrtasks(char *dirname)
{
	DIR *dir;
	struct dirent *direntp;
	char *path;
	int lines;
	int count = 0;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((direntp = readdir(dir))) {
		struct stat mystat;

		if (strcmp(direntp->d_name, ".") == 0 ||
		    strcmp(direntp->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, direntp->d_name, NULL);

		/* Only descend into real directories; lstat() does not follow
		 * symlinks.
		 */
		if (lstat(path, &mystat) == 0 && S_ISDIR(mystat.st_mode))
			count += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	lines = lxc_count_file_lines(path);
	if (lines != -1)
		count += lines;
	free(path);

	(void)closedir(dir);

	return count;
}
1787
1788 static int cgfsng_nrtasks(struct cgroup_ops *ops)
1789 {
1790 int count;
1791 char *path;
1792
1793 if (!ops->container_cgroup || !ops->hierarchies)
1794 return -1;
1795
1796 path = must_make_path(ops->hierarchies[0]->fullcgpath, NULL);
1797 count = recursive_count_nrtasks(path);
1798 free(path);
1799 return count;
1800 }
1801
1802 /* Only root needs to escape to the cgroup of its init. */
1803 static bool cgfsng_escape(const struct cgroup_ops *ops)
1804 {
1805 int i;
1806
1807 if (geteuid())
1808 return true;
1809
1810 for (i = 0; ops->hierarchies[i]; i++) {
1811 int ret;
1812 char *fullpath;
1813
1814 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1815 ops->hierarchies[i]->base_cgroup,
1816 "cgroup.procs", NULL);
1817 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1818 if (ret != 0) {
1819 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1820 free(fullpath);
1821 return false;
1822 }
1823 free(fullpath);
1824 }
1825
1826 return true;
1827 }
1828
1829 static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1830 {
1831 int i;
1832
1833 for (i = 0; ops->hierarchies[i]; i++)
1834 ;
1835
1836 return i;
1837 }
1838
1839 static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1840 {
1841 int i;
1842
1843 /* sanity check n */
1844 for (i = 0; i < n; i++)
1845 if (!ops->hierarchies[i])
1846 return false;
1847
1848 *out = ops->hierarchies[i]->controllers;
1849
1850 return true;
1851 }
1852
1853 #define THAWED "THAWED"
1854 #define THAWED_LEN (strlen(THAWED))
1855
1856 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1857 * to be adapted.
1858 */
1859 static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1860 {
1861 int ret;
1862 char *fullpath;
1863 struct hierarchy *h;
1864
1865 h = get_hierarchy(ops, "freezer");
1866 if (!h)
1867 return false;
1868
1869 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1870 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1871 free(fullpath);
1872 if (ret < 0)
1873 return false;
1874
1875 return true;
1876 }
1877
1878 static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1879 const char *controller)
1880 {
1881 struct hierarchy *h;
1882
1883 h = get_hierarchy(ops, controller);
1884 if (!h) {
1885 WARN("Failed to find hierarchy for controller \"%s\"",
1886 controller ? controller : "(null)");
1887 return NULL;
1888 }
1889
1890 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1891 }
1892
1893 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
1894 * which must be freed by the caller.
1895 */
1896 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1897 const char *inpath,
1898 const char *filename)
1899 {
1900 return must_make_path(h->mountpoint, inpath, filename, NULL);
1901 }
1902
1903 /* Technically, we're always at a delegation boundary here (This is especially
1904 * true when cgroup namespaces are available.). The reasoning is that in order
1905 * for us to have been able to start a container in the first place the root
1906 * cgroup must have been a leaf node. Now, either the container's init system
1907 * has populated the cgroup and kept it as a leaf node or it has created
1908 * subtrees. In the former case we will simply attach to the leaf node we
1909 * created when we started the container in the latter case we create our own
1910 * cgroup for the attaching process.
1911 */
1912 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
1913 const char *lxcpath, const char *pidstr,
1914 size_t pidstr_len, const char *controller)
1915 {
1916 int ret;
1917 size_t len;
1918 int fret = -1, idx = 0;
1919 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
1920
1921 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
1922 /* not running */
1923 if (!container_cgroup)
1924 return 0;
1925
1926 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
1927 full_path = must_make_path(base_path, "cgroup.procs", NULL);
1928 /* cgroup is populated */
1929 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
1930 if (ret < 0 && errno != EBUSY)
1931 goto on_error;
1932
1933 if (ret == 0)
1934 goto on_success;
1935
1936 free(full_path);
1937
1938 len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
1939 sizeof("/cgroup-procs") - 1;
1940 full_path = must_alloc(len + 1);
1941 do {
1942 if (idx)
1943 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
1944 base_path, idx);
1945 else
1946 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
1947 if (ret < 0 || (size_t)ret >= len + 1)
1948 goto on_error;
1949
1950 ret = mkdir_p(full_path, 0755);
1951 if (ret < 0 && errno != EEXIST)
1952 goto on_error;
1953
1954 strncat(full_path, "/cgroup.procs", strlen("/cgroup.procs"));
1955 ret = lxc_write_to_file(full_path, pidstr, len, false, 0666);
1956 if (ret == 0)
1957 goto on_success;
1958
1959 /* this is a non-leaf node */
1960 if (errno != EBUSY)
1961 goto on_error;
1962
1963 } while (++idx > 0 && idx < 1000);
1964
1965 on_success:
1966 if (idx < 1000)
1967 fret = 0;
1968
1969 on_error:
1970 free(base_path);
1971 free(container_cgroup);
1972 free(full_path);
1973
1974 return fret;
1975 }
1976
1977 static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
1978 const char *lxcpath, pid_t pid)
1979 {
1980 int i, len, ret;
1981 char pidstr[25];
1982
1983 len = snprintf(pidstr, 25, "%d", pid);
1984 if (len < 0 || len >= 25)
1985 return false;
1986
1987 for (i = 0; ops->hierarchies[i]; i++) {
1988 char *path;
1989 char *fullpath = NULL;
1990 struct hierarchy *h = ops->hierarchies[i];
1991
1992 if (h->version == CGROUP2_SUPER_MAGIC) {
1993 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
1994 h->controllers[0]);
1995 if (ret < 0)
1996 return false;
1997
1998 continue;
1999 }
2000
2001 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2002 /* not running */
2003 if (!path)
2004 continue;
2005
2006 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2007 free(path);
2008 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2009 if (ret < 0) {
2010 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2011 free(fullpath);
2012 return false;
2013 }
2014 free(fullpath);
2015 }
2016
2017 return true;
2018 }
2019
/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 *
 * The controller is the part of @filename before the first '.'. The file
 * content is read into @value (at most @len bytes are requested). Returns the
 * result of lxc_read_from_file(), or -1 if the container reports no cgroup
 * path or no hierarchy provides the controller.
 */
static int cgfsng_get(struct cgroup_ops *ops, const char *filename, char *value,
		      size_t len, const char *name, const char *lxcpath)
{
	int ret;
	char *controller, *fullpath, *path;
	struct hierarchy *h;
	size_t controller_len = strlen(filename);

	controller = alloca(controller_len + 1);
	(void)strlcpy(controller, filename, controller_len + 1);

	/* Cut the copy at the first '.', if there is one. */
	controller[strcspn(controller, ".")] = '\0';

	path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (!h) {
		free(path);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
	ret = lxc_read_from_file(fullpath, value, len);
	free(fullpath);
	free(path);

	return ret;
}
2057
2058 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2059 * don't have a cgroup_data set up, so we ask the running container through the
2060 * commands API for the cgroup path.
2061 */
2062 static int cgfsng_set(struct cgroup_ops *ops, const char *filename,
2063 const char *value, const char *name, const char *lxcpath)
2064 {
2065 int ret = -1;
2066 size_t controller_len;
2067 char *controller, *p, *path;
2068 struct hierarchy *h;
2069
2070 controller_len = strlen(filename);
2071 controller = alloca(controller_len + 1);
2072 (void)strlcpy(controller, filename, controller_len + 1);
2073
2074 p = strchr(controller, '.');
2075 if (p)
2076 *p = '\0';
2077
2078 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2079 /* not running */
2080 if (!path)
2081 return -1;
2082
2083 h = get_hierarchy(ops, controller);
2084 if (h) {
2085 char *fullpath;
2086
2087 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2088 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2089 free(fullpath);
2090 }
2091 free(path);
2092
2093 return ret;
2094 }
2095
2096 /* take devices cgroup line
2097 * /dev/foo rwx
2098 * and convert it to a valid
2099 * type major:minor mode
2100 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2101 * the output.
2102 */
2103 static int convert_devpath(const char *invalue, char *dest)
2104 {
2105 int n_parts;
2106 char *p, *path, type;
2107 unsigned long minor, major;
2108 struct stat sb;
2109 int ret = -EINVAL;
2110 char *mode = NULL;
2111
2112 path = must_copy_string(invalue);
2113
2114 /* Read path followed by mode. Ignore any trailing text.
2115 * A ' # comment' would be legal. Technically other text is not
2116 * legal, we could check for that if we cared to.
2117 */
2118 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2119 if (*p != ' ')
2120 continue;
2121 *p = '\0';
2122
2123 if (n_parts != 1)
2124 break;
2125 p++;
2126 n_parts++;
2127
2128 while (*p == ' ')
2129 p++;
2130
2131 mode = p;
2132
2133 if (*p == '\0')
2134 goto out;
2135 }
2136
2137 if (n_parts == 1)
2138 goto out;
2139
2140 ret = stat(path, &sb);
2141 if (ret < 0)
2142 goto out;
2143
2144 mode_t m = sb.st_mode & S_IFMT;
2145 switch (m) {
2146 case S_IFBLK:
2147 type = 'b';
2148 break;
2149 case S_IFCHR:
2150 type = 'c';
2151 break;
2152 default:
2153 ERROR("Unsupported device type %i for \"%s\"", m, path);
2154 ret = -EINVAL;
2155 goto out;
2156 }
2157
2158 major = MAJOR(sb.st_rdev);
2159 minor = MINOR(sb.st_rdev);
2160 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
2161 if (ret < 0 || ret >= 50) {
2162 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2163 "chars)", type, major, minor, mode);
2164 ret = -ENAMETOOLONG;
2165 goto out;
2166 }
2167 ret = 0;
2168
2169 out:
2170 free(path);
2171 return ret;
2172 }
2173
2174 /* Called from setup_limits - here we have the container's cgroup_data because
2175 * we created the cgroups.
2176 */
2177 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2178 const char *value)
2179 {
2180 size_t len;
2181 char *fullpath, *p;
2182 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2183 char converted_value[50];
2184 struct hierarchy *h;
2185 int ret = 0;
2186 char *controller = NULL;
2187
2188 len = strlen(filename);
2189 controller = alloca(len + 1);
2190 (void)strlcpy(controller, filename, len + 1);
2191
2192 p = strchr(controller, '.');
2193 if (p)
2194 *p = '\0';
2195
2196 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2197 ret = convert_devpath(value, converted_value);
2198 if (ret < 0)
2199 return ret;
2200 value = converted_value;
2201 }
2202
2203 h = get_hierarchy(ops, controller);
2204 if (!h) {
2205 ERROR("Failed to setup limits for the \"%s\" controller. "
2206 "The controller seems to be unused by \"cgfsng\" cgroup "
2207 "driver or not enabled on the cgroup hierarchy",
2208 controller);
2209 errno = ENOENT;
2210 return -ENOENT;
2211 }
2212
2213 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2214 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2215 free(fullpath);
2216 return ret;
2217 }
2218
2219 static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
2220 struct lxc_list *cgroup_settings,
2221 bool do_devices)
2222 {
2223 struct lxc_list *iterator, *next, *sorted_cgroup_settings;
2224 struct lxc_cgroup *cg;
2225 bool ret = false;
2226
2227 if (lxc_list_empty(cgroup_settings))
2228 return true;
2229
2230 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2231 if (!sorted_cgroup_settings)
2232 return false;
2233
2234 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2235 cg = iterator->elem;
2236
2237 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2238 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2239 if (do_devices && (errno == EACCES || errno == EPERM)) {
2240 WARN("Failed to set \"%s\" to \"%s\"",
2241 cg->subsystem, cg->value);
2242 continue;
2243 }
2244 WARN("Failed to set \"%s\" to \"%s\"",
2245 cg->subsystem, cg->value);
2246 goto out;
2247 }
2248 DEBUG("Set controller \"%s\" set to \"%s\"",
2249 cg->subsystem, cg->value);
2250 }
2251 }
2252
2253 ret = true;
2254 INFO("Limits for the legacy cgroup hierarchies have been setup");
2255 out:
2256 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2257 lxc_list_del(iterator);
2258 free(iterator);
2259 }
2260 free(sorted_cgroup_settings);
2261 return ret;
2262 }
2263
2264 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2265 struct lxc_list *cgroup_settings)
2266 {
2267 struct lxc_list *iterator;
2268 struct hierarchy *h = ops->unified;
2269
2270 if (lxc_list_empty(cgroup_settings))
2271 return true;
2272
2273 if (!h)
2274 return false;
2275
2276 lxc_list_for_each(iterator, cgroup_settings) {
2277 int ret;
2278 char *fullpath;
2279 struct lxc_cgroup *cg = iterator->elem;
2280
2281 fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
2282 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2283 free(fullpath);
2284 if (ret < 0) {
2285 SYSERROR("Failed to set \"%s\" to \"%s\"",
2286 cg->subsystem, cg->value);
2287 return false;
2288 }
2289 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2290 }
2291
2292 INFO("Limits for the unified cgroup hierarchy have been setup");
2293 return true;
2294 }
2295
2296 static bool cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_conf *conf,
2297 bool do_devices)
2298 {
2299 bool bret;
2300
2301 bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
2302 if (!bret)
2303 return false;
2304
2305 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2306 }
2307
2308 /* At startup, parse_hierarchies finds all the info we need about cgroup
2309 * mountpoints and current cgroups, and stores it in @d.
2310 */
2311 static bool cg_hybrid_init(struct cgroup_ops *ops)
2312 {
2313 int ret;
2314 char *basecginfo;
2315 bool will_escape;
2316 FILE *f;
2317 size_t len = 0;
2318 char *line = NULL;
2319 char **klist = NULL, **nlist = NULL;
2320
2321 /* Root spawned containers escape the current cgroup, so use init's
2322 * cgroups as our base in that case.
2323 */
2324 will_escape = (geteuid() == 0);
2325 if (will_escape)
2326 basecginfo = read_file("/proc/1/cgroup");
2327 else
2328 basecginfo = read_file("/proc/self/cgroup");
2329 if (!basecginfo)
2330 return false;
2331
2332 ret = get_existing_subsystems(&klist, &nlist);
2333 if (ret < 0) {
2334 ERROR("Failed to retrieve available legacy cgroup controllers");
2335 free(basecginfo);
2336 return false;
2337 }
2338
2339 f = fopen("/proc/self/mountinfo", "r");
2340 if (!f) {
2341 ERROR("Failed to open \"/proc/self/mountinfo\"");
2342 free(basecginfo);
2343 return false;
2344 }
2345
2346 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2347
2348 while (getline(&line, &len, f) != -1) {
2349 int type;
2350 bool writeable;
2351 struct hierarchy *new;
2352 char *base_cgroup = NULL, *mountpoint = NULL;
2353 char **controller_list = NULL;
2354
2355 type = get_cgroup_version(line);
2356 if (type == 0)
2357 continue;
2358
2359 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2360 continue;
2361
2362 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2363 if (type == CGROUP2_SUPER_MAGIC)
2364 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2365 else if (type == CGROUP_SUPER_MAGIC)
2366 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2367 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2368 if (type == CGROUP_SUPER_MAGIC)
2369 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2370 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2371 if (type == CGROUP2_SUPER_MAGIC)
2372 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2373 }
2374
2375 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2376 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2377 continue;
2378
2379 if (type == CGROUP_SUPER_MAGIC)
2380 if (controller_list_is_dup(ops->hierarchies, controller_list))
2381 goto next;
2382
2383 mountpoint = cg_hybrid_get_mountpoint(line);
2384 if (!mountpoint) {
2385 ERROR("Failed parsing mountpoint from \"%s\"", line);
2386 goto next;
2387 }
2388
2389 if (type == CGROUP_SUPER_MAGIC)
2390 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2391 else
2392 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2393 if (!base_cgroup) {
2394 ERROR("Failed to find current cgroup");
2395 goto next;
2396 }
2397
2398 trim(base_cgroup);
2399 prune_init_scope(base_cgroup);
2400 if (type == CGROUP2_SUPER_MAGIC)
2401 writeable = test_writeable_v2(mountpoint, base_cgroup);
2402 else
2403 writeable = test_writeable_v1(mountpoint, base_cgroup);
2404 if (!writeable)
2405 goto next;
2406
2407 if (type == CGROUP2_SUPER_MAGIC) {
2408 char *cgv2_ctrl_path;
2409
2410 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2411 "cgroup.controllers",
2412 NULL);
2413
2414 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2415 free(cgv2_ctrl_path);
2416 if (!controller_list) {
2417 controller_list = cg_unified_make_empty_controller();
2418 TRACE("No controllers are enabled for "
2419 "delegation in the unified hierarchy");
2420 }
2421 }
2422
2423 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2424 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
2425 ops->unified = new;
2426
2427 continue;
2428
2429 next:
2430 free_string_list(controller_list);
2431 free(mountpoint);
2432 free(base_cgroup);
2433 }
2434
2435 free_string_list(klist);
2436 free_string_list(nlist);
2437
2438 free(basecginfo);
2439
2440 fclose(f);
2441 free(line);
2442
2443 TRACE("Writable cgroup hierarchies:");
2444 lxc_cgfsng_print_hierarchies(ops);
2445
2446 /* verify that all controllers in cgroup.use and all crucial
2447 * controllers are accounted for
2448 */
2449 if (!all_controllers_found(ops))
2450 return false;
2451
2452 return true;
2453 }
2454
2455 static int cg_is_pure_unified(void)
2456 {
2457
2458 int ret;
2459 struct statfs fs;
2460
2461 ret = statfs("/sys/fs/cgroup", &fs);
2462 if (ret < 0)
2463 return -ENOMEDIUM;
2464
2465 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2466 return CGROUP2_SUPER_MAGIC;
2467
2468 return 0;
2469 }
2470
/* Look up the cgroup path recorded on the "0::/" entry of a proc cgroup file.
 *
 * When the effective uid is 0 the entry is taken from /proc/1/cgroup,
 * otherwise from /proc/self/cgroup.
 *
 * Returns a trimmed copy of the path (starting at its leading '/') that the
 * caller must free, or NULL if the file could not be read, has no "0::/"
 * entry, or the copy could not be made.
 */
static char *cg_unified_get_current_cgroup(void)
{
	char *contents, *entry;
	char *path = NULL;

	if (geteuid() == 0)
		contents = read_file("/proc/1/cgroup");
	else
		contents = read_file("/proc/self/cgroup");
	if (!contents)
		return NULL;

	entry = strstr(contents, "0::/");
	if (entry) {
		/* Step over "0::" so the copy begins at the '/'. */
		path = copy_to_eol(entry + 3);
	}

	free(contents);
	if (path)
		trim(path);

	return path;
}
2502
2503 static int cg_unified_init(struct cgroup_ops *ops)
2504 {
2505 int ret;
2506 char *mountpoint, *subtree_path;
2507 char **delegatable;
2508 char *base_cgroup = NULL;
2509
2510 ret = cg_is_pure_unified();
2511 if (ret == -ENOMEDIUM)
2512 return -ENOMEDIUM;
2513
2514 if (ret != CGROUP2_SUPER_MAGIC)
2515 return 0;
2516
2517 base_cgroup = cg_unified_get_current_cgroup();
2518 if (!base_cgroup)
2519 return -EINVAL;
2520 prune_init_scope(base_cgroup);
2521
2522 /* We assume that we have already been given controllers to delegate
2523 * further down the hierarchy. If not it is up to the user to delegate
2524 * them to us.
2525 */
2526 mountpoint = must_copy_string("/sys/fs/cgroup");
2527 subtree_path = must_make_path(mountpoint, base_cgroup,
2528 "cgroup.subtree_control", NULL);
2529 delegatable = cg_unified_get_controllers(subtree_path);
2530 free(subtree_path);
2531 if (!delegatable)
2532 delegatable = cg_unified_make_empty_controller();
2533 if (!delegatable[0])
2534 TRACE("No controllers are enabled for delegation");
2535
2536 /* TODO: If the user requested specific controllers via lxc.cgroup.use
2537 * we should verify here. The reason I'm not doing it right is that I'm
2538 * not convinced that lxc.cgroup.use will be the future since it is a
2539 * global property. I much rather have an option that lets you request
2540 * controllers per container.
2541 */
2542
2543 add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2544
2545 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2546 return CGROUP2_SUPER_MAGIC;
2547 }
2548
2549 static bool cg_init(struct cgroup_ops *ops)
2550 {
2551 int ret;
2552 const char *tmp;
2553
2554 tmp = lxc_global_config_value("lxc.cgroup.use");
2555 if (tmp)
2556 ops->cgroup_use = must_copy_string(tmp);
2557
2558 ret = cg_unified_init(ops);
2559 if (ret < 0)
2560 return false;
2561
2562 if (ret == CGROUP2_SUPER_MAGIC)
2563 return true;
2564
2565 return cg_hybrid_init(ops);
2566 }
2567
2568 static bool cgfsng_data_init(struct cgroup_ops *ops)
2569 {
2570 const char *cgroup_pattern;
2571
2572 /* copy system-wide cgroup information */
2573 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2574 if (!cgroup_pattern) {
2575 /* lxc.cgroup.pattern is only NULL on error. */
2576 ERROR("Failed to retrieve cgroup pattern");
2577 return false;
2578 }
2579 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2580
2581 return true;
2582 }
2583
2584 struct cgroup_ops *cgfsng_ops_init(void)
2585 {
2586 struct cgroup_ops *cgfsng_ops;
2587
2588 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2589 if (!cgfsng_ops)
2590 return NULL;
2591
2592 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2593 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2594
2595 if (!cg_init(cgfsng_ops)) {
2596 free(cgfsng_ops);
2597 return NULL;
2598 }
2599
2600 cgfsng_ops->data_init = cgfsng_data_init;
2601 cgfsng_ops->destroy = cgfsng_destroy;
2602 cgfsng_ops->create = cgfsng_create;
2603 cgfsng_ops->enter = cgfsng_enter;
2604 cgfsng_ops->escape = cgfsng_escape;
2605 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2606 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2607 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2608 cgfsng_ops->get = cgfsng_get;
2609 cgfsng_ops->set = cgfsng_set;
2610 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2611 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2612 cgfsng_ops->driver = "cgfsng";
2613 cgfsng_ops->version = "1.0.0";
2614 cgfsng_ops->attach = cgfsng_attach;
2615 cgfsng_ops->chown = cgfsng_chown;
2616 cgfsng_ops->mount = cgfsng_mount;
2617 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2618
2619 return cgfsng_ops;
2620 }