]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
5d7effcba9909b66c4b9588c51973be9ce56653d
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 * Christian Brauner <christian.brauner@ubuntu.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /*
26 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
30 * each controller.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
36
37 #include "config.h"
38
39 #include <ctype.h>
40 #include <dirent.h>
41 #include <errno.h>
42 #include <grp.h>
43 #include <stdint.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <linux/kdev_t.h>
49 #include <linux/types.h>
50 #include <sys/types.h>
51
52 #include "caps.h"
53 #include "cgroup.h"
54 #include "cgroup_utils.h"
55 #include "commands.h"
56 #include "conf.h"
57 #include "log.h"
58 #include "macro.h"
59 #include "storage/storage.h"
60 #include "utils.h"
61
62 #ifndef HAVE_STRLCPY
63 #include "include/strlcpy.h"
64 #endif
65
66 #ifndef HAVE_STRLCAT
67 #include "include/strlcat.h"
68 #endif
69
70 lxc_log_define(cgfsng, cgroup);
71
72 static void free_string_list(char **clist)
73 {
74 int i;
75
76 if (!clist)
77 return;
78
79 for (i = 0; clist[i]; i++)
80 free(clist[i]);
81
82 free(clist);
83 }
84
/* Allocate @sz bytes via must_realloc(). Like the other must_* helpers this
 * never returns NULL; allocation failure aborts. Do not fail.
 */
static void *must_alloc(size_t sz)
{
	return must_realloc(NULL, sz);
}
90
91 /* Given a pointer to a null-terminated array of pointers, realloc to add one
92 * entry, and point the new entry to NULL. Do not fail. Return the index to the
93 * second-to-last entry - that is, the one which is now available for use
94 * (keeping the list null-terminated).
95 */
96 static int append_null_to_list(void ***list)
97 {
98 int newentry = 0;
99
100 if (*list)
101 for (; (*list)[newentry]; newentry++)
102 ;
103
104 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
105 (*list)[newentry + 1] = NULL;
106 return newentry;
107 }
108
/* Return true if @entry matches one of the strings in the NULL-terminated
 * array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	if (!list)
		return false;

	for (it = list; *it; it++) {
		if (strcmp(*it, entry) == 0)
			return true;
	}

	return false;
}
125
/* Return a heap-allocated copy of @entry with "name=" prepended, turning e.g.
 * "systemd" into "name=systemd". Never fails.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t len = strlen(entry);
	char *prefixed = must_alloc(len + STRLITERALLEN("name=") + 1);

	memcpy(prefixed, "name=", STRLITERALLEN("name="));
	/* len + 1 copies the terminating NUL as well. */
	memcpy(prefixed + STRLITERALLEN("name="), entry, len + 1);
	return prefixed;
}
143
/* Append controller @entry to the NULL-terminated list @clist (which must be
 * NULL on the first call). Do not fail.
 *
 * Named subsystems are handled here too: any controller that is not a kernel
 * subsystem gets a "name=" prefix. A controller that is both a kernel and a
 * named subsystem is ambiguous and therefore refused.
 * (TODO: this could be worked around in some cases by remounting to be
 * unambiguous, or by comparing mountpoint contents with the current cgroup.)
 *
 * The list stays NULL-terminated.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	char *copy;
	int idx;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	idx = append_null_to_list((void ***)clist);

	/* Already prefixed, or a known kernel subsystem: copy verbatim;
	 * otherwise prefix "name=".
	 */
	if (strncmp(entry, "name=", 5) == 0 || string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[idx] = copy;
}
178
179 /* Given a handler's cgroup data, return the struct hierarchy for the controller
180 * @c, or NULL if there is none.
181 */
182 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
183 {
184 int i;
185
186 errno = ENOENT;
187
188 if (!ops->hierarchies) {
189 TRACE("There are no useable cgroup controllers");
190 return NULL;
191 }
192
193 for (i = 0; ops->hierarchies[i]; i++) {
194 if (!controller) {
195 /* This is the empty unified hierarchy. */
196 if (ops->hierarchies[i]->controllers &&
197 !ops->hierarchies[i]->controllers[0])
198 return ops->hierarchies[i];
199
200 continue;
201 }
202
203 if (string_in_list(ops->hierarchies[i]->controllers, controller))
204 return ops->hierarchies[i];
205 }
206
207 if (controller)
208 WARN("There is no useable %s controller", controller);
209 else
210 WARN("There is no empty unified cgroup hierarchy");
211
212 return NULL;
213 }
214
#define BATCH_SIZE 50
/* Grow *@mem in BATCH_SIZE chunks so repeated appends don't realloc for every
 * line. Only reallocates when @newlen crosses into a new batch.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int new_batches = (newlen / BATCH_SIZE) + 1;
	int old_batches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || new_batches > old_batches)
		*mem = must_realloc(*mem, new_batches * BATCH_SIZE);
}
225
/* Append the @newlen bytes at @new (plus its NUL terminator) to the buffer
 * *@dest, which currently holds @oldlen bytes, growing it as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t total = oldlen + newlen;

	batch_realloc(dest, oldlen, total + 1);
	memcpy(*dest + oldlen, new, newlen + 1);
}
234
/* Slurp in a whole file. Returns a heap-allocated, NUL-terminated buffer the
 * caller must free(), or NULL if the file cannot be opened.
 */
static char *read_file(const char *fnam)
{
	FILE *f;
	char *line = NULL, *buf = NULL;
	size_t len = 0, fulllen = 0;
	/* getline() returns ssize_t; storing it in an int could truncate the
	 * length of a very long line.
	 */
	ssize_t linelen;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}

	fclose(f);
	free(line);
	return buf;
}
254
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in @bitarr. The shifted constant must be unsigned:
 * "1 << 31" left-shifts into the sign bit of a signed int, which is
 * undefined behavior.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= ((uint32_t)1 << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~((uint32_t)1 << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & ((uint32_t)1 << (bit % NBITS))) != 0;
}
274
/* Create cpumask from cpulist aka turn:
 *
 * 0,2-3
 *
 * into bit array
 *
 * 1 0 1 1
 *
 * Returns a calloc'd array of BITS_TO_LONGS(@nbits) uint32_t words, or NULL
 * on allocation failure or malformed/out-of-range input. Caller frees.
 * Note: @buf is split in place by lxc_iterate_parts().
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token;
	size_t arrlen;
	uint32_t *bitarr;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return NULL;

	/* Each comma-separated token is either "N" or a range "N-M". */
	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		/* Reject inverted ranges like "3-1". */
		if (!(start <= end)) {
			free(bitarr);
			return NULL;
		}

		/* Reject cpus beyond the capacity of the bit array. */
		if (end >= nbits) {
			free(bitarr);
			return NULL;
		}

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return bitarr;
}
321
322 /* Turn cpumask into simple, comma-separated cpulist. */
323 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
324 {
325 int ret;
326 size_t i;
327 char **cpulist = NULL;
328 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
329
330 for (i = 0; i <= nbits; i++) {
331 if (!is_set(i, bitarr))
332 continue;
333
334 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
335 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
336 lxc_free_array((void **)cpulist, free);
337 return NULL;
338 }
339
340 ret = lxc_append_string(&cpulist, numstr);
341 if (ret < 0) {
342 lxc_free_array((void **)cpulist, free);
343 return NULL;
344 }
345 }
346
347 if (!cpulist)
348 return NULL;
349
350 return lxc_string_join(",", (const char **)cpulist, false);
351 }
352
/* Return the highest cpu number mentioned in a cpulist such as "0,2-3"
 * (-> 3), or -1 on parse failure. The last number in the string is the
 * maximum, so parse from whichever of the final ',' or '-' comes later.
 *
 * The original chained pointer comparisons that relationally compared
 * possibly-NULL pointers (formally undefined) and contained an unreachable
 * "!c1 && c2" arm after "c1 < c2"; this version makes the selection explicit.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2, *start;
	size_t cpus;

	c1 = strrchr(cpulist, ',');
	if (c1)
		c1++;

	c2 = strrchr(cpulist, '-');
	if (c2)
		c2++;

	if (c1 && c2)
		start = (c1 > c2) ? c1 : c2; /* both point into cpulist */
	else if (c1)
		start = c1;
	else if (c2)
		start = c2;
	else
		start = cpulist; /* single cpu, e.g. "0" */

	errno = 0;
	cpus = strtoul(start, NULL, 0);
	if (errno != 0)
		return -1;

	return (ssize_t)cpus;
}
383
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/* Remove any isolated cpus from the parent's cpuset.cpus value and write the
 * result into @path/cpuset.cpus. If the system exposes no isolated cpus and
 * the cgroup was not yet initialized (@am_initialized == false), the parent's
 * values are copied verbatim. @path is temporarily truncated at its last '/'
 * to address the parent and restored before writing. Returns true on success.
 */
static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
{
	int ret;
	ssize_t i;
	char *lastslash, *fpath, oldv;
	ssize_t maxisol = 0, maxposs = 0;
	char *cpulist = NULL, *isolcpus = NULL, *posscpus = NULL;
	uint32_t *isolmask = NULL, *possmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) {
		ERROR("Failed to detect \"/\" in \"%s\"", path);
		return bret;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Failed to read file \"%s\"", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0 || maxposs >= INT_MAX - 1)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpu settings of parent cgroup");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Failed to read file \""__ISOL_CPUS"\"");
		goto on_error;
	}
	if (!isdigit(isolcpus[0])) {
		TRACE("No isolated cpus detected");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpu settings of parent cgroup");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0 || maxisol >= INT_MAX - 1)
		goto on_error;

	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++; /* highest cpu number -> number of bits needed */

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Failed to create cpumask for possible cpus");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Failed to create cpumask for isolated cpus");
		goto on_error;
	}

	/* Valid bit indices are [0, maxposs); the old "<=" bound read one bit
	 * past the end of both masks.
	 */
	for (i = 0; i < maxposs; i++) {
		if (!is_set(i, isolmask) || !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Failed to create cpu list");
		goto on_error;
	}

copy_parent:
	*lastslash = oldv;
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false, 0666);
	if (ret < 0) {
		SYSERROR("Failed to write cpu list to \"%s\"", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; avoid a double free. */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
521
/* Copy contents of parent(@path)/@file to @path/@file.
 * @path is temporarily truncated at its last '/' to address the parent;
 * it is restored on every exit path (the original left it truncated when the
 * parent read failed, corrupting the caller's string).
 */
static bool copy_parent_file(char *path, char *file)
{
	int ret;
	char *fpath, *lastslash, oldv;
	int len = 0;
	char *value = NULL;

	lastslash = strrchr(path, '/');
	if (!lastslash) {
		ERROR("Failed to detect \"/\" in \"%s\"", path);
		return false;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);

	/* First probe the size, then read the parent's value. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto on_error;

	value = must_alloc(len + 1);
	ret = lxc_read_from_file(fpath, value, len);
	if (ret != len)
		goto on_error;
	free(fpath);

	*lastslash = oldv;
	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false, 0666);
	if (ret < 0)
		SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

on_error:
	/* Restore the caller's path before bailing out. */
	*lastslash = oldv;
	SYSERROR("Failed to read file \"%s\"", fpath);
	free(fpath);
	free(value);
	return false;
}
563
564 /* Initialize the cpuset hierarchy in first directory of @gname and set
565 * cgroup.clone_children so that children inherit settings. Since the
566 * h->base_path is populated by init or ourselves, we know it is already
567 * initialized.
568 */
569 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
570 {
571 int ret;
572 char v;
573 char *cgpath, *clonechildrenpath, *slash;
574
575 if (!string_in_list(h->controllers, "cpuset"))
576 return true;
577
578 if (*cgname == '/')
579 cgname++;
580 slash = strchr(cgname, '/');
581 if (slash)
582 *slash = '\0';
583
584 cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
585 if (slash)
586 *slash = '/';
587
588 ret = mkdir(cgpath, 0755);
589 if (ret < 0) {
590 if (errno != EEXIST) {
591 SYSERROR("Failed to create directory \"%s\"", cgpath);
592 free(cgpath);
593 return false;
594 }
595 }
596
597 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
598 /* unified hierarchy doesn't have clone_children */
599 if (!file_exists(clonechildrenpath)) {
600 free(clonechildrenpath);
601 free(cgpath);
602 return true;
603 }
604
605 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
606 if (ret < 0) {
607 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
608 free(clonechildrenpath);
609 free(cgpath);
610 return false;
611 }
612
613 /* Make sure any isolated cpus are removed from cpuset.cpus. */
614 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
615 SYSERROR("Failed to remove isolated cpus");
616 free(clonechildrenpath);
617 free(cgpath);
618 return false;
619 }
620
621 /* Already set for us by someone else. */
622 if (v == '1') {
623 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
624 free(clonechildrenpath);
625 free(cgpath);
626 return true;
627 }
628
629 /* copy parent's settings */
630 if (!copy_parent_file(cgpath, "cpuset.mems")) {
631 SYSERROR("Failed to copy \"cpuset.mems\" settings");
632 free(cgpath);
633 free(clonechildrenpath);
634 return false;
635 }
636 free(cgpath);
637
638 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
639 if (ret < 0) {
640 /* Set clone_children so children inherit our settings */
641 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
642 free(clonechildrenpath);
643 return false;
644 }
645 free(clonechildrenpath);
646 return true;
647 }
648
/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it1, **it2;

	if (!l1 || !l2)
		return false;

	for (it1 = l1; *it1; it1++)
		for (it2 = l2; *it2; it2++)
			if (strcmp(*it1, *it2) == 0)
				return true;

	return false;
}
666
667 /* For a null-terminated list of controllers @clist, return true if any of those
668 * controllers is already listed the null-terminated list of hierarchies @hlist.
669 * Realistically, if one is present, all must be present.
670 */
671 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
672 {
673 int i;
674
675 if (!hlist)
676 return false;
677
678 for (i = 0; hlist[i]; i++)
679 if (controller_lists_intersect(hlist[i]->controllers, clist))
680 return true;
681
682 return false;
683 }
684
685 /* Return true if the controller @entry is found in the null-terminated list of
686 * hierarchies @hlist.
687 */
688 static bool controller_found(struct hierarchy **hlist, char *entry)
689 {
690 int i;
691
692 if (!hlist)
693 return false;
694
695 for (i = 0; hlist[i]; i++)
696 if (string_in_list(hlist[i]->controllers, entry))
697 return true;
698
699 return false;
700 }
701
702 /* Return true if all of the controllers which we require have been found. The
703 * required list is freezer and anything in lxc.cgroup.use.
704 */
705 static bool all_controllers_found(struct cgroup_ops *ops)
706 {
707 char **cur;
708 struct hierarchy **hlist = ops->hierarchies;
709
710 if (!controller_found(hlist, "freezer")) {
711 ERROR("No freezer controller mountpoint found");
712 return false;
713 }
714
715 if (!ops->cgroup_use)
716 return true;
717
718 for (cur = ops->cgroup_use; cur && *cur; cur++)
719 if (!controller_found(hlist, *cur)) {
720 ERROR("No %s controller mountpoint found", *cur);
721 return false;
722 }
723
724 return true;
725 }
726
/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list
 *
 * Returns a newly allocated NULL-terminated controller list for legacy (v1)
 * hierarchies, or NULL for v2/invalid lines. @line is modified in place
 * temporarily and restored before returning.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	int i;
	char *dup, *p2, *tok;
	char *p = line, *sep = ",";
	char **aret = NULL;

	/* Skip the first four space-separated mountinfo fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
		ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
		return NULL;
	}

	/* 15 == strlen("/sys/fs/cgroup/"); p now points at the controller
	 * list embedded in the mountpoint.
	 */
	p += 15;
	p2 = strchr(p, ' ');
	if (!p2) {
		ERROR("Corrupt mountinfo");
		return NULL;
	}
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = strdup(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep) {
			must_append_controller(klist, nlist, &aret, tok);
		}

		free(dup);
	}
	/* Undo the temporary NUL so @line is intact for the caller. */
	*p2 = ' ';

	return aret;
}
785
/* Return a freshly allocated controller list containing only the NULL
 * terminator, representing the empty unified (v2) hierarchy.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **aret = NULL;
	int idx;

	idx = append_null_to_list((void ***)&aret);
	aret[idx] = NULL;
	return aret;
}
795
796 static char **cg_unified_get_controllers(const char *file)
797 {
798 char *buf, *tok;
799 char *sep = " \t\n";
800 char **aret = NULL;
801
802 buf = read_file(file);
803 if (!buf)
804 return NULL;
805
806 lxc_iterate_parts(tok, buf, sep) {
807 int newentry;
808 char *copy;
809
810 newentry = append_null_to_list((void ***)&aret);
811 copy = must_copy_string(tok);
812 aret[newentry] = copy;
813 }
814
815 free(buf);
816 return aret;
817 }
818
819 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
820 char *container_base_path, int type)
821 {
822 struct hierarchy *new;
823 int newentry;
824
825 new = must_alloc(sizeof(*new));
826 new->controllers = clist;
827 new->mountpoint = mountpoint;
828 new->container_base_path = container_base_path;
829 new->container_full_path = NULL;
830 new->monitor_full_path = NULL;
831 new->version = type;
832
833 newentry = append_null_to_list((void ***)h);
834 (*h)[newentry] = new;
835 return new;
836 }
837
/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo. Only mountpoints under /sys/fs/cgroup/ are accepted.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int i;
	size_t len;
	char *end;
	char *start = line, *copy = NULL;

	/* The mountpoint is the fifth space-separated field. */
	for (i = 0; i < 4; i++) {
		start = strchr(start, ' ');
		if (!start)
			return NULL;
		start++;
	}

	if (strncmp(start, "/sys/fs/cgroup/", 15) != 0)
		return NULL;

	end = strchr(start + 15, ' ');
	if (!end)
		return NULL;
	*end = '\0';

	len = end - start;
	copy = must_alloc(len + 1);
	memcpy(copy, start, len);
	copy[len] = '\0';
	return copy;
}
869
/* Given a multi-line string, return a null-terminated copy of the current
 * line, or NULL if no newline terminates it.
 */
static char *copy_to_eol(char *p)
{
	char *nl = strchr(p, '\n');
	size_t len;
	char *copy;

	if (!nl)
		return NULL;

	len = nl - p;
	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';
	return copy;
}
885
/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	char *tok, *eol, *tmp;
	size_t len;

	/* The comma-separated subsystem list ends at the next ':'. */
	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	/* Work on a stack copy so splitting below doesn't modify the caller's
	 * buffer.
	 */
	len = eol - cgline;
	tmp = alloca(len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	lxc_iterate_parts(tok, tmp, ",") {
		if (strcmp(tok, c) == 0)
			return true;
	}

	return false;
}
910
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller. Each line has the form "N:subsys,list:/cgroup/path"; the
 * returned string is a heap-allocated copy of the path portion, or NULL if
 * no matching line exists.
 */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
					  int type)
{
	char *p = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
			is_cgv2_base_cgroup = true;

		/* Skip past the hierarchy ID to the subsystem list. */
		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
			/* Skip past the subsystem list to the cgroup path. */
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		/* No match; advance to the next line. */
		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}
945
/* Append a copy of @entry to the NULL-terminated string list *@list.
 * Never fails.
 */
static void must_append_string(char ***list, char *entry)
{
	int idx = append_null_to_list((void ***)list);

	(*list)[idx] = must_copy_string(entry);
}
955
/* Parse /proc/self/cgroup and collect the mounted kernel subsystems into
 * @klist and the named ("name=...") subsystems into @nlist. Returns 0 on
 * success, -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *p, *p2, *tok;
		/* Isolate the subsystem list between the first and second ':'. */
		p = strchr(line, ':');
		if (!p)
			continue;
		p++;
		p2 = strchr(p, ':');
		if (!p2)
			continue;
		*p2 = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if ((p2 - p) == 0) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		lxc_iterate_parts(tok, p, ",") {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	free(line);
	fclose(f);
	return 0;
}
1001
/* Strip all trailing newlines from @s in place.
 * The previous "len > 1" guard was off by one and left a string consisting of
 * a single "\n" untouched; "len > 0" trims it to the empty string.
 */
static void trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 0) && (s[len - 1] == '\n'))
		s[--len] = '\0';
}
1010
/* Dump every detected hierarchy (base cgroup, mountpoint, controller list)
 * to the trace log for debugging.
 */
static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
{
	int i;
	struct hierarchy **it;

	if (!ops->hierarchies) {
		TRACE(" No hierarchies found");
		return;
	}

	TRACE(" Hierarchies:");
	for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
		int j;
		char **cit;

		TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
		TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
		TRACE(" controllers:");
		for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
			TRACE(" %d: %s", j, *cit);
	}
}
1033
/* Dump the raw /proc/self/cgroup contents plus the collected kernel and named
 * subsystem lists to the trace log.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	k = 0;
	for (it = klist; it && *it; it++)
		TRACE("kernel subsystem %d: %s", k++, *it);

	k = 0;
	for (it = nlist; it && *it; it++)
		TRACE("named subsystem %d: %s", k++, *it);
}
1049
1050 static int cgroup_rmdir(struct hierarchy **hierarchies,
1051 const char *container_cgroup)
1052 {
1053 int i;
1054
1055 if (!container_cgroup || !hierarchies)
1056 return 0;
1057
1058 for (i = 0; hierarchies[i]; i++) {
1059 int ret;
1060 struct hierarchy *h = hierarchies[i];
1061
1062 if (!h->container_full_path)
1063 continue;
1064
1065 ret = recursive_destroy(h->container_full_path);
1066 if (ret < 0)
1067 WARN("Failed to destroy \"%s\"", h->container_full_path);
1068
1069 free(h->container_full_path);
1070 h->container_full_path = NULL;
1071 }
1072
1073 return 0;
1074 }
1075
/* State bundle handed to callbacks run via userns_exec_1(), e.g.
 * cgroup_rmdir_wrapper().
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies;   /* hierarchies to operate on */
	const char *container_cgroup;     /* cgroup name of the container */
	struct lxc_conf *conf;            /* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path;                       /* optional path argument for the callback */
};
1083
1084 static int cgroup_rmdir_wrapper(void *data)
1085 {
1086 int ret;
1087 struct generic_userns_exec_data *arg = data;
1088 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1089 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1090
1091 ret = setresgid(nsgid, nsgid, nsgid);
1092 if (ret < 0) {
1093 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1094 (int)nsgid, (int)nsgid);
1095 return -1;
1096 }
1097
1098 ret = setresuid(nsuid, nsuid, nsuid);
1099 if (ret < 0) {
1100 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1101 (int)nsuid, (int)nsuid);
1102 return -1;
1103 }
1104
1105 ret = setgroups(0, NULL);
1106 if (ret < 0 && errno != EPERM) {
1107 SYSERROR("Failed to setgroups(0, NULL)");
1108 return -1;
1109 }
1110
1111 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1112 }
1113
/* Remove the container's cgroups in every hierarchy. If the container runs
 * with an id mapping the removal is performed inside the container's user
 * namespace via userns_exec_1(); otherwise it happens directly.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;
	struct generic_userns_exec_data wrap;

	wrap.origuid = 0;
	wrap.container_cgroup = ops->container_cgroup;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = handler->conf;

	/* With an id map we must re-enter the user namespace to have the
	 * privileges needed to remove the cgroup directories.
	 */
	if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	else
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	if (ret < 0) {
		WARN("Failed to destroy cgroups");
		return;
	}
}
1135
1136 __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1137 struct lxc_handler *handler)
1138 {
1139 int len;
1140 char *pivot_path;
1141 struct lxc_conf *conf = handler->conf;
1142 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1143
1144 if (!ops->hierarchies)
1145 return;
1146
1147 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1148 if (len < 0 || (size_t)len >= sizeof(pidstr))
1149 return;
1150
1151 for (int i = 0; ops->hierarchies[i]; i++) {
1152 int ret;
1153 struct hierarchy *h = ops->hierarchies[i];
1154
1155 if (!h->monitor_full_path)
1156 continue;
1157
1158 if (conf && conf->cgroup_meta.dir)
1159 pivot_path = must_make_path(h->mountpoint,
1160 h->container_base_path,
1161 conf->cgroup_meta.dir,
1162 PIVOT_CGROUP,
1163 "cgroup.procs", NULL);
1164 else
1165 pivot_path = must_make_path(h->mountpoint,
1166 h->container_base_path,
1167 PIVOT_CGROUP,
1168 "cgroup.procs", NULL);
1169
1170 ret = mkdir_p(pivot_path, 0755);
1171 if (ret < 0 && errno != EEXIST)
1172 goto next;
1173
1174 /* Move ourselves into the pivot cgroup to delete our own
1175 * cgroup.
1176 */
1177 ret = lxc_write_to_file(pivot_path, pidstr, len, false, 0666);
1178 if (ret != 0)
1179 goto next;
1180
1181 ret = recursive_destroy(h->monitor_full_path);
1182 if (ret < 0)
1183 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
1184
1185 next:
1186 free(pivot_path);
1187 }
1188 }
1189
1190 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1191 {
1192 size_t i, parts_len;
1193 char **it;
1194 size_t full_len = 0;
1195 char *add_controllers = NULL, *cgroup = NULL;
1196 char **parts = NULL;
1197 bool bret = false;
1198
1199 if (h->version != CGROUP2_SUPER_MAGIC)
1200 return true;
1201
1202 if (!h->controllers)
1203 return true;
1204
1205 /* For now we simply enable all controllers that we have detected by
1206 * creating a string like "+memory +pids +cpu +io".
1207 * TODO: In the near future we might want to support "-<controller>"
1208 * etc. but whether supporting semantics like this make sense will need
1209 * some thinking.
1210 */
1211 for (it = h->controllers; it && *it; it++) {
1212 full_len += strlen(*it) + 2;
1213 add_controllers = must_realloc(add_controllers, full_len + 1);
1214
1215 if (h->controllers[0] == *it)
1216 add_controllers[0] = '\0';
1217
1218 (void)strlcat(add_controllers, "+", full_len + 1);
1219 (void)strlcat(add_controllers, *it, full_len + 1);
1220
1221 if ((it + 1) && *(it + 1))
1222 (void)strlcat(add_controllers, " ", full_len + 1);
1223 }
1224
1225 parts = lxc_string_split(cgname, '/');
1226 if (!parts)
1227 goto on_error;
1228
1229 parts_len = lxc_array_len((void **)parts);
1230 if (parts_len > 0)
1231 parts_len--;
1232
1233 cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
1234 for (i = 0; i < parts_len; i++) {
1235 int ret;
1236 char *target;
1237
1238 cgroup = must_append_path(cgroup, parts[i], NULL);
1239 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1240 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1241 free(target);
1242 if (ret < 0) {
1243 SYSERROR("Could not enable \"%s\" controllers in the "
1244 "unified cgroup \"%s\"", add_controllers, cgroup);
1245 goto on_error;
1246 }
1247 }
1248
1249 bret = true;
1250
1251 on_error:
1252 lxc_free_array((void **)parts, free);
1253 free(add_controllers);
1254 free(cgroup);
1255 return bret;
1256 }
1257
1258 static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1259 {
1260 int ret;
1261
1262 h->monitor_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1263 if (dir_exists(h->monitor_full_path))
1264 return true;
1265
1266 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1267 ERROR("Failed to handle legacy cpuset controller");
1268 return false;
1269 }
1270
1271 ret = mkdir_p(h->monitor_full_path, 0755);
1272 if (ret < 0) {
1273 ERROR("Failed to create cgroup \"%s\"", h->monitor_full_path);
1274 return false;
1275 }
1276
1277 return cg_unified_create_cgroup(h, cgname);
1278 }
1279
1280 static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1281 {
1282 int ret;
1283
1284 h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1285 if (dir_exists(h->container_full_path)) {
1286 ERROR("The cgroup \"%s\" already existed", h->container_full_path);
1287 return false;
1288 }
1289
1290 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1291 ERROR("Failed to handle legacy cpuset controller");
1292 return false;
1293 }
1294
1295 ret = mkdir_p(h->container_full_path, 0755);
1296 if (ret < 0) {
1297 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
1298 return false;
1299 }
1300
1301 return cg_unified_create_cgroup(h, cgname);
1302 }
1303
1304 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
1305 {
1306 int ret;
1307 char *full_path;
1308
1309 if (monitor)
1310 full_path = h->monitor_full_path;
1311 else
1312 full_path = h->container_full_path;
1313
1314 ret = rmdir(full_path);
1315 if (ret < 0)
1316 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", full_path);
1317
1318 free(full_path);
1319
1320 if (monitor)
1321 h->monitor_full_path = NULL;
1322 else
1323 h->container_full_path = NULL;
1324 }
1325
1326 __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
1327 struct lxc_handler *handler)
1328 {
1329 char *monitor_cgroup, *offset, *tmp;
1330 int idx = 0;
1331 size_t len;
1332 bool bret = false;
1333 struct lxc_conf *conf = handler->conf;
1334
1335 if (!conf)
1336 return bret;
1337
1338 if (conf->cgroup_meta.dir)
1339 tmp = lxc_string_join("/",
1340 (const char *[]){conf->cgroup_meta.dir,
1341 ops->monitor_pattern,
1342 handler->name, NULL},
1343 false);
1344 else
1345 tmp = must_make_path(ops->monitor_pattern, handler->name, NULL);
1346 if (!tmp)
1347 return bret;
1348
1349 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1350 monitor_cgroup = must_alloc(len);
1351 (void)strlcpy(monitor_cgroup, tmp, len);
1352 free(tmp);
1353 offset = monitor_cgroup + len - 5;
1354
1355 do {
1356 if (idx) {
1357 int ret = snprintf(offset, 5, "-%d", idx);
1358 if (ret < 0 || (size_t)ret >= 5)
1359 goto on_error;
1360 }
1361
1362 for (int i = 0; ops->hierarchies[i]; i++) {
1363 if (!monitor_create_path_for_hierarchy(ops->hierarchies[i], monitor_cgroup)) {
1364 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path);
1365 free(ops->hierarchies[i]->container_full_path);
1366 ops->hierarchies[i]->container_full_path = NULL;
1367
1368 for (int j = 0; j < i; j++)
1369 remove_path_for_hierarchy(ops->hierarchies[j], monitor_cgroup, true);
1370
1371 idx++;
1372 break;
1373 }
1374 }
1375 } while (idx > 0 && idx < 1000);
1376
1377 if (idx < 1000)
1378 bret = true;
1379
1380 on_error:
1381 free(monitor_cgroup);
1382
1383 return bret;
1384 }
1385
/* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 * On success ownership of the cgroup name string transfers to
 * ops->container_cgroup.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	int i;
	size_t len;
	char *container_cgroup, *offset, *tmp;
	int idx = 0;
	struct lxc_conf *conf = handler->conf;

	/* Guard against being called twice for the same container. */
	if (ops->container_cgroup) {
		WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
		return false;
	}

	if (!conf)
		return false;

	/* An explicit lxc.cgroup.dir takes precedence over the configured
	 * cgroup pattern (in which "%n" is replaced by the container name).
	 */
	if (conf->cgroup_meta.dir)
		tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
	else
		tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
	if (!tmp) {
		ERROR("Failed expanding cgroup name pattern");
		return false;
	}

	len = strlen(tmp) + 5; /* leave room for -NNN\0 */
	container_cgroup = must_alloc(len);
	(void)strlcpy(container_cgroup, tmp, len);
	free(tmp);
	/* @offset points at the spare "-NNN\0" tail reserved above. */
	offset = container_cgroup + len - 5;

again:
	if (idx == 1000) {
		ERROR("Too many conflicting cgroup names");
		goto out_free;
	}

	if (idx) {
		int ret;

		ret = snprintf(offset, 5, "-%d", idx);
		if (ret < 0 || (size_t)ret >= 5) {
			/* Truncation is impossible for 0 < idx < 1000; this
			 * dead branch only keeps the result check observable
			 * to silence a GCC7 -Wformat-truncation false
			 * positive (see URL below).
			 */
			FILE *f = fopen("/dev/null", "w");
			if (f) {
				fprintf(f, "Workaround for GCC7 bug: "
					"https://gcc.gnu.org/bugzilla/"
					"show_bug.cgi?id=78969");
				fclose(f);
			}
		}
	}

	/* Create the cgroup in every hierarchy; on any failure tear down what
	 * this attempt created so far and retry with the next "-NNN" suffix.
	 */
	for (i = 0; ops->hierarchies[i]; i++) {
		if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path);
			free(ops->hierarchies[i]->container_full_path);
			ops->hierarchies[i]->container_full_path = NULL;
			for (int j = 0; j < i; j++)
				remove_path_for_hierarchy(ops->hierarchies[j], container_cgroup, false);
			idx++;
			goto again;
		}
	}

	/* Success: @container_cgroup is now owned by @ops. */
	ops->container_cgroup = container_cgroup;

	return true;

out_free:
	free(container_cgroup);

	return false;
}
1463
1464 __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
1465 bool monitor)
1466 {
1467 int len;
1468 char pidstr[25];
1469
1470 len = snprintf(pidstr, 25, "%d", pid);
1471 if (len < 0 || len >= 25)
1472 return false;
1473
1474 for (int i = 0; ops->hierarchies[i]; i++) {
1475 int ret;
1476 char *path;
1477
1478 if (monitor)
1479 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1480 "cgroup.procs", NULL);
1481 else
1482 path = must_make_path(ops->hierarchies[i]->container_full_path,
1483 "cgroup.procs", NULL);
1484 ret = lxc_write_to_file(path, pidstr, len, false, 0666);
1485 if (ret != 0) {
1486 SYSERROR("Failed to enter cgroup \"%s\"", path);
1487 free(path);
1488 return false;
1489 }
1490 free(path);
1491 }
1492
1493 return true;
1494 }
1495
/* Move the monitor process @pid into the monitor cgroup of every hierarchy. */
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
{
	return __do_cgroup_enter(ops, pid, true);
}
1500
1501 static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
1502 {
1503 return __do_cgroup_enter(ops, pid, false);
1504 }
1505
/* chown() then chmod() @path; log a warning and return -1 on the first
 * failure, 0 when both succeed.
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	if (chown(path, chown_uid, chown_gid) < 0) {
		SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
		return -1;
	}

	return 0;
}
1525
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs as the callback of userns_exec_1(), i.e. inside a child process that
 * has entered the container's user namespace. Returns 0 on success, -1 on
 * fatal error.
 */
static int chown_cgroup_wrapper(void *data)
{
	int i, ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root id mapping the container's root maps to ns id 0;
	 * otherwise fall back to the configured init uid/gid.
	 */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* Drop gid before uid: once the uid is dropped we may no longer have
	 * the privilege to change groups.
	 */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0) {
		SYSERROR("Failed to setresgid(%d, %d, %d)",
			 (int)nsgid, (int)nsgid, (int)nsgid);
		return -1;
	}

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0) {
		SYSERROR("Failed to setresuid(%d, %d, %d)",
			 (int)nsuid, (int)nsuid, (int)nsuid);
		return -1;
	}

	/* EPERM is tolerated: dropping supplementary groups may be denied in
	 * an unprivileged user namespace.
	 */
	ret = setgroups(0, NULL);
	if (ret < 0 && errno != EPERM) {
		SYSERROR("Failed to setgroups(0, NULL)");
		return -1;
	}

	/* Map the original (host) euid into this namespace; fall back to 0
	 * if it has no mapping.
	 */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (i = 0; arg->hierarchies[i]; i++) {
		char *fullpath;
		char *path = arg->hierarchies[i]->container_full_path;

		/* The cgroup directory itself must succeed; 0775 lets the
		 * container group create sub-cgroups.
		 */
		ret = chowmod(path, destuid, nsgid, 0775);
		if (ret < 0)
			return -1;

		/* Failures to chown() these are inconvenient but not
		 * detrimental We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		/* "tasks" only exists on legacy (v1) hierarchies. */
		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
			fullpath = must_make_path(path, "tasks", NULL);
			(void)chowmod(fullpath, destuid, nsgid, 0664);
			free(fullpath);
		}

		fullpath = must_make_path(path, "cgroup.procs", NULL);
		(void)chowmod(fullpath, destuid, nsgid, 0664);
		free(fullpath);

		/* The remaining files only exist on the unified hierarchy. */
		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
		(void)chowmod(fullpath, destuid, nsgid, 0664);
		free(fullpath);

		fullpath = must_make_path(path, "cgroup.threads", NULL);
		(void)chowmod(fullpath, destuid, nsgid, 0664);
		free(fullpath);
	}

	return 0;
}
1604
1605 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1606 struct lxc_conf *conf)
1607 {
1608 struct generic_userns_exec_data wrap;
1609
1610 if (lxc_list_empty(&conf->id_map))
1611 return true;
1612
1613 wrap.origuid = geteuid();
1614 wrap.path = NULL;
1615 wrap.hierarchies = ops->hierarchies;
1616 wrap.conf = conf;
1617
1618 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1619 "chown_cgroup_wrapper") < 0) {
1620 ERROR("Error requesting cgroup chown in new user namespace");
1621 return false;
1622 }
1623
1624 return true;
1625 }
1626
1627 /* cgroup-full:* is done, no need to create subdirs */
1628 static bool cg_mount_needs_subdirs(int type)
1629 {
1630 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1631 return false;
1632
1633 return true;
1634 }
1635
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * controll/the/cg/path.
 * Returns 0 on success, -1 on error.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	int ret, remount_flags;
	char *sourcepath;
	int flags = MS_BIND;

	/* For read-only and mixed modes first bind-mount the controller
	 * directory onto itself so it can then be remounted read-only
	 * (MS_RDONLY on a bind mount only takes effect on remount).
	 */
	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0) {
			SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
				 controllerpath, controllerpath);
			return -1;
		}

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0) {
			SYSERROR("Failed to remount \"%s\" ro", controllerpath);
			return -1;
		}

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind-mount the container's own cgroup directory from the host onto
	 * the path inside the container's rootfs.
	 */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0) {
		SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
		free(sourcepath);
		return -1;
	}
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* Same remount dance for the container's cgroup path itself. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0) {
			SYSERROR("Failed to remount \"%s\" ro", cgpath);
			free(sourcepath);
			return -1;
		}
		INFO("Remounted %s read-only", cgpath);
	}

	free(sourcepath);
	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1699
1700 /* __cg_mount_direct
1701 *
1702 * Mount cgroup hierarchies directly without using bind-mounts. The main
1703 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1704 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1705 */
1706 static int __cg_mount_direct(int type, struct hierarchy *h,
1707 const char *controllerpath)
1708 {
1709 int ret;
1710 char *controllers = NULL;
1711 char *fstype = "cgroup2";
1712 unsigned long flags = 0;
1713
1714 flags |= MS_NOSUID;
1715 flags |= MS_NOEXEC;
1716 flags |= MS_NODEV;
1717 flags |= MS_RELATIME;
1718
1719 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1720 flags |= MS_RDONLY;
1721
1722 if (h->version != CGROUP2_SUPER_MAGIC) {
1723 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1724 if (!controllers)
1725 return -ENOMEM;
1726 fstype = "cgroup";
1727 }
1728
1729 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1730 free(controllers);
1731 if (ret < 0) {
1732 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1733 return -1;
1734 }
1735
1736 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1737 return 0;
1738 }
1739
/* Mount the hierarchy directly; used when the container has its own cgroup
 * namespace but must have the cgroups mounted for it.
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}
1745
1746 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1747 const char *controllerpath)
1748 {
1749 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1750 return 0;
1751
1752 return __cg_mount_direct(type, h, controllerpath);
1753 }
1754
/* Mount /sys/fs/cgroup (and per-controller subtrees) inside the container's
 * rootfs according to the lxc.mount.auto cgroup @type flags.
 * Returns true on success or when no mounting is required.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler,
				      const char *root, int type)
{
	int i, ret;
	char *tmpfspath = NULL;
	bool has_cgns = false, retval = false, wants_force_mount = false;

	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
		return true;

	if (type & LXC_AUTO_CGROUP_FORCE) {
		type &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	/* If the container will lack CAP_SYS_ADMIN it cannot mount cgroups
	 * itself, so we must do it on its behalf.
	 */
	if (!wants_force_mount){
		if (!lxc_list_empty(&handler->conf->keepcaps))
			wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
		else
			wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
	}

	/* With cgroup namespaces the container can mount its own cgroups
	 * unless forced mounting was requested/required above.
	 */
	has_cgns = cgns_supported();
	if (has_cgns && !wants_force_mount)
		return true;

	if (type == LXC_AUTO_CGROUP_NOSPEC)
		type = LXC_AUTO_CGROUP_MIXED;
	else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
		type = LXC_AUTO_CGROUP_FULL_MIXED;

	/* Mount tmpfs */
	tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
	ret = safe_mount(NULL, tmpfspath, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		goto on_error;

	for (i = 0; ops->hierarchies[i]; i++) {
		char *controllerpath, *path2;
		struct hierarchy *h = ops->hierarchies[i];
		/* The last path component of the host mountpoint names the
		 * controller (e.g. ".../cgroup/memory" -> "memory").
		 */
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(tmpfspath, controller, NULL);
		if (dir_exists(controllerpath)) {
			free(controllerpath);
			continue;
		}

		ret = mkdir(controllerpath, 0755);
		if (ret < 0) {
			SYSERROR("Error creating cgroup path: %s", controllerpath);
			free(controllerpath);
			goto on_error;
		}

		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
			free(controllerpath);
			if (ret < 0)
				goto on_error;

			continue;
		}

		/* No-op unless @type is one of the FULL modes. */
		ret = cg_mount_cgroup_full(type, h, controllerpath);
		if (ret < 0) {
			free(controllerpath);
			goto on_error;
		}

		if (!cg_mount_needs_subdirs(type)) {
			free(controllerpath);
			continue;
		}

		/* Second stage: create the container's own cgroup path under
		 * the controller mount and bind-mount its host cgroup there.
		 */
		path2 = must_make_path(controllerpath, h->container_base_path,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0) {
			free(controllerpath);
			free(path2);
			goto on_error;
		}

		ret = cg_legacy_mount_controllers(type, h, controllerpath,
						  path2, ops->container_cgroup);
		free(controllerpath);
		free(path2);
		if (ret < 0)
			goto on_error;
	}
	retval = true;

on_error:
	free(tmpfspath);
	return retval;
}
1863
/* Count the processes listed in cgroup.procs of @dirname and of all of its
 * descendant cgroup directories. Unreadable directories count as zero.
 */
static int recursive_count_nrtasks(char *dirname)
{
	DIR *dir;
	struct dirent *entry;
	char *path;
	int count = 0, lines;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		struct stat sb;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);
		/* Recurse into subdirectories only; skip files and entries we
		 * cannot stat.
		 */
		if (lstat(path, &sb) == 0 && S_ISDIR(sb.st_mode))
			count += recursive_count_nrtasks(path);
		free(path);
	}

	/* One line per pid in this cgroup's procs file. */
	path = must_make_path(dirname, "cgroup.procs", NULL);
	lines = lxc_count_file_lines(path);
	if (lines != -1)
		count += lines;
	free(path);

	(void)closedir(dir);

	return count;
}
1905
1906 __cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
1907 {
1908 int count;
1909 char *path;
1910
1911 if (!ops->container_cgroup || !ops->hierarchies)
1912 return -1;
1913
1914 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
1915 count = recursive_count_nrtasks(path);
1916 free(path);
1917 return count;
1918 }
1919
1920 /* Only root needs to escape to the cgroup of its init. */
1921 __cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
1922 struct lxc_conf *conf)
1923 {
1924 int i;
1925
1926 if (conf->cgroup_meta.relative || geteuid())
1927 return true;
1928
1929 for (i = 0; ops->hierarchies[i]; i++) {
1930 int ret;
1931 char *fullpath;
1932
1933 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1934 ops->hierarchies[i]->container_base_path,
1935 "cgroup.procs", NULL);
1936 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1937 if (ret != 0) {
1938 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1939 free(fullpath);
1940 return false;
1941 }
1942 free(fullpath);
1943 }
1944
1945 return true;
1946 }
1947
1948 __cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1949 {
1950 int i;
1951
1952 for (i = 0; ops->hierarchies[i]; i++)
1953 ;
1954
1955 return i;
1956 }
1957
1958 __cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1959 {
1960 int i;
1961
1962 /* sanity check n */
1963 for (i = 0; i < n; i++)
1964 if (!ops->hierarchies[i])
1965 return false;
1966
1967 *out = ops->hierarchies[i]->controllers;
1968
1969 return true;
1970 }
1971
1972 #define THAWED "THAWED"
1973 #define THAWED_LEN (strlen(THAWED))
1974
1975 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1976 * to be adapted.
1977 */
1978 __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1979 {
1980 int ret;
1981 char *fullpath;
1982 struct hierarchy *h;
1983
1984 h = get_hierarchy(ops, "freezer");
1985 if (!h)
1986 return false;
1987
1988 fullpath = must_make_path(h->container_full_path, "freezer.state", NULL);
1989 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1990 free(fullpath);
1991 if (ret < 0)
1992 return false;
1993
1994 return true;
1995 }
1996
1997 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1998 const char *controller)
1999 {
2000 struct hierarchy *h;
2001
2002 h = get_hierarchy(ops, controller);
2003 if (!h) {
2004 WARN("Failed to find hierarchy for controller \"%s\"",
2005 controller ? controller : "(null)");
2006 return NULL;
2007 }
2008
2009 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
2010 }
2011
/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
 * which must be freed by the caller.
 * must_make_path() aborts on allocation failure, so this never returns NULL.
 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	return must_make_path(h->mountpoint, inpath, filename, NULL);
}
2021
2022 /* Technically, we're always at a delegation boundary here (This is especially
2023 * true when cgroup namespaces are available.). The reasoning is that in order
2024 * for us to have been able to start a container in the first place the root
2025 * cgroup must have been a leaf node. Now, either the container's init system
2026 * has populated the cgroup and kept it as a leaf node or it has created
2027 * subtrees. In the former case we will simply attach to the leaf node we
2028 * created when we started the container in the latter case we create our own
2029 * cgroup for the attaching process.
2030 */
2031 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2032 const char *lxcpath, const char *pidstr,
2033 size_t pidstr_len, const char *controller)
2034 {
2035 int ret;
2036 size_t len;
2037 int fret = -1, idx = 0;
2038 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
2039
2040 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2041 /* not running */
2042 if (!container_cgroup)
2043 return 0;
2044
2045 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2046 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2047 /* cgroup is populated */
2048 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
2049 if (ret < 0 && errno != EBUSY)
2050 goto on_error;
2051
2052 if (ret == 0)
2053 goto on_success;
2054
2055 free(full_path);
2056
2057 len = strlen(base_path) + STRLITERALLEN("/lxc-1000") +
2058 STRLITERALLEN("/cgroup-procs");
2059 full_path = must_alloc(len + 1);
2060 do {
2061 if (idx)
2062 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2063 base_path, idx);
2064 else
2065 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2066 if (ret < 0 || (size_t)ret >= len + 1)
2067 goto on_error;
2068
2069 ret = mkdir_p(full_path, 0755);
2070 if (ret < 0 && errno != EEXIST)
2071 goto on_error;
2072
2073 (void)strlcat(full_path, "/cgroup.procs", len + 1);
2074 ret = lxc_write_to_file(full_path, pidstr, len, false, 0666);
2075 if (ret == 0)
2076 goto on_success;
2077
2078 /* this is a non-leaf node */
2079 if (errno != EBUSY)
2080 goto on_error;
2081
2082 } while (++idx > 0 && idx < 1000);
2083
2084 on_success:
2085 if (idx < 1000)
2086 fret = 0;
2087
2088 on_error:
2089 free(base_path);
2090 free(container_cgroup);
2091 free(full_path);
2092
2093 return fret;
2094 }
2095
2096 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
2097 const char *lxcpath, pid_t pid)
2098 {
2099 int i, len, ret;
2100 char pidstr[25];
2101
2102 len = snprintf(pidstr, 25, "%d", pid);
2103 if (len < 0 || len >= 25)
2104 return false;
2105
2106 for (i = 0; ops->hierarchies[i]; i++) {
2107 char *path;
2108 char *fullpath = NULL;
2109 struct hierarchy *h = ops->hierarchies[i];
2110
2111 if (h->version == CGROUP2_SUPER_MAGIC) {
2112 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2113 h->controllers[0]);
2114 if (ret < 0)
2115 return false;
2116
2117 continue;
2118 }
2119
2120 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2121 /* not running */
2122 if (!path)
2123 continue;
2124
2125 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2126 free(path);
2127 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2128 if (ret < 0) {
2129 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2130 free(fullpath);
2131 return false;
2132 }
2133 free(fullpath);
2134 }
2135
2136 return true;
2137 }
2138
2139 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2140 * don't have a cgroup_data set up, so we ask the running container through the
2141 * commands API for the cgroup path.
2142 */
2143 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2144 char *value, size_t len, const char *name,
2145 const char *lxcpath)
2146 {
2147 int ret = -1;
2148 size_t controller_len;
2149 char *controller, *p, *path;
2150 struct hierarchy *h;
2151
2152 controller_len = strlen(filename);
2153 controller = alloca(controller_len + 1);
2154 (void)strlcpy(controller, filename, controller_len + 1);
2155
2156 p = strchr(controller, '.');
2157 if (p)
2158 *p = '\0';
2159
2160 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2161 /* not running */
2162 if (!path)
2163 return -1;
2164
2165 h = get_hierarchy(ops, controller);
2166 if (h) {
2167 char *fullpath;
2168
2169 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2170 ret = lxc_read_from_file(fullpath, value, len);
2171 free(fullpath);
2172 }
2173 free(path);
2174
2175 return ret;
2176 }
2177
2178 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2179 * don't have a cgroup_data set up, so we ask the running container through the
2180 * commands API for the cgroup path.
2181 */
2182 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2183 const char *filename, const char *value,
2184 const char *name, const char *lxcpath)
2185 {
2186 int ret = -1;
2187 size_t controller_len;
2188 char *controller, *p, *path;
2189 struct hierarchy *h;
2190
2191 controller_len = strlen(filename);
2192 controller = alloca(controller_len + 1);
2193 (void)strlcpy(controller, filename, controller_len + 1);
2194
2195 p = strchr(controller, '.');
2196 if (p)
2197 *p = '\0';
2198
2199 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2200 /* not running */
2201 if (!path)
2202 return -1;
2203
2204 h = get_hierarchy(ops, controller);
2205 if (h) {
2206 char *fullpath;
2207
2208 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2209 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2210 free(fullpath);
2211 }
2212 free(path);
2213
2214 return ret;
2215 }
2216
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
 * the output.
 */
static int convert_devpath(const char *invalue, char *dest)
{
	int n_parts;
	char *p, *path, type;
	unsigned long minor, major;
	struct stat sb;
	int ret = -EINVAL;
	char *mode = NULL;

	/* Work on a mutable copy; the parser NUL-splits it in place. */
	path = must_copy_string(invalue);

	/* Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
		if (*p != ' ')
			continue;
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		/* Skip any run of spaces between the path and the mode. */
		while (*p == ' ')
			p++;

		mode = p;

		/* Trailing space with no mode after it. */
		if (*p == '\0')
			goto out;
	}

	/* No mode field was given at all. */
	if (n_parts == 1)
		goto out;

	/* The device node must exist so its type and numbers can be read. */
	ret = stat(path, &sb);
	if (ret < 0)
		goto out;

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		type = 'b';
		break;
	case S_IFCHR:
		type = 'c';
		break;
	default:
		ERROR("Unsupported device type %i for \"%s\"", m, path);
		ret = -EINVAL;
		goto out;
	}

	major = MAJOR(sb.st_rdev);
	minor = MINOR(sb.st_rdev);
	/* @dest is documented as at least 50 bytes (see callers). */
	ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
	if (ret < 0 || ret >= 50) {
		ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
		      "chars)", type, major, minor, mode);
		ret = -ENAMETOOLONG;
		goto out;
	}
	ret = 0;

out:
	free(path);
	return ret;
}
2294
2295 /* Called from setup_limits - here we have the container's cgroup_data because
2296 * we created the cgroups.
2297 */
2298 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2299 const char *value)
2300 {
2301 size_t len;
2302 char *fullpath, *p;
2303 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2304 char converted_value[50];
2305 struct hierarchy *h;
2306 int ret = 0;
2307 char *controller = NULL;
2308
2309 len = strlen(filename);
2310 controller = alloca(len + 1);
2311 (void)strlcpy(controller, filename, len + 1);
2312
2313 p = strchr(controller, '.');
2314 if (p)
2315 *p = '\0';
2316
2317 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2318 ret = convert_devpath(value, converted_value);
2319 if (ret < 0)
2320 return ret;
2321 value = converted_value;
2322 }
2323
2324 h = get_hierarchy(ops, controller);
2325 if (!h) {
2326 ERROR("Failed to setup limits for the \"%s\" controller. "
2327 "The controller seems to be unused by \"cgfsng\" cgroup "
2328 "driver or not enabled on the cgroup hierarchy",
2329 controller);
2330 errno = ENOENT;
2331 return -ENOENT;
2332 }
2333
2334 fullpath = must_make_path(h->container_full_path, filename, NULL);
2335 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2336 free(fullpath);
2337 return ret;
2338 }
2339
/* Apply the configured legacy (cgroup v1) limits. Runs in two passes driven
 * by @do_devices: one pass applies only "devices.*" settings, the other pass
 * everything else. Returns true on success.
 */
static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
				     struct lxc_list *cgroup_settings,
				     bool do_devices)
{
	struct lxc_list *iterator, *next, *sorted_cgroup_settings;
	struct lxc_cgroup *cg;
	bool ret = false;

	if (lxc_list_empty(cgroup_settings))
		return true;

	/* The sorted list is a freshly allocated list of nodes pointing at
	 * the original elements; only the nodes are freed below.
	 */
	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
	if (!sorted_cgroup_settings)
		return false;

	lxc_list_for_each(iterator, sorted_cgroup_settings) {
		cg = iterator->elem;

		/* Select devices settings when do_devices is true, all other
		 * settings when it is false (!strncmp yields 0 or 1).
		 */
		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
				/* Device rules may legitimately be refused for
				 * unprivileged containers; warn and continue.
				 */
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					WARN("Failed to set \"%s\" to \"%s\"",
					     cg->subsystem, cg->value);
					continue;
				}
				/* Any other failure aborts the whole setup. */
				WARN("Failed to set \"%s\" to \"%s\"",
				     cg->subsystem, cg->value);
				goto out;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"",
			      cg->subsystem, cg->value);
		}
	}

	ret = true;
	INFO("Limits for the legacy cgroup hierarchies have been setup");
out:
	/* Free only the sorted list's nodes; the elements belong to the
	 * caller's cgroup_settings list.
	 */
	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
		lxc_list_del(iterator);
		free(iterator);
	}
	free(sorted_cgroup_settings);
	return ret;
}
2384
2385 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2386 struct lxc_list *cgroup_settings)
2387 {
2388 struct lxc_list *iterator;
2389 struct hierarchy *h = ops->unified;
2390
2391 if (lxc_list_empty(cgroup_settings))
2392 return true;
2393
2394 if (!h)
2395 return false;
2396
2397 lxc_list_for_each(iterator, cgroup_settings) {
2398 int ret;
2399 char *fullpath;
2400 struct lxc_cgroup *cg = iterator->elem;
2401
2402 fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
2403 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2404 free(fullpath);
2405 if (ret < 0) {
2406 SYSERROR("Failed to set \"%s\" to \"%s\"",
2407 cg->subsystem, cg->value);
2408 return false;
2409 }
2410 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2411 }
2412
2413 INFO("Limits for the unified cgroup hierarchy have been setup");
2414 return true;
2415 }
2416
2417 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2418 struct lxc_conf *conf,
2419 bool do_devices)
2420 {
2421 bool bret;
2422
2423 bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
2424 if (!bret)
2425 return false;
2426
2427 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2428 }
2429
2430 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2431 char **controllers)
2432 {
2433 char **cur_ctrl, **cur_use;
2434
2435 if (!ops->cgroup_use)
2436 return true;
2437
2438 for (cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2439 bool found = false;
2440
2441 for (cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2442 if (strcmp(*cur_use, *cur_ctrl) != 0)
2443 continue;
2444
2445 found = true;
2446 break;
2447 }
2448
2449 if (found)
2450 continue;
2451
2452 return false;
2453 }
2454
2455 return true;
2456 }
2457
2458 /* At startup, parse_hierarchies finds all the info we need about cgroup
2459 * mountpoints and current cgroups, and stores it in @d.
2460 */
2461 static bool cg_hybrid_init(struct cgroup_ops *ops, bool relative)
2462 {
2463 int ret;
2464 char *basecginfo;
2465 FILE *f;
2466 size_t len = 0;
2467 char *line = NULL;
2468 char **klist = NULL, **nlist = NULL;
2469
2470 /* Root spawned containers escape the current cgroup, so use init's
2471 * cgroups as our base in that case.
2472 */
2473 if (!relative && (geteuid() == 0))
2474 basecginfo = read_file("/proc/1/cgroup");
2475 else
2476 basecginfo = read_file("/proc/self/cgroup");
2477 if (!basecginfo)
2478 return false;
2479
2480 ret = get_existing_subsystems(&klist, &nlist);
2481 if (ret < 0) {
2482 ERROR("Failed to retrieve available legacy cgroup controllers");
2483 free(basecginfo);
2484 return false;
2485 }
2486
2487 f = fopen("/proc/self/mountinfo", "r");
2488 if (!f) {
2489 ERROR("Failed to open \"/proc/self/mountinfo\"");
2490 free(basecginfo);
2491 return false;
2492 }
2493
2494 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2495
2496 while (getline(&line, &len, f) != -1) {
2497 int type;
2498 bool writeable;
2499 struct hierarchy *new;
2500 char *base_cgroup = NULL, *mountpoint = NULL;
2501 char **controller_list = NULL;
2502
2503 type = get_cgroup_version(line);
2504 if (type == 0)
2505 continue;
2506
2507 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2508 continue;
2509
2510 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2511 if (type == CGROUP2_SUPER_MAGIC)
2512 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2513 else if (type == CGROUP_SUPER_MAGIC)
2514 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2515 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2516 if (type == CGROUP_SUPER_MAGIC)
2517 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2518 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2519 if (type == CGROUP2_SUPER_MAGIC)
2520 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2521 }
2522
2523 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2524 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2525 continue;
2526
2527 if (type == CGROUP_SUPER_MAGIC)
2528 if (controller_list_is_dup(ops->hierarchies, controller_list))
2529 goto next;
2530
2531 mountpoint = cg_hybrid_get_mountpoint(line);
2532 if (!mountpoint) {
2533 ERROR("Failed parsing mountpoint from \"%s\"", line);
2534 goto next;
2535 }
2536
2537 if (type == CGROUP_SUPER_MAGIC)
2538 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2539 else
2540 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2541 if (!base_cgroup) {
2542 ERROR("Failed to find current cgroup");
2543 goto next;
2544 }
2545
2546 trim(base_cgroup);
2547 prune_init_scope(base_cgroup);
2548 if (type == CGROUP2_SUPER_MAGIC)
2549 writeable = test_writeable_v2(mountpoint, base_cgroup);
2550 else
2551 writeable = test_writeable_v1(mountpoint, base_cgroup);
2552 if (!writeable)
2553 goto next;
2554
2555 if (type == CGROUP2_SUPER_MAGIC) {
2556 char *cgv2_ctrl_path;
2557
2558 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2559 "cgroup.controllers",
2560 NULL);
2561
2562 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2563 free(cgv2_ctrl_path);
2564 if (!controller_list) {
2565 controller_list = cg_unified_make_empty_controller();
2566 TRACE("No controllers are enabled for "
2567 "delegation in the unified hierarchy");
2568 }
2569 }
2570
2571 /* Exclude all controllers that cgroup use does not want. */
2572 if (!cgroup_use_wants_controllers(ops, controller_list))
2573 goto next;
2574
2575 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2576 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
2577 ops->unified = new;
2578
2579 continue;
2580
2581 next:
2582 free_string_list(controller_list);
2583 free(mountpoint);
2584 free(base_cgroup);
2585 }
2586
2587 free_string_list(klist);
2588 free_string_list(nlist);
2589
2590 free(basecginfo);
2591
2592 fclose(f);
2593 free(line);
2594
2595 TRACE("Writable cgroup hierarchies:");
2596 lxc_cgfsng_print_hierarchies(ops);
2597
2598 /* verify that all controllers in cgroup.use and all crucial
2599 * controllers are accounted for
2600 */
2601 if (!all_controllers_found(ops))
2602 return false;
2603
2604 return true;
2605 }
2606
2607 static int cg_is_pure_unified(void)
2608 {
2609
2610 int ret;
2611 struct statfs fs;
2612
2613 ret = statfs("/sys/fs/cgroup", &fs);
2614 if (ret < 0)
2615 return -ENOMEDIUM;
2616
2617 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2618 return CGROUP2_SUPER_MAGIC;
2619
2620 return 0;
2621 }
2622
/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy.
 * Returns a newly allocated, trimmed path (caller frees) or NULL on error.
 */
static char *cg_unified_get_current_cgroup(bool relative)
{
	char *contents, *entry, *copy = NULL;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroup as the base in that case.
	 */
	if (!relative && geteuid() == 0)
		contents = read_file("/proc/1/cgroup");
	else
		contents = read_file("/proc/self/cgroup");
	if (!contents)
		return NULL;

	/* The unified hierarchy's entry is the one of the form "0::/...";
	 * skip the "0::" prefix so the copy starts at the path.
	 */
	entry = strstr(contents, "0::/");
	if (entry) {
		copy = copy_to_eol(entry + 3);
		if (copy)
			trim(copy);
	}

	free(contents);
	return copy;
}
2652
2653 static int cg_unified_init(struct cgroup_ops *ops, bool relative)
2654 {
2655 int ret;
2656 char *mountpoint, *subtree_path;
2657 char **delegatable;
2658 char *base_cgroup = NULL;
2659
2660 ret = cg_is_pure_unified();
2661 if (ret == -ENOMEDIUM)
2662 return -ENOMEDIUM;
2663
2664 if (ret != CGROUP2_SUPER_MAGIC)
2665 return 0;
2666
2667 base_cgroup = cg_unified_get_current_cgroup(relative);
2668 if (!base_cgroup)
2669 return -EINVAL;
2670 prune_init_scope(base_cgroup);
2671
2672 /* We assume that we have already been given controllers to delegate
2673 * further down the hierarchy. If not it is up to the user to delegate
2674 * them to us.
2675 */
2676 mountpoint = must_copy_string("/sys/fs/cgroup");
2677 subtree_path = must_make_path(mountpoint, base_cgroup,
2678 "cgroup.subtree_control", NULL);
2679 delegatable = cg_unified_get_controllers(subtree_path);
2680 free(subtree_path);
2681 if (!delegatable)
2682 delegatable = cg_unified_make_empty_controller();
2683 if (!delegatable[0])
2684 TRACE("No controllers are enabled for delegation");
2685
2686 /* TODO: If the user requested specific controllers via lxc.cgroup.use
2687 * we should verify here. The reason I'm not doing it right is that I'm
2688 * not convinced that lxc.cgroup.use will be the future since it is a
2689 * global property. I much rather have an option that lets you request
2690 * controllers per container.
2691 */
2692
2693 add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2694
2695 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2696 return CGROUP2_SUPER_MAGIC;
2697 }
2698
2699 static bool cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2700 {
2701 int ret;
2702 const char *tmp;
2703 bool relative = conf->cgroup_meta.relative;
2704
2705 tmp = lxc_global_config_value("lxc.cgroup.use");
2706 if (tmp) {
2707 char *chop, *cur, *pin;
2708
2709 pin = must_copy_string(tmp);
2710 chop = pin;
2711
2712 lxc_iterate_parts(cur, chop, ",") {
2713 must_append_string(&ops->cgroup_use, cur);
2714 }
2715
2716 free(pin);
2717 }
2718
2719 ret = cg_unified_init(ops, relative);
2720 if (ret < 0)
2721 return false;
2722
2723 if (ret == CGROUP2_SUPER_MAGIC)
2724 return true;
2725
2726 return cg_hybrid_init(ops, relative);
2727 }
2728
2729 __cgfsng_ops static bool cgfsng_data_init(struct cgroup_ops *ops)
2730 {
2731 const char *cgroup_pattern;
2732
2733 /* copy system-wide cgroup information */
2734 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2735 if (!cgroup_pattern) {
2736 /* lxc.cgroup.pattern is only NULL on error. */
2737 ERROR("Failed to retrieve cgroup pattern");
2738 return false;
2739 }
2740 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2741 ops->monitor_pattern = MONITOR_CGROUP;
2742
2743 return true;
2744 }
2745
2746 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2747 {
2748 struct cgroup_ops *cgfsng_ops;
2749
2750 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2751 if (!cgfsng_ops)
2752 return NULL;
2753
2754 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2755 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2756
2757 if (!cg_init(cgfsng_ops, conf)) {
2758 free(cgfsng_ops);
2759 return NULL;
2760 }
2761
2762 cgfsng_ops->data_init = cgfsng_data_init;
2763 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
2764 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
2765 cgfsng_ops->monitor_create = cgfsng_monitor_create;
2766 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
2767 cgfsng_ops->payload_create = cgfsng_payload_create;
2768 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2769 cgfsng_ops->escape = cgfsng_escape;
2770 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2771 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2772 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2773 cgfsng_ops->get = cgfsng_get;
2774 cgfsng_ops->set = cgfsng_set;
2775 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2776 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2777 cgfsng_ops->driver = "cgfsng";
2778 cgfsng_ops->version = "1.0.0";
2779 cgfsng_ops->attach = cgfsng_attach;
2780 cgfsng_ops->chown = cgfsng_chown;
2781 cgfsng_ops->mount = cgfsng_mount;
2782 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2783
2784 return cgfsng_ops;
2785 }