]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cgfsng: do MS_REMOUNT
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
c8bf519d 50#include <linux/types.h>
51#include <linux/kdev_t.h>
52
b635e92d 53#include "caps.h"
ccb4cabe 54#include "cgroup.h"
6328fd9c 55#include "cgroup_utils.h"
ccb4cabe 56#include "commands.h"
43654d34 57#include "conf.h"
a54694f8 58#include "log.h"
43654d34 59#include "storage/storage.h"
a54694f8 60#include "utils.h"
ccb4cabe
SH
61
62lxc_log_define(lxc_cgfsng, lxc);
63
64static struct cgroup_ops cgfsng_ops;
65
ccb4cabe
SH
/*
 * Descriptor of one mounted cgroup hierarchy.
 *
 * @controllers : either NULL, or a NULL-terminated list of all the
 *                controllers co-mounted on this hierarchy.
 * @mountpoint  : the mountpoint we will use. It is either
 *                /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist.
 * @base_cgroup : the cgroup under which the container cgroup path is
 *                created. This is either the caller's cgroup (if not root)
 *                or init's cgroup (if root).
 * @fullcgpath  : initialised to NULL when the hierarchy is added; presumably
 *                the full path of the container's cgroup in this hierarchy
 *                once it has been created (TODO confirm against the users).
 * @version     : the cgroup filesystem type this hierarchy was detected
 *                with, as handed to add_hierarchy().
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
	int version;
};
83
84/*
85 * The cgroup data which is attached to the lxc_handler.
43654d34
CB
86 * @cgroup_pattern : A copy of the lxc.cgroup.pattern
87 * @container_cgroup : If not null, the cgroup which was created for the
88 * container. For each hierarchy, it is created under the
89 * @hierarchy->base_cgroup directory. Relative to the
90 * base_cgroup it is the same for all hierarchies.
91 * @name : The name of the container.
92 * @cgroup_meta : A copy of the container's cgroup information. This
93 * overrides @cgroup_pattern.
ccb4cabe
SH
94 */
95struct cgfsng_handler_data {
ccb4cabe 96 char *cgroup_pattern;
1a0e70ac
CB
97 char *container_cgroup; /* cgroup we created for the container */
98 char *name; /* container name */
43654d34
CB
99 /* per-container cgroup information */
100 struct lxc_cgroup cgroup_meta;
d6337a5f 101 cgroup_layout_t cgroup_layout;
ccb4cabe
SH
102};
103
457ca9aa
SH
104/*
105 * @hierarchies - a NULL-terminated array of struct hierarchy, one per
d6337a5f
CB
106 * legacy hierarchy. No duplicates. First sufficient, writeable
107 * mounted hierarchy wins
457ca9aa
SH
108 */
109struct hierarchy **hierarchies;
d6337a5f
CB
110struct hierarchy *unified;
111cgroup_layout_t cgroup_layout;
457ca9aa
SH
112
113/*
114 * @cgroup_use - a copy of the lxc.cgroup.use
115 */
116char *cgroup_use;
117
e4aeecf5
CB
/*
 * @lxc_cgfsng_debug : whether the cgfsng driver prints debug information
 *                     to stdout.
 */
static bool lxc_cgfsng_debug;

/*
 * printf()-style debug output prefixed with "cgfsng: ". Expands to a single
 * statement and prints nothing unless lxc_cgfsng_debug is set.
 */
#define CGFSNG_DEBUG(format, ...)                                  \
	do {                                                       \
		if (lxc_cgfsng_debug)                              \
			printf("cgfsng: " format, ##__VA_ARGS__);  \
	} while (0)
128
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated list @clist and then the list
 * itself. Passing a NULL @clist is a no-op.
 */
static void free_string_list(char **clist)
{
	char **it;

	if (!clist)
		return;

	for (it = clist; *it; it++)
		free(*it);

	free(clist);
}
139
ccb4cabe
SH
/*
 * Allocate @sz bytes. The allocation is delegated to must_realloc() with a
 * NULL source pointer, so this is not expected to fail.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
145
ccb4cabe
SH
/*
 * Special case: return a newly allocated copy of @entry with "name="
 * prepended, i.e. turn "systemd" into "name=systemd". Do not fail.
 */
static char *must_prefix_named(char *entry)
{
	/* Five bytes for "name=" plus one for the trailing NUL byte. */
	size_t size = strlen(entry) + 6;
	char *prefixed = must_alloc(size);

	snprintf(prefixed, size, "name=%s", entry);
	return prefixed;
}
160
/*
 * Grow the NULL-terminated pointer array *@list by one slot and terminate
 * it with NULL again. *@list may be NULL, in which case a fresh array is
 * created. Do not fail.
 *
 * Returns the index of the slot that is now free for the caller to fill,
 * i.e. the entry right before the new NULL terminator.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list) {
		while ((*list)[count])
			count++;
	}

	/* One slot for the new entry, one for the terminator. */
	*list = must_realloc(*list, (count + 2) * sizeof(**list));
	(*list)[count + 1] = NULL;

	return count;
}
178
/*
 * Check whether @entry equals one of the strings in the NULL-terminated
 * array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	for (it = list; it && *it; it++) {
		if (!strcmp(*it, entry))
			return true;
	}

	return false;
}
196
/*
 * Append a copy of @entry to the controller list *@clist. Do not fail.
 * *@clist must be NULL the first time we are called; the last entry of the
 * list is always NULL.
 *
 * Named subsystems are handled here as well:
 * - an entry already starting with "name=" or listed in the kernel
 *   subsystem list @klist is copied verbatim;
 * - any other entry is treated as a named subsystem and gets "name="
 *   prepended;
 * - an entry listed in both @klist and the named subsystem list @nlist is
 *   refused (nothing is appended) because we cannot tell which one we have.
 *   (TODO - we could work around this in some cases by just remounting to be
 *   unambiguous, or by comparing mountpoint contents with current cgroup)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	int slot;
	char *copy;
	bool is_kernel = string_in_list(klist, entry);

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		copy = must_copy_string(entry);
	else
		copy = must_prefix_named(entry);

	(*clist)[slot] = copy;
}
232
ccb4cabe
SH
233static void free_handler_data(struct cgfsng_handler_data *d)
234{
ccb4cabe
SH
235 free(d->cgroup_pattern);
236 free(d->container_cgroup);
237 free(d->name);
43654d34
CB
238 if (d->cgroup_meta.dir)
239 free(d->cgroup_meta.dir);
240 if (d->cgroup_meta.controllers)
241 free(d->cgroup_meta.controllers);
ccb4cabe
SH
242 free(d);
243}
244
245/*
246 * Given a handler's cgroup data, return the struct hierarchy for the
247 * controller @c, or NULL if there is none.
248 */
457ca9aa 249struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
250{
251 int i;
252
457ca9aa 253 if (!hierarchies)
ccb4cabe 254 return NULL;
d6337a5f 255
457ca9aa 256 for (i = 0; hierarchies[i]; i++) {
d6337a5f
CB
257 if (!c) {
258 /* This is the empty unified hierarchy. */
259 if (hierarchies[i]->controllers &&
260 !hierarchies[i]->controllers[0])
261 return hierarchies[i];
262
263 return NULL;
264 }
265
457ca9aa
SH
266 if (string_in_list(hierarchies[i]->controllers, c))
267 return hierarchies[i];
ccb4cabe 268 }
d6337a5f 269
ccb4cabe
SH
270 return NULL;
271}
272
a54694f8
CB
/* Buffers grown by batch_realloc() are sized in multiples of this. */
#define BATCH_SIZE 50

/*
 * Make sure *@mem can hold @newlen bytes, given that it was sized for
 * @oldlen bytes. The buffer is only reallocated when it does not exist yet
 * or when @newlen falls into a later batch than @oldlen.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int batches_needed = (newlen / BATCH_SIZE) + 1;
	int batches_held = (oldlen / BATCH_SIZE) + 1;

	if (*mem && batches_needed <= batches_held)
		return;

	*mem = must_realloc(*mem, batches_needed * BATCH_SIZE);
}
283
/*
 * Append the @newlen bytes of @new, plus its trailing NUL byte, to *@dest
 * which currently holds @oldlen bytes. *@dest is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* Room for the old content, the new content and the NUL byte. */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
292
/*
 * Slurp in a whole file.
 *
 * Returns a newly allocated, NUL-terminated buffer holding the content of
 * @fnam which the caller must free. Returns NULL if the file cannot be
 * opened or if no line could be read from it.
 */
static char *read_file(const char *fnam)
{
	FILE *f;
	char *line = NULL, *buf = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen; /* getline() returns ssize_t, not int */

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, (size_t)linelen);
		fulllen += (size_t)linelen;
	}

	fclose(f);
	free(line);
	return buf;
}
312
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below treat @bitarr as an array of 32-bit words. The mask is
 * built from an unsigned 32-bit one so that bit 31 can be addressed without
 * shifting into the sign bit of an int.
 */

/* Set bit number @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
332
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * @buf   : comma-separated list of cpu numbers and "first-last" ranges. It
 *          is tokenized in place and therefore modified.
 * @nbits : number of bits the mask must be able to hold; every cpu number
 *          in @buf must be smaller than this.
 *
 * Returns a newly allocated bit array the caller must free, or NULL if the
 * allocation fails or @buf contains an invalid, reversed or out-of-range
 * entry. Tokens without any digits (e.g. a lone trailing newline) are
 * skipped instead of being mistaken for cpu 0.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token;
	char *saveptr = NULL;
	size_t arrlen = BITS_TO_LONGS(nbits);
	uint32_t *bitarr = calloc(arrlen, sizeof(uint32_t));

	if (!bitarr)
		return NULL;

	for (; (token = strtok_r(buf, ",", &saveptr)); buf = NULL) {
		char *endp, *range;
		unsigned long first, last;

		errno = 0;
		first = strtoul(token, &endp, 0);
		if (endp == token)
			continue; /* nothing numeric in this token */
		if (errno != 0)
			goto on_error;

		last = first;
		range = strchr(token, '-');
		if (range) {
			errno = 0;
			last = strtoul(range + 1, &endp, 0);
			if (endp == range + 1 || errno != 0)
				goto on_error;
		}

		/* Reject reversed ranges and cpus the mask cannot hold. */
		if (first > last || last >= nbits)
			goto on_error;

		for (; first <= last; first++)
			set_bit((unsigned)first, bitarr);
	}

	return bitarr;

on_error:
	free(bitarr);
	return NULL;
}
374
a54694f8
CB
375/* Turn cpumask into simple, comma-separated cpulist. */
376static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
377{
378 size_t i;
379 int ret;
eab15c1e 380 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
381 char **cpulist = NULL;
382
383 for (i = 0; i <= nbits; i++) {
384 if (is_set(i, bitarr)) {
eab15c1e
CB
385 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
386 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
387 lxc_free_array((void **)cpulist, free);
388 return NULL;
389 }
390 if (lxc_append_string(&cpulist, numstr) < 0) {
391 lxc_free_array((void **)cpulist, free);
392 return NULL;
393 }
394 }
395 }
396 return lxc_string_join(",", (const char **)cpulist, false);
397}
398
/*
 * Return the last cpu number mentioned in @cpulist, a list such as
 * "0,2-7". The number is taken from whatever follows the last ',' or '-'
 * (whichever comes later), or from the start of the string if it contains
 * neither. Since cpulists are expected to be sorted this is the highest
 * cpu number.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *comma, *dash;
	char *last = cpulist;
	size_t cpus = 0;

	comma = strrchr(cpulist, ',');
	dash = strrchr(cpulist, '-');

	/* Only compare the two pointers when both separators were found. */
	if (comma && (!dash || comma > dash))
		last = comma + 1;
	else if (dash)
		last = dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
433
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"

/*
 * Initialise "cpuset.cpus" of the cgroup directory @path from its parent
 * directory, leaving out isolated cpus.
 *
 * @path           : cgroup directory; its parent's "cpuset.cpus" is read.
 *                   @path is modified temporarily but restored before any
 *                   return.
 * @am_initialized : whether the cgroup has already been set up by someone
 *                   else. If so, and there are no isolated cpus to remove,
 *                   nothing is written.
 *
 * The parent's cpus are written unchanged when the system exposes no
 * isolated cpus and the cgroup is not initialised yet. Otherwise the
 * isolated cpus are removed from the parent's set and the result is written
 * only if at least one cpu was removed.
 *
 * Returns true on success, false on any error.
 */
static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/* Cut off the last component only long enough to build the parent's
	 * path, so that @path is intact on every exit from this function.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is the number of bits both masks hold. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1. */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset.");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once. */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
571
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * @path is modified temporarily to derive the parent directory but is
 * restored before any return. Returns true if the value was read from the
 * parent and written to @path/@file, false otherwise.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	/* Cut off the last component only while building the parent's path. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the value: it is printed with %s if the write fails. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
609
610/*
611 * Initialize the cpuset hierarchy in first directory of @gname and
612 * set cgroup.clone_children so that children inherit settings.
613 * Since the h->base_path is populated by init or ourselves, we know
614 * it is already initialized.
615 */
a3926f6a 616static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
617{
618 char *cgpath, *clonechildrenpath, v, *slash;
619
620 if (!string_in_list(h->controllers, "cpuset"))
621 return true;
622
623 if (*cgname == '/')
624 cgname++;
625 slash = strchr(cgname, '/');
626 if (slash)
627 *slash = '\0';
628
629 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
630 if (slash)
631 *slash = '/';
632 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
633 SYSERROR("Failed to create '%s'", cgpath);
634 free(cgpath);
635 return false;
636 }
6f9584d8 637
e3a3fecf 638 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
639 /* unified hierarchy doesn't have clone_children */
640 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
641 free(clonechildrenpath);
642 free(cgpath);
643 return true;
644 }
645 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
646 SYSERROR("Failed to read '%s'", clonechildrenpath);
647 free(clonechildrenpath);
648 free(cgpath);
649 return false;
650 }
651
a54694f8 652 /* Make sure any isolated cpus are removed from cpuset.cpus. */
a3926f6a 653 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
6f9584d8
CB
654 SYSERROR("Failed to remove isolated cpus.");
655 free(clonechildrenpath);
656 free(cgpath);
a54694f8 657 return false;
6f9584d8 658 }
a54694f8 659
e3a3fecf 660 if (v == '1') { /* already set for us by someone else */
6f9584d8 661 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
662 free(clonechildrenpath);
663 free(cgpath);
664 return true;
665 }
666
667 /* copy parent's settings */
a54694f8 668 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 669 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
670 free(cgpath);
671 free(clonechildrenpath);
672 return false;
673 }
674 free(cgpath);
675
676 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
677 /* Set clone_children so children inherit our settings */
678 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
679 free(clonechildrenpath);
680 return false;
681 }
682 free(clonechildrenpath);
683 return true;
684}
685
ccb4cabe
SH
/*
 * Given two NULL-terminated lists of strings, return true if any string
 * appears in both. If either list is NULL there is no intersection.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it;

	if (l1 == NULL || l2 == NULL)
		return false;

	for (it = l1; *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
703
704/*
705 * For a null-terminated list of controllers @clist, return true if any of
706 * those controllers is already listed the null-terminated list of
707 * hierarchies @hlist. Realistically, if one is present, all must be present.
708 */
709static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
710{
711 int i;
712
713 if (!hlist)
714 return false;
715 for (i = 0; hlist[i]; i++)
716 if (controller_lists_intersect(hlist[i]->controllers, clist))
717 return true;
718 return false;
719
720}
721
722/*
723 * Return true if the controller @entry is found in the null-terminated
724 * list of hierarchies @hlist
725 */
726static bool controller_found(struct hierarchy **hlist, char *entry)
727{
728 int i;
d6337a5f 729
ccb4cabe
SH
730 if (!hlist)
731 return false;
732
733 for (i = 0; hlist[i]; i++)
734 if (string_in_list(hlist[i]->controllers, entry))
735 return true;
d6337a5f 736
ccb4cabe
SH
737 return false;
738}
739
740/*
c30b61c3
SH
741 * Return true if all of the controllers which we require have been found.
742 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 743 */
457ca9aa 744static bool all_controllers_found(void)
ccb4cabe
SH
745{
746 char *p, *saveptr = NULL;
457ca9aa 747 struct hierarchy ** hlist = hierarchies;
ccb4cabe 748
ccb4cabe 749 if (!controller_found(hlist, "freezer")) {
65d78313 750 CGFSNG_DEBUG("No freezer controller mountpoint found\n");
ccb4cabe
SH
751 return false;
752 }
753
457ca9aa 754 if (!cgroup_use)
ccb4cabe 755 return true;
c2712f64 756
457ca9aa 757 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
758 p = strtok_r(NULL, ",", &saveptr)) {
759 if (!controller_found(hlist, p)) {
65d78313 760 CGFSNG_DEBUG("No %s controller mountpoint found\n", p);
ccb4cabe
SH
761 return false;
762 }
763 }
c2712f64 764
ccb4cabe
SH
765 return true;
766}
767
ccb4cabe
SH
768/*
769 * Get the controllers from a mountinfo line
770 * There are other ways we could get this info. For lxcfs, field 3
771 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
772 * options. But we simply assume that the mountpoint must be
773 * /sys/fs/cgroup/controller-list
774 */
a3926f6a
CB
775static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
776 int type)
ccb4cabe 777{
6328fd9c 778 /* the fourth field is /sys/fs/cgroup/comma-delimited-controller-list */
ccb4cabe 779 int i;
411ac6d8 780 char *dup, *p2, *tok;
d6337a5f 781 char *p = line, *saveptr = NULL, *sep = ",";
411ac6d8 782 char **aret = NULL;
6328fd9c 783
ccb4cabe 784 for (i = 0; i < 4; i++) {
235f1815 785 p = strchr(p, ' ');
ccb4cabe
SH
786 if (!p)
787 return NULL;
788 p++;
789 }
a55f31bd 790
ccb4cabe
SH
791 /* note - if we change how mountinfo works, then our caller
792 * will need to verify /sys/fs/cgroup/ in this field */
c2712f64 793 if (strncmp(p, "/sys/fs/cgroup/", 15)) {
65d78313 794 CGFSNG_DEBUG("Found hierarchy not under /sys/fs/cgroup: \"%s\"\n", p);
ccb4cabe 795 return NULL;
5059aae9 796 }
d6337a5f 797
ccb4cabe 798 p += 15;
235f1815 799 p2 = strchr(p, ' ');
ccb4cabe 800 if (!p2) {
65d78313 801 CGFSNG_DEBUG("Corrupt mountinfo\n");
ccb4cabe
SH
802 return NULL;
803 }
804 *p2 = '\0';
6328fd9c 805
d6337a5f
CB
806 if (type == CGROUP_SUPER_MAGIC) {
807 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will
808 * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
809 */
810 dup = strdup(p);
811 if (!dup)
812 return NULL;
813
814 for (tok = strtok_r(dup, sep, &saveptr); tok;
815 tok = strtok_r(NULL, sep, &saveptr))
816 must_append_controller(klist, nlist, &aret, tok);
817
818 free(dup);
411ac6d8 819 }
d6337a5f
CB
820 *p2 = ' ';
821 return aret;
822}
411ac6d8 823
d6337a5f
CB
/*
 * Build a controller list that exists but has no entries. The slot that
 * append_null_to_list() hands out is itself set to NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot = append_null_to_list((void ***)&empty);

	empty[slot] = NULL;
	return empty;
}
833
/*
 * Read @file and return its whitespace-separated words (split on space, tab
 * and newline) as a NULL-terminated list of newly allocated strings.
 *
 * Returns NULL if the file could not be read or contains no word.
 */
static char **cg_unified_get_controllers(const char *file)
{
	char *content, *word;
	char *saveptr = NULL, *sep = " \t\n";
	char **list = NULL;

	content = read_file(file);
	if (!content)
		return NULL;

	word = strtok_r(content, sep, &saveptr);
	while (word) {
		int slot = append_null_to_list((void ***)&list);

		list[slot] = must_copy_string(word);
		word = strtok_r(NULL, sep, &saveptr);
	}

	free(content);
	return list;
}
857
d6337a5f
CB
858static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
859 char *base_cgroup, int type)
ccb4cabe
SH
860{
861 struct hierarchy *new;
862 int newentry;
863
864 new = must_alloc(sizeof(*new));
865 new->controllers = clist;
866 new->mountpoint = mountpoint;
867 new->base_cgroup = base_cgroup;
868 new->fullcgpath = NULL;
d6337a5f 869 new->version = type;
6328fd9c 870
457ca9aa
SH
871 newentry = append_null_to_list((void ***)&hierarchies);
872 hierarchies[newentry] = new;
d6337a5f 873 return new;
ccb4cabe
SH
874}
875
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * The mountpoint is the field reached after skipping four spaces and must
 * start with "/sys/fs/cgroup/"; otherwise NULL is returned.
 *
 * Note that @line is cut off right after the mountpoint field and is not
 * restored.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int skipped;
	size_t len;
	char *field_end, *copy;
	char *field = line;

	for (skipped = 0; skipped < 4; skipped++) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
	}

	if (strncmp(field, "/sys/fs/cgroup/", 15) != 0)
		return NULL;

	field_end = strchr(field + 15, ' ');
	if (!field_end)
		return NULL;
	*field_end = '\0';

	len = strlen(field);
	copy = must_alloc(len + 1);
	/* Copies the mountpoint including its terminating NUL byte. */
	memcpy(copy, field, len + 1);

	return copy;
}
909
/*
 * Given a multi-line string, return a newly allocated, NUL-terminated copy
 * of the current line, without its newline. Returns NULL if @p contains no
 * newline.
 */
static char *copy_to_eol(char *p)
{
	size_t len;
	char *line;
	char *newline = strchr(p, '\n');

	if (!newline)
		return NULL;

	len = (size_t)(newline - p);
	line = must_alloc(len + 1);
	memcpy(line, p, len);
	line[len] = '\0';

	return line;
}
928
/*
 * @cgline: pointer to the character after the first ':' in a line of a
 * \n-terminated /proc/self/cgroup file. Check whether controller @c is
 * present in the comma-separated controller list that runs up to the next
 * ':'.
 *
 * The list is scanned in place: nothing is copied and @cgline is not
 * modified. Empty list entries never match. Returns false if @cgline
 * contains no ':'.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *list_end, *seg;
	size_t wanted;

	list_end = strchr(cgline, ':');
	if (!list_end)
		return false;

	wanted = strlen(c);

	for (seg = cgline; seg < list_end;) {
		const char *comma = memchr(seg, ',', (size_t)(list_end - seg));
		const char *seg_end = comma ? comma : list_end;
		size_t seg_len = (size_t)(seg_end - seg);

		if (seg_len > 0 && seg_len == wanted &&
		    memcmp(seg, c, wanted) == 0)
			return true;

		/* Step over the separator that ended this entry. */
		seg = seg_end + 1;
	}

	return false;
}
956
957/*
958 * @basecginfo is a copy of /proc/$$/cgroup. Return the current
959 * cgroup for @controller
960 */
a3926f6a 961static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller, int type)
ccb4cabe
SH
962{
963 char *p = basecginfo;
6328fd9c 964
d6337a5f
CB
965 for (;;) {
966 bool is_cgv2_base_cgroup = false;
967
6328fd9c 968 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
969 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
970 is_cgv2_base_cgroup = true;
ccb4cabe 971
235f1815 972 p = strchr(p, ':');
ccb4cabe
SH
973 if (!p)
974 return NULL;
975 p++;
d6337a5f
CB
976
977 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 978 p = strchr(p, ':');
ccb4cabe
SH
979 if (!p)
980 return NULL;
981 p++;
982 return copy_to_eol(p);
983 }
984
235f1815 985 p = strchr(p, '\n');
ccb4cabe
SH
986 if (!p)
987 return NULL;
988 p++;
989 }
990}
991
ccb4cabe
SH
/*
 * Append a newly allocated copy of @entry to the NULL-terminated string
 * list *@list, growing the list by one slot.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
1000
/*
 * Parse /proc/self/cgroup and collect the subsystems listed there:
 * entries starting with "name=" are appended to *@nlist, all others to
 * *@klist. Lines without two ':' separators are skipped.
 *
 * Returns 0 on success and -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *ctrls, *ctrls_end, *tok, *saveptr = NULL;

		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		ctrls_end = strchr(ctrls, ':');
		if (!ctrls_end)
			continue;
		*ctrls_end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (ctrls_end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		tok = strtok_r(ctrls, ",", &saveptr);
		while (tok) {
			bool named = strncmp(tok, "name=", 5) == 0;

			must_append_string(named ? nlist : klist, tok);
			tok = strtok_r(NULL, ",", &saveptr);
		}
	}

	free(line);
	fclose(f);

	return 0;
}
1047
/*
 * Strip trailing newlines from @s in place. The first character is never
 * removed, so a string consisting of a single newline is left as it is.
 */
static void trim(char *s)
{
	char *end;

	for (end = s + strlen(s); end - s > 1 && end[-1] == '\n'; end--)
		end[-1] = '\0';
}
1054
e4aeecf5
CB
1055static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
1056{
1057 printf("Cgroup information:\n");
1058 printf(" container name: %s\n", d->name ? d->name : "(null)");
1059 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
43654d34
CB
1060 printf(" lxc.cgroup.pattern: %s\n",
1061 d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1062 printf(" lxc.cgroup.dir: %s\n",
1063 d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
1064 printf(" cgroup: %s\n",
1065 d->container_cgroup ? d->container_cgroup : "(null)");
e4aeecf5
CB
1066}
1067
1068static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1069{
a7b0cc4c 1070 struct hierarchy **it;
ccb4cabe 1071 int i;
41c33dbe 1072
457ca9aa 1073 if (!hierarchies) {
c2712f64 1074 printf(" No hierarchies found\n");
ccb4cabe
SH
1075 return;
1076 }
e4aeecf5 1077 printf(" Hierarchies:\n");
a7b0cc4c
CB
1078 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1079 char **cit;
ccb4cabe 1080 int j;
c2712f64
CB
1081 printf(" %d: base_cgroup: %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1082 printf(" mountpoint: %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
e4aeecf5 1083 printf(" controllers:\n");
a7b0cc4c 1084 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1085 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1086 }
1087}
41c33dbe 1088
a3926f6a
CB
/*
 * Print @basecginfo followed by the kernel subsystems in @klist and the
 * named subsystems in @nlist to stdout. Both lists are NULL-terminated and
 * either may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					       char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	for (idx = 0; klist && klist[idx]; idx++)
		printf("kernel subsystem %d: %s\n", idx, klist[idx]);

	for (idx = 0; nlist && nlist[idx]; idx++)
		printf("named subsystem %d: %s\n", idx, nlist[idx]);
}
ccb4cabe 1103
e4aeecf5
CB
/*
 * Print all debug information of the driver to stdout: first the handler
 * data @d, then the known hierarchies.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1109
ccb4cabe
SH
1110/*
1111 * At startup, parse_hierarchies finds all the info we need about
1112 * cgroup mountpoints and current cgroups, and stores it in @d.
1113 */
a3926f6a 1114static bool cg_hybrid_init(void)
ccb4cabe 1115{
d6337a5f
CB
1116 int ret;
1117 char *basecginfo;
1118 bool will_escape;
ccb4cabe 1119 FILE *f;
ccb4cabe 1120 size_t len = 0;
d6337a5f
CB
1121 char *line = NULL;
1122 char **klist = NULL, **nlist = NULL;
ccb4cabe 1123
d30ec4cb
SH
1124 /*
1125 * Root spawned containers escape the current cgroup, so use init's
1126 * cgroups as our base in that case.
1127 */
d6337a5f
CB
1128 will_escape = (geteuid() == 0);
1129 if (will_escape)
ccb4cabe 1130 basecginfo = read_file("/proc/1/cgroup");
d6337a5f
CB
1131 else
1132 basecginfo = read_file("/proc/self/cgroup");
ccb4cabe
SH
1133 if (!basecginfo)
1134 return false;
1135
d6337a5f
CB
1136 ret = get_existing_subsystems(&klist, &nlist);
1137 if (ret < 0) {
1138 CGFSNG_DEBUG("Failed to retrieve available cgroup v1 controllers\n");
1139 free(basecginfo);
ccb4cabe
SH
1140 return false;
1141 }
1142
d6337a5f
CB
1143 f = fopen("/proc/self/mountinfo", "r");
1144 if (!f) {
1145 CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
1146 return false;
1147 }
41c33dbe 1148
e4aeecf5
CB
1149 if (lxc_cgfsng_debug)
1150 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe 1151
ccb4cabe 1152 while (getline(&line, &len, f) != -1) {
49ff3958 1153 int type;
d6337a5f
CB
1154 bool writeable;
1155 struct hierarchy *new;
1156 char *mountpoint = NULL, *base_cgroup = NULL;
1157 char **controller_list = NULL;
ccb4cabe 1158
49ff3958 1159 type = get_cgroup_version(line);
d6337a5f 1160 if (type == 0)
ccb4cabe
SH
1161 continue;
1162
d6337a5f 1163 if (type == CGROUP2_SUPER_MAGIC && unified)
ccb4cabe
SH
1164 continue;
1165
d6337a5f
CB
1166 if (cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
1167 if (type == CGROUP2_SUPER_MAGIC)
1168 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1169 else if (type == CGROUP_SUPER_MAGIC)
1170 cgroup_layout = CGROUP_LAYOUT_LEGACY;
1171 } else if (cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
1172 if (type == CGROUP_SUPER_MAGIC)
1173 cgroup_layout = CGROUP_LAYOUT_HYBRID;
1174 } else if (cgroup_layout == CGROUP_LAYOUT_LEGACY) {
1175 if (type == CGROUP2_SUPER_MAGIC)
1176 cgroup_layout = CGROUP_LAYOUT_HYBRID;
ccb4cabe
SH
1177 }
1178
a3926f6a 1179 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
d6337a5f
CB
1180 if (!controller_list && type == CGROUP_SUPER_MAGIC)
1181 continue;
1182
1183 if (type == CGROUP_SUPER_MAGIC)
1184 if (controller_list_is_dup(hierarchies, controller_list))
1185 goto next;
1186
a3926f6a 1187 mountpoint = cg_hybrid_get_mountpoint(line);
ccb4cabe 1188 if (!mountpoint) {
65d78313 1189 CGFSNG_DEBUG("Failed parsing mountpoint from \"%s\"\n", line);
d6337a5f 1190 goto next;
ccb4cabe
SH
1191 }
1192
d6337a5f 1193 if (type == CGROUP_SUPER_MAGIC)
a3926f6a 1194 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
d6337a5f 1195 else
a3926f6a 1196 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
ccb4cabe 1197 if (!base_cgroup) {
d6337a5f
CB
1198 CGFSNG_DEBUG("Failed to find current cgroup\n");
1199 goto next;
ccb4cabe 1200 }
6328fd9c 1201
ccb4cabe
SH
1202 trim(base_cgroup);
1203 prune_init_scope(base_cgroup);
d6337a5f 1204 if (type == CGROUP2_SUPER_MAGIC)
6328fd9c
CB
1205 writeable = test_writeable_v2(mountpoint, base_cgroup);
1206 else
1207 writeable = test_writeable_v1(mountpoint, base_cgroup);
d6337a5f
CB
1208 if (!writeable)
1209 goto next;
1210
1211 if (type == CGROUP2_SUPER_MAGIC) {
1212 char *cgv2_ctrl_path;
1213
1214 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
1215 "cgroup.controllers",
1216 NULL);
1217
1218 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
1219 free(cgv2_ctrl_path);
1220 if (!controller_list)
1221 controller_list = cg_unified_make_empty_controller();
ccb4cabe 1222 }
d6337a5f
CB
1223 new = add_hierarchy(controller_list, mountpoint, base_cgroup, type);
1224 if (type == CGROUP2_SUPER_MAGIC && !unified)
1225 unified = new;
1226
1227 continue;
1228
1229 next:
1230 free_string_list(controller_list);
1231 free(mountpoint);
1232 free(base_cgroup);
ccb4cabe
SH
1233 }
1234
1235 free_string_list(klist);
1236 free_string_list(nlist);
1237
1238 free(basecginfo);
1239
1240 fclose(f);
1241 free(line);
1242
e4aeecf5
CB
1243 if (lxc_cgfsng_debug) {
1244 printf("writeable subsystems:\n");
1245 lxc_cgfsng_print_hierarchies();
1246 }
1247
ccb4cabe
SH
1248 /* verify that all controllers in cgroup.use and all crucial
1249 * controllers are accounted for
1250 */
c2712f64 1251 if (!all_controllers_found())
ccb4cabe
SH
1252 return false;
1253
1254 return true;
1255}
1256
d6337a5f
CB
1257static int cg_is_pure_unified(void) {
1258
1259 int ret;
1260 struct statfs fs;
1261
1262 ret = statfs("/sys/fs/cgroup", &fs);
1263 if (ret < 0)
1264 return -ENOMEDIUM;
1265
1266 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
1267 return CGROUP2_SUPER_MAGIC;
1268
1269 return 0;
1270}
1271
/* Get the current cgroup for the cgroupfs v2 hierarchy.
 *
 * The "0::/" entry is looked up in /proc/1/cgroup when running with an
 * effective uid of 0 and in /proc/self/cgroup otherwise. Returns the trimmed
 * copy of the path (starting at the '/') or NULL if the file cannot be read,
 * has no such entry, or the copy fails.
 */
static char *cg_unified_get_current_cgroup(void)
{
	char *contents, *entry;
	char *result = NULL;
	const char *cgroup_file;

	cgroup_file = (geteuid() == 0) ? "/proc/1/cgroup" : "/proc/self/cgroup";
	contents = read_file(cgroup_file);
	if (!contents)
		return NULL;

	entry = strstr(contents, "0::/");
	if (entry)
		result = copy_to_eol(entry + 3); /* skip "0::", keep the '/' */

	free(contents);
	if (result)
		trim(result);

	return result;
}
1304
a3926f6a 1305static int cg_unified_init(void)
d6337a5f
CB
1306{
1307 int ret;
1308 char *mountpoint, *subtree_path;
1309 char **delegatable;
1310 char *base_cgroup = NULL;
1311
1312 ret = cg_is_pure_unified();
1313 if (ret == -ENOMEDIUM)
1314 return -ENOMEDIUM;
1315
1316 if (ret != CGROUP2_SUPER_MAGIC)
1317 return 0;
1318
a3926f6a 1319 base_cgroup = cg_unified_get_current_cgroup();
d6337a5f
CB
1320 if (!base_cgroup)
1321 return -EINVAL;
1322 prune_init_scope(base_cgroup);
1323
1324 /* We assume that we have already been given controllers to delegate
1325 * further down the hierarchy. If not it is up to the user to delegate
1326 * them to us.
1327 */
1328 mountpoint = must_copy_string("/sys/fs/cgroup");
1329 subtree_path = must_make_path(mountpoint, base_cgroup,
1330 "cgroup.subtree_control", NULL);
1331 delegatable = cg_unified_get_controllers(subtree_path);
1332 free(subtree_path);
1333 if (!delegatable)
1334 delegatable = cg_unified_make_empty_controller();
1335 if (!delegatable[0])
1336 CGFSNG_DEBUG("No controllers are enabled for delegation\n");
1337
1338 /* TODO: If the user requested specific controllers via lxc.cgroup.use
1339 * we should verify here. The reason I'm not doing it right is that I'm
1340 * not convinced that lxc.cgroup.use will be the future since it is a
1341 * global property. I much rather have an option that lets you request
1342 * controllers per container.
1343 */
1344
1345 add_hierarchy(delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
1346 unified = hierarchies[0];
1347
1348 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1349 return CGROUP2_SUPER_MAGIC;
1350}
1351
1352static bool cg_init(void)
1353{
1354 int ret;
457ca9aa 1355 const char *tmp;
d6337a5f 1356
457ca9aa
SH
1357 errno = 0;
1358 tmp = lxc_global_config_value("lxc.cgroup.use");
1a0e70ac 1359 if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
65d78313 1360 CGFSNG_DEBUG("Failed to retrieve list of cgroups to use\n");
457ca9aa
SH
1361 return false;
1362 }
1363 cgroup_use = must_copy_string(tmp);
1364
a3926f6a 1365 ret = cg_unified_init();
d6337a5f
CB
1366 if (ret < 0)
1367 return false;
1368
1369 if (ret == CGROUP2_SUPER_MAGIC)
1370 return true;
1371
a3926f6a 1372 return cg_hybrid_init();
457ca9aa
SH
1373}
1374
43654d34 1375static void *cgfsng_init(struct lxc_handler *handler)
ccb4cabe 1376{
457ca9aa 1377 const char *cgroup_pattern;
43654d34 1378 struct cgfsng_handler_data *d;
ccb4cabe
SH
1379
1380 d = must_alloc(sizeof(*d));
1381 memset(d, 0, sizeof(*d));
1382
43654d34
CB
1383 /* copy container name */
1384 d->name = must_copy_string(handler->name);
1385
1386 /* copy per-container cgroup information */
ae5e6c08
CB
1387 d->cgroup_meta.dir = NULL;
1388 d->cgroup_meta.controllers = NULL;
9b5396f9
CB
1389 if (handler->conf) {
1390 d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
1391 d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
1392 }
ccb4cabe 1393
43654d34 1394 /* copy system-wide cgroup information */
ccb4cabe 1395 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
43654d34
CB
1396 if (!cgroup_pattern) {
1397 /* lxc.cgroup.pattern is only NULL on error. */
ccb4cabe
SH
1398 ERROR("Error getting cgroup pattern");
1399 goto out_free;
1400 }
1401 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1402
d6337a5f
CB
1403 d->cgroup_layout = cgroup_layout;
1404 if (d->cgroup_layout == CGROUP_LAYOUT_LEGACY)
1405 TRACE("Running with legacy cgroup layout");
1406 else if (d->cgroup_layout == CGROUP_LAYOUT_HYBRID)
1407 TRACE("Running with hybrid cgroup layout");
1408 else if (d->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
1409 TRACE("Running with unified cgroup layout");
1410 else
1411 WARN("Running with unknown cgroup layout");
1412
e4aeecf5
CB
1413 if (lxc_cgfsng_debug)
1414 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1415
1416 return d;
1417
1418out_free:
1419 free_handler_data(d);
1420 return NULL;
1421}
1422
/* Remove the directory @dirname and all directories below it.
 *
 * Only sub-directories are descended into and removed; non-directory entries
 * are left alone. Failures do not stop the walk: the function keeps going and
 * reports the overall result at the end. Only the first failure is logged.
 *
 * Returns 0 if everything was removed, -1 if @dirname could not be opened or
 * any step failed.
 */
static int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat %s", pathname);
			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;
	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("%s - Failed to delete \"%s\"", strerror(errno),
			     dirname);
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* Report the operation that actually failed. */
		if (!r)
			WARN("%s - Failed to close directory \"%s\"",
			     strerror(errno), dirname);
		r = -1;
	}

	return r;
}
1480
bd8ef4e4
CB
1481static int cgroup_rmdir(char *container_cgroup)
1482{
1483 int i;
1484
1485 if (!container_cgroup || !hierarchies)
1486 return 0;
1487
1488 for (i = 0; hierarchies[i]; i++) {
1489 int ret;
1490 struct hierarchy *h = hierarchies[i];
1491
1492 if (!h->fullcgpath)
1493 continue;
1494
1495 ret = recursive_destroy(h->fullcgpath);
1496 if (ret < 0)
1497 WARN("Failed to destroy \"%s\"", h->fullcgpath);
1498
1499 free(h->fullcgpath);
1500 h->fullcgpath = NULL;
1501 }
1502
1503 return 0;
1504}
1505
4160c3a0
CB
/* Argument bundle handed to callbacks that are run through userns_exec_1(). */
struct generic_userns_exec_data {
	/* cgroup handler data of the container */
	struct cgfsng_handler_data *d;
	/* container configuration */
	struct lxc_conf *conf;
	/* target uid in parent namespace */
	uid_t origuid;
	char *path;
};
1512
bd8ef4e4 1513static int cgroup_rmdir_wrapper(void *data)
ccb4cabe 1514{
4160c3a0
CB
1515 struct generic_userns_exec_data *arg = data;
1516 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1517 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1518
4160c3a0 1519 if (setresgid(nsgid, nsgid, nsgid) < 0)
ccb4cabe 1520 SYSERROR("Failed to setgid to 0");
4160c3a0 1521 if (setresuid(nsuid, nsuid, nsuid) < 0)
ccb4cabe 1522 SYSERROR("Failed to setuid to 0");
a19b974f 1523 if (setgroups(0, NULL) < 0 && errno != EPERM)
ccb4cabe
SH
1524 SYSERROR("Failed to clear groups");
1525
bd8ef4e4 1526 return cgroup_rmdir(arg->d->container_cgroup);
ccb4cabe
SH
1527}
1528
bd8ef4e4 1529static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
ccb4cabe 1530{
bd8ef4e4
CB
1531 int ret;
1532 struct cgfsng_handler_data *d = hdata;
4160c3a0
CB
1533 struct generic_userns_exec_data wrap;
1534
bd8ef4e4
CB
1535 if (!d)
1536 return;
1537
4160c3a0 1538 wrap.origuid = 0;
bd8ef4e4 1539 wrap.d = hdata;
4160c3a0
CB
1540 wrap.conf = conf;
1541
ccb4cabe 1542 if (conf && !lxc_list_empty(&conf->id_map))
bd8ef4e4
CB
1543 ret = userns_exec_1(conf, cgroup_rmdir_wrapper, &wrap,
1544 "cgroup_rmdir_wrapper");
ccb4cabe 1545 else
bd8ef4e4
CB
1546 ret = cgroup_rmdir(d->container_cgroup);
1547 if (ret < 0) {
1548 WARN("Failed to destroy cgroups");
ccb4cabe 1549 return;
ccb4cabe
SH
1550 }
1551
1552 free_handler_data(d);
1553}
1554
1555struct cgroup_ops *cgfsng_ops_init(void)
1556{
e4aeecf5
CB
1557 if (getenv("LXC_DEBUG_CGFSNG"))
1558 lxc_cgfsng_debug = true;
1559
d6337a5f 1560 if (!cg_init())
457ca9aa 1561 return NULL;
e4aeecf5 1562
ccb4cabe
SH
1563 return &cgfsng_ops;
1564}
1565
a3926f6a 1566static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
0c3deb94
CB
1567{
1568 char **it;
1569 size_t i, parts_len;
1570 size_t full_len = 0;
1571 char *add_controllers = NULL, *cgroup = NULL;
1572 char **parts = NULL;
1573 bool bret = false;
1574
1575 if (h->version != CGROUP2_SUPER_MAGIC)
1576 return true;
1577
1578 if (!h->controllers)
1579 return true;
1580
1581 /* For now we simply enable all controllers that we have detected by
1582 * creating a string like "+memory +pids +cpu +io".
1583 * TODO: In the near future we might want to support "-<controller>"
1584 * etc. but whether supporting semantics like this make sense will need
1585 * some thinking.
1586 */
1587 for (it = h->controllers; it && *it; it++) {
1588 full_len += strlen(*it) + 2;
1589 add_controllers = must_realloc(add_controllers, full_len + 1);
1590 if (h->controllers[0] == *it)
1591 add_controllers[0] = '\0';
1592 strcat(add_controllers, "+");
1593 strcat(add_controllers, *it);
1594 if ((it + 1) && *(it + 1))
1595 strcat(add_controllers, " ");
1596 }
1597
1598 parts = lxc_string_split(cgname, '/');
1599 if (!parts)
1600 goto on_error;
1601 parts_len = lxc_array_len((void **)parts);
1602 if (parts_len > 0)
1603 parts_len--;
1604
1605 cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
1606 for (i = 0; i < parts_len; i++) {
1607 int ret;
1608 char *target;
1609
1610 cgroup = must_append_path(cgroup, parts[i], NULL);
1611 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1612 ret = lxc_write_to_file(target, add_controllers, full_len, false);
1613 free(target);
1614 if (ret < 0) {
1615 SYSERROR("Could not enable \"%s\" controllers in the "
1616 "unified cgroup \"%s\"", add_controllers, cgroup);
1617 goto on_error;
1618 }
1619 }
1620
1621 bret = true;
1622
1623on_error:
1624 lxc_free_array((void **)parts, free);
1625 free(add_controllers);
1626 free(cgroup);
1627 return bret;
1628}
1629
ccb4cabe
SH
1630static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1631{
0c3deb94
CB
1632 int ret;
1633
e3a3fecf 1634 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1a0e70ac 1635 if (dir_exists(h->fullcgpath)) { /* it must not already exist */
0c3deb94 1636 ERROR("cgroup \"%s\" already existed", h->fullcgpath);
d8da679e 1637 return false;
6f9584d8 1638 }
0c3deb94 1639
a3926f6a 1640 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
0c3deb94
CB
1641 ERROR("Failed to handle cgroupfs v1 cpuset controller");
1642 return false;
1643 }
1644
1645 ret = mkdir_p(h->fullcgpath, 0755);
1646 if (ret < 0) {
1647 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
e3a3fecf 1648 return false;
6f9584d8 1649 }
0c3deb94 1650
a3926f6a 1651 return cg_unified_create_cgroup(h, cgname);
ccb4cabe
SH
1652}
1653
1654static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1655{
1656 if (rmdir(h->fullcgpath) < 0)
1657 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1658 free(h->fullcgpath);
1659 h->fullcgpath = NULL;
1660}
1661
1662/*
d30ec4cb 1663 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1664 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1665 */
1666static inline bool cgfsng_create(void *hdata)
1667{
bb30b52a 1668 int i;
ccb4cabe 1669 size_t len;
0c3deb94 1670 char *container_cgroup, *offset, *tmp;
7d531e9b
CB
1671 int idx = 0;
1672 struct cgfsng_handler_data *d = hdata;
ccb4cabe
SH
1673
1674 if (!d)
1675 return false;
43654d34 1676
ccb4cabe
SH
1677 if (d->container_cgroup) {
1678 WARN("cgfsng_create called a second time");
1679 return false;
1680 }
1681
43654d34 1682 if (d->cgroup_meta.dir)
7d531e9b 1683 tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
43654d34
CB
1684 else
1685 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
ccb4cabe
SH
1686 if (!tmp) {
1687 ERROR("Failed expanding cgroup name pattern");
1688 return false;
1689 }
1a0e70ac 1690 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
0c3deb94
CB
1691 container_cgroup = must_alloc(len);
1692 strcpy(container_cgroup, tmp);
ccb4cabe 1693 free(tmp);
0c3deb94 1694 offset = container_cgroup + len - 5;
ccb4cabe
SH
1695
1696again:
95adfe93
SH
1697 if (idx == 1000) {
1698 ERROR("Too many conflicting cgroup names");
ccb4cabe 1699 goto out_free;
95adfe93 1700 }
66b66624 1701 if (idx) {
bb30b52a
CB
1702 int ret;
1703
66b66624
CB
1704 ret = snprintf(offset, 5, "-%d", idx);
1705 if (ret < 0 || (size_t)ret >= 5) {
1706 FILE *f = fopen("/dev/null", "w");
97ebced3 1707 if (f) {
66b66624
CB
1708 fprintf(f, "Workaround for GCC7 bug: "
1709 "https://gcc.gnu.org/bugzilla/"
1710 "show_bug.cgi?id=78969");
1711 fclose(f);
1712 }
1713 }
1714 }
457ca9aa 1715 for (i = 0; hierarchies[i]; i++) {
0c3deb94 1716 if (!create_path_for_hierarchy(hierarchies[i], container_cgroup)) {
ccb4cabe 1717 int j;
1a0e70ac 1718 ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
457ca9aa
SH
1719 free(hierarchies[i]->fullcgpath);
1720 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1721 for (j = 0; j < i; j++)
0c3deb94 1722 remove_path_for_hierarchy(hierarchies[j], container_cgroup);
ccb4cabe
SH
1723 idx++;
1724 goto again;
1725 }
1726 }
1727 /* Done */
0c3deb94 1728 d->container_cgroup = container_cgroup;
ccb4cabe
SH
1729 return true;
1730
1731out_free:
0c3deb94 1732 free(container_cgroup);
ccb4cabe
SH
1733 return false;
1734}
1735
ccb4cabe
SH
1736static bool cgfsng_enter(void *hdata, pid_t pid)
1737{
ccb4cabe
SH
1738 char pidstr[25];
1739 int i, len;
1740
1741 len = snprintf(pidstr, 25, "%d", pid);
1742 if (len < 0 || len > 25)
1743 return false;
1744
457ca9aa
SH
1745 for (i = 0; hierarchies[i]; i++) {
1746 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1747 "cgroup.procs", NULL);
1748 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1749 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1750 free(fullpath);
1751 return false;
1752 }
1753 free(fullpath);
1754 }
1755
1756 return true;
1757}
1758
c0888dfe
SH
1759/*
1760 * chgrp the container cgroups to container group. We leave
1761 * the container owner as cgroup owner. So we must make the
1762 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1763 *
1764 * Also chown the tasks and cgroup.procs files. Those may not
1765 * exist depending on kernel version.
c0888dfe 1766 */
ccb4cabe
SH
1767static int chown_cgroup_wrapper(void *data)
1768{
ccb4cabe 1769 int i;
4160c3a0
CB
1770 uid_t destuid;
1771 struct generic_userns_exec_data *arg = data;
1772 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1773 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1774
4160c3a0 1775 if (setresgid(nsgid, nsgid, nsgid) < 0)
ccb4cabe 1776 SYSERROR("Failed to setgid to 0");
4160c3a0 1777 if (setresuid(nsuid, nsuid, nsuid) < 0)
ccb4cabe 1778 SYSERROR("Failed to setuid to 0");
a19b974f 1779 if (setgroups(0, NULL) < 0 && errno != EPERM)
ccb4cabe
SH
1780 SYSERROR("Failed to clear groups");
1781
1782 destuid = get_ns_uid(arg->origuid);
1783
457ca9aa
SH
1784 for (i = 0; hierarchies[i]; i++) {
1785 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298 1786
4160c3a0 1787 if (chown(path, destuid, nsgid) < 0) {
ab8f5424 1788 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1789 return -1;
1790 }
c0888dfe 1791
43647298 1792 if (chmod(path, 0775) < 0) {
ab8f5424 1793 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1794 return -1;
1795 }
ccb4cabe 1796
ab8f5424
SH
1797 /*
1798 * Failures to chown these are inconvenient but not detrimental
1799 * We leave these owned by the container launcher, so that container
1800 * root can write to the files to attach. We chmod them 664 so that
1801 * container systemd can write to the files (which systemd in wily
1802 * insists on doing)
1803 */
43647298 1804 fullpath = must_make_path(path, "tasks", NULL);
4160c3a0 1805 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
13277ec4 1806 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1807 strerror(errno));
ab8f5424 1808 if (chmod(fullpath, 0664) < 0)
13277ec4 1809 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1810 free(fullpath);
1811
1812 fullpath = must_make_path(path, "cgroup.procs", NULL);
1813 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1814 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1815 strerror(errno));
ab8f5424 1816 if (chmod(fullpath, 0664) < 0)
13277ec4 1817 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe 1818 free(fullpath);
0e17357c 1819
d6337a5f 1820 if (hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1821 continue;
1822
1823 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
4160c3a0 1824 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
0e17357c
CB
1825 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1826 strerror(errno));
1827 if (chmod(fullpath, 0664) < 0)
1828 WARN("Error chmoding %s: %s", path, strerror(errno));
1829 free(fullpath);
1830
1831 fullpath = must_make_path(path, "cgroup.threads", NULL);
4160c3a0 1832 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
0e17357c
CB
1833 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1834 strerror(errno));
1835 if (chmod(fullpath, 0664) < 0)
1836 WARN("Error chmoding %s: %s", path, strerror(errno));
1837 free(fullpath);
ccb4cabe
SH
1838 }
1839
1840 return 0;
1841}
1842
058c1cb6 1843static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
ccb4cabe
SH
1844{
1845 struct cgfsng_handler_data *d = hdata;
4160c3a0 1846 struct generic_userns_exec_data wrap;
ccb4cabe
SH
1847
1848 if (!d)
1849 return false;
1850
1851 if (lxc_list_empty(&conf->id_map))
1852 return true;
1853
ccb4cabe 1854 wrap.origuid = geteuid();
4160c3a0
CB
1855 wrap.path = NULL;
1856 wrap.d = d;
1857 wrap.conf = conf;
ccb4cabe 1858
c9b7c33e
CB
1859 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1860 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1861 ERROR("Error requesting cgroup chown in new namespace");
1862 return false;
1863 }
1864
1865 return true;
1866}
1867
8aa1044f
SH
1868/*
1869 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1870 * symlinks any more - just use mount
1871 */
1872
1873/* mount cgroup-full if requested */
1874static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
a3926f6a 1875 char *container_cgroup)
8aa1044f
SH
1876{
1877 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1878 return 0;
1879 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1880 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1881 dest);
1882 return -1;
1883 }
1884 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1885 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1886 MS_REMOUNT | MS_RDONLY;
1887 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1888 SYSERROR("Error remounting %s readonly", dest);
1889 return -1;
1890 }
1891 }
1892
1893 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1894 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1895 return 0;
1896
1897 /* mount just the container path rw */
1898 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1899 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1900 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1901 WARN("Failed to mount %s read-write: %s", rwpath,
1902 strerror(errno));
8aa1044f
SH
1903 INFO("Made %s read-write", rwpath);
1904 free(rwpath);
1905 free(source);
1906 return 0;
1907}
1908
1909/* cgroup-full:* is done, no need to create subdirs */
1910static bool cg_mount_needs_subdirs(int type)
1911{
1912 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1913 return false;
a3926f6a 1914
8aa1044f
SH
1915 return true;
1916}
1917
1918/*
1919 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1920 * created, remount controller ro if needed and bindmount the
1921 * cgroupfs onto controll/the/cg/path
1922 */
a3926f6a
CB
1923static int do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1924 char *controllerpath, char *cgpath,
1925 const char *container_cgroup)
8aa1044f
SH
1926{
1927 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1928 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1929 SYSERROR("Error bind-mounting %s", controllerpath);
1930 return -1;
1931 }
1932 if (mount(controllerpath, controllerpath, "cgroup",
1933 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1934 SYSERROR("Error remounting %s read-only", controllerpath);
1935 return -1;
1936 }
1937 INFO("Remounted %s read-only", controllerpath);
1938 }
1939 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1940 int flags = MS_BIND;
1941 if (type == LXC_AUTO_CGROUP_RO)
1942 flags |= MS_RDONLY;
1943 INFO("Mounting %s onto %s", sourcepath, cgpath);
1944 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1945 free(sourcepath);
1946 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1947 cgpath);
1948 return -1;
1949 }
f8c40ffa
L
1950
1951 if (flags & MS_RDONLY) {
1952 if (mount(sourcepath, cgpath, "cgroup", MS_REMOUNT | flags | MS_RDONLY, NULL) < 0) {
1953 free(sourcepath);
1954 SYSERROR("Error remounting %s read-only", cgpath);
1955 return -1;
1956 }
1957 }
1958
8aa1044f
SH
1959 free(sourcepath);
1960 INFO("Completed second stage cgroup automounts for %s", cgpath);
1961 return 0;
1962}
1963
a760603e 1964static int mount_cgroup_cgns_supported(int type, struct hierarchy *h, const char *controllerpath)
b635e92d
CB
1965{
1966 int ret;
1967 char *controllers = NULL;
a760603e
CB
1968 char *fstype = "cgroup2";
1969 unsigned long flags = 0;
b635e92d 1970
a760603e
CB
1971 flags |= MS_NOSUID;
1972 flags |= MS_NOEXEC;
1973 flags |= MS_NODEV;
1974 flags |= MS_RELATIME;
1975
1976 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1977 flags |= MS_RDONLY;
1978
d6337a5f 1979 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1980 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1981 if (!controllers)
1982 return -ENOMEM;
1983 fstype = "cgroup";
b635e92d
CB
1984 }
1985
a760603e 1986 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d
CB
1987 free(controllers);
1988 if (ret < 0) {
a760603e 1989 SYSERROR("Failed to mount %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1990 return -1;
1991 }
1992
a760603e 1993 DEBUG("Mounted %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1994 return 0;
1995}
1996
ccb4cabe
SH
1997static bool cgfsng_mount(void *hdata, const char *root, int type)
1998{
b635e92d 1999 int i;
8aa1044f
SH
2000 char *tmpfspath = NULL;
2001 bool retval = false;
b635e92d
CB
2002 struct lxc_handler *handler = hdata;
2003 struct cgfsng_handler_data *d = handler->cgroup_data;
2004 bool has_cgns = false, has_sys_admin = true;
8aa1044f
SH
2005
2006 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
2007 return true;
2008
b635e92d
CB
2009 has_cgns = cgns_supported();
2010 if (!lxc_list_empty(&handler->conf->keepcaps))
2011 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
2012 else
2013 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
2014
2015 if (has_cgns && has_sys_admin)
ccb4cabe 2016 return true;
8aa1044f
SH
2017
2018 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
2019
2020 if (type == LXC_AUTO_CGROUP_NOSPEC)
2021 type = LXC_AUTO_CGROUP_MIXED;
2022 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
2023 type = LXC_AUTO_CGROUP_FULL_MIXED;
2024
2025 /* Mount tmpfs */
2026 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
2027 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
2028 "size=10240k,mode=755",
2029 root) < 0)
2030 goto bad;
2031
457ca9aa 2032 for (i = 0; hierarchies[i]; i++) {
8aa1044f 2033 char *controllerpath, *path2;
457ca9aa 2034 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
2035 char *controller = strrchr(h->mountpoint, '/');
2036 int r;
2037
2038 if (!controller)
2039 continue;
2040 controller++;
2041 controllerpath = must_make_path(tmpfspath, controller, NULL);
2042 if (dir_exists(controllerpath)) {
2043 free(controllerpath);
2044 continue;
2045 }
2046 if (mkdir(controllerpath, 0755) < 0) {
2047 SYSERROR("Error creating cgroup path: %s", controllerpath);
2048 free(controllerpath);
2049 goto bad;
2050 }
b635e92d
CB
2051
2052 if (has_cgns && !has_sys_admin) {
2053 /* If cgroup namespaces are supported but the container
2054 * will not have CAP_SYS_ADMIN after it has started we
2055 * need to mount the cgroups manually.
2056 */
a760603e 2057 r = mount_cgroup_cgns_supported(type, h, controllerpath);
b635e92d
CB
2058 free(controllerpath);
2059 if (r < 0)
2060 goto bad;
2061 continue;
2062 }
2063
8aa1044f
SH
2064 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
2065 free(controllerpath);
2066 goto bad;
2067 }
2068 if (!cg_mount_needs_subdirs(type)) {
2069 free(controllerpath);
2070 continue;
2071 }
ef4413fa 2072 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
2073 if (mkdir_p(path2, 0755) < 0) {
2074 free(controllerpath);
8e0c6620 2075 free(path2);
8aa1044f
SH
2076 goto bad;
2077 }
2f62fb00 2078
8aa1044f
SH
2079 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
2080 d->container_cgroup);
2081 free(controllerpath);
2082 free(path2);
2083 if (r < 0)
2084 goto bad;
2085 }
2086 retval = true;
2087
2088bad:
2089 free(tmpfspath);
2090 return retval;
ccb4cabe
SH
2091}
2092
/* Count the lines of all cgroup.procs files in @dirname and in every
 * directory below it. Returns 0 if @dirname cannot be opened.
 */
static int recursive_count_nrtasks(char *dirname)
{
	DIR *dir;
	struct dirent *entry;
	char *procs_path;
	int lines;
	int total = 0;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir))) {
		char *child;
		struct stat st;

		if (!strcmp(entry->d_name, ".") ||
		    !strcmp(entry->d_name, ".."))
			continue;

		child = must_make_path(dirname, entry->d_name, NULL);

		/* only descend into entries that are directories */
		if (lstat(child, &st) == 0 && S_ISDIR(st.st_mode))
			total += recursive_count_nrtasks(child);

		free(child);
	}

	procs_path = must_make_path(dirname, "cgroup.procs", NULL);
	lines = lxc_count_file_lines(procs_path);
	if (lines != -1)
		total += lines;
	free(procs_path);

	(void) closedir(dir);

	return total;
}
2137
2138static int cgfsng_nrtasks(void *hdata) {
2139 struct cgfsng_handler_data *d = hdata;
2140 char *path;
2141 int count;
2142
457ca9aa 2143 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 2144 return -1;
a3926f6a 2145
457ca9aa 2146 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
2147 count = recursive_count_nrtasks(path);
2148 free(path);
2149 return count;
2150}
2151
2152/* Only root needs to escape to the cgroup of its init */
7103fe6f 2153static bool cgfsng_escape()
ccb4cabe 2154{
ccb4cabe
SH
2155 int i;
2156
2157 if (geteuid())
2158 return true;
2159
457ca9aa
SH
2160 for (i = 0; hierarchies[i]; i++) {
2161 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
2162 hierarchies[i]->base_cgroup,
ccb4cabe
SH
2163 "cgroup.procs", NULL);
2164 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 2165 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 2166 free(fullpath);
6df334d1 2167 return false;
ccb4cabe
SH
2168 }
2169 free(fullpath);
2170 }
2171
6df334d1 2172 return true;
ccb4cabe
SH
2173}
2174
36662416
TA
2175static int cgfsng_num_hierarchies(void)
2176{
2177 int i;
2178
2179 for (i = 0; hierarchies[i]; i++)
2180 ;
2181
2182 return i;
2183}
2184
2185static bool cgfsng_get_hierarchies(int n, char ***out)
2186{
2187 int i;
2188
2189 /* sanity check n */
6b38e644 2190 for (i = 0; i < n; i++)
36662416
TA
2191 if (!hierarchies[i])
2192 return false;
36662416
TA
2193
2194 *out = hierarchies[i]->controllers;
2195
2196 return true;
2197}
2198
ccb4cabe
SH
2199#define THAWED "THAWED"
2200#define THAWED_LEN (strlen(THAWED))
2201
d6337a5f
CB
2202/* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
2203 * to be adapted.
2204 */
ccb4cabe
SH
2205static bool cgfsng_unfreeze(void *hdata)
2206{
d6337a5f 2207 int ret;
ccb4cabe 2208 char *fullpath;
d6337a5f 2209 struct hierarchy *h;
ccb4cabe 2210
d6337a5f 2211 h = get_hierarchy("freezer");
457ca9aa 2212 if (!h)
ccb4cabe 2213 return false;
d6337a5f 2214
ccb4cabe 2215 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
d6337a5f 2216 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false);
ccb4cabe 2217 free(fullpath);
d6337a5f
CB
2218 if (ret < 0)
2219 return false;
2220
ccb4cabe
SH
2221 return true;
2222}
2223
2224static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
2225{
d6337a5f
CB
2226 struct hierarchy *h;
2227
2228 h = get_hierarchy(subsystem);
ccb4cabe
SH
2229 if (!h)
2230 return NULL;
2231
371f834d
SH
2232 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
2233}
2234
2235/*
2236 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
2237 * full path, which must be freed by the caller.
2238 */
2239static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2240 const char *inpath,
2241 const char *filename)
2242{
371f834d 2243 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2244}
2245
c2aed66d
CB
2246/* Technically, we're always at a delegation boundary here. (This is especially
2247 * true when cgroup namespaces are available.) The reasoning is that in order
2248 * for us to have been able to start a container in the first place the root
2249 * cgroup must have been a leaf node. Now, either the container's init system
2250 * has populated the cgroup and kept it as a leaf node or it has created
2251 * subtrees. In the former case we will simply attach to the leaf node we
2252 * created when we started the container in the latter case we create our own
2253 * cgroup for the attaching process.
2254 */
a3926f6a
CB
2255static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2256 const char *lxcpath, const char *pidstr,
2257 size_t pidstr_len, const char *controller)
c2aed66d
CB
2258{
2259 int ret;
2260 size_t len;
2261 int fret = -1, idx = 0;
2262 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
2263
2264 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2265 /* not running */
2266 if (!container_cgroup)
2267 return 0;
2268
2269 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2270 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2271 /* cgroup is populated */
2272 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false);
2273 if (ret < 0 && errno != EBUSY)
2274 goto on_error;
2275
2276 if (ret == 0)
2277 goto on_success;
2278
2279 free(full_path);
2280
2281 len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
2282 sizeof("/cgroup-procs") - 1;
2283 full_path = must_alloc(len + 1);
2284 do {
2285 if (idx)
2286 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2287 base_path, idx);
2288 else
2289 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2290 if (ret < 0 || (size_t)ret >= len + 1)
2291 goto on_error;
2292
2293 ret = mkdir_p(full_path, 0755);
2294 if (ret < 0 && errno != EEXIST)
2295 goto on_error;
2296
2297 strcat(full_path, "/cgroup.procs");
2298 ret = lxc_write_to_file(full_path, pidstr, len, false);
2299 if (ret == 0)
2300 goto on_success;
2301
2302 /* this is a non-leaf node */
2303 if (errno != EBUSY)
2304 goto on_error;
2305
2306 } while (++idx > 0 && idx < 1000);
2307
2308on_success:
2309 if (idx < 1000)
2310 fret = 0;
2311
2312on_error:
2313 free(base_path);
2314 free(container_cgroup);
2315 free(full_path);
2316
2317 return fret;
2318}
2319
ccb4cabe
SH
2320static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
2321{
c2aed66d 2322 int i, len, ret;
ccb4cabe 2323 char pidstr[25];
ccb4cabe
SH
2324
2325 len = snprintf(pidstr, 25, "%d", pid);
2326 if (len < 0 || len > 25)
2327 return false;
2328
457ca9aa 2329 for (i = 0; hierarchies[i]; i++) {
c2aed66d
CB
2330 char *path;
2331 char *fullpath = NULL;
457ca9aa 2332 struct hierarchy *h = hierarchies[i];
ccb4cabe 2333
c2aed66d 2334 if (h->version == CGROUP2_SUPER_MAGIC) {
a3926f6a
CB
2335 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2336 h->controllers[0]);
c2aed66d
CB
2337 if (ret < 0)
2338 return false;
2339
2340 continue;
2341 }
2342
ccb4cabe 2343 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2344 /* not running */
2345 if (!path)
ccb4cabe
SH
2346 continue;
2347
371f834d 2348 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
c2aed66d
CB
2349 ret = lxc_write_to_file(fullpath, pidstr, len, false);
2350 if (ret < 0) {
ccb4cabe
SH
2351 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2352 free(fullpath);
ccb4cabe
SH
2353 return false;
2354 }
ccb4cabe
SH
2355 free(fullpath);
2356 }
2357
ccb4cabe
SH
2358 return true;
2359}
2360
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The controller name is the part of @filename in front of its first '.'.
 * Returns -1 when no cgroup path is reported or no hierarchy matches the
 * controller, otherwise the result of lxc_read_from_file().
 */
static int cgfsng_get(const char *filename, char *value, size_t len,
		      const char *name, const char *lxcpath)
{
	size_t filename_len = strlen(filename);
	char *controller = alloca(filename_len + 1);
	char *cgroup_path, *target;
	struct hierarchy *h;
	int ret;

	/* Copy the file name and cut it off at the first dot, if any. */
	memcpy(controller, filename, filename_len + 1);
	controller[strcspn(controller, ".")] = '\0';

	cgroup_path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup_path)
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgroup_path);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(h, cgroup_path, filename);
	ret = lxc_read_from_file(target, value, len);
	free(target);
	free(cgroup_path);

	return ret;
}
2398
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The controller name is the part of @filename in front of its first '.'.
 * Returns -1 when no cgroup path is reported or no hierarchy matches the
 * controller, otherwise the result of lxc_write_to_file().
 */
static int cgfsng_set(const char *filename, const char *value, const char *name,
		      const char *lxcpath)
{
	size_t filename_len = strlen(filename);
	char *controller = alloca(filename_len + 1);
	char *cgroup_path, *target;
	struct hierarchy *h;
	int ret;

	/* Copy the file name and cut it off at the first dot, if any. */
	memcpy(controller, filename, filename_len + 1);
	controller[strcspn(controller, ".")] = '\0';

	cgroup_path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup_path)
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgroup_path);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(h, cgroup_path, filename);
	ret = lxc_write_to_file(target, value, strlen(value), false);
	free(target);
	free(cgroup_path);

	return ret;
}
2436
72add155
SH
2437/*
2438 * take devices cgroup line
2439 * /dev/foo rwx
2440 * and convert it to a valid
2441 * type major:minor mode
2442 * line. Return <0 on error. Dest is a preallocated buffer
2443 * long enough to hold the output.
2444 */
2445static int convert_devpath(const char *invalue, char *dest)
2446{
2a06d041
CB
2447 int n_parts;
2448 char *p, *path, type;
72add155
SH
2449 struct stat sb;
2450 unsigned long minor, major;
2a06d041
CB
2451 int ret = -EINVAL;
2452 char *mode = NULL;
72add155
SH
2453
2454 path = must_copy_string(invalue);
2455
2456 /*
2457 * read path followed by mode; ignore any trailing text.
2458 * A ' # comment' would be legal. Technically other text
2459 * is not legal, we could check for that if we cared to
2460 */
2461 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2c2d6c49
SH
2462 if (*p != ' ')
2463 continue;
2464 *p = '\0';
2465 if (n_parts != 1)
2466 break;
2467 p++;
2468 n_parts++;
2469 while (*p == ' ')
2470 p++;
2471 mode = p;
2472 if (*p == '\0')
2473 goto out;
72add155 2474 }
2c2d6c49
SH
2475
2476 if (n_parts == 1)
72add155 2477 goto out;
72add155
SH
2478
2479 ret = stat(path, &sb);
2480 if (ret < 0)
2481 goto out;
2482
72add155
SH
2483 mode_t m = sb.st_mode & S_IFMT;
2484 switch (m) {
2485 case S_IFBLK:
2486 type = 'b';
2487 break;
2488 case S_IFCHR:
2489 type = 'c';
2490 break;
2c2d6c49 2491 default:
72add155
SH
2492 ERROR("Unsupported device type %i for %s", m, path);
2493 ret = -EINVAL;
2494 goto out;
2495 }
2c2d6c49
SH
2496
2497 major = MAJOR(sb.st_rdev);
2498 minor = MINOR(sb.st_rdev);
2499 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2500 if (ret < 0 || ret >= 50) {
2a06d041
CB
2501 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2502 "chars)", type, major, minor, mode);
72add155
SH
2503 ret = -ENAMETOOLONG;
2504 goto out;
2505 }
2506 ret = 0;
2507
2508out:
2509 free(path);
2510 return ret;
2511}
2512
ccb4cabe
SH
2513/*
2514 * Called from setup_limits - here we have the container's cgroup_data because
2515 * we created the cgroups
2516 */
a3926f6a
CB
2517static int cg_legacy_set_data(const char *filename, const char *value,
2518 struct cgfsng_handler_data *d)
ccb4cabe 2519{
b3646d7e 2520 char *fullpath, *p;
1a0e70ac
CB
2521 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2522 char converted_value[50];
b3646d7e
CB
2523 struct hierarchy *h;
2524 int ret = 0;
2525 char *controller = NULL;
ccb4cabe 2526
b3646d7e
CB
2527 controller = alloca(strlen(filename) + 1);
2528 strcpy(controller, filename);
2529 if ((p = strchr(controller, '.')) != NULL)
ccb4cabe
SH
2530 *p = '\0';
2531
c8bf519d 2532 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2533 ret = convert_devpath(value, converted_value);
2534 if (ret < 0)
c8bf519d 2535 return ret;
72add155
SH
2536 value = converted_value;
2537
c8bf519d 2538 }
2539
b3646d7e
CB
2540 h = get_hierarchy(controller);
2541 if (!h) {
2542 ERROR("Failed to setup limits for the \"%s\" controller. "
2543 "The controller seems to be unused by \"cgfsng\" cgroup "
2544 "driver or not enabled on the cgroup hierarchy",
2545 controller);
d1953b26 2546 errno = ENOENT;
b3646d7e 2547 return -1;
ccb4cabe 2548 }
b3646d7e
CB
2549
2550 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2551 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
2552 free(fullpath);
ccb4cabe
SH
2553 return ret;
2554}
2555
a3926f6a
CB
2556static bool __cg_legacy_setup_limits(void *hdata,
2557 struct lxc_list *cgroup_settings,
2558 bool do_devices)
ccb4cabe
SH
2559{
2560 struct cgfsng_handler_data *d = hdata;
2561 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2562 struct lxc_cgroup *cg;
ccb4cabe
SH
2563 bool ret = false;
2564
2565 if (lxc_list_empty(cgroup_settings))
2566 return true;
2567
2568 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2569 if (!sorted_cgroup_settings)
ccb4cabe 2570 return false;
ccb4cabe 2571
ccb4cabe
SH
2572 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2573 cg = iterator->elem;
2574
2575 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
a3926f6a 2576 if (cg_legacy_set_data(cg->subsystem, cg->value, d)) {
ccb4cabe
SH
2577 if (do_devices && (errno == EACCES || errno == EPERM)) {
2578 WARN("Error setting %s to %s for %s",
2579 cg->subsystem, cg->value, d->name);
2580 continue;
2581 }
2582 SYSERROR("Error setting %s to %s for %s",
2583 cg->subsystem, cg->value, d->name);
2584 goto out;
2585 }
6a628f4a 2586 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 2587 }
ccb4cabe
SH
2588 }
2589
2590 ret = true;
6b38e644 2591 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2592out:
ccb4cabe
SH
2593 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2594 lxc_list_del(iterator);
2595 free(iterator);
2596 }
2597 free(sorted_cgroup_settings);
2598 return ret;
2599}
2600
a3926f6a
CB
2601static bool __cg_unified_setup_limits(void *hdata,
2602 struct lxc_list *cgroup_settings)
6b38e644
CB
2603{
2604 struct lxc_list *iterator;
2605 struct hierarchy *h = unified;
2606
2607 if (lxc_list_empty(cgroup_settings))
2608 return true;
2609
2610 if (!h)
2611 return false;
2612
2613 lxc_list_for_each(iterator, cgroup_settings) {
2614 int ret;
2615 char *fullpath;
2616 struct lxc_cgroup *cg = iterator->elem;
2617
2618 fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
2619 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false);
2620 free(fullpath);
2621 if (ret < 0) {
2622 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2623 return false;
2624 }
2625 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2626 }
2627
2628 INFO("Limits for the unified cgroup hierarchy have been setup");
2629 return true;
2630}
2631
2632static bool cgfsng_setup_limits(void *hdata, struct lxc_conf *conf,
2633 bool do_devices)
2634{
2635 bool bret;
2636
a3926f6a 2637 bret = __cg_legacy_setup_limits(hdata, &conf->cgroup, do_devices);
6b38e644
CB
2638 if (!bret)
2639 return false;
2640
a3926f6a 2641 return __cg_unified_setup_limits(hdata, &conf->cgroup2);
6b38e644
CB
2642}
2643
ccb4cabe
SH
2644static struct cgroup_ops cgfsng_ops = {
2645 .init = cgfsng_init,
2646 .destroy = cgfsng_destroy,
2647 .create = cgfsng_create,
2648 .enter = cgfsng_enter,
ccb4cabe 2649 .escape = cgfsng_escape,
36662416
TA
2650 .num_hierarchies = cgfsng_num_hierarchies,
2651 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
2652 .get_cgroup = cgfsng_get_cgroup,
2653 .get = cgfsng_get,
2654 .set = cgfsng_set,
2655 .unfreeze = cgfsng_unfreeze,
2656 .setup_limits = cgfsng_setup_limits,
2657 .name = "cgroupfs-ng",
2658 .attach = cgfsng_attach,
058c1cb6 2659 .chown = cgfsng_chown,
ccb4cabe
SH
2660 .mount_cgroup = cgfsng_mount,
2661 .nrtasks = cgfsng_nrtasks,
2662 .driver = CGFSNG,
2663
2664 /* unsupported */
2665 .create_legacy = NULL,
2666};