]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
Merge pull request #2144 from brauner/2018-02-08/coverity_bug_smash
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
c8bf519d 50#include <linux/types.h>
51#include <linux/kdev_t.h>
52
b635e92d 53#include "caps.h"
ccb4cabe 54#include "cgroup.h"
6328fd9c 55#include "cgroup_utils.h"
ccb4cabe 56#include "commands.h"
43654d34 57#include "conf.h"
a54694f8 58#include "log.h"
43654d34 59#include "storage/storage.h"
a54694f8 60#include "utils.h"
ccb4cabe
SH
61
62lxc_log_define(lxc_cgfsng, lxc);
63
64static struct cgroup_ops cgfsng_ops;
65
ccb4cabe
SH
/*
 * Descriptor for one mounted cgroup hierarchy.
 *
 * @controllers: either NULL, or a NULL-terminated list of all the
 *               co-mounted controllers
 * @mountpoint:  the mountpoint we will use; either
 *               /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup: the cgroup under which the container cgroup path is
 *               created: the caller's cgroup (if not root) or init's
 *               cgroup (if root)
 * @fullcgpath:  set to NULL by add_hierarchy(); presumably the full path
 *               of the container's cgroup in this hierarchy once it has
 *               been created - confirm against the code that assigns it
 * @version:     the filesystem type handed to add_hierarchy()
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
	int version;
};
83
84/*
85 * The cgroup data which is attached to the lxc_handler.
43654d34
CB
86 * @cgroup_pattern : A copy of the lxc.cgroup.pattern
87 * @container_cgroup : If not null, the cgroup which was created for the
88 * container. For each hierarchy, it is created under the
89 * @hierarchy->base_cgroup directory. Relative to the
90 * base_cgroup it is the same for all hierarchies.
91 * @name : The name of the container.
92 * @cgroup_meta : A copy of the container's cgroup information. This
93 * overrides @cgroup_pattern.
ccb4cabe
SH
94 */
95struct cgfsng_handler_data {
ccb4cabe 96 char *cgroup_pattern;
1a0e70ac
CB
97 char *container_cgroup; /* cgroup we created for the container */
98 char *name; /* container name */
43654d34
CB
99 /* per-container cgroup information */
100 struct lxc_cgroup cgroup_meta;
d6337a5f 101 cgroup_layout_t cgroup_layout;
ccb4cabe
SH
102};
103
457ca9aa
SH
104/*
105 * @hierarchies - a NULL-terminated array of struct hierarchy, one per
d6337a5f
CB
106 * legacy hierarchy. No duplicates. First sufficient, writeable
107 * mounted hierarchy wins
457ca9aa
SH
108 */
109struct hierarchy **hierarchies;
d6337a5f
CB
110struct hierarchy *unified;
111cgroup_layout_t cgroup_layout;
457ca9aa
SH
112
113/*
114 * @cgroup_use - a copy of the lxc.cgroup.use
115 */
116char *cgroup_use;
117
e4aeecf5
CB
118/*
119 * @lxc_cgfsng_debug - whether to print debug info to stdout for the cgfsng
120 * driver
121 */
122static bool lxc_cgfsng_debug;
123
65d78313
MPS
124#define CGFSNG_DEBUG(format, ...) do { \
125 if (lxc_cgfsng_debug) \
126 printf("cgfsng: " format, ##__VA_ARGS__); \
127} while(0)
128
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated array @clist and then the
 * array itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);

	free(clist);
}
139
ccb4cabe
SH
/*
 * Allocate @sz bytes. The request is forwarded to must_realloc() with a
 * NULL pointer, so this helper follows the file's "do not fail"
 * convention for must_*() functions.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
145
ccb4cabe
SH
/*
 * Return a newly allocated copy of @entry with "name=" prepended, i.e.
 * "systemd" becomes "name=systemd". Does not fail; the caller owns the
 * returned string.
 */
static char *must_prefix_named(char *entry)
{
	static const char prefix[] = "name=";
	/* sizeof(prefix) already accounts for the terminating NUL byte. */
	size_t bufsz = strlen(entry) + sizeof(prefix);
	char *named = must_alloc(bufsz);

	snprintf(named, bufsz, "%s%s", prefix, entry);
	return named;
}
160
/*
 * Grow the NULL-terminated pointer array *@list by one slot. *@list may
 * be NULL, in which case a fresh two-slot array is allocated. Does not
 * fail.
 *
 * Returns the index of the slot that is now free for use. Only the slot
 * after it is set to NULL here, so the caller has to store something in
 * the returned slot to keep the list well formed.
 */
static int append_null_to_list(void ***list)
{
	int used = 0;

	if (*list) {
		while ((*list)[used])
			used++;
	}

	/* One slot for the new entry, one for the terminator. */
	*list = must_realloc(*list, (used + 2) * sizeof(void **));
	(*list)[used + 1] = NULL;
	return used;
}
178
/*
 * Return true if @entry compares equal to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	for (cur = list; cur && *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
196
/*
 * Append a copy of the controller @entry to *@clist. Does not fail.
 * *@clist must be NULL on the first call and is kept NULL-terminated.
 *
 * Named subsystems are handled here as well:
 *  - an entry that is in both @klist (kernel subsystems) and @nlist
 *    (named subsystems) is ambiguous; it is logged and NOT appended
 *  - an entry that already starts with "name=" or that is a kernel
 *    subsystem is copied verbatim
 *  - anything else is stored with a "name=" prefix
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	int slot;
	bool is_kernel = string_in_list(klist, entry);

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = must_prefix_named(entry);
}
232
ccb4cabe
SH
233static void free_handler_data(struct cgfsng_handler_data *d)
234{
ccb4cabe
SH
235 free(d->cgroup_pattern);
236 free(d->container_cgroup);
237 free(d->name);
43654d34
CB
238 if (d->cgroup_meta.dir)
239 free(d->cgroup_meta.dir);
240 if (d->cgroup_meta.controllers)
241 free(d->cgroup_meta.controllers);
ccb4cabe
SH
242 free(d);
243}
244
245/*
246 * Given a handler's cgroup data, return the struct hierarchy for the
247 * controller @c, or NULL if there is none.
248 */
457ca9aa 249struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
250{
251 int i;
252
457ca9aa 253 if (!hierarchies)
ccb4cabe 254 return NULL;
d6337a5f 255
457ca9aa 256 for (i = 0; hierarchies[i]; i++) {
d6337a5f
CB
257 if (!c) {
258 /* This is the empty unified hierarchy. */
259 if (hierarchies[i]->controllers &&
260 !hierarchies[i]->controllers[0])
261 return hierarchies[i];
262
263 return NULL;
264 }
265
457ca9aa
SH
266 if (string_in_list(hierarchies[i]->controllers, c))
267 return hierarchies[i];
ccb4cabe 268 }
d6337a5f 269
ccb4cabe
SH
270 return NULL;
271}
272
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes. Memory is handed out in
 * multiples of BATCH_SIZE, so a reallocation only happens when *@mem is
 * still NULL or when @newlen needs more batches than @oldlen did.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
283
/*
 * Append the string @new, which is @newlen bytes long, to the buffer
 * *@dest, which currently holds @oldlen bytes. The buffer is grown as
 * needed and the result stays NUL-terminated.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* The "+ 1" in both places covers @new's terminating NUL byte. */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);
	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
292
/*
 * Slurp in a whole file, line by line.
 *
 * Returns a NUL-terminated buffer the caller must free, or NULL if
 * @fnam cannot be opened or no line could be read from it.
 */
static char *read_file(const char *fnam)
{
	FILE *f;
	char *contents = NULL, *line = NULL;
	size_t linecap = 0, total = 0;
	int linelen;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	for (;;) {
		linelen = getline(&line, &linecap, f);
		if (linelen == -1)
			break;

		append_line(&contents, total, line, linelen);
		total += linelen;
	}

	fclose(f);
	free(line);
	return contents;
}
312
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
/* Number of uint32_t words needed to hold @nr bits. */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below address bit @bit of the uint32_t array @bitarr. The
 * mask is built from an unsigned 32-bit one: shifting a signed int 1 by
 * 31 would be undefined behaviour.
 */

/* Set bit @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
332
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenized in place and therefore modified. @nbits is the
 * number of bits the mask has to hold; a cpu number >= @nbits or a
 * reversed range makes the function fail.
 *
 * Returns a calloc()ed array the caller must free, or NULL on error.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token, *dash;
	char *saveptr = NULL;
	unsigned first, last;
	uint32_t *bitarr;

	bitarr = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!bitarr)
		return NULL;

	token = strtok_r(buf, ",", &saveptr);
	while (token) {
		errno = 0;
		first = strtoul(token, NULL, 0);
		last = first;

		/* "a-b" denotes a range, a lone number a single cpu. */
		dash = strchr(token, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		if (first > last || last >= nbits) {
			free(bitarr);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, bitarr);

		token = strtok_r(NULL, ",", &saveptr);
	}

	return bitarr;
}
374
a54694f8
CB
375/* Turn cpumask into simple, comma-separated cpulist. */
376static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
377{
378 size_t i;
379 int ret;
eab15c1e 380 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
381 char **cpulist = NULL;
382
383 for (i = 0; i <= nbits; i++) {
384 if (is_set(i, bitarr)) {
eab15c1e
CB
385 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
386 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
387 lxc_free_array((void **)cpulist, free);
388 return NULL;
389 }
390 if (lxc_append_string(&cpulist, numstr) < 0) {
391 lxc_free_array((void **)cpulist, free);
392 return NULL;
393 }
394 }
395 }
396 return lxc_string_join(",", (const char **)cpulist, false);
397}
398
/*
 * Return the number that follows the last ',' or '-' in @cpulist, or
 * the leading number if the string contains neither separator.
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *comma, *dash;
	char *last = cpulist;
	size_t cpus;

	comma = strrchr(cpulist, ',');
	dash = strrchr(cpulist, '-');

	/*
	 * Start parsing right after whichever separator occurs last. The
	 * two pointers are only compared when both are non-NULL.
	 */
	if (comma && (!dash || comma > dash))
		last = comma + 1;
	else if (dash)
		last = dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
433
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Write the cpuset.cpus file of the cgroup directory @path: take the
 * cpuset.cpus of its parent directory and remove every cpu listed in
 * __ISOL_CPUS.
 *
 * @path:           cgroup directory. It is cut at its last '/' while the
 *                  parent is read and restored before returning.
 * @am_initialized: true if the cgroup's cpuset was already set up by
 *                  someone else. In that case cpuset.cpus is only
 *                  rewritten when an isolated cpu had to be removed;
 *                  otherwise the (filtered) parent value is always
 *                  written.
 *
 * Returns true on success, false on error.
 */
static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	if (!isdigit(isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* maxposs becomes the number of bits: highest cpu number + 1. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1; bit maxposs is out of range. */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		/* Already initialized and nothing removed: nothing to write. */
		if (am_initialized)
			goto on_success;
		/* Not initialized yet: the parent's cpus still have to be
		 * copied, exactly as in the two cases above.
		 */
		DEBUG("Copying cpuset of parent cgroup.");
	} else {
		DEBUG("Removed isolated cpus from cpuset.");
	}

	/* posscpus was tokenized in place by lxc_cpumask(), so the list is
	 * rebuilt from the mask.
	 */
	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	*lastslash = oldv;
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	/* Hand @path back to the caller intact on every exit. */
	*lastslash = oldv;
	free(fpath);

	free(isolcpus);
	free(isolmask);

	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
571
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * @path is cut at its last '/' only while the parent's file name is
 * built and is restored before anything can fail.
 *
 * Returns true on success, false if @path contains no '/', the parent's
 * file is empty or unreadable, or the write fails.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	/* fpath is a separate string, so @path can be restored right away;
	 * this way no exit leaves it truncated.
	 */
	*lastslash = oldv;

	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;
	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the buffer: it is printed with %s below. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
609
610/*
611 * Initialize the cpuset hierarchy in first directory of @gname and
612 * set cgroup.clone_children so that children inherit settings.
613 * Since the h->base_path is populated by init or ourselves, we know
614 * it is already initialized.
615 */
a3926f6a 616static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
617{
618 char *cgpath, *clonechildrenpath, v, *slash;
619
620 if (!string_in_list(h->controllers, "cpuset"))
621 return true;
622
623 if (*cgname == '/')
624 cgname++;
625 slash = strchr(cgname, '/');
626 if (slash)
627 *slash = '\0';
628
629 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
630 if (slash)
631 *slash = '/';
632 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
633 SYSERROR("Failed to create '%s'", cgpath);
634 free(cgpath);
635 return false;
636 }
6f9584d8 637
e3a3fecf 638 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
639 /* unified hierarchy doesn't have clone_children */
640 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
641 free(clonechildrenpath);
642 free(cgpath);
643 return true;
644 }
645 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
646 SYSERROR("Failed to read '%s'", clonechildrenpath);
647 free(clonechildrenpath);
648 free(cgpath);
649 return false;
650 }
651
a54694f8 652 /* Make sure any isolated cpus are removed from cpuset.cpus. */
a3926f6a 653 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
6f9584d8
CB
654 SYSERROR("Failed to remove isolated cpus.");
655 free(clonechildrenpath);
656 free(cgpath);
a54694f8 657 return false;
6f9584d8 658 }
a54694f8 659
e3a3fecf 660 if (v == '1') { /* already set for us by someone else */
6f9584d8 661 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
662 free(clonechildrenpath);
663 free(cgpath);
664 return true;
665 }
666
667 /* copy parent's settings */
a54694f8 668 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 669 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
670 free(cgpath);
671 free(clonechildrenpath);
672 return false;
673 }
674 free(cgpath);
675
676 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
677 /* Set clone_children so children inherit our settings */
678 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
679 free(clonechildrenpath);
680 return false;
681 }
682 free(clonechildrenpath);
683 return true;
684}
685
ccb4cabe
SH
/*
 * Given two NULL-terminated lists of strings, return true if any string
 * is in both. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (!l1 || !l2)
		return false;

	for (cur = l1; *cur; cur++)
		if (string_in_list(l2, *cur))
			return true;

	return false;
}
703
704/*
705 * For a null-terminated list of controllers @clist, return true if any of
706 * those controllers is already listed the null-terminated list of
707 * hierarchies @hlist. Realistically, if one is present, all must be present.
708 */
709static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
710{
711 int i;
712
713 if (!hlist)
714 return false;
715 for (i = 0; hlist[i]; i++)
716 if (controller_lists_intersect(hlist[i]->controllers, clist))
717 return true;
718 return false;
719
720}
721
722/*
723 * Return true if the controller @entry is found in the null-terminated
724 * list of hierarchies @hlist
725 */
726static bool controller_found(struct hierarchy **hlist, char *entry)
727{
728 int i;
d6337a5f 729
ccb4cabe
SH
730 if (!hlist)
731 return false;
732
733 for (i = 0; hlist[i]; i++)
734 if (string_in_list(hlist[i]->controllers, entry))
735 return true;
d6337a5f 736
ccb4cabe
SH
737 return false;
738}
739
740/*
c30b61c3
SH
741 * Return true if all of the controllers which we require have been found.
742 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 743 */
457ca9aa 744static bool all_controllers_found(void)
ccb4cabe
SH
745{
746 char *p, *saveptr = NULL;
457ca9aa 747 struct hierarchy ** hlist = hierarchies;
ccb4cabe 748
ccb4cabe 749 if (!controller_found(hlist, "freezer")) {
65d78313 750 CGFSNG_DEBUG("No freezer controller mountpoint found\n");
ccb4cabe
SH
751 return false;
752 }
753
457ca9aa 754 if (!cgroup_use)
ccb4cabe 755 return true;
c2712f64 756
457ca9aa 757 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
758 p = strtok_r(NULL, ",", &saveptr)) {
759 if (!controller_found(hlist, p)) {
65d78313 760 CGFSNG_DEBUG("No %s controller mountpoint found\n", p);
ccb4cabe
SH
761 return false;
762 }
763 }
c2712f64 764
ccb4cabe
SH
765 return true;
766}
767
ccb4cabe
SH
768/*
769 * Get the controllers from a mountinfo line
770 * There are other ways we could get this info. For lxcfs, field 3
771 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
772 * options. But we simply assume that the mountpoint must be
773 * /sys/fs/cgroup/controller-list
774 */
a3926f6a
CB
775static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
776 int type)
ccb4cabe 777{
6328fd9c 778 /* the fourth field is /sys/fs/cgroup/comma-delimited-controller-list */
ccb4cabe 779 int i;
411ac6d8 780 char *dup, *p2, *tok;
d6337a5f 781 char *p = line, *saveptr = NULL, *sep = ",";
411ac6d8 782 char **aret = NULL;
6328fd9c 783
ccb4cabe 784 for (i = 0; i < 4; i++) {
235f1815 785 p = strchr(p, ' ');
ccb4cabe
SH
786 if (!p)
787 return NULL;
788 p++;
789 }
a55f31bd 790
ccb4cabe
SH
791 /* note - if we change how mountinfo works, then our caller
792 * will need to verify /sys/fs/cgroup/ in this field */
c2712f64 793 if (strncmp(p, "/sys/fs/cgroup/", 15)) {
65d78313 794 CGFSNG_DEBUG("Found hierarchy not under /sys/fs/cgroup: \"%s\"\n", p);
ccb4cabe 795 return NULL;
5059aae9 796 }
d6337a5f 797
ccb4cabe 798 p += 15;
235f1815 799 p2 = strchr(p, ' ');
ccb4cabe 800 if (!p2) {
65d78313 801 CGFSNG_DEBUG("Corrupt mountinfo\n");
ccb4cabe
SH
802 return NULL;
803 }
804 *p2 = '\0';
6328fd9c 805
d6337a5f
CB
806 if (type == CGROUP_SUPER_MAGIC) {
807 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will
808 * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
809 */
810 dup = strdup(p);
811 if (!dup)
812 return NULL;
813
814 for (tok = strtok_r(dup, sep, &saveptr); tok;
815 tok = strtok_r(NULL, sep, &saveptr))
816 must_append_controller(klist, nlist, &aret, tok);
817
818 free(dup);
411ac6d8 819 }
d6337a5f
CB
820 *p2 = ' ';
821 return aret;
822}
411ac6d8 823
d6337a5f
CB
/*
 * Return a newly allocated controller list without entries: its first
 * element is already the terminating NULL. The caller owns the list.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot = append_null_to_list((void ***)&empty);

	empty[slot] = NULL;
	return empty;
}
833
/*
 * Read @file and split its contents at spaces, tabs and newlines.
 *
 * Returns a NULL-terminated list with a copy of every token, owned by
 * the caller, or NULL if the file could not be read or holds no token.
 */
static char **cg_unified_get_controllers(const char *file)
{
	char *buf, *tok;
	char *saveptr = NULL, *sep = " \t\n";
	char **aret = NULL;

	buf = read_file(file);
	if (!buf)
		return NULL;

	tok = strtok_r(buf, sep, &saveptr);
	while (tok) {
		int slot = append_null_to_list((void ***)&aret);

		aret[slot] = must_copy_string(tok);
		tok = strtok_r(NULL, sep, &saveptr);
	}

	free(buf);
	return aret;
}
857
d6337a5f
CB
858static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
859 char *base_cgroup, int type)
ccb4cabe
SH
860{
861 struct hierarchy *new;
862 int newentry;
863
864 new = must_alloc(sizeof(*new));
865 new->controllers = clist;
866 new->mountpoint = mountpoint;
867 new->base_cgroup = base_cgroup;
868 new->fullcgpath = NULL;
d6337a5f 869 new->version = type;
6328fd9c 870
457ca9aa
SH
871 newentry = append_null_to_list((void ***)&hierarchies);
872 hierarchies[newentry] = new;
d6337a5f 873 return new;
ccb4cabe
SH
874}
875
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * The mountpoint is the field after the fourth space and has to start
 * with "/sys/fs/cgroup/". The space that ends the field is overwritten
 * with a NUL byte in @line and is not restored.
 *
 * Returns a newly allocated string the caller must free, or NULL if the
 * line does not match.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int skipped;
	size_t len;
	char *end, *copy;
	char *field = line;

	for (skipped = 0; skipped < 4; skipped++) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
	}

	if (strncmp(field, "/sys/fs/cgroup/", 15) != 0)
		return NULL;

	end = strchr(field + 15, ' ');
	if (!end)
		return NULL;
	*end = '\0';

	/* The field now ends at @end, so the NUL is copied along with it. */
	len = end - field;
	copy = must_alloc(len + 1);
	memcpy(copy, field, len + 1);
	return copy;
}
909
/*
 * Given a multi-line string, return a NUL-terminated copy of the
 * current line, i.e. everything from @p up to but excluding the next
 * '\n'. Returns NULL if there is no '\n' after @p. The caller owns the
 * copy.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	len = newline - p;
	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';
	return copy;
}
928
/*
 * cgline: pointer to character after the first ':' in a line in a
 * \n-terminated /proc/self/cgroup file. Check whether controller @c is
 * present in the comma-separated list that runs up to the next ':'.
 *
 * The list is scanned in place: @cgline is not modified and no
 * temporary copy is made. Returns false if @cgline contains no ':'.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	char *eol, *tok, *sep;
	size_t clen, toklen;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	clen = strlen(c);

	for (tok = cgline; tok < eol; tok = sep + 1) {
		/* A token ends at the next ',' or at the closing ':'. */
		sep = memchr(tok, ',', eol - tok);
		if (!sep)
			sep = eol;

		toklen = sep - tok;
		/* Empty tokens never match anything. */
		if (toklen > 0 && toklen == clen && memcmp(tok, c, clen) == 0)
			return true;
	}

	return false;
}
956
957/*
958 * @basecginfo is a copy of /proc/$$/cgroup. Return the current
959 * cgroup for @controller
960 */
a3926f6a 961static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller, int type)
ccb4cabe
SH
962{
963 char *p = basecginfo;
6328fd9c 964
d6337a5f
CB
965 for (;;) {
966 bool is_cgv2_base_cgroup = false;
967
6328fd9c 968 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
969 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
970 is_cgv2_base_cgroup = true;
ccb4cabe 971
235f1815 972 p = strchr(p, ':');
ccb4cabe
SH
973 if (!p)
974 return NULL;
975 p++;
d6337a5f
CB
976
977 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 978 p = strchr(p, ':');
ccb4cabe
SH
979 if (!p)
980 return NULL;
981 p++;
982 return copy_to_eol(p);
983 }
984
235f1815 985 p = strchr(p, '\n');
ccb4cabe
SH
986 if (!p)
987 return NULL;
988 p++;
989 }
990}
991
ccb4cabe
SH
/*
 * Append a copy of @entry to the NULL-terminated string list *@list,
 * which may still be NULL. Does not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
1000
/*
 * Parse /proc/self/cgroup and sort the controllers found in it into two
 * lists: entries that start with "name=" are appended to *@nlist, all
 * others to *@klist. Lines without two ':' are skipped.
 *
 * Returns 0 on success and -1 if the file cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *ctrls, *end, *tok, *saveptr = NULL;

		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		end = strchr(ctrls, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		tok = strtok_r(ctrls, ",", &saveptr);
		while (tok) {
			char ***target = klist;

			if (strncmp(tok, "name=", 5) == 0)
				target = nlist;

			must_append_string(target, tok);
			tok = strtok_r(NULL, ",", &saveptr);
		}
	}

	free(line);
	fclose(f);
	return 0;
}
1047
/*
 * Strip trailing newlines from @s in place. The first character is
 * never removed, so a string consisting of a single "\n" stays as it
 * is.
 */
static void trim(char *s)
{
	size_t len;

	for (len = strlen(s); len > 1 && s[len - 1] == '\n'; len--)
		s[len - 1] = '\0';
}
1054
e4aeecf5
CB
1055static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
1056{
1057 printf("Cgroup information:\n");
1058 printf(" container name: %s\n", d->name ? d->name : "(null)");
1059 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
43654d34
CB
1060 printf(" lxc.cgroup.pattern: %s\n",
1061 d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1062 printf(" lxc.cgroup.dir: %s\n",
1063 d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
1064 printf(" cgroup: %s\n",
1065 d->container_cgroup ? d->container_cgroup : "(null)");
e4aeecf5
CB
1066}
1067
1068static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1069{
a7b0cc4c 1070 struct hierarchy **it;
ccb4cabe 1071 int i;
41c33dbe 1072
457ca9aa 1073 if (!hierarchies) {
c2712f64 1074 printf(" No hierarchies found\n");
ccb4cabe
SH
1075 return;
1076 }
e4aeecf5 1077 printf(" Hierarchies:\n");
a7b0cc4c
CB
1078 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1079 char **cit;
ccb4cabe 1080 int j;
c2712f64
CB
1081 printf(" %d: base_cgroup: %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1082 printf(" mountpoint: %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
e4aeecf5 1083 printf(" controllers:\n");
a7b0cc4c 1084 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1085 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1086 }
1087}
41c33dbe 1088
a3926f6a
CB
/*
 * Print @basecginfo followed by the kernel subsystems in @klist and the
 * named subsystems in @nlist to stdout. Both lists are NULL-terminated
 * and may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					       char **nlist)
{
	int k;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	if (klist) {
		for (k = 0; klist[k]; k++)
			printf("kernel subsystem %d: %s\n", k, klist[k]);
	}

	if (nlist) {
		for (k = 0; nlist[k]; k++)
			printf("named subsystem %d: %s\n", k, nlist[k]);
	}
}
ccb4cabe 1103
e4aeecf5
CB
/*
 * Print all debug information of the driver to stdout: first the
 * handler data @d, then the list of hierarchies.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1109
ccb4cabe
SH
1110/*
1111 * At startup, parse_hierarchies finds all the info we need about
1112 * cgroup mountpoints and current cgroups, and stores it in @d.
1113 */
a3926f6a 1114static bool cg_hybrid_init(void)
ccb4cabe 1115{
d6337a5f
CB
1116 int ret;
1117 char *basecginfo;
1118 bool will_escape;
ccb4cabe 1119 FILE *f;
ccb4cabe 1120 size_t len = 0;
d6337a5f
CB
1121 char *line = NULL;
1122 char **klist = NULL, **nlist = NULL;
ccb4cabe 1123
d30ec4cb
SH
1124 /*
1125 * Root spawned containers escape the current cgroup, so use init's
1126 * cgroups as our base in that case.
1127 */
d6337a5f
CB
1128 will_escape = (geteuid() == 0);
1129 if (will_escape)
ccb4cabe 1130 basecginfo = read_file("/proc/1/cgroup");
d6337a5f
CB
1131 else
1132 basecginfo = read_file("/proc/self/cgroup");
ccb4cabe
SH
1133 if (!basecginfo)
1134 return false;
1135
d6337a5f
CB
1136 ret = get_existing_subsystems(&klist, &nlist);
1137 if (ret < 0) {
1138 CGFSNG_DEBUG("Failed to retrieve available cgroup v1 controllers\n");
1139 free(basecginfo);
ccb4cabe
SH
1140 return false;
1141 }
1142
d6337a5f
CB
1143 f = fopen("/proc/self/mountinfo", "r");
1144 if (!f) {
1145 CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
bd01b7d5 1146 free(basecginfo);
d6337a5f
CB
1147 return false;
1148 }
41c33dbe 1149
e4aeecf5
CB
1150 if (lxc_cgfsng_debug)
1151 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe 1152
ccb4cabe 1153 while (getline(&line, &len, f) != -1) {
49ff3958 1154 int type;
d6337a5f
CB
1155 bool writeable;
1156 struct hierarchy *new;
1157 char *mountpoint = NULL, *base_cgroup = NULL;
1158 char **controller_list = NULL;
ccb4cabe 1159
49ff3958 1160 type = get_cgroup_version(line);
d6337a5f 1161 if (type == 0)
ccb4cabe
SH
1162 continue;
1163
d6337a5f 1164 if (type == CGROUP2_SUPER_MAGIC && unified)
ccb4cabe
SH
1165 continue;
1166
d6337a5f
CB
1167 if (cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
1168 if (type == CGROUP2_SUPER_MAGIC)
1169 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1170 else if (type == CGROUP_SUPER_MAGIC)
1171 cgroup_layout = CGROUP_LAYOUT_LEGACY;
1172 } else if (cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
1173 if (type == CGROUP_SUPER_MAGIC)
1174 cgroup_layout = CGROUP_LAYOUT_HYBRID;
1175 } else if (cgroup_layout == CGROUP_LAYOUT_LEGACY) {
1176 if (type == CGROUP2_SUPER_MAGIC)
1177 cgroup_layout = CGROUP_LAYOUT_HYBRID;
ccb4cabe
SH
1178 }
1179
a3926f6a 1180 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
d6337a5f
CB
1181 if (!controller_list && type == CGROUP_SUPER_MAGIC)
1182 continue;
1183
1184 if (type == CGROUP_SUPER_MAGIC)
1185 if (controller_list_is_dup(hierarchies, controller_list))
1186 goto next;
1187
a3926f6a 1188 mountpoint = cg_hybrid_get_mountpoint(line);
ccb4cabe 1189 if (!mountpoint) {
65d78313 1190 CGFSNG_DEBUG("Failed parsing mountpoint from \"%s\"\n", line);
d6337a5f 1191 goto next;
ccb4cabe
SH
1192 }
1193
d6337a5f 1194 if (type == CGROUP_SUPER_MAGIC)
a3926f6a 1195 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
d6337a5f 1196 else
a3926f6a 1197 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
ccb4cabe 1198 if (!base_cgroup) {
d6337a5f
CB
1199 CGFSNG_DEBUG("Failed to find current cgroup\n");
1200 goto next;
ccb4cabe 1201 }
6328fd9c 1202
ccb4cabe
SH
1203 trim(base_cgroup);
1204 prune_init_scope(base_cgroup);
d6337a5f 1205 if (type == CGROUP2_SUPER_MAGIC)
6328fd9c
CB
1206 writeable = test_writeable_v2(mountpoint, base_cgroup);
1207 else
1208 writeable = test_writeable_v1(mountpoint, base_cgroup);
d6337a5f
CB
1209 if (!writeable)
1210 goto next;
1211
1212 if (type == CGROUP2_SUPER_MAGIC) {
1213 char *cgv2_ctrl_path;
1214
1215 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
1216 "cgroup.controllers",
1217 NULL);
1218
1219 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
1220 free(cgv2_ctrl_path);
1221 if (!controller_list)
1222 controller_list = cg_unified_make_empty_controller();
ccb4cabe 1223 }
d6337a5f
CB
1224 new = add_hierarchy(controller_list, mountpoint, base_cgroup, type);
1225 if (type == CGROUP2_SUPER_MAGIC && !unified)
1226 unified = new;
1227
1228 continue;
1229
1230 next:
1231 free_string_list(controller_list);
1232 free(mountpoint);
1233 free(base_cgroup);
ccb4cabe
SH
1234 }
1235
1236 free_string_list(klist);
1237 free_string_list(nlist);
1238
1239 free(basecginfo);
1240
1241 fclose(f);
1242 free(line);
1243
e4aeecf5
CB
1244 if (lxc_cgfsng_debug) {
1245 printf("writeable subsystems:\n");
1246 lxc_cgfsng_print_hierarchies();
1247 }
1248
ccb4cabe
SH
1249 /* verify that all controllers in cgroup.use and all crucial
1250 * controllers are accounted for
1251 */
c2712f64 1252 if (!all_controllers_found())
ccb4cabe
SH
1253 return false;
1254
1255 return true;
1256}
1257
d6337a5f
CB
1258static int cg_is_pure_unified(void) {
1259
1260 int ret;
1261 struct statfs fs;
1262
1263 ret = statfs("/sys/fs/cgroup", &fs);
1264 if (ret < 0)
1265 return -ENOMEDIUM;
1266
1267 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
1268 return CGROUP2_SUPER_MAGIC;
1269
1270 return 0;
1271}
1272
/* Look up the current cgroup of the cgroup v2 hierarchy.
 *
 * Reads /proc/1/cgroup when running with euid 0 and /proc/self/cgroup
 * otherwise, locates the "0::/" entry and returns a trimmed copy of the path
 * that follows "0::" up to the end of that line. Returns NULL if the file
 * cannot be read, no such entry exists or the copy fails.
 */
static char *cg_unified_get_current_cgroup(void)
{
	char *cginfo, *entry;
	char *result = NULL;

	if (geteuid() == 0)
		cginfo = read_file("/proc/1/cgroup");
	else
		cginfo = read_file("/proc/self/cgroup");
	if (!cginfo)
		return NULL;

	entry = strstr(cginfo, "0::/");
	if (entry) {
		/* Skip "0::" so the copy starts at the leading '/'. */
		result = copy_to_eol(entry + 3);
	}

	free(cginfo);
	if (result)
		trim(result);

	return result;
}
1305
a3926f6a 1306static int cg_unified_init(void)
d6337a5f
CB
1307{
1308 int ret;
1309 char *mountpoint, *subtree_path;
1310 char **delegatable;
1311 char *base_cgroup = NULL;
1312
1313 ret = cg_is_pure_unified();
1314 if (ret == -ENOMEDIUM)
1315 return -ENOMEDIUM;
1316
1317 if (ret != CGROUP2_SUPER_MAGIC)
1318 return 0;
1319
a3926f6a 1320 base_cgroup = cg_unified_get_current_cgroup();
d6337a5f
CB
1321 if (!base_cgroup)
1322 return -EINVAL;
1323 prune_init_scope(base_cgroup);
1324
1325 /* We assume that we have already been given controllers to delegate
1326 * further down the hierarchy. If not it is up to the user to delegate
1327 * them to us.
1328 */
1329 mountpoint = must_copy_string("/sys/fs/cgroup");
1330 subtree_path = must_make_path(mountpoint, base_cgroup,
1331 "cgroup.subtree_control", NULL);
1332 delegatable = cg_unified_get_controllers(subtree_path);
1333 free(subtree_path);
1334 if (!delegatable)
1335 delegatable = cg_unified_make_empty_controller();
1336 if (!delegatable[0])
1337 CGFSNG_DEBUG("No controllers are enabled for delegation\n");
1338
1339 /* TODO: If the user requested specific controllers via lxc.cgroup.use
1340 * we should verify here. The reason I'm not doing it right is that I'm
1341 * not convinced that lxc.cgroup.use will be the future since it is a
1342 * global property. I much rather have an option that lets you request
1343 * controllers per container.
1344 */
1345
1346 add_hierarchy(delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
1347 unified = hierarchies[0];
1348
1349 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1350 return CGROUP2_SUPER_MAGIC;
1351}
1352
1353static bool cg_init(void)
1354{
1355 int ret;
457ca9aa 1356 const char *tmp;
d6337a5f 1357
457ca9aa
SH
1358 errno = 0;
1359 tmp = lxc_global_config_value("lxc.cgroup.use");
1a0e70ac 1360 if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
65d78313 1361 CGFSNG_DEBUG("Failed to retrieve list of cgroups to use\n");
457ca9aa
SH
1362 return false;
1363 }
1364 cgroup_use = must_copy_string(tmp);
1365
a3926f6a 1366 ret = cg_unified_init();
d6337a5f
CB
1367 if (ret < 0)
1368 return false;
1369
1370 if (ret == CGROUP2_SUPER_MAGIC)
1371 return true;
1372
a3926f6a 1373 return cg_hybrid_init();
457ca9aa
SH
1374}
1375
43654d34 1376static void *cgfsng_init(struct lxc_handler *handler)
ccb4cabe 1377{
457ca9aa 1378 const char *cgroup_pattern;
43654d34 1379 struct cgfsng_handler_data *d;
ccb4cabe
SH
1380
1381 d = must_alloc(sizeof(*d));
1382 memset(d, 0, sizeof(*d));
1383
43654d34
CB
1384 /* copy container name */
1385 d->name = must_copy_string(handler->name);
1386
1387 /* copy per-container cgroup information */
ae5e6c08
CB
1388 d->cgroup_meta.dir = NULL;
1389 d->cgroup_meta.controllers = NULL;
9b5396f9
CB
1390 if (handler->conf) {
1391 d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
1392 d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
1393 }
ccb4cabe 1394
43654d34 1395 /* copy system-wide cgroup information */
ccb4cabe 1396 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
43654d34
CB
1397 if (!cgroup_pattern) {
1398 /* lxc.cgroup.pattern is only NULL on error. */
ccb4cabe
SH
1399 ERROR("Error getting cgroup pattern");
1400 goto out_free;
1401 }
1402 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1403
d6337a5f
CB
1404 d->cgroup_layout = cgroup_layout;
1405 if (d->cgroup_layout == CGROUP_LAYOUT_LEGACY)
1406 TRACE("Running with legacy cgroup layout");
1407 else if (d->cgroup_layout == CGROUP_LAYOUT_HYBRID)
1408 TRACE("Running with hybrid cgroup layout");
1409 else if (d->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
1410 TRACE("Running with unified cgroup layout");
1411 else
1412 WARN("Running with unknown cgroup layout");
1413
e4aeecf5
CB
1414 if (lxc_cgfsng_debug)
1415 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1416
1417 return d;
1418
1419out_free:
1420 free_handler_data(d);
1421 return NULL;
1422}
1423
/* Remove the directory @dirname after recursively removing all of its
 * subdirectories. Entries that are not directories are left alone.
 *
 * Removal is attempted for every subdirectory even after a failure; only the
 * first failure is logged.
 *
 * Returns 0 on success and -1 if the directory could not be opened or any
 * step failed.
 */
static int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat %s", pathname);
			r = -1;
			goto next;
		}

		/* Only directories are descended into and removed. */
		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;
	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("%s - Failed to delete \"%s\"", strerror(errno),
			     dirname);
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* Report what actually failed instead of repeating the
		 * rmdir() message.
		 */
		if (!r)
			WARN("%s - Failed to close directory \"%s\"",
			     strerror(errno), dirname);
		r = -1;
	}

	return r;
}
1481
bd8ef4e4
CB
1482static int cgroup_rmdir(char *container_cgroup)
1483{
1484 int i;
1485
1486 if (!container_cgroup || !hierarchies)
1487 return 0;
1488
1489 for (i = 0; hierarchies[i]; i++) {
1490 int ret;
1491 struct hierarchy *h = hierarchies[i];
1492
1493 if (!h->fullcgpath)
1494 continue;
1495
1496 ret = recursive_destroy(h->fullcgpath);
1497 if (ret < 0)
1498 WARN("Failed to destroy \"%s\"", h->fullcgpath);
1499
1500 free(h->fullcgpath);
1501 h->fullcgpath = NULL;
1502 }
1503
1504 return 0;
1505}
1506
4160c3a0
CB
/* Argument bundle handed to the callbacks that are run through
 * userns_exec_1().
 */
struct generic_userns_exec_data {
	/* handler data of the container the callback operates on */
	struct cgfsng_handler_data *d;
	/* container configuration; the callbacks read the id mapping
	 * related fields from it
	 */
	struct lxc_conf *conf;
	/* target uid in parent namespace */
	uid_t origuid;
	/* NOTE(review): not read by the callbacks visible here */
	char *path;
};
1513
bd8ef4e4 1514static int cgroup_rmdir_wrapper(void *data)
ccb4cabe 1515{
4160c3a0
CB
1516 struct generic_userns_exec_data *arg = data;
1517 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1518 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1519
4160c3a0 1520 if (setresgid(nsgid, nsgid, nsgid) < 0)
ccb4cabe 1521 SYSERROR("Failed to setgid to 0");
4160c3a0 1522 if (setresuid(nsuid, nsuid, nsuid) < 0)
ccb4cabe 1523 SYSERROR("Failed to setuid to 0");
a19b974f 1524 if (setgroups(0, NULL) < 0 && errno != EPERM)
ccb4cabe
SH
1525 SYSERROR("Failed to clear groups");
1526
bd8ef4e4 1527 return cgroup_rmdir(arg->d->container_cgroup);
ccb4cabe
SH
1528}
1529
bd8ef4e4 1530static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
ccb4cabe 1531{
bd8ef4e4
CB
1532 int ret;
1533 struct cgfsng_handler_data *d = hdata;
4160c3a0
CB
1534 struct generic_userns_exec_data wrap;
1535
bd8ef4e4
CB
1536 if (!d)
1537 return;
1538
4160c3a0 1539 wrap.origuid = 0;
bd8ef4e4 1540 wrap.d = hdata;
4160c3a0
CB
1541 wrap.conf = conf;
1542
ccb4cabe 1543 if (conf && !lxc_list_empty(&conf->id_map))
bd8ef4e4
CB
1544 ret = userns_exec_1(conf, cgroup_rmdir_wrapper, &wrap,
1545 "cgroup_rmdir_wrapper");
ccb4cabe 1546 else
bd8ef4e4
CB
1547 ret = cgroup_rmdir(d->container_cgroup);
1548 if (ret < 0) {
1549 WARN("Failed to destroy cgroups");
ccb4cabe 1550 return;
ccb4cabe
SH
1551 }
1552
1553 free_handler_data(d);
1554}
1555
1556struct cgroup_ops *cgfsng_ops_init(void)
1557{
e4aeecf5
CB
1558 if (getenv("LXC_DEBUG_CGFSNG"))
1559 lxc_cgfsng_debug = true;
1560
d6337a5f 1561 if (!cg_init())
457ca9aa 1562 return NULL;
e4aeecf5 1563
ccb4cabe
SH
1564 return &cgfsng_ops;
1565}
1566
a3926f6a 1567static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
0c3deb94
CB
1568{
1569 char **it;
1570 size_t i, parts_len;
1571 size_t full_len = 0;
1572 char *add_controllers = NULL, *cgroup = NULL;
1573 char **parts = NULL;
1574 bool bret = false;
1575
1576 if (h->version != CGROUP2_SUPER_MAGIC)
1577 return true;
1578
1579 if (!h->controllers)
1580 return true;
1581
1582 /* For now we simply enable all controllers that we have detected by
1583 * creating a string like "+memory +pids +cpu +io".
1584 * TODO: In the near future we might want to support "-<controller>"
1585 * etc. but whether supporting semantics like this make sense will need
1586 * some thinking.
1587 */
1588 for (it = h->controllers; it && *it; it++) {
1589 full_len += strlen(*it) + 2;
1590 add_controllers = must_realloc(add_controllers, full_len + 1);
1591 if (h->controllers[0] == *it)
1592 add_controllers[0] = '\0';
1593 strcat(add_controllers, "+");
1594 strcat(add_controllers, *it);
1595 if ((it + 1) && *(it + 1))
1596 strcat(add_controllers, " ");
1597 }
1598
1599 parts = lxc_string_split(cgname, '/');
1600 if (!parts)
1601 goto on_error;
1602 parts_len = lxc_array_len((void **)parts);
1603 if (parts_len > 0)
1604 parts_len--;
1605
1606 cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
1607 for (i = 0; i < parts_len; i++) {
1608 int ret;
1609 char *target;
1610
1611 cgroup = must_append_path(cgroup, parts[i], NULL);
1612 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1613 ret = lxc_write_to_file(target, add_controllers, full_len, false);
1614 free(target);
1615 if (ret < 0) {
1616 SYSERROR("Could not enable \"%s\" controllers in the "
1617 "unified cgroup \"%s\"", add_controllers, cgroup);
1618 goto on_error;
1619 }
1620 }
1621
1622 bret = true;
1623
1624on_error:
1625 lxc_free_array((void **)parts, free);
1626 free(add_controllers);
1627 free(cgroup);
1628 return bret;
1629}
1630
ccb4cabe
SH
1631static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1632{
0c3deb94
CB
1633 int ret;
1634
e3a3fecf 1635 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1a0e70ac 1636 if (dir_exists(h->fullcgpath)) { /* it must not already exist */
0c3deb94 1637 ERROR("cgroup \"%s\" already existed", h->fullcgpath);
d8da679e 1638 return false;
6f9584d8 1639 }
0c3deb94 1640
a3926f6a 1641 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
0c3deb94
CB
1642 ERROR("Failed to handle cgroupfs v1 cpuset controller");
1643 return false;
1644 }
1645
1646 ret = mkdir_p(h->fullcgpath, 0755);
1647 if (ret < 0) {
1648 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
e3a3fecf 1649 return false;
6f9584d8 1650 }
0c3deb94 1651
a3926f6a 1652 return cg_unified_create_cgroup(h, cgname);
ccb4cabe
SH
1653}
1654
1655static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1656{
1657 if (rmdir(h->fullcgpath) < 0)
1658 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1659 free(h->fullcgpath);
1660 h->fullcgpath = NULL;
1661}
1662
1663/*
d30ec4cb 1664 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1665 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1666 */
1667static inline bool cgfsng_create(void *hdata)
1668{
bb30b52a 1669 int i;
ccb4cabe 1670 size_t len;
0c3deb94 1671 char *container_cgroup, *offset, *tmp;
7d531e9b
CB
1672 int idx = 0;
1673 struct cgfsng_handler_data *d = hdata;
ccb4cabe
SH
1674
1675 if (!d)
1676 return false;
43654d34 1677
ccb4cabe
SH
1678 if (d->container_cgroup) {
1679 WARN("cgfsng_create called a second time");
1680 return false;
1681 }
1682
43654d34 1683 if (d->cgroup_meta.dir)
7d531e9b 1684 tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
43654d34
CB
1685 else
1686 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
ccb4cabe
SH
1687 if (!tmp) {
1688 ERROR("Failed expanding cgroup name pattern");
1689 return false;
1690 }
1a0e70ac 1691 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
0c3deb94
CB
1692 container_cgroup = must_alloc(len);
1693 strcpy(container_cgroup, tmp);
ccb4cabe 1694 free(tmp);
0c3deb94 1695 offset = container_cgroup + len - 5;
ccb4cabe
SH
1696
1697again:
95adfe93
SH
1698 if (idx == 1000) {
1699 ERROR("Too many conflicting cgroup names");
ccb4cabe 1700 goto out_free;
95adfe93 1701 }
66b66624 1702 if (idx) {
bb30b52a
CB
1703 int ret;
1704
66b66624
CB
1705 ret = snprintf(offset, 5, "-%d", idx);
1706 if (ret < 0 || (size_t)ret >= 5) {
1707 FILE *f = fopen("/dev/null", "w");
97ebced3 1708 if (f) {
66b66624
CB
1709 fprintf(f, "Workaround for GCC7 bug: "
1710 "https://gcc.gnu.org/bugzilla/"
1711 "show_bug.cgi?id=78969");
1712 fclose(f);
1713 }
1714 }
1715 }
457ca9aa 1716 for (i = 0; hierarchies[i]; i++) {
0c3deb94 1717 if (!create_path_for_hierarchy(hierarchies[i], container_cgroup)) {
ccb4cabe 1718 int j;
1a0e70ac 1719 ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
457ca9aa
SH
1720 free(hierarchies[i]->fullcgpath);
1721 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1722 for (j = 0; j < i; j++)
0c3deb94 1723 remove_path_for_hierarchy(hierarchies[j], container_cgroup);
ccb4cabe
SH
1724 idx++;
1725 goto again;
1726 }
1727 }
1728 /* Done */
0c3deb94 1729 d->container_cgroup = container_cgroup;
ccb4cabe
SH
1730 return true;
1731
1732out_free:
0c3deb94 1733 free(container_cgroup);
ccb4cabe
SH
1734 return false;
1735}
1736
ccb4cabe
SH
1737static bool cgfsng_enter(void *hdata, pid_t pid)
1738{
ccb4cabe
SH
1739 char pidstr[25];
1740 int i, len;
1741
1742 len = snprintf(pidstr, 25, "%d", pid);
1743 if (len < 0 || len > 25)
1744 return false;
1745
457ca9aa
SH
1746 for (i = 0; hierarchies[i]; i++) {
1747 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1748 "cgroup.procs", NULL);
1749 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1750 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1751 free(fullpath);
1752 return false;
1753 }
1754 free(fullpath);
1755 }
1756
1757 return true;
1758}
1759
c0888dfe
SH
1760/*
1761 * chgrp the container cgroups to container group. We leave
1762 * the container owner as cgroup owner. So we must make the
1763 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1764 *
1765 * Also chown the tasks and cgroup.procs files. Those may not
1766 * exist depending on kernel version.
c0888dfe 1767 */
ccb4cabe
SH
1768static int chown_cgroup_wrapper(void *data)
1769{
ccb4cabe 1770 int i;
4160c3a0
CB
1771 uid_t destuid;
1772 struct generic_userns_exec_data *arg = data;
1773 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1774 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1775
4160c3a0 1776 if (setresgid(nsgid, nsgid, nsgid) < 0)
ccb4cabe 1777 SYSERROR("Failed to setgid to 0");
4160c3a0 1778 if (setresuid(nsuid, nsuid, nsuid) < 0)
ccb4cabe 1779 SYSERROR("Failed to setuid to 0");
a19b974f 1780 if (setgroups(0, NULL) < 0 && errno != EPERM)
ccb4cabe
SH
1781 SYSERROR("Failed to clear groups");
1782
1783 destuid = get_ns_uid(arg->origuid);
1784
457ca9aa
SH
1785 for (i = 0; hierarchies[i]; i++) {
1786 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298 1787
4160c3a0 1788 if (chown(path, destuid, nsgid) < 0) {
ab8f5424 1789 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1790 return -1;
1791 }
c0888dfe 1792
43647298 1793 if (chmod(path, 0775) < 0) {
ab8f5424 1794 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1795 return -1;
1796 }
ccb4cabe 1797
ab8f5424
SH
1798 /*
1799 * Failures to chown these are inconvenient but not detrimental
1800 * We leave these owned by the container launcher, so that container
1801 * root can write to the files to attach. We chmod them 664 so that
1802 * container systemd can write to the files (which systemd in wily
1803 * insists on doing)
1804 */
43647298 1805 fullpath = must_make_path(path, "tasks", NULL);
4160c3a0 1806 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
13277ec4 1807 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1808 strerror(errno));
ab8f5424 1809 if (chmod(fullpath, 0664) < 0)
13277ec4 1810 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1811 free(fullpath);
1812
1813 fullpath = must_make_path(path, "cgroup.procs", NULL);
1814 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1815 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1816 strerror(errno));
ab8f5424 1817 if (chmod(fullpath, 0664) < 0)
13277ec4 1818 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe 1819 free(fullpath);
0e17357c 1820
d6337a5f 1821 if (hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1822 continue;
1823
1824 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
4160c3a0 1825 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
0e17357c
CB
1826 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1827 strerror(errno));
1828 if (chmod(fullpath, 0664) < 0)
1829 WARN("Error chmoding %s: %s", path, strerror(errno));
1830 free(fullpath);
1831
1832 fullpath = must_make_path(path, "cgroup.threads", NULL);
4160c3a0 1833 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
0e17357c
CB
1834 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1835 strerror(errno));
1836 if (chmod(fullpath, 0664) < 0)
1837 WARN("Error chmoding %s: %s", path, strerror(errno));
1838 free(fullpath);
ccb4cabe
SH
1839 }
1840
1841 return 0;
1842}
1843
058c1cb6 1844static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
ccb4cabe
SH
1845{
1846 struct cgfsng_handler_data *d = hdata;
4160c3a0 1847 struct generic_userns_exec_data wrap;
ccb4cabe
SH
1848
1849 if (!d)
1850 return false;
1851
1852 if (lxc_list_empty(&conf->id_map))
1853 return true;
1854
ccb4cabe 1855 wrap.origuid = geteuid();
4160c3a0
CB
1856 wrap.path = NULL;
1857 wrap.d = d;
1858 wrap.conf = conf;
ccb4cabe 1859
c9b7c33e
CB
1860 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1861 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1862 ERROR("Error requesting cgroup chown in new namespace");
1863 return false;
1864 }
1865
1866 return true;
1867}
1868
8aa1044f
SH
1869/*
1870 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1871 * symlinks any more - just use mount
1872 */
1873
1874/* mount cgroup-full if requested */
1875static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
a3926f6a 1876 char *container_cgroup)
8aa1044f
SH
1877{
1878 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1879 return 0;
1880 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1881 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1882 dest);
1883 return -1;
1884 }
1885 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1886 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1887 MS_REMOUNT | MS_RDONLY;
1888 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1889 SYSERROR("Error remounting %s readonly", dest);
1890 return -1;
1891 }
1892 }
1893
1894 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1895 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1896 return 0;
1897
1898 /* mount just the container path rw */
1899 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1900 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1901 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1902 WARN("Failed to mount %s read-write: %s", rwpath,
1903 strerror(errno));
8aa1044f
SH
1904 INFO("Made %s read-write", rwpath);
1905 free(rwpath);
1906 free(source);
1907 return 0;
1908}
1909
1910/* cgroup-full:* is done, no need to create subdirs */
1911static bool cg_mount_needs_subdirs(int type)
1912{
1913 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1914 return false;
a3926f6a 1915
8aa1044f
SH
1916 return true;
1917}
1918
886cac86
CB
1919/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1920 * remount controller ro if needed and bindmount the cgroupfs onto
1921 * controll/the/cg/path.
8aa1044f 1922 */
a3926f6a
CB
1923static int do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1924 char *controllerpath, char *cgpath,
1925 const char *container_cgroup)
8aa1044f 1926{
5285689c 1927 int ret, remount_flags;
886cac86
CB
1928 char *sourcepath;
1929 int flags = MS_BIND;
1930
8aa1044f 1931 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
886cac86
CB
1932 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1933 if (ret < 0) {
1934 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1935 controllerpath, controllerpath);
8aa1044f
SH
1936 return -1;
1937 }
886cac86 1938
5285689c
CB
1939 remount_flags = add_required_remount_flags(controllerpath,
1940 controllerpath,
1941 flags | MS_REMOUNT);
886cac86
CB
1942 ret = mount(controllerpath, controllerpath, "cgroup",
1943 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
1944 if (ret < 0) {
1945 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
8aa1044f
SH
1946 return -1;
1947 }
886cac86 1948
8aa1044f
SH
1949 INFO("Remounted %s read-only", controllerpath);
1950 }
886cac86
CB
1951
1952 sourcepath = must_make_path(h->mountpoint, h->base_cgroup,
1953 container_cgroup, NULL);
8aa1044f
SH
1954 if (type == LXC_AUTO_CGROUP_RO)
1955 flags |= MS_RDONLY;
886cac86
CB
1956
1957 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1958 if (ret < 0) {
1959 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
8aa1044f 1960 free(sourcepath);
8aa1044f
SH
1961 return -1;
1962 }
886cac86 1963 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
f8c40ffa
L
1964
1965 if (flags & MS_RDONLY) {
5285689c
CB
1966 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1967 flags | MS_REMOUNT);
1968 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
886cac86
CB
1969 if (ret < 0) {
1970 SYSERROR("Failed to remount \"%s\" ro", cgpath);
f8c40ffa 1971 free(sourcepath);
f8c40ffa
L
1972 return -1;
1973 }
5285689c 1974 INFO("Remounted %s read-only", cgpath);
f8c40ffa
L
1975 }
1976
8aa1044f 1977 free(sourcepath);
886cac86 1978 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
8aa1044f
SH
1979 return 0;
1980}
1981
5285689c
CB
1982static int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
1983 const char *controllerpath)
b635e92d
CB
1984{
1985 int ret;
1986 char *controllers = NULL;
a760603e
CB
1987 char *fstype = "cgroup2";
1988 unsigned long flags = 0;
b635e92d 1989
a760603e
CB
1990 flags |= MS_NOSUID;
1991 flags |= MS_NOEXEC;
1992 flags |= MS_NODEV;
1993 flags |= MS_RELATIME;
1994
1995 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1996 flags |= MS_RDONLY;
1997
d6337a5f 1998 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1999 controllers = lxc_string_join(",", (const char **)h->controllers, false);
2000 if (!controllers)
2001 return -ENOMEM;
2002 fstype = "cgroup";
b635e92d
CB
2003 }
2004
a760603e 2005 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d
CB
2006 free(controllers);
2007 if (ret < 0) {
a760603e 2008 SYSERROR("Failed to mount %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
2009 return -1;
2010 }
2011
a760603e 2012 DEBUG("Mounted %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
2013 return 0;
2014}
2015
ccb4cabe
SH
2016static bool cgfsng_mount(void *hdata, const char *root, int type)
2017{
b635e92d 2018 int i;
8aa1044f
SH
2019 char *tmpfspath = NULL;
2020 bool retval = false;
b635e92d
CB
2021 struct lxc_handler *handler = hdata;
2022 struct cgfsng_handler_data *d = handler->cgroup_data;
2023 bool has_cgns = false, has_sys_admin = true;
8aa1044f
SH
2024
2025 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
2026 return true;
2027
b635e92d
CB
2028 has_cgns = cgns_supported();
2029 if (!lxc_list_empty(&handler->conf->keepcaps))
2030 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
2031 else
2032 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
2033
2034 if (has_cgns && has_sys_admin)
ccb4cabe 2035 return true;
8aa1044f
SH
2036
2037 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
2038
2039 if (type == LXC_AUTO_CGROUP_NOSPEC)
2040 type = LXC_AUTO_CGROUP_MIXED;
2041 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
2042 type = LXC_AUTO_CGROUP_FULL_MIXED;
2043
2044 /* Mount tmpfs */
2045 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
2046 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
2047 "size=10240k,mode=755",
2048 root) < 0)
2049 goto bad;
2050
457ca9aa 2051 for (i = 0; hierarchies[i]; i++) {
8aa1044f 2052 char *controllerpath, *path2;
457ca9aa 2053 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
2054 char *controller = strrchr(h->mountpoint, '/');
2055 int r;
2056
2057 if (!controller)
2058 continue;
2059 controller++;
2060 controllerpath = must_make_path(tmpfspath, controller, NULL);
2061 if (dir_exists(controllerpath)) {
2062 free(controllerpath);
2063 continue;
2064 }
2065 if (mkdir(controllerpath, 0755) < 0) {
2066 SYSERROR("Error creating cgroup path: %s", controllerpath);
2067 free(controllerpath);
2068 goto bad;
2069 }
b635e92d
CB
2070
2071 if (has_cgns && !has_sys_admin) {
2072 /* If cgroup namespaces are supported but the container
2073 * will not have CAP_SYS_ADMIN after it has started we
2074 * need to mount the cgroups manually.
2075 */
5285689c 2076 r = cg_mount_in_cgroup_namespace(type, h, controllerpath);
b635e92d
CB
2077 free(controllerpath);
2078 if (r < 0)
2079 goto bad;
2080 continue;
2081 }
2082
8aa1044f
SH
2083 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
2084 free(controllerpath);
2085 goto bad;
2086 }
2087 if (!cg_mount_needs_subdirs(type)) {
2088 free(controllerpath);
2089 continue;
2090 }
ef4413fa 2091 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
2092 if (mkdir_p(path2, 0755) < 0) {
2093 free(controllerpath);
8e0c6620 2094 free(path2);
8aa1044f
SH
2095 goto bad;
2096 }
2f62fb00 2097
8aa1044f
SH
2098 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
2099 d->container_cgroup);
2100 free(controllerpath);
2101 free(path2);
2102 if (r < 0)
2103 goto bad;
2104 }
2105 retval = true;
2106
2107bad:
2108 free(tmpfspath);
2109 return retval;
ccb4cabe
SH
2110}
2111
/*
 * Count the tasks listed in the cgroup directory @dirname and in every
 * directory nested below it.
 *
 * Every subdirectory is descended into recursively; afterwards the number of
 * lines in @dirname's own "cgroup.procs" file is added. A directory that
 * cannot be opened contributes 0, and a "cgroup.procs" file for which
 * lxc_count_file_lines() reports -1 is ignored.
 *
 * Returns the accumulated count.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int count = 0, ret;
	char *path;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	/* readdir() returning NULL ends the loop; no extra check is needed. */
	while ((direntp = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		path = must_make_path(dirname, direntp->d_name, NULL);

		/* Only real directories are descended into; lstat() does not
		 * follow symlinks, so a link to a directory is skipped.
		 */
		if (lstat(path, &mystat) == 0 && S_ISDIR(mystat.st_mode))
			count += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	ret = lxc_count_file_lines(path);
	if (ret != -1)
		count += ret;
	free(path);

	(void) closedir(dir);

	return count;
}
2156
2157static int cgfsng_nrtasks(void *hdata) {
2158 struct cgfsng_handler_data *d = hdata;
2159 char *path;
2160 int count;
2161
457ca9aa 2162 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 2163 return -1;
a3926f6a 2164
457ca9aa 2165 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
2166 count = recursive_count_nrtasks(path);
2167 free(path);
2168 return count;
2169}
2170
2171/* Only root needs to escape to the cgroup of its init */
7103fe6f 2172static bool cgfsng_escape()
ccb4cabe 2173{
ccb4cabe
SH
2174 int i;
2175
2176 if (geteuid())
2177 return true;
2178
457ca9aa
SH
2179 for (i = 0; hierarchies[i]; i++) {
2180 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
2181 hierarchies[i]->base_cgroup,
ccb4cabe
SH
2182 "cgroup.procs", NULL);
2183 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 2184 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 2185 free(fullpath);
6df334d1 2186 return false;
ccb4cabe
SH
2187 }
2188 free(fullpath);
2189 }
2190
6df334d1 2191 return true;
ccb4cabe
SH
2192}
2193
36662416
TA
2194static int cgfsng_num_hierarchies(void)
2195{
2196 int i;
2197
2198 for (i = 0; hierarchies[i]; i++)
2199 ;
2200
2201 return i;
2202}
2203
2204static bool cgfsng_get_hierarchies(int n, char ***out)
2205{
2206 int i;
2207
2208 /* sanity check n */
6b38e644 2209 for (i = 0; i < n; i++)
36662416
TA
2210 if (!hierarchies[i])
2211 return false;
36662416
TA
2212
2213 *out = hierarchies[i]->controllers;
2214
2215 return true;
2216}
2217
ccb4cabe
SH
2218#define THAWED "THAWED"
2219#define THAWED_LEN (strlen(THAWED))
2220
d6337a5f
CB
2221/* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
2222 * to be adapted.
2223 */
ccb4cabe
SH
2224static bool cgfsng_unfreeze(void *hdata)
2225{
d6337a5f 2226 int ret;
ccb4cabe 2227 char *fullpath;
d6337a5f 2228 struct hierarchy *h;
ccb4cabe 2229
d6337a5f 2230 h = get_hierarchy("freezer");
457ca9aa 2231 if (!h)
ccb4cabe 2232 return false;
d6337a5f 2233
ccb4cabe 2234 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
d6337a5f 2235 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false);
ccb4cabe 2236 free(fullpath);
d6337a5f
CB
2237 if (ret < 0)
2238 return false;
2239
ccb4cabe
SH
2240 return true;
2241}
2242
2243static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
2244{
d6337a5f
CB
2245 struct hierarchy *h;
2246
2247 h = get_hierarchy(subsystem);
ccb4cabe
SH
2248 if (!h)
2249 return NULL;
2250
371f834d
SH
2251 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
2252}
2253
2254/*
2255 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
2256 * full path, which must be freed by the caller.
2257 */
2258static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2259 const char *inpath,
2260 const char *filename)
2261{
371f834d 2262 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2263}
2264
c2aed66d
CB
2265/* Technically, we're always at a delegation boundary here. (This is especially
2266 * true when cgroup namespaces are available.) The reasoning is that in order
2267 * for us to have been able to start a container in the first place the root
2268 * cgroup must have been a leaf node. Now, either the container's init system
2269 * has populated the cgroup and kept it as a leaf node or it has created
2270 * subtrees. In the former case we will simply attach to the leaf node we
2271 * created when we started the container in the latter case we create our own
2272 * cgroup for the attaching process.
2273 */
a3926f6a
CB
2274static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2275 const char *lxcpath, const char *pidstr,
2276 size_t pidstr_len, const char *controller)
c2aed66d
CB
2277{
2278 int ret;
2279 size_t len;
2280 int fret = -1, idx = 0;
2281 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
2282
2283 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2284 /* not running */
2285 if (!container_cgroup)
2286 return 0;
2287
2288 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2289 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2290 /* cgroup is populated */
2291 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false);
2292 if (ret < 0 && errno != EBUSY)
2293 goto on_error;
2294
2295 if (ret == 0)
2296 goto on_success;
2297
2298 free(full_path);
2299
2300 len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
2301 sizeof("/cgroup-procs") - 1;
2302 full_path = must_alloc(len + 1);
2303 do {
2304 if (idx)
2305 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2306 base_path, idx);
2307 else
2308 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2309 if (ret < 0 || (size_t)ret >= len + 1)
2310 goto on_error;
2311
2312 ret = mkdir_p(full_path, 0755);
2313 if (ret < 0 && errno != EEXIST)
2314 goto on_error;
2315
2316 strcat(full_path, "/cgroup.procs");
2317 ret = lxc_write_to_file(full_path, pidstr, len, false);
2318 if (ret == 0)
2319 goto on_success;
2320
2321 /* this is a non-leaf node */
2322 if (errno != EBUSY)
2323 goto on_error;
2324
2325 } while (++idx > 0 && idx < 1000);
2326
2327on_success:
2328 if (idx < 1000)
2329 fret = 0;
2330
2331on_error:
2332 free(base_path);
2333 free(container_cgroup);
2334 free(full_path);
2335
2336 return fret;
2337}
2338
ccb4cabe
SH
2339static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
2340{
c2aed66d 2341 int i, len, ret;
ccb4cabe 2342 char pidstr[25];
ccb4cabe
SH
2343
2344 len = snprintf(pidstr, 25, "%d", pid);
2345 if (len < 0 || len > 25)
2346 return false;
2347
457ca9aa 2348 for (i = 0; hierarchies[i]; i++) {
c2aed66d
CB
2349 char *path;
2350 char *fullpath = NULL;
457ca9aa 2351 struct hierarchy *h = hierarchies[i];
ccb4cabe 2352
c2aed66d 2353 if (h->version == CGROUP2_SUPER_MAGIC) {
a3926f6a
CB
2354 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2355 h->controllers[0]);
c2aed66d
CB
2356 if (ret < 0)
2357 return false;
2358
2359 continue;
2360 }
2361
ccb4cabe 2362 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2363 /* not running */
2364 if (!path)
ccb4cabe
SH
2365 continue;
2366
371f834d 2367 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
c2aed66d
CB
2368 ret = lxc_write_to_file(fullpath, pidstr, len, false);
2369 if (ret < 0) {
ccb4cabe
SH
2370 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2371 free(fullpath);
ccb4cabe
SH
2372 return false;
2373 }
ccb4cabe
SH
2374 free(fullpath);
2375 }
2376
ccb4cabe
SH
2377 return true;
2378}
2379
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * @filename cgroup file to read, e.g. "<controller>.<key>"; the part before
 *           the first '.' selects the controller
 * @value    buffer handed to lxc_read_from_file()
 * @len      size passed along with @value
 * @name     container name
 * @lxcpath  lxcpath of the container
 *
 * Returns the result of lxc_read_from_file(), or -1 if the container's
 * cgroup path cannot be retrieved or no hierarchy handles the controller.
 */
static int cgfsng_get(const char *filename, char *value, size_t len,
		      const char *name, const char *lxcpath)
{
	struct hierarchy *h;
	char *controller, *dot, *cgroup_path, *fullpath;
	size_t controller_len;
	int ret;

	/* Derive the controller name: everything before the first '.'. */
	controller_len = strlen(filename);
	controller = alloca(controller_len + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	cgroup_path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup_path)
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgroup_path);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, cgroup_path, filename);
	ret = lxc_read_from_file(fullpath, value, len);
	free(fullpath);
	free(cgroup_path);

	return ret;
}
2417
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * @filename cgroup file to write, e.g. "<controller>.<key>"; the part before
 *           the first '.' selects the controller
 * @value    NUL-terminated string to write (without the terminator)
 * @name     container name
 * @lxcpath  lxcpath of the container
 *
 * Returns the result of lxc_write_to_file(), or -1 if the container's
 * cgroup path cannot be retrieved or no hierarchy handles the controller.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name,
		      const char *lxcpath)
{
	struct hierarchy *h;
	char *controller, *dot, *cgroup_path, *fullpath;
	size_t controller_len;
	int ret;

	/* Derive the controller name: everything before the first '.'. */
	controller_len = strlen(filename);
	controller = alloca(controller_len + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	cgroup_path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup_path)
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgroup_path);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, cgroup_path, filename);
	ret = lxc_write_to_file(fullpath, value, strlen(value), false);
	free(fullpath);
	free(cgroup_path);

	return ret;
}
2455
72add155
SH
2456/*
2457 * take devices cgroup line
2458 * /dev/foo rwx
2459 * and convert it to a valid
2460 * type major:minor mode
2461 * line. Return <0 on error. Dest is a preallocated buffer
2462 * long enough to hold the output.
2463 */
2464static int convert_devpath(const char *invalue, char *dest)
2465{
2a06d041
CB
2466 int n_parts;
2467 char *p, *path, type;
72add155
SH
2468 struct stat sb;
2469 unsigned long minor, major;
2a06d041
CB
2470 int ret = -EINVAL;
2471 char *mode = NULL;
72add155
SH
2472
2473 path = must_copy_string(invalue);
2474
2475 /*
2476 * read path followed by mode; ignore any trailing text.
2477 * A ' # comment' would be legal. Technically other text
2478 * is not legal, we could check for that if we cared to
2479 */
2480 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2c2d6c49
SH
2481 if (*p != ' ')
2482 continue;
2483 *p = '\0';
2484 if (n_parts != 1)
2485 break;
2486 p++;
2487 n_parts++;
2488 while (*p == ' ')
2489 p++;
2490 mode = p;
2491 if (*p == '\0')
2492 goto out;
72add155 2493 }
2c2d6c49
SH
2494
2495 if (n_parts == 1)
72add155 2496 goto out;
72add155
SH
2497
2498 ret = stat(path, &sb);
2499 if (ret < 0)
2500 goto out;
2501
72add155
SH
2502 mode_t m = sb.st_mode & S_IFMT;
2503 switch (m) {
2504 case S_IFBLK:
2505 type = 'b';
2506 break;
2507 case S_IFCHR:
2508 type = 'c';
2509 break;
2c2d6c49 2510 default:
72add155
SH
2511 ERROR("Unsupported device type %i for %s", m, path);
2512 ret = -EINVAL;
2513 goto out;
2514 }
2c2d6c49
SH
2515
2516 major = MAJOR(sb.st_rdev);
2517 minor = MINOR(sb.st_rdev);
2518 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2519 if (ret < 0 || ret >= 50) {
2a06d041
CB
2520 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2521 "chars)", type, major, minor, mode);
72add155
SH
2522 ret = -ENAMETOOLONG;
2523 goto out;
2524 }
2525 ret = 0;
2526
2527out:
2528 free(path);
2529 return ret;
2530}
2531
ccb4cabe
SH
2532/*
2533 * Called from setup_limits - here we have the container's cgroup_data because
2534 * we created the cgroups
2535 */
a3926f6a
CB
2536static int cg_legacy_set_data(const char *filename, const char *value,
2537 struct cgfsng_handler_data *d)
ccb4cabe 2538{
b3646d7e 2539 char *fullpath, *p;
1a0e70ac
CB
2540 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2541 char converted_value[50];
b3646d7e
CB
2542 struct hierarchy *h;
2543 int ret = 0;
2544 char *controller = NULL;
ccb4cabe 2545
b3646d7e
CB
2546 controller = alloca(strlen(filename) + 1);
2547 strcpy(controller, filename);
2548 if ((p = strchr(controller, '.')) != NULL)
ccb4cabe
SH
2549 *p = '\0';
2550
c8bf519d 2551 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2552 ret = convert_devpath(value, converted_value);
2553 if (ret < 0)
c8bf519d 2554 return ret;
72add155
SH
2555 value = converted_value;
2556
c8bf519d 2557 }
2558
b3646d7e
CB
2559 h = get_hierarchy(controller);
2560 if (!h) {
2561 ERROR("Failed to setup limits for the \"%s\" controller. "
2562 "The controller seems to be unused by \"cgfsng\" cgroup "
2563 "driver or not enabled on the cgroup hierarchy",
2564 controller);
d1953b26 2565 errno = ENOENT;
b3646d7e 2566 return -1;
ccb4cabe 2567 }
b3646d7e
CB
2568
2569 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2570 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
2571 free(fullpath);
ccb4cabe
SH
2572 return ret;
2573}
2574
a3926f6a
CB
2575static bool __cg_legacy_setup_limits(void *hdata,
2576 struct lxc_list *cgroup_settings,
2577 bool do_devices)
ccb4cabe
SH
2578{
2579 struct cgfsng_handler_data *d = hdata;
2580 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2581 struct lxc_cgroup *cg;
ccb4cabe
SH
2582 bool ret = false;
2583
2584 if (lxc_list_empty(cgroup_settings))
2585 return true;
2586
2587 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2588 if (!sorted_cgroup_settings)
ccb4cabe 2589 return false;
ccb4cabe 2590
ccb4cabe
SH
2591 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2592 cg = iterator->elem;
2593
2594 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
a3926f6a 2595 if (cg_legacy_set_data(cg->subsystem, cg->value, d)) {
ccb4cabe
SH
2596 if (do_devices && (errno == EACCES || errno == EPERM)) {
2597 WARN("Error setting %s to %s for %s",
2598 cg->subsystem, cg->value, d->name);
2599 continue;
2600 }
2601 SYSERROR("Error setting %s to %s for %s",
2602 cg->subsystem, cg->value, d->name);
2603 goto out;
2604 }
6a628f4a 2605 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 2606 }
ccb4cabe
SH
2607 }
2608
2609 ret = true;
6b38e644 2610 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2611out:
ccb4cabe
SH
2612 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2613 lxc_list_del(iterator);
2614 free(iterator);
2615 }
2616 free(sorted_cgroup_settings);
2617 return ret;
2618}
2619
a3926f6a
CB
2620static bool __cg_unified_setup_limits(void *hdata,
2621 struct lxc_list *cgroup_settings)
6b38e644
CB
2622{
2623 struct lxc_list *iterator;
2624 struct hierarchy *h = unified;
2625
2626 if (lxc_list_empty(cgroup_settings))
2627 return true;
2628
2629 if (!h)
2630 return false;
2631
2632 lxc_list_for_each(iterator, cgroup_settings) {
2633 int ret;
2634 char *fullpath;
2635 struct lxc_cgroup *cg = iterator->elem;
2636
2637 fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
2638 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false);
2639 free(fullpath);
2640 if (ret < 0) {
2641 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2642 return false;
2643 }
2644 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2645 }
2646
2647 INFO("Limits for the unified cgroup hierarchy have been setup");
2648 return true;
2649}
2650
2651static bool cgfsng_setup_limits(void *hdata, struct lxc_conf *conf,
2652 bool do_devices)
2653{
2654 bool bret;
2655
a3926f6a 2656 bret = __cg_legacy_setup_limits(hdata, &conf->cgroup, do_devices);
6b38e644
CB
2657 if (!bret)
2658 return false;
2659
a3926f6a 2660 return __cg_unified_setup_limits(hdata, &conf->cgroup2);
6b38e644
CB
2661}
2662
ccb4cabe
SH
2663static struct cgroup_ops cgfsng_ops = {
2664 .init = cgfsng_init,
2665 .destroy = cgfsng_destroy,
2666 .create = cgfsng_create,
2667 .enter = cgfsng_enter,
ccb4cabe 2668 .escape = cgfsng_escape,
36662416
TA
2669 .num_hierarchies = cgfsng_num_hierarchies,
2670 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
2671 .get_cgroup = cgfsng_get_cgroup,
2672 .get = cgfsng_get,
2673 .set = cgfsng_set,
2674 .unfreeze = cgfsng_unfreeze,
2675 .setup_limits = cgfsng_setup_limits,
2676 .name = "cgroupfs-ng",
2677 .attach = cgfsng_attach,
058c1cb6 2678 .chown = cgfsng_chown,
ccb4cabe
SH
2679 .mount_cgroup = cgfsng_mount,
2680 .nrtasks = cgfsng_nrtasks,
2681 .driver = CGFSNG,
2682
2683 /* unsupported */
2684 .create_legacy = NULL,
2685};