]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cgroups: get controllers on the unified hierarchy
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
c8bf519d 50#include <linux/types.h>
51#include <linux/kdev_t.h>
52
b635e92d 53#include "caps.h"
ccb4cabe 54#include "cgroup.h"
6328fd9c 55#include "cgroup_utils.h"
ccb4cabe 56#include "commands.h"
43654d34 57#include "conf.h"
a54694f8 58#include "log.h"
43654d34 59#include "storage/storage.h"
a54694f8 60#include "utils.h"
ccb4cabe
SH
61
62lxc_log_define(lxc_cgfsng, lxc);
63
64static struct cgroup_ops cgfsng_ops;
65
ccb4cabe
SH
/*
 * A descriptor for a mounted hierarchy.
 *
 * @controllers: either NULL, or a NULL-terminated list of all the
 *               co-mounted controllers
 * @mountpoint:  the mountpoint we will use. It will be either
 *               /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup: the cgroup under which the container cgroup path is
 *               created. This will be either the caller's cgroup (if not
 *               root), or init's cgroup (if root).
 * @fullcgpath:  set to NULL by add_hierarchy(); presumably filled in with
 *               the container's full cgroup path once it exists
 *               (TODO confirm against the code that assigns it)
 * @version:     the cgroup filesystem type value passed to add_hierarchy()
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
	int version;
};
83
84/*
85 * The cgroup data which is attached to the lxc_handler.
43654d34
CB
86 * @cgroup_pattern : A copy of the lxc.cgroup.pattern
87 * @container_cgroup : If not null, the cgroup which was created for the
88 * container. For each hierarchy, it is created under the
89 * @hierarchy->base_cgroup directory. Relative to the
90 * base_cgroup it is the same for all hierarchies.
91 * @name : The name of the container.
92 * @cgroup_meta : A copy of the container's cgroup information. This
93 * overrides @cgroup_pattern.
ccb4cabe
SH
94 */
95struct cgfsng_handler_data {
ccb4cabe 96 char *cgroup_pattern;
1a0e70ac
CB
97 char *container_cgroup; /* cgroup we created for the container */
98 char *name; /* container name */
43654d34
CB
99 /* per-container cgroup information */
100 struct lxc_cgroup cgroup_meta;
d6337a5f 101 cgroup_layout_t cgroup_layout;
ccb4cabe
SH
102};
103
457ca9aa
SH
104/*
105 * @hierarchies - a NULL-terminated array of struct hierarchy, one per
d6337a5f
CB
106 * legacy hierarchy. No duplicates. First sufficient, writeable
107 * mounted hierarchy wins
457ca9aa
SH
108 */
109struct hierarchy **hierarchies;
d6337a5f
CB
110struct hierarchy *unified;
111cgroup_layout_t cgroup_layout;
457ca9aa
SH
112
113/*
114 * @cgroup_use - a copy of the lxc.cgroup.use
115 */
116char *cgroup_use;
117
e4aeecf5
CB
118/*
119 * @lxc_cgfsng_debug - whether to print debug info to stdout for the cgfsng
120 * driver
121 */
122static bool lxc_cgfsng_debug;
123
65d78313
MPS
124#define CGFSNG_DEBUG(format, ...) do { \
125 if (lxc_cgfsng_debug) \
126 printf("cgfsng: " format, ##__VA_ARGS__); \
127} while(0)
128
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated array @clist and then the array
 * itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);

	free(clist);
}
139
ccb4cabe
SH
/*
 * Allocate @sz bytes. Do not fail: the request is handed to must_realloc()
 * with no previous buffer.
 */
static void *must_alloc(size_t sz)
{
	void *fresh = must_realloc(NULL, sz);

	return fresh;
}
145
ccb4cabe
SH
/*
 * Special case: return a newly allocated copy of @entry with "name="
 * prepended, i.e. turn "systemd" into "name=systemd". Do not fail.
 * The caller owns the returned string.
 */
static char *must_prefix_named(char *entry)
{
	static const char prefix[] = "name=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t entry_len = strlen(entry);
	char *named;

	named = must_alloc(prefix_len + entry_len + 1);
	memcpy(named, prefix, prefix_len);
	/* + 1 also copies @entry's terminating NUL byte */
	memcpy(named + prefix_len, entry, entry_len + 1);
	return named;
}
160
/*
 * Grow the NULL-terminated pointer array *@list by one slot. Do not fail.
 *
 * On return the array holds one more element than before and its last
 * element is NULL. The returned index is the slot just before that new
 * terminator: the caller is expected to store its new entry there.
 * *@list may be NULL, in which case a fresh two-element array is created.
 */
static int append_null_to_list(void ***list)
{
	int used = 0;

	if (*list) {
		while ((*list)[used])
			used++;
	}

	*list = must_realloc(*list, (used + 2) * sizeof(void **));
	(*list)[used + 1] = NULL;
	return used;
}
178
/*
 * Return true if @entry is equal to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	for (cur = list; cur && *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
196
/*
 * Append a copy of controller @entry to the NULL-terminated list *@clist.
 * Do not fail. *@clist must be NULL the first time we are called.
 *
 * Named subsystems are handled here as well:
 *  - an entry already starting with "name=" or listed in the kernel
 *    subsystem list @klist is copied verbatim;
 *  - anything else is treated as a named subsystem and gets "name="
 *    prefixed;
 *  - an entry present in both @klist and the named list @nlist is ambiguous
 *    and is refused (logged, nothing appended).
 *
 * The last entry of *@clist will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = must_prefix_named(entry);
}
232
ccb4cabe
SH
233static void free_handler_data(struct cgfsng_handler_data *d)
234{
ccb4cabe
SH
235 free(d->cgroup_pattern);
236 free(d->container_cgroup);
237 free(d->name);
43654d34
CB
238 if (d->cgroup_meta.dir)
239 free(d->cgroup_meta.dir);
240 if (d->cgroup_meta.controllers)
241 free(d->cgroup_meta.controllers);
ccb4cabe
SH
242 free(d);
243}
244
245/*
246 * Given a handler's cgroup data, return the struct hierarchy for the
247 * controller @c, or NULL if there is none.
248 */
457ca9aa 249struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
250{
251 int i;
252
457ca9aa 253 if (!hierarchies)
ccb4cabe 254 return NULL;
d6337a5f 255
457ca9aa 256 for (i = 0; hierarchies[i]; i++) {
d6337a5f
CB
257 if (!c) {
258 /* This is the empty unified hierarchy. */
259 if (hierarchies[i]->controllers &&
260 !hierarchies[i]->controllers[0])
261 return hierarchies[i];
262
263 return NULL;
264 }
265
457ca9aa
SH
266 if (string_in_list(hierarchies[i]->controllers, c))
267 return hierarchies[i];
ccb4cabe 268 }
d6337a5f 269
ccb4cabe
SH
270 return NULL;
271}
272
a54694f8
CB
#define BATCH_SIZE 50

/*
 * Make sure *@mem can hold @newlen bytes. Memory is grown in multiples of
 * BATCH_SIZE so that appending many short lines does not reallocate every
 * time. @oldlen is the length the buffer was last sized for. Do not fail.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	/* size_t, not int: the lengths are size_t and must not be narrowed */
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}
283
/*
 * Append the @newlen bytes of @new, plus its terminating NUL byte, to the
 * buffer *@dest which currently holds @oldlen bytes. The buffer is grown
 * as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	batch_realloc(dest, oldlen, oldlen + newlen + 1);
	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
292
/*
 * Slurp in a whole file.
 *
 * Returns a newly allocated, NUL-terminated buffer holding the contents of
 * @fnam, which the caller must free. Returns NULL if the file cannot be
 * opened or if not a single line could be read from it.
 */
static char *read_file(const char *fnam)
{
	FILE *f;
	char *line = NULL, *buf = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen; /* getline() returns ssize_t; do not narrow to int */

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, (size_t)linelen);
		fulllen += (size_t)linelen;
	}

	fclose(f);
	free(line);
	return buf;
}
312
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
/* Number of uint32_t words needed to hold @nr bits. */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below treat @bitarr as an array of 32-bit words; bit @bit
 * lives in word bit / NBITS at position bit % NBITS. The shifted constant
 * is unsigned because shifting a signed 1 into bit 31 is undefined.
 */

/* Set bit number @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
332
/*
 * Create a cpumask from a cpulist, aka turn:
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenized in place and therefore modified. @nbits is the number
 * of bits the mask must be able to hold; any cpu number >= @nbits, and any
 * range whose start exceeds its end, makes the function fail.
 *
 * Returns a calloc()ed bit array the caller must free, or NULL on error.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *state = NULL;
	char *item;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!mask)
		return NULL;

	item = strtok_r(buf, ",", &state);
	while (item) {
		unsigned first, last;
		char *dash;

		errno = 0;
		first = strtoul(item, NULL, 0);
		last = first;

		/* "a-b" denotes a range, a bare number a single cpu */
		dash = strchr(item, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, mask);

		item = strtok_r(NULL, ",", &state);
	}

	return mask;
}
374
a54694f8
CB
375/* Turn cpumask into simple, comma-separated cpulist. */
376static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
377{
378 size_t i;
379 int ret;
eab15c1e 380 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
381 char **cpulist = NULL;
382
383 for (i = 0; i <= nbits; i++) {
384 if (is_set(i, bitarr)) {
eab15c1e
CB
385 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
386 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
387 lxc_free_array((void **)cpulist, free);
388 return NULL;
389 }
390 if (lxc_append_string(&cpulist, numstr) < 0) {
391 lxc_free_array((void **)cpulist, free);
392 return NULL;
393 }
394 }
395 }
396 return lxc_string_join(",", (const char **)cpulist, false);
397}
398
/*
 * Return the last cpu number mentioned in @cpulist (e.g. 3 for "0,2-3").
 *
 * The number is parsed starting right after the last ',' or '-' in the
 * string, whichever comes later, or from the beginning of the string if it
 * contains neither. For a sorted cpulist this is the highest cpu number.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last_comma, *last_dash, *start;
	size_t cpus;

	last_comma = strrchr(cpulist, ',');
	last_dash = strrchr(cpulist, '-');

	/* Only compare the two pointers when both are valid. */
	if (last_comma && last_dash)
		start = (last_comma > last_dash ? last_comma : last_dash) + 1;
	else if (last_comma)
		start = last_comma + 1;
	else if (last_dash)
		start = last_dash + 1;
	else
		start = cpulist;

	errno = 0;
	cpus = strtoul(start, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
433
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"

/*
 * Initialize "cpuset.cpus" of the cgroup directory @path from its parent
 * directory, leaving out any isolated cpus.
 *
 * @path:           cgroup directory; must contain a '/'. It is modified
 *                  briefly to build the parent's path and is always handed
 *                  back unchanged.
 * @am_initialized: true if someone else has already initialized this
 *                  cgroup's cpuset. In that case the file is only rewritten
 *                  when isolated cpus had to be removed.
 *
 * The parent's "cpuset.cpus" is read, the cpus listed in __ISOL_CPUS are
 * removed from it, and the result is written to @path/cpuset.cpus. If the
 * system exposes no isolated cpus, the parent's value is copied verbatim
 * (only when !@am_initialized).
 *
 * Returns true on success, false on error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;
	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false, have_isolated = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/*
	 * Cut off the last path component just long enough to build the path
	 * of the parent's file, then restore @path at once so that no return
	 * path can leave it truncated.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: " __ISOL_CPUS " to read isolated cpus from does not exist.\n");
	} else {
		isolcpus = read_file(__ISOL_CPUS);
		if (!isolcpus) {
			SYSERROR("Could not read file "__ISOL_CPUS);
			goto on_error;
		}

		/* <ctype.h> functions need an unsigned char value */
		if (isdigit((unsigned char)isolcpus[0]))
			have_isolated = true;
		else
			DEBUG("No isolated cpus detected.");
	}

	if (!have_isolated) {
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		if (am_initialized)
			goto on_success;

		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		DEBUG("Copying cpuset of parent cgroup.");
		cpulist = posscpus;
		goto copy_parent;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* The masks hold bits 0 .. maxposs - 1; bit maxposs is out of range. */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		/*
		 * An already initialized cpuset needs no update. An
		 * uninitialized one still has to receive the parent's cpus,
		 * so fall through and write the (unchanged) mask.
		 */
		if (am_initialized)
			goto on_success;
	} else {
		DEBUG("Removed isolated cpus from cpuset.");
	}

	/* posscpus was tokenized by lxc_cpumask(); rebuild it from the mask. */
	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
571
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * @path must contain a '/'. It is modified briefly to build the parent's
 * path and is always handed back unchanged.
 *
 * Returns true if the value was read from the parent and written to
 * @path/@file, false otherwise.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	/* Build the parent's file path, then restore @path immediately. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate: the value is printed with %s if the write fails. */
	value[len] = '\0';

	free(fpath);
	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
609
610/*
611 * Initialize the cpuset hierarchy in first directory of @gname and
612 * set cgroup.clone_children so that children inherit settings.
613 * Since the h->base_path is populated by init or ourselves, we know
614 * it is already initialized.
615 */
a54694f8 616static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
617{
618 char *cgpath, *clonechildrenpath, v, *slash;
619
620 if (!string_in_list(h->controllers, "cpuset"))
621 return true;
622
623 if (*cgname == '/')
624 cgname++;
625 slash = strchr(cgname, '/');
626 if (slash)
627 *slash = '\0';
628
629 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
630 if (slash)
631 *slash = '/';
632 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
633 SYSERROR("Failed to create '%s'", cgpath);
634 free(cgpath);
635 return false;
636 }
6f9584d8 637
e3a3fecf 638 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
639 /* unified hierarchy doesn't have clone_children */
640 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
641 free(clonechildrenpath);
642 free(cgpath);
643 return true;
644 }
645 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
646 SYSERROR("Failed to read '%s'", clonechildrenpath);
647 free(clonechildrenpath);
648 free(cgpath);
649 return false;
650 }
651
a54694f8 652 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
653 if (!filter_and_set_cpus(cgpath, v == '1')) {
654 SYSERROR("Failed to remove isolated cpus.");
655 free(clonechildrenpath);
656 free(cgpath);
a54694f8 657 return false;
6f9584d8 658 }
a54694f8 659
e3a3fecf 660 if (v == '1') { /* already set for us by someone else */
6f9584d8 661 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
662 free(clonechildrenpath);
663 free(cgpath);
664 return true;
665 }
666
667 /* copy parent's settings */
a54694f8 668 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 669 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
670 free(cgpath);
671 free(clonechildrenpath);
672 return false;
673 }
674 free(cgpath);
675
676 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
677 /* Set clone_children so children inherit our settings */
678 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
679 free(clonechildrenpath);
680 return false;
681 }
682 free(clonechildrenpath);
683 return true;
684}
685
ccb4cabe
SH
/*
 * Given two NULL-terminated lists of strings, return true if any string
 * is in both. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **a, **b;

	if (!l1 || !l2)
		return false;

	for (a = l1; *a; a++) {
		for (b = l2; *b; b++) {
			if (strcmp(*b, *a) == 0)
				return true;
		}
	}

	return false;
}
703
704/*
705 * For a null-terminated list of controllers @clist, return true if any of
706 * those controllers is already listed the null-terminated list of
707 * hierarchies @hlist. Realistically, if one is present, all must be present.
708 */
709static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
710{
711 int i;
712
713 if (!hlist)
714 return false;
715 for (i = 0; hlist[i]; i++)
716 if (controller_lists_intersect(hlist[i]->controllers, clist))
717 return true;
718 return false;
719
720}
721
722/*
723 * Return true if the controller @entry is found in the null-terminated
724 * list of hierarchies @hlist
725 */
726static bool controller_found(struct hierarchy **hlist, char *entry)
727{
728 int i;
d6337a5f 729
ccb4cabe
SH
730 if (!hlist)
731 return false;
732
733 for (i = 0; hlist[i]; i++)
734 if (string_in_list(hlist[i]->controllers, entry))
735 return true;
d6337a5f 736
ccb4cabe
SH
737 return false;
738}
739
740/*
c30b61c3
SH
741 * Return true if all of the controllers which we require have been found.
742 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 743 */
457ca9aa 744static bool all_controllers_found(void)
ccb4cabe
SH
745{
746 char *p, *saveptr = NULL;
457ca9aa 747 struct hierarchy ** hlist = hierarchies;
ccb4cabe 748
ccb4cabe 749 if (!controller_found(hlist, "freezer")) {
65d78313 750 CGFSNG_DEBUG("No freezer controller mountpoint found\n");
ccb4cabe
SH
751 return false;
752 }
753
457ca9aa 754 if (!cgroup_use)
ccb4cabe 755 return true;
c2712f64 756
457ca9aa 757 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
758 p = strtok_r(NULL, ",", &saveptr)) {
759 if (!controller_found(hlist, p)) {
65d78313 760 CGFSNG_DEBUG("No %s controller mountpoint found\n", p);
ccb4cabe
SH
761 return false;
762 }
763 }
c2712f64 764
ccb4cabe
SH
765 return true;
766}
767
ccb4cabe
SH
768/*
769 * Get the controllers from a mountinfo line
770 * There are other ways we could get this info. For lxcfs, field 3
771 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
772 * options. But we simply assume that the mountpoint must be
773 * /sys/fs/cgroup/controller-list
774 */
d6337a5f
CB
775static char **get_controllers_on_hybrid_layout(char **klist, char **nlist,
776 char *line, int type)
ccb4cabe 777{
6328fd9c 778 /* the fourth field is /sys/fs/cgroup/comma-delimited-controller-list */
ccb4cabe 779 int i;
411ac6d8 780 char *dup, *p2, *tok;
d6337a5f 781 char *p = line, *saveptr = NULL, *sep = ",";
411ac6d8 782 char **aret = NULL;
6328fd9c 783
ccb4cabe 784 for (i = 0; i < 4; i++) {
235f1815 785 p = strchr(p, ' ');
ccb4cabe
SH
786 if (!p)
787 return NULL;
788 p++;
789 }
a55f31bd 790
ccb4cabe
SH
791 /* note - if we change how mountinfo works, then our caller
792 * will need to verify /sys/fs/cgroup/ in this field */
c2712f64 793 if (strncmp(p, "/sys/fs/cgroup/", 15)) {
65d78313 794 CGFSNG_DEBUG("Found hierarchy not under /sys/fs/cgroup: \"%s\"\n", p);
ccb4cabe 795 return NULL;
5059aae9 796 }
d6337a5f 797
ccb4cabe 798 p += 15;
235f1815 799 p2 = strchr(p, ' ');
ccb4cabe 800 if (!p2) {
65d78313 801 CGFSNG_DEBUG("Corrupt mountinfo\n");
ccb4cabe
SH
802 return NULL;
803 }
804 *p2 = '\0';
6328fd9c 805
d6337a5f
CB
806 if (type == CGROUP_SUPER_MAGIC) {
807 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will
808 * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
809 */
810 dup = strdup(p);
811 if (!dup)
812 return NULL;
813
814 for (tok = strtok_r(dup, sep, &saveptr); tok;
815 tok = strtok_r(NULL, sep, &saveptr))
816 must_append_controller(klist, nlist, &aret, tok);
817
818 free(dup);
411ac6d8 819 }
d6337a5f
CB
820 *p2 = ' ';
821 return aret;
822}
411ac6d8 823
d6337a5f
CB
/*
 * Return a newly allocated controller list that contains no controller:
 * its first element is NULL. get_hierarchy(NULL) looks for exactly such a
 * list.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot;

	slot = append_null_to_list((void ***)&empty);
	empty[slot] = NULL;
	return empty;
}
833
/*
 * Read @file and split its contents at spaces, tabs and newlines.
 *
 * Returns a NULL-terminated list with a copy of every word found, owned by
 * the caller. Returns NULL if the file could not be read or contains no
 * word.
 */
static char **cg_unified_get_controllers(const char *file)
{
	char *contents, *word;
	char *state = NULL, *delims = " \t\n";
	char **list = NULL;

	contents = read_file(file);
	if (!contents)
		return NULL;

	word = strtok_r(contents, delims, &state);
	while (word) {
		int slot = append_null_to_list((void ***)&list);

		list[slot] = must_copy_string(word);
		word = strtok_r(NULL, delims, &state);
	}

	free(contents);
	return list;
}
857
d6337a5f
CB
858static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
859 char *base_cgroup, int type)
ccb4cabe
SH
860{
861 struct hierarchy *new;
862 int newentry;
863
864 new = must_alloc(sizeof(*new));
865 new->controllers = clist;
866 new->mountpoint = mountpoint;
867 new->base_cgroup = base_cgroup;
868 new->fullcgpath = NULL;
d6337a5f 869 new->version = type;
6328fd9c 870
457ca9aa
SH
871 newentry = append_null_to_list((void ***)&hierarchies);
872 hierarchies[newentry] = new;
d6337a5f 873 return new;
ccb4cabe
SH
874}
875
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * The mountpoint is the fifth space-separated field and must start with
 * "/sys/fs/cgroup/". @line itself is not modified.
 *
 * Returns a newly allocated string owned by the caller, or NULL if the
 * line has too few fields or is not a mount below /sys/fs/cgroup/.
 */
static char *get_mountpoint_on_hybrid_layout(char *line)
{
	int i;
	char *end;
	size_t len;
	char *p = line;
	char *sret = NULL;

	/* Skip the first four space-separated fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, "/sys/fs/cgroup/", 15))
		return NULL;

	end = strchr(p + 15, ' ');
	if (!end)
		return NULL;

	/* Copy by length instead of writing a NUL byte into @line. */
	len = (size_t)(end - p);
	sret = must_alloc(len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}
909
/*
 * Given a multi-line string, return a NUL-terminated copy of the current
 * line, without its newline. The caller owns the copy.
 * Returns NULL if @p contains no newline.
 */
static char *copy_to_eol(char *p)
{
	size_t len = strcspn(p, "\n");
	char *copy;

	/* strcspn() stopped at the end of the string: no newline found */
	if (p[len] != '\n')
		return NULL;

	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';
	return copy;
}
928
/*
 * @cgline: pointer to the character after the first ':' in a line of a
 * \n-terminated /proc/self/cgroup file. Check whether controller @c is
 * present in the comma-separated list that ends at the next ':'.
 *
 * The list is scanned in place: nothing is copied (the size of an
 * alloca() copy would be dictated by the input) and @cgline is not
 * modified. Empty list items never match.
 *
 * Returns false if @cgline contains no ':'.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *end, *tok;
	size_t clen;

	end = strchr(cgline, ':');
	if (!end)
		return false;

	clen = strlen(c);

	for (tok = cgline; tok < end;) {
		const char *comma = memchr(tok, ',', (size_t)(end - tok));
		const char *tokend = comma ? comma : end;
		size_t toklen = (size_t)(tokend - tok);

		if (toklen > 0 && toklen == clen && memcmp(tok, c, clen) == 0)
			return true;

		if (!comma)
			break;
		tok = comma + 1;
	}

	return false;
}
956
957/*
958 * @basecginfo is a copy of /proc/$$/cgroup. Return the current
959 * cgroup for @controller
960 */
d6337a5f 961static char *get_current_cgroup(char *basecginfo, char *controller, int type)
ccb4cabe
SH
962{
963 char *p = basecginfo;
6328fd9c 964
d6337a5f
CB
965 for (;;) {
966 bool is_cgv2_base_cgroup = false;
967
6328fd9c 968 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
969 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
970 is_cgv2_base_cgroup = true;
ccb4cabe 971
235f1815 972 p = strchr(p, ':');
ccb4cabe
SH
973 if (!p)
974 return NULL;
975 p++;
d6337a5f
CB
976
977 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 978 p = strchr(p, ':');
ccb4cabe
SH
979 if (!p)
980 return NULL;
981 p++;
982 return copy_to_eol(p);
983 }
984
235f1815 985 p = strchr(p, '\n');
ccb4cabe
SH
986 if (!p)
987 return NULL;
988 p++;
989 }
990}
991
ccb4cabe
SH
/*
 * Append a copy of @entry to the NULL-terminated string list *@list.
 * Do not fail. *@list may be NULL, in which case the list is created.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
1000
/*
 * Collect the controllers listed in /proc/self/cgroup.
 *
 * Entries starting with "name=" are appended to *@nlist, all others to
 * *@klist. Lines without two ':' separators are skipped.
 *
 * Returns 0 on success, -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *ctrls, *end, *tok, *saveptr = NULL;

		/* The controller list sits between the first two ':'. */
		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		end = strchr(ctrls, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		tok = strtok_r(ctrls, ",", &saveptr);
		while (tok) {
			char ***target;

			target = strncmp(tok, "name=", 5) == 0 ? nlist : klist;
			must_append_string(target, tok);
			tok = strtok_r(NULL, ",", &saveptr);
		}
	}

	free(line);
	fclose(f);
	return 0;
}
1047
/*
 * Strip trailing newlines from @s in place. The first character is never
 * removed, so a string that consists only of newlines is reduced to a
 * single "\n".
 */
static void trim(char *s)
{
	size_t n;

	for (n = strlen(s); n > 1 && s[n - 1] == '\n'; n--)
		s[n - 1] = '\0';
}
1054
e4aeecf5
CB
1055static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
1056{
1057 printf("Cgroup information:\n");
1058 printf(" container name: %s\n", d->name ? d->name : "(null)");
1059 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
43654d34
CB
1060 printf(" lxc.cgroup.pattern: %s\n",
1061 d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1062 printf(" lxc.cgroup.dir: %s\n",
1063 d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
1064 printf(" cgroup: %s\n",
1065 d->container_cgroup ? d->container_cgroup : "(null)");
e4aeecf5
CB
1066}
1067
1068static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1069{
a7b0cc4c 1070 struct hierarchy **it;
ccb4cabe 1071 int i;
41c33dbe 1072
457ca9aa 1073 if (!hierarchies) {
c2712f64 1074 printf(" No hierarchies found\n");
ccb4cabe
SH
1075 return;
1076 }
e4aeecf5 1077 printf(" Hierarchies:\n");
a7b0cc4c
CB
1078 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1079 char **cit;
ccb4cabe 1080 int j;
c2712f64
CB
1081 printf(" %d: base_cgroup: %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1082 printf(" mountpoint: %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
e4aeecf5 1083 printf(" controllers:\n");
a7b0cc4c 1084 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1085 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1086 }
1087}
41c33dbe 1088
/*
 * Print @basecginfo followed by the kernel subsystems in @klist and the
 * named subsystems in @nlist to stdout. Both lists are NULL-terminated and
 * may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	if (klist) {
		for (idx = 0; klist[idx]; idx++)
			printf("kernel subsystem %d: %s\n", idx, klist[idx]);
	}

	if (nlist) {
		for (idx = 0; nlist[idx]; idx++)
			printf("named subsystem %d: %s\n", idx, nlist[idx]);
	}
}
ccb4cabe 1102
e4aeecf5
CB
/* Print the handler data @d and then all known hierarchies to stdout. */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1108
ccb4cabe
SH
1109/*
1110 * At startup, parse_hierarchies finds all the info we need about
1111 * cgroup mountpoints and current cgroups, and stores it in @d.
1112 */
d6337a5f 1113static bool cg_init_hybrid(void)
ccb4cabe 1114{
d6337a5f
CB
1115 int ret;
1116 char *basecginfo;
1117 bool will_escape;
ccb4cabe 1118 FILE *f;
ccb4cabe 1119 size_t len = 0;
d6337a5f
CB
1120 char *line = NULL;
1121 char **klist = NULL, **nlist = NULL;
ccb4cabe 1122
d30ec4cb
SH
1123 /*
1124 * Root spawned containers escape the current cgroup, so use init's
1125 * cgroups as our base in that case.
1126 */
d6337a5f
CB
1127 will_escape = (geteuid() == 0);
1128 if (will_escape)
ccb4cabe 1129 basecginfo = read_file("/proc/1/cgroup");
d6337a5f
CB
1130 else
1131 basecginfo = read_file("/proc/self/cgroup");
ccb4cabe
SH
1132 if (!basecginfo)
1133 return false;
1134
d6337a5f
CB
1135 ret = get_existing_subsystems(&klist, &nlist);
1136 if (ret < 0) {
1137 CGFSNG_DEBUG("Failed to retrieve available cgroup v1 controllers\n");
1138 free(basecginfo);
ccb4cabe
SH
1139 return false;
1140 }
1141
d6337a5f
CB
1142 f = fopen("/proc/self/mountinfo", "r");
1143 if (!f) {
1144 CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
1145 return false;
1146 }
41c33dbe 1147
e4aeecf5
CB
1148 if (lxc_cgfsng_debug)
1149 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe 1150
ccb4cabe 1151 while (getline(&line, &len, f) != -1) {
49ff3958 1152 int type;
d6337a5f
CB
1153 bool writeable;
1154 struct hierarchy *new;
1155 char *mountpoint = NULL, *base_cgroup = NULL;
1156 char **controller_list = NULL;
ccb4cabe 1157
49ff3958 1158 type = get_cgroup_version(line);
d6337a5f 1159 if (type == 0)
ccb4cabe
SH
1160 continue;
1161
d6337a5f 1162 if (type == CGROUP2_SUPER_MAGIC && unified)
ccb4cabe
SH
1163 continue;
1164
d6337a5f
CB
1165 if (cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
1166 if (type == CGROUP2_SUPER_MAGIC)
1167 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1168 else if (type == CGROUP_SUPER_MAGIC)
1169 cgroup_layout = CGROUP_LAYOUT_LEGACY;
1170 } else if (cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
1171 if (type == CGROUP_SUPER_MAGIC)
1172 cgroup_layout = CGROUP_LAYOUT_HYBRID;
1173 } else if (cgroup_layout == CGROUP_LAYOUT_LEGACY) {
1174 if (type == CGROUP2_SUPER_MAGIC)
1175 cgroup_layout = CGROUP_LAYOUT_HYBRID;
ccb4cabe
SH
1176 }
1177
d6337a5f
CB
1178 controller_list = get_controllers_on_hybrid_layout(klist, nlist, line, type);
1179 if (!controller_list && type == CGROUP_SUPER_MAGIC)
1180 continue;
1181
1182 if (type == CGROUP_SUPER_MAGIC)
1183 if (controller_list_is_dup(hierarchies, controller_list))
1184 goto next;
1185
1186 mountpoint = get_mountpoint_on_hybrid_layout(line);
ccb4cabe 1187 if (!mountpoint) {
65d78313 1188 CGFSNG_DEBUG("Failed parsing mountpoint from \"%s\"\n", line);
d6337a5f 1189 goto next;
ccb4cabe
SH
1190 }
1191
d6337a5f
CB
1192 if (type == CGROUP_SUPER_MAGIC)
1193 base_cgroup = get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
1194 else
1195 base_cgroup = get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
ccb4cabe 1196 if (!base_cgroup) {
d6337a5f
CB
1197 CGFSNG_DEBUG("Failed to find current cgroup\n");
1198 goto next;
ccb4cabe 1199 }
6328fd9c 1200
ccb4cabe
SH
1201 trim(base_cgroup);
1202 prune_init_scope(base_cgroup);
d6337a5f 1203 if (type == CGROUP2_SUPER_MAGIC)
6328fd9c
CB
1204 writeable = test_writeable_v2(mountpoint, base_cgroup);
1205 else
1206 writeable = test_writeable_v1(mountpoint, base_cgroup);
d6337a5f
CB
1207 if (!writeable)
1208 goto next;
1209
1210 if (type == CGROUP2_SUPER_MAGIC) {
1211 char *cgv2_ctrl_path;
1212
1213 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
1214 "cgroup.controllers",
1215 NULL);
1216
1217 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
1218 free(cgv2_ctrl_path);
1219 if (!controller_list)
1220 controller_list = cg_unified_make_empty_controller();
ccb4cabe 1221 }
d6337a5f
CB
1222 new = add_hierarchy(controller_list, mountpoint, base_cgroup, type);
1223 if (type == CGROUP2_SUPER_MAGIC && !unified)
1224 unified = new;
1225
1226 continue;
1227
1228 next:
1229 free_string_list(controller_list);
1230 free(mountpoint);
1231 free(base_cgroup);
ccb4cabe
SH
1232 }
1233
1234 free_string_list(klist);
1235 free_string_list(nlist);
1236
1237 free(basecginfo);
1238
1239 fclose(f);
1240 free(line);
1241
e4aeecf5
CB
1242 if (lxc_cgfsng_debug) {
1243 printf("writeable subsystems:\n");
1244 lxc_cgfsng_print_hierarchies();
1245 }
1246
ccb4cabe
SH
1247 /* verify that all controllers in cgroup.use and all crucial
1248 * controllers are accounted for
1249 */
c2712f64 1250 if (!all_controllers_found())
ccb4cabe
SH
1251 return false;
1252
1253 return true;
1254}
1255
d6337a5f
CB
1256static int cg_is_pure_unified(void) {
1257
1258 int ret;
1259 struct statfs fs;
1260
1261 ret = statfs("/sys/fs/cgroup", &fs);
1262 if (ret < 0)
1263 return -ENOMEDIUM;
1264
1265 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
1266 return CGROUP2_SUPER_MAGIC;
1267
1268 return 0;
1269}
1270
/*
 * Look up the cgroup v2 entry ("0::/...") in /proc/1/cgroup (when running as
 * root) or /proc/self/cgroup (otherwise) and return a trimmed, newly
 * allocated copy of its path. Returns NULL if the file cannot be read, no
 * such entry exists or the copy fails.
 */
static char *cg_get_current_cgroup_unified(void)
{
	char *cginfo, *entry;
	char *result = NULL;

	if (geteuid() == 0)
		cginfo = read_file("/proc/1/cgroup");
	else
		cginfo = read_file("/proc/self/cgroup");
	if (!cginfo)
		return NULL;

	entry = strstr(cginfo, "0::/");
	if (entry) {
		/* Skip "0::" so that the copy starts at the leading '/'. */
		result = copy_to_eol(entry + 3);
	}
	free(cginfo);

	if (result)
		trim(result);

	return result;
}
1303
1304static int cg_init_unified(void)
1305{
1306 int ret;
1307 char *mountpoint, *subtree_path;
1308 char **delegatable;
1309 char *base_cgroup = NULL;
1310
1311 ret = cg_is_pure_unified();
1312 if (ret == -ENOMEDIUM)
1313 return -ENOMEDIUM;
1314
1315 if (ret != CGROUP2_SUPER_MAGIC)
1316 return 0;
1317
1318 base_cgroup = cg_get_current_cgroup_unified();
1319 if (!base_cgroup)
1320 return -EINVAL;
1321 prune_init_scope(base_cgroup);
1322
1323 /* We assume that we have already been given controllers to delegate
1324 * further down the hierarchy. If not it is up to the user to delegate
1325 * them to us.
1326 */
1327 mountpoint = must_copy_string("/sys/fs/cgroup");
1328 subtree_path = must_make_path(mountpoint, base_cgroup,
1329 "cgroup.subtree_control", NULL);
1330 delegatable = cg_unified_get_controllers(subtree_path);
1331 free(subtree_path);
1332 if (!delegatable)
1333 delegatable = cg_unified_make_empty_controller();
1334 if (!delegatable[0])
1335 CGFSNG_DEBUG("No controllers are enabled for delegation\n");
1336
1337 /* TODO: If the user requested specific controllers via lxc.cgroup.use
1338 * we should verify here. The reason I'm not doing it right is that I'm
1339 * not convinced that lxc.cgroup.use will be the future since it is a
1340 * global property. I much rather have an option that lets you request
1341 * controllers per container.
1342 */
1343
1344 add_hierarchy(delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
1345 unified = hierarchies[0];
1346
1347 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1348 return CGROUP2_SUPER_MAGIC;
1349}
1350
1351static bool cg_init(void)
1352{
1353 int ret;
457ca9aa 1354 const char *tmp;
d6337a5f 1355
457ca9aa
SH
1356 errno = 0;
1357 tmp = lxc_global_config_value("lxc.cgroup.use");
1a0e70ac 1358 if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
65d78313 1359 CGFSNG_DEBUG("Failed to retrieve list of cgroups to use\n");
457ca9aa
SH
1360 return false;
1361 }
1362 cgroup_use = must_copy_string(tmp);
1363
d6337a5f
CB
1364 ret = cg_init_unified();
1365 if (ret < 0)
1366 return false;
1367
1368 if (ret == CGROUP2_SUPER_MAGIC)
1369 return true;
1370
1371 return cg_init_hybrid();
457ca9aa
SH
1372}
1373
43654d34 1374static void *cgfsng_init(struct lxc_handler *handler)
ccb4cabe 1375{
457ca9aa 1376 const char *cgroup_pattern;
43654d34 1377 struct cgfsng_handler_data *d;
ccb4cabe
SH
1378
1379 d = must_alloc(sizeof(*d));
1380 memset(d, 0, sizeof(*d));
1381
43654d34
CB
1382 /* copy container name */
1383 d->name = must_copy_string(handler->name);
1384
1385 /* copy per-container cgroup information */
ae5e6c08
CB
1386 d->cgroup_meta.dir = NULL;
1387 d->cgroup_meta.controllers = NULL;
9b5396f9
CB
1388 if (handler->conf) {
1389 d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
1390 d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
1391 }
ccb4cabe 1392
43654d34 1393 /* copy system-wide cgroup information */
ccb4cabe 1394 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
43654d34
CB
1395 if (!cgroup_pattern) {
1396 /* lxc.cgroup.pattern is only NULL on error. */
ccb4cabe
SH
1397 ERROR("Error getting cgroup pattern");
1398 goto out_free;
1399 }
1400 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1401
d6337a5f
CB
1402 d->cgroup_layout = cgroup_layout;
1403 if (d->cgroup_layout == CGROUP_LAYOUT_LEGACY)
1404 TRACE("Running with legacy cgroup layout");
1405 else if (d->cgroup_layout == CGROUP_LAYOUT_HYBRID)
1406 TRACE("Running with hybrid cgroup layout");
1407 else if (d->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
1408 TRACE("Running with unified cgroup layout");
1409 else
1410 WARN("Running with unknown cgroup layout");
1411
e4aeecf5
CB
1412 if (lxc_cgfsng_debug)
1413 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1414
1415 return d;
1416
1417out_free:
1418 free_handler_data(d);
1419 return NULL;
1420}
1421
/*
 * Remove @dirname and every directory below it, depth first. Entries that
 * are not directories are skipped.
 *
 * Returns 0 on success and -1 if the directory cannot be opened or any
 * lstat(), recursive removal, rmdir() or closedir() fails. Only the first
 * failure at this level is logged.
 */
static int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat %s", pathname);
			r = -1;
			goto next;
		}

		/* Only directories are descended into and removed. */
		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;
	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("%s - Failed to delete \"%s\"", strerror(errno),
			     dirname);
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* Report this as a close failure, not as a delete failure. */
		if (!r)
			WARN("%s - Failed to close directory \"%s\"",
			     strerror(errno), dirname);
		r = -1;
	}

	return r;
}
1479
bd8ef4e4
CB
1480static int cgroup_rmdir(char *container_cgroup)
1481{
1482 int i;
1483
1484 if (!container_cgroup || !hierarchies)
1485 return 0;
1486
1487 for (i = 0; hierarchies[i]; i++) {
1488 int ret;
1489 struct hierarchy *h = hierarchies[i];
1490
1491 if (!h->fullcgpath)
1492 continue;
1493
1494 ret = recursive_destroy(h->fullcgpath);
1495 if (ret < 0)
1496 WARN("Failed to destroy \"%s\"", h->fullcgpath);
1497
1498 free(h->fullcgpath);
1499 h->fullcgpath = NULL;
1500 }
1501
1502 return 0;
1503}
1504
4160c3a0
CB
/*
 * Argument bundle passed as the opaque data pointer to the callbacks that
 * cgfsng_destroy() and cgfsng_chown() run through userns_exec_1().
 */
struct generic_userns_exec_data {
	/* cgroup handler data of the container being operated on */
	struct cgfsng_handler_data *d;
	/* container configuration; the callbacks read the ns uid/gid from it */
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	/* NOTE(review): set to NULL by cgfsng_chown() and not read by the
	 * callbacks visible in this part of the file - confirm other users.
	 */
	char *path;
};
1511
bd8ef4e4 1512static int cgroup_rmdir_wrapper(void *data)
ccb4cabe 1513{
4160c3a0
CB
1514 struct generic_userns_exec_data *arg = data;
1515 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1516 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1517
4160c3a0 1518 if (setresgid(nsgid, nsgid, nsgid) < 0)
ccb4cabe 1519 SYSERROR("Failed to setgid to 0");
4160c3a0 1520 if (setresuid(nsuid, nsuid, nsuid) < 0)
ccb4cabe 1521 SYSERROR("Failed to setuid to 0");
a19b974f 1522 if (setgroups(0, NULL) < 0 && errno != EPERM)
ccb4cabe
SH
1523 SYSERROR("Failed to clear groups");
1524
bd8ef4e4 1525 return cgroup_rmdir(arg->d->container_cgroup);
ccb4cabe
SH
1526}
1527
bd8ef4e4 1528static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
ccb4cabe 1529{
bd8ef4e4
CB
1530 int ret;
1531 struct cgfsng_handler_data *d = hdata;
4160c3a0
CB
1532 struct generic_userns_exec_data wrap;
1533
bd8ef4e4
CB
1534 if (!d)
1535 return;
1536
4160c3a0 1537 wrap.origuid = 0;
bd8ef4e4 1538 wrap.d = hdata;
4160c3a0
CB
1539 wrap.conf = conf;
1540
ccb4cabe 1541 if (conf && !lxc_list_empty(&conf->id_map))
bd8ef4e4
CB
1542 ret = userns_exec_1(conf, cgroup_rmdir_wrapper, &wrap,
1543 "cgroup_rmdir_wrapper");
ccb4cabe 1544 else
bd8ef4e4
CB
1545 ret = cgroup_rmdir(d->container_cgroup);
1546 if (ret < 0) {
1547 WARN("Failed to destroy cgroups");
ccb4cabe 1548 return;
ccb4cabe
SH
1549 }
1550
1551 free_handler_data(d);
1552}
1553
1554struct cgroup_ops *cgfsng_ops_init(void)
1555{
e4aeecf5
CB
1556 if (getenv("LXC_DEBUG_CGFSNG"))
1557 lxc_cgfsng_debug = true;
1558
d6337a5f 1559 if (!cg_init())
457ca9aa 1560 return NULL;
e4aeecf5 1561
ccb4cabe
SH
1562 return &cgfsng_ops;
1563}
1564
1565static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1566{
e3a3fecf 1567 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1a0e70ac 1568 if (dir_exists(h->fullcgpath)) { /* it must not already exist */
6f9584d8 1569 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1570 return false;
6f9584d8
CB
1571 }
1572 if (!handle_cpuset_hierarchy(h, cgname)) {
1573 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1574 return false;
6f9584d8 1575 }
e3a3fecf 1576 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1577}
1578
1579static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1580{
1581 if (rmdir(h->fullcgpath) < 0)
1582 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1583 free(h->fullcgpath);
1584 h->fullcgpath = NULL;
1585}
1586
1587/*
d30ec4cb 1588 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1589 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1590 */
1591static inline bool cgfsng_create(void *hdata)
1592{
bb30b52a 1593 int i;
ccb4cabe 1594 size_t len;
7d531e9b
CB
1595 char *cgname, *offset, *tmp;
1596 int idx = 0;
1597 struct cgfsng_handler_data *d = hdata;
ccb4cabe
SH
1598
1599 if (!d)
1600 return false;
43654d34 1601
ccb4cabe
SH
1602 if (d->container_cgroup) {
1603 WARN("cgfsng_create called a second time");
1604 return false;
1605 }
1606
43654d34 1607 if (d->cgroup_meta.dir)
7d531e9b 1608 tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
43654d34
CB
1609 else
1610 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
ccb4cabe
SH
1611 if (!tmp) {
1612 ERROR("Failed expanding cgroup name pattern");
1613 return false;
1614 }
1a0e70ac 1615 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
ccb4cabe
SH
1616 cgname = must_alloc(len);
1617 strcpy(cgname, tmp);
1618 free(tmp);
1619 offset = cgname + len - 5;
1620
1621again:
95adfe93
SH
1622 if (idx == 1000) {
1623 ERROR("Too many conflicting cgroup names");
ccb4cabe 1624 goto out_free;
95adfe93 1625 }
66b66624 1626 if (idx) {
bb30b52a
CB
1627 int ret;
1628
66b66624
CB
1629 ret = snprintf(offset, 5, "-%d", idx);
1630 if (ret < 0 || (size_t)ret >= 5) {
1631 FILE *f = fopen("/dev/null", "w");
97ebced3 1632 if (f) {
66b66624
CB
1633 fprintf(f, "Workaround for GCC7 bug: "
1634 "https://gcc.gnu.org/bugzilla/"
1635 "show_bug.cgi?id=78969");
1636 fclose(f);
1637 }
1638 }
1639 }
457ca9aa
SH
1640 for (i = 0; hierarchies[i]; i++) {
1641 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1642 int j;
1a0e70ac 1643 ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
457ca9aa
SH
1644 free(hierarchies[i]->fullcgpath);
1645 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1646 for (j = 0; j < i; j++)
457ca9aa 1647 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1648 idx++;
1649 goto again;
1650 }
1651 }
1652 /* Done */
1653 d->container_cgroup = cgname;
1654 return true;
1655
1656out_free:
1657 free(cgname);
1658 return false;
1659}
1660
ccb4cabe
SH
1661static bool cgfsng_enter(void *hdata, pid_t pid)
1662{
ccb4cabe
SH
1663 char pidstr[25];
1664 int i, len;
1665
1666 len = snprintf(pidstr, 25, "%d", pid);
1667 if (len < 0 || len > 25)
1668 return false;
1669
457ca9aa
SH
1670 for (i = 0; hierarchies[i]; i++) {
1671 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1672 "cgroup.procs", NULL);
1673 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1674 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1675 free(fullpath);
1676 return false;
1677 }
1678 free(fullpath);
1679 }
1680
1681 return true;
1682}
1683
c0888dfe
SH
1684/*
1685 * chgrp the container cgroups to container group. We leave
1686 * the container owner as cgroup owner. So we must make the
1687 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1688 *
1689 * Also chown the tasks and cgroup.procs files. Those may not
1690 * exist depending on kernel version.
c0888dfe 1691 */
ccb4cabe
SH
1692static int chown_cgroup_wrapper(void *data)
1693{
ccb4cabe 1694 int i;
4160c3a0
CB
1695 uid_t destuid;
1696 struct generic_userns_exec_data *arg = data;
1697 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1698 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1699
4160c3a0 1700 if (setresgid(nsgid, nsgid, nsgid) < 0)
ccb4cabe 1701 SYSERROR("Failed to setgid to 0");
4160c3a0 1702 if (setresuid(nsuid, nsuid, nsuid) < 0)
ccb4cabe 1703 SYSERROR("Failed to setuid to 0");
a19b974f 1704 if (setgroups(0, NULL) < 0 && errno != EPERM)
ccb4cabe
SH
1705 SYSERROR("Failed to clear groups");
1706
1707 destuid = get_ns_uid(arg->origuid);
1708
457ca9aa
SH
1709 for (i = 0; hierarchies[i]; i++) {
1710 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298 1711
4160c3a0 1712 if (chown(path, destuid, nsgid) < 0) {
ab8f5424 1713 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1714 return -1;
1715 }
c0888dfe 1716
43647298 1717 if (chmod(path, 0775) < 0) {
ab8f5424 1718 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1719 return -1;
1720 }
ccb4cabe 1721
ab8f5424
SH
1722 /*
1723 * Failures to chown these are inconvenient but not detrimental
1724 * We leave these owned by the container launcher, so that container
1725 * root can write to the files to attach. We chmod them 664 so that
1726 * container systemd can write to the files (which systemd in wily
1727 * insists on doing)
1728 */
43647298 1729 fullpath = must_make_path(path, "tasks", NULL);
4160c3a0 1730 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
13277ec4 1731 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1732 strerror(errno));
ab8f5424 1733 if (chmod(fullpath, 0664) < 0)
13277ec4 1734 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1735 free(fullpath);
1736
1737 fullpath = must_make_path(path, "cgroup.procs", NULL);
1738 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1739 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1740 strerror(errno));
ab8f5424 1741 if (chmod(fullpath, 0664) < 0)
13277ec4 1742 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe 1743 free(fullpath);
0e17357c 1744
d6337a5f 1745 if (hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1746 continue;
1747
1748 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
4160c3a0 1749 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
0e17357c
CB
1750 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1751 strerror(errno));
1752 if (chmod(fullpath, 0664) < 0)
1753 WARN("Error chmoding %s: %s", path, strerror(errno));
1754 free(fullpath);
1755
1756 fullpath = must_make_path(path, "cgroup.threads", NULL);
4160c3a0 1757 if (chown(fullpath, destuid, nsgid) < 0 && errno != ENOENT)
0e17357c
CB
1758 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1759 strerror(errno));
1760 if (chmod(fullpath, 0664) < 0)
1761 WARN("Error chmoding %s: %s", path, strerror(errno));
1762 free(fullpath);
ccb4cabe
SH
1763 }
1764
1765 return 0;
1766}
1767
058c1cb6 1768static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
ccb4cabe
SH
1769{
1770 struct cgfsng_handler_data *d = hdata;
4160c3a0 1771 struct generic_userns_exec_data wrap;
ccb4cabe
SH
1772
1773 if (!d)
1774 return false;
1775
1776 if (lxc_list_empty(&conf->id_map))
1777 return true;
1778
ccb4cabe 1779 wrap.origuid = geteuid();
4160c3a0
CB
1780 wrap.path = NULL;
1781 wrap.d = d;
1782 wrap.conf = conf;
ccb4cabe 1783
c9b7c33e
CB
1784 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1785 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1786 ERROR("Error requesting cgroup chown in new namespace");
1787 return false;
1788 }
1789
1790 return true;
1791}
1792
8aa1044f
SH
1793/*
1794 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1795 * symlinks any more - just use mount
1796 */
1797
1798/* mount cgroup-full if requested */
1799static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1800 char *container_cgroup)
1801{
1802 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1803 return 0;
1804 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1805 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1806 dest);
1807 return -1;
1808 }
1809 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1810 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1811 MS_REMOUNT | MS_RDONLY;
1812 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1813 SYSERROR("Error remounting %s readonly", dest);
1814 return -1;
1815 }
1816 }
1817
1818 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1819 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1820 return 0;
1821
1822 /* mount just the container path rw */
1823 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1824 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1825 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1826 WARN("Failed to mount %s read-write: %s", rwpath,
1827 strerror(errno));
8aa1044f
SH
1828 INFO("Made %s read-write", rwpath);
1829 free(rwpath);
1830 free(source);
1831 return 0;
1832}
1833
1834/* cgroup-full:* is done, no need to create subdirs */
1835static bool cg_mount_needs_subdirs(int type)
1836{
1837 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1838 return false;
1839 return true;
1840}
1841
1842/*
1843 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1844 * created, remount controller ro if needed and bindmount the
1845 * cgroupfs onto controll/the/cg/path
1846 */
1847static int
1848do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1849 char *controllerpath, char *cgpath,
1850 const char *container_cgroup)
1851{
1852 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1853 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1854 SYSERROR("Error bind-mounting %s", controllerpath);
1855 return -1;
1856 }
1857 if (mount(controllerpath, controllerpath, "cgroup",
1858 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1859 SYSERROR("Error remounting %s read-only", controllerpath);
1860 return -1;
1861 }
1862 INFO("Remounted %s read-only", controllerpath);
1863 }
1864 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1865 int flags = MS_BIND;
1866 if (type == LXC_AUTO_CGROUP_RO)
1867 flags |= MS_RDONLY;
1868 INFO("Mounting %s onto %s", sourcepath, cgpath);
1869 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1870 free(sourcepath);
1871 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1872 cgpath);
1873 return -1;
1874 }
1875 free(sourcepath);
1876 INFO("Completed second stage cgroup automounts for %s", cgpath);
1877 return 0;
1878}
1879
a760603e 1880static int mount_cgroup_cgns_supported(int type, struct hierarchy *h, const char *controllerpath)
b635e92d
CB
1881{
1882 int ret;
1883 char *controllers = NULL;
a760603e
CB
1884 char *fstype = "cgroup2";
1885 unsigned long flags = 0;
b635e92d 1886
a760603e
CB
1887 flags |= MS_NOSUID;
1888 flags |= MS_NOEXEC;
1889 flags |= MS_NODEV;
1890 flags |= MS_RELATIME;
1891
1892 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1893 flags |= MS_RDONLY;
1894
d6337a5f 1895 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1896 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1897 if (!controllers)
1898 return -ENOMEM;
1899 fstype = "cgroup";
b635e92d
CB
1900 }
1901
a760603e 1902 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d
CB
1903 free(controllers);
1904 if (ret < 0) {
a760603e 1905 SYSERROR("Failed to mount %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1906 return -1;
1907 }
1908
a760603e 1909 DEBUG("Mounted %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1910 return 0;
1911}
1912
ccb4cabe
SH
1913static bool cgfsng_mount(void *hdata, const char *root, int type)
1914{
b635e92d 1915 int i;
8aa1044f
SH
1916 char *tmpfspath = NULL;
1917 bool retval = false;
b635e92d
CB
1918 struct lxc_handler *handler = hdata;
1919 struct cgfsng_handler_data *d = handler->cgroup_data;
1920 bool has_cgns = false, has_sys_admin = true;
8aa1044f
SH
1921
1922 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1923 return true;
1924
b635e92d
CB
1925 has_cgns = cgns_supported();
1926 if (!lxc_list_empty(&handler->conf->keepcaps))
1927 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1928 else
1929 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1930
1931 if (has_cgns && has_sys_admin)
ccb4cabe 1932 return true;
8aa1044f
SH
1933
1934 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1935
1936 if (type == LXC_AUTO_CGROUP_NOSPEC)
1937 type = LXC_AUTO_CGROUP_MIXED;
1938 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1939 type = LXC_AUTO_CGROUP_FULL_MIXED;
1940
1941 /* Mount tmpfs */
1942 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1943 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1944 "size=10240k,mode=755",
1945 root) < 0)
1946 goto bad;
1947
457ca9aa 1948 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1949 char *controllerpath, *path2;
457ca9aa 1950 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1951 char *controller = strrchr(h->mountpoint, '/');
1952 int r;
1953
1954 if (!controller)
1955 continue;
1956 controller++;
1957 controllerpath = must_make_path(tmpfspath, controller, NULL);
1958 if (dir_exists(controllerpath)) {
1959 free(controllerpath);
1960 continue;
1961 }
1962 if (mkdir(controllerpath, 0755) < 0) {
1963 SYSERROR("Error creating cgroup path: %s", controllerpath);
1964 free(controllerpath);
1965 goto bad;
1966 }
b635e92d
CB
1967
1968 if (has_cgns && !has_sys_admin) {
1969 /* If cgroup namespaces are supported but the container
1970 * will not have CAP_SYS_ADMIN after it has started we
1971 * need to mount the cgroups manually.
1972 */
a760603e 1973 r = mount_cgroup_cgns_supported(type, h, controllerpath);
b635e92d
CB
1974 free(controllerpath);
1975 if (r < 0)
1976 goto bad;
1977 continue;
1978 }
1979
8aa1044f
SH
1980 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1981 free(controllerpath);
1982 goto bad;
1983 }
1984 if (!cg_mount_needs_subdirs(type)) {
1985 free(controllerpath);
1986 continue;
1987 }
ef4413fa 1988 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1989 if (mkdir_p(path2, 0755) < 0) {
1990 free(controllerpath);
8e0c6620 1991 free(path2);
8aa1044f
SH
1992 goto bad;
1993 }
2f62fb00 1994
8aa1044f
SH
1995 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1996 d->container_cgroup);
1997 free(controllerpath);
1998 free(path2);
1999 if (r < 0)
2000 goto bad;
2001 }
2002 retval = true;
2003
2004bad:
2005 free(tmpfspath);
2006 return retval;
ccb4cabe
SH
2007}
2008
/*
 * Count the lines of the cgroup.procs files in @dirname and in every
 * directory below it. Entries that cannot be lstat()ed or are not
 * directories are skipped; a cgroup.procs file whose line count cannot be
 * determined contributes nothing.
 *
 * Returns the accumulated count, or 0 if @dirname cannot be opened.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int count = 0, ret;
	char *path;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	/* The loop condition already guarantees that direntp is non-NULL. */
	while ((direntp = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		path = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(path, &mystat))
			goto next;

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		count += recursive_count_nrtasks(path);
next:
		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	ret = lxc_count_file_lines(path);
	if (ret != -1)
		count += ret;
	free(path);

	(void) closedir(dir);

	return count;
}
2053
2054static int cgfsng_nrtasks(void *hdata) {
2055 struct cgfsng_handler_data *d = hdata;
2056 char *path;
2057 int count;
2058
457ca9aa 2059 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 2060 return -1;
457ca9aa 2061 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
2062 count = recursive_count_nrtasks(path);
2063 free(path);
2064 return count;
2065}
2066
2067/* Only root needs to escape to the cgroup of its init */
7103fe6f 2068static bool cgfsng_escape()
ccb4cabe 2069{
ccb4cabe
SH
2070 int i;
2071
2072 if (geteuid())
2073 return true;
2074
457ca9aa
SH
2075 for (i = 0; hierarchies[i]; i++) {
2076 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
2077 hierarchies[i]->base_cgroup,
ccb4cabe
SH
2078 "cgroup.procs", NULL);
2079 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 2080 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 2081 free(fullpath);
6df334d1 2082 return false;
ccb4cabe
SH
2083 }
2084 free(fullpath);
2085 }
2086
6df334d1 2087 return true;
ccb4cabe
SH
2088}
2089
36662416
TA
2090static int cgfsng_num_hierarchies(void)
2091{
2092 int i;
2093
2094 for (i = 0; hierarchies[i]; i++)
2095 ;
2096
2097 return i;
2098}
2099
2100static bool cgfsng_get_hierarchies(int n, char ***out)
2101{
2102 int i;
2103
2104 /* sanity check n */
2105 for (i = 0; i < n; i++) {
2106 if (!hierarchies[i])
2107 return false;
2108 }
2109
2110 *out = hierarchies[i]->controllers;
2111
2112 return true;
2113}
2114
ccb4cabe
SH
2115#define THAWED "THAWED"
2116#define THAWED_LEN (strlen(THAWED))
2117
d6337a5f
CB
2118/* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
2119 * to be adapted.
2120 */
ccb4cabe
SH
2121static bool cgfsng_unfreeze(void *hdata)
2122{
d6337a5f 2123 int ret;
ccb4cabe 2124 char *fullpath;
d6337a5f 2125 struct hierarchy *h;
ccb4cabe 2126
d6337a5f 2127 h = get_hierarchy("freezer");
457ca9aa 2128 if (!h)
ccb4cabe 2129 return false;
d6337a5f 2130
ccb4cabe 2131 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
d6337a5f 2132 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false);
ccb4cabe 2133 free(fullpath);
d6337a5f
CB
2134 if (ret < 0)
2135 return false;
2136
ccb4cabe
SH
2137 return true;
2138}
2139
2140static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
2141{
d6337a5f
CB
2142 struct hierarchy *h;
2143
2144 h = get_hierarchy(subsystem);
ccb4cabe
SH
2145 if (!h)
2146 return NULL;
2147
371f834d
SH
2148 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
2149}
2150
2151/*
2152 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
2153 * full path, which must be freed by the caller.
2154 */
2155static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2156 const char *inpath,
2157 const char *filename)
2158{
371f834d 2159 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2160}
2161
2162static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
2163{
ccb4cabe
SH
2164 char pidstr[25];
2165 int i, len;
2166
2167 len = snprintf(pidstr, 25, "%d", pid);
2168 if (len < 0 || len > 25)
2169 return false;
2170
457ca9aa 2171 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 2172 char *path, *fullpath;
457ca9aa 2173 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
2174
2175 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1a0e70ac 2176 if (!path) /* not running */
ccb4cabe
SH
2177 continue;
2178
371f834d
SH
2179 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2180 free(path);
ccb4cabe
SH
2181 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
2182 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2183 free(fullpath);
ccb4cabe
SH
2184 return false;
2185 }
ccb4cabe
SH
2186 free(fullpath);
2187 }
2188
ccb4cabe
SH
2189 return true;
2190}
2191
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * No cgroup_data is set up here, so the running container is asked through
 * the commands API for its cgroup path.
 *
 * The controller name is the part of @filename before the first '.'.
 * Returns the result of lxc_read_from_file on the resolved file, or -1 if no
 * cgroup path is returned or no hierarchy is found for the controller.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller, *cgpath, *target;
	struct hierarchy *h;
	int ret;

	controller = alloca(ctrl_len + 1);
	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	if (!cgpath) /* not running */
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgpath);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(h, cgpath, filename);
	ret = lxc_read_from_file(target, value, len);
	free(target);
	free(cgpath);

	return ret;
}
2223
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * No cgroup_data is set up here, so the running container is asked through
 * the commands API for its cgroup path.
 *
 * The controller name is the part of @filename before the first '.'.
 * Returns the result of lxc_write_to_file on the resolved file, or -1 if no
 * cgroup path is returned or no hierarchy is found for the controller.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller, *cgpath, *target;
	struct hierarchy *h;
	int ret;

	controller = alloca(ctrl_len + 1);
	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	if (!cgpath) /* not running */
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgpath);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(h, cgpath, filename);
	ret = lxc_write_to_file(target, value, strlen(value), false);
	free(target);
	free(cgpath);

	return ret;
}
2255
72add155
SH
2256/*
2257 * take devices cgroup line
2258 * /dev/foo rwx
2259 * and convert it to a valid
2260 * type major:minor mode
2261 * line. Return <0 on error. Dest is a preallocated buffer
2262 * long enough to hold the output.
2263 */
2264static int convert_devpath(const char *invalue, char *dest)
2265{
2a06d041
CB
2266 int n_parts;
2267 char *p, *path, type;
72add155
SH
2268 struct stat sb;
2269 unsigned long minor, major;
2a06d041
CB
2270 int ret = -EINVAL;
2271 char *mode = NULL;
72add155
SH
2272
2273 path = must_copy_string(invalue);
2274
2275 /*
2276 * read path followed by mode; ignore any trailing text.
2277 * A ' # comment' would be legal. Technically other text
2278 * is not legal, we could check for that if we cared to
2279 */
2280 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2c2d6c49
SH
2281 if (*p != ' ')
2282 continue;
2283 *p = '\0';
2284 if (n_parts != 1)
2285 break;
2286 p++;
2287 n_parts++;
2288 while (*p == ' ')
2289 p++;
2290 mode = p;
2291 if (*p == '\0')
2292 goto out;
72add155 2293 }
2c2d6c49
SH
2294
2295 if (n_parts == 1)
72add155 2296 goto out;
72add155
SH
2297
2298 ret = stat(path, &sb);
2299 if (ret < 0)
2300 goto out;
2301
72add155
SH
2302 mode_t m = sb.st_mode & S_IFMT;
2303 switch (m) {
2304 case S_IFBLK:
2305 type = 'b';
2306 break;
2307 case S_IFCHR:
2308 type = 'c';
2309 break;
2c2d6c49 2310 default:
72add155
SH
2311 ERROR("Unsupported device type %i for %s", m, path);
2312 ret = -EINVAL;
2313 goto out;
2314 }
2c2d6c49
SH
2315
2316 major = MAJOR(sb.st_rdev);
2317 minor = MINOR(sb.st_rdev);
2318 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2319 if (ret < 0 || ret >= 50) {
2a06d041
CB
2320 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2321 "chars)", type, major, minor, mode);
72add155
SH
2322 ret = -ENAMETOOLONG;
2323 goto out;
2324 }
2325 ret = 0;
2326
2327out:
2328 free(path);
2329 return ret;
2330}
2331
ccb4cabe
SH
2332/*
2333 * Called from setup_limits - here we have the container's cgroup_data because
2334 * we created the cgroups
2335 */
2336static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
2337{
b3646d7e 2338 char *fullpath, *p;
1a0e70ac
CB
2339 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2340 char converted_value[50];
b3646d7e
CB
2341 struct hierarchy *h;
2342 int ret = 0;
2343 char *controller = NULL;
ccb4cabe 2344
b3646d7e
CB
2345 controller = alloca(strlen(filename) + 1);
2346 strcpy(controller, filename);
2347 if ((p = strchr(controller, '.')) != NULL)
ccb4cabe
SH
2348 *p = '\0';
2349
c8bf519d 2350 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2351 ret = convert_devpath(value, converted_value);
2352 if (ret < 0)
c8bf519d 2353 return ret;
72add155
SH
2354 value = converted_value;
2355
c8bf519d 2356 }
2357
b3646d7e
CB
2358 h = get_hierarchy(controller);
2359 if (!h) {
2360 ERROR("Failed to setup limits for the \"%s\" controller. "
2361 "The controller seems to be unused by \"cgfsng\" cgroup "
2362 "driver or not enabled on the cgroup hierarchy",
2363 controller);
d1953b26 2364 errno = ENOENT;
b3646d7e 2365 return -1;
ccb4cabe 2366 }
b3646d7e
CB
2367
2368 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2369 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
2370 free(fullpath);
ccb4cabe
SH
2371 return ret;
2372}
2373
2374static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
2375 bool do_devices)
2376{
2377 struct cgfsng_handler_data *d = hdata;
2378 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2379 struct lxc_cgroup *cg;
ccb4cabe
SH
2380 bool ret = false;
2381
2382 if (lxc_list_empty(cgroup_settings))
2383 return true;
2384
2385 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2386 if (!sorted_cgroup_settings) {
2387 return false;
2388 }
2389
ccb4cabe
SH
2390 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2391 cg = iterator->elem;
2392
2393 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
2394 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2395 if (do_devices && (errno == EACCES || errno == EPERM)) {
2396 WARN("Error setting %s to %s for %s",
2397 cg->subsystem, cg->value, d->name);
2398 continue;
2399 }
2400 SYSERROR("Error setting %s to %s for %s",
2401 cg->subsystem, cg->value, d->name);
2402 goto out;
2403 }
6a628f4a 2404 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 2405 }
ccb4cabe
SH
2406 }
2407
2408 ret = true;
2409 INFO("cgroup has been setup");
2410out:
ccb4cabe
SH
2411 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2412 lxc_list_del(iterator);
2413 free(iterator);
2414 }
2415 free(sorted_cgroup_settings);
2416 return ret;
2417}
2418
2419static struct cgroup_ops cgfsng_ops = {
2420 .init = cgfsng_init,
2421 .destroy = cgfsng_destroy,
2422 .create = cgfsng_create,
2423 .enter = cgfsng_enter,
ccb4cabe 2424 .escape = cgfsng_escape,
36662416
TA
2425 .num_hierarchies = cgfsng_num_hierarchies,
2426 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
2427 .get_cgroup = cgfsng_get_cgroup,
2428 .get = cgfsng_get,
2429 .set = cgfsng_set,
2430 .unfreeze = cgfsng_unfreeze,
2431 .setup_limits = cgfsng_setup_limits,
2432 .name = "cgroupfs-ng",
2433 .attach = cgfsng_attach,
058c1cb6 2434 .chown = cgfsng_chown,
ccb4cabe
SH
2435 .mount_cgroup = cgfsng_mount,
2436 .nrtasks = cgfsng_nrtasks,
2437 .driver = CGFSNG,
2438
2439 /* unsupported */
2440 .create_legacy = NULL,
2441};