]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cgfsng: add me to authors
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
3fd0de4d 8 * Christian Brauner <christian.brauner@ubuntu.com>
ccb4cabe
SH
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/*
26 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 30 * each controller.
ccb4cabe
SH
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
c8bf519d 48#include <linux/kdev_t.h>
438c4581
CB
49#include <linux/types.h>
50#include <sys/types.h>
c8bf519d 51
b635e92d 52#include "caps.h"
ccb4cabe 53#include "cgroup.h"
6328fd9c 54#include "cgroup_utils.h"
ccb4cabe 55#include "commands.h"
43654d34 56#include "conf.h"
a54694f8 57#include "log.h"
43654d34 58#include "storage/storage.h"
a54694f8 59#include "utils.h"
ccb4cabe
SH
60
61lxc_log_define(lxc_cgfsng, lxc);
62
63static struct cgroup_ops cgfsng_ops;
64
9e288301 65/* A descriptor for a mounted hierarchy
16a2cde9
CB
66 *
67 * @controllers
68 * - legacy hierarchy
9e288301 69 * Either NULL, or a null-terminated list of all the co-mounted controllers.
16a2cde9 70 * - unified hierarchy
9e288301 71 * Either NULL, or a null-terminated list of all enabled controllers.
16a2cde9
CB
72 *
73 * @mountpoint
9e288301 74 * - The mountpoint we will use.
16a2cde9 75 * - legacy hierarchy
9e288301
CB
76 * It will be either /sys/fs/cgroup/controller or
77 * /sys/fs/cgroup/controllerlist.
16a2cde9 78 * - unified hierarchy
9e288301
CB
79 * It will either be /sys/fs/cgroup or /sys/fs/cgroup/<mountpoint-name>
80 * depending on whether this is a hybrid cgroup layout (mix of legacy and
81 * unified hierarchies) or a pure unified cgroup layout.
16a2cde9
CB
82 *
83 * @base_cgroup
9e288301
CB
84 * - The cgroup under which the container cgroup path
85 * is created. This will be either the caller's cgroup (if not root), or
86 * init's cgroup (if root).
16a2cde9
CB
87 *
88 * @fullcgpath
9e288301 89 * - The full path to the containers cgroup.
16a2cde9
CB
90 *
91 * @version
92 * - legacy hierarchy
9e288301
CB
93 * If the hierarchy is a legacy hierarchy this will be set to
94 * CGROUP_SUPER_MAGIC.
16a2cde9 95 * - unified hierarchy
9e288301
CB
96 * If the hierarchy is a unified hierarchy this will be set to
97 * CGROUP2_SUPER_MAGIC.
ccb4cabe
SH
98 */
/* Descriptor for one mounted cgroup hierarchy. */
struct hierarchy {
	/* NULL, or a NULL-terminated list of controller names. */
	char **controllers;

	/* Mountpoint that is used for this hierarchy. */
	char *mountpoint;

	/* Cgroup under which the container's cgroup path is created. */
	char *base_cgroup;

	/* Full path to the container's cgroup. */
	char *fullcgpath;

	/* CGROUP_SUPER_MAGIC (legacy) or CGROUP2_SUPER_MAGIC (unified). */
	int version;
};
106
16a2cde9
CB
107/* The cgroup data which is attached to the lxc_handler.
108 *
109 * @cgroup_pattern
110 * - A copy of lxc.cgroup.pattern.
111 *
112 * @container_cgroup
113 * - If not null, the cgroup which was created for the container. For each
114 * hierarchy, it is created under the @hierarchy->base_cgroup directory.
115 * Relative to the base_cgroup it is the same for all hierarchies.
116 *
117 * @name
118 * - The name of the container.
119 *
120 * @cgroup_meta
121 * - A copy of the container's cgroup information. This overrides
122 * @cgroup_pattern.
123 *
09f3bb13
CB
124 * @cgroup_layout
125 * - What cgroup layout the container is running with.
16a2cde9
CB
126 * - CGROUP_LAYOUT_UNKNOWN
127 * The cgroup layout could not be determined. This should be treated as an
128 * error condition.
129 * - CGROUP_LAYOUT_LEGACY
130 * The container is running with all controllers mounted into legacy cgroup
131 * hierarchies.
132 * - CGROUP_LAYOUT_HYBRID
133 * The container is running with at least one controller mounted into a
134 * legacy cgroup hierarchy and a mountpoint for the unified hierarchy. The
135 * unified hierarchy can be empty (no controllers enabled) or non-empty
136 * (controllers enabled).
137 * - CGROUP_LAYOUT_UNIFIED
138 * The container is running on a pure unified cgroup hierarchy. The unified
139 * hierarchy can be empty (no controllers enabled) or non-empty (controllers
140 * enabled).
ccb4cabe
SH
141 */
142struct cgfsng_handler_data {
ccb4cabe 143 char *cgroup_pattern;
1a0e70ac
CB
144 char *container_cgroup; /* cgroup we created for the container */
145 char *name; /* container name */
43654d34
CB
146 /* per-container cgroup information */
147 struct lxc_cgroup cgroup_meta;
d6337a5f 148 cgroup_layout_t cgroup_layout;
ccb4cabe
SH
149};
150
09f3bb13
CB
151/* @hierarchies
152 * - A NULL-terminated array of struct hierarchy, one per legacy hierarchy. No
153 * duplicates. First sufficient, writeable mounted hierarchy wins.
457ca9aa
SH
154 */
155struct hierarchy **hierarchies;
09f3bb13
CB
156/* Pointer to the unified hierarchy in the null terminated list @hierarchies.
157 * This is merely a convenience for hybrid cgroup layouts to easily retrieve the
158 * unified hierarchy without iterating through @hierarchies.
159 */
d6337a5f 160struct hierarchy *unified;
457ca9aa 161/*
09f3bb13
CB
162 * @cgroup_layout
163 * - What cgroup layout the container is running with.
164 * - CGROUP_LAYOUT_UNKNOWN
165 * The cgroup layout could not be determined. This should be treated as an
166 * error condition.
167 * - CGROUP_LAYOUT_LEGACY
168 * The container is running with all controllers mounted into legacy cgroup
169 * hierarchies.
170 * - CGROUP_LAYOUT_HYBRID
171 * The container is running with at least one controller mounted into a
172 * legacy cgroup hierarchy and a mountpoint for the unified hierarchy. The
173 * unified hierarchy can be empty (no controllers enabled) or non-empty
174 * (controllers enabled).
175 * - CGROUP_LAYOUT_UNIFIED
176 * The container is running on a pure unified cgroup hierarchy. The unified
177 * hierarchy can be empty (no controllers enabled) or non-empty (controllers
178 * enabled).
457ca9aa 179 */
09f3bb13
CB
180cgroup_layout_t cgroup_layout;
181/* What controllers is the container supposed to use. */
457ca9aa
SH
182char *cgroup_use;
183
09f3bb13
CB
/* @lxc_cgfsng_debug
 * - When true, the cgfsng driver prints debug info to stdout.
 */
static bool lxc_cgfsng_debug;

/* printf()-style debug output prefixed with "cgfsng: ". Nothing is printed
 * unless @lxc_cgfsng_debug is set.
 */
#define CGFSNG_DEBUG(format, ...)                                 \
	do {                                                      \
		if (!lxc_cgfsng_debug)                            \
			break;                                    \
		printf("cgfsng: " format, ##__VA_ARGS__);         \
	} while (0)
65d78313 194
ccb4cabe
SH
/* Free every string of the NULL-terminated array @clist and then the array
 * itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);

	free(clist);
}
207
/* Allocate @sz bytes through must_realloc(). Does not fail. */
static void *must_alloc(size_t sz)
{
	void *fresh = must_realloc(NULL, sz);

	return fresh;
}
213
ccb4cabe
SH
/*
 * Grow the NULL-terminated pointer array *@list by one slot. Does not fail.
 *
 * On return the array has room for one more entry and its last slot is set
 * to NULL. The returned index is the slot just before that terminator; the
 * caller is expected to store the new entry there. *@list may be NULL on the
 * first call.
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	if (*list)
		while ((*list)[newentry])
			newentry++;

	/* One slot for the new entry plus one for the terminator. The
	 * elements are "void *", so size them via the element itself.
	 */
	*list = must_realloc(*list, (newentry + 2) * sizeof(**list));
	(*list)[newentry + 1] = NULL;
	return newentry;
}
231
/*
 * Return true if @entry compares equal to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (!list)
		return false;

	for (cur = list; *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
249
ac010944
CB
/* Return a newly allocated copy of @entry with "name=" prepended, i.e. turn
 * "systemd" into "name=systemd". Does not fail. The caller owns the result.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	static const char prefix[] = "name=";
	/* Length of the prefix without its terminating NUL byte. */
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t len;
	char *prefixed;

	len = strlen(entry);
	prefixed = must_alloc(prefix_len + len + 1);

	/* Copy the prefix WITHOUT its NUL and place @entry directly behind it.
	 * Copying sizeof("name=") bytes and appending at that offset would
	 * leave a NUL between prefix and entry, so the result would always
	 * read as just "name=".
	 */
	memcpy(prefixed, prefix, prefix_len);
	memcpy(prefixed + prefix_len, entry, len + 1);
	return prefixed;
}
266
ccb4cabe
SH
/*
 * Append a copy of controller @entry to the NULL-terminated list *@clist.
 * Does not fail. *@clist must be NULL on the first call and always stays
 * NULL-terminated.
 *
 * @klist holds the kernel subsystems, @nlist the named subsystems:
 * - an entry that is in both lists is ambiguous and is not appended;
 * - an entry that already starts with "name=" or that is a kernel
 *   subsystem is copied verbatim;
 * - anything else is treated as a named subsystem and gets "name="
 *   prepended.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}
302
ccb4cabe
SH
303static void free_handler_data(struct cgfsng_handler_data *d)
304{
ccb4cabe
SH
305 free(d->cgroup_pattern);
306 free(d->container_cgroup);
307 free(d->name);
43654d34
CB
308 if (d->cgroup_meta.dir)
309 free(d->cgroup_meta.dir);
310 if (d->cgroup_meta.controllers)
311 free(d->cgroup_meta.controllers);
ccb4cabe
SH
312 free(d);
313}
314
315/*
316 * Given a handler's cgroup data, return the struct hierarchy for the
317 * controller @c, or NULL if there is none.
318 */
457ca9aa 319struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
320{
321 int i;
322
457ca9aa 323 if (!hierarchies)
ccb4cabe 324 return NULL;
d6337a5f 325
457ca9aa 326 for (i = 0; hierarchies[i]; i++) {
d6337a5f
CB
327 if (!c) {
328 /* This is the empty unified hierarchy. */
329 if (hierarchies[i]->controllers &&
330 !hierarchies[i]->controllers[0])
331 return hierarchies[i];
332
333 return NULL;
334 }
335
457ca9aa
SH
336 if (string_in_list(hierarchies[i]->controllers, c))
337 return hierarchies[i];
ccb4cabe 338 }
d6337a5f 339
ccb4cabe
SH
340 return NULL;
341}
342
a54694f8
CB
#define BATCH_SIZE 50
/* Make sure *@mem can hold @newlen bytes. The buffer is grown in multiples of
 * BATCH_SIZE and only reallocated when it was not allocated yet or when
 * @newlen needs more batches than @oldlen did. Does not fail.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	/* Stay in size_t so large lengths are neither truncated nor multiplied
	 * in int.
	 */
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}
353
/* Append the @newlen bytes of @new plus one trailing byte (the terminator of
 * the NUL-terminated line passed by the caller) to *@dest, which currently
 * holds @oldlen bytes. The buffer is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t needed = oldlen + newlen + 1;

	batch_realloc(dest, oldlen, needed);

	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
362
/* Slurp in a whole file line by line.
 *
 * Returns a newly allocated buffer holding the file contents, or NULL when
 * the file cannot be opened or no line could be read from it. The caller
 * frees the result.
 */
static char *read_file(const char *fnam)
{
	FILE *f;
	char *line = NULL, *buf = NULL;
	size_t len = 0, fulllen = 0;
	/* getline() returns ssize_t; an int could truncate long lines. */
	ssize_t linelen;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;
	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}
	fclose(f);
	free(line);
	return buf;
}
382
/* Bit-array helpers, taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit number @bit in the uint32_t array @bitarr.
 * The mask is built from an unsigned constant: "1 << 31" on a signed int is
 * undefined behaviour.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in the uint32_t array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in the uint32_t array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
402
/* Create a cpumask from a cpulist, i.e. turn
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenized in place (its commas are overwritten). @nbits is the
 * number of bits the mask must hold. Returns a calloc()ed array the caller
 * frees, or NULL on allocation failure, on a reversed range or when a cpu
 * number is >= @nbits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token, *dash;
	char *saveptr = NULL;
	unsigned first, last;
	uint32_t *bitarr;

	bitarr = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!bitarr)
		return NULL;

	token = strtok_r(buf, ",", &saveptr);
	while (token) {
		errno = 0;
		first = strtoul(token, NULL, 0);
		last = first;

		/* "a-b" denotes a range, a plain "a" a single cpu. */
		dash = strchr(token, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		if (first > last || last >= nbits) {
			free(bitarr);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, bitarr);

		token = strtok_r(NULL, ",", &saveptr);
	}

	return bitarr;
}
444
a54694f8
CB
445/* Turn cpumask into simple, comma-separated cpulist. */
446static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
447{
448 size_t i;
449 int ret;
eab15c1e 450 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
451 char **cpulist = NULL;
452
453 for (i = 0; i <= nbits; i++) {
454 if (is_set(i, bitarr)) {
eab15c1e
CB
455 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
456 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
457 lxc_free_array((void **)cpulist, free);
458 return NULL;
459 }
460 if (lxc_append_string(&cpulist, numstr) < 0) {
461 lxc_free_array((void **)cpulist, free);
462 return NULL;
463 }
464 }
465 }
466 return lxc_string_join(",", (const char **)cpulist, false);
467}
468
/* Return the last cpu number mentioned in the cpulist string @cpulist, e.g. 3
 * for "0,2-3" and 8 for "0-3,8". The list is taken to be in ascending order,
 * so the number following the last ',' or '-' is the highest one.
 *
 * Returns -1 if strtoul() reports an error via errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	after_dash = strrchr(cpulist, '-');

	/* Pick whichever separator comes last. The pointers are only compared
	 * when both are non-NULL, i.e. when both point into @cpulist.
	 */
	if (after_comma && after_dash)
		last = (after_comma > after_dash ? after_comma : after_dash) + 1;
	else if (after_comma)
		last = after_comma + 1;
	else if (after_dash)
		last = after_dash + 1;
	else
		last = cpulist; /* a single cpu number */

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
503
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/* Set up cpuset.cpus of the cgroup directory @path from its parent directory,
 * leaving out any isolated cpus listed in __ISOL_CPUS.
 *
 * @path           - cgroup directory; must contain a '/'. It is shortened
 *                   temporarily to address the parent directory and is always
 *                   handed back unmodified.
 * @am_initialized - true if cpuset.cpus of @path was already initialized by
 *                   someone else. In that case the file is only rewritten
 *                   when isolated cpus have to be removed.
 *
 * Returns true on success, false on error.
 */
static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}
	/* Cut off the last path component to address the parent cgroup. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	/* <ctype.h> functions need an unsigned char value. */
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is the number of bits in both masks. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1; bit maxposs may already lie
	 * outside of the allocated masks.
	 */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		/* Already initialized and nothing to remove: we are done. */
		if (am_initialized)
			goto on_success;
		/* Not initialized yet: the parent's cpus still have to be
		 * copied, otherwise cpuset.cpus of @path is never written.
		 */
		DEBUG("Copying cpuset of parent cgroup.");
	} else {
		DEBUG("Removed isolated cpus from cpuset.");
	}

	/* posscpus was tokenized in place by lxc_cpumask(), so the list has to
	 * be rebuilt from the mask.
	 */
	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	*lastslash = oldv;
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	/* Hand @path back unmodified on every exit path; callers keep using
	 * it after this function returns.
	 */
	*lastslash = oldv;

	free(fpath);

	free(isolcpus);
	free(isolmask);

	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
641
e3a3fecf
SH
/* Copy contents of parent(@path)/@file to @path/@file.
 *
 * @path must contain a '/'. It is shortened temporarily to build the parent's
 * file name and is restored before anything that can fail is attempted.
 * Returns true on success, false on error.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	/* fpath is a separate allocation, so @path can be restored right
	 * away. That way no error path leaves the caller's buffer truncated.
	 */
	*lastslash = oldv;

	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;
	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the value: it is printed with %s below. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
679
680/*
681 * Initialize the cpuset hierarchy in first directory of @gname and
682 * set cgroup.clone_children so that children inherit settings.
683 * Since the h->base_path is populated by init or ourselves, we know
684 * it is already initialized.
685 */
a3926f6a 686static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
687{
688 char *cgpath, *clonechildrenpath, v, *slash;
689
690 if (!string_in_list(h->controllers, "cpuset"))
691 return true;
692
693 if (*cgname == '/')
694 cgname++;
695 slash = strchr(cgname, '/');
696 if (slash)
697 *slash = '\0';
698
699 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
700 if (slash)
701 *slash = '/';
702 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
703 SYSERROR("Failed to create '%s'", cgpath);
704 free(cgpath);
705 return false;
706 }
6f9584d8 707
e3a3fecf 708 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
709 /* unified hierarchy doesn't have clone_children */
710 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
711 free(clonechildrenpath);
712 free(cgpath);
713 return true;
714 }
715 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
716 SYSERROR("Failed to read '%s'", clonechildrenpath);
717 free(clonechildrenpath);
718 free(cgpath);
719 return false;
720 }
721
a54694f8 722 /* Make sure any isolated cpus are removed from cpuset.cpus. */
a3926f6a 723 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
6f9584d8
CB
724 SYSERROR("Failed to remove isolated cpus.");
725 free(clonechildrenpath);
726 free(cgpath);
a54694f8 727 return false;
6f9584d8 728 }
a54694f8 729
e3a3fecf 730 if (v == '1') { /* already set for us by someone else */
6f9584d8 731 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
732 free(clonechildrenpath);
733 free(cgpath);
734 return true;
735 }
736
737 /* copy parent's settings */
a54694f8 738 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 739 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
740 free(cgpath);
741 free(clonechildrenpath);
742 return false;
743 }
744 free(cgpath);
745
746 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
747 /* Set clone_children so children inherit our settings */
748 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
749 free(clonechildrenpath);
750 return false;
751 }
752 free(clonechildrenpath);
753 return true;
754}
755
ccb4cabe
SH
/*
 * Return true if the two NULL-terminated string lists @l1 and @l2 have at
 * least one string in common. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (!l1 || !l2)
		return false;

	for (cur = l1; *cur; cur++)
		if (string_in_list(l2, *cur))
			return true;

	return false;
}
773
774/*
775 * For a null-terminated list of controllers @clist, return true if any of
776 * those controllers is already listed the null-terminated list of
777 * hierarchies @hlist. Realistically, if one is present, all must be present.
778 */
779static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
780{
781 int i;
782
783 if (!hlist)
784 return false;
785 for (i = 0; hlist[i]; i++)
786 if (controller_lists_intersect(hlist[i]->controllers, clist))
787 return true;
788 return false;
789
790}
791
792/*
793 * Return true if the controller @entry is found in the null-terminated
794 * list of hierarchies @hlist
795 */
796static bool controller_found(struct hierarchy **hlist, char *entry)
797{
798 int i;
d6337a5f 799
ccb4cabe
SH
800 if (!hlist)
801 return false;
802
803 for (i = 0; hlist[i]; i++)
804 if (string_in_list(hlist[i]->controllers, entry))
805 return true;
d6337a5f 806
ccb4cabe
SH
807 return false;
808}
809
810/*
c30b61c3
SH
811 * Return true if all of the controllers which we require have been found.
812 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 813 */
457ca9aa 814static bool all_controllers_found(void)
ccb4cabe
SH
815{
816 char *p, *saveptr = NULL;
457ca9aa 817 struct hierarchy ** hlist = hierarchies;
ccb4cabe 818
ccb4cabe 819 if (!controller_found(hlist, "freezer")) {
65d78313 820 CGFSNG_DEBUG("No freezer controller mountpoint found\n");
ccb4cabe
SH
821 return false;
822 }
823
457ca9aa 824 if (!cgroup_use)
ccb4cabe 825 return true;
c2712f64 826
457ca9aa 827 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
828 p = strtok_r(NULL, ",", &saveptr)) {
829 if (!controller_found(hlist, p)) {
65d78313 830 CGFSNG_DEBUG("No %s controller mountpoint found\n", p);
ccb4cabe
SH
831 return false;
832 }
833 }
c2712f64 834
ccb4cabe
SH
835 return true;
836}
837
ccb4cabe
SH
838/*
839 * Get the controllers from a mountinfo line
840 * There are other ways we could get this info. For lxcfs, field 3
841 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
842 * options. But we simply assume that the mountpoint must be
843 * /sys/fs/cgroup/controller-list
844 */
a3926f6a
CB
845static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
846 int type)
ccb4cabe 847{
6328fd9c 848 /* the fourth field is /sys/fs/cgroup/comma-delimited-controller-list */
ccb4cabe 849 int i;
411ac6d8 850 char *dup, *p2, *tok;
d6337a5f 851 char *p = line, *saveptr = NULL, *sep = ",";
411ac6d8 852 char **aret = NULL;
6328fd9c 853
ccb4cabe 854 for (i = 0; i < 4; i++) {
235f1815 855 p = strchr(p, ' ');
ccb4cabe
SH
856 if (!p)
857 return NULL;
858 p++;
859 }
a55f31bd 860
ccb4cabe
SH
861 /* note - if we change how mountinfo works, then our caller
862 * will need to verify /sys/fs/cgroup/ in this field */
c2712f64 863 if (strncmp(p, "/sys/fs/cgroup/", 15)) {
65d78313 864 CGFSNG_DEBUG("Found hierarchy not under /sys/fs/cgroup: \"%s\"\n", p);
ccb4cabe 865 return NULL;
5059aae9 866 }
d6337a5f 867
ccb4cabe 868 p += 15;
235f1815 869 p2 = strchr(p, ' ');
ccb4cabe 870 if (!p2) {
65d78313 871 CGFSNG_DEBUG("Corrupt mountinfo\n");
ccb4cabe
SH
872 return NULL;
873 }
874 *p2 = '\0';
6328fd9c 875
d6337a5f
CB
876 if (type == CGROUP_SUPER_MAGIC) {
877 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will
878 * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
879 */
880 dup = strdup(p);
881 if (!dup)
882 return NULL;
883
884 for (tok = strtok_r(dup, sep, &saveptr); tok;
885 tok = strtok_r(NULL, sep, &saveptr))
886 must_append_controller(klist, nlist, &aret, tok);
887
888 free(dup);
411ac6d8 889 }
d6337a5f
CB
890 *p2 = ' ';
891 return aret;
892}
411ac6d8 893
d6337a5f
CB
/* Return a newly allocated controller list without entries, i.e. a list whose
 * first element is NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot;

	slot = append_null_to_list((void ***)&empty);
	empty[slot] = NULL;

	return empty;
}
903
/* Read @file and return its whitespace-separated words (space, tab and
 * newline are separators) as a NULL-terminated list of copies.
 *
 * Returns NULL if read_file() returns NULL or if the file has no words.
 */
static char **cg_unified_get_controllers(const char *file)
{
	char *contents, *word;
	char *saveptr = NULL, *sep = " \t\n";
	char **aret = NULL;

	contents = read_file(file);
	if (!contents)
		return NULL;

	word = strtok_r(contents, sep, &saveptr);
	while (word) {
		int slot = append_null_to_list((void ***)&aret);

		aret[slot] = must_copy_string(word);
		word = strtok_r(NULL, sep, &saveptr);
	}

	free(contents);
	return aret;
}
927
d6337a5f
CB
928static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
929 char *base_cgroup, int type)
ccb4cabe
SH
930{
931 struct hierarchy *new;
932 int newentry;
933
934 new = must_alloc(sizeof(*new));
935 new->controllers = clist;
936 new->mountpoint = mountpoint;
937 new->base_cgroup = base_cgroup;
938 new->fullcgpath = NULL;
d6337a5f 939 new->version = type;
6328fd9c 940
457ca9aa
SH
941 newentry = append_null_to_list((void ***)&hierarchies);
942 hierarchies[newentry] = new;
d6337a5f 943 return new;
ccb4cabe
SH
944}
945
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo. @line is not modified.
 *
 * Returns a newly allocated string owned by the caller, or NULL if the line
 * has too few fields or its mount point is not below /sys/fs/cgroup/.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int i;
	char *p2;
	size_t len;
	char *p = line;
	char *sret = NULL;

	/* The mount point follows the fourth space of the line. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, "/sys/fs/cgroup/", 15))
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;

	/* Copy exactly the field instead of writing a NUL into the caller's
	 * buffer.
	 */
	len = p2 - p;
	sret = must_alloc(len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}
979
/*
 * Given the multi-line string @p, return a newly allocated, NUL-terminated
 * copy of everything up to (not including) the first newline. Returns NULL
 * if @p contains no newline.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t linelen;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	linelen = newline - p;
	copy = must_alloc(linelen + 1);
	memcpy(copy, p, linelen);
	copy[linelen] = '\0';

	return copy;
}
998
/*
 * @cgline: pointer to the character after the first ':' in a line of a
 * \n-terminated /proc/self/cgroup file, i.e. to the comma-separated
 * controller list that ends at the next ':'.
 *
 * Return true if controller @c is one of the entries of that list. Returns
 * false if @cgline contains no ':' or if @c is empty.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *tok, *end, *eol;
	size_t clen;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	clen = strlen(c);
	if (clen == 0)
		return false;

	/* Compare the entries in place. This avoids copying a field of
	 * unbounded length onto the stack.
	 */
	for (tok = cgline; tok < eol; tok = end + 1) {
		end = memchr(tok, ',', eol - tok);
		if (!end)
			end = eol;

		if ((size_t)(end - tok) == clen && memcmp(tok, c, clen) == 0)
			return true;
	}

	return false;
}
1026
1027/*
1028 * @basecginfo is a copy of /proc/$$/cgroup. Return the current
1029 * cgroup for @controller
1030 */
a3926f6a 1031static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller, int type)
ccb4cabe
SH
1032{
1033 char *p = basecginfo;
6328fd9c 1034
d6337a5f
CB
1035 for (;;) {
1036 bool is_cgv2_base_cgroup = false;
1037
6328fd9c 1038 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
1039 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
1040 is_cgv2_base_cgroup = true;
ccb4cabe 1041
235f1815 1042 p = strchr(p, ':');
ccb4cabe
SH
1043 if (!p)
1044 return NULL;
1045 p++;
d6337a5f
CB
1046
1047 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 1048 p = strchr(p, ':');
ccb4cabe
SH
1049 if (!p)
1050 return NULL;
1051 p++;
1052 return copy_to_eol(p);
1053 }
1054
235f1815 1055 p = strchr(p, '\n');
ccb4cabe
SH
1056 if (!p)
1057 return NULL;
1058 p++;
1059 }
1060}
1061
ccb4cabe
SH
/* Append a copy of @entry to the NULL-terminated string list *@list.
 * Does not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot;

	slot = append_null_to_list((void ***)list);
	(*list)[slot] = must_copy_string(entry);
}
1070
/* Parse /proc/self/cgroup and sort the controllers found there into two
 * lists: entries starting with "name=" are appended to *@nlist, all others to
 * *@klist. Lines without two ':' are skipped.
 *
 * Returns 0 on success and -1 if the file cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *controllers, *end, *tok, *saveptr = NULL;

		/* The controller list is the second ':'-separated field. */
		controllers = strchr(line, ':');
		if (!controllers)
			continue;
		controllers++;

		end = strchr(controllers, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (end == controllers) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		tok = strtok_r(controllers, ",", &saveptr);
		while (tok) {
			if (!strncmp(tok, "name=", 5))
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);

			tok = strtok_r(NULL, ",", &saveptr);
		}
	}

	free(line);
	fclose(f);
	return 0;
}
1117
/*
 * Strip trailing newlines from @s in place. The first character is never
 * removed, so a string consisting only of newlines is reduced to "\n".
 */
static void trim(char *s)
{
	size_t end;

	for (end = strlen(s); end > 1 && s[end - 1] == '\n'; end--)
		s[end - 1] = '\0';
}
1124
e4aeecf5
CB
1125static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
1126{
1127 printf("Cgroup information:\n");
1128 printf(" container name: %s\n", d->name ? d->name : "(null)");
1129 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
43654d34
CB
1130 printf(" lxc.cgroup.pattern: %s\n",
1131 d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1132 printf(" lxc.cgroup.dir: %s\n",
1133 d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
1134 printf(" cgroup: %s\n",
1135 d->container_cgroup ? d->container_cgroup : "(null)");
e4aeecf5
CB
1136}
1137
1138static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1139{
a7b0cc4c 1140 struct hierarchy **it;
ccb4cabe 1141 int i;
41c33dbe 1142
457ca9aa 1143 if (!hierarchies) {
c2712f64 1144 printf(" No hierarchies found\n");
ccb4cabe
SH
1145 return;
1146 }
e4aeecf5 1147 printf(" Hierarchies:\n");
a7b0cc4c
CB
1148 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1149 char **cit;
ccb4cabe 1150 int j;
c2712f64
CB
1151 printf(" %d: base_cgroup: %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1152 printf(" mountpoint: %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
e4aeecf5 1153 printf(" controllers:\n");
a7b0cc4c 1154 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1155 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1156 }
1157}
41c33dbe 1158
a3926f6a
CB
/*
 * Dump @basecginfo followed by the kernel (@klist) and named (@nlist)
 * subsystem lists to stdout. Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	for (idx = 0; klist && klist[idx]; idx++)
		printf("kernel subsystem %d: %s\n", idx, klist[idx]);

	for (idx = 0; nlist && nlist[idx]; idx++)
		printf("named subsystem %d: %s\n", idx, nlist[idx]);
}
ccb4cabe 1173
e4aeecf5
CB
/* Print the handler data in @d, then the global hierarchy list. */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	/* Per-container settings first ... */
	lxc_cgfsng_print_handler_data(d);

	/* ... then the hierarchies shared by all containers. */
	lxc_cgfsng_print_hierarchies();
}
1179
ccb4cabe
SH
1180/*
1181 * At startup, parse_hierarchies finds all the info we need about
1182 * cgroup mountpoints and current cgroups, and stores it in @d.
1183 */
a3926f6a 1184static bool cg_hybrid_init(void)
ccb4cabe 1185{
d6337a5f
CB
1186 int ret;
1187 char *basecginfo;
1188 bool will_escape;
ccb4cabe 1189 FILE *f;
ccb4cabe 1190 size_t len = 0;
d6337a5f
CB
1191 char *line = NULL;
1192 char **klist = NULL, **nlist = NULL;
ccb4cabe 1193
d30ec4cb
SH
1194 /*
1195 * Root spawned containers escape the current cgroup, so use init's
1196 * cgroups as our base in that case.
1197 */
d6337a5f
CB
1198 will_escape = (geteuid() == 0);
1199 if (will_escape)
ccb4cabe 1200 basecginfo = read_file("/proc/1/cgroup");
d6337a5f
CB
1201 else
1202 basecginfo = read_file("/proc/self/cgroup");
ccb4cabe
SH
1203 if (!basecginfo)
1204 return false;
1205
d6337a5f
CB
1206 ret = get_existing_subsystems(&klist, &nlist);
1207 if (ret < 0) {
1208 CGFSNG_DEBUG("Failed to retrieve available cgroup v1 controllers\n");
1209 free(basecginfo);
ccb4cabe
SH
1210 return false;
1211 }
1212
d6337a5f
CB
1213 f = fopen("/proc/self/mountinfo", "r");
1214 if (!f) {
1215 CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
bd01b7d5 1216 free(basecginfo);
d6337a5f
CB
1217 return false;
1218 }
41c33dbe 1219
e4aeecf5
CB
1220 if (lxc_cgfsng_debug)
1221 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe 1222
ccb4cabe 1223 while (getline(&line, &len, f) != -1) {
49ff3958 1224 int type;
d6337a5f
CB
1225 bool writeable;
1226 struct hierarchy *new;
1227 char *mountpoint = NULL, *base_cgroup = NULL;
1228 char **controller_list = NULL;
ccb4cabe 1229
49ff3958 1230 type = get_cgroup_version(line);
d6337a5f 1231 if (type == 0)
ccb4cabe
SH
1232 continue;
1233
d6337a5f 1234 if (type == CGROUP2_SUPER_MAGIC && unified)
ccb4cabe
SH
1235 continue;
1236
d6337a5f
CB
1237 if (cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
1238 if (type == CGROUP2_SUPER_MAGIC)
1239 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1240 else if (type == CGROUP_SUPER_MAGIC)
1241 cgroup_layout = CGROUP_LAYOUT_LEGACY;
1242 } else if (cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
1243 if (type == CGROUP_SUPER_MAGIC)
1244 cgroup_layout = CGROUP_LAYOUT_HYBRID;
1245 } else if (cgroup_layout == CGROUP_LAYOUT_LEGACY) {
1246 if (type == CGROUP2_SUPER_MAGIC)
1247 cgroup_layout = CGROUP_LAYOUT_HYBRID;
ccb4cabe
SH
1248 }
1249
a3926f6a 1250 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
d6337a5f
CB
1251 if (!controller_list && type == CGROUP_SUPER_MAGIC)
1252 continue;
1253
1254 if (type == CGROUP_SUPER_MAGIC)
1255 if (controller_list_is_dup(hierarchies, controller_list))
1256 goto next;
1257
a3926f6a 1258 mountpoint = cg_hybrid_get_mountpoint(line);
ccb4cabe 1259 if (!mountpoint) {
65d78313 1260 CGFSNG_DEBUG("Failed parsing mountpoint from \"%s\"\n", line);
d6337a5f 1261 goto next;
ccb4cabe
SH
1262 }
1263
d6337a5f 1264 if (type == CGROUP_SUPER_MAGIC)
a3926f6a 1265 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
d6337a5f 1266 else
a3926f6a 1267 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
ccb4cabe 1268 if (!base_cgroup) {
d6337a5f
CB
1269 CGFSNG_DEBUG("Failed to find current cgroup\n");
1270 goto next;
ccb4cabe 1271 }
6328fd9c 1272
ccb4cabe
SH
1273 trim(base_cgroup);
1274 prune_init_scope(base_cgroup);
d6337a5f 1275 if (type == CGROUP2_SUPER_MAGIC)
6328fd9c
CB
1276 writeable = test_writeable_v2(mountpoint, base_cgroup);
1277 else
1278 writeable = test_writeable_v1(mountpoint, base_cgroup);
d6337a5f
CB
1279 if (!writeable)
1280 goto next;
1281
1282 if (type == CGROUP2_SUPER_MAGIC) {
1283 char *cgv2_ctrl_path;
1284
1285 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
1286 "cgroup.controllers",
1287 NULL);
1288
1289 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
1290 free(cgv2_ctrl_path);
1291 if (!controller_list)
1292 controller_list = cg_unified_make_empty_controller();
ccb4cabe 1293 }
d6337a5f
CB
1294 new = add_hierarchy(controller_list, mountpoint, base_cgroup, type);
1295 if (type == CGROUP2_SUPER_MAGIC && !unified)
1296 unified = new;
1297
1298 continue;
1299
1300 next:
1301 free_string_list(controller_list);
1302 free(mountpoint);
1303 free(base_cgroup);
ccb4cabe
SH
1304 }
1305
1306 free_string_list(klist);
1307 free_string_list(nlist);
1308
1309 free(basecginfo);
1310
1311 fclose(f);
1312 free(line);
1313
e4aeecf5
CB
1314 if (lxc_cgfsng_debug) {
1315 printf("writeable subsystems:\n");
1316 lxc_cgfsng_print_hierarchies();
1317 }
1318
ccb4cabe
SH
1319 /* verify that all controllers in cgroup.use and all crucial
1320 * controllers are accounted for
1321 */
c2712f64 1322 if (!all_controllers_found())
ccb4cabe
SH
1323 return false;
1324
1325 return true;
1326}
1327
d6337a5f
CB
1328static int cg_is_pure_unified(void) {
1329
1330 int ret;
1331 struct statfs fs;
1332
1333 ret = statfs("/sys/fs/cgroup", &fs);
1334 if (ret < 0)
1335 return -ENOMEDIUM;
1336
1337 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
1338 return CGROUP2_SUPER_MAGIC;
1339
1340 return 0;
1341}
1342
/*
 * Get the current cgroup for the cgroupfs v2 hierarchy, i.e. the path of the
 * "0::/..." entry. The entry is read from "/proc/1/cgroup" when running with
 * effective uid 0 and from "/proc/self/cgroup" otherwise.
 *
 * Returns the path with trailing newlines trimmed, or NULL if the file cannot
 * be read or has no such entry.
 */
static char *cg_unified_get_current_cgroup(void)
{
	char *cginfo, *entry;
	char *result = NULL;

	if (geteuid() == 0)
		cginfo = read_file("/proc/1/cgroup");
	else
		cginfo = read_file("/proc/self/cgroup");
	if (!cginfo)
		return NULL;

	entry = strstr(cginfo, "0::/");
	if (entry)
		/* Skip the "0::" prefix so the copy starts at the '/'. */
		result = copy_to_eol(entry + 3);

	free(cginfo);
	if (result)
		trim(result);

	return result;
}
1375
a3926f6a 1376static int cg_unified_init(void)
d6337a5f
CB
1377{
1378 int ret;
1379 char *mountpoint, *subtree_path;
1380 char **delegatable;
1381 char *base_cgroup = NULL;
1382
1383 ret = cg_is_pure_unified();
1384 if (ret == -ENOMEDIUM)
1385 return -ENOMEDIUM;
1386
1387 if (ret != CGROUP2_SUPER_MAGIC)
1388 return 0;
1389
a3926f6a 1390 base_cgroup = cg_unified_get_current_cgroup();
d6337a5f
CB
1391 if (!base_cgroup)
1392 return -EINVAL;
1393 prune_init_scope(base_cgroup);
1394
1395 /* We assume that we have already been given controllers to delegate
1396 * further down the hierarchy. If not it is up to the user to delegate
1397 * them to us.
1398 */
1399 mountpoint = must_copy_string("/sys/fs/cgroup");
1400 subtree_path = must_make_path(mountpoint, base_cgroup,
1401 "cgroup.subtree_control", NULL);
1402 delegatable = cg_unified_get_controllers(subtree_path);
1403 free(subtree_path);
1404 if (!delegatable)
1405 delegatable = cg_unified_make_empty_controller();
1406 if (!delegatable[0])
1407 CGFSNG_DEBUG("No controllers are enabled for delegation\n");
1408
1409 /* TODO: If the user requested specific controllers via lxc.cgroup.use
1410 * we should verify here. The reason I'm not doing it right is that I'm
1411 * not convinced that lxc.cgroup.use will be the future since it is a
1412 * global property. I much rather have an option that lets you request
1413 * controllers per container.
1414 */
1415
1416 add_hierarchy(delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
1417 unified = hierarchies[0];
1418
1419 cgroup_layout = CGROUP_LAYOUT_UNIFIED;
1420 return CGROUP2_SUPER_MAGIC;
1421}
1422
1423static bool cg_init(void)
1424{
1425 int ret;
457ca9aa 1426 const char *tmp;
d6337a5f 1427
457ca9aa
SH
1428 errno = 0;
1429 tmp = lxc_global_config_value("lxc.cgroup.use");
1a0e70ac 1430 if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
65d78313 1431 CGFSNG_DEBUG("Failed to retrieve list of cgroups to use\n");
457ca9aa
SH
1432 return false;
1433 }
1434 cgroup_use = must_copy_string(tmp);
1435
a3926f6a 1436 ret = cg_unified_init();
d6337a5f
CB
1437 if (ret < 0)
1438 return false;
1439
1440 if (ret == CGROUP2_SUPER_MAGIC)
1441 return true;
1442
a3926f6a 1443 return cg_hybrid_init();
457ca9aa
SH
1444}
1445
43654d34 1446static void *cgfsng_init(struct lxc_handler *handler)
ccb4cabe 1447{
457ca9aa 1448 const char *cgroup_pattern;
43654d34 1449 struct cgfsng_handler_data *d;
ccb4cabe
SH
1450
1451 d = must_alloc(sizeof(*d));
1452 memset(d, 0, sizeof(*d));
1453
43654d34
CB
1454 /* copy container name */
1455 d->name = must_copy_string(handler->name);
1456
1457 /* copy per-container cgroup information */
ae5e6c08
CB
1458 d->cgroup_meta.dir = NULL;
1459 d->cgroup_meta.controllers = NULL;
9b5396f9
CB
1460 if (handler->conf) {
1461 d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
1462 d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
1463 }
ccb4cabe 1464
43654d34 1465 /* copy system-wide cgroup information */
ccb4cabe 1466 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
43654d34
CB
1467 if (!cgroup_pattern) {
1468 /* lxc.cgroup.pattern is only NULL on error. */
ccb4cabe
SH
1469 ERROR("Error getting cgroup pattern");
1470 goto out_free;
1471 }
1472 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1473
d6337a5f
CB
1474 d->cgroup_layout = cgroup_layout;
1475 if (d->cgroup_layout == CGROUP_LAYOUT_LEGACY)
1476 TRACE("Running with legacy cgroup layout");
1477 else if (d->cgroup_layout == CGROUP_LAYOUT_HYBRID)
1478 TRACE("Running with hybrid cgroup layout");
1479 else if (d->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
1480 TRACE("Running with unified cgroup layout");
1481 else
1482 WARN("Running with unknown cgroup layout");
1483
e4aeecf5
CB
1484 if (lxc_cgfsng_debug)
1485 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1486
1487 return d;
1488
1489out_free:
1490 free_handler_data(d);
1491 return NULL;
1492}
1493
/*
 * Remove the directory @dirname and every directory below it, depth first.
 * Entries that are not directories are left alone.
 *
 * Removal is best effort: all subdirectories are attempted even after a
 * failure, and only the first failure is logged.
 *
 * Returns 0 if everything was removed and -1 if any step failed.
 */
static int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat %s", pathname);
			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;
	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("%s - Failed to delete \"%s\"", strerror(errno),
			     dirname);
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* Report the operation that actually failed. */
		if (!r)
			WARN("%s - Failed to close directory \"%s\"",
			     strerror(errno), dirname);
		r = -1;
	}

	return r;
}
1551
bd8ef4e4
CB
1552static int cgroup_rmdir(char *container_cgroup)
1553{
1554 int i;
1555
1556 if (!container_cgroup || !hierarchies)
1557 return 0;
1558
1559 for (i = 0; hierarchies[i]; i++) {
1560 int ret;
1561 struct hierarchy *h = hierarchies[i];
1562
1563 if (!h->fullcgpath)
1564 continue;
1565
1566 ret = recursive_destroy(h->fullcgpath);
1567 if (ret < 0)
1568 WARN("Failed to destroy \"%s\"", h->fullcgpath);
1569
1570 free(h->fullcgpath);
1571 h->fullcgpath = NULL;
1572 }
1573
1574 return 0;
1575}
1576
4160c3a0
CB
/* Argument bundle handed to the callbacks run through userns_exec_1(). */
struct generic_userns_exec_data {
	/* cgroup handler data of the container */
	struct cgfsng_handler_data *d;
	/* container configuration (id maps, init uid/gid) */
	struct lxc_conf *conf;
	/* target uid in parent namespace */
	uid_t origuid;
	char *path;
};
1583
bd8ef4e4 1584static int cgroup_rmdir_wrapper(void *data)
ccb4cabe 1585{
6efacf80 1586 int ret;
4160c3a0
CB
1587 struct generic_userns_exec_data *arg = data;
1588 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1589 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1590
6efacf80
CB
1591 ret = setresgid(nsgid, nsgid, nsgid);
1592 if (ret < 0) {
1593 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1594 (int)nsgid, (int)nsgid);
1595 return -1;
1596 }
1597
1598 ret = setresuid(nsuid, nsuid, nsuid);
1599 if (ret < 0) {
1600 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1601 (int)nsuid, (int)nsuid);
1602 return -1;
1603 }
1604
1605 ret = setgroups(0, NULL);
1606 if (ret < 0 && errno != EPERM) {
1607 SYSERROR("Failed to setgroups(0, NULL)");
1608 return -1;
1609 }
ccb4cabe 1610
bd8ef4e4 1611 return cgroup_rmdir(arg->d->container_cgroup);
ccb4cabe
SH
1612}
1613
bd8ef4e4 1614static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
ccb4cabe 1615{
bd8ef4e4
CB
1616 int ret;
1617 struct cgfsng_handler_data *d = hdata;
4160c3a0
CB
1618 struct generic_userns_exec_data wrap;
1619
bd8ef4e4
CB
1620 if (!d)
1621 return;
1622
4160c3a0 1623 wrap.origuid = 0;
bd8ef4e4 1624 wrap.d = hdata;
4160c3a0
CB
1625 wrap.conf = conf;
1626
ccb4cabe 1627 if (conf && !lxc_list_empty(&conf->id_map))
bd8ef4e4
CB
1628 ret = userns_exec_1(conf, cgroup_rmdir_wrapper, &wrap,
1629 "cgroup_rmdir_wrapper");
ccb4cabe 1630 else
bd8ef4e4
CB
1631 ret = cgroup_rmdir(d->container_cgroup);
1632 if (ret < 0) {
1633 WARN("Failed to destroy cgroups");
ccb4cabe 1634 return;
ccb4cabe
SH
1635 }
1636
1637 free_handler_data(d);
1638}
1639
1640struct cgroup_ops *cgfsng_ops_init(void)
1641{
e4aeecf5
CB
1642 if (getenv("LXC_DEBUG_CGFSNG"))
1643 lxc_cgfsng_debug = true;
1644
d6337a5f 1645 if (!cg_init())
457ca9aa 1646 return NULL;
e4aeecf5 1647
ccb4cabe
SH
1648 return &cgfsng_ops;
1649}
1650
a3926f6a 1651static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
0c3deb94
CB
1652{
1653 char **it;
1654 size_t i, parts_len;
1655 size_t full_len = 0;
1656 char *add_controllers = NULL, *cgroup = NULL;
1657 char **parts = NULL;
1658 bool bret = false;
1659
1660 if (h->version != CGROUP2_SUPER_MAGIC)
1661 return true;
1662
1663 if (!h->controllers)
1664 return true;
1665
1666 /* For now we simply enable all controllers that we have detected by
1667 * creating a string like "+memory +pids +cpu +io".
1668 * TODO: In the near future we might want to support "-<controller>"
1669 * etc. but whether supporting semantics like this make sense will need
1670 * some thinking.
1671 */
1672 for (it = h->controllers; it && *it; it++) {
1673 full_len += strlen(*it) + 2;
1674 add_controllers = must_realloc(add_controllers, full_len + 1);
1675 if (h->controllers[0] == *it)
1676 add_controllers[0] = '\0';
1677 strcat(add_controllers, "+");
1678 strcat(add_controllers, *it);
1679 if ((it + 1) && *(it + 1))
1680 strcat(add_controllers, " ");
1681 }
1682
1683 parts = lxc_string_split(cgname, '/');
1684 if (!parts)
1685 goto on_error;
1686 parts_len = lxc_array_len((void **)parts);
1687 if (parts_len > 0)
1688 parts_len--;
1689
1690 cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
1691 for (i = 0; i < parts_len; i++) {
1692 int ret;
1693 char *target;
1694
1695 cgroup = must_append_path(cgroup, parts[i], NULL);
1696 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1697 ret = lxc_write_to_file(target, add_controllers, full_len, false);
1698 free(target);
1699 if (ret < 0) {
1700 SYSERROR("Could not enable \"%s\" controllers in the "
1701 "unified cgroup \"%s\"", add_controllers, cgroup);
1702 goto on_error;
1703 }
1704 }
1705
1706 bret = true;
1707
1708on_error:
1709 lxc_free_array((void **)parts, free);
1710 free(add_controllers);
1711 free(cgroup);
1712 return bret;
1713}
1714
ccb4cabe
SH
1715static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1716{
0c3deb94
CB
1717 int ret;
1718
e3a3fecf 1719 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1a0e70ac 1720 if (dir_exists(h->fullcgpath)) { /* it must not already exist */
0c3deb94 1721 ERROR("cgroup \"%s\" already existed", h->fullcgpath);
d8da679e 1722 return false;
6f9584d8 1723 }
0c3deb94 1724
a3926f6a 1725 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
0c3deb94
CB
1726 ERROR("Failed to handle cgroupfs v1 cpuset controller");
1727 return false;
1728 }
1729
1730 ret = mkdir_p(h->fullcgpath, 0755);
1731 if (ret < 0) {
1732 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
e3a3fecf 1733 return false;
6f9584d8 1734 }
0c3deb94 1735
a3926f6a 1736 return cg_unified_create_cgroup(h, cgname);
ccb4cabe
SH
1737}
1738
1739static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1740{
1741 if (rmdir(h->fullcgpath) < 0)
1742 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1743 free(h->fullcgpath);
1744 h->fullcgpath = NULL;
1745}
1746
1747/*
d30ec4cb 1748 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1749 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1750 */
1751static inline bool cgfsng_create(void *hdata)
1752{
bb30b52a 1753 int i;
ccb4cabe 1754 size_t len;
0c3deb94 1755 char *container_cgroup, *offset, *tmp;
7d531e9b
CB
1756 int idx = 0;
1757 struct cgfsng_handler_data *d = hdata;
ccb4cabe
SH
1758
1759 if (!d)
1760 return false;
43654d34 1761
ccb4cabe
SH
1762 if (d->container_cgroup) {
1763 WARN("cgfsng_create called a second time");
1764 return false;
1765 }
1766
43654d34 1767 if (d->cgroup_meta.dir)
7d531e9b 1768 tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
43654d34
CB
1769 else
1770 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
ccb4cabe
SH
1771 if (!tmp) {
1772 ERROR("Failed expanding cgroup name pattern");
1773 return false;
1774 }
1a0e70ac 1775 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
0c3deb94
CB
1776 container_cgroup = must_alloc(len);
1777 strcpy(container_cgroup, tmp);
ccb4cabe 1778 free(tmp);
0c3deb94 1779 offset = container_cgroup + len - 5;
ccb4cabe
SH
1780
1781again:
95adfe93
SH
1782 if (idx == 1000) {
1783 ERROR("Too many conflicting cgroup names");
ccb4cabe 1784 goto out_free;
95adfe93 1785 }
66b66624 1786 if (idx) {
bb30b52a
CB
1787 int ret;
1788
66b66624
CB
1789 ret = snprintf(offset, 5, "-%d", idx);
1790 if (ret < 0 || (size_t)ret >= 5) {
1791 FILE *f = fopen("/dev/null", "w");
97ebced3 1792 if (f) {
66b66624
CB
1793 fprintf(f, "Workaround for GCC7 bug: "
1794 "https://gcc.gnu.org/bugzilla/"
1795 "show_bug.cgi?id=78969");
1796 fclose(f);
1797 }
1798 }
1799 }
457ca9aa 1800 for (i = 0; hierarchies[i]; i++) {
0c3deb94 1801 if (!create_path_for_hierarchy(hierarchies[i], container_cgroup)) {
ccb4cabe 1802 int j;
1a0e70ac 1803 ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
457ca9aa
SH
1804 free(hierarchies[i]->fullcgpath);
1805 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1806 for (j = 0; j < i; j++)
0c3deb94 1807 remove_path_for_hierarchy(hierarchies[j], container_cgroup);
ccb4cabe
SH
1808 idx++;
1809 goto again;
1810 }
1811 }
1812 /* Done */
0c3deb94 1813 d->container_cgroup = container_cgroup;
ccb4cabe
SH
1814 return true;
1815
1816out_free:
0c3deb94 1817 free(container_cgroup);
ccb4cabe
SH
1818 return false;
1819}
1820
ccb4cabe
SH
1821static bool cgfsng_enter(void *hdata, pid_t pid)
1822{
ccb4cabe
SH
1823 char pidstr[25];
1824 int i, len;
1825
1826 len = snprintf(pidstr, 25, "%d", pid);
1827 if (len < 0 || len > 25)
1828 return false;
1829
457ca9aa
SH
1830 for (i = 0; hierarchies[i]; i++) {
1831 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1832 "cgroup.procs", NULL);
1833 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1834 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1835 free(fullpath);
1836 return false;
1837 }
1838 free(fullpath);
1839 }
1840
1841 return true;
1842}
1843
6efacf80
CB
/*
 * chown() @path to @chown_uid:@chown_gid, then chmod() it to @chmod_mode.
 *
 * Returns 0 on success. On the first failing call a warning is logged and -1
 * is returned; the chmod() is not attempted if the chown() fails.
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	if (chown(path, chown_uid, chown_gid) < 0) {
		WARN("%s - Failed to chown(%s, %d, %d)", strerror(errno), path,
		     (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		WARN("%s - Failed to chmod(%s, %d)", strerror(errno), path,
		     (int)chmod_mode);
		return -1;
	}

	return 0;
}
1865
1866/* chgrp the container cgroups to container group. We leave
c0888dfe
SH
1867 * the container owner as cgroup owner. So we must make the
1868 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1869 *
1870 * Also chown the tasks and cgroup.procs files. Those may not
1871 * exist depending on kernel version.
c0888dfe 1872 */
ccb4cabe
SH
1873static int chown_cgroup_wrapper(void *data)
1874{
6efacf80 1875 int i, ret;
4160c3a0
CB
1876 uid_t destuid;
1877 struct generic_userns_exec_data *arg = data;
1878 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1879 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1880
6efacf80
CB
1881 ret = setresgid(nsgid, nsgid, nsgid);
1882 if (ret < 0) {
1883 SYSERROR("Failed to setresgid(%d, %d, %d)",
1884 (int)nsgid, (int)nsgid, (int)nsgid);
1885 return -1;
1886 }
1887
1888 ret = setresuid(nsuid, nsuid, nsuid);
1889 if (ret < 0) {
1890 SYSERROR("Failed to setresuid(%d, %d, %d)",
1891 (int)nsuid, (int)nsuid, (int)nsuid);
1892 return -1;
1893 }
1894
1895 ret = setgroups(0, NULL);
1896 if (ret < 0 && errno != EPERM) {
1897 SYSERROR("Failed to setgroups(0, NULL)");
1898 return -1;
1899 }
ccb4cabe
SH
1900
1901 destuid = get_ns_uid(arg->origuid);
1902
457ca9aa 1903 for (i = 0; hierarchies[i]; i++) {
6efacf80
CB
1904 char *fullpath;
1905 char *path = hierarchies[i]->fullcgpath;
43647298 1906
63e42fee 1907 ret = chowmod(path, destuid, nsgid, 0775);
6efacf80 1908 if (ret < 0)
ccb4cabe 1909 return -1;
c0888dfe 1910
6efacf80
CB
1911 /* Failures to chown() these are inconvenient but not
1912 * detrimental We leave these owned by the container launcher,
1913 * so that container root can write to the files to attach. We
1914 * chmod() them 664 so that container systemd can write to the
1915 * files (which systemd in wily insists on doing).
ab8f5424 1916 */
6efacf80
CB
1917
1918 if (hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1919 fullpath = must_make_path(path, "tasks", NULL);
1920 (void)chowmod(fullpath, destuid, nsgid, 0664);
1921 free(fullpath);
1922 }
43647298
SH
1923
1924 fullpath = must_make_path(path, "cgroup.procs", NULL);
6efacf80 1925 (void)chowmod(fullpath, destuid, 0, 0664);
ccb4cabe 1926 free(fullpath);
0e17357c 1927
d6337a5f 1928 if (hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1929 continue;
1930
1931 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
6efacf80 1932 (void)chowmod(fullpath, destuid, nsgid, 0664);
0e17357c
CB
1933 free(fullpath);
1934
1935 fullpath = must_make_path(path, "cgroup.threads", NULL);
6efacf80 1936 (void)chowmod(fullpath, destuid, nsgid, 0664);
0e17357c 1937 free(fullpath);
ccb4cabe
SH
1938 }
1939
1940 return 0;
1941}
1942
058c1cb6 1943static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
ccb4cabe
SH
1944{
1945 struct cgfsng_handler_data *d = hdata;
4160c3a0 1946 struct generic_userns_exec_data wrap;
ccb4cabe
SH
1947
1948 if (!d)
1949 return false;
1950
1951 if (lxc_list_empty(&conf->id_map))
1952 return true;
1953
ccb4cabe 1954 wrap.origuid = geteuid();
4160c3a0
CB
1955 wrap.path = NULL;
1956 wrap.d = d;
1957 wrap.conf = conf;
ccb4cabe 1958
c9b7c33e
CB
1959 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1960 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1961 ERROR("Error requesting cgroup chown in new namespace");
1962 return false;
1963 }
1964
1965 return true;
1966}
1967
8aa1044f
SH
1968/*
1969 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1970 * symlinks any more - just use mount
1971 */
1972
1973/* mount cgroup-full if requested */
1974static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
a3926f6a 1975 char *container_cgroup)
8aa1044f
SH
1976{
1977 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1978 return 0;
1979 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1980 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1981 dest);
1982 return -1;
1983 }
1984 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1985 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1986 MS_REMOUNT | MS_RDONLY;
1987 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1988 SYSERROR("Error remounting %s readonly", dest);
1989 return -1;
1990 }
1991 }
1992
1993 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1994 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1995 return 0;
1996
1997 /* mount just the container path rw */
1998 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1999 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 2000 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 2001 WARN("Failed to mount %s read-write: %s", rwpath,
2002 strerror(errno));
8aa1044f
SH
2003 INFO("Made %s read-write", rwpath);
2004 free(rwpath);
2005 free(source);
2006 return 0;
2007}
2008
2009/* cgroup-full:* is done, no need to create subdirs */
2010static bool cg_mount_needs_subdirs(int type)
2011{
2012 if (type >= LXC_AUTO_CGROUP_FULL_RO)
2013 return false;
a3926f6a 2014
8aa1044f
SH
2015 return true;
2016}
2017
886cac86
CB
2018/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
2019 * remount controller ro if needed and bindmount the cgroupfs onto
2020 * controll/the/cg/path.
8aa1044f 2021 */
a3926f6a
CB
2022static int do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
2023 char *controllerpath, char *cgpath,
2024 const char *container_cgroup)
8aa1044f 2025{
5285689c 2026 int ret, remount_flags;
886cac86
CB
2027 char *sourcepath;
2028 int flags = MS_BIND;
2029
8aa1044f 2030 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
886cac86
CB
2031 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
2032 if (ret < 0) {
2033 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
2034 controllerpath, controllerpath);
8aa1044f
SH
2035 return -1;
2036 }
886cac86 2037
5285689c
CB
2038 remount_flags = add_required_remount_flags(controllerpath,
2039 controllerpath,
2040 flags | MS_REMOUNT);
886cac86
CB
2041 ret = mount(controllerpath, controllerpath, "cgroup",
2042 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
2043 if (ret < 0) {
2044 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
8aa1044f
SH
2045 return -1;
2046 }
886cac86 2047
8aa1044f
SH
2048 INFO("Remounted %s read-only", controllerpath);
2049 }
886cac86
CB
2050
2051 sourcepath = must_make_path(h->mountpoint, h->base_cgroup,
2052 container_cgroup, NULL);
8aa1044f
SH
2053 if (type == LXC_AUTO_CGROUP_RO)
2054 flags |= MS_RDONLY;
886cac86
CB
2055
2056 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
2057 if (ret < 0) {
2058 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
8aa1044f 2059 free(sourcepath);
8aa1044f
SH
2060 return -1;
2061 }
886cac86 2062 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
f8c40ffa
L
2063
2064 if (flags & MS_RDONLY) {
5285689c
CB
2065 remount_flags = add_required_remount_flags(sourcepath, cgpath,
2066 flags | MS_REMOUNT);
2067 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
886cac86
CB
2068 if (ret < 0) {
2069 SYSERROR("Failed to remount \"%s\" ro", cgpath);
f8c40ffa 2070 free(sourcepath);
f8c40ffa
L
2071 return -1;
2072 }
5285689c 2073 INFO("Remounted %s read-only", cgpath);
f8c40ffa
L
2074 }
2075
8aa1044f 2076 free(sourcepath);
886cac86 2077 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
8aa1044f
SH
2078 return 0;
2079}
2080
5285689c
CB
2081static int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
2082 const char *controllerpath)
b635e92d
CB
2083{
2084 int ret;
2085 char *controllers = NULL;
a760603e
CB
2086 char *fstype = "cgroup2";
2087 unsigned long flags = 0;
b635e92d 2088
a760603e
CB
2089 flags |= MS_NOSUID;
2090 flags |= MS_NOEXEC;
2091 flags |= MS_NODEV;
2092 flags |= MS_RELATIME;
2093
2094 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
2095 flags |= MS_RDONLY;
2096
d6337a5f 2097 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
2098 controllers = lxc_string_join(",", (const char **)h->controllers, false);
2099 if (!controllers)
2100 return -ENOMEM;
2101 fstype = "cgroup";
b635e92d
CB
2102 }
2103
a760603e 2104 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d
CB
2105 free(controllers);
2106 if (ret < 0) {
a760603e 2107 SYSERROR("Failed to mount %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
2108 return -1;
2109 }
2110
a760603e 2111 DEBUG("Mounted %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
2112 return 0;
2113}
2114
ccb4cabe
SH
2115static bool cgfsng_mount(void *hdata, const char *root, int type)
2116{
3f69fb12 2117 int i, ret;
8aa1044f
SH
2118 char *tmpfspath = NULL;
2119 bool retval = false;
b635e92d
CB
2120 struct lxc_handler *handler = hdata;
2121 struct cgfsng_handler_data *d = handler->cgroup_data;
3f69fb12 2122 bool has_cgns = false, wants_force_mount = false;
8aa1044f
SH
2123
2124 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
2125 return true;
2126
3f69fb12
SY
2127 if (type & LXC_AUTO_CGROUP_FORCE) {
2128 type &= ~LXC_AUTO_CGROUP_FORCE;
2129 wants_force_mount = true;
2130 }
b635e92d 2131
3f69fb12
SY
2132 if (!wants_force_mount){
2133 if (!lxc_list_empty(&handler->conf->keepcaps))
2134 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
2135 else
2136 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
2137 }
8aa1044f 2138
3f69fb12
SY
2139 has_cgns = cgns_supported();
2140 if (has_cgns && !wants_force_mount)
2141 return true;
8aa1044f
SH
2142
2143 if (type == LXC_AUTO_CGROUP_NOSPEC)
2144 type = LXC_AUTO_CGROUP_MIXED;
2145 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
2146 type = LXC_AUTO_CGROUP_FULL_MIXED;
2147
2148 /* Mount tmpfs */
3f69fb12
SY
2149 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
2150 ret = safe_mount("cgroup_root", tmpfspath, "tmpfs",
2151 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2152 "size=10240k,mode=755", root);
2153 if (ret < 0)
2154 goto on_error;
8aa1044f 2155
457ca9aa 2156 for (i = 0; hierarchies[i]; i++) {
8aa1044f 2157 char *controllerpath, *path2;
457ca9aa 2158 struct hierarchy *h = hierarchies[i];
8aa1044f 2159 char *controller = strrchr(h->mountpoint, '/');
8aa1044f
SH
2160
2161 if (!controller)
2162 continue;
2163 controller++;
2164 controllerpath = must_make_path(tmpfspath, controller, NULL);
2165 if (dir_exists(controllerpath)) {
2166 free(controllerpath);
2167 continue;
2168 }
3f69fb12
SY
2169 ret = mkdir(controllerpath, 0755);
2170 if (ret < 0) {
8aa1044f
SH
2171 SYSERROR("Error creating cgroup path: %s", controllerpath);
2172 free(controllerpath);
3f69fb12 2173 goto on_error;
8aa1044f 2174 }
b635e92d 2175
3f69fb12 2176 if (has_cgns && wants_force_mount) {
b635e92d
CB
2177 /* If cgroup namespaces are supported but the container
2178 * will not have CAP_SYS_ADMIN after it has started we
2179 * need to mount the cgroups manually.
2180 */
3f69fb12 2181 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
b635e92d 2182 free(controllerpath);
3f69fb12
SY
2183 if (ret < 0)
2184 goto on_error;
2185
b635e92d
CB
2186 continue;
2187 }
2188
3f69fb12
SY
2189 ret = mount_cgroup_full(type, h, controllerpath, d->container_cgroup);
2190 if (ret < 0) {
8aa1044f 2191 free(controllerpath);
3f69fb12 2192 goto on_error;
8aa1044f 2193 }
3f69fb12 2194
8aa1044f
SH
2195 if (!cg_mount_needs_subdirs(type)) {
2196 free(controllerpath);
2197 continue;
2198 }
3f69fb12
SY
2199
2200 path2 = must_make_path(controllerpath, h->base_cgroup,
2201 d->container_cgroup, NULL);
2202 ret = mkdir_p(path2, 0755);
2203 if (ret < 0) {
8aa1044f 2204 free(controllerpath);
8e0c6620 2205 free(path2);
3f69fb12 2206 goto on_error;
8aa1044f 2207 }
2f62fb00 2208
3f69fb12
SY
2209 ret = do_secondstage_mounts_if_needed(
2210 type, h, controllerpath, path2, d->container_cgroup);
8aa1044f
SH
2211 free(controllerpath);
2212 free(path2);
3f69fb12
SY
2213 if (ret < 0)
2214 goto on_error;
8aa1044f
SH
2215 }
2216 retval = true;
2217
3f69fb12 2218on_error:
8aa1044f
SH
2219 free(tmpfspath);
2220 return retval;
ccb4cabe
SH
2221}
2222
/*
 * Count the lines of every "cgroup.procs" file found in @dirname and in all
 * directories below it.
 *
 * Returns 0 when @dirname cannot be opened. A "cgroup.procs" file for which
 * lxc_count_file_lines() reports -1 contributes nothing to the total.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int count = 0, ret;
	char *path;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	/* readdir() returning NULL already terminates the loop, so no extra
	 * NULL check on direntp is needed inside the body.
	 */
	while ((direntp = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		path = must_make_path(dirname, direntp->d_name, NULL);

		/* lstat(): only descend into real directories, never through
		 * symlinks.
		 */
		if (lstat(path, &mystat) == 0 && S_ISDIR(mystat.st_mode))
			count += recursive_count_nrtasks(path);

		free(path);
	}

	/* The directory stream is no longer needed; release it before reading
	 * this level's own cgroup.procs file.
	 */
	(void)closedir(dir);

	path = must_make_path(dirname, "cgroup.procs", NULL);
	ret = lxc_count_file_lines(path);
	if (ret != -1)
		count += ret;
	free(path);

	return count;
}
2267
2268static int cgfsng_nrtasks(void *hdata) {
2269 struct cgfsng_handler_data *d = hdata;
2270 char *path;
2271 int count;
2272
457ca9aa 2273 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 2274 return -1;
a3926f6a 2275
457ca9aa 2276 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
2277 count = recursive_count_nrtasks(path);
2278 free(path);
2279 return count;
2280}
2281
2282/* Only root needs to escape to the cgroup of its init */
7103fe6f 2283static bool cgfsng_escape()
ccb4cabe 2284{
ccb4cabe
SH
2285 int i;
2286
2287 if (geteuid())
2288 return true;
2289
457ca9aa
SH
2290 for (i = 0; hierarchies[i]; i++) {
2291 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
2292 hierarchies[i]->base_cgroup,
ccb4cabe
SH
2293 "cgroup.procs", NULL);
2294 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 2295 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 2296 free(fullpath);
6df334d1 2297 return false;
ccb4cabe
SH
2298 }
2299 free(fullpath);
2300 }
2301
6df334d1 2302 return true;
ccb4cabe
SH
2303}
2304
36662416
TA
2305static int cgfsng_num_hierarchies(void)
2306{
2307 int i;
2308
2309 for (i = 0; hierarchies[i]; i++)
2310 ;
2311
2312 return i;
2313}
2314
2315static bool cgfsng_get_hierarchies(int n, char ***out)
2316{
2317 int i;
2318
2319 /* sanity check n */
6b38e644 2320 for (i = 0; i < n; i++)
36662416
TA
2321 if (!hierarchies[i])
2322 return false;
36662416
TA
2323
2324 *out = hierarchies[i]->controllers;
2325
2326 return true;
2327}
2328
ccb4cabe
SH
2329#define THAWED "THAWED"
2330#define THAWED_LEN (strlen(THAWED))
2331
d6337a5f
CB
2332/* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
2333 * to be adapted.
2334 */
ccb4cabe
SH
2335static bool cgfsng_unfreeze(void *hdata)
2336{
d6337a5f 2337 int ret;
ccb4cabe 2338 char *fullpath;
d6337a5f 2339 struct hierarchy *h;
ccb4cabe 2340
d6337a5f 2341 h = get_hierarchy("freezer");
457ca9aa 2342 if (!h)
ccb4cabe 2343 return false;
d6337a5f 2344
ccb4cabe 2345 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
d6337a5f 2346 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false);
ccb4cabe 2347 free(fullpath);
d6337a5f
CB
2348 if (ret < 0)
2349 return false;
2350
ccb4cabe
SH
2351 return true;
2352}
2353
2354static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
2355{
d6337a5f
CB
2356 struct hierarchy *h;
2357
2358 h = get_hierarchy(subsystem);
ccb4cabe
SH
2359 if (!h)
2360 return NULL;
2361
371f834d
SH
2362 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
2363}
2364
2365/*
2366 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
2367 * full path, which must be freed by the caller.
2368 */
2369static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2370 const char *inpath,
2371 const char *filename)
2372{
371f834d 2373 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2374}
2375
c2aed66d
CB
2376/* Technically, we're always at a delegation boundary here. (This is especially
2377 * true when cgroup namespaces are available.) The reasoning is that in order
2378 * for us to have been able to start a container in the first place the root
2379 * cgroup must have been a leaf node. Now, either the container's init system
2380 * has populated the cgroup and kept it as a leaf node or it has created
2381 * subtrees. In the former case we will simply attach to the leaf node we
2382 * created when we started the container in the latter case we create our own
2383 * cgroup for the attaching process.
2384 */
a3926f6a
CB
2385static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2386 const char *lxcpath, const char *pidstr,
2387 size_t pidstr_len, const char *controller)
c2aed66d
CB
2388{
2389 int ret;
2390 size_t len;
2391 int fret = -1, idx = 0;
2392 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
2393
2394 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2395 /* not running */
2396 if (!container_cgroup)
2397 return 0;
2398
2399 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2400 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2401 /* cgroup is populated */
2402 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false);
2403 if (ret < 0 && errno != EBUSY)
2404 goto on_error;
2405
2406 if (ret == 0)
2407 goto on_success;
2408
2409 free(full_path);
2410
2411 len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
2412 sizeof("/cgroup-procs") - 1;
2413 full_path = must_alloc(len + 1);
2414 do {
2415 if (idx)
2416 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2417 base_path, idx);
2418 else
2419 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2420 if (ret < 0 || (size_t)ret >= len + 1)
2421 goto on_error;
2422
2423 ret = mkdir_p(full_path, 0755);
2424 if (ret < 0 && errno != EEXIST)
2425 goto on_error;
2426
2427 strcat(full_path, "/cgroup.procs");
2428 ret = lxc_write_to_file(full_path, pidstr, len, false);
2429 if (ret == 0)
2430 goto on_success;
2431
2432 /* this is a non-leaf node */
2433 if (errno != EBUSY)
2434 goto on_error;
2435
2436 } while (++idx > 0 && idx < 1000);
2437
2438on_success:
2439 if (idx < 1000)
2440 fret = 0;
2441
2442on_error:
2443 free(base_path);
2444 free(container_cgroup);
2445 free(full_path);
2446
2447 return fret;
2448}
2449
ccb4cabe
SH
2450static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
2451{
c2aed66d 2452 int i, len, ret;
ccb4cabe 2453 char pidstr[25];
ccb4cabe
SH
2454
2455 len = snprintf(pidstr, 25, "%d", pid);
2456 if (len < 0 || len > 25)
2457 return false;
2458
457ca9aa 2459 for (i = 0; hierarchies[i]; i++) {
c2aed66d
CB
2460 char *path;
2461 char *fullpath = NULL;
457ca9aa 2462 struct hierarchy *h = hierarchies[i];
ccb4cabe 2463
c2aed66d 2464 if (h->version == CGROUP2_SUPER_MAGIC) {
a3926f6a
CB
2465 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2466 h->controllers[0]);
c2aed66d
CB
2467 if (ret < 0)
2468 return false;
2469
2470 continue;
2471 }
2472
ccb4cabe 2473 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2474 /* not running */
2475 if (!path)
ccb4cabe
SH
2476 continue;
2477
371f834d 2478 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
c2aed66d
CB
2479 ret = lxc_write_to_file(fullpath, pidstr, len, false);
2480 if (ret < 0) {
ccb4cabe
SH
2481 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2482 free(fullpath);
ccb4cabe
SH
2483 return false;
2484 }
ccb4cabe
SH
2485 free(fullpath);
2486 }
2487
ccb4cabe
SH
2488 return true;
2489}
2490
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The controller is the part of @filename before the first '.'. Returns the
 * result of lxc_read_from_file(), or -1 if no cgroup path is returned or the
 * controller has no hierarchy.
 */
static int cgfsng_get(const char *filename, char *value, size_t len,
		      const char *name, const char *lxcpath)
{
	size_t prefix_len = strcspn(filename, ".");
	char *ctrl, *cgroup, *target;
	struct hierarchy *h;
	int rc;

	ctrl = alloca(prefix_len + 1);
	memcpy(ctrl, filename, prefix_len);
	ctrl[prefix_len] = '\0';

	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, ctrl);
	/* not running */
	if (!cgroup)
		return -1;

	h = get_hierarchy(ctrl);
	if (!h) {
		free(cgroup);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(h, cgroup, filename);
	rc = lxc_read_from_file(target, value, len);
	free(target);
	free(cgroup);

	return rc;
}
2528
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The controller is the part of @filename before the first '.'. Returns the
 * result of lxc_write_to_file(), or -1 if no cgroup path is returned or the
 * controller has no hierarchy.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name,
		      const char *lxcpath)
{
	size_t prefix_len = strcspn(filename, ".");
	char *ctrl, *cgroup, *target;
	struct hierarchy *h;
	int rc;

	ctrl = alloca(prefix_len + 1);
	memcpy(ctrl, filename, prefix_len);
	ctrl[prefix_len] = '\0';

	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, ctrl);
	/* not running */
	if (!cgroup)
		return -1;

	h = get_hierarchy(ctrl);
	if (!h) {
		free(cgroup);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(h, cgroup, filename);
	rc = lxc_write_to_file(target, value, strlen(value), false);
	free(target);
	free(cgroup);

	return rc;
}
2566
72add155
SH
2567/*
2568 * take devices cgroup line
2569 * /dev/foo rwx
2570 * and convert it to a valid
2571 * type major:minor mode
2572 * line. Return <0 on error. Dest is a preallocated buffer
2573 * long enough to hold the output.
2574 */
2575static int convert_devpath(const char *invalue, char *dest)
2576{
2a06d041
CB
2577 int n_parts;
2578 char *p, *path, type;
72add155
SH
2579 struct stat sb;
2580 unsigned long minor, major;
2a06d041
CB
2581 int ret = -EINVAL;
2582 char *mode = NULL;
72add155
SH
2583
2584 path = must_copy_string(invalue);
2585
2586 /*
2587 * read path followed by mode; ignore any trailing text.
2588 * A ' # comment' would be legal. Technically other text
2589 * is not legal, we could check for that if we cared to
2590 */
2591 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2c2d6c49
SH
2592 if (*p != ' ')
2593 continue;
2594 *p = '\0';
2595 if (n_parts != 1)
2596 break;
2597 p++;
2598 n_parts++;
2599 while (*p == ' ')
2600 p++;
2601 mode = p;
2602 if (*p == '\0')
2603 goto out;
72add155 2604 }
2c2d6c49
SH
2605
2606 if (n_parts == 1)
72add155 2607 goto out;
72add155
SH
2608
2609 ret = stat(path, &sb);
2610 if (ret < 0)
2611 goto out;
2612
72add155
SH
2613 mode_t m = sb.st_mode & S_IFMT;
2614 switch (m) {
2615 case S_IFBLK:
2616 type = 'b';
2617 break;
2618 case S_IFCHR:
2619 type = 'c';
2620 break;
2c2d6c49 2621 default:
72add155
SH
2622 ERROR("Unsupported device type %i for %s", m, path);
2623 ret = -EINVAL;
2624 goto out;
2625 }
2c2d6c49
SH
2626
2627 major = MAJOR(sb.st_rdev);
2628 minor = MINOR(sb.st_rdev);
2629 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2630 if (ret < 0 || ret >= 50) {
2a06d041
CB
2631 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2632 "chars)", type, major, minor, mode);
72add155
SH
2633 ret = -ENAMETOOLONG;
2634 goto out;
2635 }
2636 ret = 0;
2637
2638out:
2639 free(path);
2640 return ret;
2641}
2642
ccb4cabe
SH
2643/*
2644 * Called from setup_limits - here we have the container's cgroup_data because
2645 * we created the cgroups
2646 */
a3926f6a
CB
2647static int cg_legacy_set_data(const char *filename, const char *value,
2648 struct cgfsng_handler_data *d)
ccb4cabe 2649{
b3646d7e 2650 char *fullpath, *p;
ab1a6cac 2651 size_t len;
1a0e70ac
CB
2652 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2653 char converted_value[50];
b3646d7e
CB
2654 struct hierarchy *h;
2655 int ret = 0;
2656 char *controller = NULL;
ccb4cabe 2657
ab1a6cac
CB
2658 len = strlen(filename);
2659 controller = alloca(len + 1);
b3646d7e 2660 strcpy(controller, filename);
ab1a6cac
CB
2661 p = strchr(controller, '.');
2662 if (p)
ccb4cabe
SH
2663 *p = '\0';
2664
c8bf519d 2665 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2666 ret = convert_devpath(value, converted_value);
2667 if (ret < 0)
c8bf519d 2668 return ret;
72add155 2669 value = converted_value;
c8bf519d 2670 }
2671
b3646d7e
CB
2672 h = get_hierarchy(controller);
2673 if (!h) {
2674 ERROR("Failed to setup limits for the \"%s\" controller. "
2675 "The controller seems to be unused by \"cgfsng\" cgroup "
2676 "driver or not enabled on the cgroup hierarchy",
2677 controller);
d1953b26 2678 errno = ENOENT;
ab1a6cac 2679 return -ENOENT;
ccb4cabe 2680 }
b3646d7e
CB
2681
2682 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2683 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
2684 free(fullpath);
ccb4cabe
SH
2685 return ret;
2686}
2687
a3926f6a
CB
2688static bool __cg_legacy_setup_limits(void *hdata,
2689 struct lxc_list *cgroup_settings,
2690 bool do_devices)
ccb4cabe
SH
2691{
2692 struct cgfsng_handler_data *d = hdata;
2693 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2694 struct lxc_cgroup *cg;
ccb4cabe
SH
2695 bool ret = false;
2696
2697 if (lxc_list_empty(cgroup_settings))
2698 return true;
2699
2700 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2701 if (!sorted_cgroup_settings)
ccb4cabe 2702 return false;
ccb4cabe 2703
ccb4cabe
SH
2704 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2705 cg = iterator->elem;
2706
2707 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
a3926f6a 2708 if (cg_legacy_set_data(cg->subsystem, cg->value, d)) {
ccb4cabe
SH
2709 if (do_devices && (errno == EACCES || errno == EPERM)) {
2710 WARN("Error setting %s to %s for %s",
2711 cg->subsystem, cg->value, d->name);
2712 continue;
2713 }
2714 SYSERROR("Error setting %s to %s for %s",
2715 cg->subsystem, cg->value, d->name);
2716 goto out;
2717 }
6a628f4a 2718 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 2719 }
ccb4cabe
SH
2720 }
2721
2722 ret = true;
6b38e644 2723 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2724out:
ccb4cabe
SH
2725 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2726 lxc_list_del(iterator);
2727 free(iterator);
2728 }
2729 free(sorted_cgroup_settings);
2730 return ret;
2731}
2732
a3926f6a
CB
2733static bool __cg_unified_setup_limits(void *hdata,
2734 struct lxc_list *cgroup_settings)
6b38e644
CB
2735{
2736 struct lxc_list *iterator;
2737 struct hierarchy *h = unified;
2738
2739 if (lxc_list_empty(cgroup_settings))
2740 return true;
2741
2742 if (!h)
2743 return false;
2744
2745 lxc_list_for_each(iterator, cgroup_settings) {
2746 int ret;
2747 char *fullpath;
2748 struct lxc_cgroup *cg = iterator->elem;
2749
2750 fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
2751 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false);
2752 free(fullpath);
2753 if (ret < 0) {
2754 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2755 return false;
2756 }
2757 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2758 }
2759
2760 INFO("Limits for the unified cgroup hierarchy have been setup");
2761 return true;
2762}
2763
2764static bool cgfsng_setup_limits(void *hdata, struct lxc_conf *conf,
2765 bool do_devices)
2766{
2767 bool bret;
2768
a3926f6a 2769 bret = __cg_legacy_setup_limits(hdata, &conf->cgroup, do_devices);
6b38e644
CB
2770 if (!bret)
2771 return false;
2772
a3926f6a 2773 return __cg_unified_setup_limits(hdata, &conf->cgroup2);
6b38e644
CB
2774}
2775
ccb4cabe
SH
2776static struct cgroup_ops cgfsng_ops = {
2777 .init = cgfsng_init,
2778 .destroy = cgfsng_destroy,
2779 .create = cgfsng_create,
2780 .enter = cgfsng_enter,
ccb4cabe 2781 .escape = cgfsng_escape,
36662416
TA
2782 .num_hierarchies = cgfsng_num_hierarchies,
2783 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
2784 .get_cgroup = cgfsng_get_cgroup,
2785 .get = cgfsng_get,
2786 .set = cgfsng_set,
2787 .unfreeze = cgfsng_unfreeze,
2788 .setup_limits = cgfsng_setup_limits,
2789 .name = "cgroupfs-ng",
2790 .attach = cgfsng_attach,
058c1cb6 2791 .chown = cgfsng_chown,
ccb4cabe
SH
2792 .mount_cgroup = cgfsng_mount,
2793 .nrtasks = cgfsng_nrtasks,
2794 .driver = CGFSNG,
2795
2796 /* unsupported */
2797 .create_legacy = NULL,
2798};