]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
coverity: #1425858
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
c8bf519d 50#include <linux/types.h>
51#include <linux/kdev_t.h>
52
b635e92d 53#include "caps.h"
ccb4cabe 54#include "cgroup.h"
6328fd9c 55#include "cgroup_utils.h"
ccb4cabe 56#include "commands.h"
43654d34 57#include "conf.h"
a54694f8 58#include "log.h"
43654d34 59#include "storage/storage.h"
a54694f8 60#include "utils.h"
ccb4cabe
SH
61
62lxc_log_define(lxc_cgfsng, lxc);
63
64static struct cgroup_ops cgfsng_ops;
65
ccb4cabe
SH
/*
 * A descriptor for a mounted hierarchy
 * @controllers:  either NULL, or a null-terminated list of all the
 *                co-mounted controllers
 * @mountpoint:   the mountpoint we will use. It will be either
 *                /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup:  the cgroup under which the container cgroup path is
 *                created. This will be either the caller's cgroup (if not
 *                root), or init's cgroup (if root).
 * @fullcgpath:   set to NULL when the hierarchy is registered
 * @is_cgroup_v2: true when the first entry of @controllers is "cgroup2"
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
	bool is_cgroup_v2;
};
83
84/*
85 * The cgroup data which is attached to the lxc_handler.
43654d34
CB
86 * @cgroup_pattern : A copy of the lxc.cgroup.pattern
87 * @container_cgroup : If not null, the cgroup which was created for the
88 * container. For each hierarchy, it is created under the
89 * @hierarchy->base_cgroup directory. Relative to the
90 * base_cgroup it is the same for all hierarchies.
91 * @name : The name of the container.
92 * @cgroup_meta : A copy of the container's cgroup information. This
93 * overrides @cgroup_pattern.
ccb4cabe
SH
94 */
95struct cgfsng_handler_data {
ccb4cabe 96 char *cgroup_pattern;
1a0e70ac
CB
97 char *container_cgroup; /* cgroup we created for the container */
98 char *name; /* container name */
43654d34
CB
99 /* per-container cgroup information */
100 struct lxc_cgroup cgroup_meta;
ccb4cabe
SH
101};
102
457ca9aa
SH
/*
 * @hierarchies - a NULL-terminated array of struct hierarchy pointers, one
 * per hierarchy. No duplicates. First sufficient, writeable mounted
 * hierarchy wins.
 */
struct hierarchy **hierarchies;

/*
 * @cgroup_use - a copy of the lxc.cgroup.use global configuration value
 */
char *cgroup_use;

/*
 * @lxc_cgfsng_debug - whether to print debug info to stdout for the cgfsng
 * driver
 */
static bool lxc_cgfsng_debug;
120
ccb4cabe
SH
/*
 * Release a NULL-terminated array of heap-allocated strings together with
 * the array itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **it;

	if (!clist)
		return;

	for (it = clist; *it; it++)
		free(*it);

	free(clist);
}
131
ccb4cabe
SH
/*
 * Allocate @sz bytes through must_realloc() with no previous buffer.
 * Do not fail.
 */
static void *must_alloc(size_t sz)
{
	void *fresh = must_realloc(NULL, sz);

	return fresh;
}
137
ccb4cabe
SH
/*
 * This is a special case - return a newly allocated copy of @entry with
 * 'name=' prepended. I.e. turn systemd into name=systemd.
 * Do not fail.
 */
static char *must_prefix_named(char *entry)
{
	size_t entrylen = strlen(entry);
	/* 5 bytes for "name=", the entry itself, and the terminating NUL */
	size_t total = entrylen + 6;
	char *named = must_alloc(total);

	snprintf(named, total, "name=%s", entry);
	return named;
}
152
/*
 * Given a pointer to a null-terminated array of pointers (or to NULL for
 * an empty list), grow the array by one slot and write NULL into the new
 * last slot. Do not fail. Return the index of the second-to-last slot -
 * that is, the one which is now available for use (keeping the list
 * null-terminated). The caller is expected to fill that slot.
 */
static int append_null_to_list(void ***list)
{
	int used = 0;

	if (*list) {
		while ((*list)[used])
			used++;
	}

	/* room for the existing entries, the new entry and the terminator */
	*list = must_realloc(*list, (used + 2) * sizeof(void **));
	(*list)[used + 1] = NULL;

	return used;
}
170
/*
 * Given a null-terminated array of strings, check whether @entry is one
 * of the strings. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	if (!list)
		return false;

	for (it = list; *it; it++) {
		if (!strcmp(*it, entry))
			return true;
	}

	return false;
}
187
/*
 * Append a copy of @entry to the controller list @clist. Do not fail.
 * *clist must be NULL the first time we are called.
 *
 * We also handle named subsystems here. Any controller which is neither
 * already prefixed with 'name=', nor a kernel subsystem (listed in @klist),
 * nor the special "cgroup2" entry, gets 'name=' prefixed. Any which is both
 * a kernel and a named subsystem (listed in @klist and @nlist), we refuse
 * to use because we're not sure which we have here; in that case @clist is
 * left untouched. (TODO - we could work around this in some cases by just
 * remounting to be unambiguous, or by comparing mountpoint contents with
 * current cgroup)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	char *stored;
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || !strncmp(entry, "name=", 5) || !strcmp(entry, "cgroup2"))
		stored = must_copy_string(entry);
	else
		stored = must_prefix_named(entry);

	(*clist)[slot] = stored;
}
225
ccb4cabe
SH
226static void free_handler_data(struct cgfsng_handler_data *d)
227{
ccb4cabe
SH
228 free(d->cgroup_pattern);
229 free(d->container_cgroup);
230 free(d->name);
43654d34
CB
231 if (d->cgroup_meta.dir)
232 free(d->cgroup_meta.dir);
233 if (d->cgroup_meta.controllers)
234 free(d->cgroup_meta.controllers);
ccb4cabe
SH
235 free(d);
236}
237
238/*
239 * Given a handler's cgroup data, return the struct hierarchy for the
240 * controller @c, or NULL if there is none.
241 */
457ca9aa 242struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
243{
244 int i;
245
457ca9aa 246 if (!hierarchies)
ccb4cabe 247 return NULL;
457ca9aa
SH
248 for (i = 0; hierarchies[i]; i++) {
249 if (string_in_list(hierarchies[i]->controllers, c))
250 return hierarchies[i];
ccb4cabe
SH
251 }
252 return NULL;
253}
254
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing it in multiples of
 * BATCH_SIZE. The buffer is only reallocated when it does not exist yet or
 * when @newlen falls into a later batch than @oldlen.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
265
/*
 * Append the @newlen bytes of @new plus its terminating NUL to *@dest,
 * which currently holds @oldlen bytes. *@dest is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* one extra byte for the terminating NUL copied below */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	memcpy(*dest + oldlen, new, newlen + 1);
}
274
/*
 * Slurp in a whole file, line by line.
 *
 * Returns a heap-allocated, NUL-terminated buffer holding the file content,
 * or NULL if the file could not be opened or no line could be read.
 */
static char *read_file(char *fnam)
{
	FILE *fp;
	char *content = NULL, *linebuf = NULL;
	size_t linecap = 0, total = 0;
	int got;

	fp = fopen(fnam, "r");
	if (!fp)
		return NULL;

	for (;;) {
		got = getline(&linebuf, &linecap, fp);
		if (got == -1)
			break;

		append_line(&content, total, linebuf, got);
		total += got;
	}

	fclose(fp);
	free(linebuf);
	return content;
}
294
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The mask is built from an unsigned 32-bit one in all three helpers:
 * shifting a plain (signed) int 1 into bit 31 is undefined behaviour.
 */

/* Set bit number @bit in the uint32_t bit array @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= ((uint32_t)1 << (bit % NBITS));
}

/* Clear bit number @bit in the uint32_t bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~((uint32_t)1 << (bit % NBITS));
}

/* Return true if bit number @bit is set in the uint32_t bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & ((uint32_t)1 << (bit % NBITS))) != 0;
}
314
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenized in place and therefore modified. @nbits is the number
 * of bits the mask has to hold. Returns a calloc()ed array the caller must
 * free, or NULL on allocation failure, on a range whose start is larger
 * than its end, or on a cpu number that does not fit into @nbits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *state = NULL;
	char *tok;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!mask)
		return NULL;

	tok = strtok_r(buf, ",", &state);
	while (tok) {
		unsigned first, last, cpu;
		char *dash;

		errno = 0;
		first = strtoul(tok, NULL, 0);
		last = first;

		/* "a-b" denotes a range, a lone "a" a single cpu */
		dash = strchr(tok, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (cpu = first; cpu <= last; cpu++)
			set_bit(cpu, mask);

		tok = strtok_r(NULL, ",", &state);
	}

	return mask;
}
356
a54694f8
CB
357/* Turn cpumask into simple, comma-separated cpulist. */
358static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
359{
360 size_t i;
361 int ret;
eab15c1e 362 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
363 char **cpulist = NULL;
364
365 for (i = 0; i <= nbits; i++) {
366 if (is_set(i, bitarr)) {
eab15c1e
CB
367 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
368 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
369 lxc_free_array((void **)cpulist, free);
370 return NULL;
371 }
372 if (lxc_append_string(&cpulist, numstr) < 0) {
373 lxc_free_array((void **)cpulist, free);
374 return NULL;
375 }
376 }
377 }
378 return lxc_string_join(",", (const char **)cpulist, false);
379}
380
/*
 * Parse the number that follows the last ',' or '-' in @cpulist (or the
 * whole string if it contains neither), i.e. the final cpu number of a
 * list such as "0,2-7".
 *
 * Returns that number, or -1 if strtoul() reported an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last_comma, *last_dash;
	char *tail = cpulist;
	size_t cpus = 0;

	last_comma = strrchr(cpulist, ',');
	last_dash = strrchr(cpulist, '-');

	/* Pick whichever separator occurs later in the string. Only pointers
	 * into @cpulist are ever compared with each other.
	 */
	if (last_comma)
		tail = last_comma + 1;
	if (last_dash && last_dash + 1 > tail)
		tail = last_dash + 1;

	errno = 0;
	cpus = strtoul(tail, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
415
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Write a "cpuset.cpus" value into the cgroup directory @path, based on the
 * "cpuset.cpus" of its parent directory with any cpus listed in __ISOL_CPUS
 * removed.
 *
 * @path:           cgroup directory; must contain a '/'. It is temporarily
 *                  truncated to build the parent's file name and is intact
 *                  again on every return.
 * @am_initialized: whether somebody else already initialized the cgroup. If
 *                  so and no isolated cpus have to be removed, nothing is
 *                  written.
 *
 * Returns true on success, false on error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/* Cut off the last component just long enough to build the parent's
	 * file name. Restoring right away keeps @path valid for the caller no
	 * matter which exit is taken below.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	/* <ctype.h> functions require a value representable as unsigned char */
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is a bit count: highest cpu number plus one. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1; bit maxposs lies outside the
	 * arrays whenever maxposs is a multiple of 32.
	 */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		/* NOTE(review): nothing is written here even when
		 * !am_initialized, unlike the two "no isolated cpus" cases
		 * above - confirm this is intended.
		 */
		DEBUG("No isolated cpus present in cpuset.");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset.");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is only freed once */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
553
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file
 *
 * @path must contain a '/'. It is temporarily truncated to build the
 * parent's file name and is intact again on every return.
 *
 * Returns true if the value was read and written successfully.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	/* Restore @path right after building the parent's file name so that
	 * it is never left truncated, not even on the error path.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* the first call only determines the length of the content */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* terminate the buffer: it is printed with %s below */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
591
592/*
593 * Initialize the cpuset hierarchy in first directory of @gname and
594 * set cgroup.clone_children so that children inherit settings.
595 * Since the h->base_path is populated by init or ourselves, we know
596 * it is already initialized.
597 */
a54694f8 598static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
599{
600 char *cgpath, *clonechildrenpath, v, *slash;
601
602 if (!string_in_list(h->controllers, "cpuset"))
603 return true;
604
605 if (*cgname == '/')
606 cgname++;
607 slash = strchr(cgname, '/');
608 if (slash)
609 *slash = '\0';
610
611 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
612 if (slash)
613 *slash = '/';
614 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
615 SYSERROR("Failed to create '%s'", cgpath);
616 free(cgpath);
617 return false;
618 }
6f9584d8 619
e3a3fecf 620 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
621 /* unified hierarchy doesn't have clone_children */
622 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
623 free(clonechildrenpath);
624 free(cgpath);
625 return true;
626 }
627 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
628 SYSERROR("Failed to read '%s'", clonechildrenpath);
629 free(clonechildrenpath);
630 free(cgpath);
631 return false;
632 }
633
a54694f8 634 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
635 if (!filter_and_set_cpus(cgpath, v == '1')) {
636 SYSERROR("Failed to remove isolated cpus.");
637 free(clonechildrenpath);
638 free(cgpath);
a54694f8 639 return false;
6f9584d8 640 }
a54694f8 641
e3a3fecf 642 if (v == '1') { /* already set for us by someone else */
6f9584d8 643 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
644 free(clonechildrenpath);
645 free(cgpath);
646 return true;
647 }
648
649 /* copy parent's settings */
a54694f8 650 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 651 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
652 free(cgpath);
653 free(clonechildrenpath);
654 return false;
655 }
656 free(cgpath);
657
658 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
659 /* Set clone_children so children inherit our settings */
660 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
661 free(clonechildrenpath);
662 return false;
663 }
664 free(clonechildrenpath);
665 return true;
666}
667
ccb4cabe
SH
/*
 * Given two null-terminated lists of strings, return true if any string
 * is in both. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it;

	if (!l1 || !l2)
		return false;

	for (it = l1; *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
685
686/*
687 * For a null-terminated list of controllers @clist, return true if any of
688 * those controllers is already listed the null-terminated list of
689 * hierarchies @hlist. Realistically, if one is present, all must be present.
690 */
691static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
692{
693 int i;
694
695 if (!hlist)
696 return false;
697 for (i = 0; hlist[i]; i++)
698 if (controller_lists_intersect(hlist[i]->controllers, clist))
699 return true;
700 return false;
701
702}
703
704/*
705 * Return true if the controller @entry is found in the null-terminated
706 * list of hierarchies @hlist
707 */
708static bool controller_found(struct hierarchy **hlist, char *entry)
709{
710 int i;
711 if (!hlist)
712 return false;
713
714 for (i = 0; hlist[i]; i++)
715 if (string_in_list(hlist[i]->controllers, entry))
716 return true;
717 return false;
718}
719
720/*
c30b61c3
SH
721 * Return true if all of the controllers which we require have been found.
722 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 723 */
457ca9aa 724static bool all_controllers_found(void)
ccb4cabe
SH
725{
726 char *p, *saveptr = NULL;
457ca9aa 727 struct hierarchy ** hlist = hierarchies;
ccb4cabe 728
ccb4cabe 729 if (!controller_found(hlist, "freezer")) {
c2712f64 730 ERROR("No freezer controller mountpoint found");
ccb4cabe
SH
731 return false;
732 }
733
457ca9aa 734 if (!cgroup_use)
ccb4cabe 735 return true;
c2712f64 736
457ca9aa 737 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
738 p = strtok_r(NULL, ",", &saveptr)) {
739 if (!controller_found(hlist, p)) {
c2712f64 740 ERROR("No %s controller mountpoint found", p);
ccb4cabe
SH
741 return false;
742 }
743 }
c2712f64 744
ccb4cabe
SH
745 return true;
746}
747
ccb4cabe
SH
748/*
749 * Get the controllers from a mountinfo line
750 * There are other ways we could get this info. For lxcfs, field 3
751 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
752 * options. But we simply assume that the mountpoint must be
753 * /sys/fs/cgroup/controller-list
754 */
49ff3958 755static char **get_controllers(char **klist, char **nlist, char *line, int type)
ccb4cabe 756{
6328fd9c 757 /* the fourth field is /sys/fs/cgroup/comma-delimited-controller-list */
ccb4cabe 758 int i;
411ac6d8 759 char *dup, *p2, *tok;
411ac6d8
CB
760 char *p = line, *saveptr = NULL;
761 char **aret = NULL;
6328fd9c 762
ccb4cabe 763 for (i = 0; i < 4; i++) {
235f1815 764 p = strchr(p, ' ');
ccb4cabe
SH
765 if (!p)
766 return NULL;
767 p++;
768 }
769 if (!p)
770 return NULL;
771 /* note - if we change how mountinfo works, then our caller
772 * will need to verify /sys/fs/cgroup/ in this field */
c2712f64
CB
773 if (strncmp(p, "/sys/fs/cgroup/", 15)) {
774 INFO("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
ccb4cabe 775 return NULL;
5059aae9 776 }
ccb4cabe 777 p += 15;
235f1815 778 p2 = strchr(p, ' ');
ccb4cabe 779 if (!p2) {
c2712f64 780 ERROR("Corrupt mountinfo");
ccb4cabe
SH
781 return NULL;
782 }
783 *p2 = '\0';
6328fd9c
CB
784
785 /* cgroup v2 does not have separate mountpoints for controllers */
49ff3958 786 if (type == CGROUP_V2) {
6328fd9c 787 must_append_controller(klist, nlist, &aret, "cgroup2");
e2deb1d8 788 return aret;
411ac6d8
CB
789 }
790
791 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will destroy
792 * mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
793 */
794 dup = strdup(p);
c2712f64 795 if (!dup)
411ac6d8 796 return NULL;
6328fd9c 797
411ac6d8 798 for (tok = strtok_r(dup, ",", &saveptr); tok;
ccb4cabe
SH
799 tok = strtok_r(NULL, ",", &saveptr)) {
800 must_append_controller(klist, nlist, &aret, tok);
801 }
802
411ac6d8 803 free(dup);
ccb4cabe
SH
804 return aret;
805}
806
ccb4cabe 807/* Add a controller to our list of hierarchies */
457ca9aa 808static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
809{
810 struct hierarchy *new;
811 int newentry;
812
813 new = must_alloc(sizeof(*new));
814 new->controllers = clist;
815 new->mountpoint = mountpoint;
816 new->base_cgroup = base_cgroup;
817 new->fullcgpath = NULL;
818
6328fd9c 819 /* record if this is the cgroup v2 hierarchy */
cdfe90a4 820 if (clist && !strcmp(*clist, "cgroup2"))
6328fd9c
CB
821 new->is_cgroup_v2 = true;
822 else
823 new->is_cgroup_v2 = false;
824
457ca9aa
SH
825 newentry = append_null_to_list((void ***)&hierarchies);
826 hierarchies[newentry] = new;
ccb4cabe
SH
827}
828
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo. Everything after the fourth space up to the
 * terminating '\0' is copied.
 *
 * Returns a newly allocated string, or NULL if @line contains fewer than
 * four spaces.
 */
static char *get_mountpoint(char *line)
{
	char *field = line, *copy;
	size_t fieldlen;
	int skipped = 0;

	while (skipped < 4) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
		skipped++;
	}

	/* we've already stuck a \0 after the mountpoint */
	fieldlen = strlen(field);
	copy = must_alloc(fieldlen + 1);
	/* the extra byte copied is the terminating '\0' */
	memcpy(copy, field, fieldlen + 1);
	return copy;
}
852
/*
 * Given a multi-line string, return a newly allocated, null-terminated
 * copy of the current line without its '\n'. Returns NULL if there is no
 * '\n' at or after @p.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t linelen;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	linelen = newline - p;
	copy = must_alloc(linelen + 1);
	memcpy(copy, p, linelen);
	copy[linelen] = '\0';

	return copy;
}
871
/*
 * cgline: pointer to character after the first ':' in a line in a
 * \n-terminated /proc/self/cgroup file. Check whether controller c is
 * present in the comma-separated list that runs up to the next ':'.
 * Empty list entries never match. Returns false if there is no ':'.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *list_end, *seg, *seg_end;
	size_t want = strlen(c);

	list_end = strchr(cgline, ':');
	if (!list_end)
		return false;

	/* walk the comma-separated segments in place, without copying */
	for (seg = cgline; seg < list_end; seg = seg_end + 1) {
		size_t seglen;

		seg_end = memchr(seg, ',', list_end - seg);
		if (!seg_end)
			seg_end = list_end;

		seglen = seg_end - seg;
		if (seglen > 0 && seglen == want && !memcmp(seg, c, want))
			return true;
	}

	return false;
}
898
/*
 * @basecginfo is a copy of /proc/$$/cgroup. Return a newly allocated copy
 * of the current cgroup for @controller, or NULL if no matching line is
 * found.
 *
 * For the controller "cgroup2" a line whose first character is '0' is
 * taken as the match.
 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	char *p = basecginfo;
	bool is_cgroup_v2 = !strcmp(controller, "cgroup2");

	for (;;) {
		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		bool is_cgroup_v2_base_cgroup = is_cgroup_v2 && (*p == '0');

		/* step over the hierarchy id to the controller list */
		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgroup_v2_base_cgroup || controller_in_clist(p, controller)) {
			/* step over the controller list to the cgroup path */
			p = strchr(p, ':');
			if (!p)
				return NULL;
			return copy_to_eol(p + 1);
		}

		/* no match, move on to the next line */
		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}
934
ccb4cabe
SH
/*
 * Append a copy of @entry to the null-terminated string list @list.
 * Do not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot;

	slot = append_null_to_list((void ***)list);
	(*list)[slot] = must_copy_string(entry);
}
943
/*
 * Collect the subsystems listed in /proc/self/cgroup. Entries starting with
 * "name=" are appended to @nlist, all others to @klist. If the file cannot
 * be opened both lists are left untouched.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *fp;
	char *linebuf = NULL;
	size_t linecap = 0;

	fp = fopen("/proc/self/cgroup", "r");
	if (!fp)
		return;

	while (getline(&linebuf, &linecap, fp) != -1) {
		char *ctrls, *ctrls_end, *name, *state = NULL;

		/* the controller list sits between the first two ':' */
		ctrls = strchr(linebuf, ':');
		if (!ctrls)
			continue;
		ctrls++;

		ctrls_end = strchr(ctrls, ':');
		if (!ctrls_end)
			continue;
		*ctrls_end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (ctrls_end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		name = strtok_r(ctrls, ",", &state);
		while (name) {
			if (!strncmp(name, "name=", 5))
				must_append_string(nlist, name);
			else
				must_append_string(klist, name);
			name = strtok_r(NULL, ",", &state);
		}
	}

	free(linebuf);
	fclose(fp);
}
987
/*
 * Strip trailing '\n' characters from @s in place. The first character of
 * @s is never removed, so a string consisting of a single '\n' is kept.
 */
static void trim(char *s)
{
	char *end = s + strlen(s);

	while ((end - s) > 1 && end[-1] == '\n')
		*--end = '\0';
}
994
e4aeecf5
CB
995static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
996{
997 printf("Cgroup information:\n");
998 printf(" container name: %s\n", d->name ? d->name : "(null)");
999 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
43654d34
CB
1000 printf(" lxc.cgroup.pattern: %s\n",
1001 d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1002 printf(" lxc.cgroup.dir: %s\n",
1003 d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
1004 printf(" cgroup: %s\n",
1005 d->container_cgroup ? d->container_cgroup : "(null)");
e4aeecf5
CB
1006}
1007
1008static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1009{
a7b0cc4c 1010 struct hierarchy **it;
ccb4cabe 1011 int i;
41c33dbe 1012
457ca9aa 1013 if (!hierarchies) {
c2712f64 1014 printf(" No hierarchies found\n");
ccb4cabe
SH
1015 return;
1016 }
e4aeecf5 1017 printf(" Hierarchies:\n");
a7b0cc4c
CB
1018 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1019 char **cit;
ccb4cabe 1020 int j;
c2712f64
CB
1021 printf(" %d: base_cgroup: %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1022 printf(" mountpoint: %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
e4aeecf5 1023 printf(" controllers:\n");
a7b0cc4c 1024 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1025 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1026 }
1027}
41c33dbe 1028
/*
 * Print @basecginfo followed by the numbered entries of the kernel
 * subsystem list @klist and the named subsystem list @nlist to stdout.
 * Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	if (klist) {
		for (idx = 0; klist[idx]; idx++)
			printf("kernel subsystem %d: %s\n", idx, klist[idx]);
	}

	if (nlist) {
		for (idx = 0; nlist[idx]; idx++)
			printf("named subsystem %d: %s\n", idx, nlist[idx]);
	}
}
ccb4cabe 1042
e4aeecf5
CB
/*
 * Print the handler data @d followed by the global hierarchy list to
 * stdout.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	/* per-container information first, then the hierarchies */
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1048
ccb4cabe
SH
1049/*
1050 * At startup, parse_hierarchies finds all the info we need about
1051 * cgroup mountpoints and current cgroups, and stores it in @d.
1052 */
457ca9aa 1053static bool parse_hierarchies(void)
ccb4cabe
SH
1054{
1055 FILE *f;
1056 char * line = NULL, *basecginfo;
1057 char **klist = NULL, **nlist = NULL;
1058 size_t len = 0;
1059
d30ec4cb
SH
1060 /*
1061 * Root spawned containers escape the current cgroup, so use init's
1062 * cgroups as our base in that case.
1063 */
ccb4cabe
SH
1064 if (geteuid())
1065 basecginfo = read_file("/proc/self/cgroup");
1066 else
1067 basecginfo = read_file("/proc/1/cgroup");
1068 if (!basecginfo)
1069 return false;
1070
1071 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
c2712f64 1072 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
ccb4cabe
SH
1073 return false;
1074 }
1075
1076 get_existing_subsystems(&klist, &nlist);
41c33dbe 1077
e4aeecf5
CB
1078 if (lxc_cgfsng_debug)
1079 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1080
1081 /* we support simple cgroup mounts and lxcfs mounts */
1082 while (getline(&line, &len, f) != -1) {
1083 char **controller_list = NULL;
1084 char *mountpoint, *base_cgroup;
49ff3958
CB
1085 bool writeable;
1086 int type;
ccb4cabe 1087
49ff3958
CB
1088 type = get_cgroup_version(line);
1089 if (type < 0)
ccb4cabe
SH
1090 continue;
1091
49ff3958 1092 controller_list = get_controllers(klist, nlist, line, type);
ccb4cabe
SH
1093 if (!controller_list)
1094 continue;
1095
457ca9aa 1096 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1097 free(controller_list);
1098 continue;
1099 }
1100
1101 mountpoint = get_mountpoint(line);
1102 if (!mountpoint) {
c2712f64 1103 ERROR("Failed parsing mountpoint from \"%s\"", line);
ccb4cabe
SH
1104 free_string_list(controller_list);
1105 continue;
1106 }
1107
1108 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1109 if (!base_cgroup) {
c2712f64 1110 ERROR("Failed to find current cgroup for controller \"%s\"", controller_list[0]);
ccb4cabe
SH
1111 free_string_list(controller_list);
1112 free(mountpoint);
1113 continue;
1114 }
6328fd9c 1115
ccb4cabe
SH
1116 trim(base_cgroup);
1117 prune_init_scope(base_cgroup);
49ff3958 1118 if (type == CGROUP_V2)
6328fd9c
CB
1119 writeable = test_writeable_v2(mountpoint, base_cgroup);
1120 else
1121 writeable = test_writeable_v1(mountpoint, base_cgroup);
1122 if (!writeable) {
ccb4cabe
SH
1123 free_string_list(controller_list);
1124 free(mountpoint);
1125 free(base_cgroup);
1126 continue;
1127 }
457ca9aa 1128 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1129 }
1130
1131 free_string_list(klist);
1132 free_string_list(nlist);
1133
1134 free(basecginfo);
1135
1136 fclose(f);
1137 free(line);
1138
e4aeecf5
CB
1139 if (lxc_cgfsng_debug) {
1140 printf("writeable subsystems:\n");
1141 lxc_cgfsng_print_hierarchies();
1142 }
1143
ccb4cabe
SH
1144 /* verify that all controllers in cgroup.use and all crucial
1145 * controllers are accounted for
1146 */
c2712f64 1147 if (!all_controllers_found())
ccb4cabe
SH
1148 return false;
1149
1150 return true;
1151}
1152
457ca9aa
SH
1153static bool collect_hierarchy_info(void)
1154{
1155 const char *tmp;
1156 errno = 0;
1157 tmp = lxc_global_config_value("lxc.cgroup.use");
1a0e70ac 1158 if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
c2712f64 1159 ERROR("Failed to retrieve list of cgroups to use");
457ca9aa
SH
1160 return false;
1161 }
1162 cgroup_use = must_copy_string(tmp);
1163
1164 return parse_hierarchies();
1165}
1166
43654d34 1167static void *cgfsng_init(struct lxc_handler *handler)
ccb4cabe 1168{
457ca9aa 1169 const char *cgroup_pattern;
43654d34 1170 struct cgfsng_handler_data *d;
ccb4cabe
SH
1171
1172 d = must_alloc(sizeof(*d));
1173 memset(d, 0, sizeof(*d));
1174
43654d34
CB
1175 /* copy container name */
1176 d->name = must_copy_string(handler->name);
1177
1178 /* copy per-container cgroup information */
ae5e6c08
CB
1179 d->cgroup_meta.dir = NULL;
1180 d->cgroup_meta.controllers = NULL;
9b5396f9
CB
1181 if (handler->conf) {
1182 d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
1183 d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
1184 }
ccb4cabe 1185
43654d34 1186 /* copy system-wide cgroup information */
ccb4cabe 1187 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
43654d34
CB
1188 if (!cgroup_pattern) {
1189 /* lxc.cgroup.pattern is only NULL on error. */
ccb4cabe
SH
1190 ERROR("Error getting cgroup pattern");
1191 goto out_free;
1192 }
1193 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1194
e4aeecf5
CB
1195 if (lxc_cgfsng_debug)
1196 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1197
1198 return d;
1199
1200out_free:
1201 free_handler_data(d);
1202 return NULL;
1203}
1204
ccb4cabe
SH
/*
 * Recursively remove the cgroup directory @dirname.
 *
 * Sub-directories are removed depth-first; non-directory entries are left
 * alone and @dirname itself is removed last with rmdir(). Only the first
 * failure is logged. Returns 0 on success and -1 if any step failed.
 */
static int cgroup_rmdir(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat %s", pathname);
			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = cgroup_rmdir(pathname);
		if (ret < 0)
			r = -1;
next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("Failed to delete \"%s\": %s", dirname,
			     strerror(errno));
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		if (!r)
			WARN("Failed to close directory \"%s\": %s", dirname,
			     strerror(errno));
		r = -1;
	}

	return r;
}
1265
/*
 * Callback run through userns_exec_1(): tries to switch to uid/gid 0 and
 * drop supplementary groups (failures are logged, not fatal), then
 * recursively removes the directory whose path is passed in @data.
 */
static int rmdir_wrapper(void *data)
{
	char *dirname = data;
	int ret;

	ret = setresgid(0, 0, 0);
	if (ret < 0)
		SYSERROR("Failed to setgid to 0");

	ret = setresuid(0, 0, 0);
	if (ret < 0)
		SYSERROR("Failed to setuid to 0");

	ret = setgroups(0, NULL);
	if (ret < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir(dirname);
}
1279
308a6c94 1280void recursive_destroy(char *path, struct lxc_conf *conf)
ccb4cabe
SH
1281{
1282 int r;
1283 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e 1284 r = userns_exec_1(conf, rmdir_wrapper, path, "rmdir_wrapper");
ccb4cabe
SH
1285 else
1286 r = cgroup_rmdir(path);
308a6c94 1287
ccb4cabe 1288 if (r < 0)
1c9da8da 1289 ERROR("Error destroying %s", path);
ccb4cabe
SH
1290}
1291
1292static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1293{
1294 struct cgfsng_handler_data *d = hdata;
1295
1296 if (!d)
1297 return;
1298
308a6c94
CB
1299 if (d->container_cgroup && hierarchies) {
1300 int i;
1301 for (i = 0; hierarchies[i]; i++) {
1302 struct hierarchy *h = hierarchies[i];
1303 if (h->fullcgpath) {
1304 recursive_destroy(h->fullcgpath, conf);
1305 free(h->fullcgpath);
1306 h->fullcgpath = NULL;
1307 }
ccb4cabe
SH
1308 }
1309 }
1310
1311 free_handler_data(d);
1312}
1313
1314struct cgroup_ops *cgfsng_ops_init(void)
1315{
e4aeecf5
CB
1316 if (getenv("LXC_DEBUG_CGFSNG"))
1317 lxc_cgfsng_debug = true;
1318
457ca9aa
SH
1319 if (!collect_hierarchy_info())
1320 return NULL;
e4aeecf5 1321
ccb4cabe
SH
1322 return &cgfsng_ops;
1323}
1324
1325static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1326{
e3a3fecf 1327 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1a0e70ac 1328 if (dir_exists(h->fullcgpath)) { /* it must not already exist */
6f9584d8 1329 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1330 return false;
6f9584d8
CB
1331 }
1332 if (!handle_cpuset_hierarchy(h, cgname)) {
1333 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1334 return false;
6f9584d8 1335 }
e3a3fecf 1336 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1337}
1338
1339static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1340{
1341 if (rmdir(h->fullcgpath) < 0)
1342 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1343 free(h->fullcgpath);
1344 h->fullcgpath = NULL;
1345}
1346
1347/*
d30ec4cb 1348 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1349 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1350 */
1351static inline bool cgfsng_create(void *hdata)
1352{
bb30b52a 1353 int i;
ccb4cabe 1354 size_t len;
7d531e9b
CB
1355 char *cgname, *offset, *tmp;
1356 int idx = 0;
1357 struct cgfsng_handler_data *d = hdata;
ccb4cabe
SH
1358
1359 if (!d)
1360 return false;
43654d34 1361
ccb4cabe
SH
1362 if (d->container_cgroup) {
1363 WARN("cgfsng_create called a second time");
1364 return false;
1365 }
1366
43654d34 1367 if (d->cgroup_meta.dir)
7d531e9b 1368 tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
43654d34
CB
1369 else
1370 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
ccb4cabe
SH
1371 if (!tmp) {
1372 ERROR("Failed expanding cgroup name pattern");
1373 return false;
1374 }
1a0e70ac 1375 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
ccb4cabe
SH
1376 cgname = must_alloc(len);
1377 strcpy(cgname, tmp);
1378 free(tmp);
1379 offset = cgname + len - 5;
1380
1381again:
95adfe93
SH
1382 if (idx == 1000) {
1383 ERROR("Too many conflicting cgroup names");
ccb4cabe 1384 goto out_free;
95adfe93 1385 }
66b66624 1386 if (idx) {
bb30b52a
CB
1387 int ret;
1388
66b66624
CB
1389 ret = snprintf(offset, 5, "-%d", idx);
1390 if (ret < 0 || (size_t)ret >= 5) {
1391 FILE *f = fopen("/dev/null", "w");
97ebced3 1392 if (f) {
66b66624
CB
1393 fprintf(f, "Workaround for GCC7 bug: "
1394 "https://gcc.gnu.org/bugzilla/"
1395 "show_bug.cgi?id=78969");
1396 fclose(f);
1397 }
1398 }
1399 }
457ca9aa
SH
1400 for (i = 0; hierarchies[i]; i++) {
1401 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1402 int j;
1a0e70ac 1403 ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
457ca9aa
SH
1404 free(hierarchies[i]->fullcgpath);
1405 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1406 for (j = 0; j < i; j++)
457ca9aa 1407 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1408 idx++;
1409 goto again;
1410 }
1411 }
1412 /* Done */
1413 d->container_cgroup = cgname;
1414 return true;
1415
1416out_free:
1417 free(cgname);
1418 return false;
1419}
1420
ccb4cabe
SH
1421static bool cgfsng_enter(void *hdata, pid_t pid)
1422{
ccb4cabe
SH
1423 char pidstr[25];
1424 int i, len;
1425
1426 len = snprintf(pidstr, 25, "%d", pid);
1427 if (len < 0 || len > 25)
1428 return false;
1429
457ca9aa
SH
1430 for (i = 0; hierarchies[i]; i++) {
1431 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1432 "cgroup.procs", NULL);
1433 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1434 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1435 free(fullpath);
1436 return false;
1437 }
1438 free(fullpath);
1439 }
1440
1441 return true;
1442}
1443
/* Argument bundle passed to chown_cgroup_wrapper() through userns_exec_1(). */
struct chown_data {
	struct cgfsng_handler_data *d; /* handler data of the container */
	uid_t origuid; /* target uid in parent namespace */
};
1448
c0888dfe
SH
1449/*
1450 * chgrp the container cgroups to container group. We leave
1451 * the container owner as cgroup owner. So we must make the
1452 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1453 *
1454 * Also chown the tasks and cgroup.procs files. Those may not
1455 * exist depending on kernel version.
c0888dfe 1456 */
ccb4cabe
SH
1457static int chown_cgroup_wrapper(void *data)
1458{
1459 struct chown_data *arg = data;
ccb4cabe
SH
1460 uid_t destuid;
1461 int i;
1462
1463 if (setresgid(0,0,0) < 0)
1464 SYSERROR("Failed to setgid to 0");
1465 if (setresuid(0,0,0) < 0)
1466 SYSERROR("Failed to setuid to 0");
1467 if (setgroups(0, NULL) < 0)
1468 SYSERROR("Failed to clear groups");
1469
1470 destuid = get_ns_uid(arg->origuid);
1471
457ca9aa
SH
1472 for (i = 0; hierarchies[i]; i++) {
1473 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1474
1475 if (chown(path, destuid, 0) < 0) {
ab8f5424 1476 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1477 return -1;
1478 }
c0888dfe 1479
43647298 1480 if (chmod(path, 0775) < 0) {
ab8f5424 1481 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1482 return -1;
1483 }
ccb4cabe 1484
ab8f5424
SH
1485 /*
1486 * Failures to chown these are inconvenient but not detrimental
1487 * We leave these owned by the container launcher, so that container
1488 * root can write to the files to attach. We chmod them 664 so that
1489 * container systemd can write to the files (which systemd in wily
1490 * insists on doing)
1491 */
43647298
SH
1492 fullpath = must_make_path(path, "tasks", NULL);
1493 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1494 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1495 strerror(errno));
ab8f5424 1496 if (chmod(fullpath, 0664) < 0)
13277ec4 1497 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1498 free(fullpath);
1499
1500 fullpath = must_make_path(path, "cgroup.procs", NULL);
1501 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1502 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1503 strerror(errno));
ab8f5424 1504 if (chmod(fullpath, 0664) < 0)
13277ec4 1505 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe 1506 free(fullpath);
0e17357c
CB
1507
1508 if (!hierarchies[i]->is_cgroup_v2)
1509 continue;
1510
1511 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
1512 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
1513 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1514 strerror(errno));
1515 if (chmod(fullpath, 0664) < 0)
1516 WARN("Error chmoding %s: %s", path, strerror(errno));
1517 free(fullpath);
1518
1519 fullpath = must_make_path(path, "cgroup.threads", NULL);
1520 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
1521 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1522 strerror(errno));
1523 if (chmod(fullpath, 0664) < 0)
1524 WARN("Error chmoding %s: %s", path, strerror(errno));
1525 free(fullpath);
ccb4cabe
SH
1526 }
1527
1528 return 0;
1529}
1530
058c1cb6 1531static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
ccb4cabe
SH
1532{
1533 struct cgfsng_handler_data *d = hdata;
1534 struct chown_data wrap;
1535
1536 if (!d)
1537 return false;
1538
1539 if (lxc_list_empty(&conf->id_map))
1540 return true;
1541
1542 wrap.d = d;
1543 wrap.origuid = geteuid();
1544
c9b7c33e
CB
1545 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1546 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1547 ERROR("Error requesting cgroup chown in new namespace");
1548 return false;
1549 }
1550
1551 return true;
1552}
1553
8aa1044f
SH
1554/*
1555 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1556 * symlinks any more - just use mount
1557 */
1558
1559/* mount cgroup-full if requested */
1560static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1561 char *container_cgroup)
1562{
1563 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1564 return 0;
1565 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1566 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1567 dest);
1568 return -1;
1569 }
1570 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1571 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1572 MS_REMOUNT | MS_RDONLY;
1573 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1574 SYSERROR("Error remounting %s readonly", dest);
1575 return -1;
1576 }
1577 }
1578
1579 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1580 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1581 return 0;
1582
1583 /* mount just the container path rw */
1584 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1585 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1586 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1587 WARN("Failed to mount %s read-write: %s", rwpath,
1588 strerror(errno));
8aa1044f
SH
1589 INFO("Made %s read-write", rwpath);
1590 free(rwpath);
1591 free(source);
1592 return 0;
1593}
1594
1595/* cgroup-full:* is done, no need to create subdirs */
1596static bool cg_mount_needs_subdirs(int type)
1597{
1598 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1599 return false;
1600 return true;
1601}
1602
1603/*
1604 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1605 * created, remount controller ro if needed and bindmount the
1606 * cgroupfs onto controll/the/cg/path
1607 */
1608static int
1609do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1610 char *controllerpath, char *cgpath,
1611 const char *container_cgroup)
1612{
1613 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1614 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1615 SYSERROR("Error bind-mounting %s", controllerpath);
1616 return -1;
1617 }
1618 if (mount(controllerpath, controllerpath, "cgroup",
1619 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1620 SYSERROR("Error remounting %s read-only", controllerpath);
1621 return -1;
1622 }
1623 INFO("Remounted %s read-only", controllerpath);
1624 }
1625 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1626 int flags = MS_BIND;
1627 if (type == LXC_AUTO_CGROUP_RO)
1628 flags |= MS_RDONLY;
1629 INFO("Mounting %s onto %s", sourcepath, cgpath);
1630 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1631 free(sourcepath);
1632 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1633 cgpath);
1634 return -1;
1635 }
1636 free(sourcepath);
1637 INFO("Completed second stage cgroup automounts for %s", cgpath);
1638 return 0;
1639}
1640
a760603e 1641static int mount_cgroup_cgns_supported(int type, struct hierarchy *h, const char *controllerpath)
b635e92d
CB
1642{
1643 int ret;
1644 char *controllers = NULL;
a760603e
CB
1645 char *fstype = "cgroup2";
1646 unsigned long flags = 0;
b635e92d 1647
a760603e
CB
1648 flags |= MS_NOSUID;
1649 flags |= MS_NOEXEC;
1650 flags |= MS_NODEV;
1651 flags |= MS_RELATIME;
1652
1653 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1654 flags |= MS_RDONLY;
1655
1656 if (!h->is_cgroup_v2) {
1657 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1658 if (!controllers)
1659 return -ENOMEM;
1660 fstype = "cgroup";
b635e92d
CB
1661 }
1662
a760603e 1663 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d
CB
1664 free(controllers);
1665 if (ret < 0) {
a760603e 1666 SYSERROR("Failed to mount %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1667 return -1;
1668 }
1669
a760603e 1670 DEBUG("Mounted %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1671 return 0;
1672}
1673
ccb4cabe
SH
1674static bool cgfsng_mount(void *hdata, const char *root, int type)
1675{
b635e92d 1676 int i;
8aa1044f
SH
1677 char *tmpfspath = NULL;
1678 bool retval = false;
b635e92d
CB
1679 struct lxc_handler *handler = hdata;
1680 struct cgfsng_handler_data *d = handler->cgroup_data;
1681 bool has_cgns = false, has_sys_admin = true;
8aa1044f
SH
1682
1683 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1684 return true;
1685
b635e92d
CB
1686 has_cgns = cgns_supported();
1687 if (!lxc_list_empty(&handler->conf->keepcaps))
1688 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1689 else
1690 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1691
1692 if (has_cgns && has_sys_admin)
ccb4cabe 1693 return true;
8aa1044f
SH
1694
1695 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1696
1697 if (type == LXC_AUTO_CGROUP_NOSPEC)
1698 type = LXC_AUTO_CGROUP_MIXED;
1699 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1700 type = LXC_AUTO_CGROUP_FULL_MIXED;
1701
1702 /* Mount tmpfs */
1703 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1704 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1705 "size=10240k,mode=755",
1706 root) < 0)
1707 goto bad;
1708
457ca9aa 1709 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1710 char *controllerpath, *path2;
457ca9aa 1711 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1712 char *controller = strrchr(h->mountpoint, '/');
1713 int r;
1714
1715 if (!controller)
1716 continue;
1717 controller++;
1718 controllerpath = must_make_path(tmpfspath, controller, NULL);
1719 if (dir_exists(controllerpath)) {
1720 free(controllerpath);
1721 continue;
1722 }
1723 if (mkdir(controllerpath, 0755) < 0) {
1724 SYSERROR("Error creating cgroup path: %s", controllerpath);
1725 free(controllerpath);
1726 goto bad;
1727 }
b635e92d
CB
1728
1729 if (has_cgns && !has_sys_admin) {
1730 /* If cgroup namespaces are supported but the container
1731 * will not have CAP_SYS_ADMIN after it has started we
1732 * need to mount the cgroups manually.
1733 */
a760603e 1734 r = mount_cgroup_cgns_supported(type, h, controllerpath);
b635e92d
CB
1735 free(controllerpath);
1736 if (r < 0)
1737 goto bad;
1738 continue;
1739 }
1740
8aa1044f
SH
1741 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1742 free(controllerpath);
1743 goto bad;
1744 }
1745 if (!cg_mount_needs_subdirs(type)) {
1746 free(controllerpath);
1747 continue;
1748 }
ef4413fa 1749 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1750 if (mkdir_p(path2, 0755) < 0) {
1751 free(controllerpath);
8e0c6620 1752 free(path2);
8aa1044f
SH
1753 goto bad;
1754 }
2f62fb00 1755
8aa1044f
SH
1756 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1757 d->container_cgroup);
1758 free(controllerpath);
1759 free(path2);
1760 if (r < 0)
1761 goto bad;
1762 }
1763 retval = true;
1764
1765bad:
1766 free(tmpfspath);
1767 return retval;
ccb4cabe
SH
1768}
1769
/*
 * Sum the line counts of every cgroup.procs file in @dirname and all of
 * its sub-directories. A directory that cannot be opened counts as 0, and
 * a cgroup.procs file that cannot be counted is ignored.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *entry;
	struct stat sb;
	DIR *dir;
	char *path;
	int total = 0, nlines;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);

		/* Only descend into real directories. */
		if (lstat(path, &sb) == 0 && S_ISDIR(sb.st_mode))
			total += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	nlines = lxc_count_file_lines(path);
	if (nlines != -1)
		total += nlines;
	free(path);

	(void) closedir(dir);

	return total;
}
1814
1815static int cgfsng_nrtasks(void *hdata) {
1816 struct cgfsng_handler_data *d = hdata;
1817 char *path;
1818 int count;
1819
457ca9aa 1820 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1821 return -1;
457ca9aa 1822 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1823 count = recursive_count_nrtasks(path);
1824 free(path);
1825 return count;
1826}
1827
1828/* Only root needs to escape to the cgroup of its init */
7103fe6f 1829static bool cgfsng_escape()
ccb4cabe 1830{
ccb4cabe
SH
1831 int i;
1832
1833 if (geteuid())
1834 return true;
1835
457ca9aa
SH
1836 for (i = 0; hierarchies[i]; i++) {
1837 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1838 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1839 "cgroup.procs", NULL);
1840 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1841 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1842 free(fullpath);
6df334d1 1843 return false;
ccb4cabe
SH
1844 }
1845 free(fullpath);
1846 }
1847
6df334d1 1848 return true;
ccb4cabe
SH
1849}
1850
36662416
TA
1851static int cgfsng_num_hierarchies(void)
1852{
1853 int i;
1854
1855 for (i = 0; hierarchies[i]; i++)
1856 ;
1857
1858 return i;
1859}
1860
1861static bool cgfsng_get_hierarchies(int n, char ***out)
1862{
1863 int i;
1864
1865 /* sanity check n */
1866 for (i = 0; i < n; i++) {
1867 if (!hierarchies[i])
1868 return false;
1869 }
1870
1871 *out = hierarchies[i]->controllers;
1872
1873 return true;
1874}
1875
ccb4cabe
SH
1876#define THAWED "THAWED"
1877#define THAWED_LEN (strlen(THAWED))
1878
1879static bool cgfsng_unfreeze(void *hdata)
1880{
ccb4cabe 1881 char *fullpath;
457ca9aa 1882 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1883
457ca9aa 1884 if (!h)
ccb4cabe
SH
1885 return false;
1886 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1887 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1888 free(fullpath);
1889 return false;
1890 }
1891 free(fullpath);
1892 return true;
1893}
1894
1895static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1896{
457ca9aa 1897 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1898 if (!h)
1899 return NULL;
1900
371f834d
SH
1901 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1902}
1903
/*
 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
 * full path, which must be freed by the caller. The result is
 * h->mountpoint, @inpath and @filename joined by must_make_path().
 */
static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						const char *inpath,
						const char *filename)
{
	return must_make_path(h->mountpoint, inpath, filename, NULL);
}
1914
1915static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1916{
ccb4cabe
SH
1917 char pidstr[25];
1918 int i, len;
1919
1920 len = snprintf(pidstr, 25, "%d", pid);
1921 if (len < 0 || len > 25)
1922 return false;
1923
457ca9aa 1924 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1925 char *path, *fullpath;
457ca9aa 1926 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1927
1928 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1a0e70ac 1929 if (!path) /* not running */
ccb4cabe
SH
1930 continue;
1931
371f834d
SH
1932 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1933 free(path);
ccb4cabe
SH
1934 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1935 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1936 free(fullpath);
ccb4cabe
SH
1937 return false;
1938 }
ccb4cabe
SH
1939 free(fullpath);
1940 }
1941
ccb4cabe
SH
1942 return true;
1943}
1944
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns
 * the result of lxc_read_from_file(), or -1 when the container does not
 * answer or the subsystem has no hierarchy.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t namesz = strlen(filename) + 1;
	char *subsystem, *dot, *cgpath, *fullpath;
	struct hierarchy *h;
	int ret;

	subsystem = alloca(namesz);
	memcpy(subsystem, filename, namesz);
	dot = strchr(subsystem, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (!cgpath) /* not running */
		return -1;

	h = get_hierarchy(subsystem);
	if (!h) {
		free(cgpath);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
	ret = lxc_read_from_file(fullpath, value, len);
	free(fullpath);
	free(cgpath);

	return ret;
}
1976
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns
 * the result of lxc_write_to_file(), or -1 when the container does not
 * answer or the subsystem has no hierarchy.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t namesz = strlen(filename) + 1;
	char *subsystem, *dot, *cgpath, *fullpath;
	struct hierarchy *h;
	int ret;

	subsystem = alloca(namesz);
	memcpy(subsystem, filename, namesz);
	dot = strchr(subsystem, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (!cgpath) /* not running */
		return -1;

	h = get_hierarchy(subsystem);
	if (!h) {
		free(cgpath);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
	ret = lxc_write_to_file(fullpath, value, strlen(value), false);
	free(fullpath);
	free(cgpath);

	return ret;
}
2008
72add155
SH
2009/*
2010 * take devices cgroup line
2011 * /dev/foo rwx
2012 * and convert it to a valid
2013 * type major:minor mode
2014 * line. Return <0 on error. Dest is a preallocated buffer
2015 * long enough to hold the output.
2016 */
2017static int convert_devpath(const char *invalue, char *dest)
2018{
2a06d041
CB
2019 int n_parts;
2020 char *p, *path, type;
72add155
SH
2021 struct stat sb;
2022 unsigned long minor, major;
2a06d041
CB
2023 int ret = -EINVAL;
2024 char *mode = NULL;
72add155
SH
2025
2026 path = must_copy_string(invalue);
2027
2028 /*
2029 * read path followed by mode; ignore any trailing text.
2030 * A ' # comment' would be legal. Technically other text
2031 * is not legal, we could check for that if we cared to
2032 */
2033 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2c2d6c49
SH
2034 if (*p != ' ')
2035 continue;
2036 *p = '\0';
2037 if (n_parts != 1)
2038 break;
2039 p++;
2040 n_parts++;
2041 while (*p == ' ')
2042 p++;
2043 mode = p;
2044 if (*p == '\0')
2045 goto out;
72add155 2046 }
2c2d6c49
SH
2047
2048 if (n_parts == 1)
72add155 2049 goto out;
72add155
SH
2050
2051 ret = stat(path, &sb);
2052 if (ret < 0)
2053 goto out;
2054
72add155
SH
2055 mode_t m = sb.st_mode & S_IFMT;
2056 switch (m) {
2057 case S_IFBLK:
2058 type = 'b';
2059 break;
2060 case S_IFCHR:
2061 type = 'c';
2062 break;
2c2d6c49 2063 default:
72add155
SH
2064 ERROR("Unsupported device type %i for %s", m, path);
2065 ret = -EINVAL;
2066 goto out;
2067 }
2c2d6c49
SH
2068
2069 major = MAJOR(sb.st_rdev);
2070 minor = MINOR(sb.st_rdev);
2071 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2072 if (ret < 0 || ret >= 50) {
2a06d041
CB
2073 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2074 "chars)", type, major, minor, mode);
72add155
SH
2075 ret = -ENAMETOOLONG;
2076 goto out;
2077 }
2078 ret = 0;
2079
2080out:
2081 free(path);
2082 return ret;
2083}
2084
ccb4cabe
SH
2085/*
2086 * Called from setup_limits - here we have the container's cgroup_data because
2087 * we created the cgroups
2088 */
2089static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
2090{
b3646d7e 2091 char *fullpath, *p;
1a0e70ac
CB
2092 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2093 char converted_value[50];
b3646d7e
CB
2094 struct hierarchy *h;
2095 int ret = 0;
2096 char *controller = NULL;
ccb4cabe 2097
b3646d7e
CB
2098 controller = alloca(strlen(filename) + 1);
2099 strcpy(controller, filename);
2100 if ((p = strchr(controller, '.')) != NULL)
ccb4cabe
SH
2101 *p = '\0';
2102
c8bf519d 2103 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2104 ret = convert_devpath(value, converted_value);
2105 if (ret < 0)
c8bf519d 2106 return ret;
72add155
SH
2107 value = converted_value;
2108
c8bf519d 2109 }
2110
b3646d7e
CB
2111 h = get_hierarchy(controller);
2112 if (!h) {
2113 ERROR("Failed to setup limits for the \"%s\" controller. "
2114 "The controller seems to be unused by \"cgfsng\" cgroup "
2115 "driver or not enabled on the cgroup hierarchy",
2116 controller);
2117 return -1;
ccb4cabe 2118 }
b3646d7e
CB
2119
2120 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2121 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
2122 free(fullpath);
ccb4cabe
SH
2123 return ret;
2124}
2125
2126static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
2127 bool do_devices)
2128{
2129 struct cgfsng_handler_data *d = hdata;
2130 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2131 struct lxc_cgroup *cg;
ccb4cabe
SH
2132 bool ret = false;
2133
2134 if (lxc_list_empty(cgroup_settings))
2135 return true;
2136
2137 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2138 if (!sorted_cgroup_settings) {
2139 return false;
2140 }
2141
ccb4cabe
SH
2142 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2143 cg = iterator->elem;
2144
2145 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
2146 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2147 if (do_devices && (errno == EACCES || errno == EPERM)) {
2148 WARN("Error setting %s to %s for %s",
2149 cg->subsystem, cg->value, d->name);
2150 continue;
2151 }
2152 SYSERROR("Error setting %s to %s for %s",
2153 cg->subsystem, cg->value, d->name);
2154 goto out;
2155 }
6a628f4a 2156 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 2157 }
ccb4cabe
SH
2158 }
2159
2160 ret = true;
2161 INFO("cgroup has been setup");
2162out:
ccb4cabe
SH
2163 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2164 lxc_list_del(iterator);
2165 free(iterator);
2166 }
2167 free(sorted_cgroup_settings);
2168 return ret;
2169}
2170
/*
 * Operations table of the cgfsng driver; a pointer to it is returned by
 * cgfsng_ops_init().
 */
static struct cgroup_ops cgfsng_ops = {
	.init = cgfsng_init,
	.destroy = cgfsng_destroy,
	.create = cgfsng_create,
	.enter = cgfsng_enter,
	.escape = cgfsng_escape,
	.num_hierarchies = cgfsng_num_hierarchies,
	.get_hierarchies = cgfsng_get_hierarchies,
	.get_cgroup = cgfsng_get_cgroup,
	.get = cgfsng_get,
	.set = cgfsng_set,
	.unfreeze = cgfsng_unfreeze,
	.setup_limits = cgfsng_setup_limits,
	.name = "cgroupfs-ng",
	.attach = cgfsng_attach,
	.chown = cgfsng_chown,
	.mount_cgroup = cgfsng_mount,
	.nrtasks = cgfsng_nrtasks,
	.driver = CGFSNG,

	/* unsupported */
	.create_legacy = NULL,
};