]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
log: check for i/o error with vsnprintf()
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
c8bf519d 50#include <linux/types.h>
51#include <linux/kdev_t.h>
52
b635e92d 53#include "caps.h"
ccb4cabe 54#include "cgroup.h"
6328fd9c 55#include "cgroup_utils.h"
ccb4cabe 56#include "commands.h"
43654d34 57#include "conf.h"
a54694f8 58#include "log.h"
43654d34 59#include "storage/storage.h"
a54694f8 60#include "utils.h"
ccb4cabe
SH
61
62lxc_log_define(lxc_cgfsng, lxc);
63
64static struct cgroup_ops cgfsng_ops;
65
ccb4cabe
SH
66/*
67 * A descriptor for a mounted hierarchy
68 * @controllers: either NULL, or a null-terminated list of all
69 * the co-mounted controllers
70 * @mountpoint: the mountpoint we will use. It will be either
71 * /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
72 * @base_cgroup: the cgroup under which the container cgroup path
73 is created. This will be either the caller's cgroup (if not
74 root), or init's cgroup (if root).
75 */
struct hierarchy {
	/* NULL, or a NULL-terminated list of the co-mounted controllers */
	char **controllers;
	/* mountpoint used for this hierarchy */
	char *mountpoint;
	/* cgroup under which the container cgroup path is created */
	char *base_cgroup;
	/*
	 * Set to NULL when the hierarchy is registered; presumably filled in
	 * with the container's full cgroup path later on - TODO confirm.
	 */
	char *fullcgpath;
	/* true when the first entry of @controllers is "cgroup2" */
	bool is_cgroup_v2;
};
83
84/*
85 * The cgroup data which is attached to the lxc_handler.
43654d34
CB
86 * @cgroup_pattern : A copy of the lxc.cgroup.pattern
87 * @container_cgroup : If not null, the cgroup which was created for the
88 * container. For each hierarchy, it is created under the
89 * @hierarchy->base_cgroup directory. Relative to the
90 * base_cgroup it is the same for all hierarchies.
91 * @name : The name of the container.
92 * @cgroup_meta : A copy of the container's cgroup information. This
93 * overrides @cgroup_pattern.
ccb4cabe
SH
94 */
95struct cgfsng_handler_data {
ccb4cabe 96 char *cgroup_pattern;
1a0e70ac
CB
97 char *container_cgroup; /* cgroup we created for the container */
98 char *name; /* container name */
43654d34
CB
99 /* per-container cgroup information */
100 struct lxc_cgroup cgroup_meta;
ccb4cabe
SH
101};
102
457ca9aa
SH
103/*
104 * @hierarchies - a NULL-terminated array of struct hierarchy, one per
105 * hierarchy. No duplicates. First sufficient, writeable mounted
106 * hierarchy wins
107 */
108struct hierarchy **hierarchies;
109
110/*
111 * @cgroup_use - a copy of the lxc.cgroup.use
112 */
113char *cgroup_use;
114
e4aeecf5
CB
115/*
116 * @lxc_cgfsng_debug - whether to print debug info to stdout for the cgfsng
117 * driver
118 */
119static bool lxc_cgfsng_debug;
120
ccb4cabe
SH
/*
 * Release every string of the NULL-terminated array @clist and then the
 * array itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);

	free(clist);
}
131
ccb4cabe
SH
132/* Allocate a pointer, do not fail */
static void *must_alloc(size_t sz)
{
	/* must_realloc() with a NULL pointer acts as a fresh allocation */
	void *mem = must_realloc(NULL, sz);

	return mem;
}
137
ccb4cabe
SH
138/*
139 * This is a special case - return a copy of @entry
140 * prepending 'name='. I.e. turn systemd into name=systemd.
141 * Do not fail.
142 */
static char *must_prefix_named(char *entry)
{
	/* "name=" takes 5 bytes, plus one for the terminating NUL */
	size_t bufsz = strlen(entry) + 6;
	char *prefixed = must_alloc(bufsz);

	snprintf(prefixed, bufsz, "name=%s", entry);
	return prefixed;
}
152
153/*
154 * Given a pointer to a null-terminated array of pointers, realloc to
155 * add one entry, and point the new entry to NULL. Do not fail. Return
156 * the index to the second-to-last entry - that is, the one which is
157 * now available for use (keeping the list null-terminated).
158 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	/* count the entries currently in front of the terminator */
	if (*list) {
		while ((*list)[count])
			count++;
	}

	/* room for the old entries, one new slot and the new terminator */
	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	/* index of the slot the caller is expected to fill in */
	return count;
}
170
171/*
172 * Given a null-terminated array of strings, check whether @entry
173 * is one of the strings
174 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	/* a NULL list simply contains nothing */
	for (it = list; it && *it; it++) {
		if (!strcmp(*it, entry))
			return true;
	}

	return false;
}
187
188/*
189 * append an entry to the clist. Do not fail.
190 * *clist must be NULL the first time we are called.
191 *
192 * We also handle named subsystems here. Any controller which is not a
193 * kernel subsystem, we prefix 'name='. Any which is both a kernel and
194 * named subsystem, we refuse to use because we're not sure which we
195 * have here. (TODO - we could work around this in some cases by just
196 * remounting to be unambiguous, or by comparing mountpoint contents
197 * with current cgroup)
198 *
199 * The last entry will always be NULL.
200 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	/* present as both a kernel and a named subsystem: cannot tell which */
	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller '%s'", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	/*
	 * Already prefixed entries and kernel subsystems are stored verbatim,
	 * everything else is a named subsystem and gets the "name=" prefix.
	 */
	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = must_prefix_named(entry);
}
223
ccb4cabe
SH
224static void free_handler_data(struct cgfsng_handler_data *d)
225{
ccb4cabe
SH
226 free(d->cgroup_pattern);
227 free(d->container_cgroup);
228 free(d->name);
43654d34
CB
229 if (d->cgroup_meta.dir)
230 free(d->cgroup_meta.dir);
231 if (d->cgroup_meta.controllers)
232 free(d->cgroup_meta.controllers);
ccb4cabe
SH
233 free(d);
234}
235
236/*
237 * Given a handler's cgroup data, return the struct hierarchy for the
238 * controller @c, or NULL if there is none.
239 */
457ca9aa 240struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
241{
242 int i;
243
457ca9aa 244 if (!hierarchies)
ccb4cabe 245 return NULL;
457ca9aa
SH
246 for (i = 0; hierarchies[i]; i++) {
247 if (string_in_list(hierarchies[i]->controllers, c))
248 return hierarchies[i];
ccb4cabe
SH
249 }
250 return NULL;
251}
252
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Grow *@mem in steps of BATCH_SIZE bytes: reallocate only when nothing has
 * been allocated yet or when @newlen needs more batches than @oldlen did.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int oldbatches = (oldlen / BATCH_SIZE) + 1;
	int newbatches = (newlen / BATCH_SIZE) + 1;

	if (*mem && newbatches <= oldbatches)
		return;

	*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}
263
/*
 * Append the @newlen bytes of @new plus its terminating NUL to *@dest, which
 * currently holds @oldlen bytes. The buffer is grown as required.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	batch_realloc(dest, oldlen, oldlen + newlen + 1);
	memcpy(*dest + oldlen, new, newlen + 1);
}
272
273/* Slurp in a whole file */
static char *read_file(char *fnam)
{
	char *contents = NULL, *line = NULL;
	size_t linecap = 0, total = 0;
	int nread;
	FILE *fp;

	fp = fopen(fnam, "r");
	if (!fp)
		return NULL;

	/* concatenate the file line by line; stays NULL for an empty file */
	for (;;) {
		nread = getline(&line, &linecap, fp);
		if (nread == -1)
			break;

		append_line(&contents, total, line, nread);
		total += nread;
	}

	fclose(fp);
	free(line);
	return contents;
}
292
293/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below address bit @bit of the uint32_t array @bitarr. The
 * caller must make sure the array holds at least BITS_TO_LONGS(@bit + 1)
 * words. The mask is built from an unsigned 32-bit one because shifting a
 * signed int 1 into bit 31 is undefined behaviour.
 */

/* Set bit @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
312
313/* Create cpumask from cpulist aka turn:
314 *
315 * 0,2-3
316 *
317 * into bit array
318 *
319 * 1 0 1 1
320 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *tok, *state = NULL;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!mask)
		return NULL;

	/* @buf is cut up in place: each token is either "N" or "N-M" */
	tok = strtok_r(buf, ",", &state);
	while (tok) {
		unsigned first, last;
		char *dash;

		errno = 0;
		first = strtoul(tok, NULL, 0);
		last = first;

		dash = strchr(tok, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		/* reject reversed ranges and cpus that do not fit the mask */
		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, mask);

		tok = strtok_r(NULL, ",", &state);
	}

	return mask;
}
354
a54694f8
CB
355/* Turn cpumask into simple, comma-separated cpulist. */
356static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
357{
358 size_t i;
359 int ret;
eab15c1e 360 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
361 char **cpulist = NULL;
362
363 for (i = 0; i <= nbits; i++) {
364 if (is_set(i, bitarr)) {
eab15c1e
CB
365 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
366 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
367 lxc_free_array((void **)cpulist, free);
368 return NULL;
369 }
370 if (lxc_append_string(&cpulist, numstr) < 0) {
371 lxc_free_array((void **)cpulist, free);
372 return NULL;
373 }
374 }
375 }
376 return lxc_string_join(",", (const char **)cpulist, false);
377}
378
/*
 * Return the number that follows the last ',' or '-' in @cpulist, or the
 * leading number when neither separator is present. For a sorted cpulist
 * such as "0,2-7,9" this is the highest cpu index.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *comma, *dash;
	char *last = cpulist;
	size_t cpus = 0;

	comma = strrchr(cpulist, ',');
	dash = strrchr(cpulist, '-');

	/*
	 * Pick whichever separator comes later in the string. Pointers are
	 * only compared when both are non-NULL.
	 */
	if (comma && dash)
		last = (comma > dash ? comma : dash) + 1;
	else if (comma)
		last = comma + 1;
	else if (dash)
		last = dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
413
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Set up @path/cpuset.cpus from the cpuset.cpus file of the parent directory
 * of @path, leaving out any cpu listed in __ISOL_CPUS.
 *
 * @path:           cgroup directory; its last '/' is replaced only while the
 *                  parent's file name is built and is restored before any
 *                  return
 * @am_initialized: whether the cgroup was already initialized by someone
 *                  else; if so and no isolated cpus are reported, nothing is
 *                  written
 *
 * Returns true on success (including "nothing to do"), false on error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/*
	 * Truncate @path only long enough to build the parent's file name so
	 * that the caller gets its string back intact on every exit.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	/* <ctype.h> functions require a value representable as unsigned char */
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* Both masks are sized for the highest cpu index seen, plus one. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* The masks hold bits 0 .. maxposs - 1; do not touch bit maxposs. */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset.");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	/* Done with the parent's file name; switch to the file in @path. */
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
550
e3a3fecf
SH
551/* Copy contents of parent(@path)/@file to @path/@file */
/*
 * Copy the contents of @file from the parent directory of @path into
 * @path/@file.
 *
 * @path is modified only while the parent's file name is built and is
 * restored before any return. Returns true if the value was read from the
 * parent and written to the child, false otherwise.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	/* Truncate @path just long enough to name the parent's file. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* A NULL buffer makes lxc_read_from_file() report the length only. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the buffer: it is printed with %s if the write fails. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
588
589/*
590 * Initialize the cpuset hierarchy in first directory of @gname and
591 * set cgroup.clone_children so that children inherit settings.
592 * Since the h->base_path is populated by init or ourselves, we know
593 * it is already initialized.
594 */
a54694f8 595static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
596{
597 char *cgpath, *clonechildrenpath, v, *slash;
598
599 if (!string_in_list(h->controllers, "cpuset"))
600 return true;
601
602 if (*cgname == '/')
603 cgname++;
604 slash = strchr(cgname, '/');
605 if (slash)
606 *slash = '\0';
607
608 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
609 if (slash)
610 *slash = '/';
611 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
612 SYSERROR("Failed to create '%s'", cgpath);
613 free(cgpath);
614 return false;
615 }
6f9584d8 616
e3a3fecf 617 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
618 /* unified hierarchy doesn't have clone_children */
619 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
620 free(clonechildrenpath);
621 free(cgpath);
622 return true;
623 }
624 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
625 SYSERROR("Failed to read '%s'", clonechildrenpath);
626 free(clonechildrenpath);
627 free(cgpath);
628 return false;
629 }
630
a54694f8 631 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
632 if (!filter_and_set_cpus(cgpath, v == '1')) {
633 SYSERROR("Failed to remove isolated cpus.");
634 free(clonechildrenpath);
635 free(cgpath);
a54694f8 636 return false;
6f9584d8 637 }
a54694f8 638
e3a3fecf 639 if (v == '1') { /* already set for us by someone else */
6f9584d8 640 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
641 free(clonechildrenpath);
642 free(cgpath);
643 return true;
644 }
645
646 /* copy parent's settings */
a54694f8 647 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 648 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
649 free(cgpath);
650 free(clonechildrenpath);
651 return false;
652 }
653 free(cgpath);
654
655 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
656 /* Set clone_children so children inherit our settings */
657 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
658 free(clonechildrenpath);
659 return false;
660 }
661 free(clonechildrenpath);
662 return true;
663}
664
ccb4cabe
SH
665/*
666 * Given two null-terminated lists of strings, return true if any string
667 * is in both.
668 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it;

	/* a missing list shares nothing with the other one */
	if (!l1 || !l2)
		return false;

	for (it = l1; *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
682
683/*
684 * For a null-terminated list of controllers @clist, return true if any of
685 * those controllers is already listed in the null-terminated list of
686 * hierarchies @hlist. Realistically, if one is present, all must be present.
687 */
688static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
689{
690 int i;
691
692 if (!hlist)
693 return false;
694 for (i = 0; hlist[i]; i++)
695 if (controller_lists_intersect(hlist[i]->controllers, clist))
696 return true;
697 return false;
698
699}
700
701/*
702 * Return true if the controller @entry is found in the null-terminated
703 * list of hierarchies @hlist
704 */
705static bool controller_found(struct hierarchy **hlist, char *entry)
706{
707 int i;
708 if (!hlist)
709 return false;
710
711 for (i = 0; hlist[i]; i++)
712 if (string_in_list(hlist[i]->controllers, entry))
713 return true;
714 return false;
715}
716
717/*
c30b61c3
SH
718 * Return true if all of the controllers which we require have been found.
719 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 720 */
457ca9aa 721static bool all_controllers_found(void)
ccb4cabe
SH
722{
723 char *p, *saveptr = NULL;
457ca9aa 724 struct hierarchy ** hlist = hierarchies;
ccb4cabe 725
ccb4cabe
SH
726 if (!controller_found(hlist, "freezer")) {
727 ERROR("no freezer controller mountpoint found");
728 return false;
729 }
730
457ca9aa 731 if (!cgroup_use)
ccb4cabe 732 return true;
457ca9aa 733 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
734 p = strtok_r(NULL, ",", &saveptr)) {
735 if (!controller_found(hlist, p)) {
736 ERROR("no %s controller mountpoint found", p);
737 return false;
738 }
739 }
740 return true;
741}
742
743/* Return true if the fs type is fuse.lxcfs */
static bool is_lxcfs(const char *line)
{
	/* the fs type follows the first " - " separator of a mountinfo line */
	static const char fstype[] = " - fuse.lxcfs ";
	const char *sep = strstr(line, " - ");

	return sep && strncmp(sep, fstype, sizeof(fstype) - 1) == 0;
}
751
752/*
753 * Get the controllers from a mountinfo line
754 * There are other ways we could get this info. For lxcfs, field 3
755 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
756 * options. But we simply assume that the mountpoint must be
757 * /sys/fs/cgroup/controller-list
758 */
/*
 * Build the controller list for one /proc/self/mountinfo @line.
 *
 * @klist: kernel subsystems, as collected by get_existing_subsystems()
 * @nlist: named subsystems, as collected by get_existing_subsystems()
 * @line:  mountinfo line; a NUL byte is written after the mountpoint field
 *
 * Returns a NULL-terminated list the caller must free, or NULL if the line
 * is malformed, the mountpoint is not below /sys/fs/cgroup/, memory could
 * not be duplicated, or every controller was refused as ambiguous.
 */
static char **get_controllers(char **klist, char **nlist, char *line)
{
	int i;
	char *dup, *p2, *tok;
	bool is_cgroup_v2;
	char *p = line, *saveptr = NULL;
	char **aret = NULL;

	/* must be decided before @line is cut short below */
	is_cgroup_v2 = is_cgroupfs_v2(line);

	/* skip four space-separated fields to reach the mountpoint */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* note - if we change how mountinfo works, then our caller
	 * will need to verify /sys/fs/cgroup/ in this field */
	if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
		INFO("cgfsng: found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
		return NULL;
	}
	p += 15;
	p2 = strchr(p, ' ');
	if (!p2) {
		ERROR("corrupt mountinfo");
		return NULL;
	}
	*p2 = '\0';

	/*
	 * cgroup v2 does not have separate mountpoints for controllers, so
	 * the list consists of the single pseudo controller "cgroup2". Hand
	 * the list back to the caller instead of dropping it.
	 */
	if (is_cgroup_v2) {
		must_append_controller(klist, nlist, &aret, "cgroup2");
		return aret;
	}

	/* strdup() here for v1 hierarchies. Otherwise strtok_r() will destroy
	 * mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
	 */
	dup = strdup(p);
	if (!dup) {
		SYSERROR("Failed to duplicate string");
		return NULL;
	}

	for (tok = strtok_r(dup, ",", &saveptr); tok;
	     tok = strtok_r(NULL, ",", &saveptr)) {
		must_append_controller(klist, nlist, &aret, tok);
	}

	free(dup);
	return aret;
}
816
ccb4cabe 817/* Add a controller to our list of hierarchies */
457ca9aa 818static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
819{
820 struct hierarchy *new;
821 int newentry;
822
823 new = must_alloc(sizeof(*new));
824 new->controllers = clist;
825 new->mountpoint = mountpoint;
826 new->base_cgroup = base_cgroup;
827 new->fullcgpath = NULL;
828
6328fd9c 829 /* record if this is the cgroup v2 hierarchy */
cdfe90a4 830 if (clist && !strcmp(*clist, "cgroup2"))
6328fd9c
CB
831 new->is_cgroup_v2 = true;
832 else
833 new->is_cgroup_v2 = false;
834
457ca9aa
SH
835 newentry = append_null_to_list((void ***)&hierarchies);
836 hierarchies[newentry] = new;
ccb4cabe
SH
837}
838
839/*
840 * Get a copy of the mountpoint from @line, which is a line from
841 * /proc/self/mountinfo
842 */
static char *get_mountpoint(char *line)
{
	char *field = line, *copy;
	size_t len;
	int skipped = 0;

	/* step over four space-separated fields */
	while (skipped < 4) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
		skipped++;
	}

	/* we've already stuck a \0 after the mountpoint */
	len = strlen(field);
	copy = must_alloc(len + 1);
	memcpy(copy, field, len);
	copy[len] = '\0';

	return copy;
}
862
863/*
864 * Given a multi-line string, return a null-terminated copy of the
865 * current line.
866 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t len;

	/* without a terminating newline there is no complete line to copy */
	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	len = newline - p;
	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';

	return copy;
}
881
882/*
883 * cgline: pointer to character after the first ':' in a line in a
884 * \n-terminated /proc/self/cgroup file. Check whether * controller c is
885 * present.
886 */
/*
 * Check whether controller @c is one of the comma-separated entries that
 * precede the first ':' in @cgline.
 *
 * The list is scanned in place, so neither @cgline nor @c is modified and
 * no memory is allocated. Empty entries never match. Returns false if
 * @cgline contains no ':'.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *tok, *end, *eol;
	size_t clen;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	clen = strlen(c);
	for (tok = cgline; tok < eol; tok = end + 1) {
		/* the entry ends at the next ',' or at the ':' */
		end = memchr(tok, ',', eol - tok);
		if (!end)
			end = eol;

		if (end != tok && (size_t)(end - tok) == clen &&
		    memcmp(tok, c, clen) == 0)
			return true;
	}

	return false;
}
908
909/*
910 * @basecginfo is a copy of /proc/$$/cgroup. Return the current
911 * cgroup for @controller
912 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	char *cur = basecginfo;
	bool want_v2 = strcmp(controller, "cgroup2") == 0;

	for (;;) {
		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		bool v2_line = want_v2 && *cur == '0';

		/* move to the controller list after the first ':' */
		cur = strchr(cur, ':');
		if (!cur)
			return NULL;
		cur++;

		if (v2_line || controller_in_clist(cur, controller)) {
			/* the cgroup path follows the second ':' */
			cur = strchr(cur, ':');
			if (!cur)
				return NULL;
			return copy_to_eol(cur + 1);
		}

		/* no match: continue with the next line */
		cur = strchr(cur, '\n');
		if (!cur)
			return NULL;
		cur++;
	}
}
944
ccb4cabe
SH
/* Append a copy of @entry to the NULL-terminated string array *@list. */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
953
/*
 * Read /proc/self/cgroup and sort the controllers found there into
 * *@klist (kernel subsystems) and *@nlist (entries starting with "name=").
 * Nothing is collected if the file cannot be opened.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	char *line = NULL;
	size_t linecap = 0;
	FILE *fp;

	fp = fopen("/proc/self/cgroup", "r");
	if (!fp)
		return;

	while (getline(&line, &linecap, fp) != -1) {
		char *ctrls, *end, *tok, *state = NULL;

		/* the controller list sits between the first two ':' */
		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		end = strchr(ctrls, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		tok = strtok_r(ctrls, ",", &state);
		while (tok) {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);

			tok = strtok_r(NULL, ",", &state);
		}
	}

	free(line);
	fclose(fp);
}
997
/*
 * Strip trailing newlines from @s in place. The first character is never
 * removed, so a string consisting only of newlines keeps a single "\n".
 */
static void trim(char *s)
{
	size_t len;

	for (len = strlen(s); len > 1 && s[len - 1] == '\n'; len--)
		s[len - 1] = '\0';
}
1004
e4aeecf5
CB
1005static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
1006{
1007 printf("Cgroup information:\n");
1008 printf(" container name: %s\n", d->name ? d->name : "(null)");
1009 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
43654d34
CB
1010 printf(" lxc.cgroup.pattern: %s\n",
1011 d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1012 printf(" lxc.cgroup.dir: %s\n",
1013 d->cgroup_meta.dir ? d->cgroup_meta.dir : "(null)");
1014 printf(" cgroup: %s\n",
1015 d->container_cgroup ? d->container_cgroup : "(null)");
e4aeecf5
CB
1016}
1017
1018static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1019{
a7b0cc4c 1020 struct hierarchy **it;
ccb4cabe 1021 int i;
41c33dbe 1022
457ca9aa 1023 if (!hierarchies) {
e4aeecf5 1024 printf(" No hierarchies found.");
ccb4cabe
SH
1025 return;
1026 }
e4aeecf5 1027 printf(" Hierarchies:\n");
a7b0cc4c
CB
1028 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1029 char **cit;
ccb4cabe 1030 int j;
e4aeecf5
CB
1031 printf(" %d: base_cgroup %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1032 printf(" mountpoint %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1033 printf(" controllers:\n");
a7b0cc4c 1034 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1035 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1036 }
1037}
41c33dbe 1038
/*
 * Print @basecginfo followed by the numbered entries of @klist and @nlist
 * to stdout. Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	for (idx = 0; klist && klist[idx]; idx++)
		printf("kernel subsystem %d: %s\n", idx, klist[idx]);

	for (idx = 0; nlist && nlist[idx]; idx++)
		printf("named subsystem %d: %s\n", idx, nlist[idx]);
}
ccb4cabe 1052
e4aeecf5
CB
/* Print the handler data @d, then the global hierarchy list, to stdout. */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1058
ccb4cabe
SH
1059/*
1060 * At startup, parse_hierarchies finds all the info we need about
1061 * cgroup mountpoints and current cgroups, and stores it in @d.
1062 */
457ca9aa 1063static bool parse_hierarchies(void)
ccb4cabe
SH
1064{
1065 FILE *f;
1066 char * line = NULL, *basecginfo;
1067 char **klist = NULL, **nlist = NULL;
1068 size_t len = 0;
1069
d30ec4cb
SH
1070 /*
1071 * Root spawned containers escape the current cgroup, so use init's
1072 * cgroups as our base in that case.
1073 */
ccb4cabe
SH
1074 if (geteuid())
1075 basecginfo = read_file("/proc/self/cgroup");
1076 else
1077 basecginfo = read_file("/proc/1/cgroup");
1078 if (!basecginfo)
1079 return false;
1080
1081 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
d3b00a8f 1082 SYSERROR("Failed opening /proc/self/mountinfo");
ccb4cabe
SH
1083 return false;
1084 }
1085
1086 get_existing_subsystems(&klist, &nlist);
41c33dbe 1087
e4aeecf5
CB
1088 if (lxc_cgfsng_debug)
1089 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1090
1091 /* we support simple cgroup mounts and lxcfs mounts */
1092 while (getline(&line, &len, f) != -1) {
1093 char **controller_list = NULL;
1094 char *mountpoint, *base_cgroup;
6328fd9c 1095 bool is_cgroup_v2, writeable;
ccb4cabe 1096
6328fd9c
CB
1097 is_cgroup_v2 = is_cgroupfs_v2(line);
1098 if (!is_lxcfs(line) && !is_cgroupfs_v1(line) && !is_cgroup_v2)
ccb4cabe
SH
1099 continue;
1100
1101 controller_list = get_controllers(klist, nlist, line);
1102 if (!controller_list)
1103 continue;
1104
457ca9aa 1105 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1106 free(controller_list);
1107 continue;
1108 }
1109
1110 mountpoint = get_mountpoint(line);
1111 if (!mountpoint) {
1112 ERROR("Error reading mountinfo: bad line '%s'", line);
1113 free_string_list(controller_list);
1114 continue;
1115 }
1116
1117 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1118 if (!base_cgroup) {
1119 ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]);
1120 free_string_list(controller_list);
1121 free(mountpoint);
1122 continue;
1123 }
6328fd9c 1124
ccb4cabe
SH
1125 trim(base_cgroup);
1126 prune_init_scope(base_cgroup);
6328fd9c
CB
1127 if (is_cgroup_v2)
1128 writeable = test_writeable_v2(mountpoint, base_cgroup);
1129 else
1130 writeable = test_writeable_v1(mountpoint, base_cgroup);
1131 if (!writeable) {
ccb4cabe
SH
1132 free_string_list(controller_list);
1133 free(mountpoint);
1134 free(base_cgroup);
1135 continue;
1136 }
457ca9aa 1137 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1138 }
1139
1140 free_string_list(klist);
1141 free_string_list(nlist);
1142
1143 free(basecginfo);
1144
1145 fclose(f);
1146 free(line);
1147
e4aeecf5
CB
1148 if (lxc_cgfsng_debug) {
1149 printf("writeable subsystems:\n");
1150 lxc_cgfsng_print_hierarchies();
1151 }
1152
ccb4cabe
SH
1153 /* verify that all controllers in cgroup.use and all crucial
1154 * controllers are accounted for
1155 */
5059aae9
SH
1156 if (!all_controllers_found()) {
1157 INFO("cgfsng: not all controllers were find, deferring to cgfs driver");
ccb4cabe 1158 return false;
5059aae9 1159 }
ccb4cabe
SH
1160
1161 return true;
1162}
1163
457ca9aa
SH
1164static bool collect_hierarchy_info(void)
1165{
1166 const char *tmp;
1167 errno = 0;
1168 tmp = lxc_global_config_value("lxc.cgroup.use");
1a0e70ac 1169 if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
457ca9aa
SH
1170 SYSERROR("cgfsng: error reading list of cgroups to use");
1171 return false;
1172 }
1173 cgroup_use = must_copy_string(tmp);
1174
1175 return parse_hierarchies();
1176}
1177
43654d34 1178static void *cgfsng_init(struct lxc_handler *handler)
ccb4cabe 1179{
457ca9aa 1180 const char *cgroup_pattern;
43654d34 1181 struct cgfsng_handler_data *d;
ccb4cabe
SH
1182
1183 d = must_alloc(sizeof(*d));
1184 memset(d, 0, sizeof(*d));
1185
43654d34
CB
1186 /* copy container name */
1187 d->name = must_copy_string(handler->name);
1188
1189 /* copy per-container cgroup information */
ae5e6c08
CB
1190 d->cgroup_meta.dir = NULL;
1191 d->cgroup_meta.controllers = NULL;
9b5396f9
CB
1192 if (handler->conf) {
1193 d->cgroup_meta.dir = must_copy_string(handler->conf->cgroup_meta.dir);
1194 d->cgroup_meta.controllers = must_copy_string(handler->conf->cgroup_meta.controllers);
1195 }
ccb4cabe 1196
43654d34 1197 /* copy system-wide cgroup information */
ccb4cabe 1198 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
43654d34
CB
1199 if (!cgroup_pattern) {
1200 /* lxc.cgroup.pattern is only NULL on error. */
ccb4cabe
SH
1201 ERROR("Error getting cgroup pattern");
1202 goto out_free;
1203 }
1204 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1205
e4aeecf5
CB
1206 if (lxc_cgfsng_debug)
1207 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1208
1209 return d;
1210
1211out_free:
1212 free_handler_data(d);
1213 return NULL;
1214}
1215
ccb4cabe
SH
/*
 * Recursively remove the cgroup directory @dirname.
 *
 * Sub-directories are removed depth-first; non-directory entries are
 * ignored and left for rmdir() to deal with. Processing continues after a
 * failure so that as much as possible is removed, and only the first failure
 * is logged.
 *
 * Returns 0 on success, -1 if the directory could not be opened or if any
 * step failed.
 */
static int cgroup_rmdir(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat %s", pathname);
			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = cgroup_rmdir(pathname);
		if (ret < 0)
			r = -1;
next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("Failed to delete \"%s\": %s", dirname,
			     strerror(errno));
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* Report the operation that actually failed. */
		if (!r)
			WARN("Failed to close directory \"%s\": %s", dirname,
			     strerror(errno));
		r = -1;
	}

	return r;
}
1276
/*
 * Callback handed to userns_exec_1(): switch to uid/gid 0, drop the
 * supplementary groups, then recursively remove the path passed in @data.
 * Failures of the credential calls are logged but do not abort the removal.
 */
static int rmdir_wrapper(void *data)
{
	char *target = data;
	int rc;

	rc = setresgid(0, 0, 0);
	if (rc < 0)
		SYSERROR("Failed to setgid to 0");

	rc = setresuid(0, 0, 0);
	if (rc < 0)
		SYSERROR("Failed to setuid to 0");

	rc = setgroups(0, NULL);
	if (rc < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir(target);
}
1290
308a6c94 1291void recursive_destroy(char *path, struct lxc_conf *conf)
ccb4cabe
SH
1292{
1293 int r;
1294 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e 1295 r = userns_exec_1(conf, rmdir_wrapper, path, "rmdir_wrapper");
ccb4cabe
SH
1296 else
1297 r = cgroup_rmdir(path);
308a6c94 1298
ccb4cabe 1299 if (r < 0)
1c9da8da 1300 ERROR("Error destroying %s", path);
ccb4cabe
SH
1301}
1302
1303static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1304{
1305 struct cgfsng_handler_data *d = hdata;
1306
1307 if (!d)
1308 return;
1309
308a6c94
CB
1310 if (d->container_cgroup && hierarchies) {
1311 int i;
1312 for (i = 0; hierarchies[i]; i++) {
1313 struct hierarchy *h = hierarchies[i];
1314 if (h->fullcgpath) {
1315 recursive_destroy(h->fullcgpath, conf);
1316 free(h->fullcgpath);
1317 h->fullcgpath = NULL;
1318 }
ccb4cabe
SH
1319 }
1320 }
1321
1322 free_handler_data(d);
1323}
1324
1325struct cgroup_ops *cgfsng_ops_init(void)
1326{
e4aeecf5
CB
1327 if (getenv("LXC_DEBUG_CGFSNG"))
1328 lxc_cgfsng_debug = true;
1329
457ca9aa
SH
1330 if (!collect_hierarchy_info())
1331 return NULL;
e4aeecf5 1332
ccb4cabe
SH
1333 return &cgfsng_ops;
1334}
1335
1336static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1337{
e3a3fecf 1338 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1a0e70ac 1339 if (dir_exists(h->fullcgpath)) { /* it must not already exist */
6f9584d8 1340 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1341 return false;
6f9584d8
CB
1342 }
1343 if (!handle_cpuset_hierarchy(h, cgname)) {
1344 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1345 return false;
6f9584d8 1346 }
e3a3fecf 1347 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1348}
1349
1350static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1351{
1352 if (rmdir(h->fullcgpath) < 0)
1353 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1354 free(h->fullcgpath);
1355 h->fullcgpath = NULL;
1356}
1357
1358/*
d30ec4cb 1359 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1360 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1361 */
1362static inline bool cgfsng_create(void *hdata)
1363{
bb30b52a 1364 int i;
ccb4cabe 1365 size_t len;
7d531e9b
CB
1366 char *cgname, *offset, *tmp;
1367 int idx = 0;
1368 struct cgfsng_handler_data *d = hdata;
ccb4cabe
SH
1369
1370 if (!d)
1371 return false;
43654d34 1372
ccb4cabe
SH
1373 if (d->container_cgroup) {
1374 WARN("cgfsng_create called a second time");
1375 return false;
1376 }
1377
43654d34 1378 if (d->cgroup_meta.dir)
7d531e9b 1379 tmp = lxc_string_join("/", (const char *[]){d->cgroup_meta.dir, d->name, NULL}, false);
43654d34
CB
1380 else
1381 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
ccb4cabe
SH
1382 if (!tmp) {
1383 ERROR("Failed expanding cgroup name pattern");
1384 return false;
1385 }
1a0e70ac 1386 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
ccb4cabe
SH
1387 cgname = must_alloc(len);
1388 strcpy(cgname, tmp);
1389 free(tmp);
1390 offset = cgname + len - 5;
1391
1392again:
95adfe93
SH
1393 if (idx == 1000) {
1394 ERROR("Too many conflicting cgroup names");
ccb4cabe 1395 goto out_free;
95adfe93 1396 }
66b66624 1397 if (idx) {
bb30b52a
CB
1398 int ret;
1399
66b66624
CB
1400 ret = snprintf(offset, 5, "-%d", idx);
1401 if (ret < 0 || (size_t)ret >= 5) {
1402 FILE *f = fopen("/dev/null", "w");
1403 if (f >= 0) {
1404 fprintf(f, "Workaround for GCC7 bug: "
1405 "https://gcc.gnu.org/bugzilla/"
1406 "show_bug.cgi?id=78969");
1407 fclose(f);
1408 }
1409 }
1410 }
457ca9aa
SH
1411 for (i = 0; hierarchies[i]; i++) {
1412 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1413 int j;
1a0e70ac 1414 ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
457ca9aa
SH
1415 free(hierarchies[i]->fullcgpath);
1416 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1417 for (j = 0; j < i; j++)
457ca9aa 1418 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1419 idx++;
1420 goto again;
1421 }
1422 }
1423 /* Done */
1424 d->container_cgroup = cgname;
1425 return true;
1426
1427out_free:
1428 free(cgname);
1429 return false;
1430}
1431
ccb4cabe
SH
1432static bool cgfsng_enter(void *hdata, pid_t pid)
1433{
ccb4cabe
SH
1434 char pidstr[25];
1435 int i, len;
1436
1437 len = snprintf(pidstr, 25, "%d", pid);
1438 if (len < 0 || len > 25)
1439 return false;
1440
457ca9aa
SH
1441 for (i = 0; hierarchies[i]; i++) {
1442 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1443 "cgroup.procs", NULL);
1444 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1445 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1446 free(fullpath);
1447 return false;
1448 }
1449 free(fullpath);
1450 }
1451
1452 return true;
1453}
1454
/* Argument bundle passed to chown_cgroup_wrapper() through userns_exec_1(). */
struct chown_data {
	/* handler data of the container being set up */
	struct cgfsng_handler_data *d;
	/* target uid in parent namespace */
	uid_t origuid;
};
1459
c0888dfe
SH
1460/*
1461 * chgrp the container cgroups to container group. We leave
1462 * the container owner as cgroup owner. So we must make the
1463 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1464 *
1465 * Also chown the tasks and cgroup.procs files. Those may not
1466 * exist depending on kernel version.
c0888dfe 1467 */
ccb4cabe
SH
1468static int chown_cgroup_wrapper(void *data)
1469{
1470 struct chown_data *arg = data;
ccb4cabe
SH
1471 uid_t destuid;
1472 int i;
1473
1474 if (setresgid(0,0,0) < 0)
1475 SYSERROR("Failed to setgid to 0");
1476 if (setresuid(0,0,0) < 0)
1477 SYSERROR("Failed to setuid to 0");
1478 if (setgroups(0, NULL) < 0)
1479 SYSERROR("Failed to clear groups");
1480
1481 destuid = get_ns_uid(arg->origuid);
1482
457ca9aa
SH
1483 for (i = 0; hierarchies[i]; i++) {
1484 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1485
1486 if (chown(path, destuid, 0) < 0) {
ab8f5424 1487 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1488 return -1;
1489 }
c0888dfe 1490
43647298 1491 if (chmod(path, 0775) < 0) {
ab8f5424 1492 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1493 return -1;
1494 }
ccb4cabe 1495
ab8f5424
SH
1496 /*
1497 * Failures to chown these are inconvenient but not detrimental
1498 * We leave these owned by the container launcher, so that container
1499 * root can write to the files to attach. We chmod them 664 so that
1500 * container systemd can write to the files (which systemd in wily
1501 * insists on doing)
1502 */
43647298
SH
1503 fullpath = must_make_path(path, "tasks", NULL);
1504 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1505 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1506 strerror(errno));
ab8f5424 1507 if (chmod(fullpath, 0664) < 0)
13277ec4 1508 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1509 free(fullpath);
1510
1511 fullpath = must_make_path(path, "cgroup.procs", NULL);
1512 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1513 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1514 strerror(errno));
ab8f5424 1515 if (chmod(fullpath, 0664) < 0)
13277ec4 1516 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe
SH
1517 free(fullpath);
1518 }
1519
1520 return 0;
1521}
1522
058c1cb6 1523static bool cgfsng_chown(void *hdata, struct lxc_conf *conf)
ccb4cabe
SH
1524{
1525 struct cgfsng_handler_data *d = hdata;
1526 struct chown_data wrap;
1527
1528 if (!d)
1529 return false;
1530
1531 if (lxc_list_empty(&conf->id_map))
1532 return true;
1533
1534 wrap.d = d;
1535 wrap.origuid = geteuid();
1536
c9b7c33e
CB
1537 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1538 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1539 ERROR("Error requesting cgroup chown in new namespace");
1540 return false;
1541 }
1542
1543 return true;
1544}
1545
8aa1044f
SH
1546/*
1547 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1548 * symlinks any more - just use mount
1549 */
1550
1551/* mount cgroup-full if requested */
1552static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1553 char *container_cgroup)
1554{
1555 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1556 return 0;
1557 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1558 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1559 dest);
1560 return -1;
1561 }
1562 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1563 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1564 MS_REMOUNT | MS_RDONLY;
1565 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1566 SYSERROR("Error remounting %s readonly", dest);
1567 return -1;
1568 }
1569 }
1570
1571 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1572 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1573 return 0;
1574
1575 /* mount just the container path rw */
1576 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1577 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1578 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1579 WARN("Failed to mount %s read-write: %s", rwpath,
1580 strerror(errno));
8aa1044f
SH
1581 INFO("Made %s read-write", rwpath);
1582 free(rwpath);
1583 free(source);
1584 return 0;
1585}
1586
1587/* cgroup-full:* is done, no need to create subdirs */
1588static bool cg_mount_needs_subdirs(int type)
1589{
1590 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1591 return false;
1592 return true;
1593}
1594
1595/*
1596 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1597 * created, remount controller ro if needed and bindmount the
1598 * cgroupfs onto controll/the/cg/path
1599 */
1600static int
1601do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1602 char *controllerpath, char *cgpath,
1603 const char *container_cgroup)
1604{
1605 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1606 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1607 SYSERROR("Error bind-mounting %s", controllerpath);
1608 return -1;
1609 }
1610 if (mount(controllerpath, controllerpath, "cgroup",
1611 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1612 SYSERROR("Error remounting %s read-only", controllerpath);
1613 return -1;
1614 }
1615 INFO("Remounted %s read-only", controllerpath);
1616 }
1617 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1618 int flags = MS_BIND;
1619 if (type == LXC_AUTO_CGROUP_RO)
1620 flags |= MS_RDONLY;
1621 INFO("Mounting %s onto %s", sourcepath, cgpath);
1622 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1623 free(sourcepath);
1624 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1625 cgpath);
1626 return -1;
1627 }
1628 free(sourcepath);
1629 INFO("Completed second stage cgroup automounts for %s", cgpath);
1630 return 0;
1631}
1632
a760603e 1633static int mount_cgroup_cgns_supported(int type, struct hierarchy *h, const char *controllerpath)
b635e92d
CB
1634{
1635 int ret;
1636 char *controllers = NULL;
a760603e
CB
1637 char *fstype = "cgroup2";
1638 unsigned long flags = 0;
b635e92d 1639
a760603e
CB
1640 flags |= MS_NOSUID;
1641 flags |= MS_NOEXEC;
1642 flags |= MS_NODEV;
1643 flags |= MS_RELATIME;
1644
1645 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1646 flags |= MS_RDONLY;
1647
1648 if (!h->is_cgroup_v2) {
1649 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1650 if (!controllers)
1651 return -ENOMEM;
1652 fstype = "cgroup";
b635e92d
CB
1653 }
1654
a760603e 1655 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d
CB
1656 free(controllers);
1657 if (ret < 0) {
a760603e 1658 SYSERROR("Failed to mount %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1659 return -1;
1660 }
1661
a760603e 1662 DEBUG("Mounted %s with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1663 return 0;
1664}
1665
ccb4cabe
SH
1666static bool cgfsng_mount(void *hdata, const char *root, int type)
1667{
b635e92d 1668 int i;
8aa1044f
SH
1669 char *tmpfspath = NULL;
1670 bool retval = false;
b635e92d
CB
1671 struct lxc_handler *handler = hdata;
1672 struct cgfsng_handler_data *d = handler->cgroup_data;
1673 bool has_cgns = false, has_sys_admin = true;
8aa1044f
SH
1674
1675 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1676 return true;
1677
b635e92d
CB
1678 has_cgns = cgns_supported();
1679 if (!lxc_list_empty(&handler->conf->keepcaps))
1680 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1681 else
1682 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1683
1684 if (has_cgns && has_sys_admin)
ccb4cabe 1685 return true;
8aa1044f
SH
1686
1687 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1688
1689 if (type == LXC_AUTO_CGROUP_NOSPEC)
1690 type = LXC_AUTO_CGROUP_MIXED;
1691 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1692 type = LXC_AUTO_CGROUP_FULL_MIXED;
1693
1694 /* Mount tmpfs */
1695 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1696 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1697 "size=10240k,mode=755",
1698 root) < 0)
1699 goto bad;
1700
457ca9aa 1701 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1702 char *controllerpath, *path2;
457ca9aa 1703 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1704 char *controller = strrchr(h->mountpoint, '/');
1705 int r;
1706
1707 if (!controller)
1708 continue;
1709 controller++;
1710 controllerpath = must_make_path(tmpfspath, controller, NULL);
1711 if (dir_exists(controllerpath)) {
1712 free(controllerpath);
1713 continue;
1714 }
1715 if (mkdir(controllerpath, 0755) < 0) {
1716 SYSERROR("Error creating cgroup path: %s", controllerpath);
1717 free(controllerpath);
1718 goto bad;
1719 }
b635e92d
CB
1720
1721 if (has_cgns && !has_sys_admin) {
1722 /* If cgroup namespaces are supported but the container
1723 * will not have CAP_SYS_ADMIN after it has started we
1724 * need to mount the cgroups manually.
1725 */
a760603e 1726 r = mount_cgroup_cgns_supported(type, h, controllerpath);
b635e92d
CB
1727 free(controllerpath);
1728 if (r < 0)
1729 goto bad;
1730 continue;
1731 }
1732
8aa1044f
SH
1733 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1734 free(controllerpath);
1735 goto bad;
1736 }
1737 if (!cg_mount_needs_subdirs(type)) {
1738 free(controllerpath);
1739 continue;
1740 }
ef4413fa 1741 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1742 if (mkdir_p(path2, 0755) < 0) {
1743 free(controllerpath);
1744 goto bad;
1745 }
2f62fb00 1746
8aa1044f
SH
1747 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1748 d->container_cgroup);
1749 free(controllerpath);
1750 free(path2);
1751 if (r < 0)
1752 goto bad;
1753 }
1754 retval = true;
1755
1756bad:
1757 free(tmpfspath);
1758 return retval;
ccb4cabe
SH
1759}
1760
/*
 * Sum the line counts of every "cgroup.procs" file found in @dirname and
 * all of its sub-directories. A directory that cannot be opened contributes
 * 0; a cgroup.procs file whose line count is reported as -1 is skipped.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *entry;
	DIR *dir;
	char *path;
	int total = 0, nlines;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		struct stat st;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);

		/* descend into real sub-directories only */
		if (lstat(path, &st) == 0 && S_ISDIR(st.st_mode))
			total += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	nlines = lxc_count_file_lines(path);
	if (nlines != -1)
		total += nlines;
	free(path);

	(void) closedir(dir);

	return total;
}
1805
1806static int cgfsng_nrtasks(void *hdata) {
1807 struct cgfsng_handler_data *d = hdata;
1808 char *path;
1809 int count;
1810
457ca9aa 1811 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1812 return -1;
457ca9aa 1813 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1814 count = recursive_count_nrtasks(path);
1815 free(path);
1816 return count;
1817}
1818
1819/* Only root needs to escape to the cgroup of its init */
7103fe6f 1820static bool cgfsng_escape()
ccb4cabe 1821{
ccb4cabe
SH
1822 int i;
1823
1824 if (geteuid())
1825 return true;
1826
457ca9aa
SH
1827 for (i = 0; hierarchies[i]; i++) {
1828 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1829 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1830 "cgroup.procs", NULL);
1831 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1832 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1833 free(fullpath);
6df334d1 1834 return false;
ccb4cabe
SH
1835 }
1836 free(fullpath);
1837 }
1838
6df334d1 1839 return true;
ccb4cabe
SH
1840}
1841
36662416
TA
1842static int cgfsng_num_hierarchies(void)
1843{
1844 int i;
1845
1846 for (i = 0; hierarchies[i]; i++)
1847 ;
1848
1849 return i;
1850}
1851
1852static bool cgfsng_get_hierarchies(int n, char ***out)
1853{
1854 int i;
1855
1856 /* sanity check n */
1857 for (i = 0; i < n; i++) {
1858 if (!hierarchies[i])
1859 return false;
1860 }
1861
1862 *out = hierarchies[i]->controllers;
1863
1864 return true;
1865}
1866
ccb4cabe
SH
1867#define THAWED "THAWED"
1868#define THAWED_LEN (strlen(THAWED))
1869
1870static bool cgfsng_unfreeze(void *hdata)
1871{
ccb4cabe 1872 char *fullpath;
457ca9aa 1873 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1874
457ca9aa 1875 if (!h)
ccb4cabe
SH
1876 return false;
1877 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1878 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1879 free(fullpath);
1880 return false;
1881 }
1882 free(fullpath);
1883 return true;
1884}
1885
1886static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1887{
457ca9aa 1888 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1889 if (!h)
1890 return NULL;
1891
371f834d
SH
1892 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1893}
1894
1895/*
1896 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
1897 * full path, which must be freed by the caller.
1898 */
1899static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1900 const char *inpath,
1901 const char *filename)
1902{
371f834d 1903 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
1904}
1905
1906static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1907{
ccb4cabe
SH
1908 char pidstr[25];
1909 int i, len;
1910
1911 len = snprintf(pidstr, 25, "%d", pid);
1912 if (len < 0 || len > 25)
1913 return false;
1914
457ca9aa 1915 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1916 char *path, *fullpath;
457ca9aa 1917 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1918
1919 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1a0e70ac 1920 if (!path) /* not running */
ccb4cabe
SH
1921 continue;
1922
371f834d
SH
1923 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1924 free(path);
ccb4cabe
SH
1925 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1926 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1927 free(fullpath);
ccb4cabe
SH
1928 return false;
1929 }
ccb4cabe
SH
1930 free(fullpath);
1931 }
1932
ccb4cabe
SH
1933 return true;
1934}
1935
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Up to @len
 * bytes of the file are read into @value. Returns the result of
 * lxc_read_from_file(), or -1 when the container is not running or the
 * subsystem has no hierarchy.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	char *subsystem, *dot, *cgpath, *fullpath;
	struct hierarchy *h;
	int ret;

	subsystem = alloca(strlen(filename) + 1);
	strcpy(subsystem, filename);
	dot = strchr(subsystem, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (!cgpath) /* not running */
		return -1;

	ret = -1;
	h = get_hierarchy(subsystem);
	if (h) {
		fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
		ret = lxc_read_from_file(fullpath, value, len);
		free(fullpath);
	}

	free(cgpath);

	return ret;
}
1967
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns the
 * result of lxc_write_to_file(), or -1 when the container is not running or
 * the subsystem has no hierarchy.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	char *subsystem, *dot, *cgpath, *fullpath;
	struct hierarchy *h;
	int ret;

	subsystem = alloca(strlen(filename) + 1);
	strcpy(subsystem, filename);
	dot = strchr(subsystem, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (!cgpath) /* not running */
		return -1;

	ret = -1;
	h = get_hierarchy(subsystem);
	if (h) {
		fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false);
		free(fullpath);
	}

	free(cgpath);

	return ret;
}
1999
72add155
SH
2000/*
2001 * take devices cgroup line
2002 * /dev/foo rwx
2003 * and convert it to a valid
2004 * type major:minor mode
2005 * line. Return <0 on error. Dest is a preallocated buffer
2006 * long enough to hold the output.
2007 */
2008static int convert_devpath(const char *invalue, char *dest)
2009{
2a06d041
CB
2010 int n_parts;
2011 char *p, *path, type;
72add155
SH
2012 struct stat sb;
2013 unsigned long minor, major;
2a06d041
CB
2014 int ret = -EINVAL;
2015 char *mode = NULL;
72add155
SH
2016
2017 path = must_copy_string(invalue);
2018
2019 /*
2020 * read path followed by mode; ignore any trailing text.
2021 * A ' # comment' would be legal. Technically other text
2022 * is not legal, we could check for that if we cared to
2023 */
2024 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2c2d6c49
SH
2025 if (*p != ' ')
2026 continue;
2027 *p = '\0';
2028 if (n_parts != 1)
2029 break;
2030 p++;
2031 n_parts++;
2032 while (*p == ' ')
2033 p++;
2034 mode = p;
2035 if (*p == '\0')
2036 goto out;
72add155 2037 }
2c2d6c49
SH
2038
2039 if (n_parts == 1)
72add155 2040 goto out;
72add155
SH
2041
2042 ret = stat(path, &sb);
2043 if (ret < 0)
2044 goto out;
2045
72add155
SH
2046 mode_t m = sb.st_mode & S_IFMT;
2047 switch (m) {
2048 case S_IFBLK:
2049 type = 'b';
2050 break;
2051 case S_IFCHR:
2052 type = 'c';
2053 break;
2c2d6c49 2054 default:
72add155
SH
2055 ERROR("Unsupported device type %i for %s", m, path);
2056 ret = -EINVAL;
2057 goto out;
2058 }
2c2d6c49
SH
2059
2060 major = MAJOR(sb.st_rdev);
2061 minor = MINOR(sb.st_rdev);
2062 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2063 if (ret < 0 || ret >= 50) {
2a06d041
CB
2064 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2065 "chars)", type, major, minor, mode);
72add155
SH
2066 ret = -ENAMETOOLONG;
2067 goto out;
2068 }
2069 ret = 0;
2070
2071out:
2072 free(path);
2073 return ret;
2074}
2075
ccb4cabe
SH
2076/*
2077 * Called from setup_limits - here we have the container's cgroup_data because
2078 * we created the cgroups
2079 */
2080static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
2081{
b3646d7e 2082 char *fullpath, *p;
1a0e70ac
CB
2083 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2084 char converted_value[50];
b3646d7e
CB
2085 struct hierarchy *h;
2086 int ret = 0;
2087 char *controller = NULL;
ccb4cabe 2088
b3646d7e
CB
2089 controller = alloca(strlen(filename) + 1);
2090 strcpy(controller, filename);
2091 if ((p = strchr(controller, '.')) != NULL)
ccb4cabe
SH
2092 *p = '\0';
2093
c8bf519d 2094 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2095 ret = convert_devpath(value, converted_value);
2096 if (ret < 0)
c8bf519d 2097 return ret;
72add155
SH
2098 value = converted_value;
2099
c8bf519d 2100 }
2101
b3646d7e
CB
2102 h = get_hierarchy(controller);
2103 if (!h) {
2104 ERROR("Failed to setup limits for the \"%s\" controller. "
2105 "The controller seems to be unused by \"cgfsng\" cgroup "
2106 "driver or not enabled on the cgroup hierarchy",
2107 controller);
2108 return -1;
ccb4cabe 2109 }
b3646d7e
CB
2110
2111 fullpath = must_make_path(h->fullcgpath, filename, NULL);
2112 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
2113 free(fullpath);
ccb4cabe
SH
2114 return ret;
2115}
2116
2117static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
2118 bool do_devices)
2119{
2120 struct cgfsng_handler_data *d = hdata;
2121 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2122 struct lxc_cgroup *cg;
ccb4cabe
SH
2123 bool ret = false;
2124
2125 if (lxc_list_empty(cgroup_settings))
2126 return true;
2127
2128 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2129 if (!sorted_cgroup_settings) {
2130 return false;
2131 }
2132
ccb4cabe
SH
2133 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2134 cg = iterator->elem;
2135
2136 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
2137 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2138 if (do_devices && (errno == EACCES || errno == EPERM)) {
2139 WARN("Error setting %s to %s for %s",
2140 cg->subsystem, cg->value, d->name);
2141 continue;
2142 }
2143 SYSERROR("Error setting %s to %s for %s",
2144 cg->subsystem, cg->value, d->name);
2145 goto out;
2146 }
6a628f4a 2147 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 2148 }
ccb4cabe
SH
2149 }
2150
2151 ret = true;
2152 INFO("cgroup has been setup");
2153out:
ccb4cabe
SH
2154 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2155 lxc_list_del(iterator);
2156 free(iterator);
2157 }
2158 free(sorted_cgroup_settings);
2159 return ret;
2160}
2161
2162static struct cgroup_ops cgfsng_ops = {
2163 .init = cgfsng_init,
2164 .destroy = cgfsng_destroy,
2165 .create = cgfsng_create,
2166 .enter = cgfsng_enter,
ccb4cabe 2167 .escape = cgfsng_escape,
36662416
TA
2168 .num_hierarchies = cgfsng_num_hierarchies,
2169 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
2170 .get_cgroup = cgfsng_get_cgroup,
2171 .get = cgfsng_get,
2172 .set = cgfsng_set,
2173 .unfreeze = cgfsng_unfreeze,
2174 .setup_limits = cgfsng_setup_limits,
2175 .name = "cgroupfs-ng",
2176 .attach = cgfsng_attach,
058c1cb6 2177 .chown = cgfsng_chown,
ccb4cabe
SH
2178 .mount_cgroup = cgfsng_mount,
2179 .nrtasks = cgfsng_nrtasks,
2180 .driver = CGFSNG,
2181
2182 /* unsupported */
2183 .create_legacy = NULL,
2184};