]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
utils: move helpers from cgfsng.c to utils.{c,h}
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
d8e48992 50#include "bdev.h"
ccb4cabe 51#include "cgroup.h"
ccb4cabe 52#include "commands.h"
a54694f8
CB
53#include "log.h"
54#include "utils.h"
ccb4cabe
SH
55
56lxc_log_define(lxc_cgfsng, lxc);
57
58static struct cgroup_ops cgfsng_ops;
59
ccb4cabe
SH
/*
 * Describes one mounted cgroup hierarchy.
 * @controllers: either NULL, or a NULL-terminated list of all the
 *               co-mounted controllers
 * @mountpoint:  the mountpoint we will use; either
 *               /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup: the cgroup under which the container cgroup path is
 *               created: the caller's cgroup (if not root) or init's
 *               cgroup (if root)
 * @fullcgpath:  set to NULL when the hierarchy is registered; presumably
 *               the full path of the container's cgroup in this hierarchy
 *               once it has been created (TODO confirm against its users)
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
};
76
/*
 * Per-container cgroup state attached to the lxc_handler.
 * @cgroup_pattern   - copy of the lxc.cgroup.pattern configuration value
 * @container_cgroup - if not NULL, the cgroup which was created for the
 *                     container. In every hierarchy it lives below that
 *                     hierarchy's base_cgroup; relative to base_cgroup it
 *                     is identical for all hierarchies.
 * @name             - the container name
 */
struct cgfsng_handler_data {
	char *cgroup_pattern;
	char *container_cgroup; /* cgroup created for the container */
	char *name;             /* name of the container */
};
91
457ca9aa
SH
/*
 * @hierarchies - NULL-terminated array of struct hierarchy pointers, one
 * entry per hierarchy and without duplicates. The first sufficient,
 * writeable mounted hierarchy wins.
 */
struct hierarchy **hierarchies;

/* @cgroup_use - our copy of the lxc.cgroup.use configuration value */
char *cgroup_use;

/*
 * @lxc_cgfsng_debug - when true, the cgfsng driver prints debugging
 * information to stdout
 */
static bool lxc_cgfsng_debug;
109
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated array @clist and then the
 * array itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **it;

	if (!clist)
		return;

	for (it = clist; *it; it++)
		free(*it);

	free(clist);
}
120
ccb4cabe
SH
/*
 * Allocate @sz bytes by delegating to must_realloc() with no previous
 * buffer. Like must_realloc(), this is expected not to fail.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
126
ccb4cabe
SH
/*
 * Special case: return a newly allocated copy of @entry with 'name='
 * prepended, e.g. "systemd" becomes "name=systemd". Does not fail.
 */
static char *must_prefix_named(char *entry)
{
	/* sizeof("name=") already accounts for the terminating NUL byte. */
	size_t bufsz = strlen(entry) + sizeof("name=");
	char *named = must_alloc(bufsz);

	snprintf(named, bufsz, "name=%s", entry);
	return named;
}
141
/*
 * Grow the NULL-terminated pointer array *@list (which may be NULL) by
 * one slot and write the new NULL terminator. Does not fail.
 *
 * Returns the index of the slot just before the terminator, i.e. the
 * slot the caller is expected to fill in. That slot is not initialised
 * here.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list) {
		while ((*list)[count])
			count++;
	}

	/* One slot for the new entry, one for the terminator. */
	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
159
/*
 * Return true if @entry compares equal to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	for (it = list; it && *it; it++) {
		if (!strcmp(*it, entry))
			return true;
	}

	return false;
}
176
/*
 * Append a copy of controller @entry to the list *@clist. Does not fail.
 * *@clist must be NULL on the first call; the list stays NULL-terminated.
 *
 * Named subsystems are handled here as well:
 *  - an entry already starting with "name=" or listed in the kernel
 *    subsystem list @klist is copied verbatim;
 *  - any other entry is stored with a "name=" prefix;
 *  - an entry present in both @klist and the named subsystem list @nlist
 *    is ambiguous and is refused (logged, nothing appended).
 *    (TODO - this could be worked around in some cases by remounting to be
 *    unambiguous, or by comparing mountpoint contents with the current
 *    cgroup.)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller '%s'", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) != 0 && !is_kernel)
		(*clist)[slot] = must_prefix_named(entry);
	else
		(*clist)[slot] = must_copy_string(entry);
}
212
ccb4cabe
SH
213static void free_handler_data(struct cgfsng_handler_data *d)
214{
ccb4cabe
SH
215 free(d->cgroup_pattern);
216 free(d->container_cgroup);
217 free(d->name);
218 free(d);
219}
220
221/*
222 * Given a handler's cgroup data, return the struct hierarchy for the
223 * controller @c, or NULL if there is none.
224 */
457ca9aa 225struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
226{
227 int i;
228
457ca9aa 229 if (!hierarchies)
ccb4cabe 230 return NULL;
457ca9aa
SH
231 for (i = 0; hierarchies[i]; i++) {
232 if (string_in_list(hierarchies[i]->controllers, c))
233 return hierarchies[i];
ccb4cabe
SH
234 }
235 return NULL;
236}
237
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing it in multiples of
 * BATCH_SIZE. Nothing is done when a buffer already exists and @newlen
 * falls into the same number of batches as @oldlen.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
248
/*
 * Append the @newlen bytes long string @new to *@dest, which currently
 * holds @oldlen bytes. The buffer is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* Room for the old contents, the new line and the NUL terminator. */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	/* newlen + 1 so that the NUL terminator of @new is copied too. */
	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
257
/*
 * Slurp in a whole file, line by line.
 *
 * Returns a newly allocated, NUL-terminated buffer with the contents of
 * @fnam. Returns NULL if the file cannot be opened or if no line could
 * be read from it.
 */
static char *read_file(char *fnam)
{
	char *contents = NULL, *line = NULL;
	size_t linecap = 0, total = 0;
	int nread;
	FILE *fp;

	fp = fopen(fnam, "r");
	if (!fp)
		return NULL;

	for (;;) {
		nread = getline(&line, &linecap, fp);
		if (nread == -1)
			break;

		append_line(&contents, total, line, nread);
		total += nread;
	}

	free(line);
	fclose(fp);

	return contents;
}
277
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * Set bit number @bit in the bit array @bitarr.
 *
 * The mask is built from an unsigned 32-bit one: shifting the signed
 * int constant 1 by 31 positions would be undefined behaviour.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
297
/*
 * Create a cpumask from a cpulist, i.e. turn
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenised in place (it is modified). @nbits is the number of
 * bits the mask provides; any cpu number >= @nbits is rejected.
 *
 * Returns a newly allocated mask, or NULL on allocation failure, on a
 * descending range, or on a cpu number that does not fit into @nbits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *tok, *dash, *state = NULL;
	unsigned first, last;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!mask)
		return NULL;

	tok = strtok_r(buf, ",", &state);
	while (tok) {
		errno = 0;
		first = strtoul(tok, NULL, 0);

		/* A token is either a single cpu "N" or a range "N-M". */
		dash = strchr(tok, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);
		else
			last = first;

		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, mask);

		tok = strtok_r(NULL, ",", &state);
	}

	return mask;
}
339
a54694f8
CB
340/* Turn cpumask into simple, comma-separated cpulist. */
341static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
342{
343 size_t i;
344 int ret;
eab15c1e 345 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
346 char **cpulist = NULL;
347
348 for (i = 0; i <= nbits; i++) {
349 if (is_set(i, bitarr)) {
eab15c1e
CB
350 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
351 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
352 lxc_free_array((void **)cpulist, free);
353 return NULL;
354 }
355 if (lxc_append_string(&cpulist, numstr) < 0) {
356 lxc_free_array((void **)cpulist, free);
357 return NULL;
358 }
359 }
360 }
361 return lxc_string_join(",", (const char **)cpulist, false);
362}
363
/*
 * Return the last cpu number mentioned in @cpulist, a string such as
 * "0-3", "0,2-3" or "5". For the ascending lists the callers feed in
 * this is the highest cpu number.
 *
 * The number is the one following the last ',' or '-' in the string, or
 * the whole string if it contains neither separator.
 *
 * Returns the parsed number, or -1 if strtoul() reported an error through
 * errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash;
	char *last = cpulist;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	after_dash = strrchr(cpulist, '-');

	/*
	 * Start parsing right behind whichever separator occurs later.
	 * Both pointers are only compared when both are non-NULL, i.e. when
	 * they point into the same string.
	 */
	if (after_comma && (!after_dash || after_comma > after_dash))
		last = after_comma + 1;
	else if (after_dash)
		last = after_dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
398
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Write a cpuset.cpus value for the cgroup directory @path that is based
 * on the parent cgroup's cpuset.cpus with all isolated cpus (as listed
 * in __ISOL_CPUS) removed.
 *
 * @path:           cgroup directory; must contain a '/'. It is truncated
 *                  temporarily to build the parent's file name but is
 *                  always restored before this function returns.
 * @am_initialized: whether the cgroup was already initialised by someone
 *                  else. If so and there are no isolated cpus, nothing is
 *                  written.
 *
 * Behaviour:
 *  - no isolated cpus (file missing or not starting with a digit): copy
 *    the parent's cpuset.cpus unless @am_initialized;
 *  - isolated cpus exist but none of them is in the parent's cpuset:
 *    nothing is written;
 *  - otherwise the parent's cpuset minus the isolated cpus is written.
 *
 * Returns true on success, false on any error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { // bug... this shouldn't be possible
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/*
	 * Cut off the last path component to get the parent's cpuset.cpus
	 * and undo the cut right away, so that @path is intact for the
	 * caller on every exit path.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	/* <ctype.h> functions need an unsigned char value. */
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is the number of bits in both masks. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1; do not read beyond the masks. */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset.");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	/* Done with the parent's file name; switch to our own cpuset.cpus. */
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once. */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
535
e3a3fecf
SH
/*
 * Copy the contents of parent(@path)/@file to @path/@file.
 *
 * @path must contain a '/'. It is truncated temporarily to build the
 * parent's file name and is restored before any further work is done, so
 * the caller always gets it back intact.
 *
 * Returns true if the value was read and written, false otherwise.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { // bug... this shouldn't be possible
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* The first call only asks for the length of the file contents. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;
	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the buffer: it is printed with %s on write failure. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
573
574/*
575 * Initialize the cpuset hierarchy in first directory of @gname and
576 * set cgroup.clone_children so that children inherit settings.
577 * Since the h->base_path is populated by init or ourselves, we know
578 * it is already initialized.
579 */
a54694f8 580static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
581{
582 char *cgpath, *clonechildrenpath, v, *slash;
583
584 if (!string_in_list(h->controllers, "cpuset"))
585 return true;
586
587 if (*cgname == '/')
588 cgname++;
589 slash = strchr(cgname, '/');
590 if (slash)
591 *slash = '\0';
592
593 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
594 if (slash)
595 *slash = '/';
596 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
597 SYSERROR("Failed to create '%s'", cgpath);
598 free(cgpath);
599 return false;
600 }
6f9584d8 601
e3a3fecf
SH
602 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
603 if (!file_exists(clonechildrenpath)) { /* unified hierarchy doesn't have clone_children */
604 free(clonechildrenpath);
605 free(cgpath);
606 return true;
607 }
608 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
609 SYSERROR("Failed to read '%s'", clonechildrenpath);
610 free(clonechildrenpath);
611 free(cgpath);
612 return false;
613 }
614
a54694f8 615 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
616 if (!filter_and_set_cpus(cgpath, v == '1')) {
617 SYSERROR("Failed to remove isolated cpus.");
618 free(clonechildrenpath);
619 free(cgpath);
a54694f8 620 return false;
6f9584d8 621 }
a54694f8 622
e3a3fecf 623 if (v == '1') { /* already set for us by someone else */
6f9584d8 624 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
625 free(clonechildrenpath);
626 free(cgpath);
627 return true;
628 }
629
630 /* copy parent's settings */
a54694f8 631 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 632 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
633 free(cgpath);
634 free(clonechildrenpath);
635 return false;
636 }
637 free(cgpath);
638
639 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
640 /* Set clone_children so children inherit our settings */
641 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
642 free(clonechildrenpath);
643 return false;
644 }
645 free(clonechildrenpath);
646 return true;
647}
648
ccb4cabe
SH
/*
 * Return true if at least one string occurs in both of the
 * NULL-terminated string lists @l1 and @l2. A NULL list is treated as
 * empty.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it;

	if (!l2)
		return false;

	for (it = l1; it && *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
666
667/*
668 * For a null-terminated list of controllers @clist, return true if any of
669 * those controllers is already listed the null-terminated list of
670 * hierarchies @hlist. Realistically, if one is present, all must be present.
671 */
672static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
673{
674 int i;
675
676 if (!hlist)
677 return false;
678 for (i = 0; hlist[i]; i++)
679 if (controller_lists_intersect(hlist[i]->controllers, clist))
680 return true;
681 return false;
682
683}
684
685/*
686 * Return true if the controller @entry is found in the null-terminated
687 * list of hierarchies @hlist
688 */
689static bool controller_found(struct hierarchy **hlist, char *entry)
690{
691 int i;
692 if (!hlist)
693 return false;
694
695 for (i = 0; hlist[i]; i++)
696 if (string_in_list(hlist[i]->controllers, entry))
697 return true;
698 return false;
699}
700
701/*
c30b61c3
SH
702 * Return true if all of the controllers which we require have been found.
703 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 704 */
457ca9aa 705static bool all_controllers_found(void)
ccb4cabe
SH
706{
707 char *p, *saveptr = NULL;
457ca9aa 708 struct hierarchy ** hlist = hierarchies;
ccb4cabe 709
ccb4cabe
SH
710 if (!controller_found(hlist, "freezer")) {
711 ERROR("no freezer controller mountpoint found");
712 return false;
713 }
714
457ca9aa 715 if (!cgroup_use)
ccb4cabe 716 return true;
457ca9aa 717 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
718 p = strtok_r(NULL, ",", &saveptr)) {
719 if (!controller_found(hlist, p)) {
720 ERROR("no %s controller mountpoint found", p);
721 return false;
722 }
723 }
724 return true;
725}
726
/*
 * Return true if the fs type of the mountinfo line @line is fuse.lxcfs,
 * i.e. if the first " - " separator in the line is directly followed by
 * "fuse.lxcfs ".
 */
static bool is_lxcfs(const char *line)
{
	const char *sep = strstr(line, " - ");

	if (sep && strncmp(sep, " - fuse.lxcfs ", 14) == 0)
		return true;

	return false;
}
735
/*
 * Get the controllers from the mountinfo line @line.
 *
 * There are other ways we could get this info. For lxcfs, field 3 is
 * /cgroup/controller-list. For cgroupfs, we could parse the mount
 * options. But we simply assume that the mountpoint must be
 * /sys/fs/cgroup/controller-list.
 *
 * The mountpoint is the field following the fourth space of the line.
 * As a side effect the space terminating the mountpoint is overwritten
 * with a NUL byte in @line.
 *
 * Returns a newly allocated, NULL-terminated controller list, or NULL if
 * the line is malformed, is not mounted below /sys/fs/cgroup/, or
 * yielded no usable controller.
 */
static char **get_controllers(char **klist, char **nlist, char *line)
{
	char *field = line, *end, *tok, *state = NULL;
	char **controllers = NULL;
	int skipped;

	/* Skip the four fields preceding the mountpoint. */
	for (skipped = 0; skipped < 4; skipped++) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
	}

	/* note - if we change how mountinfo works, then our caller
	 * will need to verify /sys/fs/cgroup/ in this field */
	if (strncmp(field, "/sys/fs/cgroup/", 15) != 0) {
		INFO("cgfsng: found hierarchy not under /sys/fs/cgroup: \"%s\"", field);
		return NULL;
	}
	field += 15;

	end = strchr(field, ' ');
	if (!end) {
		ERROR("corrupt mountinfo");
		return NULL;
	}
	*end = '\0';

	tok = strtok_r(field, ",", &state);
	while (tok) {
		must_append_controller(klist, nlist, &controllers, tok);
		tok = strtok_r(NULL, ",", &state);
	}

	return controllers;
}
778
/*
 * Return true if the fs type of the mountinfo line @line is cgroup, i.e.
 * if the first " - " separator in the line is directly followed by
 * "cgroup ".
 */
static bool is_cgroupfs(char *line)
{
	char *sep = strstr(line, " - ");

	if (sep && strncmp(sep, " - cgroup ", 10) == 0)
		return true;

	return false;
}
787
788/* Add a controller to our list of hierarchies */
457ca9aa 789static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
790{
791 struct hierarchy *new;
792 int newentry;
793
794 new = must_alloc(sizeof(*new));
795 new->controllers = clist;
796 new->mountpoint = mountpoint;
797 new->base_cgroup = base_cgroup;
798 new->fullcgpath = NULL;
799
457ca9aa
SH
800 newentry = append_null_to_list((void ***)&hierarchies);
801 hierarchies[newentry] = new;
ccb4cabe
SH
802}
803
/*
 * Return a newly allocated copy of the mountpoint in @line, which is a
 * line from /proc/self/mountinfo. The mountpoint is the field following
 * the fourth space; it is expected to have been NUL-terminated already.
 *
 * Returns NULL if the line has fewer than four spaces.
 */
static char *get_mountpoint(char *line)
{
	char *field = line, *copy;
	size_t size;
	int remaining;

	for (remaining = 4; remaining > 0; remaining--) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
	}

	/* we've already stuck a \0 after the mountpoint */
	size = strlen(field) + 1;
	copy = must_alloc(size);
	memcpy(copy, field, size); /* the terminator is copied along */

	return copy;
}
827
/*
 * Given the multi-line string @p, return a newly allocated,
 * NUL-terminated copy of the current line without its '\n'.
 * Returns NULL if @p contains no '\n'.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t linelen;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	linelen = newline - p;
	copy = must_alloc(linelen + 1);
	memcpy(copy, p, linelen);
	copy[linelen] = '\0';

	return copy;
}
846
/*
 * @cgline points to the character after the first ':' of a line in a
 * \n-terminated /proc/self/cgroup file, i.e. to the comma-separated
 * controller list, which ends at the next ':'.
 *
 * Return true if controller @c is one of the entries of that list.
 * @cgline itself is not modified; the list is tokenised on a stack copy.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	char *colon, *list, *tok, *state = NULL;
	size_t listlen;

	colon = strchr(cgline, ':');
	if (!colon)
		return false;

	listlen = colon - cgline;
	list = alloca(listlen + 1);
	memcpy(list, cgline, listlen);
	list[listlen] = '\0';

	tok = strtok_r(list, ",", &state);
	while (tok) {
		if (!strcmp(tok, c))
			return true;
		tok = strtok_r(NULL, ",", &state);
	}

	return false;
}
873
/*
 * @basecginfo is a copy of /proc/$$/cgroup. Return a newly allocated
 * copy of the current cgroup for @controller, or NULL if no line lists
 * that controller or the data is malformed.
 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	char *cur = basecginfo;

	for (;;) {
		/* Step over the first field to reach the controller list. */
		cur = strchr(cur, ':');
		if (!cur)
			return NULL;
		cur++;

		if (controller_in_clist(cur, controller)) {
			/* The cgroup path follows the next ':'. */
			cur = strchr(cur, ':');
			if (!cur)
				return NULL;
			return copy_to_eol(cur + 1);
		}

		/* Not on this line; move on to the next one. */
		cur = strchr(cur, '\n');
		if (!cur)
			return NULL;
		cur++;
	}
}
901
ccb4cabe
SH
/*
 * Given a hierarchy @mountpoint and base @path, check via access(2)
 * whether the combined path is writeable for us, i.e. whether we can
 * create directories underneath it.
 */
static bool test_writeable(char *mountpoint, char *path)
{
	char *fullpath;
	bool writeable;

	fullpath = must_make_path(mountpoint, path, NULL);
	writeable = access(fullpath, W_OK) == 0;
	free(fullpath);

	return writeable;
}
915
/*
 * Append a copy of @entry to the NULL-terminated string list *@list,
 * which may be NULL when the list is still empty.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot;

	slot = append_null_to_list((void ***)list);
	(*list)[slot] = must_copy_string(entry);
}
924
/*
 * Parse /proc/self/cgroup and collect the subsystems listed in it:
 * entries starting with "name=" are appended to *@nlist (named
 * subsystems), all others to *@klist (kernel subsystems).
 *
 * If the file cannot be opened both lists are left untouched.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *fp;

	fp = fopen("/proc/self/cgroup", "r");
	if (!fp)
		return;

	while (getline(&line, &cap, fp) != -1) {
		char *ctrls, *end, *tok, *state = NULL;

		/* The controller list is the text between the first two ':'. */
		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		end = strchr(ctrls, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If we have a mixture between cgroup v1 and cgroup v2
		 * hierarchies, then /proc/self/cgroup contains entries of the
		 * form:
		 *
		 *	0::/some/path
		 *
		 * We need to skip those.
		 */
		if (end == ctrls)
			continue;

		tok = strtok_r(ctrls, ",", &state);
		while (tok) {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
			tok = strtok_r(NULL, ",", &state);
		}
	}

	fclose(fp);
	free(line);
}
967
/*
 * Strip trailing newlines from @s in place. The first character of the
 * string is never removed, so a string consisting of a single "\n" is
 * left unchanged.
 */
static void trim(char *s)
{
	size_t n;

	for (n = strlen(s); n > 1 && s[n - 1] == '\n'; n--)
		s[n - 1] = '\0';
}
974
e4aeecf5
CB
975static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
976{
977 printf("Cgroup information:\n");
978 printf(" container name: %s\n", d->name ? d->name : "(null)");
979 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
980 printf(" lxc.cgroup.pattern: %s\n", d->cgroup_pattern ? d->cgroup_pattern : "(null)");
981 printf(" cgroup: %s\n", d->container_cgroup ? d->container_cgroup : "(null)");
982}
983
984static void lxc_cgfsng_print_hierarchies()
ccb4cabe 985{
a7b0cc4c 986 struct hierarchy **it;
ccb4cabe 987 int i;
41c33dbe 988
457ca9aa 989 if (!hierarchies) {
e4aeecf5 990 printf(" No hierarchies found.");
ccb4cabe
SH
991 return;
992 }
e4aeecf5 993 printf(" Hierarchies:\n");
a7b0cc4c
CB
994 for (i = 0, it = hierarchies; it && *it; it++, i++) {
995 char **cit;
ccb4cabe 996 int j;
e4aeecf5
CB
997 printf(" %d: base_cgroup %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
998 printf(" mountpoint %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
999 printf(" controllers:\n");
a7b0cc4c 1000 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1001 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1002 }
1003}
41c33dbe 1004
/*
 * Print the raw base cgroup information @basecginfo followed by the
 * numbered kernel subsystems in @klist and the numbered named subsystems
 * in @nlist to stdout. Both lists may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	if (klist) {
		for (idx = 0; klist[idx]; idx++)
			printf("kernel subsystem %d: %s\n", idx, klist[idx]);
	}

	if (nlist) {
		for (idx = 0; nlist[idx]; idx++)
			printf("named subsystem %d: %s\n", idx, nlist[idx]);
	}
}
ccb4cabe 1018
e4aeecf5
CB
/*
 * Print all debug information of the driver to stdout: first the handler
 * data @d, then the list of hierarchies.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1024
ccb4cabe
SH
1025/*
1026 * At startup, parse_hierarchies finds all the info we need about
1027 * cgroup mountpoints and current cgroups, and stores it in @d.
1028 */
457ca9aa 1029static bool parse_hierarchies(void)
ccb4cabe
SH
1030{
1031 FILE *f;
1032 char * line = NULL, *basecginfo;
1033 char **klist = NULL, **nlist = NULL;
1034 size_t len = 0;
1035
d30ec4cb
SH
1036 /*
1037 * Root spawned containers escape the current cgroup, so use init's
1038 * cgroups as our base in that case.
1039 */
ccb4cabe
SH
1040 if (geteuid())
1041 basecginfo = read_file("/proc/self/cgroup");
1042 else
1043 basecginfo = read_file("/proc/1/cgroup");
1044 if (!basecginfo)
1045 return false;
1046
1047 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
d3b00a8f 1048 SYSERROR("Failed opening /proc/self/mountinfo");
ccb4cabe
SH
1049 return false;
1050 }
1051
1052 get_existing_subsystems(&klist, &nlist);
41c33dbe 1053
e4aeecf5
CB
1054 if (lxc_cgfsng_debug)
1055 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1056
1057 /* we support simple cgroup mounts and lxcfs mounts */
1058 while (getline(&line, &len, f) != -1) {
1059 char **controller_list = NULL;
1060 char *mountpoint, *base_cgroup;
1061
1062 if (!is_lxcfs(line) && !is_cgroupfs(line))
1063 continue;
1064
1065 controller_list = get_controllers(klist, nlist, line);
1066 if (!controller_list)
1067 continue;
1068
457ca9aa 1069 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1070 free(controller_list);
1071 continue;
1072 }
1073
1074 mountpoint = get_mountpoint(line);
1075 if (!mountpoint) {
1076 ERROR("Error reading mountinfo: bad line '%s'", line);
1077 free_string_list(controller_list);
1078 continue;
1079 }
1080
1081 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1082 if (!base_cgroup) {
1083 ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]);
1084 free_string_list(controller_list);
1085 free(mountpoint);
1086 continue;
1087 }
1088 trim(base_cgroup);
1089 prune_init_scope(base_cgroup);
1090 if (!test_writeable(mountpoint, base_cgroup)) {
1091 free_string_list(controller_list);
1092 free(mountpoint);
1093 free(base_cgroup);
1094 continue;
1095 }
457ca9aa 1096 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1097 }
1098
1099 free_string_list(klist);
1100 free_string_list(nlist);
1101
1102 free(basecginfo);
1103
1104 fclose(f);
1105 free(line);
1106
e4aeecf5
CB
1107 if (lxc_cgfsng_debug) {
1108 printf("writeable subsystems:\n");
1109 lxc_cgfsng_print_hierarchies();
1110 }
1111
ccb4cabe
SH
1112 /* verify that all controllers in cgroup.use and all crucial
1113 * controllers are accounted for
1114 */
5059aae9
SH
1115 if (!all_controllers_found()) {
1116 INFO("cgfsng: not all controllers were find, deferring to cgfs driver");
ccb4cabe 1117 return false;
5059aae9 1118 }
ccb4cabe
SH
1119
1120 return true;
1121}
1122
457ca9aa
SH
1123static bool collect_hierarchy_info(void)
1124{
1125 const char *tmp;
1126 errno = 0;
1127 tmp = lxc_global_config_value("lxc.cgroup.use");
1128 if (!cgroup_use && errno != 0) { // lxc.cgroup.use can be NULL
1129 SYSERROR("cgfsng: error reading list of cgroups to use");
1130 return false;
1131 }
1132 cgroup_use = must_copy_string(tmp);
1133
1134 return parse_hierarchies();
1135}
1136
ccb4cabe
SH
1137static void *cgfsng_init(const char *name)
1138{
1139 struct cgfsng_handler_data *d;
457ca9aa 1140 const char *cgroup_pattern;
ccb4cabe
SH
1141
1142 d = must_alloc(sizeof(*d));
1143 memset(d, 0, sizeof(*d));
1144
1145 d->name = must_copy_string(name);
1146
ccb4cabe
SH
1147 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
1148 if (!cgroup_pattern) { // lxc.cgroup.pattern is only NULL on error
1149 ERROR("Error getting cgroup pattern");
1150 goto out_free;
1151 }
1152 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1153
e4aeecf5
CB
1154 if (lxc_cgfsng_debug)
1155 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1156
1157 return d;
1158
1159out_free:
1160 free_handler_data(d);
1161 return NULL;
1162}
1163
ccb4cabe
SH
/*
 * Recursively remove the directory @dirname.
 *
 * Every sub-directory is removed depth-first before @dirname itself;
 * non-directory entries are left alone. Processing continues after a failure
 * so that as much as possible gets removed.
 *
 * Returns 0 on success and -1 if the directory could not be opened or any
 * step failed. Only the first failure is logged.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		char *pathname;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(pathname, &mystat)) {
			if (!r)
				WARN("failed to stat %s", pathname);
			r = -1;
		} else if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0)
				r = -1;
		}

		free(pathname);
	}

	if (rmdir(dirname) < 0) {
		if (!r)
			WARN("failed to delete %s: %s", dirname, strerror(errno));
		r = -1;
	}

	if (closedir(dir) < 0) {
		/* report the operation that actually failed */
		if (!r)
			WARN("failed to close directory %s: %s", dirname, strerror(errno));
		r = -1;
	}

	return r;
}
1215
/*
 * Callback form of cgroup_rmdir(): @data is the path to remove.
 *
 * Switches gid and uid to 0 and clears the supplementary groups first.
 * Failures of those calls are logged but do not stop the removal attempt.
 * Returns the result of cgroup_rmdir().
 */
static int rmdir_wrapper(void *data)
{
	if (setresgid(0, 0, 0) < 0)
		SYSERROR("Failed to setgid to 0");

	if (setresuid(0, 0, 0) < 0)
		SYSERROR("Failed to setuid to 0");

	if (setgroups(0, NULL) < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir((char *)data);
}
1229
1230void recursive_destroy(char *path, struct lxc_conf *conf)
1231{
1232 int r;
1233 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e 1234 r = userns_exec_1(conf, rmdir_wrapper, path, "rmdir_wrapper");
ccb4cabe
SH
1235 else
1236 r = cgroup_rmdir(path);
1237
1238 if (r < 0)
1c9da8da 1239 ERROR("Error destroying %s", path);
ccb4cabe
SH
1240}
1241
1242static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1243{
1244 struct cgfsng_handler_data *d = hdata;
1245
1246 if (!d)
1247 return;
1248
457ca9aa 1249 if (d->container_cgroup && hierarchies) {
ccb4cabe 1250 int i;
457ca9aa
SH
1251 for (i = 0; hierarchies[i]; i++) {
1252 struct hierarchy *h = hierarchies[i];
e2db2a89 1253 if (h->fullcgpath) {
ccb4cabe
SH
1254 recursive_destroy(h->fullcgpath, conf);
1255 free(h->fullcgpath);
1256 h->fullcgpath = NULL;
1257 }
1258 }
1259 }
1260
1261 free_handler_data(d);
1262}
1263
1264struct cgroup_ops *cgfsng_ops_init(void)
1265{
e4aeecf5
CB
1266 if (getenv("LXC_DEBUG_CGFSNG"))
1267 lxc_cgfsng_debug = true;
1268
457ca9aa
SH
1269 if (!collect_hierarchy_info())
1270 return NULL;
e4aeecf5 1271
ccb4cabe
SH
1272 return &cgfsng_ops;
1273}
1274
1275static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1276{
e3a3fecf 1277 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
6f9584d8
CB
1278 if (dir_exists(h->fullcgpath)) { // it must not already exist
1279 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1280 return false;
6f9584d8
CB
1281 }
1282 if (!handle_cpuset_hierarchy(h, cgname)) {
1283 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1284 return false;
6f9584d8 1285 }
e3a3fecf 1286 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1287}
1288
1289static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1290{
1291 if (rmdir(h->fullcgpath) < 0)
1292 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1293 free(h->fullcgpath);
1294 h->fullcgpath = NULL;
1295}
1296
1297/*
d30ec4cb 1298 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1299 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1300 */
1301static inline bool cgfsng_create(void *hdata)
1302{
1303 struct cgfsng_handler_data *d = hdata;
1304 char *tmp, *cgname, *offset;
1305 int i, idx = 0;
1306 size_t len;
1307
1308 if (!d)
1309 return false;
1310 if (d->container_cgroup) {
1311 WARN("cgfsng_create called a second time");
1312 return false;
1313 }
1314
1315 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
1316 if (!tmp) {
1317 ERROR("Failed expanding cgroup name pattern");
1318 return false;
1319 }
1320 len = strlen(tmp) + 5; // leave room for -NNN\0
1321 cgname = must_alloc(len);
1322 strcpy(cgname, tmp);
1323 free(tmp);
1324 offset = cgname + len - 5;
1325
1326again:
95adfe93
SH
1327 if (idx == 1000) {
1328 ERROR("Too many conflicting cgroup names");
ccb4cabe 1329 goto out_free;
95adfe93 1330 }
ccb4cabe
SH
1331 if (idx)
1332 snprintf(offset, 5, "-%d", idx);
457ca9aa
SH
1333 for (i = 0; hierarchies[i]; i++) {
1334 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1335 int j;
457ca9aa
SH
1336 SYSERROR("Failed to create %s: %s", hierarchies[i]->fullcgpath, strerror(errno));
1337 free(hierarchies[i]->fullcgpath);
1338 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1339 for (j = 0; j < i; j++)
457ca9aa 1340 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1341 idx++;
1342 goto again;
1343 }
1344 }
1345 /* Done */
1346 d->container_cgroup = cgname;
1347 return true;
1348
1349out_free:
1350 free(cgname);
1351 return false;
1352}
1353
ccb4cabe
SH
1354static bool cgfsng_enter(void *hdata, pid_t pid)
1355{
ccb4cabe
SH
1356 char pidstr[25];
1357 int i, len;
1358
1359 len = snprintf(pidstr, 25, "%d", pid);
1360 if (len < 0 || len > 25)
1361 return false;
1362
457ca9aa
SH
1363 for (i = 0; hierarchies[i]; i++) {
1364 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1365 "cgroup.procs", NULL);
1366 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1367 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1368 free(fullpath);
1369 return false;
1370 }
1371 free(fullpath);
1372 }
1373
1374 return true;
1375}
1376
/* Argument bundle handed to chown_cgroup_wrapper() via userns_exec_1(). */
struct chown_data {
	/* handler data of the container whose cgroups are chowned */
	struct cgfsng_handler_data *d;
	/* target uid in parent namespace */
	uid_t origuid;
};
1381
c0888dfe
SH
1382/*
1383 * chgrp the container cgroups to container group. We leave
1384 * the container owner as cgroup owner. So we must make the
1385 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1386 *
1387 * Also chown the tasks and cgroup.procs files. Those may not
1388 * exist depending on kernel version.
c0888dfe 1389 */
ccb4cabe
SH
1390static int chown_cgroup_wrapper(void *data)
1391{
1392 struct chown_data *arg = data;
ccb4cabe
SH
1393 uid_t destuid;
1394 int i;
1395
1396 if (setresgid(0,0,0) < 0)
1397 SYSERROR("Failed to setgid to 0");
1398 if (setresuid(0,0,0) < 0)
1399 SYSERROR("Failed to setuid to 0");
1400 if (setgroups(0, NULL) < 0)
1401 SYSERROR("Failed to clear groups");
1402
1403 destuid = get_ns_uid(arg->origuid);
1404
457ca9aa
SH
1405 for (i = 0; hierarchies[i]; i++) {
1406 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1407
1408 if (chown(path, destuid, 0) < 0) {
ab8f5424 1409 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1410 return -1;
1411 }
c0888dfe 1412
43647298 1413 if (chmod(path, 0775) < 0) {
ab8f5424 1414 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1415 return -1;
1416 }
ccb4cabe 1417
ab8f5424
SH
1418 /*
1419 * Failures to chown these are inconvenient but not detrimental
1420 * We leave these owned by the container launcher, so that container
1421 * root can write to the files to attach. We chmod them 664 so that
1422 * container systemd can write to the files (which systemd in wily
1423 * insists on doing)
1424 */
43647298
SH
1425 fullpath = must_make_path(path, "tasks", NULL);
1426 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1427 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1428 strerror(errno));
ab8f5424 1429 if (chmod(fullpath, 0664) < 0)
13277ec4 1430 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1431 free(fullpath);
1432
1433 fullpath = must_make_path(path, "cgroup.procs", NULL);
1434 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1435 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1436 strerror(errno));
ab8f5424 1437 if (chmod(fullpath, 0664) < 0)
13277ec4 1438 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe
SH
1439 free(fullpath);
1440 }
1441
1442 return 0;
1443}
1444
1445static bool cgfsns_chown(void *hdata, struct lxc_conf *conf)
1446{
1447 struct cgfsng_handler_data *d = hdata;
1448 struct chown_data wrap;
1449
1450 if (!d)
1451 return false;
1452
1453 if (lxc_list_empty(&conf->id_map))
1454 return true;
1455
1456 wrap.d = d;
1457 wrap.origuid = geteuid();
1458
c9b7c33e
CB
1459 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1460 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1461 ERROR("Error requesting cgroup chown in new namespace");
1462 return false;
1463 }
1464
1465 return true;
1466}
1467
8aa1044f
SH
1468/*
1469 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1470 * symlinks any more - just use mount
1471 */
1472
1473/* mount cgroup-full if requested */
1474static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1475 char *container_cgroup)
1476{
1477 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1478 return 0;
1479 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1480 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1481 dest);
1482 return -1;
1483 }
1484 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1485 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1486 MS_REMOUNT | MS_RDONLY;
1487 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1488 SYSERROR("Error remounting %s readonly", dest);
1489 return -1;
1490 }
1491 }
1492
1493 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1494 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1495 return 0;
1496
1497 /* mount just the container path rw */
1498 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1499 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1500 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1501 WARN("Failed to mount %s read-write: %s", rwpath,
1502 strerror(errno));
8aa1044f
SH
1503 INFO("Made %s read-write", rwpath);
1504 free(rwpath);
1505 free(source);
1506 return 0;
1507}
1508
1509/* cgroup-full:* is done, no need to create subdirs */
1510static bool cg_mount_needs_subdirs(int type)
1511{
1512 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1513 return false;
1514 return true;
1515}
1516
1517/*
1518 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1519 * created, remount controller ro if needed and bindmount the
1520 * cgroupfs onto controll/the/cg/path
1521 */
1522static int
1523do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1524 char *controllerpath, char *cgpath,
1525 const char *container_cgroup)
1526{
1527 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1528 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1529 SYSERROR("Error bind-mounting %s", controllerpath);
1530 return -1;
1531 }
1532 if (mount(controllerpath, controllerpath, "cgroup",
1533 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1534 SYSERROR("Error remounting %s read-only", controllerpath);
1535 return -1;
1536 }
1537 INFO("Remounted %s read-only", controllerpath);
1538 }
1539 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1540 int flags = MS_BIND;
1541 if (type == LXC_AUTO_CGROUP_RO)
1542 flags |= MS_RDONLY;
1543 INFO("Mounting %s onto %s", sourcepath, cgpath);
1544 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1545 free(sourcepath);
1546 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1547 cgpath);
1548 return -1;
1549 }
1550 free(sourcepath);
1551 INFO("Completed second stage cgroup automounts for %s", cgpath);
1552 return 0;
1553}
1554
ccb4cabe
SH
1555static bool cgfsng_mount(void *hdata, const char *root, int type)
1556{
8aa1044f
SH
1557 struct cgfsng_handler_data *d = hdata;
1558 char *tmpfspath = NULL;
1559 bool retval = false;
a8de4c49 1560 int i;
8aa1044f
SH
1561
1562 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1563 return true;
1564
ccb4cabe
SH
1565 if (cgns_supported())
1566 return true;
8aa1044f
SH
1567
1568 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1569
1570 if (type == LXC_AUTO_CGROUP_NOSPEC)
1571 type = LXC_AUTO_CGROUP_MIXED;
1572 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1573 type = LXC_AUTO_CGROUP_FULL_MIXED;
1574
1575 /* Mount tmpfs */
1576 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1577 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1578 "size=10240k,mode=755",
1579 root) < 0)
1580 goto bad;
1581
457ca9aa 1582 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1583 char *controllerpath, *path2;
457ca9aa 1584 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1585 char *controller = strrchr(h->mountpoint, '/');
1586 int r;
1587
1588 if (!controller)
1589 continue;
1590 controller++;
1591 controllerpath = must_make_path(tmpfspath, controller, NULL);
1592 if (dir_exists(controllerpath)) {
1593 free(controllerpath);
1594 continue;
1595 }
1596 if (mkdir(controllerpath, 0755) < 0) {
1597 SYSERROR("Error creating cgroup path: %s", controllerpath);
1598 free(controllerpath);
1599 goto bad;
1600 }
1601 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1602 free(controllerpath);
1603 goto bad;
1604 }
1605 if (!cg_mount_needs_subdirs(type)) {
1606 free(controllerpath);
1607 continue;
1608 }
ef4413fa 1609 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1610 if (mkdir_p(path2, 0755) < 0) {
1611 free(controllerpath);
1612 goto bad;
1613 }
2f62fb00 1614
8aa1044f
SH
1615 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1616 d->container_cgroup);
1617 free(controllerpath);
1618 free(path2);
1619 if (r < 0)
1620 goto bad;
1621 }
1622 retval = true;
1623
1624bad:
1625 free(tmpfspath);
1626 return retval;
ccb4cabe
SH
1627}
1628
/*
 * Count the lines of the "cgroup.procs" files in @dirname and in all of its
 * sub-directories.
 *
 * Entries that cannot be stat'ed and non-directories are skipped. A
 * directory that cannot be opened contributes 0, and so does a cgroup.procs
 * file for which lxc_count_file_lines() returns -1.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *entry;
	DIR *dir;
	char *path;
	int total = 0, lines;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		struct stat sb;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);

		/* descend into sub-directories only */
		if (lstat(path, &sb) == 0 && S_ISDIR(sb.st_mode))
			total += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	lines = lxc_count_file_lines(path);
	if (lines != -1)
		total += lines;
	free(path);

	(void) closedir(dir);

	return total;
}
1673
1674static int cgfsng_nrtasks(void *hdata) {
1675 struct cgfsng_handler_data *d = hdata;
1676 char *path;
1677 int count;
1678
457ca9aa 1679 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1680 return -1;
457ca9aa 1681 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1682 count = recursive_count_nrtasks(path);
1683 free(path);
1684 return count;
1685}
1686
1687/* Only root needs to escape to the cgroup of its init */
7103fe6f 1688static bool cgfsng_escape()
ccb4cabe 1689{
ccb4cabe
SH
1690 int i;
1691
1692 if (geteuid())
1693 return true;
1694
457ca9aa
SH
1695 for (i = 0; hierarchies[i]; i++) {
1696 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1697 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1698 "cgroup.procs", NULL);
1699 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1700 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1701 free(fullpath);
6df334d1 1702 return false;
ccb4cabe
SH
1703 }
1704 free(fullpath);
1705 }
1706
6df334d1 1707 return true;
ccb4cabe
SH
1708}
1709
36662416
TA
1710static int cgfsng_num_hierarchies(void)
1711{
1712 int i;
1713
1714 for (i = 0; hierarchies[i]; i++)
1715 ;
1716
1717 return i;
1718}
1719
1720static bool cgfsng_get_hierarchies(int n, char ***out)
1721{
1722 int i;
1723
1724 /* sanity check n */
1725 for (i = 0; i < n; i++) {
1726 if (!hierarchies[i])
1727 return false;
1728 }
1729
1730 *out = hierarchies[i]->controllers;
1731
1732 return true;
1733}
1734
ccb4cabe
SH
1735#define THAWED "THAWED"
1736#define THAWED_LEN (strlen(THAWED))
1737
1738static bool cgfsng_unfreeze(void *hdata)
1739{
ccb4cabe 1740 char *fullpath;
457ca9aa 1741 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1742
457ca9aa 1743 if (!h)
ccb4cabe
SH
1744 return false;
1745 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1746 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1747 free(fullpath);
1748 return false;
1749 }
1750 free(fullpath);
1751 return true;
1752}
1753
1754static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1755{
457ca9aa 1756 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1757 if (!h)
1758 return NULL;
1759
371f834d
SH
1760 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1761}
1762
1763/*
1764 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
1765 * full path, which must be freed by the caller.
1766 */
1767static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1768 const char *inpath,
1769 const char *filename)
1770{
371f834d 1771 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
1772}
1773
1774static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1775{
ccb4cabe
SH
1776 char pidstr[25];
1777 int i, len;
1778
1779 len = snprintf(pidstr, 25, "%d", pid);
1780 if (len < 0 || len > 25)
1781 return false;
1782
457ca9aa 1783 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1784 char *path, *fullpath;
457ca9aa 1785 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1786
1787 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1788 if (!path) // not running
1789 continue;
1790
371f834d
SH
1791 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1792 free(path);
ccb4cabe
SH
1793 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1794 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1795 free(fullpath);
ccb4cabe
SH
1796 return false;
1797 }
ccb4cabe
SH
1798 free(fullpath);
1799 }
1800
ccb4cabe
SH
1801 return true;
1802}
1803
1804/*
1805 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
1806 * Here we don't have a cgroup_data set up, so we ask the running
1807 * container through the commands API for the cgroup path
1808 */
1809static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
1810{
1811 char *subsystem, *p, *path;
ccb4cabe
SH
1812 struct hierarchy *h;
1813 int ret = -1;
1814
1815 subsystem = alloca(strlen(filename) + 1);
1816 strcpy(subsystem, filename);
1817 if ((p = strchr(subsystem, '.')) != NULL)
1818 *p = '\0';
1819
1820 path = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
1821 if (!path) // not running
1822 return -1;
1823
457ca9aa 1824 h = get_hierarchy(subsystem);
ccb4cabe 1825 if (h) {
371f834d 1826 char *fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe
SH
1827 ret = lxc_read_from_file(fullpath, value, len);
1828 free(fullpath);
1829 }
1830
ccb4cabe
SH
1831 free(path);
1832
1833 return ret;
1834}
1835
1836/*
1837 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
1838 * Here we don't have a cgroup_data set up, so we ask the running
1839 * container through the commands API for the cgroup path
1840 */
1841static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
1842{
1843 char *subsystem, *p, *path;
ccb4cabe
SH
1844 struct hierarchy *h;
1845 int ret = -1;
1846
1847 subsystem = alloca(strlen(filename) + 1);
1848 strcpy(subsystem, filename);
1849 if ((p = strchr(subsystem, '.')) != NULL)
1850 *p = '\0';
1851
1852 path = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
1853 if (!path) // not running
1854 return -1;
1855
457ca9aa 1856 h = get_hierarchy(subsystem);
ccb4cabe 1857 if (h) {
371f834d 1858 char *fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe
SH
1859 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
1860 free(fullpath);
1861 }
1862
ccb4cabe
SH
1863 free(path);
1864
1865 return ret;
1866}
1867
ccb4cabe
SH
1868/*
1869 * Called from setup_limits - here we have the container's cgroup_data because
1870 * we created the cgroups
1871 */
1872static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
1873{
1874 char *subsystem = NULL, *p;
1875 int ret = -1;
1876 struct hierarchy *h;
1877
1878 subsystem = alloca(strlen(filename) + 1);
1879 strcpy(subsystem, filename);
1880 if ((p = strchr(subsystem, '.')) != NULL)
1881 *p = '\0';
1882
457ca9aa 1883 h = get_hierarchy(subsystem);
ccb4cabe
SH
1884 if (h) {
1885 char *fullpath = must_make_path(h->fullcgpath, filename, NULL);
1886 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
1887 free(fullpath);
1888 }
1889 return ret;
1890}
1891
1892static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
1893 bool do_devices)
1894{
1895 struct cgfsng_handler_data *d = hdata;
1896 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
1897 struct lxc_cgroup *cg;
ccb4cabe
SH
1898 bool ret = false;
1899
1900 if (lxc_list_empty(cgroup_settings))
1901 return true;
1902
1903 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
1904 if (!sorted_cgroup_settings) {
1905 return false;
1906 }
1907
ccb4cabe
SH
1908 lxc_list_for_each(iterator, sorted_cgroup_settings) {
1909 cg = iterator->elem;
1910
1911 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
1912 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
1913 if (do_devices && (errno == EACCES || errno == EPERM)) {
1914 WARN("Error setting %s to %s for %s",
1915 cg->subsystem, cg->value, d->name);
1916 continue;
1917 }
1918 SYSERROR("Error setting %s to %s for %s",
1919 cg->subsystem, cg->value, d->name);
1920 goto out;
1921 }
6a628f4a 1922 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 1923 }
ccb4cabe
SH
1924 }
1925
1926 ret = true;
1927 INFO("cgroup has been setup");
1928out:
ccb4cabe
SH
1929 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
1930 lxc_list_del(iterator);
1931 free(iterator);
1932 }
1933 free(sorted_cgroup_settings);
1934 return ret;
1935}
1936
1937static struct cgroup_ops cgfsng_ops = {
1938 .init = cgfsng_init,
1939 .destroy = cgfsng_destroy,
1940 .create = cgfsng_create,
1941 .enter = cgfsng_enter,
ccb4cabe 1942 .escape = cgfsng_escape,
36662416
TA
1943 .num_hierarchies = cgfsng_num_hierarchies,
1944 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
1945 .get_cgroup = cgfsng_get_cgroup,
1946 .get = cgfsng_get,
1947 .set = cgfsng_set,
1948 .unfreeze = cgfsng_unfreeze,
1949 .setup_limits = cgfsng_setup_limits,
1950 .name = "cgroupfs-ng",
1951 .attach = cgfsng_attach,
1952 .chown = cgfsns_chown,
1953 .mount_cgroup = cgfsng_mount,
1954 .nrtasks = cgfsng_nrtasks,
1955 .driver = CGFSNG,
1956
1957 /* unsupported */
1958 .create_legacy = NULL,
1959};