]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
lvm: check whether lxc.bdev.lvm.vg is set
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
d8e48992 50#include "bdev.h"
ccb4cabe 51#include "cgroup.h"
6328fd9c 52#include "cgroup_utils.h"
ccb4cabe 53#include "commands.h"
a54694f8
CB
54#include "log.h"
55#include "utils.h"
ccb4cabe
SH
56
57lxc_log_define(lxc_cgfsng, lxc);
58
59static struct cgroup_ops cgfsng_ops;
60
ccb4cabe
SH
/*
 * Descriptor for one mounted cgroup hierarchy.
 * @controllers:  either NULL, or a NULL-terminated list of all the
 *                controllers co-mounted on this hierarchy
 * @mountpoint:   the mountpoint we will use; either
 *                /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup:  the cgroup under which the container cgroup path is
 *                created: the caller's cgroup (if not root) or init's
 *                cgroup (if root)
 * @fullcgpath:   initialised to NULL when the hierarchy is recorded;
 *                presumably the full path of the container's cgroup once
 *                it has been created (TODO confirm against the users)
 * @is_cgroup_v2: true when this entry describes the cgroup v2 hierarchy
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
	bool is_cgroup_v2;
};
78
/*
 * The cgroup data which is attached to the lxc_handler.
 * @cgroup_pattern:   a copy of lxc.cgroup.pattern
 * @container_cgroup: if not NULL, the cgroup which was created for the
 *                    container. For each hierarchy it is created under
 *                    that hierarchy's base_cgroup directory; relative to
 *                    the base_cgroup it is the same for all hierarchies.
 * @name:             the container name
 */
struct cgfsng_handler_data {
	char *cgroup_pattern;
	char *container_cgroup; /* cgroup we created for the container */
	char *name;             /* container name */
};
93
457ca9aa
SH
94/*
95 * @hierarchies - a NULL-terminated array of struct hierarchy, one per
96 * hierarchy. No duplicates. First sufficient, writeable mounted
97 * hierarchy wins
98 */
99struct hierarchy **hierarchies;
100
101/*
102 * @cgroup_use - a copy of the lxc.cgroup.use
103 */
104char *cgroup_use;
105
e4aeecf5
CB
106/*
107 * @lxc_cgfsng_debug - whether to print debug info to stdout for the cgfsng
108 * driver
109 */
110static bool lxc_cgfsng_debug;
111
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated list @clist and then the list
 * itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);
	free(clist);
}
122
ccb4cabe
SH
/*
 * Allocate @sz bytes, do not fail: the request is forwarded to
 * must_realloc() with no previous buffer.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
128
ccb4cabe
SH
/*
 * Special case: return a newly allocated copy of @entry with 'name='
 * prepended, i.e. turn "systemd" into "name=systemd".
 * Do not fail.
 */
static char *must_prefix_named(char *entry)
{
	/* sizeof("name=") counts the five prefix bytes plus the terminator */
	size_t buflen = strlen(entry) + sizeof("name=");
	char *named = must_alloc(buflen);

	snprintf(named, buflen, "name=%s", entry);
	return named;
}
143
/*
 * Grow the NULL-terminated pointer array *@list by one slot. Do not fail.
 *
 * The array is reallocated to hold one more entry and the new last slot
 * is set to NULL. The return value is the index of the slot just before
 * that terminator, which the caller is expected to fill. *@list may be
 * NULL, in which case a fresh two-slot array is created and slot 0 is
 * left uninitialised until the caller stores into it.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list) {
		while ((*list)[count])
			count++;
	}

	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;
	return count;
}
161
/*
 * Return true if @entry compares equal to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	for (cur = list; cur && *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
178
/*
 * Append a copy of controller @entry to *@clist. Do not fail.
 * *@clist must be NULL the first time we are called; afterwards its last
 * entry is always NULL.
 *
 * Named subsystems are handled here as well:
 *  - an entry already starting with "name=" or found in the kernel
 *    subsystem list @klist is copied unchanged;
 *  - any other entry is treated as a named subsystem and gets "name="
 *    prefixed;
 *  - an entry present in both @klist and the named list @nlist is
 *    ambiguous, so it is reported and NOT appended. (TODO - this could be
 *    worked around in some cases by remounting to be unambiguous, or by
 *    comparing mountpoint contents with the current cgroup.)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	char *copy;
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller '%s'", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		copy = must_copy_string(entry);
	else
		copy = must_prefix_named(entry);

	(*clist)[slot] = copy;
}
214
ccb4cabe
SH
215static void free_handler_data(struct cgfsng_handler_data *d)
216{
ccb4cabe
SH
217 free(d->cgroup_pattern);
218 free(d->container_cgroup);
219 free(d->name);
220 free(d);
221}
222
223/*
224 * Given a handler's cgroup data, return the struct hierarchy for the
225 * controller @c, or NULL if there is none.
226 */
457ca9aa 227struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
228{
229 int i;
230
457ca9aa 231 if (!hierarchies)
ccb4cabe 232 return NULL;
457ca9aa
SH
233 for (i = 0; hierarchies[i]; i++) {
234 if (string_in_list(hierarchies[i]->controllers, c))
235 return hierarchies[i];
ccb4cabe
SH
236 }
237 return NULL;
238}
239
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing it in multiples of
 * BATCH_SIZE. The buffer is (re)allocated only when it does not exist yet
 * or when @newlen falls into a higher batch than @oldlen did.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int needed = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;

	if (*mem && needed <= have)
		return;

	*mem = must_realloc(*mem, needed * BATCH_SIZE);
}
250
/*
 * Append the @newlen bytes of @new, plus the terminating byte that follows
 * them, to *@dest at offset @oldlen, growing *@dest first as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	batch_realloc(dest, oldlen, oldlen + newlen + 1);
	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
259
/*
 * Slurp in a whole file, line by line.
 *
 * Returns a heap buffer holding the concatenated lines, or NULL when the
 * file cannot be opened or yields no lines at all. The caller frees it.
 */
static char *read_file(char *fnam)
{
	FILE *f;
	char *line = NULL, *buf = NULL;
	size_t len = 0, fulllen = 0;
	int linelen;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	for (;;) {
		linelen = getline(&line, &len, f);
		if (linelen == -1)
			break;

		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}

	fclose(f);
	free(line);
	return buf;
}
279
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below address bit @bit of the uint32_t array @bitarr. The
 * mask is built from an unsigned 32-bit one: shifting a plain (signed)
 * int 1 into bit 31 would be undefined behaviour.
 */

/* Set bit @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= ((uint32_t)1 << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~((uint32_t)1 << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & ((uint32_t)1 << (bit % NBITS))) != 0;
}
299
/* Create a cpumask from a cpulist, i.e. turn:
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenised in place (it is modified). @nbits is the number of
 * bits the mask must be able to hold. Returns a calloc()ed array the
 * caller frees, or NULL on allocation failure, on a range whose start
 * exceeds its end, or on a cpu number that is not below @nbits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *saveptr = NULL;
	char *token;
	uint32_t *bitarr;

	bitarr = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!bitarr)
		return NULL;

	token = strtok_r(buf, ",", &saveptr);
	while (token) {
		unsigned first, last, bit;
		char *dash;

		errno = 0;
		first = strtoul(token, NULL, 0);
		last = first;

		/* "a-b" denotes an inclusive range, a bare "a" a single cpu */
		dash = strchr(token, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		if (first > last || last >= nbits) {
			free(bitarr);
			return NULL;
		}

		for (bit = first; bit <= last; bit++)
			set_bit(bit, bitarr);

		token = strtok_r(NULL, ",", &saveptr);
	}

	return bitarr;
}
341
a54694f8
CB
342/* Turn cpumask into simple, comma-separated cpulist. */
343static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
344{
345 size_t i;
346 int ret;
eab15c1e 347 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
348 char **cpulist = NULL;
349
350 for (i = 0; i <= nbits; i++) {
351 if (is_set(i, bitarr)) {
eab15c1e
CB
352 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
353 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
354 lxc_free_array((void **)cpulist, free);
355 return NULL;
356 }
357 if (lxc_append_string(&cpulist, numstr) < 0) {
358 lxc_free_array((void **)cpulist, free);
359 return NULL;
360 }
361 }
362 }
363 return lxc_string_join(",", (const char **)cpulist, false);
364}
365
/*
 * Return the number that follows the last ',' or '-' in @cpulist (or the
 * number at the start of the string when it contains neither). For a
 * sorted cpulist such as "0,2-3" this is the highest cpu number.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last = cpulist;
	char *comma, *dash;
	size_t cpus = 0;

	comma = strrchr(cpulist, ',');
	dash = strrchr(cpulist, '-');

	/* Pick whichever separator occurs later. Pointers are compared only
	 * when both are non-NULL.
	 */
	if (comma && (!dash || comma > dash))
		last = comma + 1;
	else if (dash)
		last = dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
400
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Populate @path/cpuset.cpus from the parent cgroup's cpuset.cpus with any
 * isolated cpus (as listed in __ISOL_CPUS) removed.
 *
 * @path:           cgroup directory; its parent is the directory obtained
 *                  by cutting @path at the last '/'. @path is modified
 *                  only temporarily and is intact again on return.
 * @am_initialized: true if the cgroup was already set up by someone else
 *                  (the caller passes whether cgroup.clone_children read
 *                  as '1'). In that case nothing is written unless
 *                  isolated cpus actually have to be removed.
 *
 * Returns true on success, false on any read, parse or write failure.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { // bug... this shouldn't be possible
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/* Cut @path at the last slash just long enough to build the parent's
	 * file name, then restore it right away so that no exit path can
	 * leave the caller's string truncated.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	/* <ctype.h> functions require a value representable as unsigned char */
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* Both masks are sized for the highest cpu number seen, plus one. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1; testing bit maxposs would read
	 * past the end of the masks when maxposs is a multiple of 32.
	 */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		/* Already initialized by someone: nothing more to do. */
		if (am_initialized)
			goto on_success;
		/* Not initialized yet: the parent's cpus must still be copied.
		 * posscpus was tokenised by lxc_cpumask() above, so the list
		 * is regenerated from the (unchanged) mask instead.
		 */
		DEBUG("Copying cpuset of parent cgroup.");
	} else {
		DEBUG("Removed isolated cpus from cpuset.");
	}

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	/* Done with the parent's file name; switch to the child's. */
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
537
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * The parent is the directory obtained by cutting @path at its last '/'.
 * @path is modified only temporarily and is intact again on return,
 * whatever the outcome. Returns true if the value was read and written.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { // bug... this shouldn't be possible
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	/* Build the parent's file name, then immediately undo the cut so the
	 * error paths below cannot leave @path truncated.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* First call only asks for the length of the contents. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the buffer: it is printed with %s if the write fails. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
575
576/*
577 * Initialize the cpuset hierarchy in first directory of @gname and
578 * set cgroup.clone_children so that children inherit settings.
579 * Since the h->base_path is populated by init or ourselves, we know
580 * it is already initialized.
581 */
a54694f8 582static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
583{
584 char *cgpath, *clonechildrenpath, v, *slash;
585
586 if (!string_in_list(h->controllers, "cpuset"))
587 return true;
588
589 if (*cgname == '/')
590 cgname++;
591 slash = strchr(cgname, '/');
592 if (slash)
593 *slash = '\0';
594
595 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
596 if (slash)
597 *slash = '/';
598 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
599 SYSERROR("Failed to create '%s'", cgpath);
600 free(cgpath);
601 return false;
602 }
6f9584d8 603
e3a3fecf 604 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
605 /* unified hierarchy doesn't have clone_children */
606 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
607 free(clonechildrenpath);
608 free(cgpath);
609 return true;
610 }
611 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
612 SYSERROR("Failed to read '%s'", clonechildrenpath);
613 free(clonechildrenpath);
614 free(cgpath);
615 return false;
616 }
617
a54694f8 618 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
619 if (!filter_and_set_cpus(cgpath, v == '1')) {
620 SYSERROR("Failed to remove isolated cpus.");
621 free(clonechildrenpath);
622 free(cgpath);
a54694f8 623 return false;
6f9584d8 624 }
a54694f8 625
e3a3fecf 626 if (v == '1') { /* already set for us by someone else */
6f9584d8 627 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
628 free(clonechildrenpath);
629 free(cgpath);
630 return true;
631 }
632
633 /* copy parent's settings */
a54694f8 634 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 635 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
636 free(cgpath);
637 free(clonechildrenpath);
638 return false;
639 }
640 free(cgpath);
641
642 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
643 /* Set clone_children so children inherit our settings */
644 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
645 free(clonechildrenpath);
646 return false;
647 }
648 free(clonechildrenpath);
649 return true;
650}
651
ccb4cabe
SH
/*
 * Given two NULL-terminated lists of strings, return true if any string
 * is in both. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (!l1 || !l2)
		return false;

	for (cur = l1; *cur; cur++) {
		if (string_in_list(l2, *cur))
			return true;
	}

	return false;
}
669
670/*
671 * For a null-terminated list of controllers @clist, return true if any of
672 * those controllers is already listed the null-terminated list of
673 * hierarchies @hlist. Realistically, if one is present, all must be present.
674 */
675static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
676{
677 int i;
678
679 if (!hlist)
680 return false;
681 for (i = 0; hlist[i]; i++)
682 if (controller_lists_intersect(hlist[i]->controllers, clist))
683 return true;
684 return false;
685
686}
687
688/*
689 * Return true if the controller @entry is found in the null-terminated
690 * list of hierarchies @hlist
691 */
692static bool controller_found(struct hierarchy **hlist, char *entry)
693{
694 int i;
695 if (!hlist)
696 return false;
697
698 for (i = 0; hlist[i]; i++)
699 if (string_in_list(hlist[i]->controllers, entry))
700 return true;
701 return false;
702}
703
704/*
c30b61c3
SH
705 * Return true if all of the controllers which we require have been found.
706 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 707 */
457ca9aa 708static bool all_controllers_found(void)
ccb4cabe
SH
709{
710 char *p, *saveptr = NULL;
457ca9aa 711 struct hierarchy ** hlist = hierarchies;
ccb4cabe 712
ccb4cabe
SH
713 if (!controller_found(hlist, "freezer")) {
714 ERROR("no freezer controller mountpoint found");
715 return false;
716 }
717
457ca9aa 718 if (!cgroup_use)
ccb4cabe 719 return true;
457ca9aa 720 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
721 p = strtok_r(NULL, ",", &saveptr)) {
722 if (!controller_found(hlist, p)) {
723 ERROR("no %s controller mountpoint found", p);
724 return false;
725 }
726 }
727 return true;
728}
729
/*
 * Return true if the fs type is fuse.lxcfs, i.e. if the first " - "
 * separator in the mountinfo @line is directly followed by "fuse.lxcfs ".
 */
static bool is_lxcfs(const char *line)
{
	static const char marker[] = " - fuse.lxcfs ";
	const char *sep = strstr(line, " - ");

	return sep && strncmp(sep, marker, sizeof(marker) - 1) == 0;
}
738
/*
 * Get the controllers from a mountinfo line.
 *
 * There are other ways we could get this info. For lxcfs, field 3 is
 * /cgroup/controller-list. For cgroupfs, we could parse the mount
 * options. But we simply assume that the mountpoint (the field reached
 * after skipping four spaces) is /sys/fs/cgroup/controller-list.
 *
 * @line is modified: a NUL byte is written right after the mountpoint.
 * Returns a NULL-terminated controller list built with
 * must_append_controller() against @klist/@nlist, or NULL if the line
 * cannot be parsed or the mount is not under /sys/fs/cgroup/.
 */
static char **get_controllers(char **klist, char **nlist, char *line)
{
	static const char cgroot[] = "/sys/fs/cgroup/";
	const size_t cgrootlen = sizeof(cgroot) - 1;
	char *field = line, *end, *name, *state = NULL;
	char **controllers = NULL;
	bool is_cgroup_v2;
	int skipped;

	/* handle cgroup v2; must be decided before @line is cut below */
	is_cgroup_v2 = is_cgroupfs_v2(line);

	for (skipped = 0; skipped < 4; skipped++) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
	}

	/* note - if we change how mountinfo works, then our caller
	 * will need to verify /sys/fs/cgroup/ in this field */
	if (strncmp(field, cgroot, cgrootlen) != 0) {
		INFO("cgfsng: found hierarchy not under /sys/fs/cgroup: \"%s\"", field);
		return NULL;
	}
	field += cgrootlen;

	end = strchr(field, ' ');
	if (!end) {
		ERROR("corrupt mountinfo");
		return NULL;
	}
	*end = '\0';

	/* cgroup v2 does not have separate mountpoints for controllers */
	if (is_cgroup_v2) {
		must_append_controller(klist, nlist, &controllers, "cgroup2");
		return controllers;
	}

	name = strtok_r(field, ",", &state);
	while (name) {
		must_append_controller(klist, nlist, &controllers, name);
		name = strtok_r(NULL, ",", &state);
	}

	return controllers;
}
792
ccb4cabe 793/* Add a controller to our list of hierarchies */
457ca9aa 794static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
795{
796 struct hierarchy *new;
797 int newentry;
798
799 new = must_alloc(sizeof(*new));
800 new->controllers = clist;
801 new->mountpoint = mountpoint;
802 new->base_cgroup = base_cgroup;
803 new->fullcgpath = NULL;
804
6328fd9c
CB
805 /* record if this is the cgroup v2 hierarchy */
806 if (!strcmp(base_cgroup, "cgroup2"))
807 new->is_cgroup_v2 = true;
808 else
809 new->is_cgroup_v2 = false;
810
457ca9aa
SH
811 newentry = append_null_to_list((void ***)&hierarchies);
812 hierarchies[newentry] = new;
ccb4cabe
SH
813}
814
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * The mountpoint is the text after the fourth space. Everything from
 * there to the end of the string is copied, which relies on a '\0'
 * having already been stuck after the mountpoint. Returns NULL if the
 * line has fewer than four spaces.
 */
static char *get_mountpoint(char *line)
{
	char *field = line, *copy;
	size_t len;
	int skipped = 0;

	while (skipped < 4) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
		skipped++;
	}

	len = strlen(field);
	copy = must_alloc(len + 1);
	memcpy(copy, field, len);
	copy[len] = '\0';
	return copy;
}
838
/*
 * Given a multi-line string, return a NUL-terminated copy of the current
 * line (without its '\n'). Returns NULL if no '\n' follows @p.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	len = newline - p;
	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';
	return copy;
}
857
/*
 * @cgline: pointer to the character after the first ':' in a line of a
 * \n-terminated /proc/self/cgroup file. Check whether controller @c is
 * present in the comma-separated list that runs up to the next ':'.
 *
 * @cgline itself is not modified; the list is tokenised on a stack copy.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	char *colon, *list, *name, *state = NULL;
	size_t listlen;

	colon = strchr(cgline, ':');
	if (!colon)
		return false;

	listlen = colon - cgline;
	list = alloca(listlen + 1);
	memcpy(list, cgline, listlen);
	list[listlen] = '\0';

	name = strtok_r(list, ",", &state);
	while (name) {
		if (!strcmp(name, c))
			return true;
		name = strtok_r(NULL, ",", &state);
	}

	return false;
}
884
/*
 * @basecginfo is a copy of /proc/$$/cgroup. Return a newly allocated copy
 * of the current cgroup for @controller, or NULL if no matching line is
 * found or the file is malformed.
 *
 * When @controller is "cgroup2", a line whose first character is '0' is
 * taken as the match without consulting its controller list.
 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	bool want_v2 = !strcmp(controller, "cgroup2");
	char *cur = basecginfo;

	for (;;) {
		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		bool v2_line = want_v2 && (*cur == '0');

		/* step over the hierarchy id */
		cur = strchr(cur, ':');
		if (!cur)
			return NULL;
		cur++;

		if (v2_line || controller_in_clist(cur, controller)) {
			/* step over the controller list to reach the path */
			cur = strchr(cur, ':');
			if (!cur)
				return NULL;
			return copy_to_eol(cur + 1);
		}

		/* no match: advance to the start of the next line */
		cur = strchr(cur, '\n');
		if (!cur)
			return NULL;
		cur++;
	}
}
920
ccb4cabe
SH
/*
 * Append a copy of @entry to the NULL-terminated string list *@list,
 * growing the list by one slot. Do not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
929
/*
 * Read /proc/self/cgroup and collect the subsystems listed there:
 * entries starting with "name=" are appended to *@nlist (named
 * subsystems), all others to *@klist (kernel subsystems). If the file
 * cannot be opened, both lists are left as they are.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t cap = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return;

	while (getline(&line, &cap, f) != -1) {
		char *ctrls, *end, *name, *state = NULL;

		/* the controller list sits between the first two ':' */
		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		end = strchr(ctrls, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		name = strtok_r(ctrls, ",", &state);
		while (name) {
			char ***target = klist;

			if (strncmp(name, "name=", 5) == 0)
				target = nlist;
			must_append_string(target, name);

			name = strtok_r(NULL, ",", &state);
		}
	}

	free(line);
	fclose(f);
}
973
/*
 * Strip trailing '\n' characters from @s in place. The first character of
 * the string is never removed, so a string consisting only of newlines is
 * reduced to a single "\n".
 */
static void trim(char *s)
{
	size_t end;

	for (end = strlen(s); end > 1 && s[end - 1] == '\n'; end--)
		s[end - 1] = '\0';
}
980
e4aeecf5
CB
981static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
982{
983 printf("Cgroup information:\n");
984 printf(" container name: %s\n", d->name ? d->name : "(null)");
985 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
986 printf(" lxc.cgroup.pattern: %s\n", d->cgroup_pattern ? d->cgroup_pattern : "(null)");
987 printf(" cgroup: %s\n", d->container_cgroup ? d->container_cgroup : "(null)");
988}
989
990static void lxc_cgfsng_print_hierarchies()
ccb4cabe 991{
a7b0cc4c 992 struct hierarchy **it;
ccb4cabe 993 int i;
41c33dbe 994
457ca9aa 995 if (!hierarchies) {
e4aeecf5 996 printf(" No hierarchies found.");
ccb4cabe
SH
997 return;
998 }
e4aeecf5 999 printf(" Hierarchies:\n");
a7b0cc4c
CB
1000 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1001 char **cit;
ccb4cabe 1002 int j;
e4aeecf5
CB
1003 printf(" %d: base_cgroup %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1004 printf(" mountpoint %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1005 printf(" controllers:\n");
a7b0cc4c 1006 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1007 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1008 }
1009}
41c33dbe 1010
/*
 * Print the raw cgroup file contents @basecginfo followed by the kernel
 * subsystem list @klist and the named subsystem list @nlist to stdout.
 * Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int k;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	for (k = 0; klist && klist[k]; k++)
		printf("kernel subsystem %d: %s\n", k, klist[k]);

	for (k = 0; nlist && nlist[k]; k++)
		printf("named subsystem %d: %s\n", k, nlist[k]);
}
ccb4cabe 1024
e4aeecf5
CB
/*
 * Dump all cgfsng debug information to stdout: first the handler data
 * @d, then the collected hierarchies.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1030
ccb4cabe
SH
1031/*
1032 * At startup, parse_hierarchies finds all the info we need about
1033 * cgroup mountpoints and current cgroups, and stores it in @d.
1034 */
457ca9aa 1035static bool parse_hierarchies(void)
ccb4cabe
SH
1036{
1037 FILE *f;
1038 char * line = NULL, *basecginfo;
1039 char **klist = NULL, **nlist = NULL;
1040 size_t len = 0;
1041
d30ec4cb
SH
1042 /*
1043 * Root spawned containers escape the current cgroup, so use init's
1044 * cgroups as our base in that case.
1045 */
ccb4cabe
SH
1046 if (geteuid())
1047 basecginfo = read_file("/proc/self/cgroup");
1048 else
1049 basecginfo = read_file("/proc/1/cgroup");
1050 if (!basecginfo)
1051 return false;
1052
1053 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
d3b00a8f 1054 SYSERROR("Failed opening /proc/self/mountinfo");
ccb4cabe
SH
1055 return false;
1056 }
1057
1058 get_existing_subsystems(&klist, &nlist);
41c33dbe 1059
e4aeecf5
CB
1060 if (lxc_cgfsng_debug)
1061 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1062
1063 /* we support simple cgroup mounts and lxcfs mounts */
1064 while (getline(&line, &len, f) != -1) {
1065 char **controller_list = NULL;
1066 char *mountpoint, *base_cgroup;
6328fd9c 1067 bool is_cgroup_v2, writeable;
ccb4cabe 1068
6328fd9c
CB
1069 is_cgroup_v2 = is_cgroupfs_v2(line);
1070 if (!is_lxcfs(line) && !is_cgroupfs_v1(line) && !is_cgroup_v2)
ccb4cabe
SH
1071 continue;
1072
1073 controller_list = get_controllers(klist, nlist, line);
1074 if (!controller_list)
1075 continue;
1076
457ca9aa 1077 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1078 free(controller_list);
1079 continue;
1080 }
1081
1082 mountpoint = get_mountpoint(line);
1083 if (!mountpoint) {
1084 ERROR("Error reading mountinfo: bad line '%s'", line);
1085 free_string_list(controller_list);
1086 continue;
1087 }
1088
1089 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1090 if (!base_cgroup) {
1091 ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]);
1092 free_string_list(controller_list);
1093 free(mountpoint);
1094 continue;
1095 }
6328fd9c 1096
ccb4cabe
SH
1097 trim(base_cgroup);
1098 prune_init_scope(base_cgroup);
6328fd9c
CB
1099 if (is_cgroup_v2)
1100 writeable = test_writeable_v2(mountpoint, base_cgroup);
1101 else
1102 writeable = test_writeable_v1(mountpoint, base_cgroup);
1103 if (!writeable) {
ccb4cabe
SH
1104 free_string_list(controller_list);
1105 free(mountpoint);
1106 free(base_cgroup);
1107 continue;
1108 }
457ca9aa 1109 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1110 }
1111
1112 free_string_list(klist);
1113 free_string_list(nlist);
1114
1115 free(basecginfo);
1116
1117 fclose(f);
1118 free(line);
1119
e4aeecf5
CB
1120 if (lxc_cgfsng_debug) {
1121 printf("writeable subsystems:\n");
1122 lxc_cgfsng_print_hierarchies();
1123 }
1124
ccb4cabe
SH
1125 /* verify that all controllers in cgroup.use and all crucial
1126 * controllers are accounted for
1127 */
5059aae9
SH
1128 if (!all_controllers_found()) {
1129 INFO("cgfsng: not all controllers were find, deferring to cgfs driver");
ccb4cabe 1130 return false;
5059aae9 1131 }
ccb4cabe
SH
1132
1133 return true;
1134}
1135
457ca9aa
SH
1136static bool collect_hierarchy_info(void)
1137{
1138 const char *tmp;
1139 errno = 0;
1140 tmp = lxc_global_config_value("lxc.cgroup.use");
1141 if (!cgroup_use && errno != 0) { // lxc.cgroup.use can be NULL
1142 SYSERROR("cgfsng: error reading list of cgroups to use");
1143 return false;
1144 }
1145 cgroup_use = must_copy_string(tmp);
1146
1147 return parse_hierarchies();
1148}
1149
ccb4cabe
SH
1150static void *cgfsng_init(const char *name)
1151{
1152 struct cgfsng_handler_data *d;
457ca9aa 1153 const char *cgroup_pattern;
ccb4cabe
SH
1154
1155 d = must_alloc(sizeof(*d));
1156 memset(d, 0, sizeof(*d));
1157
1158 d->name = must_copy_string(name);
1159
ccb4cabe
SH
1160 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
1161 if (!cgroup_pattern) { // lxc.cgroup.pattern is only NULL on error
1162 ERROR("Error getting cgroup pattern");
1163 goto out_free;
1164 }
1165 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1166
e4aeecf5
CB
1167 if (lxc_cgfsng_debug)
1168 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1169
1170 return d;
1171
1172out_free:
1173 free_handler_data(d);
1174 return NULL;
1175}
1176
ccb4cabe
SH
/*
 * Recursively remove the directory @dirname: every sub-directory is removed
 * first (depth-first), then @dirname itself. Non-directory entries are left
 * alone.
 *
 * Returns 0 on success and -1 if the directory could not be opened or any
 * step failed. Only the first failure is logged; the walk continues after a
 * failure so that as much as possible is removed.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		char *pathname;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(pathname, &mystat)) {
			if (!r)
				WARN("failed to stat %s", pathname);
			r = -1;
		} else if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0)
				r = -1;
		}

		free(pathname);
	}

	if (rmdir(dirname) < 0) {
		if (!r)
			WARN("failed to delete %s: %s", dirname, strerror(errno));
		r = -1;
	}

	if (closedir(dir) < 0) {
		/* This is a close failure, not a delete failure. */
		if (!r)
			WARN("failed to close directory %s: %s", dirname, strerror(errno));
		r = -1;
	}
	return r;
}
1228
/*
 * Callback handed to userns_exec_1(): try to switch gid and uid to 0 and to
 * drop the supplementary groups, then recursively remove the directory whose
 * path is passed in @data.
 *
 * A failure to change ids is logged but does not stop the removal attempt.
 * Returns the result of cgroup_rmdir().
 */
static int rmdir_wrapper(void *data)
{
	if (setresgid(0, 0, 0) < 0)
		SYSERROR("Failed to setgid to 0");

	if (setresuid(0, 0, 0) < 0)
		SYSERROR("Failed to setuid to 0");

	if (setgroups(0, NULL) < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir((char *)data);
}
1242
1243void recursive_destroy(char *path, struct lxc_conf *conf)
1244{
1245 int r;
1246 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e 1247 r = userns_exec_1(conf, rmdir_wrapper, path, "rmdir_wrapper");
ccb4cabe
SH
1248 else
1249 r = cgroup_rmdir(path);
1250
1251 if (r < 0)
1c9da8da 1252 ERROR("Error destroying %s", path);
ccb4cabe
SH
1253}
1254
1255static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1256{
1257 struct cgfsng_handler_data *d = hdata;
1258
1259 if (!d)
1260 return;
1261
457ca9aa 1262 if (d->container_cgroup && hierarchies) {
ccb4cabe 1263 int i;
457ca9aa
SH
1264 for (i = 0; hierarchies[i]; i++) {
1265 struct hierarchy *h = hierarchies[i];
e2db2a89 1266 if (h->fullcgpath) {
ccb4cabe
SH
1267 recursive_destroy(h->fullcgpath, conf);
1268 free(h->fullcgpath);
1269 h->fullcgpath = NULL;
1270 }
1271 }
1272 }
1273
1274 free_handler_data(d);
1275}
1276
1277struct cgroup_ops *cgfsng_ops_init(void)
1278{
e4aeecf5
CB
1279 if (getenv("LXC_DEBUG_CGFSNG"))
1280 lxc_cgfsng_debug = true;
1281
457ca9aa
SH
1282 if (!collect_hierarchy_info())
1283 return NULL;
e4aeecf5 1284
ccb4cabe
SH
1285 return &cgfsng_ops;
1286}
1287
1288static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1289{
e3a3fecf 1290 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
6f9584d8
CB
1291 if (dir_exists(h->fullcgpath)) { // it must not already exist
1292 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1293 return false;
6f9584d8
CB
1294 }
1295 if (!handle_cpuset_hierarchy(h, cgname)) {
1296 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1297 return false;
6f9584d8 1298 }
e3a3fecf 1299 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1300}
1301
1302static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1303{
1304 if (rmdir(h->fullcgpath) < 0)
1305 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1306 free(h->fullcgpath);
1307 h->fullcgpath = NULL;
1308}
1309
1310/*
d30ec4cb 1311 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1312 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1313 */
1314static inline bool cgfsng_create(void *hdata)
1315{
1316 struct cgfsng_handler_data *d = hdata;
1317 char *tmp, *cgname, *offset;
66b66624
CB
1318 int i, ret;
1319 int idx = 0;
ccb4cabe
SH
1320 size_t len;
1321
1322 if (!d)
1323 return false;
1324 if (d->container_cgroup) {
1325 WARN("cgfsng_create called a second time");
1326 return false;
1327 }
1328
1329 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
1330 if (!tmp) {
1331 ERROR("Failed expanding cgroup name pattern");
1332 return false;
1333 }
1334 len = strlen(tmp) + 5; // leave room for -NNN\0
1335 cgname = must_alloc(len);
1336 strcpy(cgname, tmp);
1337 free(tmp);
1338 offset = cgname + len - 5;
1339
1340again:
95adfe93
SH
1341 if (idx == 1000) {
1342 ERROR("Too many conflicting cgroup names");
ccb4cabe 1343 goto out_free;
95adfe93 1344 }
66b66624
CB
1345 if (idx) {
1346 ret = snprintf(offset, 5, "-%d", idx);
1347 if (ret < 0 || (size_t)ret >= 5) {
1348 FILE *f = fopen("/dev/null", "w");
1349 if (f >= 0) {
1350 fprintf(f, "Workaround for GCC7 bug: "
1351 "https://gcc.gnu.org/bugzilla/"
1352 "show_bug.cgi?id=78969");
1353 fclose(f);
1354 }
1355 }
1356 }
457ca9aa
SH
1357 for (i = 0; hierarchies[i]; i++) {
1358 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1359 int j;
457ca9aa
SH
1360 SYSERROR("Failed to create %s: %s", hierarchies[i]->fullcgpath, strerror(errno));
1361 free(hierarchies[i]->fullcgpath);
1362 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1363 for (j = 0; j < i; j++)
457ca9aa 1364 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1365 idx++;
1366 goto again;
1367 }
1368 }
1369 /* Done */
1370 d->container_cgroup = cgname;
1371 return true;
1372
1373out_free:
1374 free(cgname);
1375 return false;
1376}
1377
ccb4cabe
SH
1378static bool cgfsng_enter(void *hdata, pid_t pid)
1379{
ccb4cabe
SH
1380 char pidstr[25];
1381 int i, len;
1382
1383 len = snprintf(pidstr, 25, "%d", pid);
1384 if (len < 0 || len > 25)
1385 return false;
1386
457ca9aa
SH
1387 for (i = 0; hierarchies[i]; i++) {
1388 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1389 "cgroup.procs", NULL);
1390 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1391 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1392 free(fullpath);
1393 return false;
1394 }
1395 free(fullpath);
1396 }
1397
1398 return true;
1399}
1400
/* Argument bundle passed to chown_cgroup_wrapper(). */
struct chown_data {
	/* handler data of the container */
	struct cgfsng_handler_data *d;
	/* target uid in parent namespace */
	uid_t origuid;
};
1405
c0888dfe
SH
1406/*
1407 * chgrp the container cgroups to container group. We leave
1408 * the container owner as cgroup owner. So we must make the
1409 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1410 *
1411 * Also chown the tasks and cgroup.procs files. Those may not
1412 * exist depending on kernel version.
c0888dfe 1413 */
ccb4cabe
SH
1414static int chown_cgroup_wrapper(void *data)
1415{
1416 struct chown_data *arg = data;
ccb4cabe
SH
1417 uid_t destuid;
1418 int i;
1419
1420 if (setresgid(0,0,0) < 0)
1421 SYSERROR("Failed to setgid to 0");
1422 if (setresuid(0,0,0) < 0)
1423 SYSERROR("Failed to setuid to 0");
1424 if (setgroups(0, NULL) < 0)
1425 SYSERROR("Failed to clear groups");
1426
1427 destuid = get_ns_uid(arg->origuid);
1428
457ca9aa
SH
1429 for (i = 0; hierarchies[i]; i++) {
1430 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1431
1432 if (chown(path, destuid, 0) < 0) {
ab8f5424 1433 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1434 return -1;
1435 }
c0888dfe 1436
43647298 1437 if (chmod(path, 0775) < 0) {
ab8f5424 1438 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1439 return -1;
1440 }
ccb4cabe 1441
ab8f5424
SH
1442 /*
1443 * Failures to chown these are inconvenient but not detrimental
1444 * We leave these owned by the container launcher, so that container
1445 * root can write to the files to attach. We chmod them 664 so that
1446 * container systemd can write to the files (which systemd in wily
1447 * insists on doing)
1448 */
43647298
SH
1449 fullpath = must_make_path(path, "tasks", NULL);
1450 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1451 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1452 strerror(errno));
ab8f5424 1453 if (chmod(fullpath, 0664) < 0)
13277ec4 1454 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1455 free(fullpath);
1456
1457 fullpath = must_make_path(path, "cgroup.procs", NULL);
1458 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1459 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1460 strerror(errno));
ab8f5424 1461 if (chmod(fullpath, 0664) < 0)
13277ec4 1462 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe
SH
1463 free(fullpath);
1464 }
1465
1466 return 0;
1467}
1468
1469static bool cgfsns_chown(void *hdata, struct lxc_conf *conf)
1470{
1471 struct cgfsng_handler_data *d = hdata;
1472 struct chown_data wrap;
1473
1474 if (!d)
1475 return false;
1476
1477 if (lxc_list_empty(&conf->id_map))
1478 return true;
1479
1480 wrap.d = d;
1481 wrap.origuid = geteuid();
1482
c9b7c33e
CB
1483 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1484 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1485 ERROR("Error requesting cgroup chown in new namespace");
1486 return false;
1487 }
1488
1489 return true;
1490}
1491
8aa1044f
SH
1492/*
1493 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1494 * symlinks any more - just use mount
1495 */
1496
1497/* mount cgroup-full if requested */
1498static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1499 char *container_cgroup)
1500{
1501 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1502 return 0;
1503 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1504 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1505 dest);
1506 return -1;
1507 }
1508 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1509 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1510 MS_REMOUNT | MS_RDONLY;
1511 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1512 SYSERROR("Error remounting %s readonly", dest);
1513 return -1;
1514 }
1515 }
1516
1517 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1518 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1519 return 0;
1520
1521 /* mount just the container path rw */
1522 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1523 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1524 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1525 WARN("Failed to mount %s read-write: %s", rwpath,
1526 strerror(errno));
8aa1044f
SH
1527 INFO("Made %s read-write", rwpath);
1528 free(rwpath);
1529 free(source);
1530 return 0;
1531}
1532
1533/* cgroup-full:* is done, no need to create subdirs */
1534static bool cg_mount_needs_subdirs(int type)
1535{
1536 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1537 return false;
1538 return true;
1539}
1540
1541/*
1542 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1543 * created, remount controller ro if needed and bindmount the
1544 * cgroupfs onto controll/the/cg/path
1545 */
1546static int
1547do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1548 char *controllerpath, char *cgpath,
1549 const char *container_cgroup)
1550{
1551 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1552 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1553 SYSERROR("Error bind-mounting %s", controllerpath);
1554 return -1;
1555 }
1556 if (mount(controllerpath, controllerpath, "cgroup",
1557 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1558 SYSERROR("Error remounting %s read-only", controllerpath);
1559 return -1;
1560 }
1561 INFO("Remounted %s read-only", controllerpath);
1562 }
1563 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1564 int flags = MS_BIND;
1565 if (type == LXC_AUTO_CGROUP_RO)
1566 flags |= MS_RDONLY;
1567 INFO("Mounting %s onto %s", sourcepath, cgpath);
1568 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1569 free(sourcepath);
1570 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1571 cgpath);
1572 return -1;
1573 }
1574 free(sourcepath);
1575 INFO("Completed second stage cgroup automounts for %s", cgpath);
1576 return 0;
1577}
1578
ccb4cabe
SH
1579static bool cgfsng_mount(void *hdata, const char *root, int type)
1580{
8aa1044f
SH
1581 struct cgfsng_handler_data *d = hdata;
1582 char *tmpfspath = NULL;
1583 bool retval = false;
a8de4c49 1584 int i;
8aa1044f
SH
1585
1586 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1587 return true;
1588
ccb4cabe
SH
1589 if (cgns_supported())
1590 return true;
8aa1044f
SH
1591
1592 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1593
1594 if (type == LXC_AUTO_CGROUP_NOSPEC)
1595 type = LXC_AUTO_CGROUP_MIXED;
1596 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1597 type = LXC_AUTO_CGROUP_FULL_MIXED;
1598
1599 /* Mount tmpfs */
1600 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1601 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1602 "size=10240k,mode=755",
1603 root) < 0)
1604 goto bad;
1605
457ca9aa 1606 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1607 char *controllerpath, *path2;
457ca9aa 1608 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1609 char *controller = strrchr(h->mountpoint, '/');
1610 int r;
1611
1612 if (!controller)
1613 continue;
1614 controller++;
1615 controllerpath = must_make_path(tmpfspath, controller, NULL);
1616 if (dir_exists(controllerpath)) {
1617 free(controllerpath);
1618 continue;
1619 }
1620 if (mkdir(controllerpath, 0755) < 0) {
1621 SYSERROR("Error creating cgroup path: %s", controllerpath);
1622 free(controllerpath);
1623 goto bad;
1624 }
1625 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1626 free(controllerpath);
1627 goto bad;
1628 }
1629 if (!cg_mount_needs_subdirs(type)) {
1630 free(controllerpath);
1631 continue;
1632 }
ef4413fa 1633 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1634 if (mkdir_p(path2, 0755) < 0) {
1635 free(controllerpath);
1636 goto bad;
1637 }
2f62fb00 1638
8aa1044f
SH
1639 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1640 d->container_cgroup);
1641 free(controllerpath);
1642 free(path2);
1643 if (r < 0)
1644 goto bad;
1645 }
1646 retval = true;
1647
1648bad:
1649 free(tmpfspath);
1650 return retval;
ccb4cabe
SH
1651}
1652
/*
 * Count the lines of the cgroup.procs files found in @dirname and in all
 * directories below it.
 *
 * Returns the accumulated count. A directory that cannot be opened
 * contributes 0, as does a cgroup.procs file for which
 * lxc_count_file_lines() returns -1.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int count = 0, ret;
	char *path;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((direntp = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		path = must_make_path(dirname, direntp->d_name, NULL);

		/* only descend into entries that stat as directories */
		if (lstat(path, &mystat) == 0 && S_ISDIR(mystat.st_mode))
			count += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	ret = lxc_count_file_lines(path);
	if (ret != -1)
		count += ret;
	free(path);

	(void) closedir(dir);

	return count;
}
1697
1698static int cgfsng_nrtasks(void *hdata) {
1699 struct cgfsng_handler_data *d = hdata;
1700 char *path;
1701 int count;
1702
457ca9aa 1703 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1704 return -1;
457ca9aa 1705 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1706 count = recursive_count_nrtasks(path);
1707 free(path);
1708 return count;
1709}
1710
1711/* Only root needs to escape to the cgroup of its init */
7103fe6f 1712static bool cgfsng_escape()
ccb4cabe 1713{
ccb4cabe
SH
1714 int i;
1715
1716 if (geteuid())
1717 return true;
1718
457ca9aa
SH
1719 for (i = 0; hierarchies[i]; i++) {
1720 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1721 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1722 "cgroup.procs", NULL);
1723 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1724 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1725 free(fullpath);
6df334d1 1726 return false;
ccb4cabe
SH
1727 }
1728 free(fullpath);
1729 }
1730
6df334d1 1731 return true;
ccb4cabe
SH
1732}
1733
36662416
TA
1734static int cgfsng_num_hierarchies(void)
1735{
1736 int i;
1737
1738 for (i = 0; hierarchies[i]; i++)
1739 ;
1740
1741 return i;
1742}
1743
1744static bool cgfsng_get_hierarchies(int n, char ***out)
1745{
1746 int i;
1747
1748 /* sanity check n */
1749 for (i = 0; i < n; i++) {
1750 if (!hierarchies[i])
1751 return false;
1752 }
1753
1754 *out = hierarchies[i]->controllers;
1755
1756 return true;
1757}
1758
ccb4cabe
SH
1759#define THAWED "THAWED"
1760#define THAWED_LEN (strlen(THAWED))
1761
1762static bool cgfsng_unfreeze(void *hdata)
1763{
ccb4cabe 1764 char *fullpath;
457ca9aa 1765 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1766
457ca9aa 1767 if (!h)
ccb4cabe
SH
1768 return false;
1769 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1770 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1771 free(fullpath);
1772 return false;
1773 }
1774 free(fullpath);
1775 return true;
1776}
1777
1778static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1779{
457ca9aa 1780 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1781 if (!h)
1782 return NULL;
1783
371f834d
SH
1784 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1785}
1786
1787/*
1788 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
1789 * full path, which must be freed by the caller.
1790 */
1791static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1792 const char *inpath,
1793 const char *filename)
1794{
371f834d 1795 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
1796}
1797
1798static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1799{
ccb4cabe
SH
1800 char pidstr[25];
1801 int i, len;
1802
1803 len = snprintf(pidstr, 25, "%d", pid);
1804 if (len < 0 || len > 25)
1805 return false;
1806
457ca9aa 1807 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1808 char *path, *fullpath;
457ca9aa 1809 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1810
1811 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1812 if (!path) // not running
1813 continue;
1814
371f834d
SH
1815 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1816 free(path);
ccb4cabe
SH
1817 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1818 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1819 free(fullpath);
ccb4cabe
SH
1820 return false;
1821 }
ccb4cabe
SH
1822 free(fullpath);
1823 }
1824
ccb4cabe
SH
1825 return true;
1826}
1827
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path
 *
 * The subsystem is the part of @filename before the first '.'. Up to @len
 * bytes of the file are read into @value via lxc_read_from_file().
 *
 * Returns the result of lxc_read_from_file(), or -1 when the container gives
 * no cgroup path (not running) or no hierarchy matches the subsystem.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	char *controller, *dot, *cgpath, *fullpath;
	struct hierarchy *h;
	int ret;

	/* work on a stack copy so that the name can be cut at the dot */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	if (!cgpath) /* not running */
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgpath);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
	ret = lxc_read_from_file(fullpath, value, len);
	free(fullpath);
	free(cgpath);

	return ret;
}
1859
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path
 *
 * The subsystem is the part of @filename before the first '.'. @value is
 * written to the file via lxc_write_to_file().
 *
 * Returns the result of lxc_write_to_file(), or -1 when the container gives
 * no cgroup path (not running) or no hierarchy matches the subsystem.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	char *controller, *dot, *cgpath, *fullpath;
	struct hierarchy *h;
	int ret;

	/* work on a stack copy so that the name can be cut at the dot */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	if (!cgpath) /* not running */
		return -1;

	h = get_hierarchy(controller);
	if (!h) {
		free(cgpath);
		return -1;
	}

	fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
	ret = lxc_write_to_file(fullpath, value, strlen(value), false);
	free(fullpath);
	free(cgpath);

	return ret;
}
1891
ccb4cabe
SH
1892/*
1893 * Called from setup_limits - here we have the container's cgroup_data because
1894 * we created the cgroups
1895 */
1896static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
1897{
1898 char *subsystem = NULL, *p;
1899 int ret = -1;
1900 struct hierarchy *h;
1901
1902 subsystem = alloca(strlen(filename) + 1);
1903 strcpy(subsystem, filename);
1904 if ((p = strchr(subsystem, '.')) != NULL)
1905 *p = '\0';
1906
457ca9aa 1907 h = get_hierarchy(subsystem);
ccb4cabe
SH
1908 if (h) {
1909 char *fullpath = must_make_path(h->fullcgpath, filename, NULL);
1910 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
1911 free(fullpath);
1912 }
1913 return ret;
1914}
1915
1916static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
1917 bool do_devices)
1918{
1919 struct cgfsng_handler_data *d = hdata;
1920 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
1921 struct lxc_cgroup *cg;
ccb4cabe
SH
1922 bool ret = false;
1923
1924 if (lxc_list_empty(cgroup_settings))
1925 return true;
1926
1927 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
1928 if (!sorted_cgroup_settings) {
1929 return false;
1930 }
1931
ccb4cabe
SH
1932 lxc_list_for_each(iterator, sorted_cgroup_settings) {
1933 cg = iterator->elem;
1934
1935 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
1936 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
1937 if (do_devices && (errno == EACCES || errno == EPERM)) {
1938 WARN("Error setting %s to %s for %s",
1939 cg->subsystem, cg->value, d->name);
1940 continue;
1941 }
1942 SYSERROR("Error setting %s to %s for %s",
1943 cg->subsystem, cg->value, d->name);
1944 goto out;
1945 }
6a628f4a 1946 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 1947 }
ccb4cabe
SH
1948 }
1949
1950 ret = true;
1951 INFO("cgroup has been setup");
1952out:
ccb4cabe
SH
1953 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
1954 lxc_list_del(iterator);
1955 free(iterator);
1956 }
1957 free(sorted_cgroup_settings);
1958 return ret;
1959}
1960
1961static struct cgroup_ops cgfsng_ops = {
1962 .init = cgfsng_init,
1963 .destroy = cgfsng_destroy,
1964 .create = cgfsng_create,
1965 .enter = cgfsng_enter,
ccb4cabe 1966 .escape = cgfsng_escape,
36662416
TA
1967 .num_hierarchies = cgfsng_num_hierarchies,
1968 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
1969 .get_cgroup = cgfsng_get_cgroup,
1970 .get = cgfsng_get,
1971 .set = cgfsng_set,
1972 .unfreeze = cgfsng_unfreeze,
1973 .setup_limits = cgfsng_setup_limits,
1974 .name = "cgroupfs-ng",
1975 .attach = cgfsng_attach,
1976 .chown = cgfsns_chown,
1977 .mount_cgroup = cgfsng_mount,
1978 .nrtasks = cgfsng_nrtasks,
1979 .driver = CGFSNG,
1980
1981 /* unsupported */
1982 .create_legacy = NULL,
1983};