]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
Merge pull request #1713 from brauner/2017-07-26/hybrid_cgroup_support
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
d8e48992 50#include "bdev.h"
ccb4cabe 51#include "cgroup.h"
6328fd9c 52#include "cgroup_utils.h"
ccb4cabe 53#include "commands.h"
a54694f8
CB
54#include "log.h"
55#include "utils.h"
ccb4cabe
SH
56
57lxc_log_define(lxc_cgfsng, lxc);
58
59static struct cgroup_ops cgfsng_ops;
60
ccb4cabe
SH
/*
 * Descriptor for one mounted cgroup hierarchy.
 *
 * @controllers:  either NULL, or a NULL-terminated list of all the
 *                co-mounted controllers
 * @mountpoint:   the mountpoint we will use. It will be either
 *                /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup:  the cgroup under which the container cgroup path is
 *                created. This will be either the caller's cgroup (if not
 *                root), or init's cgroup (if root).
 * @fullcgpath:   initialised to NULL when the hierarchy is registered.
 *                NOTE(review): presumably the full path of the container's
 *                cgroup in this hierarchy once created - confirm with users.
 * @is_cgroup_v2: true if this entry describes the cgroup v2 hierarchy
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
	bool is_cgroup_v2;
};
78
/*
 * The cgroup data which is attached to the lxc_handler.
 *
 * @cgroup_pattern:   a copy of the lxc.cgroup.pattern
 * @container_cgroup: if not NULL, the cgroup which was created for the
 *                    container. For each hierarchy, it is created under the
 *                    @hierarchy->base_cgroup directory. Relative to the
 *                    base_cgroup it is the same for all hierarchies.
 * @name:             the container name
 */
struct cgfsng_handler_data {
	char *cgroup_pattern;
	char *container_cgroup; /* cgroup we created for the container */
	char *name;             /* container name */
};
93
457ca9aa
SH
/*
 * @hierarchies: a NULL-terminated array of struct hierarchy pointers, one
 * per hierarchy. No duplicates. The first sufficient, writeable mounted
 * hierarchy wins.
 */
struct hierarchy **hierarchies;

/* @cgroup_use: a copy of the lxc.cgroup.use configuration value */
char *cgroup_use;

/*
 * @lxc_cgfsng_debug: whether to print debug info to stdout for the cgfsng
 * driver
 */
static bool lxc_cgfsng_debug;
111
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated array @clist and then the
 * array itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **it;

	if (!clist)
		return;

	for (it = clist; *it; it++)
		free(*it);

	free(clist);
}
122
ccb4cabe
SH
/*
 * Allocate @sz bytes by delegating to must_realloc() with no previous
 * buffer. Do not fail.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
128
ccb4cabe
SH
/*
 * Special case: return a newly allocated copy of @entry with 'name='
 * prepended, i.e. turn "systemd" into "name=systemd". Do not fail.
 * The caller owns the returned string.
 */
static char *must_prefix_named(char *entry)
{
	/* sizeof() of the literal already accounts for the terminating NUL */
	size_t buflen = strlen(entry) + sizeof("name=");
	char *prefixed = must_alloc(buflen);

	snprintf(prefixed, buflen, "name=%s", entry);
	return prefixed;
}
143
/*
 * Grow the NULL-terminated pointer array *@list by one slot. Do not fail.
 *
 * After the call the array has room for one more entry followed by the
 * terminating NULL. The returned value is the index of the slot that is now
 * available for use; the caller is expected to fill it.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list) {
		while ((*list)[count])
			count++;
	}

	/* one slot for the new entry, one for the terminator */
	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
161
/*
 * Return true if @entry compares equal to one of the strings in the
 * NULL-terminated array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	for (it = list; it && *it; it++) {
		if (!strcmp(*it, entry))
			return true;
	}

	return false;
}
178
/*
 * Append a copy of controller @entry to *@clist. Do not fail.
 * *@clist must be NULL the first time we are called; the list stays
 * NULL-terminated.
 *
 * Named subsystems are handled here as well:
 *  - an entry already starting with "name=" or found in the kernel
 *    subsystem list @klist is copied verbatim;
 *  - any other entry is treated as a named subsystem and gets "name="
 *    prepended;
 *  - an entry that is in both @klist and the named list @nlist is refused
 *    (nothing is appended) because we cannot tell which one is meant.
 *    (TODO - we could work around this in some cases by just remounting to
 *    be unambiguous, or by comparing mountpoint contents with current
 *    cgroup)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller '%s'", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = must_prefix_named(entry);
}
214
ccb4cabe
SH
215static void free_handler_data(struct cgfsng_handler_data *d)
216{
ccb4cabe
SH
217 free(d->cgroup_pattern);
218 free(d->container_cgroup);
219 free(d->name);
220 free(d);
221}
222
223/*
224 * Given a handler's cgroup data, return the struct hierarchy for the
225 * controller @c, or NULL if there is none.
226 */
457ca9aa 227struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
228{
229 int i;
230
457ca9aa 231 if (!hierarchies)
ccb4cabe 232 return NULL;
457ca9aa
SH
233 for (i = 0; hierarchies[i]; i++) {
234 if (string_in_list(hierarchies[i]->controllers, c))
235 return hierarchies[i];
ccb4cabe
SH
236 }
237 return NULL;
238}
239
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing it in multiples of
 * BATCH_SIZE. Nothing is done when a buffer already exists and @newlen
 * falls into the same number of batches as @oldlen.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
250
/*
 * Append the @newlen bytes of @new plus its terminating NUL to *@dest,
 * which currently holds @oldlen bytes. The buffer is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* +1: room for the NUL byte that is copied along with the line */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
259
/*
 * Slurp in a whole file, line by line.
 *
 * Returns a newly allocated NUL-terminated buffer with the file content,
 * or NULL if the file could not be opened or no line could be read.
 * The caller frees the result.
 */
static char *read_file(char *fnam)
{
	char *content = NULL, *line = NULL;
	size_t linecap = 0, total = 0;
	int linelen;
	FILE *f;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	for (;;) {
		linelen = getline(&line, &linecap, f);
		if (linelen == -1)
			break;

		append_line(&content, total, line, linelen);
		total += linelen;
	}

	fclose(f);
	free(line);
	return content;
}
279
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The single-bit masks below are built from an unsigned 32-bit one:
 * shifting a plain (signed) int 1 by 31 positions is undefined behaviour.
 */

/* Set bit number @bit in the bit array @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
299
/*
 * Create a cpumask from a cpulist, i.e. turn
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenised in place and is therefore modified. @nbits is the
 * number of bits the mask must be able to hold. Returns a newly allocated
 * bit array, or NULL if allocation fails, a range is reversed, or a cpu
 * number does not fit into @nbits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *tok, *state = NULL;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!mask)
		return NULL;

	tok = strtok_r(buf, ",", &state);
	while (tok) {
		unsigned first, last;
		char *dash;

		errno = 0;
		first = strtoul(tok, NULL, 0);
		last = first;

		/* "a-b" describes a range, a lone number a range of one */
		dash = strchr(tok, '-');
		if (dash)
			last = strtoul(dash + 1, NULL, 0);

		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, mask);

		tok = strtok_r(NULL, ",", &state);
	}

	return mask;
}
341
a54694f8
CB
342/* Turn cpumask into simple, comma-separated cpulist. */
343static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
344{
345 size_t i;
346 int ret;
eab15c1e 347 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
348 char **cpulist = NULL;
349
350 for (i = 0; i <= nbits; i++) {
351 if (is_set(i, bitarr)) {
eab15c1e
CB
352 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
353 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
354 lxc_free_array((void **)cpulist, free);
355 return NULL;
356 }
357 if (lxc_append_string(&cpulist, numstr) < 0) {
358 lxc_free_array((void **)cpulist, free);
359 return NULL;
360 }
361 }
362 }
363 return lxc_string_join(",", (const char **)cpulist, false);
364}
365
/*
 * Return the number that follows the last ',' or '-' in @cpulist (or the
 * number at the start of the string if it contains neither). For a sorted
 * cpulist such as "0-3,7" this is the highest cpu number.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *lastcomma = strrchr(cpulist, ',');
	char *lastdash = strrchr(cpulist, '-');
	char *last = cpulist;
	size_t cpus;

	/*
	 * Pick whichever separator comes later in the string. The pointers are
	 * only compared with each other when both are non-NULL, since ordering
	 * a NULL pointer against a valid one is not defined.
	 */
	if (lastcomma && (!lastdash || lastcomma > lastdash))
		last = lastcomma + 1;
	else if (lastdash)
		last = lastdash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
400
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Initialise @path/cpuset.cpus from the parent cgroup's cpuset.cpus with
 * all isolated cpus (as listed in __ISOL_CPUS) removed.
 *
 * @path:           cgroup directory; its parent is the directory obtained by
 *                  cutting @path at the last '/'. @path is temporarily
 *                  modified but always restored before returning.
 * @am_initialized: true if the cgroup was already set up by someone else
 *                  (the caller passes whether cgroup.clone_children is "1")
 *
 * Behaviour:
 *  - no isolated cpus on this system: copy the parent's cpuset.cpus if
 *    !@am_initialized, otherwise do nothing;
 *  - isolated cpus exist but none is part of the parent's set: do nothing;
 *  - otherwise write the parent's set minus the isolated cpus.
 *
 * Returns true on success, false on error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}
	/* Cut @path at the last slash so that it names the parent cgroup. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	if (!isdigit(isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* The masks need one bit per cpu number: 0 .. highest cpu. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/*
	 * Valid bits are 0 .. maxposs - 1; testing bit maxposs itself would
	 * read past the end of the masks when maxposs is a multiple of 32.
	 */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	/*
	 * NOTE(review): when no bit was flipped and !am_initialized nothing is
	 * written to @path/cpuset.cpus at all - confirm that this is intended.
	 */
	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		goto on_success;
	}
	DEBUG("Removed isolated cpus from cpuset.");

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	*lastslash = oldv;
	/* Release the parent's file path before building the child's one. */
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	/* Always hand @path back to the caller unmodified. */
	*lastslash = oldv;

	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once. */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
537
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * The parent directory is derived by cutting @path at its last '/'. @path
 * is modified only while the parent's file name is being built and is
 * restored before any other work happens.
 *
 * Returns true on success, false if @path contains no '/', the parent's
 * file is empty or unreadable, or the write fails.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	/* Restore right away so that no exit path leaves @path truncated. */
	*lastslash = oldv;

	/* First call only determines the length of the parent's value. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;
	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the buffer: it is printed with %s if the write fails. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
575
576/*
577 * Initialize the cpuset hierarchy in first directory of @gname and
578 * set cgroup.clone_children so that children inherit settings.
579 * Since the h->base_path is populated by init or ourselves, we know
580 * it is already initialized.
581 */
a54694f8 582static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
583{
584 char *cgpath, *clonechildrenpath, v, *slash;
585
586 if (!string_in_list(h->controllers, "cpuset"))
587 return true;
588
589 if (*cgname == '/')
590 cgname++;
591 slash = strchr(cgname, '/');
592 if (slash)
593 *slash = '\0';
594
595 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
596 if (slash)
597 *slash = '/';
598 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
599 SYSERROR("Failed to create '%s'", cgpath);
600 free(cgpath);
601 return false;
602 }
6f9584d8 603
e3a3fecf 604 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c
CB
605 /* unified hierarchy doesn't have clone_children */
606 if (!file_exists(clonechildrenpath)) {
e3a3fecf
SH
607 free(clonechildrenpath);
608 free(cgpath);
609 return true;
610 }
611 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
612 SYSERROR("Failed to read '%s'", clonechildrenpath);
613 free(clonechildrenpath);
614 free(cgpath);
615 return false;
616 }
617
a54694f8 618 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
619 if (!filter_and_set_cpus(cgpath, v == '1')) {
620 SYSERROR("Failed to remove isolated cpus.");
621 free(clonechildrenpath);
622 free(cgpath);
a54694f8 623 return false;
6f9584d8 624 }
a54694f8 625
e3a3fecf 626 if (v == '1') { /* already set for us by someone else */
6f9584d8 627 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
628 free(clonechildrenpath);
629 free(cgpath);
630 return true;
631 }
632
633 /* copy parent's settings */
a54694f8 634 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 635 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
636 free(cgpath);
637 free(clonechildrenpath);
638 return false;
639 }
640 free(cgpath);
641
642 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
643 /* Set clone_children so children inherit our settings */
644 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
645 free(clonechildrenpath);
646 return false;
647 }
648 free(clonechildrenpath);
649 return true;
650}
651
ccb4cabe
SH
/*
 * Return true if at least one string occurs in both of the NULL-terminated
 * string lists @l1 and @l2. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it;

	if (!l1 || !l2)
		return false;

	for (it = l1; *it; it++) {
		if (string_in_list(l2, *it))
			return true;
	}

	return false;
}
669
670/*
671 * For a null-terminated list of controllers @clist, return true if any of
672 * those controllers is already listed the null-terminated list of
673 * hierarchies @hlist. Realistically, if one is present, all must be present.
674 */
675static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
676{
677 int i;
678
679 if (!hlist)
680 return false;
681 for (i = 0; hlist[i]; i++)
682 if (controller_lists_intersect(hlist[i]->controllers, clist))
683 return true;
684 return false;
685
686}
687
688/*
689 * Return true if the controller @entry is found in the null-terminated
690 * list of hierarchies @hlist
691 */
692static bool controller_found(struct hierarchy **hlist, char *entry)
693{
694 int i;
695 if (!hlist)
696 return false;
697
698 for (i = 0; hlist[i]; i++)
699 if (string_in_list(hlist[i]->controllers, entry))
700 return true;
701 return false;
702}
703
704/*
c30b61c3
SH
705 * Return true if all of the controllers which we require have been found.
706 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 707 */
457ca9aa 708static bool all_controllers_found(void)
ccb4cabe
SH
709{
710 char *p, *saveptr = NULL;
457ca9aa 711 struct hierarchy ** hlist = hierarchies;
ccb4cabe 712
ccb4cabe
SH
713 if (!controller_found(hlist, "freezer")) {
714 ERROR("no freezer controller mountpoint found");
715 return false;
716 }
717
457ca9aa 718 if (!cgroup_use)
ccb4cabe 719 return true;
457ca9aa 720 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
721 p = strtok_r(NULL, ",", &saveptr)) {
722 if (!controller_found(hlist, p)) {
723 ERROR("no %s controller mountpoint found", p);
724 return false;
725 }
726 }
727 return true;
728}
729
/*
 * Return true if the fs type is fuse.lxcfs, i.e. if the text starting at
 * the first " - " separator in @line begins with " - fuse.lxcfs ".
 */
static bool is_lxcfs(const char *line)
{
	static const char fstype[] = " - fuse.lxcfs ";
	const char *sep = strstr(line, " - ");

	return sep && strncmp(sep, fstype, sizeof(fstype) - 1) == 0;
}
738
/*
 * Get the controllers from a mountinfo line.
 *
 * There are other ways we could get this info. For lxcfs, field 3
 * is /cgroup/controller-list. For cgroupfs, we could parse the mount
 * options. But we simply assume that the mountpoint must be
 * /sys/fs/cgroup/controller-list
 *
 * @line is modified: a NUL byte is written after the mountpoint field.
 * Returns a newly allocated NULL-terminated controller list, or NULL if
 * the line is malformed, the mount is not below /sys/fs/cgroup/, or no
 * controller was accepted.
 */
static char **get_controllers(char **klist, char **nlist, char *line)
{
	static const char cgroot[] = "/sys/fs/cgroup/";
	char *field = line, *end, *tok, *saveptr = NULL;
	char **aret = NULL;
	bool is_cgroup_v2;
	int skipped;

	/* handle cgroup v2; must be checked before @line is cut below */
	is_cgroup_v2 = is_cgroupfs_v2(line);

	/* the fourth space is followed by the mountpoint:
	 * /sys/fs/cgroup/comma-delimited-controller-list
	 */
	for (skipped = 0; skipped < 4; skipped++) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
	}

	/* note - if we change how mountinfo works, then our caller
	 * will need to verify /sys/fs/cgroup/ in this field */
	if (strncmp(field, cgroot, sizeof(cgroot) - 1) != 0) {
		INFO("cgfsng: found hierarchy not under /sys/fs/cgroup: \"%s\"", field);
		return NULL;
	}
	field += sizeof(cgroot) - 1;

	end = strchr(field, ' ');
	if (!end) {
		ERROR("corrupt mountinfo");
		return NULL;
	}
	*end = '\0';

	/* cgroup v2 does not have separate mountpoints for controllers */
	if (is_cgroup_v2) {
		must_append_controller(klist, nlist, &aret, "cgroup2");
		return aret;
	}

	tok = strtok_r(field, ",", &saveptr);
	while (tok) {
		must_append_controller(klist, nlist, &aret, tok);
		tok = strtok_r(NULL, ",", &saveptr);
	}

	return aret;
}
792
ccb4cabe 793/* Add a controller to our list of hierarchies */
457ca9aa 794static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
795{
796 struct hierarchy *new;
797 int newentry;
798
799 new = must_alloc(sizeof(*new));
800 new->controllers = clist;
801 new->mountpoint = mountpoint;
802 new->base_cgroup = base_cgroup;
803 new->fullcgpath = NULL;
804
6328fd9c
CB
805 /* record if this is the cgroup v2 hierarchy */
806 if (!strcmp(base_cgroup, "cgroup2"))
807 new->is_cgroup_v2 = true;
808 else
809 new->is_cgroup_v2 = false;
810
457ca9aa
SH
811 newentry = append_null_to_list((void ***)&hierarchies);
812 hierarchies[newentry] = new;
ccb4cabe
SH
813}
814
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * The mountpoint is the text following the fourth space. The copy runs up
 * to the next NUL byte, so this relies on a NUL having already been placed
 * after the mountpoint. Returns NULL if @line has fewer than four spaces.
 */
static char *get_mountpoint(char *line)
{
	char *field = line, *copy;
	size_t len;
	int skipped = 0;

	while (skipped < 4) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
		skipped++;
	}

	/* we've already stuck a \0 after the mountpoint */
	len = strlen(field);
	copy = must_alloc(len + 1);
	memcpy(copy, field, len);
	copy[len] = '\0';

	return copy;
}
838
/*
 * Given a multi-line string, return a newly allocated, NUL-terminated
 * copy of the current line without its '\n'. Returns NULL if no '\n'
 * follows @p.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *copy;
	size_t len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	len = newline - p;
	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';

	return copy;
}
857
/*
 * @cgline: pointer to the character after the first ':' in a line of a
 * \n-terminated /proc/self/cgroup file. Check whether controller @c is
 * present in the comma-separated list that runs up to the next ':'.
 * @cgline itself is not modified; a stack copy is tokenised instead.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	char *sep, *list, *name, *state = NULL;
	size_t listlen;

	sep = strchr(cgline, ':');
	if (!sep)
		return false;

	listlen = sep - cgline;
	list = alloca(listlen + 1);
	memcpy(list, cgline, listlen);
	list[listlen] = '\0';

	name = strtok_r(list, ",", &state);
	while (name) {
		if (!strcmp(name, c))
			return true;
		name = strtok_r(NULL, ",", &state);
	}

	return false;
}
884
/*
 * @basecginfo is a copy of /proc/$$/cgroup. Return a newly allocated copy
 * of the current cgroup for @controller, or NULL if no matching, complete
 * line is found.
 *
 * For the pseudo controller "cgroup2" a line whose first character is '0'
 * is taken as the match.
 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	bool want_v2 = !strcmp(controller, "cgroup2");
	char *cur = basecginfo;

	for (;;) {
		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		bool v2_line = want_v2 && (*cur == '0');

		/* step over the hierarchy id to the controller list */
		cur = strchr(cur, ':');
		if (!cur)
			return NULL;
		cur++;

		if (v2_line || controller_in_clist(cur, controller)) {
			/* the cgroup path follows the second ':' */
			cur = strchr(cur, ':');
			if (!cur)
				return NULL;
			return copy_to_eol(cur + 1);
		}

		/* no match: continue with the next line */
		cur = strchr(cur, '\n');
		if (!cur)
			return NULL;
		cur++;
	}
}
920
ccb4cabe
SH
/*
 * Append a copy of @entry to the NULL-terminated string list *@list,
 * growing the list by one slot. Do not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
929
/*
 * Parse /proc/self/cgroup and collect the controller names found there:
 * names starting with "name=" are appended to *@nlist, all others to
 * *@klist. Lines that do not contain two ':' are skipped. If the file
 * cannot be opened both lists are left untouched.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return;

	while (getline(&line, &len, f) != -1) {
		char *ctrls, *ctrls_end, *tok, *saveptr = NULL;

		/* the controller list sits between the first two ':' */
		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		ctrls_end = strchr(ctrls, ':');
		if (!ctrls_end)
			continue;
		*ctrls_end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if (ctrls_end == ctrls) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		tok = strtok_r(ctrls, ",", &saveptr);
		while (tok) {
			char ***dest = strncmp(tok, "name=", 5) == 0 ? nlist : klist;

			must_append_string(dest, tok);
			tok = strtok_r(NULL, ",", &saveptr);
		}
	}

	free(line);
	fclose(f);
}
973
/*
 * Strip trailing newlines from @s in place. The first character of the
 * string is never removed, so a string consisting of a single "\n" is
 * left unchanged.
 */
static void trim(char *s)
{
	size_t end;

	for (end = strlen(s); end > 1 && s[end - 1] == '\n'; end--)
		s[end - 1] = '\0';
}
980
e4aeecf5
CB
981static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
982{
983 printf("Cgroup information:\n");
984 printf(" container name: %s\n", d->name ? d->name : "(null)");
985 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
986 printf(" lxc.cgroup.pattern: %s\n", d->cgroup_pattern ? d->cgroup_pattern : "(null)");
987 printf(" cgroup: %s\n", d->container_cgroup ? d->container_cgroup : "(null)");
988}
989
990static void lxc_cgfsng_print_hierarchies()
ccb4cabe 991{
a7b0cc4c 992 struct hierarchy **it;
ccb4cabe 993 int i;
41c33dbe 994
457ca9aa 995 if (!hierarchies) {
e4aeecf5 996 printf(" No hierarchies found.");
ccb4cabe
SH
997 return;
998 }
e4aeecf5 999 printf(" Hierarchies:\n");
a7b0cc4c
CB
1000 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1001 char **cit;
ccb4cabe 1002 int j;
e4aeecf5
CB
1003 printf(" %d: base_cgroup %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1004 printf(" mountpoint %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1005 printf(" controllers:\n");
a7b0cc4c 1006 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1007 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1008 }
1009}
41c33dbe 1010
/*
 * Print @basecginfo followed by the numbered kernel subsystem list @klist
 * and named subsystem list @nlist to stdout. NULL lists print nothing.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	if (klist) {
		for (idx = 0; klist[idx]; idx++)
			printf("kernel subsystem %d: %s\n", idx, klist[idx]);
	}

	if (nlist) {
		for (idx = 0; nlist[idx]; idx++)
			printf("named subsystem %d: %s\n", idx, nlist[idx]);
	}
}
ccb4cabe 1024
e4aeecf5
CB
/*
 * Dump all cgfsng debug information to stdout: first the handler data @d,
 * then the list of known hierarchies.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1030
ccb4cabe
SH
1031/*
1032 * At startup, parse_hierarchies finds all the info we need about
1033 * cgroup mountpoints and current cgroups, and stores it in @d.
1034 */
457ca9aa 1035static bool parse_hierarchies(void)
ccb4cabe
SH
1036{
1037 FILE *f;
1038 char * line = NULL, *basecginfo;
1039 char **klist = NULL, **nlist = NULL;
1040 size_t len = 0;
1041
d30ec4cb
SH
1042 /*
1043 * Root spawned containers escape the current cgroup, so use init's
1044 * cgroups as our base in that case.
1045 */
ccb4cabe
SH
1046 if (geteuid())
1047 basecginfo = read_file("/proc/self/cgroup");
1048 else
1049 basecginfo = read_file("/proc/1/cgroup");
1050 if (!basecginfo)
1051 return false;
1052
1053 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
d3b00a8f 1054 SYSERROR("Failed opening /proc/self/mountinfo");
ccb4cabe
SH
1055 return false;
1056 }
1057
1058 get_existing_subsystems(&klist, &nlist);
41c33dbe 1059
e4aeecf5
CB
1060 if (lxc_cgfsng_debug)
1061 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1062
1063 /* we support simple cgroup mounts and lxcfs mounts */
1064 while (getline(&line, &len, f) != -1) {
1065 char **controller_list = NULL;
1066 char *mountpoint, *base_cgroup;
6328fd9c 1067 bool is_cgroup_v2, writeable;
ccb4cabe 1068
6328fd9c
CB
1069 is_cgroup_v2 = is_cgroupfs_v2(line);
1070 if (!is_lxcfs(line) && !is_cgroupfs_v1(line) && !is_cgroup_v2)
ccb4cabe
SH
1071 continue;
1072
1073 controller_list = get_controllers(klist, nlist, line);
1074 if (!controller_list)
1075 continue;
1076
457ca9aa 1077 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1078 free(controller_list);
1079 continue;
1080 }
1081
1082 mountpoint = get_mountpoint(line);
1083 if (!mountpoint) {
1084 ERROR("Error reading mountinfo: bad line '%s'", line);
1085 free_string_list(controller_list);
1086 continue;
1087 }
1088
1089 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1090 if (!base_cgroup) {
1091 ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]);
1092 free_string_list(controller_list);
1093 free(mountpoint);
1094 continue;
1095 }
6328fd9c 1096
ccb4cabe
SH
1097 trim(base_cgroup);
1098 prune_init_scope(base_cgroup);
6328fd9c
CB
1099 if (is_cgroup_v2)
1100 writeable = test_writeable_v2(mountpoint, base_cgroup);
1101 else
1102 writeable = test_writeable_v1(mountpoint, base_cgroup);
1103 if (!writeable) {
ccb4cabe
SH
1104 free_string_list(controller_list);
1105 free(mountpoint);
1106 free(base_cgroup);
1107 continue;
1108 }
457ca9aa 1109 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1110 }
1111
1112 free_string_list(klist);
1113 free_string_list(nlist);
1114
1115 free(basecginfo);
1116
1117 fclose(f);
1118 free(line);
1119
e4aeecf5
CB
1120 if (lxc_cgfsng_debug) {
1121 printf("writeable subsystems:\n");
1122 lxc_cgfsng_print_hierarchies();
1123 }
1124
ccb4cabe
SH
1125 /* verify that all controllers in cgroup.use and all crucial
1126 * controllers are accounted for
1127 */
5059aae9
SH
1128 if (!all_controllers_found()) {
1129 INFO("cgfsng: not all controllers were find, deferring to cgfs driver");
ccb4cabe 1130 return false;
5059aae9 1131 }
ccb4cabe
SH
1132
1133 return true;
1134}
1135
457ca9aa
SH
1136static bool collect_hierarchy_info(void)
1137{
1138 const char *tmp;
1139 errno = 0;
1140 tmp = lxc_global_config_value("lxc.cgroup.use");
1141 if (!cgroup_use && errno != 0) { // lxc.cgroup.use can be NULL
1142 SYSERROR("cgfsng: error reading list of cgroups to use");
1143 return false;
1144 }
1145 cgroup_use = must_copy_string(tmp);
1146
1147 return parse_hierarchies();
1148}
1149
ccb4cabe
SH
1150static void *cgfsng_init(const char *name)
1151{
1152 struct cgfsng_handler_data *d;
457ca9aa 1153 const char *cgroup_pattern;
ccb4cabe
SH
1154
1155 d = must_alloc(sizeof(*d));
1156 memset(d, 0, sizeof(*d));
1157
1158 d->name = must_copy_string(name);
1159
ccb4cabe
SH
1160 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
1161 if (!cgroup_pattern) { // lxc.cgroup.pattern is only NULL on error
1162 ERROR("Error getting cgroup pattern");
1163 goto out_free;
1164 }
1165 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1166
e4aeecf5
CB
1167 if (lxc_cgfsng_debug)
1168 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1169
1170 return d;
1171
1172out_free:
1173 free_handler_data(d);
1174 return NULL;
1175}
1176
ccb4cabe
SH
/*
 * Remove the directory @dirname after recursively removing every
 * sub-directory found below it. Non-directory entries are left alone.
 *
 * Returns 0 on success and -1 if the directory could not be opened or if any
 * stat, removal or close step failed. Only the first failure is logged.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		char *pathname;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(pathname, &mystat)) {
			if (!r)
				WARN("failed to stat %s", pathname);
			r = -1;
		} else if (S_ISDIR(mystat.st_mode)) {
			/* Descend first; the child must be empty before rmdir. */
			if (cgroup_rmdir(pathname) < 0)
				r = -1;
		}

		free(pathname);
	}

	if (rmdir(dirname) < 0) {
		if (!r)
			WARN("failed to delete %s: %s", dirname, strerror(errno));
		r = -1;
	}

	if (closedir(dir) < 0) {
		/* Report the close failure as such, not as a failed delete. */
		if (!r)
			WARN("failed to close directory %s: %s", dirname, strerror(errno));
		r = -1;
	}
	return r;
}
1228
/*
 * Callback form of cgroup_rmdir(): @data is the path to remove.
 *
 * Switches gid and uid to 0 and clears the supplementary groups first. A
 * failure of any of those steps is logged but does not stop the removal.
 */
static int rmdir_wrapper(void *data)
{
	if (setresgid(0, 0, 0) < 0)
		SYSERROR("Failed to setgid to 0");

	if (setresuid(0, 0, 0) < 0)
		SYSERROR("Failed to setuid to 0");

	if (setgroups(0, NULL) < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir((char *)data);
}
1242
1243void recursive_destroy(char *path, struct lxc_conf *conf)
1244{
1245 int r;
1246 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e 1247 r = userns_exec_1(conf, rmdir_wrapper, path, "rmdir_wrapper");
ccb4cabe
SH
1248 else
1249 r = cgroup_rmdir(path);
1250
1251 if (r < 0)
1c9da8da 1252 ERROR("Error destroying %s", path);
ccb4cabe
SH
1253}
1254
1255static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1256{
1257 struct cgfsng_handler_data *d = hdata;
1258
1259 if (!d)
1260 return;
1261
457ca9aa 1262 if (d->container_cgroup && hierarchies) {
ccb4cabe 1263 int i;
457ca9aa
SH
1264 for (i = 0; hierarchies[i]; i++) {
1265 struct hierarchy *h = hierarchies[i];
e2db2a89 1266 if (h->fullcgpath) {
ccb4cabe
SH
1267 recursive_destroy(h->fullcgpath, conf);
1268 free(h->fullcgpath);
1269 h->fullcgpath = NULL;
1270 }
1271 }
1272 }
1273
1274 free_handler_data(d);
1275}
1276
1277struct cgroup_ops *cgfsng_ops_init(void)
1278{
e4aeecf5
CB
1279 if (getenv("LXC_DEBUG_CGFSNG"))
1280 lxc_cgfsng_debug = true;
1281
457ca9aa
SH
1282 if (!collect_hierarchy_info())
1283 return NULL;
e4aeecf5 1284
ccb4cabe
SH
1285 return &cgfsng_ops;
1286}
1287
1288static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1289{
e3a3fecf 1290 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
6f9584d8
CB
1291 if (dir_exists(h->fullcgpath)) { // it must not already exist
1292 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1293 return false;
6f9584d8
CB
1294 }
1295 if (!handle_cpuset_hierarchy(h, cgname)) {
1296 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1297 return false;
6f9584d8 1298 }
e3a3fecf 1299 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1300}
1301
1302static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1303{
1304 if (rmdir(h->fullcgpath) < 0)
1305 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1306 free(h->fullcgpath);
1307 h->fullcgpath = NULL;
1308}
1309
1310/*
d30ec4cb 1311 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1312 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1313 */
1314static inline bool cgfsng_create(void *hdata)
1315{
1316 struct cgfsng_handler_data *d = hdata;
1317 char *tmp, *cgname, *offset;
1318 int i, idx = 0;
1319 size_t len;
1320
1321 if (!d)
1322 return false;
1323 if (d->container_cgroup) {
1324 WARN("cgfsng_create called a second time");
1325 return false;
1326 }
1327
1328 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
1329 if (!tmp) {
1330 ERROR("Failed expanding cgroup name pattern");
1331 return false;
1332 }
1333 len = strlen(tmp) + 5; // leave room for -NNN\0
1334 cgname = must_alloc(len);
1335 strcpy(cgname, tmp);
1336 free(tmp);
1337 offset = cgname + len - 5;
1338
1339again:
95adfe93
SH
1340 if (idx == 1000) {
1341 ERROR("Too many conflicting cgroup names");
ccb4cabe 1342 goto out_free;
95adfe93 1343 }
ccb4cabe
SH
1344 if (idx)
1345 snprintf(offset, 5, "-%d", idx);
457ca9aa
SH
1346 for (i = 0; hierarchies[i]; i++) {
1347 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1348 int j;
457ca9aa
SH
1349 SYSERROR("Failed to create %s: %s", hierarchies[i]->fullcgpath, strerror(errno));
1350 free(hierarchies[i]->fullcgpath);
1351 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1352 for (j = 0; j < i; j++)
457ca9aa 1353 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1354 idx++;
1355 goto again;
1356 }
1357 }
1358 /* Done */
1359 d->container_cgroup = cgname;
1360 return true;
1361
1362out_free:
1363 free(cgname);
1364 return false;
1365}
1366
ccb4cabe
SH
1367static bool cgfsng_enter(void *hdata, pid_t pid)
1368{
ccb4cabe
SH
1369 char pidstr[25];
1370 int i, len;
1371
1372 len = snprintf(pidstr, 25, "%d", pid);
1373 if (len < 0 || len > 25)
1374 return false;
1375
457ca9aa
SH
1376 for (i = 0; hierarchies[i]; i++) {
1377 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1378 "cgroup.procs", NULL);
1379 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1380 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1381 free(fullpath);
1382 return false;
1383 }
1384 free(fullpath);
1385 }
1386
1387 return true;
1388}
1389
/* Argument bundle handed to chown_cgroup_wrapper(). */
struct chown_data {
	/* handler data of the container whose cgroups are chowned */
	struct cgfsng_handler_data *d;
	/* target uid in parent namespace */
	uid_t origuid;
};
1394
c0888dfe
SH
1395/*
1396 * chgrp the container cgroups to container group. We leave
1397 * the container owner as cgroup owner. So we must make the
1398 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1399 *
1400 * Also chown the tasks and cgroup.procs files. Those may not
1401 * exist depending on kernel version.
c0888dfe 1402 */
ccb4cabe
SH
1403static int chown_cgroup_wrapper(void *data)
1404{
1405 struct chown_data *arg = data;
ccb4cabe
SH
1406 uid_t destuid;
1407 int i;
1408
1409 if (setresgid(0,0,0) < 0)
1410 SYSERROR("Failed to setgid to 0");
1411 if (setresuid(0,0,0) < 0)
1412 SYSERROR("Failed to setuid to 0");
1413 if (setgroups(0, NULL) < 0)
1414 SYSERROR("Failed to clear groups");
1415
1416 destuid = get_ns_uid(arg->origuid);
1417
457ca9aa
SH
1418 for (i = 0; hierarchies[i]; i++) {
1419 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1420
1421 if (chown(path, destuid, 0) < 0) {
ab8f5424 1422 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1423 return -1;
1424 }
c0888dfe 1425
43647298 1426 if (chmod(path, 0775) < 0) {
ab8f5424 1427 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1428 return -1;
1429 }
ccb4cabe 1430
ab8f5424
SH
1431 /*
1432 * Failures to chown these are inconvenient but not detrimental
1433 * We leave these owned by the container launcher, so that container
1434 * root can write to the files to attach. We chmod them 664 so that
1435 * container systemd can write to the files (which systemd in wily
1436 * insists on doing)
1437 */
43647298
SH
1438 fullpath = must_make_path(path, "tasks", NULL);
1439 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1440 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1441 strerror(errno));
ab8f5424 1442 if (chmod(fullpath, 0664) < 0)
13277ec4 1443 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1444 free(fullpath);
1445
1446 fullpath = must_make_path(path, "cgroup.procs", NULL);
1447 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1448 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1449 strerror(errno));
ab8f5424 1450 if (chmod(fullpath, 0664) < 0)
13277ec4 1451 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe
SH
1452 free(fullpath);
1453 }
1454
1455 return 0;
1456}
1457
1458static bool cgfsns_chown(void *hdata, struct lxc_conf *conf)
1459{
1460 struct cgfsng_handler_data *d = hdata;
1461 struct chown_data wrap;
1462
1463 if (!d)
1464 return false;
1465
1466 if (lxc_list_empty(&conf->id_map))
1467 return true;
1468
1469 wrap.d = d;
1470 wrap.origuid = geteuid();
1471
c9b7c33e
CB
1472 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1473 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1474 ERROR("Error requesting cgroup chown in new namespace");
1475 return false;
1476 }
1477
1478 return true;
1479}
1480
8aa1044f
SH
1481/*
1482 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1483 * symlinks any more - just use mount
1484 */
1485
1486/* mount cgroup-full if requested */
1487static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1488 char *container_cgroup)
1489{
1490 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1491 return 0;
1492 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1493 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1494 dest);
1495 return -1;
1496 }
1497 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1498 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1499 MS_REMOUNT | MS_RDONLY;
1500 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1501 SYSERROR("Error remounting %s readonly", dest);
1502 return -1;
1503 }
1504 }
1505
1506 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1507 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1508 return 0;
1509
1510 /* mount just the container path rw */
1511 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1512 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1513 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1514 WARN("Failed to mount %s read-write: %s", rwpath,
1515 strerror(errno));
8aa1044f
SH
1516 INFO("Made %s read-write", rwpath);
1517 free(rwpath);
1518 free(source);
1519 return 0;
1520}
1521
1522/* cgroup-full:* is done, no need to create subdirs */
1523static bool cg_mount_needs_subdirs(int type)
1524{
1525 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1526 return false;
1527 return true;
1528}
1529
1530/*
1531 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1532 * created, remount controller ro if needed and bindmount the
1533 * cgroupfs onto controll/the/cg/path
1534 */
1535static int
1536do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1537 char *controllerpath, char *cgpath,
1538 const char *container_cgroup)
1539{
1540 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1541 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1542 SYSERROR("Error bind-mounting %s", controllerpath);
1543 return -1;
1544 }
1545 if (mount(controllerpath, controllerpath, "cgroup",
1546 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1547 SYSERROR("Error remounting %s read-only", controllerpath);
1548 return -1;
1549 }
1550 INFO("Remounted %s read-only", controllerpath);
1551 }
1552 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1553 int flags = MS_BIND;
1554 if (type == LXC_AUTO_CGROUP_RO)
1555 flags |= MS_RDONLY;
1556 INFO("Mounting %s onto %s", sourcepath, cgpath);
1557 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1558 free(sourcepath);
1559 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1560 cgpath);
1561 return -1;
1562 }
1563 free(sourcepath);
1564 INFO("Completed second stage cgroup automounts for %s", cgpath);
1565 return 0;
1566}
1567
ccb4cabe
SH
1568static bool cgfsng_mount(void *hdata, const char *root, int type)
1569{
8aa1044f
SH
1570 struct cgfsng_handler_data *d = hdata;
1571 char *tmpfspath = NULL;
1572 bool retval = false;
a8de4c49 1573 int i;
8aa1044f
SH
1574
1575 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1576 return true;
1577
ccb4cabe
SH
1578 if (cgns_supported())
1579 return true;
8aa1044f
SH
1580
1581 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1582
1583 if (type == LXC_AUTO_CGROUP_NOSPEC)
1584 type = LXC_AUTO_CGROUP_MIXED;
1585 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1586 type = LXC_AUTO_CGROUP_FULL_MIXED;
1587
1588 /* Mount tmpfs */
1589 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1590 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1591 "size=10240k,mode=755",
1592 root) < 0)
1593 goto bad;
1594
457ca9aa 1595 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1596 char *controllerpath, *path2;
457ca9aa 1597 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1598 char *controller = strrchr(h->mountpoint, '/');
1599 int r;
1600
1601 if (!controller)
1602 continue;
1603 controller++;
1604 controllerpath = must_make_path(tmpfspath, controller, NULL);
1605 if (dir_exists(controllerpath)) {
1606 free(controllerpath);
1607 continue;
1608 }
1609 if (mkdir(controllerpath, 0755) < 0) {
1610 SYSERROR("Error creating cgroup path: %s", controllerpath);
1611 free(controllerpath);
1612 goto bad;
1613 }
1614 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1615 free(controllerpath);
1616 goto bad;
1617 }
1618 if (!cg_mount_needs_subdirs(type)) {
1619 free(controllerpath);
1620 continue;
1621 }
ef4413fa 1622 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1623 if (mkdir_p(path2, 0755) < 0) {
1624 free(controllerpath);
1625 goto bad;
1626 }
2f62fb00 1627
8aa1044f
SH
1628 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1629 d->container_cgroup);
1630 free(controllerpath);
1631 free(path2);
1632 if (r < 0)
1633 goto bad;
1634 }
1635 retval = true;
1636
1637bad:
1638 free(tmpfspath);
1639 return retval;
ccb4cabe
SH
1640}
1641
/*
 * Count the lines of the cgroup.procs files in @dirname and in every
 * directory below it.
 *
 * Returns 0 if @dirname cannot be opened. Entries that cannot be stat'ed and
 * cgroup.procs files that cannot be counted contribute nothing.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *entry;
	DIR *dir;
	char *path;
	int total = 0, lines;

	dir = opendir(dirname);
	if (dir == NULL)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		struct stat st;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);

		/* Only sub-directories are descended into. */
		if (lstat(path, &st) == 0 && S_ISDIR(st.st_mode))
			total += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	lines = lxc_count_file_lines(path);
	if (lines != -1)
		total += lines;
	free(path);

	(void) closedir(dir);

	return total;
}
1686
1687static int cgfsng_nrtasks(void *hdata) {
1688 struct cgfsng_handler_data *d = hdata;
1689 char *path;
1690 int count;
1691
457ca9aa 1692 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1693 return -1;
457ca9aa 1694 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1695 count = recursive_count_nrtasks(path);
1696 free(path);
1697 return count;
1698}
1699
1700/* Only root needs to escape to the cgroup of its init */
7103fe6f 1701static bool cgfsng_escape()
ccb4cabe 1702{
ccb4cabe
SH
1703 int i;
1704
1705 if (geteuid())
1706 return true;
1707
457ca9aa
SH
1708 for (i = 0; hierarchies[i]; i++) {
1709 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1710 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1711 "cgroup.procs", NULL);
1712 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1713 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1714 free(fullpath);
6df334d1 1715 return false;
ccb4cabe
SH
1716 }
1717 free(fullpath);
1718 }
1719
6df334d1 1720 return true;
ccb4cabe
SH
1721}
1722
36662416
TA
1723static int cgfsng_num_hierarchies(void)
1724{
1725 int i;
1726
1727 for (i = 0; hierarchies[i]; i++)
1728 ;
1729
1730 return i;
1731}
1732
1733static bool cgfsng_get_hierarchies(int n, char ***out)
1734{
1735 int i;
1736
1737 /* sanity check n */
1738 for (i = 0; i < n; i++) {
1739 if (!hierarchies[i])
1740 return false;
1741 }
1742
1743 *out = hierarchies[i]->controllers;
1744
1745 return true;
1746}
1747
ccb4cabe
SH
1748#define THAWED "THAWED"
1749#define THAWED_LEN (strlen(THAWED))
1750
1751static bool cgfsng_unfreeze(void *hdata)
1752{
ccb4cabe 1753 char *fullpath;
457ca9aa 1754 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1755
457ca9aa 1756 if (!h)
ccb4cabe
SH
1757 return false;
1758 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1759 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1760 free(fullpath);
1761 return false;
1762 }
1763 free(fullpath);
1764 return true;
1765}
1766
1767static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1768{
457ca9aa 1769 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1770 if (!h)
1771 return NULL;
1772
371f834d
SH
1773 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1774}
1775
1776/*
1777 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
1778 * full path, which must be freed by the caller.
1779 */
1780static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1781 const char *inpath,
1782 const char *filename)
1783{
371f834d 1784 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
1785}
1786
1787static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1788{
ccb4cabe
SH
1789 char pidstr[25];
1790 int i, len;
1791
1792 len = snprintf(pidstr, 25, "%d", pid);
1793 if (len < 0 || len > 25)
1794 return false;
1795
457ca9aa 1796 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1797 char *path, *fullpath;
457ca9aa 1798 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1799
1800 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1801 if (!path) // not running
1802 continue;
1803
371f834d
SH
1804 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1805 free(path);
ccb4cabe
SH
1806 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1807 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1808 free(fullpath);
ccb4cabe
SH
1809 return false;
1810 }
ccb4cabe
SH
1811 free(fullpath);
1812 }
1813
ccb4cabe
SH
1814 return true;
1815}
1816
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns the
 * result of lxc_read_from_file() on the cgroup file, or -1 when no cgroup
 * path is returned or the subsystem has no hierarchy.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t prefix_len = strcspn(filename, ".");
	struct hierarchy *h;
	char *subsystem, *cgpath;
	int ret = -1;

	/* Copy just the controller prefix, e.g. "memory" of "memory.limit". */
	subsystem = alloca(prefix_len + 1);
	memcpy(subsystem, filename, prefix_len);
	subsystem[prefix_len] = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (cgpath == NULL) /* not running */
		return -1;

	h = get_hierarchy(subsystem);
	if (h != NULL) {
		char *fullpath;

		fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
		ret = lxc_read_from_file(fullpath, value, len);
		free(fullpath);
	}

	free(cgpath);

	return ret;
}
1848
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns the
 * result of lxc_write_to_file() on the cgroup file, or -1 when no cgroup
 * path is returned or the subsystem has no hierarchy.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t prefix_len = strcspn(filename, ".");
	struct hierarchy *h;
	char *subsystem, *cgpath;
	int ret = -1;

	/* Copy just the controller prefix, e.g. "memory" of "memory.limit". */
	subsystem = alloca(prefix_len + 1);
	memcpy(subsystem, filename, prefix_len);
	subsystem[prefix_len] = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (cgpath == NULL) /* not running */
		return -1;

	h = get_hierarchy(subsystem);
	if (h != NULL) {
		char *fullpath;

		fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false);
		free(fullpath);
	}

	free(cgpath);

	return ret;
}
1880
ccb4cabe
SH
1881/*
1882 * Called from setup_limits - here we have the container's cgroup_data because
1883 * we created the cgroups
1884 */
1885static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
1886{
1887 char *subsystem = NULL, *p;
1888 int ret = -1;
1889 struct hierarchy *h;
1890
1891 subsystem = alloca(strlen(filename) + 1);
1892 strcpy(subsystem, filename);
1893 if ((p = strchr(subsystem, '.')) != NULL)
1894 *p = '\0';
1895
457ca9aa 1896 h = get_hierarchy(subsystem);
ccb4cabe
SH
1897 if (h) {
1898 char *fullpath = must_make_path(h->fullcgpath, filename, NULL);
1899 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
1900 free(fullpath);
1901 }
1902 return ret;
1903}
1904
1905static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
1906 bool do_devices)
1907{
1908 struct cgfsng_handler_data *d = hdata;
1909 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
1910 struct lxc_cgroup *cg;
ccb4cabe
SH
1911 bool ret = false;
1912
1913 if (lxc_list_empty(cgroup_settings))
1914 return true;
1915
1916 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
1917 if (!sorted_cgroup_settings) {
1918 return false;
1919 }
1920
ccb4cabe
SH
1921 lxc_list_for_each(iterator, sorted_cgroup_settings) {
1922 cg = iterator->elem;
1923
1924 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
1925 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
1926 if (do_devices && (errno == EACCES || errno == EPERM)) {
1927 WARN("Error setting %s to %s for %s",
1928 cg->subsystem, cg->value, d->name);
1929 continue;
1930 }
1931 SYSERROR("Error setting %s to %s for %s",
1932 cg->subsystem, cg->value, d->name);
1933 goto out;
1934 }
6a628f4a 1935 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
ccb4cabe 1936 }
ccb4cabe
SH
1937 }
1938
1939 ret = true;
1940 INFO("cgroup has been setup");
1941out:
ccb4cabe
SH
1942 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
1943 lxc_list_del(iterator);
1944 free(iterator);
1945 }
1946 free(sorted_cgroup_settings);
1947 return ret;
1948}
1949
1950static struct cgroup_ops cgfsng_ops = {
1951 .init = cgfsng_init,
1952 .destroy = cgfsng_destroy,
1953 .create = cgfsng_create,
1954 .enter = cgfsng_enter,
ccb4cabe 1955 .escape = cgfsng_escape,
36662416
TA
1956 .num_hierarchies = cgfsng_num_hierarchies,
1957 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
1958 .get_cgroup = cgfsng_get_cgroup,
1959 .get = cgfsng_get,
1960 .set = cgfsng_set,
1961 .unfreeze = cgfsng_unfreeze,
1962 .setup_limits = cgfsng_setup_limits,
1963 .name = "cgroupfs-ng",
1964 .attach = cgfsng_attach,
1965 .chown = cgfsns_chown,
1966 .mount_cgroup = cgfsng_mount,
1967 .nrtasks = cgfsng_nrtasks,
1968 .driver = CGFSNG,
1969
1970 /* unsupported */
1971 .create_legacy = NULL,
1972};