]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
Fix syntax error in lxc-download
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
d8e48992 50#include "bdev.h"
ccb4cabe 51#include "cgroup.h"
ccb4cabe 52#include "commands.h"
a54694f8
CB
53#include "log.h"
54#include "utils.h"
ccb4cabe
SH
55
56lxc_log_define(lxc_cgfsng, lxc);
57
58static struct cgroup_ops cgfsng_ops;
59
ccb4cabe
SH
/*
 * Describes one mounted cgroup hierarchy.
 * @controllers: either NULL, or a NULL-terminated list of all the
 *     controllers co-mounted in this hierarchy
 * @mountpoint: the mountpoint we will use. It is either
 *     /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup: the cgroup under which the container cgroup path is
 *     created. This is either the caller's cgroup (if not root), or
 *     init's cgroup (if root).
 * @fullcgpath: set to NULL when the hierarchy is registered; presumably
 *     the full path of the container's cgroup in this hierarchy once it
 *     has been created (TODO confirm against the users of this field).
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
};
76
/*
 * Per-container cgroup state attached to the lxc_handler.
 * @cgroup_pattern: a copy of lxc.cgroup.pattern
 * @container_cgroup: if not NULL, the cgroup which was created for the
 *     container. For each hierarchy it is created under that hierarchy's
 *     base_cgroup directory; relative to base_cgroup it is the same for
 *     all hierarchies.
 * @name: the container name
 */
struct cgfsng_handler_data {
	char *cgroup_pattern;
	char *container_cgroup; /* cgroup we created for the container */
	char *name;             /* container name */
};
91
457ca9aa
SH
/*
 * hierarchies: NULL-terminated array of struct hierarchy pointers, one
 * per hierarchy, without duplicates. The first sufficient, writeable
 * mounted hierarchy wins.
 */
struct hierarchy **hierarchies;

/* cgroup_use: a copy of the lxc.cgroup.use configuration value. */
char *cgroup_use;

/*
 * lxc_cgfsng_debug: when true, the cgfsng driver prints debug
 * information to stdout.
 */
static bool lxc_cgfsng_debug;
109
ccb4cabe
SH
/*
 * Free every string of the NULL-terminated array @clist and then the
 * array itself. Passing NULL is a no-op.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);

	free(clist);
}
120
/*
 * Re-allocate @orig to @sz bytes, do not fail: the call is retried until
 * realloc() succeeds, so the result is never NULL.
 *
 * A request for 0 bytes is rounded up to 1. realloc(ptr, 0) is allowed to
 * free @orig and return NULL, and retrying with the already-freed pointer
 * would be a double free (or an endless loop).
 */
static void *must_realloc(void *orig, size_t sz)
{
	void *ret;

	if (sz == 0)
		sz = 1;

	do {
		ret = realloc(orig, sz);
	} while (!ret);

	return ret;
}
131
/*
 * Allocate @sz bytes, do not fail. The memory is obtained through
 * must_realloc() and is not zeroed.
 */
static void *must_alloc(size_t sz)
{
	void *mem = must_realloc(NULL, sz);

	return mem;
}
137
/*
 * Return a heap-allocated copy of the string @entry, or NULL when @entry
 * is NULL. Do not fail: the allocation is retried until it succeeds.
 * The caller owns (and must free) the result.
 */
static char *must_copy_string(const char *entry)
{
	size_t sz;
	char *dup = NULL;

	if (!entry)
		return NULL;

	sz = strlen(entry) + 1;
	while (!dup)
		dup = malloc(sz);

	memcpy(dup, entry, sz);
	return dup;
}
150
/*
 * Special case: return a newly allocated copy of @entry with 'name='
 * prepended, i.e. turn "systemd" into "name=systemd". Do not fail.
 * The caller owns the result.
 */
static char *must_prefix_named(char *entry)
{
	/* sizeof("name=") covers the five prefix bytes plus the final NUL */
	size_t sz = strlen(entry) + sizeof("name=");
	char *named = must_alloc(sz);

	snprintf(named, sz, "name=%s", entry);
	return named;
}
165
/*
 * Grow the NULL-terminated pointer array *@list by one slot. Do not fail.
 *
 * On return the array has room for one more entry and its last slot is
 * NULL. The returned index is the slot just before that terminator, which
 * the caller is expected to fill; it is left uninitialised here.
 * *@list may be NULL, which is treated as an empty array.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list) {
		while ((*list)[count])
			count++;
	}

	/* old entries + the new slot + the terminator */
	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}
183
/*
 * Return true if @entry equals one of the strings of the NULL-terminated
 * array @list. A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	for (cur = list; cur && *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}
200
/*
 * Append the controller @entry to the list *@clist. Do not fail.
 * *@clist must be NULL the first time we are called; afterwards its last
 * element is always NULL.
 *
 * Named subsystems are handled here as well:
 *  - an entry already starting with 'name=' or listed in @klist (kernel
 *    subsystems) is copied verbatim;
 *  - any other entry is stored with a 'name=' prefix;
 *  - an entry listed in both @klist and @nlist is ambiguous and is
 *    refused: an error is logged and *@clist is left untouched.
 *    (TODO - we could work around this in some cases by just remounting
 *    to be unambiguous, or by comparing mountpoint contents with the
 *    current cgroup.)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller '%s'", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = must_prefix_named(entry);
}
236
ccb4cabe
SH
237static void free_handler_data(struct cgfsng_handler_data *d)
238{
ccb4cabe
SH
239 free(d->cgroup_pattern);
240 free(d->container_cgroup);
241 free(d->name);
242 free(d);
243}
244
245/*
246 * Given a handler's cgroup data, return the struct hierarchy for the
247 * controller @c, or NULL if there is none.
248 */
457ca9aa 249struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
250{
251 int i;
252
457ca9aa 253 if (!hierarchies)
ccb4cabe 254 return NULL;
457ca9aa
SH
255 for (i = 0; hierarchies[i]; i++) {
256 if (string_in_list(hierarchies[i]->controllers, c))
257 return hierarchies[i];
ccb4cabe
SH
258 }
259 return NULL;
260}
261
e3a3fecf
SH
262static char *must_make_path(const char *first, ...) __attribute__((sentinel));
263
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing it in steps of
 * BATCH_SIZE bytes. The buffer is (re)allocated when it does not exist
 * yet or when @newlen falls into a later batch than @oldlen.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;

	if (*mem && want <= have)
		return;

	*mem = must_realloc(*mem, want * BATCH_SIZE);
}
274
/*
 * Append the @newlen bytes of @new, plus its terminating NUL, to the
 * buffer *@dest which currently holds @oldlen bytes of text. The buffer
 * is grown as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* +1: room for the NUL terminator copied along with the text */
	batch_realloc(dest, oldlen, oldlen + newlen + 1);

	memcpy(&(*dest)[oldlen], new, newlen + 1);
}
283
/*
 * Slurp in a whole file.
 *
 * Returns a heap-allocated, NUL-terminated copy of the contents of @fnam
 * which the caller must free. Returns NULL if the file cannot be opened;
 * NULL is also what is returned when no line could be read at all.
 */
static char *read_file(char *fnam)
{
	char *contents = NULL, *line = NULL;
	size_t cap = 0, total = 0;
	int linelen;
	FILE *f;

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	for (;;) {
		linelen = getline(&line, &cap, f);
		if (linelen == -1)
			break;

		append_line(&contents, total, line, linelen);
		total += linelen;
	}

	fclose(f);
	free(line);

	return contents;
}
303
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The bit helpers below address bit @bit of the uint32_t array @bitarr:
 * word index is bit / NBITS, position inside the word is bit % NBITS.
 *
 * The mask is built from an unsigned 32-bit one. Shifting a plain (signed)
 * int 1 left by 31 is undefined behaviour in C.
 */

/* Set bit @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
323
/*
 * Create a cpumask from a cpulist, i.e. turn
 *
 *	0,2-3
 *
 * into the bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenised in place (its commas are overwritten). @nbits is the
 * number of bits the mask must be able to hold. Returns a calloc()ed
 * array owned by the caller, or NULL on allocation failure, on a
 * descending range, or when a cpu number does not fit into @nbits.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *state = NULL;
	char *tok;
	uint32_t *mask;

	mask = calloc(BITS_TO_LONGS(nbits), sizeof(uint32_t));
	if (!mask)
		return NULL;

	tok = strtok_r(buf, ",", &state);
	while (tok) {
		unsigned first, last;
		char *dash;

		errno = 0;
		first = strtoul(tok, NULL, 0);

		/* "a-b" is a range, a bare "a" is the range a-a */
		dash = strchr(tok, '-');
		last = dash ? strtoul(dash + 1, NULL, 0) : first;

		if (first > last || last >= nbits) {
			free(mask);
			return NULL;
		}

		for (; first <= last; first++)
			set_bit(first, mask);

		tok = strtok_r(NULL, ",", &state);
	}

	return mask;
}
365
a54694f8
CB
366/* Turn cpumask into simple, comma-separated cpulist. */
367static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
368{
369 size_t i;
370 int ret;
eab15c1e 371 char numstr[LXC_NUMSTRLEN64] = {0};
a54694f8
CB
372 char **cpulist = NULL;
373
374 for (i = 0; i <= nbits; i++) {
375 if (is_set(i, bitarr)) {
eab15c1e
CB
376 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
377 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
a54694f8
CB
378 lxc_free_array((void **)cpulist, free);
379 return NULL;
380 }
381 if (lxc_append_string(&cpulist, numstr) < 0) {
382 lxc_free_array((void **)cpulist, free);
383 return NULL;
384 }
385 }
386 }
387 return lxc_string_join(",", (const char **)cpulist, false);
388}
389
/*
 * Return the number that follows the last ',' or '-' of @cpulist (or the
 * number at the start of @cpulist if it contains neither). For a sorted
 * cpulist such as "0,2-5,7" this is the highest cpu number.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *comma, *dash, *last;
	size_t cpus = 0;

	comma = strrchr(cpulist, ',');
	dash = strrchr(cpulist, '-');

	/* Only compare the two pointers when both are valid; ordering a
	 * NULL pointer against an object pointer is undefined.
	 */
	if (!comma && !dash)
		last = cpulist;
	else if (!comma)
		last = dash + 1;
	else if (!dash)
		last = comma + 1;
	else
		last = (comma > dash ? comma : dash) + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
424
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Initialise or filter the cpuset.cpus file of the cgroup directory @path.
 *
 * The parent's cpuset.cpus (parent = @path without its last component) is
 * read and every cpu listed in __ISOL_CPUS is removed from it. The result
 * is written to @path/cpuset.cpus when
 *  - isolated cpus had to be removed, or
 *  - @am_initialized is false, i.e. nobody set up this cgroup yet and the
 *    parent's values must be copied.
 * If nothing had to be removed and @am_initialized is true, nothing is
 * written.
 *
 * @path is modified only temporarily; it is intact again on return.
 * Returns true on success, false on any error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/* Cut off the last component just long enough to build the path of
	 * the parent's cpuset.cpus, then restore @path right away so that it
	 * is intact on every exit.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	isolcpus = read_file(__ISOL_CPUS);
	if (!isolcpus) {
		SYSERROR("Could not read file "__ISOL_CPUS);
		goto on_error;
	}
	/* <ctype.h> functions need a value representable as unsigned char */
	if (!isdigit((unsigned char)isolcpus[0])) {
		DEBUG("No isolated cpus detected.");
		cpulist = posscpus;
		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		if (!am_initialized) {
			DEBUG("Copying cpuset of parent cgroup.");
			goto copy_parent;
		}
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		goto on_success;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is a bit count: highest cpu number + 1. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* Valid bits are 0 .. maxposs - 1; bit maxposs may lie outside of
	 * the masks.
	 */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		/* Already initialized by someone and nothing to filter:
		 * nothing more to do for us.
		 */
		if (am_initialized)
			goto on_success;
		/* Not initialized yet: the parent's cpus still have to be
		 * copied, exactly as in the "no isolated cpus" cases above.
		 * posscpus was tokenised by lxc_cpumask(), so the list is
		 * rebuilt from the (unchanged) mask.
		 */
		DEBUG("Copying cpuset of parent cgroup.");
	} else {
		DEBUG("Removed isolated cpus from cpuset.");
	}

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	/* Done with the parent's file name; switch to our own cpuset.cpus. */
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
561
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * parent(@path) is @path without its last '/'-separated component. @path
 * is modified only temporarily and is intact again on return, on error
 * paths as well. Returns true if the value was read and written
 * successfully.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	/* Build the parent's file name, then restore @path immediately so
	 * no exit can leave it truncated.
	 */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* First call only asks for the length of the value. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the buffer: it is printed with %s below. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
599
600/*
601 * Initialize the cpuset hierarchy in first directory of @gname and
602 * set cgroup.clone_children so that children inherit settings.
603 * Since the h->base_path is populated by init or ourselves, we know
604 * it is already initialized.
605 */
a54694f8 606static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
607{
608 char *cgpath, *clonechildrenpath, v, *slash;
609
610 if (!string_in_list(h->controllers, "cpuset"))
611 return true;
612
613 if (*cgname == '/')
614 cgname++;
615 slash = strchr(cgname, '/');
616 if (slash)
617 *slash = '\0';
618
619 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
620 if (slash)
621 *slash = '/';
622 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
623 SYSERROR("Failed to create '%s'", cgpath);
624 free(cgpath);
625 return false;
626 }
6f9584d8 627
e3a3fecf
SH
628 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
629 if (!file_exists(clonechildrenpath)) { /* unified hierarchy doesn't have clone_children */
630 free(clonechildrenpath);
631 free(cgpath);
632 return true;
633 }
634 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
635 SYSERROR("Failed to read '%s'", clonechildrenpath);
636 free(clonechildrenpath);
637 free(cgpath);
638 return false;
639 }
640
a54694f8 641 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
642 if (!filter_and_set_cpus(cgpath, v == '1')) {
643 SYSERROR("Failed to remove isolated cpus.");
644 free(clonechildrenpath);
645 free(cgpath);
a54694f8 646 return false;
6f9584d8 647 }
a54694f8 648
e3a3fecf 649 if (v == '1') { /* already set for us by someone else */
6f9584d8 650 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
651 free(clonechildrenpath);
652 free(cgpath);
653 return true;
654 }
655
656 /* copy parent's settings */
a54694f8 657 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 658 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
659 free(cgpath);
660 free(clonechildrenpath);
661 return false;
662 }
663 free(cgpath);
664
665 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
666 /* Set clone_children so children inherit our settings */
667 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
668 free(clonechildrenpath);
669 return false;
670 }
671 free(clonechildrenpath);
672 return true;
673}
674
ccb4cabe
SH
/*
 * Given two NULL-terminated lists of strings, return true if any string
 * occurs in both. A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **a, **b;

	if (!l1 || !l2)
		return false;

	for (a = l1; *a; a++) {
		for (b = l2; *b; b++) {
			if (strcmp(*b, *a) == 0)
				return true;
		}
	}

	return false;
}
692
693/*
694 * For a null-terminated list of controllers @clist, return true if any of
695 * those controllers is already listed the null-terminated list of
696 * hierarchies @hlist. Realistically, if one is present, all must be present.
697 */
698static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
699{
700 int i;
701
702 if (!hlist)
703 return false;
704 for (i = 0; hlist[i]; i++)
705 if (controller_lists_intersect(hlist[i]->controllers, clist))
706 return true;
707 return false;
708
709}
710
711/*
712 * Return true if the controller @entry is found in the null-terminated
713 * list of hierarchies @hlist
714 */
715static bool controller_found(struct hierarchy **hlist, char *entry)
716{
717 int i;
718 if (!hlist)
719 return false;
720
721 for (i = 0; hlist[i]; i++)
722 if (string_in_list(hlist[i]->controllers, entry))
723 return true;
724 return false;
725}
726
727/*
c30b61c3
SH
728 * Return true if all of the controllers which we require have been found.
729 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 730 */
457ca9aa 731static bool all_controllers_found(void)
ccb4cabe
SH
732{
733 char *p, *saveptr = NULL;
457ca9aa 734 struct hierarchy ** hlist = hierarchies;
ccb4cabe 735
ccb4cabe
SH
736 if (!controller_found(hlist, "freezer")) {
737 ERROR("no freezer controller mountpoint found");
738 return false;
739 }
740
457ca9aa 741 if (!cgroup_use)
ccb4cabe 742 return true;
457ca9aa 743 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
744 p = strtok_r(NULL, ",", &saveptr)) {
745 if (!controller_found(hlist, p)) {
746 ERROR("no %s controller mountpoint found", p);
747 return false;
748 }
749 }
750 return true;
751}
752
/*
 * Return true if the fs type of the mountinfo line @line is fuse.lxcfs,
 * i.e. if the first " - " separator of the line is directly followed by
 * "fuse.lxcfs ".
 */
static bool is_lxcfs(const char *line)
{
	const char *sep = strstr(line, " - ");

	return sep && strncmp(sep, " - fuse.lxcfs ", 14) == 0;
}
761
/*
 * Get the controllers from a mountinfo line.
 *
 * There are other ways we could get this info. For lxcfs, field 3 is
 * /cgroup/controller-list. For cgroupfs, we could parse the mount
 * options. But we simply assume that the mountpoint must be
 * /sys/fs/cgroup/controller-list.
 *
 * Four space-separated fields are skipped, so the field examined is the
 * fifth one. Its end is overwritten with a NUL in @line (get_mountpoint()
 * relies on that) and the controller names are tokenised in place.
 *
 * Returns a NULL-terminated list built with must_append_controller(), or
 * NULL if the line is malformed or the mountpoint is not below
 * /sys/fs/cgroup/.
 */
static char **get_controllers(char **klist, char **nlist, char *line)
{
	int skipped = 0;
	char *field = line, *end, *tok, *state = NULL;
	char **aret = NULL;

	while (skipped < 4) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
		skipped++;
	}

	/* note - if we change how mountinfo works, then our caller
	 * will need to verify /sys/fs/cgroup/ in this field */
	if (strncmp(field, "/sys/fs/cgroup/", 15) != 0) {
		INFO("cgfsng: found hierarchy not under /sys/fs/cgroup: \"%s\"", field);
		return NULL;
	}
	field += 15;

	end = strchr(field, ' ');
	if (!end) {
		ERROR("corrupt mountinfo");
		return NULL;
	}
	*end = '\0';

	tok = strtok_r(field, ",", &state);
	while (tok) {
		must_append_controller(klist, nlist, &aret, tok);
		tok = strtok_r(NULL, ",", &state);
	}

	return aret;
}
804
/*
 * Return true if the fs type of the mountinfo line @line is cgroup,
 * i.e. if the first " - " separator of the line is directly followed by
 * "cgroup ".
 */
static bool is_cgroupfs(char *line)
{
	char *sep = strstr(line, " - ");

	return sep && strncmp(sep, " - cgroup ", 10) == 0;
}
813
814/* Add a controller to our list of hierarchies */
457ca9aa 815static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
816{
817 struct hierarchy *new;
818 int newentry;
819
820 new = must_alloc(sizeof(*new));
821 new->controllers = clist;
822 new->mountpoint = mountpoint;
823 new->base_cgroup = base_cgroup;
824 new->fullcgpath = NULL;
825
457ca9aa
SH
826 newentry = append_null_to_list((void ***)&hierarchies);
827 hierarchies[newentry] = new;
ccb4cabe
SH
828}
829
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * Four space-separated fields are skipped and the rest of the string is
 * copied; get_controllers() has already stuck a \0 after the mountpoint.
 * Returns NULL if the line has too few fields, else a heap string owned
 * by the caller.
 */
static char *get_mountpoint(char *line)
{
	int skipped = 0;
	char *field = line, *copy;
	size_t len;

	while (skipped < 4) {
		field = strchr(field, ' ');
		if (!field)
			return NULL;
		field++;
		skipped++;
	}

	len = strlen(field);
	copy = must_alloc(len + 1);
	memcpy(copy, field, len);
	copy[len] = '\0';

	return copy;
}
853
/*
 * Given a multi-line string, return a NUL-terminated copy of the current
 * line, i.e. of everything from @p up to (not including) the next '\n'.
 * Returns NULL if there is no '\n' after @p. The caller frees the result.
 */
static char *copy_to_eol(char *p)
{
	char *nl, *copy;
	size_t len;

	nl = strchr(p, '\n');
	if (!nl)
		return NULL;

	len = nl - p;
	copy = must_alloc(len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';

	return copy;
}
872
/*
 * cgline: pointer to the character after the first ':' in a line of a
 * \n-terminated /proc/self/cgroup file. Check whether controller @c is
 * present in the comma-separated controller list which runs from @cgline
 * up to the next ':'.
 *
 * The list is scanned in place. No copy is made, so the length of the
 * line (which comes from a file) never ends up as a stack allocation.
 * Empty list entries never match, not even an empty @c.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *end, *tok;
	size_t clen;

	end = strchr(cgline, ':');
	if (!end)
		return false;

	clen = strlen(c);
	tok = cgline;
	while (tok < end) {
		const char *comma = memchr(tok, ',', end - tok);
		const char *tokend = comma ? comma : end;
		size_t toklen = tokend - tok;

		if (toklen > 0 && toklen == clen && memcmp(tok, c, clen) == 0)
			return true;

		if (!comma)
			break;
		tok = comma + 1;
	}

	return false;
}
899
/*
 * @basecginfo is a copy of /proc/$$/cgroup. Return a newly allocated copy
 * of the current cgroup path for @controller, or NULL if no line lists
 * the controller or the file is malformed.
 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	char *pos = basecginfo;

	for (;;) {
		/* skip the hierarchy id: move behind the first ':' */
		pos = strchr(pos, ':');
		if (!pos)
			return NULL;
		pos++;

		if (!controller_in_clist(pos, controller)) {
			/* not our line, go to the next one */
			pos = strchr(pos, '\n');
			if (!pos)
				return NULL;
			pos++;
			continue;
		}

		/* the path starts behind the second ':' */
		pos = strchr(pos, ':');
		if (!pos)
			return NULL;

		return copy_to_eol(pos + 1);
	}
}
927
ccb4cabe
SH
/*
 * Given a hierarchy @mountpoint and base @path, verify that we can create
 * directories underneath it: returns true if access() grants W_OK on
 * @mountpoint/@path.
 */
static bool test_writeable(char *mountpoint, char *path)
{
	char *fullpath;
	bool writeable;

	fullpath = must_make_path(mountpoint, path, NULL);
	writeable = access(fullpath, W_OK) == 0;
	free(fullpath);

	return writeable;
}
941
/*
 * Append a copy of @entry to the NULL-terminated string array *@list.
 * Do not fail. *@list may be NULL (empty list).
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
950
/*
 * Read /proc/self/cgroup and collect the subsystems listed there:
 * entries starting with "name=" are appended to *@nlist (named
 * subsystems), all others to *@klist (kernel subsystems).
 * If the file cannot be opened both lists are left untouched.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return;

	while (getline(&line, &len, f) != -1) {
		char *ctrls, *end, *tok, *state = NULL;

		/* the controller list sits between the first two ':' */
		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		end = strchr(ctrls, ':');
		if (!end)
			continue;
		*end = '\0';

		/* If we have a mixture between cgroup v1 and cgroup v2
		 * hierarchies, then /proc/self/cgroup contains entries of the
		 * form:
		 *
		 *	0::/some/path
		 *
		 * We need to skip those.
		 */
		if (end == ctrls)
			continue;

		tok = strtok_r(ctrls, ",", &state);
		while (tok) {
			char ***dst = strncmp(tok, "name=", 5) == 0 ? nlist : klist;

			must_append_string(dst, tok);
			tok = strtok_r(NULL, ",", &state);
		}
	}

	free(line);
	fclose(f);
}
993
/*
 * Strip trailing newlines from @s in place. The first character is never
 * removed, so a string consisting of a single "\n" is left as it is.
 */
static void trim(char *s)
{
	size_t len;

	for (len = strlen(s); len > 1 && s[len - 1] == '\n'; len--)
		s[len - 1] = '\0';
}
1000
e4aeecf5
CB
1001static void lxc_cgfsng_print_handler_data(const struct cgfsng_handler_data *d)
1002{
1003 printf("Cgroup information:\n");
1004 printf(" container name: %s\n", d->name ? d->name : "(null)");
1005 printf(" lxc.cgroup.use: %s\n", cgroup_use ? cgroup_use : "(null)");
1006 printf(" lxc.cgroup.pattern: %s\n", d->cgroup_pattern ? d->cgroup_pattern : "(null)");
1007 printf(" cgroup: %s\n", d->container_cgroup ? d->container_cgroup : "(null)");
1008}
1009
1010static void lxc_cgfsng_print_hierarchies()
ccb4cabe 1011{
a7b0cc4c 1012 struct hierarchy **it;
ccb4cabe 1013 int i;
41c33dbe 1014
457ca9aa 1015 if (!hierarchies) {
e4aeecf5 1016 printf(" No hierarchies found.");
ccb4cabe
SH
1017 return;
1018 }
e4aeecf5 1019 printf(" Hierarchies:\n");
a7b0cc4c
CB
1020 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1021 char **cit;
ccb4cabe 1022 int j;
e4aeecf5
CB
1023 printf(" %d: base_cgroup %s\n", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1024 printf(" mountpoint %s\n", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1025 printf(" controllers:\n");
a7b0cc4c 1026 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
e4aeecf5 1027 printf(" %d: %s\n", j, *cit);
ccb4cabe
SH
1028 }
1029}
41c33dbe 1030
/*
 * Print @basecginfo followed by the numbered kernel subsystems (@klist)
 * and named subsystems (@nlist) to stdout. Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int idx;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	for (idx = 0; klist && klist[idx]; idx++)
		printf("kernel subsystem %d: %s\n", idx, klist[idx]);

	for (idx = 0; nlist && nlist[idx]; idx++)
		printf("named subsystem %d: %s\n", idx, nlist[idx]);
}
ccb4cabe 1044
e4aeecf5
CB
/*
 * Print all cgfsng debug information to stdout: first the handler data
 * @d, then the global hierarchy table.
 */
static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
{
	lxc_cgfsng_print_handler_data(d);

	lxc_cgfsng_print_hierarchies();
}
1050
ccb4cabe
SH
1051/*
1052 * At startup, parse_hierarchies finds all the info we need about
1053 * cgroup mountpoints and current cgroups, and stores it in @d.
1054 */
457ca9aa 1055static bool parse_hierarchies(void)
ccb4cabe
SH
1056{
1057 FILE *f;
1058 char * line = NULL, *basecginfo;
1059 char **klist = NULL, **nlist = NULL;
1060 size_t len = 0;
1061
d30ec4cb
SH
1062 /*
1063 * Root spawned containers escape the current cgroup, so use init's
1064 * cgroups as our base in that case.
1065 */
ccb4cabe
SH
1066 if (geteuid())
1067 basecginfo = read_file("/proc/self/cgroup");
1068 else
1069 basecginfo = read_file("/proc/1/cgroup");
1070 if (!basecginfo)
1071 return false;
1072
1073 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
d3b00a8f 1074 SYSERROR("Failed opening /proc/self/mountinfo");
ccb4cabe
SH
1075 return false;
1076 }
1077
1078 get_existing_subsystems(&klist, &nlist);
41c33dbe 1079
e4aeecf5
CB
1080 if (lxc_cgfsng_debug)
1081 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1082
1083 /* we support simple cgroup mounts and lxcfs mounts */
1084 while (getline(&line, &len, f) != -1) {
1085 char **controller_list = NULL;
1086 char *mountpoint, *base_cgroup;
1087
1088 if (!is_lxcfs(line) && !is_cgroupfs(line))
1089 continue;
1090
1091 controller_list = get_controllers(klist, nlist, line);
1092 if (!controller_list)
1093 continue;
1094
457ca9aa 1095 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1096 free(controller_list);
1097 continue;
1098 }
1099
1100 mountpoint = get_mountpoint(line);
1101 if (!mountpoint) {
1102 ERROR("Error reading mountinfo: bad line '%s'", line);
1103 free_string_list(controller_list);
1104 continue;
1105 }
1106
1107 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1108 if (!base_cgroup) {
1109 ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]);
1110 free_string_list(controller_list);
1111 free(mountpoint);
1112 continue;
1113 }
1114 trim(base_cgroup);
1115 prune_init_scope(base_cgroup);
1116 if (!test_writeable(mountpoint, base_cgroup)) {
1117 free_string_list(controller_list);
1118 free(mountpoint);
1119 free(base_cgroup);
1120 continue;
1121 }
457ca9aa 1122 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1123 }
1124
1125 free_string_list(klist);
1126 free_string_list(nlist);
1127
1128 free(basecginfo);
1129
1130 fclose(f);
1131 free(line);
1132
e4aeecf5
CB
1133 if (lxc_cgfsng_debug) {
1134 printf("writeable subsystems:\n");
1135 lxc_cgfsng_print_hierarchies();
1136 }
1137
ccb4cabe
SH
1138 /* verify that all controllers in cgroup.use and all crucial
1139 * controllers are accounted for
1140 */
5059aae9
SH
1141 if (!all_controllers_found()) {
1142 INFO("cgfsng: not all controllers were find, deferring to cgfs driver");
ccb4cabe 1143 return false;
5059aae9 1144 }
ccb4cabe
SH
1145
1146 return true;
1147}
1148
457ca9aa
SH
1149static bool collect_hierarchy_info(void)
1150{
1151 const char *tmp;
1152 errno = 0;
1153 tmp = lxc_global_config_value("lxc.cgroup.use");
1154 if (!cgroup_use && errno != 0) { // lxc.cgroup.use can be NULL
1155 SYSERROR("cgfsng: error reading list of cgroups to use");
1156 return false;
1157 }
1158 cgroup_use = must_copy_string(tmp);
1159
1160 return parse_hierarchies();
1161}
1162
ccb4cabe
SH
1163static void *cgfsng_init(const char *name)
1164{
1165 struct cgfsng_handler_data *d;
457ca9aa 1166 const char *cgroup_pattern;
ccb4cabe
SH
1167
1168 d = must_alloc(sizeof(*d));
1169 memset(d, 0, sizeof(*d));
1170
1171 d->name = must_copy_string(name);
1172
ccb4cabe
SH
1173 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
1174 if (!cgroup_pattern) { // lxc.cgroup.pattern is only NULL on error
1175 ERROR("Error getting cgroup pattern");
1176 goto out_free;
1177 }
1178 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1179
e4aeecf5
CB
1180 if (lxc_cgfsng_debug)
1181 lxc_cgfsng_print_debuginfo(d);
ccb4cabe
SH
1182
1183 return d;
1184
1185out_free:
1186 free_handler_data(d);
1187 return NULL;
1188}
1189
/*
 * Join all passed-in strings into a single path. Does not fail.
 *
 * @first is copied as-is; every following component that does not start
 * with '/' gets a '/' inserted in front of it. The variadic argument list
 * must be terminated by NULL. The caller frees the result.
 */
static char *must_make_path(const char *first, ...)
{
	va_list ap;
	char *piece;
	char *path = must_copy_string(first);
	size_t path_len = strlen(first);

	va_start(ap, first);
	for (piece = va_arg(ap, char *); piece; piece = va_arg(ap, char *)) {
		bool need_sep = piece[0] != '/';

		/* account for the component plus an optional separator */
		path_len += strlen(piece) + (need_sep ? 1 : 0);
		path = must_realloc(path, path_len + 1);
		if (need_sep)
			strcat(path, "/");
		strcat(path, piece);
	}
	va_end(ap);

	return path;
}
1216
/*
 * Recursively remove the directory @dirname.
 *
 * Every sub-directory is removed first (depth-first), then rmdir() is
 * attempted on @dirname itself. Entries that are not directories are
 * skipped. Processing continues after a failure so that as much as possible
 * is removed; only the first failure at this level is logged.
 *
 * Returns 0 on success, -1 if opening the directory or any later step failed.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		char *pathname;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(pathname, &mystat)) {
			if (!r)
				WARN("failed to stat %s", pathname);
			r = -1;
		} else if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0)
				r = -1;
		}

		free(pathname);
	}

	if (rmdir(dirname) < 0) {
		if (!r)
			WARN("failed to delete %s: %s", dirname, strerror(errno));
		r = -1;
	}

	if (closedir(dir) < 0) {
		/* this is a close failure, not a delete failure */
		if (!r)
			WARN("failed to close directory %s: %s", dirname, strerror(errno));
		r = -1;
	}

	return r;
}
1268
/*
 * Callback passed to userns_exec_1() by recursive_destroy().
 *
 * Tries to switch to gid 0 / uid 0 and to drop all supplementary groups, then
 * removes the cgroup directory whose path is passed in @data. A failure to
 * switch ids is logged but does not stop the removal attempt.
 *
 * Returns the result of cgroup_rmdir().
 */
static int rmdir_wrapper(void *data)
{
	if (setresgid(0, 0, 0) < 0)
		SYSERROR("Failed to setgid to 0");

	if (setresuid(0, 0, 0) < 0)
		SYSERROR("Failed to setuid to 0");

	if (setgroups(0, NULL) < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir((char *)data);
}
1282
1283void recursive_destroy(char *path, struct lxc_conf *conf)
1284{
1285 int r;
1286 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e 1287 r = userns_exec_1(conf, rmdir_wrapper, path, "rmdir_wrapper");
ccb4cabe
SH
1288 else
1289 r = cgroup_rmdir(path);
1290
1291 if (r < 0)
1c9da8da 1292 ERROR("Error destroying %s", path);
ccb4cabe
SH
1293}
1294
1295static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1296{
1297 struct cgfsng_handler_data *d = hdata;
1298
1299 if (!d)
1300 return;
1301
457ca9aa 1302 if (d->container_cgroup && hierarchies) {
ccb4cabe 1303 int i;
457ca9aa
SH
1304 for (i = 0; hierarchies[i]; i++) {
1305 struct hierarchy *h = hierarchies[i];
e2db2a89 1306 if (h->fullcgpath) {
ccb4cabe
SH
1307 recursive_destroy(h->fullcgpath, conf);
1308 free(h->fullcgpath);
1309 h->fullcgpath = NULL;
1310 }
1311 }
1312 }
1313
1314 free_handler_data(d);
1315}
1316
1317struct cgroup_ops *cgfsng_ops_init(void)
1318{
e4aeecf5
CB
1319 if (getenv("LXC_DEBUG_CGFSNG"))
1320 lxc_cgfsng_debug = true;
1321
457ca9aa
SH
1322 if (!collect_hierarchy_info())
1323 return NULL;
e4aeecf5 1324
ccb4cabe
SH
1325 return &cgfsng_ops;
1326}
1327
1328static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1329{
e3a3fecf 1330 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
6f9584d8
CB
1331 if (dir_exists(h->fullcgpath)) { // it must not already exist
1332 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1333 return false;
6f9584d8
CB
1334 }
1335 if (!handle_cpuset_hierarchy(h, cgname)) {
1336 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1337 return false;
6f9584d8 1338 }
e3a3fecf 1339 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1340}
1341
1342static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1343{
1344 if (rmdir(h->fullcgpath) < 0)
1345 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1346 free(h->fullcgpath);
1347 h->fullcgpath = NULL;
1348}
1349
1350/*
d30ec4cb 1351 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1352 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1353 */
1354static inline bool cgfsng_create(void *hdata)
1355{
1356 struct cgfsng_handler_data *d = hdata;
1357 char *tmp, *cgname, *offset;
1358 int i, idx = 0;
1359 size_t len;
1360
1361 if (!d)
1362 return false;
1363 if (d->container_cgroup) {
1364 WARN("cgfsng_create called a second time");
1365 return false;
1366 }
1367
1368 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
1369 if (!tmp) {
1370 ERROR("Failed expanding cgroup name pattern");
1371 return false;
1372 }
1373 len = strlen(tmp) + 5; // leave room for -NNN\0
1374 cgname = must_alloc(len);
1375 strcpy(cgname, tmp);
1376 free(tmp);
1377 offset = cgname + len - 5;
1378
1379again:
95adfe93
SH
1380 if (idx == 1000) {
1381 ERROR("Too many conflicting cgroup names");
ccb4cabe 1382 goto out_free;
95adfe93 1383 }
ccb4cabe
SH
1384 if (idx)
1385 snprintf(offset, 5, "-%d", idx);
457ca9aa
SH
1386 for (i = 0; hierarchies[i]; i++) {
1387 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1388 int j;
457ca9aa
SH
1389 SYSERROR("Failed to create %s: %s", hierarchies[i]->fullcgpath, strerror(errno));
1390 free(hierarchies[i]->fullcgpath);
1391 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1392 for (j = 0; j < i; j++)
457ca9aa 1393 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1394 idx++;
1395 goto again;
1396 }
1397 }
1398 /* Done */
1399 d->container_cgroup = cgname;
1400 return true;
1401
1402out_free:
1403 free(cgname);
1404 return false;
1405}
1406
ccb4cabe
SH
1407static bool cgfsng_enter(void *hdata, pid_t pid)
1408{
ccb4cabe
SH
1409 char pidstr[25];
1410 int i, len;
1411
1412 len = snprintf(pidstr, 25, "%d", pid);
1413 if (len < 0 || len > 25)
1414 return false;
1415
457ca9aa
SH
1416 for (i = 0; hierarchies[i]; i++) {
1417 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1418 "cgroup.procs", NULL);
1419 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1420 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1421 free(fullpath);
1422 return false;
1423 }
1424 free(fullpath);
1425 }
1426
1427 return true;
1428}
1429
/* Argument bundle handed to chown_cgroup_wrapper() through userns_exec_1(). */
struct chown_data {
	/* handler data of the container */
	struct cgfsng_handler_data *d;
	/* target uid in parent namespace */
	uid_t origuid;
};
1434
c0888dfe
SH
1435/*
1436 * chgrp the container cgroups to container group. We leave
1437 * the container owner as cgroup owner. So we must make the
1438 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1439 *
1440 * Also chown the tasks and cgroup.procs files. Those may not
1441 * exist depending on kernel version.
c0888dfe 1442 */
ccb4cabe
SH
1443static int chown_cgroup_wrapper(void *data)
1444{
1445 struct chown_data *arg = data;
ccb4cabe
SH
1446 uid_t destuid;
1447 int i;
1448
1449 if (setresgid(0,0,0) < 0)
1450 SYSERROR("Failed to setgid to 0");
1451 if (setresuid(0,0,0) < 0)
1452 SYSERROR("Failed to setuid to 0");
1453 if (setgroups(0, NULL) < 0)
1454 SYSERROR("Failed to clear groups");
1455
1456 destuid = get_ns_uid(arg->origuid);
1457
457ca9aa
SH
1458 for (i = 0; hierarchies[i]; i++) {
1459 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1460
1461 if (chown(path, destuid, 0) < 0) {
ab8f5424 1462 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1463 return -1;
1464 }
c0888dfe 1465
43647298 1466 if (chmod(path, 0775) < 0) {
ab8f5424 1467 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1468 return -1;
1469 }
ccb4cabe 1470
ab8f5424
SH
1471 /*
1472 * Failures to chown these are inconvenient but not detrimental
1473 * We leave these owned by the container launcher, so that container
1474 * root can write to the files to attach. We chmod them 664 so that
1475 * container systemd can write to the files (which systemd in wily
1476 * insists on doing)
1477 */
43647298
SH
1478 fullpath = must_make_path(path, "tasks", NULL);
1479 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1480 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1481 strerror(errno));
ab8f5424 1482 if (chmod(fullpath, 0664) < 0)
13277ec4 1483 WARN("Error chmoding %s: %s", path, strerror(errno));
43647298
SH
1484 free(fullpath);
1485
1486 fullpath = must_make_path(path, "cgroup.procs", NULL);
1487 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
13277ec4 1488 WARN("Failed chowning %s to %d: %s", fullpath, (int) destuid,
1489 strerror(errno));
ab8f5424 1490 if (chmod(fullpath, 0664) < 0)
13277ec4 1491 WARN("Error chmoding %s: %s", path, strerror(errno));
ccb4cabe
SH
1492 free(fullpath);
1493 }
1494
1495 return 0;
1496}
1497
1498static bool cgfsns_chown(void *hdata, struct lxc_conf *conf)
1499{
1500 struct cgfsng_handler_data *d = hdata;
1501 struct chown_data wrap;
1502
1503 if (!d)
1504 return false;
1505
1506 if (lxc_list_empty(&conf->id_map))
1507 return true;
1508
1509 wrap.d = d;
1510 wrap.origuid = geteuid();
1511
c9b7c33e
CB
1512 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1513 "chown_cgroup_wrapper") < 0) {
ccb4cabe
SH
1514 ERROR("Error requesting cgroup chown in new namespace");
1515 return false;
1516 }
1517
1518 return true;
1519}
1520
8aa1044f
SH
1521/*
1522 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1523 * symlinks any more - just use mount
1524 */
1525
1526/* mount cgroup-full if requested */
1527static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1528 char *container_cgroup)
1529{
1530 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1531 return 0;
1532 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1533 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1534 dest);
1535 return -1;
1536 }
1537 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1538 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1539 MS_REMOUNT | MS_RDONLY;
1540 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1541 SYSERROR("Error remounting %s readonly", dest);
1542 return -1;
1543 }
1544 }
1545
1546 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1547 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1548 return 0;
1549
1550 /* mount just the container path rw */
1551 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1552 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f 1553 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
13277ec4 1554 WARN("Failed to mount %s read-write: %s", rwpath,
1555 strerror(errno));
8aa1044f
SH
1556 INFO("Made %s read-write", rwpath);
1557 free(rwpath);
1558 free(source);
1559 return 0;
1560}
1561
1562/* cgroup-full:* is done, no need to create subdirs */
1563static bool cg_mount_needs_subdirs(int type)
1564{
1565 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1566 return false;
1567 return true;
1568}
1569
1570/*
1571 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1572 * created, remount controller ro if needed and bindmount the
1573 * cgroupfs onto controll/the/cg/path
1574 */
1575static int
1576do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1577 char *controllerpath, char *cgpath,
1578 const char *container_cgroup)
1579{
1580 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1581 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1582 SYSERROR("Error bind-mounting %s", controllerpath);
1583 return -1;
1584 }
1585 if (mount(controllerpath, controllerpath, "cgroup",
1586 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1587 SYSERROR("Error remounting %s read-only", controllerpath);
1588 return -1;
1589 }
1590 INFO("Remounted %s read-only", controllerpath);
1591 }
1592 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1593 int flags = MS_BIND;
1594 if (type == LXC_AUTO_CGROUP_RO)
1595 flags |= MS_RDONLY;
1596 INFO("Mounting %s onto %s", sourcepath, cgpath);
1597 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1598 free(sourcepath);
1599 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1600 cgpath);
1601 return -1;
1602 }
1603 free(sourcepath);
1604 INFO("Completed second stage cgroup automounts for %s", cgpath);
1605 return 0;
1606}
1607
ccb4cabe
SH
1608static bool cgfsng_mount(void *hdata, const char *root, int type)
1609{
8aa1044f
SH
1610 struct cgfsng_handler_data *d = hdata;
1611 char *tmpfspath = NULL;
1612 bool retval = false;
a8de4c49 1613 int i;
8aa1044f
SH
1614
1615 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1616 return true;
1617
ccb4cabe
SH
1618 if (cgns_supported())
1619 return true;
8aa1044f
SH
1620
1621 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1622
1623 if (type == LXC_AUTO_CGROUP_NOSPEC)
1624 type = LXC_AUTO_CGROUP_MIXED;
1625 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1626 type = LXC_AUTO_CGROUP_FULL_MIXED;
1627
1628 /* Mount tmpfs */
1629 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1630 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1631 "size=10240k,mode=755",
1632 root) < 0)
1633 goto bad;
1634
457ca9aa 1635 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1636 char *controllerpath, *path2;
457ca9aa 1637 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1638 char *controller = strrchr(h->mountpoint, '/');
1639 int r;
1640
1641 if (!controller)
1642 continue;
1643 controller++;
1644 controllerpath = must_make_path(tmpfspath, controller, NULL);
1645 if (dir_exists(controllerpath)) {
1646 free(controllerpath);
1647 continue;
1648 }
1649 if (mkdir(controllerpath, 0755) < 0) {
1650 SYSERROR("Error creating cgroup path: %s", controllerpath);
1651 free(controllerpath);
1652 goto bad;
1653 }
1654 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1655 free(controllerpath);
1656 goto bad;
1657 }
1658 if (!cg_mount_needs_subdirs(type)) {
1659 free(controllerpath);
1660 continue;
1661 }
ef4413fa 1662 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1663 if (mkdir_p(path2, 0755) < 0) {
1664 free(controllerpath);
1665 goto bad;
1666 }
2f62fb00 1667
8aa1044f
SH
1668 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1669 d->container_cgroup);
1670 free(controllerpath);
1671 free(path2);
1672 if (r < 0)
1673 goto bad;
1674 }
1675 retval = true;
1676
1677bad:
1678 free(tmpfspath);
1679 return retval;
ccb4cabe
SH
1680}
1681
/*
 * Count the lines of the cgroup.procs files found in @dirname and in all of
 * its sub-directories.
 *
 * Returns 0 when @dirname cannot be opened. Entries that cannot be stat'ed
 * and cgroup.procs files whose lines cannot be counted are skipped.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *entry;
	DIR *dir;
	char *path;
	int total = 0, nlines;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		struct stat sb;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);

		/* descend into sub-directories only */
		if (lstat(path, &sb) == 0 && S_ISDIR(sb.st_mode))
			total += recursive_count_nrtasks(path);

		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	nlines = lxc_count_file_lines(path);
	if (nlines != -1)
		total += nlines;
	free(path);

	(void) closedir(dir);

	return total;
}
1726
1727static int cgfsng_nrtasks(void *hdata) {
1728 struct cgfsng_handler_data *d = hdata;
1729 char *path;
1730 int count;
1731
457ca9aa 1732 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1733 return -1;
457ca9aa 1734 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1735 count = recursive_count_nrtasks(path);
1736 free(path);
1737 return count;
1738}
1739
1740/* Only root needs to escape to the cgroup of its init */
7103fe6f 1741static bool cgfsng_escape()
ccb4cabe 1742{
ccb4cabe
SH
1743 int i;
1744
1745 if (geteuid())
1746 return true;
1747
457ca9aa
SH
1748 for (i = 0; hierarchies[i]; i++) {
1749 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1750 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1751 "cgroup.procs", NULL);
1752 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1753 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1754 free(fullpath);
6df334d1 1755 return false;
ccb4cabe
SH
1756 }
1757 free(fullpath);
1758 }
1759
6df334d1 1760 return true;
ccb4cabe
SH
1761}
1762
36662416
TA
1763static int cgfsng_num_hierarchies(void)
1764{
1765 int i;
1766
1767 for (i = 0; hierarchies[i]; i++)
1768 ;
1769
1770 return i;
1771}
1772
1773static bool cgfsng_get_hierarchies(int n, char ***out)
1774{
1775 int i;
1776
1777 /* sanity check n */
1778 for (i = 0; i < n; i++) {
1779 if (!hierarchies[i])
1780 return false;
1781 }
1782
1783 *out = hierarchies[i]->controllers;
1784
1785 return true;
1786}
1787
ccb4cabe
SH
1788#define THAWED "THAWED"
1789#define THAWED_LEN (strlen(THAWED))
1790
1791static bool cgfsng_unfreeze(void *hdata)
1792{
ccb4cabe 1793 char *fullpath;
457ca9aa 1794 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1795
457ca9aa 1796 if (!h)
ccb4cabe
SH
1797 return false;
1798 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1799 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1800 free(fullpath);
1801 return false;
1802 }
1803 free(fullpath);
1804 return true;
1805}
1806
1807static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1808{
457ca9aa 1809 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1810 if (!h)
1811 return NULL;
1812
371f834d
SH
1813 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1814}
1815
1816/*
1817 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
1818 * full path, which must be freed by the caller.
1819 */
1820static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1821 const char *inpath,
1822 const char *filename)
1823{
1824 /*
1825 * XXX Remove this case after 2.0 release. It's for dealing with
1826 * containers spawned under the old buggy cgfsng which wasn't around
1827 * for long.
1828 */
1829 if (strncmp(inpath, "/sys/fs/cgroup/", 15) == 0)
1830 return must_make_path(inpath, filename, NULL);
1831 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
1832}
1833
1834static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1835{
ccb4cabe
SH
1836 char pidstr[25];
1837 int i, len;
1838
1839 len = snprintf(pidstr, 25, "%d", pid);
1840 if (len < 0 || len > 25)
1841 return false;
1842
457ca9aa 1843 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1844 char *path, *fullpath;
457ca9aa 1845 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1846
1847 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1848 if (!path) // not running
1849 continue;
1850
371f834d
SH
1851 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1852 free(path);
ccb4cabe
SH
1853 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1854 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1855 free(fullpath);
ccb4cabe
SH
1856 return false;
1857 }
ccb4cabe
SH
1858 free(fullpath);
1859 }
1860
ccb4cabe
SH
1861 return true;
1862}
1863
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename in front of the first '.'.
 * Returns the result of lxc_read_from_file(), or -1 when the container
 * reports no cgroup path or no hierarchy provides the subsystem.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t name_size = strlen(filename) + 1;
	char *subsystem = alloca(name_size);
	char *dot, *cgpath;
	struct hierarchy *h;
	int ret = -1;

	/* subsystem = filename cut at the first '.' */
	memcpy(subsystem, filename, name_size);
	dot = strchr(subsystem, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (!cgpath) // not running
		return -1;

	h = get_hierarchy(subsystem);
	if (h) {
		char *target = build_full_cgpath_from_monitorpath(h, cgpath, filename);

		ret = lxc_read_from_file(target, value, len);
		free(target);
	}

	free(cgpath);

	return ret;
}
1895
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename in front of the first '.'.
 * Returns the result of lxc_write_to_file(), or -1 when the container
 * reports no cgroup path or no hierarchy provides the subsystem.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t name_size = strlen(filename) + 1;
	char *subsystem = alloca(name_size);
	char *dot, *cgpath;
	struct hierarchy *h;
	int ret = -1;

	/* subsystem = filename cut at the first '.' */
	memcpy(subsystem, filename, name_size);
	dot = strchr(subsystem, '.');
	if (dot)
		*dot = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
	if (!cgpath) // not running
		return -1;

	h = get_hierarchy(subsystem);
	if (h) {
		char *target = build_full_cgpath_from_monitorpath(h, cgpath, filename);

		ret = lxc_write_to_file(target, value, strlen(value), false);
		free(target);
	}

	free(cgpath);

	return ret;
}
1927
ccb4cabe
SH
1928/*
1929 * Called from setup_limits - here we have the container's cgroup_data because
1930 * we created the cgroups
1931 */
1932static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
1933{
1934 char *subsystem = NULL, *p;
1935 int ret = -1;
1936 struct hierarchy *h;
1937
1938 subsystem = alloca(strlen(filename) + 1);
1939 strcpy(subsystem, filename);
1940 if ((p = strchr(subsystem, '.')) != NULL)
1941 *p = '\0';
1942
457ca9aa 1943 h = get_hierarchy(subsystem);
ccb4cabe
SH
1944 if (h) {
1945 char *fullpath = must_make_path(h->fullcgpath, filename, NULL);
1946 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
1947 free(fullpath);
1948 }
1949 return ret;
1950}
1951
1952static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
1953 bool do_devices)
1954{
1955 struct cgfsng_handler_data *d = hdata;
1956 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
1957 struct lxc_cgroup *cg;
ccb4cabe
SH
1958 bool ret = false;
1959
1960 if (lxc_list_empty(cgroup_settings))
1961 return true;
1962
1963 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
1964 if (!sorted_cgroup_settings) {
1965 return false;
1966 }
1967
ccb4cabe
SH
1968 lxc_list_for_each(iterator, sorted_cgroup_settings) {
1969 cg = iterator->elem;
1970
1971 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
1972 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
1973 if (do_devices && (errno == EACCES || errno == EPERM)) {
1974 WARN("Error setting %s to %s for %s",
1975 cg->subsystem, cg->value, d->name);
1976 continue;
1977 }
1978 SYSERROR("Error setting %s to %s for %s",
1979 cg->subsystem, cg->value, d->name);
1980 goto out;
1981 }
1982 }
1983
1984 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
1985 }
1986
1987 ret = true;
1988 INFO("cgroup has been setup");
1989out:
ccb4cabe
SH
1990 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
1991 lxc_list_del(iterator);
1992 free(iterator);
1993 }
1994 free(sorted_cgroup_settings);
1995 return ret;
1996}
1997
1998static struct cgroup_ops cgfsng_ops = {
1999 .init = cgfsng_init,
2000 .destroy = cgfsng_destroy,
2001 .create = cgfsng_create,
2002 .enter = cgfsng_enter,
ccb4cabe 2003 .escape = cgfsng_escape,
36662416
TA
2004 .num_hierarchies = cgfsng_num_hierarchies,
2005 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
2006 .get_cgroup = cgfsng_get_cgroup,
2007 .get = cgfsng_get,
2008 .set = cgfsng_set,
2009 .unfreeze = cgfsng_unfreeze,
2010 .setup_limits = cgfsng_setup_limits,
2011 .name = "cgroupfs-ng",
2012 .attach = cgfsng_attach,
2013 .chown = cgfsns_chown,
2014 .mount_cgroup = cgfsng_mount,
2015 .nrtasks = cgfsng_nrtasks,
2016 .driver = CGFSNG,
2017
2018 /* unsupported */
2019 .create_legacy = NULL,
2020};