]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cgroup: improve isolcpus handling
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24/*
25 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
26 * cgroup backend. The original cgfs.c was designed to be as flexible
27 * as possible. It would try to find cgroup filesystems no matter where
28 * or how you had them mounted, and deduce the most usable mount for
29 * each controller. It also was not designed for unprivileged use, as
30 * that was reserved for cgmanager.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
a54694f8 36
ccb4cabe 37#include "config.h"
a54694f8
CB
38
39#include <ctype.h>
40#include <dirent.h>
41#include <errno.h>
42#include <grp.h>
43#include <stdint.h>
ccb4cabe
SH
44#include <stdio.h>
45#include <stdlib.h>
a54694f8 46#include <string.h>
ccb4cabe 47#include <unistd.h>
a54694f8 48#include <sys/types.h>
ccb4cabe 49
d8e48992 50#include "bdev.h"
ccb4cabe 51#include "cgroup.h"
ccb4cabe 52#include "commands.h"
a54694f8
CB
53#include "log.h"
54#include "utils.h"
ccb4cabe
SH
55
56lxc_log_define(lxc_cgfsng, lxc);
57
58static struct cgroup_ops cgfsng_ops;
59
ccb4cabe
SH
/*
 * A descriptor for a mounted hierarchy
 * @controllers: either NULL, or a null-terminated list of all
 *   the co-mounted controllers
 * @mountpoint: the mountpoint we will use.  It will be either
 *   /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist
 * @base_cgroup: the cgroup under which the container cgroup path
 *   is created.  This will be either the caller's cgroup (if not
 *   root), or init's cgroup (if root).
 * @fullcgpath: NULL when the hierarchy is first registered.
 *   NOTE(review): presumably the full path of the container's cgroup
 *   in this hierarchy once it has been created - confirm.
 */
struct hierarchy {
	char **controllers;
	char *mountpoint;
	char *base_cgroup;
	char *fullcgpath;
};
76
/*
 * The cgroup data which is attached to the lxc_handler.
 * @cgroup_pattern - a copy of the lxc.cgroup.pattern
 * @container_cgroup - if not null, the cgroup which was created for
 *   the container.  For each hierarchy, it is created under the
 *   @hierarchy->base_cgroup directory.  Relative to the base_cgroup
 *   it is the same for all hierarchies.
 * @name - the container name
 */
struct cgfsng_handler_data {
	char *cgroup_pattern;
	char *container_cgroup; /* cgroup we created for the container */
	char *name;             /* container name */
};
91
457ca9aa
SH
/*
 * @hierarchies - a NULL-terminated array of struct hierarchy, one per
 *   hierarchy.  No duplicates.  First sufficient, writeable mounted
 *   hierarchy wins
 */
struct hierarchy **hierarchies;

/*
 * @cgroup_use - a copy of the lxc.cgroup.use
 */
char *cgroup_use;
103
ccb4cabe
SH
/*
 * Free a NULL-terminated array of heap-allocated strings together with
 * the array itself.  A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **it;

	if (!clist)
		return;

	for (it = clist; *it; it++)
		free(*it);
	free(clist);
}
114
/*
 * Re-allocate a pointer, do not fail: the call is retried until
 * realloc() succeeds.
 *
 * A request for 0 bytes is bumped to 1 byte.  realloc(ptr, 0) is allowed
 * to release @orig and return NULL, which would otherwise make the retry
 * loop spin forever while handing an already freed pointer back to
 * realloc().
 */
static void *must_realloc(void *orig, size_t sz)
{
	void *ret;

	if (sz == 0)
		sz = 1;

	do {
		ret = realloc(orig, sz);
	} while (!ret);

	return ret;
}
125
/* Allocate @sz bytes via must_realloc(), do not fail */
static void *must_alloc(size_t sz)
{
	return must_realloc(NULL, sz);
}
131
/*
 * Return a heap-allocated copy of string @entry; do not fail (the
 * allocation is retried until it succeeds).  A NULL @entry yields NULL.
 * The caller owns the returned string.
 */
static char *must_copy_string(const char *entry)
{
	char *copy;
	size_t size;

	if (!entry)
		return NULL;

	size = strlen(entry) + 1;
	do {
		copy = malloc(size);
	} while (!copy);

	memcpy(copy, entry, size);
	return copy;
}
144
/*
 * This is a special case - return a newly allocated copy of @entry
 * with 'name=' prepended.  I.e. turn systemd into name=systemd.
 * Do not fail.  @entry must not be NULL.
 */
static char *must_prefix_named(char *entry)
{
	static const char prefix[] = "name=";
	/* sizeof(prefix) already accounts for the terminating NUL byte */
	size_t size = sizeof(prefix) + strlen(entry);
	char *ret;

	ret = must_alloc(size);
	snprintf(ret, size, "%s%s", prefix, entry);
	return ret;
}
159
/*
 * Given a pointer to a null-terminated array of pointers, realloc to
 * add one entry, and point the new entry to NULL.  Do not fail.  Return
 * the index to the second-to-last entry - that is, the one which is
 * now available for use (keeping the list null-terminated).
 *
 * The returned slot is set to NULL as well, so the list is a valid
 * null-terminated array even before the caller stores into it (on the
 * very first call the slot would otherwise hold uninitialised memory).
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	if (*list)
		while ((*list)[newentry])
			newentry++;

	*list = must_realloc(*list, ((size_t)newentry + 2) * sizeof(void *));
	(*list)[newentry] = NULL;
	(*list)[newentry + 1] = NULL;
	return newentry;
}
177
/*
 * Given a null-terminated array of strings, check whether @entry
 * is one of the strings.  A NULL @list or NULL @entry never matches.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **it;

	if (!list || !entry)
		return false;

	for (it = list; *it; it++) {
		if (strcmp(*it, entry) == 0)
			return true;
	}

	return false;
}
194
/*
 * Append an entry to the clist.  Do not fail.
 * *clist must be NULL the first time we are called.
 *
 * We also handle named subsystems here.  Any controller which is not a
 * kernel subsystem, we prefix 'name='.  Any which is both a kernel and
 * named subsystem, we refuse to use because we're not sure which we
 * have here.  (TODO - we could work around this in some cases by just
 * remounting to be unambiguous, or by comparing mountpoint contents
 * with current cgroup)
 *
 * @klist: known kernel subsystems, @nlist: known named subsystems.
 * The last entry of *clist will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	char *copy;
	int slot;

	if (is_kernel && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller '%s'", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	/* Already prefixed or a kernel subsystem: keep the name verbatim. */
	if (strncmp(entry, "name=", 5) == 0 || is_kernel)
		copy = must_copy_string(entry);
	else
		copy = must_prefix_named(entry);

	(*clist)[slot] = copy;
}
230
ccb4cabe
SH
231static void free_handler_data(struct cgfsng_handler_data *d)
232{
ccb4cabe
SH
233 free(d->cgroup_pattern);
234 free(d->container_cgroup);
235 free(d->name);
236 free(d);
237}
238
239/*
240 * Given a handler's cgroup data, return the struct hierarchy for the
241 * controller @c, or NULL if there is none.
242 */
457ca9aa 243struct hierarchy *get_hierarchy(const char *c)
ccb4cabe
SH
244{
245 int i;
246
457ca9aa 247 if (!hierarchies)
ccb4cabe 248 return NULL;
457ca9aa
SH
249 for (i = 0; hierarchies[i]; i++) {
250 if (string_in_list(hierarchies[i]->controllers, c))
251 return hierarchies[i];
ccb4cabe
SH
252 }
253 return NULL;
254}
255
e3a3fecf
SH
/*
 * Defined further down.  The argument list must be terminated by NULL;
 * the sentinel attribute lets the compiler warn when it is not.
 */
static char *must_make_path(const char *first, ...) __attribute__((sentinel));
257
a54694f8
CB
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes.  Memory is grown in multiples
 * of BATCH_SIZE, and only when *@mem is still NULL or @newlen falls into
 * a later batch than @oldlen.  Do not fail.
 *
 * The batch counts are size_t so that large lengths are not truncated.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}
268
/*
 * Append the @newlen bytes of @new, plus the NUL byte that follows
 * them, to the buffer *@dest which currently holds @oldlen bytes.
 * The buffer is grown as needed; do not fail.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t total = oldlen + newlen;

	/* +1 for the terminating NUL copied along with the line */
	batch_realloc(dest, oldlen, total + 1);

	memcpy(*dest + oldlen, new, newlen + 1);
}
277
/*
 * Slurp in a whole file.
 *
 * Returns a heap-allocated, NUL-terminated buffer owned by the caller,
 * or NULL if @fnam cannot be opened or no line could be read from it.
 */
static char *read_file(char *fnam)
{
	FILE *f;
	char *line = NULL, *buf = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen; /* getline() returns ssize_t, not int */

	f = fopen(fnam, "r");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, (size_t)linelen);
		fulllen += (size_t)linelen;
	}

	fclose(f);
	free(line);
	return buf;
}
297
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
/* Number of uint32_t words needed to hold @nr bits. */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The mask is built from an unsigned 32-bit one: shifting a signed int
 * left by 31 is undefined behaviour.
 */

/* Set bit number @bit in the bit array @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
317
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * @buf is tokenised in place (its commas are overwritten), @nbits is the
 * number of bits the mask must hold.  Returns a calloc()ed array the
 * caller must free, or NULL on allocation failure, on a token that does
 * not start with a number, on an inverted range, or on a cpu number
 * >= @nbits.  Tokens consisting only of whitespace (e.g. the lone "\n"
 * of an empty list) are skipped instead of being read as cpu 0.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token;
	char *saveptr = NULL;
	size_t arrlen = BITS_TO_LONGS(nbits);
	uint32_t *bitarr;

	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return NULL;

	for (; (token = strtok_r(buf, ",", &saveptr)); buf = NULL) {
		unsigned long start, end;
		char *endptr, *range;

		while (isspace((unsigned char)*token))
			token++;
		if (*token == '\0')
			continue;

		/* unsigned long keeps the full strtoul() result, no truncation */
		errno = 0;
		start = strtoul(token, &endptr, 0);
		if (errno != 0 || endptr == token)
			goto on_error;
		end = start;

		range = strchr(token, '-');
		if (range) {
			end = strtoul(range + 1, &endptr, 0);
			if (errno != 0 || endptr == range + 1)
				goto on_error;
		}

		if (start > end || end >= nbits)
			goto on_error;

		for (; start <= end; start++)
			set_bit((unsigned)start, bitarr);
	}

	return bitarr;

on_error:
	free(bitarr);
	return NULL;
}
359
/* The largest value a 64-bit unsigned integer can hold is 2^64 - 1. This
 * is a 20-digit number; one more byte is needed for the terminating NUL.
 */
#define __IN_TO_STR_LEN 21
/*
 * Turn cpumask into simple, comma-separated cpulist.
 *
 * @bitarr holds @nbits bits; only bits 0 .. @nbits - 1 are examined.
 * Returns the joined string, or NULL on error or when no bit is set.
 */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
	size_t i;
	int ret;
	char numstr[__IN_TO_STR_LEN] = {0};
	char **cpulist = NULL;
	char *joined;

	/* i < nbits: bit number @nbits may lie past the end of the array */
	for (i = 0; i < nbits; i++) {
		if (!is_set(i, bitarr))
			continue;

		ret = snprintf(numstr, __IN_TO_STR_LEN, "%zu", i);
		if (ret < 0 || (size_t)ret >= __IN_TO_STR_LEN)
			goto on_error;

		if (lxc_append_string(&cpulist, numstr) < 0)
			goto on_error;
	}

	/* Nothing to join; do not hand a NULL array to lxc_string_join(). */
	if (!cpulist)
		return NULL;

	/* NOTE(review): assumes lxc_string_join() returns a newly allocated
	 * string and keeps no reference to the parts (it takes them as
	 * const char **), so the array can be released here - confirm.
	 */
	joined = lxc_string_join(",", (const char **)cpulist, false);
	lxc_free_array((void **)cpulist, free);
	return joined;

on_error:
	lxc_free_array((void **)cpulist, free);
	return NULL;
}
387
/*
 * Return the number that follows the last ',' or '-' in @cpulist (or
 * the number at the start of the string if it contains neither).  For
 * an ascending cpulist such as "0,2-3" that is the highest cpu number.
 *
 * Returns -1 if strtoul() reports an error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last_comma, *last_dash;
	char *start = cpulist;
	unsigned long cpus;

	last_comma = strrchr(cpulist, ',');
	last_dash = strrchr(cpulist, '-');

	/* Pointers are only compared once both are known to be non-NULL. */
	if (last_comma && (!last_dash || last_comma > last_dash))
		start = last_comma + 1;
	else if (last_dash)
		start = last_dash + 1;

	errno = 0;
	cpus = strtoul(start, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
422
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
/*
 * Write a "cpuset.cpus" value for the cgroup directory @path that is
 * derived from the parent directory's "cpuset.cpus" with every cpu
 * listed in __ISOL_CPUS removed.
 *
 * @path: cgroup directory; it must contain a '/'.  It is modified only
 *   while the parent's file name is built and is restored before any
 *   other work is done, so it is intact on every return path.
 * @am_initialized: true if the cgroup was already set up by someone
 *   else.  In that case the file is only rewritten when an isolated cpu
 *   actually had to be removed.  If false, a cpu list is always written
 *   (the parent's list, minus isolated cpus), because nobody else has
 *   populated it.
 *
 * Returns true on success, false on any read, parse or write error.
 */
static bool filter_and_set_cpus(char *path, bool am_initialized)
{
	char *lastslash, *fpath = NULL, oldv;
	int ret;
	ssize_t i;

	ssize_t maxposs = 0, maxisol = 0;
	char *cpulist = NULL, *posscpus = NULL, *isolcpus = NULL;
	uint32_t *possmask = NULL, *isolmask = NULL;
	bool bret = false, flipped_bit = false, have_isolated = false;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("Invalid path: %s.", path);
		return bret;
	}

	/* Cut off the last component just long enough to name the parent. */
	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	*lastslash = oldv;

	posscpus = read_file(fpath);
	if (!posscpus) {
		SYSERROR("Could not read file: %s.\n", fpath);
		goto on_error;
	}

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0)
		goto on_error;

	if (!file_exists(__ISOL_CPUS)) {
		/* This system doesn't expose isolated cpus. */
		DEBUG("Path: "__ISOL_CPUS" to read isolated cpus from does not exist.\n");
	} else {
		isolcpus = read_file(__ISOL_CPUS);
		if (!isolcpus) {
			SYSERROR("Could not read file "__ISOL_CPUS);
			goto on_error;
		}

		if (isdigit((unsigned char)isolcpus[0]))
			have_isolated = true;
		else
			DEBUG("No isolated cpus detected.");
	}

	if (!have_isolated) {
		/* No isolated cpus but we were already initialized by someone.
		 * Nothing more to do for us.
		 */
		if (am_initialized)
			goto on_success;

		/* No isolated cpus but we weren't already initialized by
		 * someone. We should simply copy the parents cpuset.cpus
		 * values.
		 */
		DEBUG("Copying cpuset of parent cgroup.");
		cpulist = posscpus;
		goto copy_parent;
	}

	/* Get maximum number of cpus found in isolated cpuset. */
	maxisol = get_max_cpus(isolcpus);
	if (maxisol < 0)
		goto on_error;

	/* From here on maxposs is a bit count: highest cpu number + 1. */
	if (maxposs < maxisol)
		maxposs = maxisol;
	maxposs++;

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask) {
		ERROR("Could not create cpumask for all possible cpus.\n");
		goto on_error;
	}

	isolmask = lxc_cpumask(isolcpus, maxposs);
	if (!isolmask) {
		ERROR("Could not create cpumask for all isolated cpus.\n");
		goto on_error;
	}

	/* i < maxposs: the masks hold exactly maxposs bits. */
	for (i = 0; i < maxposs; i++) {
		if (is_set(i, isolmask) && is_set(i, possmask)) {
			flipped_bit = true;
			clear_bit(i, possmask);
		}
	}

	if (!flipped_bit) {
		DEBUG("No isolated cpus present in cpuset.");
		if (am_initialized)
			goto on_success;
		/* Not initialized yet: the list still has to be written.
		 * posscpus was tokenised in place by lxc_cpumask(), so the
		 * list is rebuilt from the mask below.
		 */
	} else {
		DEBUG("Removed isolated cpus from cpuset.");
	}

	cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
	if (!cpulist) {
		ERROR("Could not create cpu list.\n");
		goto on_error;
	}

copy_parent:
	/* Replace the parent's file name by the one inside @path. */
	free(fpath);
	fpath = must_make_path(path, "cpuset.cpus", NULL);
	ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false);
	if (ret < 0) {
		SYSERROR("Could not write cpu list to: %s.\n", fpath);
		goto on_error;
	}

on_success:
	bret = true;

on_error:
	free(fpath);

	free(isolcpus);
	free(isolmask);

	/* cpulist may alias posscpus; make sure it is freed only once. */
	if (posscpus != cpulist)
		free(posscpus);
	free(possmask);

	free(cpulist);
	return bret;
}
547
e3a3fecf
SH
/*
 * Copy contents of parent(@path)/@file to @path/@file.
 *
 * @path must contain a '/'.  It is modified only while the parent's file
 * name is built and is restored right afterwards, so it is intact on
 * every return path.  Returns true if the value was read and written.
 */
static bool copy_parent_file(char *path, char *file)
{
	char *lastslash, *value = NULL, *fpath, oldv;
	int len = 0;
	int ret;

	lastslash = strrchr(path, '/');
	if (!lastslash) { /* bug... this shouldn't be possible */
		ERROR("cgfsng:copy_parent_file: bad path %s", path);
		return false;
	}

	oldv = *lastslash;
	*lastslash = '\0';
	fpath = must_make_path(path, file, NULL);
	*lastslash = oldv;

	/* A NULL buffer asks for the length only. */
	len = lxc_read_from_file(fpath, NULL, 0);
	if (len <= 0)
		goto bad;

	value = must_alloc(len + 1);
	if (lxc_read_from_file(fpath, value, len) != len)
		goto bad;
	/* Terminate the value: it is printed with %s if the write fails. */
	value[len] = '\0';
	free(fpath);

	fpath = must_make_path(path, file, NULL);
	ret = lxc_write_to_file(fpath, value, len, false);
	if (ret < 0)
		SYSERROR("Unable to write %s to %s", value, fpath);
	free(fpath);
	free(value);
	return ret >= 0;

bad:
	SYSERROR("Error reading '%s'", fpath);
	free(fpath);
	free(value);
	return false;
}
585
586/*
587 * Initialize the cpuset hierarchy in first directory of @gname and
588 * set cgroup.clone_children so that children inherit settings.
589 * Since the h->base_path is populated by init or ourselves, we know
590 * it is already initialized.
591 */
a54694f8 592static bool handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf
SH
593{
594 char *cgpath, *clonechildrenpath, v, *slash;
595
596 if (!string_in_list(h->controllers, "cpuset"))
597 return true;
598
599 if (*cgname == '/')
600 cgname++;
601 slash = strchr(cgname, '/');
602 if (slash)
603 *slash = '\0';
604
605 cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
606 if (slash)
607 *slash = '/';
608 if (mkdir(cgpath, 0755) < 0 && errno != EEXIST) {
609 SYSERROR("Failed to create '%s'", cgpath);
610 free(cgpath);
611 return false;
612 }
6f9584d8 613
e3a3fecf
SH
614 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
615 if (!file_exists(clonechildrenpath)) { /* unified hierarchy doesn't have clone_children */
616 free(clonechildrenpath);
617 free(cgpath);
618 return true;
619 }
620 if (lxc_read_from_file(clonechildrenpath, &v, 1) < 0) {
621 SYSERROR("Failed to read '%s'", clonechildrenpath);
622 free(clonechildrenpath);
623 free(cgpath);
624 return false;
625 }
626
a54694f8 627 /* Make sure any isolated cpus are removed from cpuset.cpus. */
6f9584d8
CB
628 if (!filter_and_set_cpus(cgpath, v == '1')) {
629 SYSERROR("Failed to remove isolated cpus.");
630 free(clonechildrenpath);
631 free(cgpath);
a54694f8 632 return false;
6f9584d8 633 }
a54694f8 634
e3a3fecf 635 if (v == '1') { /* already set for us by someone else */
6f9584d8 636 DEBUG("\"cgroup.clone_children\" was already set to \"1\".");
e3a3fecf
SH
637 free(clonechildrenpath);
638 free(cgpath);
639 return true;
640 }
641
642 /* copy parent's settings */
a54694f8 643 if (!copy_parent_file(cgpath, "cpuset.mems")) {
6f9584d8 644 SYSERROR("Failed to copy \"cpuset.mems\" settings.");
e3a3fecf
SH
645 free(cgpath);
646 free(clonechildrenpath);
647 return false;
648 }
649 free(cgpath);
650
651 if (lxc_write_to_file(clonechildrenpath, "1", 1, false) < 0) {
652 /* Set clone_children so children inherit our settings */
653 SYSERROR("Failed to write 1 to %s", clonechildrenpath);
654 free(clonechildrenpath);
655 return false;
656 }
657 free(clonechildrenpath);
658 return true;
659}
660
ccb4cabe
SH
/*
 * Given two null-terminated lists of strings, return true if any string
 * is in both.  A NULL list intersects with nothing.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **it;

	if (!l1 || !l2)
		return false;

	for (it = l1; *it; it++) {
		if (string_in_list(l2, *it))
			return true;
	}

	return false;
}
678
679/*
680 * For a null-terminated list of controllers @clist, return true if any of
681 * those controllers is already listed the null-terminated list of
682 * hierarchies @hlist. Realistically, if one is present, all must be present.
683 */
684static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
685{
686 int i;
687
688 if (!hlist)
689 return false;
690 for (i = 0; hlist[i]; i++)
691 if (controller_lists_intersect(hlist[i]->controllers, clist))
692 return true;
693 return false;
694
695}
696
697/*
698 * Return true if the controller @entry is found in the null-terminated
699 * list of hierarchies @hlist
700 */
701static bool controller_found(struct hierarchy **hlist, char *entry)
702{
703 int i;
704 if (!hlist)
705 return false;
706
707 for (i = 0; hlist[i]; i++)
708 if (string_in_list(hlist[i]->controllers, entry))
709 return true;
710 return false;
711}
712
713/*
c30b61c3
SH
714 * Return true if all of the controllers which we require have been found.
715 * The required list is freezer and anything in * lxc.cgroup.use.
ccb4cabe 716 */
457ca9aa 717static bool all_controllers_found(void)
ccb4cabe
SH
718{
719 char *p, *saveptr = NULL;
457ca9aa 720 struct hierarchy ** hlist = hierarchies;
ccb4cabe 721
ccb4cabe
SH
722 if (!controller_found(hlist, "freezer")) {
723 ERROR("no freezer controller mountpoint found");
724 return false;
725 }
726
457ca9aa 727 if (!cgroup_use)
ccb4cabe 728 return true;
457ca9aa 729 for (p = strtok_r(cgroup_use, ",", &saveptr); p;
ccb4cabe
SH
730 p = strtok_r(NULL, ",", &saveptr)) {
731 if (!controller_found(hlist, p)) {
732 ERROR("no %s controller mountpoint found", p);
733 return false;
734 }
735 }
736 return true;
737}
738
/*
 * Return true if the fs type is fuse.lxcfs, i.e. if the text right
 * after the first " - " separator in @line is "fuse.lxcfs ".
 */
static bool is_lxcfs(const char *line)
{
	const char *sep = strstr(line, " - ");

	if (!sep)
		return false;

	return strncmp(sep, " - fuse.lxcfs ", 14) == 0;
}
747
/*
 * Get the controllers from a mountinfo line
 * There are other ways we could get this info.  For lxcfs, field 3
 * is /cgroup/controller-list.  For cgroupfs, we could parse the mount
 * options.  But we simply assume that the mountpoint must be
 * /sys/fs/cgroup/controller-list
 *
 * Side effect: the blank that follows the mountpoint in @line is
 * overwritten with '\0'.
 *
 * Returns a NULL-terminated, heap-allocated list of controller names, or
 * NULL if the line is not a /sys/fs/cgroup/ mount or nothing was added.
 */
static char **get_controllers(char **klist, char **nlist, char *line)
{
	/* four fields are skipped; the next one is
	 * /sys/fs/cgroup/comma-delimited-controller-list
	 */
	int i;
	char *p = line, *end, *tok, *saveptr = NULL;
	char **aret = NULL;

	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* note - if we change how mountinfo works, then our caller
	 * will need to verify /sys/fs/cgroup/ in this field */
	if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
		return NULL;
	p += 15;

	end = strchr(p, ' ');
	if (!end) {
		ERROR("corrupt mountinfo");
		return NULL;
	}
	*end = '\0';

	for (tok = strtok_r(p, ",", &saveptr); tok;
	     tok = strtok_r(NULL, ",", &saveptr))
		must_append_controller(klist, nlist, &aret, tok);

	return aret;
}
788
/*
 * Return true if the fstype is cgroup, i.e. if the text right after the
 * first " - " separator in @line is "cgroup ".
 */
static bool is_cgroupfs(const char *line)
{
	const char *sep = strstr(line, " - ");

	if (!sep)
		return false;

	return strncmp(sep, " - cgroup ", 10) == 0;
}
797
798/* Add a controller to our list of hierarchies */
457ca9aa 799static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
ccb4cabe
SH
800{
801 struct hierarchy *new;
802 int newentry;
803
804 new = must_alloc(sizeof(*new));
805 new->controllers = clist;
806 new->mountpoint = mountpoint;
807 new->base_cgroup = base_cgroup;
808 new->fullcgpath = NULL;
809
457ca9aa
SH
810 newentry = append_null_to_list((void ***)&hierarchies);
811 hierarchies[newentry] = new;
ccb4cabe
SH
812}
813
/*
 * Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo
 *
 * Four blank-separated fields are skipped and everything from there up
 * to the next '\0' is copied, so this relies on the mountpoint having
 * been terminated in place beforehand.  Returns NULL if the line has
 * fewer fields; otherwise a heap-allocated string owned by the caller.
 */
static char *get_mountpoint(char *line)
{
	int i;
	char *p = line;

	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* we've already stuck a \0 after the mountpoint */
	return must_copy_string(p);
}
837
/*
 * Given a multi-line string, return a null-terminated copy of the
 * current line (without its '\n').  If the text ends without a final
 * newline, the remainder of the string is taken as the line.
 * The caller owns the returned string.
 */
static char *copy_to_eol(char *p)
{
	char *eol = strchr(p, '\n');
	char *sret;
	size_t len;

	len = eol ? (size_t)(eol - p) : strlen(p);

	sret = must_alloc(len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}
856
/*
 * cgline: pointer to character after the first ':' in a line in a
 * \n-terminated /proc/self/cgroup file.  Check whether controller c is
 * present in the comma-separated list that runs up to the next ':'.
 *
 * The list is compared in place; @cgline is not modified and no
 * temporary copy is made.  Empty list items never match.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	const char *end, *tok, *sep;
	size_t clen;

	end = strchr(cgline, ':');
	if (!end)
		return false;

	clen = strlen(c);
	if (clen == 0)
		return false;

	for (tok = cgline; tok < end; tok = sep + 1) {
		sep = memchr(tok, ',', (size_t)(end - tok));
		if (!sep)
			sep = end;

		if ((size_t)(sep - tok) == clen && memcmp(tok, c, clen) == 0)
			return true;
	}

	return false;
}
883
/*
 * @basecginfo is a copy of /proc/$$/cgroup.  Return the current
 * cgroup for @controller as a heap-allocated string, or NULL if no line
 * lists that controller.
 */
static char *get_current_cgroup(char *basecginfo, char *controller)
{
	char *p = basecginfo;

	for (;;) {
		/* step over the text before the first ':' of this line */
		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (controller_in_clist(p, controller)) {
			/* the cgroup path follows the next ':' */
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		/* not on this line, move on to the next one */
		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}
911
ccb4cabe
SH
/*
 * Given a hierarchy @mountpoint and base @path, verify that we can create
 * directories underneath it, i.e. that access(2) grants W_OK on the
 * combined path.
 */
static bool test_writeable(char *mountpoint, char *path)
{
	char *fullpath;
	bool writeable;

	fullpath = must_make_path(mountpoint, path, NULL);
	writeable = access(fullpath, W_OK) == 0;
	free(fullpath);

	return writeable;
}
925
/*
 * Append a copy of @entry to the NULL-terminated string list *@list,
 * growing the list by one.  Do not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
934
/*
 * Read /proc/self/cgroup and collect the controllers named in the second
 * ':'-separated field of every line: entries starting with "name=" are
 * appended to @nlist, all others to @klist.  If the file cannot be
 * opened, both lists are left untouched.
 */
static void get_existing_subsystems(char ***klist, char ***nlist)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return;

	while (getline(&line, &len, f) != -1) {
		char *ctrls, *ctrls_end, *tok, *saveptr = NULL;

		ctrls = strchr(line, ':');
		if (!ctrls)
			continue;
		ctrls++;

		ctrls_end = strchr(ctrls, ':');
		if (!ctrls_end)
			continue;
		*ctrls_end = '\0';

		/* If we have a mixture between cgroup v1 and cgroup v2
		 * hierarchies, then /proc/self/cgroup contains entries of the
		 * form:
		 *
		 *	0::/some/path
		 *
		 * We need to skip those.
		 */
		if (ctrls_end == ctrls)
			continue;

		for (tok = strtok_r(ctrls, ",", &saveptr); tok;
		     tok = strtok_r(NULL, ",", &saveptr)) {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	free(line);
	fclose(f);
}
977
/*
 * Strip all trailing '\n' characters from @s in place.
 * Empty strings and strings made up only of newlines are handled: the
 * length is checked before the last character is looked at.
 */
static void trim(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && s[len - 1] == '\n')
		s[--len] = '\0';
}
984
ccb4cabe
SH
985static void print_init_debuginfo(struct cgfsng_handler_data *d)
986{
a7b0cc4c 987 struct hierarchy **it;
ccb4cabe 988 int i;
41c33dbe
SH
989
990 if (!getenv("LXC_DEBUG_CGFSNG"))
991 return;
992
a7b0cc4c
CB
993 DEBUG("Cgroup information:");
994 DEBUG(" container name: %s", d->name ? d->name : "(null)");
995 DEBUG(" lxc.cgroup.use: %s", cgroup_use ? cgroup_use : "(null)");
996 DEBUG(" lxc.cgroup.pattern: %s", d->cgroup_pattern ? d->cgroup_pattern : "(null)");
997 DEBUG(" cgroup: %s", d->container_cgroup ? d->container_cgroup : "(null)");
457ca9aa 998 if (!hierarchies) {
a7b0cc4c 999 DEBUG(" No hierarchies found.");
ccb4cabe
SH
1000 return;
1001 }
a7b0cc4c
CB
1002 DEBUG(" Hierarchies:");
1003 for (i = 0, it = hierarchies; it && *it; it++, i++) {
1004 char **cit;
ccb4cabe 1005 int j;
a7b0cc4c
CB
1006 DEBUG(" %d: base_cgroup %s", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1007 DEBUG(" mountpoint %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1008 DEBUG(" controllers:");
1009 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1010 DEBUG(" %d: %s", j, *cit);
ccb4cabe
SH
1011 }
1012}
41c33dbe
SH
1013
/*
 * Print @basecginfo and the kernel (@klist) and named (@nlist)
 * subsystem lists to stdout.  Nothing is printed unless
 * LXC_DEBUG_CGFSNG is set in the environment.
 */
static void print_basecg_debuginfo(char *basecginfo, char **klist, char **nlist)
{
	int k;
	char **it;

	if (!getenv("LXC_DEBUG_CGFSNG"))
		return;

	printf("basecginfo is:\n");
	printf("%s\n", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		printf("kernel subsystem %d: %s\n", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		printf("named subsystem %d: %s\n", k, *it);
}
ccb4cabe
SH
1029
1030/*
1031 * At startup, parse_hierarchies finds all the info we need about
1032 * cgroup mountpoints and current cgroups, and stores it in @d.
1033 */
457ca9aa 1034static bool parse_hierarchies(void)
ccb4cabe
SH
1035{
1036 FILE *f;
1037 char * line = NULL, *basecginfo;
1038 char **klist = NULL, **nlist = NULL;
1039 size_t len = 0;
1040
d30ec4cb
SH
1041 /*
1042 * Root spawned containers escape the current cgroup, so use init's
1043 * cgroups as our base in that case.
1044 */
ccb4cabe
SH
1045 if (geteuid())
1046 basecginfo = read_file("/proc/self/cgroup");
1047 else
1048 basecginfo = read_file("/proc/1/cgroup");
1049 if (!basecginfo)
1050 return false;
1051
1052 if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
d3b00a8f 1053 SYSERROR("Failed opening /proc/self/mountinfo");
ccb4cabe
SH
1054 return false;
1055 }
1056
1057 get_existing_subsystems(&klist, &nlist);
41c33dbe
SH
1058
1059 print_basecg_debuginfo(basecginfo, klist, nlist);
ccb4cabe
SH
1060
1061 /* we support simple cgroup mounts and lxcfs mounts */
1062 while (getline(&line, &len, f) != -1) {
1063 char **controller_list = NULL;
1064 char *mountpoint, *base_cgroup;
1065
1066 if (!is_lxcfs(line) && !is_cgroupfs(line))
1067 continue;
1068
1069 controller_list = get_controllers(klist, nlist, line);
1070 if (!controller_list)
1071 continue;
1072
457ca9aa 1073 if (controller_list_is_dup(hierarchies, controller_list)) {
ccb4cabe
SH
1074 free(controller_list);
1075 continue;
1076 }
1077
1078 mountpoint = get_mountpoint(line);
1079 if (!mountpoint) {
1080 ERROR("Error reading mountinfo: bad line '%s'", line);
1081 free_string_list(controller_list);
1082 continue;
1083 }
1084
1085 base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
1086 if (!base_cgroup) {
1087 ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]);
1088 free_string_list(controller_list);
1089 free(mountpoint);
1090 continue;
1091 }
1092 trim(base_cgroup);
1093 prune_init_scope(base_cgroup);
1094 if (!test_writeable(mountpoint, base_cgroup)) {
1095 free_string_list(controller_list);
1096 free(mountpoint);
1097 free(base_cgroup);
1098 continue;
1099 }
457ca9aa 1100 add_controller(controller_list, mountpoint, base_cgroup);
ccb4cabe
SH
1101 }
1102
1103 free_string_list(klist);
1104 free_string_list(nlist);
1105
1106 free(basecginfo);
1107
1108 fclose(f);
1109 free(line);
1110
ccb4cabe
SH
1111 /* verify that all controllers in cgroup.use and all crucial
1112 * controllers are accounted for
1113 */
457ca9aa 1114 if (!all_controllers_found())
ccb4cabe
SH
1115 return false;
1116
1117 return true;
1118}
1119
457ca9aa
SH
1120static bool collect_hierarchy_info(void)
1121{
1122 const char *tmp;
1123 errno = 0;
1124 tmp = lxc_global_config_value("lxc.cgroup.use");
1125 if (!cgroup_use && errno != 0) { // lxc.cgroup.use can be NULL
1126 SYSERROR("cgfsng: error reading list of cgroups to use");
1127 return false;
1128 }
1129 cgroup_use = must_copy_string(tmp);
1130
1131 return parse_hierarchies();
1132}
1133
ccb4cabe
SH
1134static void *cgfsng_init(const char *name)
1135{
1136 struct cgfsng_handler_data *d;
457ca9aa 1137 const char *cgroup_pattern;
ccb4cabe
SH
1138
1139 d = must_alloc(sizeof(*d));
1140 memset(d, 0, sizeof(*d));
1141
1142 d->name = must_copy_string(name);
1143
ccb4cabe
SH
1144 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
1145 if (!cgroup_pattern) { // lxc.cgroup.pattern is only NULL on error
1146 ERROR("Error getting cgroup pattern");
1147 goto out_free;
1148 }
1149 d->cgroup_pattern = must_copy_string(cgroup_pattern);
1150
ccb4cabe
SH
1151 print_init_debuginfo(d);
1152
1153 return d;
1154
1155out_free:
1156 free_handler_data(d);
1157 return NULL;
1158}
1159
/*
 * Join @first and every following string of the NULL-terminated argument
 * list into one newly allocated path. A '/' is inserted in front of each
 * following piece that does not already start with one. Allocation goes
 * through the must_* helpers; the caller frees the result.
 */
static char *must_make_path(const char *first, ...)
{
	va_list ap;
	const char *piece;
	char *path;
	size_t used = strlen(first);

	path = must_copy_string(first);

	va_start(ap, first);
	for (piece = va_arg(ap, char *); piece; piece = va_arg(ap, char *)) {
		size_t piece_len = strlen(piece);
		bool need_sep = piece[0] != '/';

		/* room for the optional separator, the piece and the '\0' */
		path = must_realloc(path, used + piece_len + (need_sep ? 1 : 0) + 1);
		if (need_sep)
			path[used++] = '/';
		/* copy the terminator along with the piece */
		memcpy(path + used, piece, piece_len + 1);
		used += piece_len;
	}
	va_end(ap);

	return path;
}
1186
/*
 * Recursively remove the directory @dirname: each subdirectory is removed
 * first, then @dirname itself. Entries that are not directories are skipped.
 *
 * Removal is best effort: the walk continues after a failure. Only the first
 * failure is logged, and -1 is returned if anything went wrong, 0 otherwise.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		char *pathname;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(pathname, &mystat)) {
			if (!r)
				WARN("failed to stat %s", pathname);
			r = -1;
		} else if (S_ISDIR(mystat.st_mode) && cgroup_rmdir(pathname) < 0) {
			r = -1;
		}

		free(pathname);
	}

	if (rmdir(dirname) < 0) {
		if (!r)
			WARN("%s: failed to delete %s: %m", __func__, dirname);
		r = -1;
	}

	if (closedir(dir) < 0) {
		/* report the close failure as such, not as a failed delete */
		if (!r)
			WARN("%s: failed to close directory %s: %m", __func__, dirname);
		r = -1;
	}

	return r;
}
1238
/*
 * Callback form of cgroup_rmdir(): try to switch to uid/gid 0 and clear the
 * supplementary groups, then remove the directory whose path is passed in
 * @data. A failed id switch is logged but does not stop the removal.
 */
static int rmdir_wrapper(void *data)
{
	if (setresgid(0, 0, 0) < 0)
		SYSERROR("Failed to setgid to 0");

	if (setresuid(0, 0, 0) < 0)
		SYSERROR("Failed to setuid to 0");

	if (setgroups(0, NULL) < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir((char *)data);
}
1252
1253void recursive_destroy(char *path, struct lxc_conf *conf)
1254{
1255 int r;
1256 if (conf && !lxc_list_empty(&conf->id_map))
1257 r = userns_exec_1(conf, rmdir_wrapper, path);
1258 else
1259 r = cgroup_rmdir(path);
1260
1261 if (r < 0)
1c9da8da 1262 ERROR("Error destroying %s", path);
ccb4cabe
SH
1263}
1264
1265static void cgfsng_destroy(void *hdata, struct lxc_conf *conf)
1266{
1267 struct cgfsng_handler_data *d = hdata;
1268
1269 if (!d)
1270 return;
1271
457ca9aa 1272 if (d->container_cgroup && hierarchies) {
ccb4cabe 1273 int i;
457ca9aa
SH
1274 for (i = 0; hierarchies[i]; i++) {
1275 struct hierarchy *h = hierarchies[i];
e2db2a89 1276 if (h->fullcgpath) {
ccb4cabe
SH
1277 recursive_destroy(h->fullcgpath, conf);
1278 free(h->fullcgpath);
1279 h->fullcgpath = NULL;
1280 }
1281 }
1282 }
1283
1284 free_handler_data(d);
1285}
1286
1287struct cgroup_ops *cgfsng_ops_init(void)
1288{
457ca9aa
SH
1289 if (!collect_hierarchy_info())
1290 return NULL;
ccb4cabe
SH
1291 return &cgfsng_ops;
1292}
1293
1294static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1295{
e3a3fecf 1296 h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
6f9584d8
CB
1297 if (dir_exists(h->fullcgpath)) { // it must not already exist
1298 ERROR("Path \"%s\" already existed.", h->fullcgpath);
d8da679e 1299 return false;
6f9584d8
CB
1300 }
1301 if (!handle_cpuset_hierarchy(h, cgname)) {
1302 ERROR("Failed to handle cgroupfs v1 cpuset controller.");
e3a3fecf 1303 return false;
6f9584d8 1304 }
e3a3fecf 1305 return mkdir_p(h->fullcgpath, 0755) == 0;
ccb4cabe
SH
1306}
1307
1308static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1309{
1310 if (rmdir(h->fullcgpath) < 0)
1311 SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath);
1312 free(h->fullcgpath);
1313 h->fullcgpath = NULL;
1314}
1315
1316/*
d30ec4cb 1317 * Try to create the same cgroup in all hierarchies.
ccb4cabe
SH
1318 * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999
1319 */
1320static inline bool cgfsng_create(void *hdata)
1321{
1322 struct cgfsng_handler_data *d = hdata;
1323 char *tmp, *cgname, *offset;
1324 int i, idx = 0;
1325 size_t len;
1326
1327 if (!d)
1328 return false;
1329 if (d->container_cgroup) {
1330 WARN("cgfsng_create called a second time");
1331 return false;
1332 }
1333
1334 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
1335 if (!tmp) {
1336 ERROR("Failed expanding cgroup name pattern");
1337 return false;
1338 }
1339 len = strlen(tmp) + 5; // leave room for -NNN\0
1340 cgname = must_alloc(len);
1341 strcpy(cgname, tmp);
1342 free(tmp);
1343 offset = cgname + len - 5;
1344
1345again:
95adfe93
SH
1346 if (idx == 1000) {
1347 ERROR("Too many conflicting cgroup names");
ccb4cabe 1348 goto out_free;
95adfe93 1349 }
ccb4cabe
SH
1350 if (idx)
1351 snprintf(offset, 5, "-%d", idx);
457ca9aa
SH
1352 for (i = 0; hierarchies[i]; i++) {
1353 if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
ccb4cabe 1354 int j;
457ca9aa
SH
1355 SYSERROR("Failed to create %s: %s", hierarchies[i]->fullcgpath, strerror(errno));
1356 free(hierarchies[i]->fullcgpath);
1357 hierarchies[i]->fullcgpath = NULL;
ccb4cabe 1358 for (j = 0; j < i; j++)
457ca9aa 1359 remove_path_for_hierarchy(hierarchies[j], cgname);
ccb4cabe
SH
1360 idx++;
1361 goto again;
1362 }
1363 }
1364 /* Done */
1365 d->container_cgroup = cgname;
1366 return true;
1367
1368out_free:
1369 free(cgname);
1370 return false;
1371}
1372
ccb4cabe
SH
1373static bool cgfsng_enter(void *hdata, pid_t pid)
1374{
ccb4cabe
SH
1375 char pidstr[25];
1376 int i, len;
1377
1378 len = snprintf(pidstr, 25, "%d", pid);
1379 if (len < 0 || len > 25)
1380 return false;
1381
457ca9aa
SH
1382 for (i = 0; hierarchies[i]; i++) {
1383 char *fullpath = must_make_path(hierarchies[i]->fullcgpath,
ccb4cabe
SH
1384 "cgroup.procs", NULL);
1385 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
d3b00a8f 1386 SYSERROR("Failed to enter %s", fullpath);
ccb4cabe
SH
1387 free(fullpath);
1388 return false;
1389 }
1390 free(fullpath);
1391 }
1392
1393 return true;
1394}
1395
/* Argument bundle handed to chown_cgroup_wrapper(). */
struct chown_data {
	/* handler data of the container */
	struct cgfsng_handler_data *d;
	/* target uid in parent namespace */
	uid_t origuid;
};
1400
c0888dfe
SH
1401/*
1402 * chgrp the container cgroups to container group. We leave
1403 * the container owner as cgroup owner. So we must make the
1404 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1405 *
1406 * Also chown the tasks and cgroup.procs files. Those may not
1407 * exist depending on kernel version.
c0888dfe 1408 */
ccb4cabe
SH
1409static int chown_cgroup_wrapper(void *data)
1410{
1411 struct chown_data *arg = data;
ccb4cabe
SH
1412 uid_t destuid;
1413 int i;
1414
1415 if (setresgid(0,0,0) < 0)
1416 SYSERROR("Failed to setgid to 0");
1417 if (setresuid(0,0,0) < 0)
1418 SYSERROR("Failed to setuid to 0");
1419 if (setgroups(0, NULL) < 0)
1420 SYSERROR("Failed to clear groups");
1421
1422 destuid = get_ns_uid(arg->origuid);
1423
457ca9aa
SH
1424 for (i = 0; hierarchies[i]; i++) {
1425 char *fullpath, *path = hierarchies[i]->fullcgpath;
43647298
SH
1426
1427 if (chown(path, destuid, 0) < 0) {
ab8f5424 1428 SYSERROR("Error chowning %s to %d", path, (int) destuid);
ccb4cabe
SH
1429 return -1;
1430 }
c0888dfe 1431
43647298 1432 if (chmod(path, 0775) < 0) {
ab8f5424 1433 SYSERROR("Error chmoding %s", path);
c0888dfe
SH
1434 return -1;
1435 }
ccb4cabe 1436
ab8f5424
SH
1437 /*
1438 * Failures to chown these are inconvenient but not detrimental
1439 * We leave these owned by the container launcher, so that container
1440 * root can write to the files to attach. We chmod them 664 so that
1441 * container systemd can write to the files (which systemd in wily
1442 * insists on doing)
1443 */
43647298
SH
1444 fullpath = must_make_path(path, "tasks", NULL);
1445 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
1446 WARN("Failed chowning %s to %d: %m", fullpath, (int) destuid);
ab8f5424
SH
1447 if (chmod(fullpath, 0664) < 0)
1448 WARN("Error chmoding %s: %m", path);
43647298
SH
1449 free(fullpath);
1450
1451 fullpath = must_make_path(path, "cgroup.procs", NULL);
1452 if (chown(fullpath, destuid, 0) < 0 && errno != ENOENT)
1453 WARN("Failed chowning %s to %d: %m", fullpath, (int) destuid);
ab8f5424
SH
1454 if (chmod(fullpath, 0664) < 0)
1455 WARN("Error chmoding %s: %m", path);
ccb4cabe
SH
1456 free(fullpath);
1457 }
1458
1459 return 0;
1460}
1461
1462static bool cgfsns_chown(void *hdata, struct lxc_conf *conf)
1463{
1464 struct cgfsng_handler_data *d = hdata;
1465 struct chown_data wrap;
1466
1467 if (!d)
1468 return false;
1469
1470 if (lxc_list_empty(&conf->id_map))
1471 return true;
1472
1473 wrap.d = d;
1474 wrap.origuid = geteuid();
1475
1476 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap) < 0) {
1477 ERROR("Error requesting cgroup chown in new namespace");
1478 return false;
1479 }
1480
1481 return true;
1482}
1483
8aa1044f
SH
1484/*
1485 * We've safe-mounted a tmpfs as parent, so we don't need to protect against
1486 * symlinks any more - just use mount
1487 */
1488
1489/* mount cgroup-full if requested */
1490static int mount_cgroup_full(int type, struct hierarchy *h, char *dest,
1491 char *container_cgroup)
1492{
1493 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1494 return 0;
1495 if (mount(h->mountpoint, dest, "cgroup", MS_BIND, NULL) < 0) {
1496 SYSERROR("Error bind-mounting %s cgroup onto %s", h->mountpoint,
1497 dest);
1498 return -1;
1499 }
1500 if (type != LXC_AUTO_CGROUP_FULL_RW) {
5b6f9369
SH
1501 unsigned long flags = MS_BIND | MS_NOSUID | MS_NOEXEC | MS_NODEV |
1502 MS_REMOUNT | MS_RDONLY;
1503 if (mount(NULL, dest, "cgroup", flags, NULL) < 0) {
8aa1044f
SH
1504 SYSERROR("Error remounting %s readonly", dest);
1505 return -1;
1506 }
1507 }
1508
1509 INFO("Bind mounted %s onto %s", h->mountpoint, dest);
1510 if (type != LXC_AUTO_CGROUP_FULL_MIXED)
1511 return 0;
1512
1513 /* mount just the container path rw */
1514 char *source = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
5b6f9369 1515 char *rwpath = must_make_path(dest, h->base_cgroup, container_cgroup, NULL);
8aa1044f
SH
1516 if (mount(source, rwpath, "cgroup", MS_BIND, NULL) < 0)
1517 WARN("Failed to mount %s read-write: %m", rwpath);
1518 INFO("Made %s read-write", rwpath);
1519 free(rwpath);
1520 free(source);
1521 return 0;
1522}
1523
1524/* cgroup-full:* is done, no need to create subdirs */
1525static bool cg_mount_needs_subdirs(int type)
1526{
1527 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1528 return false;
1529 return true;
1530}
1531
1532/*
1533 * After $rootfs/sys/fs/container/controller/the/cg/path has been
1534 * created, remount controller ro if needed and bindmount the
1535 * cgroupfs onto controll/the/cg/path
1536 */
1537static int
1538do_secondstage_mounts_if_needed(int type, struct hierarchy *h,
1539 char *controllerpath, char *cgpath,
1540 const char *container_cgroup)
1541{
1542 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1543 if (mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL) < 0) {
1544 SYSERROR("Error bind-mounting %s", controllerpath);
1545 return -1;
1546 }
1547 if (mount(controllerpath, controllerpath, "cgroup",
1548 MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
1549 SYSERROR("Error remounting %s read-only", controllerpath);
1550 return -1;
1551 }
1552 INFO("Remounted %s read-only", controllerpath);
1553 }
1554 char *sourcepath = must_make_path(h->mountpoint, h->base_cgroup, container_cgroup, NULL);
1555 int flags = MS_BIND;
1556 if (type == LXC_AUTO_CGROUP_RO)
1557 flags |= MS_RDONLY;
1558 INFO("Mounting %s onto %s", sourcepath, cgpath);
1559 if (mount(sourcepath, cgpath, "cgroup", flags, NULL) < 0) {
1560 free(sourcepath);
1561 SYSERROR("Error mounting cgroup %s onto %s", h->controllers[0],
1562 cgpath);
1563 return -1;
1564 }
1565 free(sourcepath);
1566 INFO("Completed second stage cgroup automounts for %s", cgpath);
1567 return 0;
1568}
1569
ccb4cabe
SH
1570static bool cgfsng_mount(void *hdata, const char *root, int type)
1571{
8aa1044f
SH
1572 struct cgfsng_handler_data *d = hdata;
1573 char *tmpfspath = NULL;
1574 bool retval = false;
a8de4c49 1575 int i;
8aa1044f
SH
1576
1577 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1578 return true;
1579
ccb4cabe
SH
1580 if (cgns_supported())
1581 return true;
8aa1044f
SH
1582
1583 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1584
1585 if (type == LXC_AUTO_CGROUP_NOSPEC)
1586 type = LXC_AUTO_CGROUP_MIXED;
1587 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1588 type = LXC_AUTO_CGROUP_FULL_MIXED;
1589
1590 /* Mount tmpfs */
1591 if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
1592 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1593 "size=10240k,mode=755",
1594 root) < 0)
1595 goto bad;
1596
457ca9aa 1597 for (i = 0; hierarchies[i]; i++) {
8aa1044f 1598 char *controllerpath, *path2;
457ca9aa 1599 struct hierarchy *h = hierarchies[i];
8aa1044f
SH
1600 char *controller = strrchr(h->mountpoint, '/');
1601 int r;
1602
1603 if (!controller)
1604 continue;
1605 controller++;
1606 controllerpath = must_make_path(tmpfspath, controller, NULL);
1607 if (dir_exists(controllerpath)) {
1608 free(controllerpath);
1609 continue;
1610 }
1611 if (mkdir(controllerpath, 0755) < 0) {
1612 SYSERROR("Error creating cgroup path: %s", controllerpath);
1613 free(controllerpath);
1614 goto bad;
1615 }
1616 if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
1617 free(controllerpath);
1618 goto bad;
1619 }
1620 if (!cg_mount_needs_subdirs(type)) {
1621 free(controllerpath);
1622 continue;
1623 }
ef4413fa 1624 path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
8aa1044f
SH
1625 if (mkdir_p(path2, 0755) < 0) {
1626 free(controllerpath);
1627 goto bad;
1628 }
2f62fb00 1629
8aa1044f
SH
1630 r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
1631 d->container_cgroup);
1632 free(controllerpath);
1633 free(path2);
1634 if (r < 0)
1635 goto bad;
1636 }
1637 retval = true;
1638
1639bad:
1640 free(tmpfspath);
1641 return retval;
ccb4cabe
SH
1642}
1643
/*
 * Add up the number of lines in the cgroup.procs file of @dirname and of
 * every directory below it. A directory that cannot be opened counts as 0,
 * and a cgroup.procs file that cannot be counted is ignored.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *entry;
	DIR *dir;
	char *path;
	int total = 0, lines;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		struct stat sb;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		path = must_make_path(dirname, entry->d_name, NULL);
		/* descend into directories only */
		if (lstat(path, &sb) == 0 && S_ISDIR(sb.st_mode))
			total += recursive_count_nrtasks(path);
		free(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	lines = lxc_count_file_lines(path);
	if (lines != -1)
		total += lines;
	free(path);

	(void) closedir(dir);

	return total;
}
1688
1689static int cgfsng_nrtasks(void *hdata) {
1690 struct cgfsng_handler_data *d = hdata;
1691 char *path;
1692 int count;
1693
457ca9aa 1694 if (!d || !d->container_cgroup || !hierarchies)
ccb4cabe 1695 return -1;
457ca9aa 1696 path = must_make_path(hierarchies[0]->fullcgpath, NULL);
ccb4cabe
SH
1697 count = recursive_count_nrtasks(path);
1698 free(path);
1699 return count;
1700}
1701
1702/* Only root needs to escape to the cgroup of its init */
7103fe6f 1703static bool cgfsng_escape()
ccb4cabe 1704{
ccb4cabe
SH
1705 int i;
1706
1707 if (geteuid())
1708 return true;
1709
457ca9aa
SH
1710 for (i = 0; hierarchies[i]; i++) {
1711 char *fullpath = must_make_path(hierarchies[i]->mountpoint,
1712 hierarchies[i]->base_cgroup,
ccb4cabe
SH
1713 "cgroup.procs", NULL);
1714 if (lxc_write_to_file(fullpath, "0", 2, false) != 0) {
d3b00a8f 1715 SYSERROR("Failed to escape to %s", fullpath);
ccb4cabe 1716 free(fullpath);
6df334d1 1717 return false;
ccb4cabe
SH
1718 }
1719 free(fullpath);
1720 }
1721
6df334d1 1722 return true;
ccb4cabe
SH
1723}
1724
36662416
TA
1725static int cgfsng_num_hierarchies(void)
1726{
1727 int i;
1728
1729 for (i = 0; hierarchies[i]; i++)
1730 ;
1731
1732 return i;
1733}
1734
1735static bool cgfsng_get_hierarchies(int n, char ***out)
1736{
1737 int i;
1738
1739 /* sanity check n */
1740 for (i = 0; i < n; i++) {
1741 if (!hierarchies[i])
1742 return false;
1743 }
1744
1745 *out = hierarchies[i]->controllers;
1746
1747 return true;
1748}
1749
ccb4cabe
SH
1750#define THAWED "THAWED"
1751#define THAWED_LEN (strlen(THAWED))
1752
1753static bool cgfsng_unfreeze(void *hdata)
1754{
ccb4cabe 1755 char *fullpath;
457ca9aa 1756 struct hierarchy *h = get_hierarchy("freezer");
ccb4cabe 1757
457ca9aa 1758 if (!h)
ccb4cabe
SH
1759 return false;
1760 fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1761 if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
1762 free(fullpath);
1763 return false;
1764 }
1765 free(fullpath);
1766 return true;
1767}
1768
1769static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
1770{
457ca9aa 1771 struct hierarchy *h = get_hierarchy(subsystem);
ccb4cabe
SH
1772 if (!h)
1773 return NULL;
1774
371f834d
SH
1775 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1776}
1777
1778/*
1779 * Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a
1780 * full path, which must be freed by the caller.
1781 */
1782static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1783 const char *inpath,
1784 const char *filename)
1785{
1786 /*
1787 * XXX Remove this case after 2.0 release. It's for dealing with
1788 * containers spawned under the old buggy cgfsng which wasn't around
1789 * for long.
1790 */
1791 if (strncmp(inpath, "/sys/fs/cgroup/", 15) == 0)
1792 return must_make_path(inpath, filename, NULL);
1793 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
1794}
1795
1796static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
1797{
ccb4cabe
SH
1798 char pidstr[25];
1799 int i, len;
1800
1801 len = snprintf(pidstr, 25, "%d", pid);
1802 if (len < 0 || len > 25)
1803 return false;
1804
457ca9aa 1805 for (i = 0; hierarchies[i]; i++) {
ccb4cabe 1806 char *path, *fullpath;
457ca9aa 1807 struct hierarchy *h = hierarchies[i];
ccb4cabe
SH
1808
1809 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
1810 if (!path) // not running
1811 continue;
1812
371f834d
SH
1813 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
1814 free(path);
ccb4cabe
SH
1815 if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
1816 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
1817 free(fullpath);
ccb4cabe
SH
1818 return false;
1819 }
ccb4cabe
SH
1820 free(fullpath);
1821 }
1822
ccb4cabe
SH
1823 return true;
1824}
1825
/*
 * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns the
 * result of lxc_read_from_file(), or -1 if the container is not running or
 * the subsystem has no hierarchy.
 */
static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller, *cgpath, *target;
	struct hierarchy *hier;
	int ret;

	controller = alloca(ctrl_len + 1);
	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	if (!cgpath) // not running
		return -1;

	hier = get_hierarchy(controller);
	if (!hier) {
		free(cgpath);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(hier, cgpath, filename);
	ret = lxc_read_from_file(target, value, len);
	free(target);
	free(cgpath);

	return ret;
}
1857
/*
 * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.
 * Here we don't have a cgroup_data set up, so we ask the running
 * container through the commands API for the cgroup path.
 *
 * The subsystem is the part of @filename before the first '.'. Returns the
 * result of lxc_write_to_file(), or -1 if the container is not running or
 * the subsystem has no hierarchy.
 */
static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller, *cgpath, *target;
	struct hierarchy *hier;
	int ret;

	controller = alloca(ctrl_len + 1);
	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	if (!cgpath) // not running
		return -1;

	hier = get_hierarchy(controller);
	if (!hier) {
		free(cgpath);
		return -1;
	}

	target = build_full_cgpath_from_monitorpath(hier, cgpath, filename);
	ret = lxc_write_to_file(target, value, strlen(value), false);
	free(target);
	free(cgpath);

	return ret;
}
1889
ccb4cabe
SH
1890/*
1891 * Called from setup_limits - here we have the container's cgroup_data because
1892 * we created the cgroups
1893 */
1894static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d)
1895{
1896 char *subsystem = NULL, *p;
1897 int ret = -1;
1898 struct hierarchy *h;
1899
1900 subsystem = alloca(strlen(filename) + 1);
1901 strcpy(subsystem, filename);
1902 if ((p = strchr(subsystem, '.')) != NULL)
1903 *p = '\0';
1904
457ca9aa 1905 h = get_hierarchy(subsystem);
ccb4cabe
SH
1906 if (h) {
1907 char *fullpath = must_make_path(h->fullcgpath, filename, NULL);
1908 ret = lxc_write_to_file(fullpath, value, strlen(value), false);
1909 free(fullpath);
1910 }
1911 return ret;
1912}
1913
1914static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
1915 bool do_devices)
1916{
1917 struct cgfsng_handler_data *d = hdata;
1918 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
1919 struct lxc_cgroup *cg;
ccb4cabe
SH
1920 bool ret = false;
1921
1922 if (lxc_list_empty(cgroup_settings))
1923 return true;
1924
1925 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
1926 if (!sorted_cgroup_settings) {
1927 return false;
1928 }
1929
ccb4cabe
SH
1930 lxc_list_for_each(iterator, sorted_cgroup_settings) {
1931 cg = iterator->elem;
1932
1933 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
ccb4cabe
SH
1934 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
1935 if (do_devices && (errno == EACCES || errno == EPERM)) {
1936 WARN("Error setting %s to %s for %s",
1937 cg->subsystem, cg->value, d->name);
1938 continue;
1939 }
1940 SYSERROR("Error setting %s to %s for %s",
1941 cg->subsystem, cg->value, d->name);
1942 goto out;
1943 }
1944 }
1945
1946 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
1947 }
1948
1949 ret = true;
1950 INFO("cgroup has been setup");
1951out:
ccb4cabe
SH
1952 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
1953 lxc_list_del(iterator);
1954 free(iterator);
1955 }
1956 free(sorted_cgroup_settings);
1957 return ret;
1958}
1959
1960static struct cgroup_ops cgfsng_ops = {
1961 .init = cgfsng_init,
1962 .destroy = cgfsng_destroy,
1963 .create = cgfsng_create,
1964 .enter = cgfsng_enter,
ccb4cabe 1965 .escape = cgfsng_escape,
36662416
TA
1966 .num_hierarchies = cgfsng_num_hierarchies,
1967 .get_hierarchies = cgfsng_get_hierarchies,
ccb4cabe
SH
1968 .get_cgroup = cgfsng_get_cgroup,
1969 .get = cgfsng_get,
1970 .set = cgfsng_set,
1971 .unfreeze = cgfsng_unfreeze,
1972 .setup_limits = cgfsng_setup_limits,
1973 .name = "cgroupfs-ng",
1974 .attach = cgfsng_attach,
1975 .chown = cgfsns_chown,
1976 .mount_cgroup = cgfsng_mount,
1977 .nrtasks = cgfsng_nrtasks,
1978 .driver = CGFSNG,
1979
1980 /* unsupported */
1981 .create_legacy = NULL,
1982};