]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
cgroups: hande cpuset initialization race
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
ccb4cabe
SH
1/*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
3fd0de4d 8 * Christian Brauner <christian.brauner@ubuntu.com>
ccb4cabe
SH
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/*
26 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 30 * each controller.
ccb4cabe
SH
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 34 * a comma-separated list of controllers.
ccb4cabe 35 */
a54694f8 36
d38dd64a
CB
37#ifndef _GNU_SOURCE
38#define _GNU_SOURCE 1
39#endif
a54694f8
CB
40#include <ctype.h>
41#include <dirent.h>
42#include <errno.h>
43#include <grp.h>
d38dd64a
CB
44#include <linux/kdev_t.h>
45#include <linux/types.h>
a54694f8 46#include <stdint.h>
ccb4cabe
SH
47#include <stdio.h>
48#include <stdlib.h>
a54694f8 49#include <string.h>
438c4581 50#include <sys/types.h>
d38dd64a 51#include <unistd.h>
c8bf519d 52
b635e92d 53#include "caps.h"
ccb4cabe 54#include "cgroup.h"
6328fd9c 55#include "cgroup_utils.h"
ccb4cabe 56#include "commands.h"
43654d34 57#include "conf.h"
d38dd64a 58#include "config.h"
a54694f8 59#include "log.h"
c19ad94b 60#include "macro.h"
861cb8c2 61#include "memory_utils.h"
43654d34 62#include "storage/storage.h"
a54694f8 63#include "utils.h"
ccb4cabe 64
64e82f8b
DJ
65#ifndef HAVE_STRLCPY
66#include "include/strlcpy.h"
67#endif
68
3ebe2fbd
DJ
69#ifndef HAVE_STRLCAT
70#include "include/strlcat.h"
71#endif
72
ac2cecc4 73lxc_log_define(cgfsng, cgroup);
ccb4cabe 74
ccb4cabe
SH
/* Free every string of the NULL-terminated array @clist and then the array
 * itself. Passing a NULL @clist is a no-op.
 */
static void free_string_list(char **clist)
{
	int i;

	if (!clist)
		return;

	for (i = 0; clist[i]; i++)
		free(clist[i]);

	free(clist);
}
87
8b8db2f6
CB
/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 *
 * @list may point to a NULL array; in that case a two-slot array is allocated
 * and index 0 is returned. The returned slot itself is not initialized here,
 * the caller is expected to fill it in.
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	if (*list)
		for (; (*list)[newentry]; newentry++)
			;

	/* One slot for the new entry plus one for the terminating NULL. The
	 * elements of the array are of type "void *".
	 */
	*list = must_realloc(*list, (newentry + 2) * sizeof(**list));
	(*list)[newentry + 1] = NULL;
	return newentry;
}
105
8073018d
CB
/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings. A NULL @list is treated like an empty list.
 */
static bool string_in_list(char **list, const char *entry)
{
	int i;

	if (!list)
		return false;

	for (i = 0; list[i]; i++)
		if (strcmp(list[i], entry) == 0)
			return true;

	return false;
}
122
ac010944
CB
/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 *
 * The returned string is newly allocated; the caller owns it.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	const size_t prefix_len = STRLITERALLEN("name=");
	size_t len;
	char *prefixed;

	len = strlen(entry);
	/* prefix + entry + terminating NUL byte */
	prefixed = must_realloc(NULL, prefix_len + len + 1);

	memcpy(prefixed, "name=", prefix_len);
	memcpy(prefixed + prefix_len, entry, len);
	prefixed[prefix_len + len] = '\0';

	return prefixed;
}
140
42a993b4
CB
/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * @klist is the list of kernel subsystems and @nlist the list of named
 * subsystems. The string stored in @clist is always a fresh copy of @entry.
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int newentry;
	char *copy;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	newentry = append_null_to_list((void ***)clist);

	/* Already prefixed or a kernel subsystem: take the name verbatim. */
	if (strncmp(entry, "name=", 5) == 0)
		copy = must_copy_string(entry);
	else if (string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[newentry] = copy;
}
175
5ae0207c
CB
176/* Given a handler's cgroup data, return the struct hierarchy for the controller
177 * @c, or NULL if there is none.
ccb4cabe 178 */
27a5132c 179struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
ccb4cabe
SH
180{
181 int i;
182
27a5132c
CB
183 errno = ENOENT;
184
185 if (!ops->hierarchies) {
186 TRACE("There are no useable cgroup controllers");
ccb4cabe 187 return NULL;
27a5132c 188 }
d6337a5f 189
2202afc9 190 for (i = 0; ops->hierarchies[i]; i++) {
27a5132c 191 if (!controller) {
d6337a5f 192 /* This is the empty unified hierarchy. */
2202afc9
CB
193 if (ops->hierarchies[i]->controllers &&
194 !ops->hierarchies[i]->controllers[0])
195 return ops->hierarchies[i];
d6337a5f 196
106f1f38 197 continue;
d6337a5f
CB
198 }
199
27a5132c 200 if (string_in_list(ops->hierarchies[i]->controllers, controller))
2202afc9 201 return ops->hierarchies[i];
ccb4cabe 202 }
d6337a5f 203
27a5132c
CB
204 if (controller)
205 WARN("There is no useable %s controller", controller);
206 else
207 WARN("There is no empty unified cgroup hierarchy");
208
ccb4cabe
SH
209 return NULL;
210}
211
a54694f8
CB
#define BATCH_SIZE 50
/* Make sure *@mem can hold @newlen bytes. Memory is grown in steps of
 * BATCH_SIZE bytes; a reallocation only happens when *@mem is NULL or when
 * @newlen needs more batches than @oldlen did.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	/* size_t keeps the batch count from being truncated for large lengths. */
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}
222
/* Append the NUL-terminated string @new of length @newlen to *@dest, which
 * currently holds @oldlen bytes of payload. *@dest is grown as needed and the
 * terminating NUL byte of @new is copied as well.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t full = oldlen + newlen;

	batch_realloc(dest, oldlen, full + 1);

	/* newlen + 1: take the terminating NUL byte along. */
	memcpy(*dest + oldlen, new, newlen + 1);
}
231
232/* Slurp in a whole file */
d6337a5f 233static char *read_file(const char *fnam)
a54694f8 234{
d97919ab
CB
235 __do_free char *line = NULL;
236 __do_fclose FILE *f = NULL;
a54694f8 237 int linelen;
d97919ab
CB
238 char *buf = NULL;
239 size_t len = 0, fulllen = 0;
a54694f8
CB
240
241 f = fopen(fnam, "r");
242 if (!f)
243 return NULL;
244 while ((linelen = getline(&line, &len, f)) != -1) {
245 append_line(&buf, fulllen, line, linelen);
246 fulllen += linelen;
247 }
a54694f8
CB
248 return buf;
249}
250
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit number @bit in the bit array @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	/* Shift an unsigned one: "1 << 31" on a signed int is undefined. */
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
270
271/* Create cpumask from cpulist aka turn:
272 *
273 * 0,2-3
274 *
d5d468f6 275 * into bit array
a54694f8
CB
276 *
277 * 1 0 1 1
278 */
279static uint32_t *lxc_cpumask(char *buf, size_t nbits)
280{
281 char *token;
d5d468f6
CB
282 size_t arrlen;
283 uint32_t *bitarr;
d5d468f6
CB
284
285 arrlen = BITS_TO_LONGS(nbits);
286 bitarr = calloc(arrlen, sizeof(uint32_t));
a54694f8
CB
287 if (!bitarr)
288 return NULL;
289
0be0d78f 290 lxc_iterate_parts(token, buf, ",") {
a54694f8 291 errno = 0;
d5d468f6
CB
292 unsigned end, start;
293 char *range;
a54694f8 294
d5d468f6
CB
295 start = strtoul(token, NULL, 0);
296 end = start;
297 range = strchr(token, '-');
a54694f8
CB
298 if (range)
299 end = strtoul(range + 1, NULL, 0);
d5d468f6 300
a54694f8
CB
301 if (!(start <= end)) {
302 free(bitarr);
303 return NULL;
304 }
305
306 if (end >= nbits) {
307 free(bitarr);
308 return NULL;
309 }
310
311 while (start <= end)
312 set_bit(start++, bitarr);
313 }
314
315 return bitarr;
316}
317
a54694f8
CB
318/* Turn cpumask into simple, comma-separated cpulist. */
319static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
320{
a54694f8 321 int ret;
414c6719 322 size_t i;
24cac6af 323 char *tmp = NULL;
a54694f8 324 char **cpulist = NULL;
c19ad94b 325 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
a54694f8
CB
326
327 for (i = 0; i <= nbits; i++) {
414c6719
CB
328 if (!is_set(i, bitarr))
329 continue;
330
979a0d93
CB
331 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
332 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
414c6719
CB
333 lxc_free_array((void **)cpulist, free);
334 return NULL;
335 }
336
337 ret = lxc_append_string(&cpulist, numstr);
338 if (ret < 0) {
339 lxc_free_array((void **)cpulist, free);
340 return NULL;
a54694f8
CB
341 }
342 }
414c6719
CB
343
344 if (!cpulist)
345 return NULL;
346
24cac6af
L
347 tmp = lxc_string_join(",", (const char **)cpulist, false);
348 lxc_free_array((void **)cpulist, free);
349
350 return tmp;
a54694f8
CB
351}
352
/* Return the last cpu number mentioned in @cpulist, e.g. 3 for "0,2-3".
 *
 * The number is the one following the last "," or "-" in the string, or the
 * string itself when it contains neither. Returns -1 if strtoul() reports an
 * error through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last_comma, *last_dash;
	char *start = cpulist;
	size_t cpus = 0;

	last_comma = strrchr(cpulist, ',');
	last_dash = strrchr(cpulist, '-');

	/* Start parsing right after whichever separator comes last. Pointers
	 * are only compared with each other when both are non-NULL.
	 */
	if (last_comma && (!last_dash || last_comma > last_dash))
		start = last_comma + 1;
	else if (last_dash)
		start = last_dash + 1;

	errno = 0;
	cpus = strtoul(start, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
383
6f9584d8 384#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
36f70181 385#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
a3926f6a 386static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
a54694f8 387{
d97919ab 388 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
36f70181
CB
389 *offlinecpus = NULL, *posscpus = NULL;
390 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
391 *possmask = NULL;
a54694f8
CB
392 int ret;
393 ssize_t i;
d97919ab 394 char oldv;
7717e175 395 char *lastslash;
36f70181 396 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
6f9584d8 397 bool bret = false, flipped_bit = false;
a54694f8
CB
398
399 lastslash = strrchr(path, '/');
59ac3b88
CB
400 if (!lastslash) {
401 ERROR("Failed to detect \"/\" in \"%s\"", path);
a54694f8
CB
402 return bret;
403 }
404 oldv = *lastslash;
405 *lastslash = '\0';
406 fpath = must_make_path(path, "cpuset.cpus", NULL);
f68ea354 407 *lastslash = oldv;
a54694f8 408 posscpus = read_file(fpath);
6f9584d8 409 if (!posscpus) {
59ac3b88 410 SYSERROR("Failed to read file \"%s\"", fpath);
d97919ab 411 return false;
6f9584d8 412 }
a54694f8
CB
413
414 /* Get maximum number of cpus found in possible cpuset. */
415 maxposs = get_max_cpus(posscpus);
92d5ea57 416 if (maxposs < 0 || maxposs >= INT_MAX - 1)
d97919ab 417 return false;
a54694f8 418
36f70181
CB
419 if (file_exists(__ISOL_CPUS)) {
420 isolcpus = read_file(__ISOL_CPUS);
421 if (!isolcpus) {
422 SYSERROR("Failed to read file \"%s\"", __ISOL_CPUS);
423 return false;
65d29cbc 424 }
6f9584d8 425
36f70181
CB
426 if (isdigit(isolcpus[0])) {
427 /* Get maximum number of cpus found in isolated cpuset. */
428 maxisol = get_max_cpus(isolcpus);
429 if (maxisol < 0 || maxisol >= INT_MAX - 1)
430 return false;
6f9584d8 431 }
36f70181
CB
432
433 if (maxposs < maxisol)
434 maxposs = maxisol;
435 maxposs++;
436 } else {
437 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
a54694f8
CB
438 }
439
36f70181
CB
440 if (file_exists(__OFFLINE_CPUS)) {
441 offlinecpus = read_file(__OFFLINE_CPUS);
442 if (!offlinecpus) {
443 SYSERROR("Failed to read file \"%s\"", __OFFLINE_CPUS);
444 return false;
445 }
446
447 if (isdigit(offlinecpus[0])) {
448 /* Get maximum number of cpus found in offline cpuset. */
449 maxoffline = get_max_cpus(offlinecpus);
450 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
451 return false;
452 }
453
454 if (maxposs < maxoffline)
455 maxposs = maxoffline;
456 maxposs++;
457 } else {
458 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
459 }
a54694f8 460
dcd14a3d
CB
461 if ((maxisol == 0) && (maxoffline == 0)) {
462 cpulist = move_ptr(posscpus);
36f70181 463 goto copy_parent;
dcd14a3d 464 }
a54694f8
CB
465
466 possmask = lxc_cpumask(posscpus, maxposs);
6f9584d8 467 if (!possmask) {
59ac3b88 468 ERROR("Failed to create cpumask for possible cpus");
d97919ab 469 return false;
6f9584d8 470 }
a54694f8 471
36f70181
CB
472 if (maxisol > 0) {
473 isolmask = lxc_cpumask(isolcpus, maxposs);
474 if (!isolmask) {
475 ERROR("Failed to create cpumask for isolated cpus");
476 return false;
477 }
478 }
479
480 if (maxoffline > 0) {
481 offlinemask = lxc_cpumask(offlinecpus, maxposs);
482 if (!offlinemask) {
483 ERROR("Failed to create cpumask for offline cpus");
484 return false;
485 }
6f9584d8 486 }
a54694f8
CB
487
488 for (i = 0; i <= maxposs; i++) {
36f70181
CB
489 if ((isolmask && !is_set(i, isolmask)) ||
490 (offlinemask && !is_set(i, offlinemask)) ||
491 !is_set(i, possmask))
59ac3b88
CB
492 continue;
493
494 flipped_bit = true;
495 clear_bit(i, possmask);
a54694f8
CB
496 }
497
6f9584d8 498 if (!flipped_bit) {
36f70181 499 DEBUG("No isolated or offline cpus present in cpuset");
d97919ab 500 return true;
6f9584d8 501 }
36f70181 502 DEBUG("Removed isolated or offline cpus from cpuset");
6f9584d8 503
a54694f8 504 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
6f9584d8 505 if (!cpulist) {
59ac3b88 506 ERROR("Failed to create cpu list");
d97919ab 507 return false;
6f9584d8 508 }
a54694f8
CB
509
510copy_parent:
36f70181 511 if (!am_initialized) {
36f70181
CB
512 fpath = must_make_path(path, "cpuset.cpus", NULL);
513 ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false,
514 0666);
36f70181
CB
515 if (ret < 0) {
516 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
517 return false;
518 }
519
520 TRACE("Copied cpu settings of parent cgroup");
6f9584d8
CB
521 }
522
d97919ab 523 return true;
a54694f8
CB
524}
525
e3a3fecf
SH
526/* Copy contents of parent(@path)/@file to @path/@file */
527static bool copy_parent_file(char *path, char *file)
528{
d97919ab 529 __do_free char *child_path = NULL, *parent_path = NULL, *value = NULL;
e3a3fecf 530 int ret;
d97919ab 531 char oldv;
b095a8eb 532 int len = 0;
d97919ab 533 char *lastslash = NULL;
e3a3fecf
SH
534
535 lastslash = strrchr(path, '/');
b095a8eb
CB
536 if (!lastslash) {
537 ERROR("Failed to detect \"/\" in \"%s\"", path);
e3a3fecf
SH
538 return false;
539 }
540 oldv = *lastslash;
541 *lastslash = '\0';
d97919ab
CB
542 parent_path = must_make_path(path, file, NULL);
543 len = lxc_read_from_file(parent_path, NULL, 0);
b53a0853
CB
544 if (len <= 0) {
545 SYSERROR("Failed to determine buffer size");
546 return false;
547 }
b095a8eb 548
f25a2044 549 value = must_realloc(NULL, len + 1);
d97919ab 550 ret = lxc_read_from_file(parent_path, value, len);
b53a0853
CB
551 if (ret != len) {
552 SYSERROR("Failed to read from parent file \"%s\"", parent_path);
553 return false;
554 }
b095a8eb 555
e3a3fecf 556 *lastslash = oldv;
d97919ab
CB
557 child_path = must_make_path(path, file, NULL);
558 ret = lxc_write_to_file(child_path, value, len, false, 0666);
e3a3fecf 559 if (ret < 0)
d97919ab 560 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, child_path);
e3a3fecf 561 return ret >= 0;
e3a3fecf
SH
562}
563
7793add3
CB
564/* Initialize the cpuset hierarchy in first directory of @gname and set
565 * cgroup.clone_children so that children inherit settings. Since the
566 * h->base_path is populated by init or ourselves, we know it is already
567 * initialized.
e3a3fecf 568 */
a3926f6a 569static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
e3a3fecf 570{
d97919ab 571 __do_free char *cgpath = NULL, *clonechildrenpath = NULL;
7793add3
CB
572 int ret;
573 char v;
d97919ab 574 char *slash;
e3a3fecf
SH
575
576 if (!string_in_list(h->controllers, "cpuset"))
577 return true;
578
579 if (*cgname == '/')
580 cgname++;
581 slash = strchr(cgname, '/');
582 if (slash)
583 *slash = '\0';
584
bb221ad1 585 cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
e3a3fecf
SH
586 if (slash)
587 *slash = '/';
7793add3
CB
588
589 ret = mkdir(cgpath, 0755);
590 if (ret < 0) {
591 if (errno != EEXIST) {
592 SYSERROR("Failed to create directory \"%s\"", cgpath);
7793add3
CB
593 return false;
594 }
e3a3fecf 595 }
6f9584d8 596
f8390327 597 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
6328fd9c 598 /* unified hierarchy doesn't have clone_children */
d97919ab 599 if (!file_exists(clonechildrenpath))
e3a3fecf 600 return true;
7793add3
CB
601
602 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
603 if (ret < 0) {
604 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
e3a3fecf
SH
605 return false;
606 }
607
a54694f8 608 /* Make sure any isolated cpus are removed from cpuset.cpus. */
a3926f6a 609 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
7793add3 610 SYSERROR("Failed to remove isolated cpus");
a54694f8 611 return false;
6f9584d8 612 }
a54694f8 613
7793add3 614 /* Already set for us by someone else. */
b28c2810
CB
615 if (v == '1')
616 TRACE("\"cgroup.clone_children\" was already set to \"1\"");
e3a3fecf
SH
617
618 /* copy parent's settings */
a54694f8 619 if (!copy_parent_file(cgpath, "cpuset.mems")) {
7793add3 620 SYSERROR("Failed to copy \"cpuset.mems\" settings");
e3a3fecf
SH
621 return false;
622 }
e3a3fecf 623
7cea5905 624 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
7793add3 625 if (ret < 0) {
e3a3fecf 626 /* Set clone_children so children inherit our settings */
7793add3 627 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
e3a3fecf
SH
628 return false;
629 }
d97919ab 630
e3a3fecf
SH
631 return true;
632}
633
5c0089ae
CB
/* Given two null-terminated lists of strings, return true if any string is in
 * both. A NULL list never intersects with anything.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	int i;

	if (!l1 || !l2)
		return false;

	for (i = 0; l1[i]; i++) {
		if (string_in_list(l2, l1[i]))
			return true;
	}

	return false;
}
651
258449e5
CB
652/* For a null-terminated list of controllers @clist, return true if any of those
653 * controllers is already listed the null-terminated list of hierarchies @hlist.
654 * Realistically, if one is present, all must be present.
ccb4cabe
SH
655 */
656static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
657{
658 int i;
659
660 if (!hlist)
661 return false;
258449e5 662
ccb4cabe
SH
663 for (i = 0; hlist[i]; i++)
664 if (controller_lists_intersect(hlist[i]->controllers, clist))
665 return true;
ccb4cabe 666
258449e5 667 return false;
ccb4cabe
SH
668}
669
f57ac67f
CB
670/* Return true if the controller @entry is found in the null-terminated list of
671 * hierarchies @hlist.
ccb4cabe
SH
672 */
673static bool controller_found(struct hierarchy **hlist, char *entry)
674{
675 int i;
d6337a5f 676
ccb4cabe
SH
677 if (!hlist)
678 return false;
679
680 for (i = 0; hlist[i]; i++)
681 if (string_in_list(hlist[i]->controllers, entry))
682 return true;
d6337a5f 683
ccb4cabe
SH
684 return false;
685}
686
e1c27ab0
CB
687/* Return true if all of the controllers which we require have been found. The
688 * required list is freezer and anything in lxc.cgroup.use.
ccb4cabe 689 */
2202afc9 690static bool all_controllers_found(struct cgroup_ops *ops)
ccb4cabe 691{
b7b18fc5 692 char **cur;
2202afc9 693 struct hierarchy **hlist = ops->hierarchies;
ccb4cabe 694
2202afc9 695 if (!ops->cgroup_use)
ccb4cabe 696 return true;
c2712f64 697
b7b18fc5
CB
698 for (cur = ops->cgroup_use; cur && *cur; cur++)
699 if (!controller_found(hlist, *cur)) {
700 ERROR("No %s controller mountpoint found", *cur);
ccb4cabe
SH
701 return false;
702 }
c2712f64 703
ccb4cabe
SH
704 return true;
705}
706
f205f10c
CB
707/* Get the controllers from a mountinfo line There are other ways we could get
708 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
709 * could parse the mount options. But we simply assume that the mountpoint must
710 * be /sys/fs/cgroup/controller-list
ccb4cabe 711 */
a3926f6a
CB
712static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
713 int type)
ccb4cabe 714{
f205f10c
CB
715 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
716 * for legacy hierarchies.
717 */
ccb4cabe 718 int i;
d97919ab 719 char *p2, *tok;
0be0d78f 720 char *p = line, *sep = ",";
411ac6d8 721 char **aret = NULL;
6328fd9c 722
ccb4cabe 723 for (i = 0; i < 4; i++) {
235f1815 724 p = strchr(p, ' ');
ccb4cabe
SH
725 if (!p)
726 return NULL;
727 p++;
728 }
a55f31bd 729
f205f10c
CB
730 /* Note, if we change how mountinfo works, then our caller will need to
731 * verify /sys/fs/cgroup/ in this field.
732 */
733 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
2202afc9 734 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
ccb4cabe 735 return NULL;
5059aae9 736 }
d6337a5f 737
ccb4cabe 738 p += 15;
235f1815 739 p2 = strchr(p, ' ');
ccb4cabe 740 if (!p2) {
2202afc9 741 ERROR("Corrupt mountinfo");
ccb4cabe
SH
742 return NULL;
743 }
744 *p2 = '\0';
6328fd9c 745
d6337a5f 746 if (type == CGROUP_SUPER_MAGIC) {
88396101 747 __do_free char *dup = NULL;
d97919ab 748
0be0d78f
CB
749 /* strdup() here for v1 hierarchies. Otherwise
750 * lxc_iterate_parts() will destroy mountpoints such as
751 * "/sys/fs/cgroup/cpu,cpuacct".
d6337a5f 752 */
d97919ab 753 dup = must_copy_string(p);
d6337a5f
CB
754 if (!dup)
755 return NULL;
756
d97919ab 757 lxc_iterate_parts (tok, dup, sep)
d6337a5f 758 must_append_controller(klist, nlist, &aret, tok);
411ac6d8 759 }
d6337a5f 760 *p2 = ' ';
f205f10c 761
d6337a5f
CB
762 return aret;
763}
411ac6d8 764
d6337a5f
CB
/* Return a newly allocated controller list without any entries, i.e. an array
 * whose first element is already the terminating NULL.
 */
static char **cg_unified_make_empty_controller(void)
{
	int newentry;
	char **aret = NULL;

	newentry = append_null_to_list((void ***)&aret);
	/* Leave the freshly added slot empty as well. */
	aret[newentry] = NULL;
	return aret;
}
774
775static char **cg_unified_get_controllers(const char *file)
776{
d97919ab
CB
777 __do_free char *buf = NULL;
778 char *tok;
0be0d78f 779 char *sep = " \t\n";
d6337a5f
CB
780 char **aret = NULL;
781
782 buf = read_file(file);
783 if (!buf)
411ac6d8 784 return NULL;
6328fd9c 785
0be0d78f 786 lxc_iterate_parts(tok, buf, sep) {
d6337a5f
CB
787 int newentry;
788 char *copy;
789
790 newentry = append_null_to_list((void ***)&aret);
791 copy = must_copy_string(tok);
792 aret[newentry] = copy;
ccb4cabe
SH
793 }
794
795 return aret;
796}
797
2202afc9 798static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
bb221ad1 799 char *container_base_path, int type)
ccb4cabe
SH
800{
801 struct hierarchy *new;
802 int newentry;
803
f25a2044 804 new = must_realloc(NULL, sizeof(*new));
ccb4cabe
SH
805 new->controllers = clist;
806 new->mountpoint = mountpoint;
bb221ad1 807 new->container_base_path = container_base_path;
eb697136 808 new->container_full_path = NULL;
e09b62f9 809 new->monitor_full_path = NULL;
d6337a5f 810 new->version = type;
a6ca2ed8 811 new->cgroup2_chown = NULL;
6328fd9c 812
2202afc9
CB
813 newentry = append_null_to_list((void ***)h);
814 (*h)[newentry] = new;
d6337a5f 815 return new;
ccb4cabe
SH
816}
817
798c3b33
CB
/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * Returns a newly allocated string, or NULL if the line has fewer fields than
 * expected or the mountpoint is not below "/sys/fs/cgroup/".
 *
 * Note that on success @line is left truncated right after the mountpoint: the
 * separating space is overwritten with a NUL byte and not restored.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	int i;
	size_t len;
	char *p2;
	char *p = line, *sret = NULL;

	/* Skip to the field holding the mountpoint. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;
	*p2 = '\0';

	len = strlen(p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}
849
/* Given a multi-line string, return a null-terminated copy of the current line.
 *
 * The copy does not include the newline. Returns NULL if @p contains no
 * newline at all.
 */
static char *copy_to_eol(char *p)
{
	char *p2 = strchr(p, '\n'), *sret;
	size_t len;

	if (!p2)
		return NULL;

	len = p2 - p;
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}
865
bced39de
CB
866/* cgline: pointer to character after the first ':' in a line in a \n-terminated
867 * /proc/self/cgroup file. Check whether controller c is present.
ccb4cabe
SH
868 */
869static bool controller_in_clist(char *cgline, char *c)
870{
d97919ab
CB
871 __do_free char *tmp = NULL;
872 char *tok, *eol;
ccb4cabe
SH
873 size_t len;
874
235f1815 875 eol = strchr(cgline, ':');
ccb4cabe
SH
876 if (!eol)
877 return false;
878
879 len = eol - cgline;
861cb8c2 880 tmp = must_realloc(NULL, len + 1);
ccb4cabe
SH
881 memcpy(tmp, cgline, len);
882 tmp[len] = '\0';
883
d97919ab
CB
884 lxc_iterate_parts(tok, tmp, ",")
885 if (strcmp(tok, c) == 0)
ccb4cabe 886 return true;
d6337a5f 887
ccb4cabe
SH
888 return false;
889}
890
c3ef912e
CB
891/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
892 * @controller.
ccb4cabe 893 */
c3ef912e
CB
894static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
895 int type)
ccb4cabe
SH
896{
897 char *p = basecginfo;
6328fd9c 898
d6337a5f
CB
899 for (;;) {
900 bool is_cgv2_base_cgroup = false;
901
6328fd9c 902 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
d6337a5f
CB
903 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
904 is_cgv2_base_cgroup = true;
ccb4cabe 905
235f1815 906 p = strchr(p, ':');
ccb4cabe
SH
907 if (!p)
908 return NULL;
909 p++;
d6337a5f
CB
910
911 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
235f1815 912 p = strchr(p, ':');
ccb4cabe
SH
913 if (!p)
914 return NULL;
915 p++;
916 return copy_to_eol(p);
917 }
918
235f1815 919 p = strchr(p, '\n');
ccb4cabe
SH
920 if (!p)
921 return NULL;
922 p++;
923 }
924}
925
ccb4cabe
SH
/* Append a copy of @entry to the null-terminated list of strings *@list,
 * growing the list as needed.
 */
static void must_append_string(char ***list, char *entry)
{
	int newentry;
	char *copy;

	newentry = append_null_to_list((void ***)list);
	copy = must_copy_string(entry);
	(*list)[newentry] = copy;
}
935
d6337a5f 936static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 937{
d97919ab
CB
938 __do_free char *line = NULL;
939 __do_fclose FILE *f = NULL;
ccb4cabe
SH
940 size_t len = 0;
941
d6337a5f
CB
942 f = fopen("/proc/self/cgroup", "r");
943 if (!f)
944 return -1;
945
ccb4cabe 946 while (getline(&line, &len, f) != -1) {
0be0d78f 947 char *p, *p2, *tok;
235f1815 948 p = strchr(line, ':');
ccb4cabe
SH
949 if (!p)
950 continue;
951 p++;
235f1815 952 p2 = strchr(p, ':');
ccb4cabe
SH
953 if (!p2)
954 continue;
955 *p2 = '\0';
ff8d6ee9 956
6328fd9c
CB
957 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
958 * contains an entry of the form:
ff8d6ee9
CB
959 *
960 * 0::/some/path
961 *
6328fd9c 962 * In this case we use "cgroup2" as controller name.
ff8d6ee9 963 */
6328fd9c
CB
964 if ((p2 - p) == 0) {
965 must_append_string(klist, "cgroup2");
ff8d6ee9 966 continue;
6328fd9c 967 }
ff8d6ee9 968
0be0d78f 969 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
970 if (strncmp(tok, "name=", 5) == 0)
971 must_append_string(nlist, tok);
972 else
973 must_append_string(klist, tok);
974 }
975 }
976
d6337a5f 977 return 0;
ccb4cabe
SH
978}
979
/* Strip trailing newlines from @s in place. The first character of @s is never
 * removed, so a string consisting of a single newline is left as is.
 */
static void trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 1) && (s[len - 1] == '\n'))
		s[--len] = '\0';
}
988
2202afc9 989static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
ccb4cabe
SH
990{
991 int i;
27d84737 992 struct hierarchy **it;
41c33dbe 993
2202afc9
CB
994 if (!ops->hierarchies) {
995 TRACE(" No hierarchies found");
ccb4cabe
SH
996 return;
997 }
27d84737 998
2202afc9
CB
999 TRACE(" Hierarchies:");
1000 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
ccb4cabe 1001 int j;
27d84737
CB
1002 char **cit;
1003
bb221ad1 1004 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
2202afc9
CB
1005 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1006 TRACE(" controllers:");
a7b0cc4c 1007 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
2202afc9 1008 TRACE(" %d: %s", j, *cit);
ccb4cabe
SH
1009 }
1010}
41c33dbe 1011
a3926f6a
CB
/* Log @basecginfo followed by the kernel subsystems in @klist and the named
 * subsystems in @nlist at TRACE level. Both lists may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}
ccb4cabe 1027
2202afc9
CB
1028static int cgroup_rmdir(struct hierarchy **hierarchies,
1029 const char *container_cgroup)
c71d83e1 1030{
2202afc9 1031 int i;
d6337a5f 1032
2202afc9
CB
1033 if (!container_cgroup || !hierarchies)
1034 return 0;
d6337a5f 1035
2202afc9
CB
1036 for (i = 0; hierarchies[i]; i++) {
1037 int ret;
1038 struct hierarchy *h = hierarchies[i];
d6337a5f 1039
eb697136 1040 if (!h->container_full_path)
2202afc9
CB
1041 continue;
1042
eb697136 1043 ret = recursive_destroy(h->container_full_path);
2202afc9 1044 if (ret < 0)
eb697136 1045 WARN("Failed to destroy \"%s\"", h->container_full_path);
2202afc9 1046
eb697136
CB
1047 free(h->container_full_path);
1048 h->container_full_path = NULL;
2202afc9 1049 }
d6337a5f 1050
c71d83e1 1051 return 0;
d6337a5f
CB
1052}
1053
2202afc9
CB
/* Arguments handed to callbacks that are run inside a user namespace. */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *container_cgroup;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};
d6337a5f 1061
2202afc9
CB
1062static int cgroup_rmdir_wrapper(void *data)
1063{
1064 int ret;
1065 struct generic_userns_exec_data *arg = data;
1066 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1067 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
d6337a5f 1068
2202afc9
CB
1069 ret = setresgid(nsgid, nsgid, nsgid);
1070 if (ret < 0) {
1071 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1072 (int)nsgid, (int)nsgid);
1073 return -1;
1074 }
d6337a5f 1075
2202afc9
CB
1076 ret = setresuid(nsuid, nsuid, nsuid);
1077 if (ret < 0) {
1078 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1079 (int)nsuid, (int)nsuid);
1080 return -1;
1081 }
d6337a5f 1082
2202afc9
CB
1083 ret = setgroups(0, NULL);
1084 if (ret < 0 && errno != EPERM) {
1085 SYSERROR("Failed to setgroups(0, NULL)");
1086 return -1;
1087 }
d6337a5f 1088
2202afc9 1089 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
d6337a5f
CB
1090}
1091
434c8e15
CB
1092__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
1093 struct lxc_handler *handler)
d6337a5f
CB
1094{
1095 int ret;
2202afc9 1096 struct generic_userns_exec_data wrap;
bd8ef4e4 1097
69b4a4bb
CB
1098 if (!ops->hierarchies)
1099 return;
1100
4160c3a0 1101 wrap.origuid = 0;
2202afc9
CB
1102 wrap.container_cgroup = ops->container_cgroup;
1103 wrap.hierarchies = ops->hierarchies;
1104 wrap.conf = handler->conf;
4160c3a0 1105
2202afc9
CB
1106 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1107 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
bd8ef4e4 1108 "cgroup_rmdir_wrapper");
ccb4cabe 1109 else
2202afc9 1110 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
bd8ef4e4
CB
1111 if (ret < 0) {
1112 WARN("Failed to destroy cgroups");
ccb4cabe 1113 return;
ccb4cabe 1114 }
ccb4cabe
SH
1115}
1116
434c8e15
CB
1117__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1118 struct lxc_handler *handler)
1119{
1120 int len;
434c8e15
CB
1121 struct lxc_conf *conf = handler->conf;
1122 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1123
1124 if (!ops->hierarchies)
1125 return;
1126
1127 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1128 if (len < 0 || (size_t)len >= sizeof(pidstr))
1129 return;
1130
1131 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1132 __do_free char *pivot_path = NULL;
434c8e15 1133 int ret;
23e5c045 1134 char *chop;
ecedb5de 1135 char pivot_cgroup[] = PIVOT_CGROUP;
434c8e15
CB
1136 struct hierarchy *h = ops->hierarchies[i];
1137
1138 if (!h->monitor_full_path)
1139 continue;
1140
1141 if (conf && conf->cgroup_meta.dir)
1142 pivot_path = must_make_path(h->mountpoint,
1143 h->container_base_path,
1144 conf->cgroup_meta.dir,
625ad37b 1145 PIVOT_CGROUP,
434c8e15
CB
1146 "cgroup.procs", NULL);
1147 else
1148 pivot_path = must_make_path(h->mountpoint,
1149 h->container_base_path,
625ad37b 1150 PIVOT_CGROUP,
434c8e15
CB
1151 "cgroup.procs", NULL);
1152
23e5c045
CB
1153 chop = strrchr(pivot_path, '/');
1154 if (chop)
1155 *chop = '\0';
1156
ecedb5de
CB
1157 /*
1158 * Make sure not to pass in the ro string literal PIVOT_CGROUP
1159 * here.
1160 */
1161 if (!cg_legacy_handle_cpuset_hierarchy(h, pivot_cgroup)) {
1162 WARN("Failed to handle legacy cpuset controller");
d97919ab 1163 continue;
ecedb5de
CB
1164 }
1165
434c8e15 1166 ret = mkdir_p(pivot_path, 0755);
d5fc4dd4
CB
1167 if (ret < 0 && errno != EEXIST) {
1168 SYSWARN("Failed to create cgroup \"%s\"\n", pivot_path);
d97919ab 1169 continue;
d5fc4dd4 1170 }
434c8e15 1171
23e5c045
CB
1172 if (chop)
1173 *chop = '/';
1174
434c8e15
CB
1175 /* Move ourselves into the pivot cgroup to delete our own
1176 * cgroup.
1177 */
1178 ret = lxc_write_to_file(pivot_path, pidstr, len, false, 0666);
d5fc4dd4
CB
1179 if (ret != 0) {
1180 SYSWARN("Failed to move monitor %s to \"%s\"\n", pidstr, pivot_path);
d97919ab 1181 continue;
d5fc4dd4 1182 }
434c8e15
CB
1183
1184 ret = recursive_destroy(h->monitor_full_path);
1185 if (ret < 0)
1186 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
434c8e15
CB
1187 }
1188}
1189
a3926f6a 1190static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
0c3deb94 1191{
d97919ab 1192 __do_free char *add_controllers = NULL, *cgroup = NULL;
0c3deb94 1193 size_t i, parts_len;
389d44ec 1194 char **it;
0c3deb94 1195 size_t full_len = 0;
0c3deb94
CB
1196 char **parts = NULL;
1197 bool bret = false;
1198
1199 if (h->version != CGROUP2_SUPER_MAGIC)
1200 return true;
1201
1202 if (!h->controllers)
1203 return true;
1204
1205 /* For now we simply enable all controllers that we have detected by
1206 * creating a string like "+memory +pids +cpu +io".
1207 * TODO: In the near future we might want to support "-<controller>"
1208 * etc. but whether supporting semantics like this make sense will need
1209 * some thinking.
1210 */
1211 for (it = h->controllers; it && *it; it++) {
64e82f8b
DJ
1212 full_len += strlen(*it) + 2;
1213 add_controllers = must_realloc(add_controllers, full_len + 1);
1214
1215 if (h->controllers[0] == *it)
1216 add_controllers[0] = '\0';
1217
3ebe2fbd
DJ
1218 (void)strlcat(add_controllers, "+", full_len + 1);
1219 (void)strlcat(add_controllers, *it, full_len + 1);
64e82f8b
DJ
1220
1221 if ((it + 1) && *(it + 1))
3ebe2fbd 1222 (void)strlcat(add_controllers, " ", full_len + 1);
0c3deb94
CB
1223 }
1224
1225 parts = lxc_string_split(cgname, '/');
1226 if (!parts)
1227 goto on_error;
64e82f8b 1228
0c3deb94
CB
1229 parts_len = lxc_array_len((void **)parts);
1230 if (parts_len > 0)
1231 parts_len--;
1232
bb221ad1 1233 cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
0c3deb94
CB
1234 for (i = 0; i < parts_len; i++) {
1235 int ret;
88396101 1236 __do_free char *target = NULL;
0c3deb94
CB
1237
1238 cgroup = must_append_path(cgroup, parts[i], NULL);
1239 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
7cea5905 1240 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
0c3deb94
CB
1241 if (ret < 0) {
1242 SYSERROR("Could not enable \"%s\" controllers in the "
1243 "unified cgroup \"%s\"", add_controllers, cgroup);
1244 goto on_error;
1245 }
1246 }
1247
1248 bret = true;
1249
1250on_error:
1251 lxc_free_array((void **)parts, free);
0c3deb94
CB
1252 return bret;
1253}
1254
6099dd5a
CB
1255static int mkdir_eexist_on_last(const char *dir, mode_t mode)
1256{
1257 const char *tmp = dir;
1258 const char *orig = dir;
1259 size_t orig_len;
1260
1261 orig_len = strlen(dir);
1262 do {
d97919ab 1263 __do_free char *makeme;
6099dd5a
CB
1264 int ret;
1265 size_t cur_len;
6099dd5a
CB
1266
1267 dir = tmp + strspn(tmp, "/");
1268 tmp = dir + strcspn(dir, "/");
1269
1270 errno = ENOMEM;
1271 cur_len = dir - orig;
1272 makeme = strndup(orig, cur_len);
1273 if (!makeme)
1274 return -1;
1275
1276 ret = mkdir(makeme, mode);
1277 if (ret < 0) {
1278 if ((errno != EEXIST) || (orig_len == cur_len)) {
1279 SYSERROR("Failed to create directory \"%s\"", makeme);
6099dd5a
CB
1280 return -1;
1281 }
1282 }
6099dd5a
CB
1283 } while (tmp != dir);
1284
1285 return 0;
1286}
1287
72068e74
CB
1288static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1289{
1290 int ret;
1291
ef185360
CB
1292 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1293 ERROR("Failed to handle legacy cpuset controller");
1294 return false;
1295 }
1296
72068e74 1297 h->monitor_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
6099dd5a
CB
1298 ret = mkdir_eexist_on_last(h->monitor_full_path, 0755);
1299 if (ret < 0) {
1300 ERROR("Failed to create cgroup \"%s\"", h->monitor_full_path);
ee455be4
CB
1301 return false;
1302 }
72068e74 1303
72068e74
CB
1304 return cg_unified_create_cgroup(h, cgname);
1305}
1306
1307static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
ccb4cabe 1308{
0c3deb94
CB
1309 int ret;
1310
ef185360
CB
1311 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1312 ERROR("Failed to handle legacy cpuset controller");
1313 return false;
1314 }
1315
bb221ad1 1316 h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
6099dd5a
CB
1317 ret = mkdir_eexist_on_last(h->container_full_path, 0755);
1318 if (ret < 0) {
1319 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
d8da679e 1320 return false;
6f9584d8 1321 }
0c3deb94 1322
a3926f6a 1323 return cg_unified_create_cgroup(h, cgname);
ccb4cabe
SH
1324}
1325
72068e74 1326static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
ccb4cabe 1327{
e56639fb 1328 int ret;
72068e74
CB
1329 char *full_path;
1330
1331 if (monitor)
1332 full_path = h->monitor_full_path;
1333 else
1334 full_path = h->container_full_path;
e56639fb 1335
72068e74 1336 ret = rmdir(full_path);
e56639fb 1337 if (ret < 0)
72068e74
CB
1338 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", full_path);
1339
1340 free(full_path);
1341
1342 if (monitor)
1343 h->monitor_full_path = NULL;
1344 else
1345 h->container_full_path = NULL;
1346}
1347
b857f4be 1348__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
f2668eea 1349 struct lxc_handler *handler)
72068e74 1350{
d97919ab
CB
1351 __do_free char *monitor_cgroup = NULL;
1352 char *offset, *tmp;
ebc10afe 1353 int i, idx = 0;
5ce03bc0 1354 size_t len;
72068e74
CB
1355 struct lxc_conf *conf = handler->conf;
1356
1357 if (!conf)
d97919ab 1358 return false;
e56639fb 1359
69b4a4bb
CB
1360 if (!ops->hierarchies)
1361 return true;
1362
72068e74 1363 if (conf->cgroup_meta.dir)
5ce03bc0
CB
1364 tmp = lxc_string_join("/",
1365 (const char *[]){conf->cgroup_meta.dir,
1366 ops->monitor_pattern,
1367 handler->name, NULL},
1368 false);
72068e74 1369 else
5ce03bc0
CB
1370 tmp = must_make_path(ops->monitor_pattern, handler->name, NULL);
1371 if (!tmp)
d97919ab 1372 return false;
72068e74 1373
5ce03bc0 1374 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
5407d095 1375 monitor_cgroup = must_realloc(tmp, len);
5ce03bc0 1376 offset = monitor_cgroup + len - 5;
5407d095 1377 *offset = 0;
5ce03bc0
CB
1378
1379 do {
1380 if (idx) {
1381 int ret = snprintf(offset, 5, "-%d", idx);
1382 if (ret < 0 || (size_t)ret >= 5)
d97919ab 1383 return false;
72068e74 1384 }
72068e74 1385
ebc10afe 1386 for (i = 0; ops->hierarchies[i]; i++) {
f2668eea
CB
1387 if (!monitor_create_path_for_hierarchy(ops->hierarchies[i],
1388 monitor_cgroup)) {
1389 ERROR("Failed to create cgroup \"%s\"",
1390 ops->hierarchies[i]->monitor_full_path);
5ce03bc0 1391 for (int j = 0; j < i; j++)
f2668eea
CB
1392 remove_path_for_hierarchy(ops->hierarchies[j],
1393 monitor_cgroup,
1394 true);
5ce03bc0
CB
1395
1396 idx++;
1397 break;
1398 }
1399 }
ebc10afe 1400 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
5ce03bc0 1401
d97919ab
CB
1402 if (idx == 1000)
1403 return false;
72068e74 1404
d97919ab
CB
1405 INFO("The monitor process uses \"%s\" as cgroup", monitor_cgroup);
1406 return true;
ccb4cabe
SH
1407}
1408
cecad0c1
CB
1409/* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1410 * next cgroup_pattern-1, -2, ..., -999.
ccb4cabe 1411 */
b857f4be 1412__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
6439f06e 1413 struct lxc_handler *handler)
ccb4cabe 1414{
d97919ab 1415 __do_free char *container_cgroup = NULL, *tmp = NULL;
bb30b52a 1416 int i;
ccb4cabe 1417 size_t len;
d97919ab 1418 char *offset;
7d531e9b 1419 int idx = 0;
2202afc9 1420 struct lxc_conf *conf = handler->conf;
ccb4cabe 1421
d97919ab 1422 if (ops->container_cgroup)
ccb4cabe 1423 return false;
43654d34 1424
2202afc9 1425 if (!conf)
ccb4cabe 1426 return false;
ccb4cabe 1427
69b4a4bb
CB
1428 if (!ops->hierarchies)
1429 return true;
1430
2202afc9 1431 if (conf->cgroup_meta.dir)
3ec12d39 1432 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
43654d34 1433 else
2202afc9 1434 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
ccb4cabe
SH
1435 if (!tmp) {
1436 ERROR("Failed expanding cgroup name pattern");
1437 return false;
1438 }
64e82f8b 1439
1a0e70ac 1440 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
f25a2044 1441 container_cgroup = must_realloc(NULL, len);
64e82f8b 1442 (void)strlcpy(container_cgroup, tmp, len);
0c3deb94 1443 offset = container_cgroup + len - 5;
ccb4cabe 1444
d97919ab 1445 do {
c74da4ab
CB
1446 if (idx) {
1447 int ret = snprintf(offset, 5, "-%d", idx);
1448 if (ret < 0 || (size_t)ret >= 5)
1449 return false;
1450 }
bb30b52a 1451
d97919ab 1452 for (i = 0; ops->hierarchies[i]; i++) {
d99d5c93
CB
1453 if (!container_create_path_for_hierarchy(ops->hierarchies[i],
1454 container_cgroup)) {
1455 ERROR("Failed to create cgroup \"%s\"",
1456 ops->hierarchies[i]->container_full_path);
d97919ab 1457 for (int j = 0; j < i; j++)
d99d5c93
CB
1458 remove_path_for_hierarchy(ops->hierarchies[j],
1459 container_cgroup,
1460 false);
d97919ab
CB
1461 idx++;
1462 break;
66b66624
CB
1463 }
1464 }
d97919ab 1465 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
cecad0c1 1466
d97919ab
CB
1467 if (idx == 1000)
1468 return false;
cecad0c1 1469
c74da4ab 1470 INFO("The container process uses \"%s\" as cgroup", container_cgroup);
e4edf5d7 1471 ops->container_cgroup = move_ptr(container_cgroup);
ccb4cabe 1472 return true;
ccb4cabe
SH
1473}
1474
b857f4be 1475__cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
eeef32bb 1476 bool monitor)
ccb4cabe 1477{
eeef32bb 1478 int len;
a3650c0c 1479 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1480
69b4a4bb
CB
1481 if (!ops->hierarchies)
1482 return true;
1483
a3650c0c
CB
1484 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
1485 if (len < 0 || (size_t)len >= sizeof(pidstr))
ccb4cabe
SH
1486 return false;
1487
eeef32bb 1488 for (int i = 0; ops->hierarchies[i]; i++) {
08768001 1489 int ret;
88396101 1490 __do_free char *path = NULL;
08768001 1491
eeef32bb
CB
1492 if (monitor)
1493 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1494 "cgroup.procs", NULL);
1495 else
1496 path = must_make_path(ops->hierarchies[i]->container_full_path,
1497 "cgroup.procs", NULL);
1498 ret = lxc_write_to_file(path, pidstr, len, false, 0666);
08768001 1499 if (ret != 0) {
eeef32bb 1500 SYSERROR("Failed to enter cgroup \"%s\"", path);
ccb4cabe
SH
1501 return false;
1502 }
ccb4cabe
SH
1503 }
1504
1505 return true;
1506}
1507
b857f4be 1508__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
eeef32bb
CB
1509{
1510 return __do_cgroup_enter(ops, pid, true);
1511}
1512
/* Move @pid into the container cgroup of every hierarchy. */
static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
{
	const bool into_monitor_cgroup = false;

	return __do_cgroup_enter(ops, pid, into_monitor_cgroup);
}
1517
6efacf80
CB
/* Change owner, group and mode of @path in one go.
 *
 * The mode is only changed if changing the ownership succeeded. Failures are
 * logged as warnings. Returns 0 on success, -1 on failure.
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	if (chown(path, chown_uid, chown_gid) < 0) {
		SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
		return -1;
	}

	return 0;
}
1537
1538/* chgrp the container cgroups to container group. We leave
c0888dfe
SH
1539 * the container owner as cgroup owner. So we must make the
1540 * directories 775 so that the container can create sub-cgroups.
43647298
SH
1541 *
1542 * Also chown the tasks and cgroup.procs files. Those may not
1543 * exist depending on kernel version.
c0888dfe 1544 */
ccb4cabe
SH
1545static int chown_cgroup_wrapper(void *data)
1546{
6a720d74 1547 int ret;
4160c3a0
CB
1548 uid_t destuid;
1549 struct generic_userns_exec_data *arg = data;
1550 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1551 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
ccb4cabe 1552
6efacf80
CB
1553 ret = setresgid(nsgid, nsgid, nsgid);
1554 if (ret < 0) {
1555 SYSERROR("Failed to setresgid(%d, %d, %d)",
1556 (int)nsgid, (int)nsgid, (int)nsgid);
1557 return -1;
1558 }
1559
1560 ret = setresuid(nsuid, nsuid, nsuid);
1561 if (ret < 0) {
1562 SYSERROR("Failed to setresuid(%d, %d, %d)",
1563 (int)nsuid, (int)nsuid, (int)nsuid);
1564 return -1;
1565 }
1566
1567 ret = setgroups(0, NULL);
1568 if (ret < 0 && errno != EPERM) {
1569 SYSERROR("Failed to setgroups(0, NULL)");
1570 return -1;
1571 }
ccb4cabe
SH
1572
1573 destuid = get_ns_uid(arg->origuid);
b962868f
CB
1574 if (destuid == LXC_INVALID_UID)
1575 destuid = 0;
ccb4cabe 1576
6a720d74 1577 for (int i = 0; arg->hierarchies[i]; i++) {
d97919ab 1578 __do_free char *fullpath = NULL;
eb697136 1579 char *path = arg->hierarchies[i]->container_full_path;
43647298 1580
63e42fee 1581 ret = chowmod(path, destuid, nsgid, 0775);
6efacf80 1582 if (ret < 0)
ccb4cabe 1583 return -1;
c0888dfe 1584
6efacf80
CB
1585 /* Failures to chown() these are inconvenient but not
1586 * detrimental We leave these owned by the container launcher,
1587 * so that container root can write to the files to attach. We
1588 * chmod() them 664 so that container systemd can write to the
1589 * files (which systemd in wily insists on doing).
ab8f5424 1590 */
6efacf80 1591
2202afc9 1592 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
6efacf80
CB
1593 fullpath = must_make_path(path, "tasks", NULL);
1594 (void)chowmod(fullpath, destuid, nsgid, 0664);
6efacf80 1595 }
43647298
SH
1596
1597 fullpath = must_make_path(path, "cgroup.procs", NULL);
2202afc9 1598 (void)chowmod(fullpath, destuid, nsgid, 0664);
0e17357c 1599
2202afc9 1600 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
0e17357c
CB
1601 continue;
1602
a6ca2ed8
CB
1603 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++) {
1604 fullpath = must_make_path(path, *p, NULL);
1605 (void)chowmod(fullpath, destuid, nsgid, 0664);
a6ca2ed8 1606 }
ccb4cabe
SH
1607 }
1608
1609 return 0;
1610}
1611
b857f4be 1612__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
fb55e009 1613 struct lxc_conf *conf)
ccb4cabe 1614{
4160c3a0 1615 struct generic_userns_exec_data wrap;
ccb4cabe 1616
ccb4cabe
SH
1617 if (lxc_list_empty(&conf->id_map))
1618 return true;
1619
69b4a4bb
CB
1620 if (!ops->hierarchies)
1621 return true;
1622
ccb4cabe 1623 wrap.origuid = geteuid();
4160c3a0 1624 wrap.path = NULL;
2202afc9 1625 wrap.hierarchies = ops->hierarchies;
4160c3a0 1626 wrap.conf = conf;
ccb4cabe 1627
c9b7c33e
CB
1628 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1629 "chown_cgroup_wrapper") < 0) {
f7faba6c 1630 ERROR("Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1631 return false;
1632 }
1633
1634 return true;
1635}
1636
8aa1044f
SH
1637/* cgroup-full:* is done, no need to create subdirs */
1638static bool cg_mount_needs_subdirs(int type)
1639{
1640 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1641 return false;
a3926f6a 1642
8aa1044f
SH
1643 return true;
1644}
1645
886cac86
CB
1646/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1647 * remount controller ro if needed and bindmount the cgroupfs onto
25fa6f8c 1648 * control/the/cg/path.
8aa1044f 1649 */
6812d833
CB
1650static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1651 char *controllerpath, char *cgpath,
1652 const char *container_cgroup)
8aa1044f 1653{
d97919ab 1654 __do_free char *sourcepath = NULL;
5285689c 1655 int ret, remount_flags;
886cac86
CB
1656 int flags = MS_BIND;
1657
8aa1044f 1658 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
886cac86
CB
1659 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1660 if (ret < 0) {
1661 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1662 controllerpath, controllerpath);
8aa1044f
SH
1663 return -1;
1664 }
886cac86 1665
5285689c
CB
1666 remount_flags = add_required_remount_flags(controllerpath,
1667 controllerpath,
1668 flags | MS_REMOUNT);
886cac86 1669 ret = mount(controllerpath, controllerpath, "cgroup",
8186c5c7
CB
1670 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1671 NULL);
886cac86
CB
1672 if (ret < 0) {
1673 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
8aa1044f
SH
1674 return -1;
1675 }
886cac86 1676
8aa1044f
SH
1677 INFO("Remounted %s read-only", controllerpath);
1678 }
886cac86 1679
bb221ad1 1680 sourcepath = must_make_path(h->mountpoint, h->container_base_path,
886cac86 1681 container_cgroup, NULL);
8aa1044f
SH
1682 if (type == LXC_AUTO_CGROUP_RO)
1683 flags |= MS_RDONLY;
886cac86
CB
1684
1685 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1686 if (ret < 0) {
1687 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
8aa1044f
SH
1688 return -1;
1689 }
886cac86 1690 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
f8c40ffa
L
1691
1692 if (flags & MS_RDONLY) {
5285689c
CB
1693 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1694 flags | MS_REMOUNT);
1695 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
886cac86
CB
1696 if (ret < 0) {
1697 SYSERROR("Failed to remount \"%s\" ro", cgpath);
f8c40ffa
L
1698 return -1;
1699 }
5285689c 1700 INFO("Remounted %s read-only", cgpath);
f8c40ffa
L
1701 }
1702
886cac86 1703 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
8aa1044f
SH
1704 return 0;
1705}
1706
6812d833
CB
1707/* __cg_mount_direct
1708 *
1709 * Mount cgroup hierarchies directly without using bind-mounts. The main
1710 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1711 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1712 */
1713static int __cg_mount_direct(int type, struct hierarchy *h,
1714 const char *controllerpath)
b635e92d 1715{
d97919ab 1716 __do_free char *controllers = NULL;
a760603e
CB
1717 char *fstype = "cgroup2";
1718 unsigned long flags = 0;
f6b54668 1719 int ret;
b635e92d 1720
a760603e
CB
1721 flags |= MS_NOSUID;
1722 flags |= MS_NOEXEC;
1723 flags |= MS_NODEV;
1724 flags |= MS_RELATIME;
1725
1726 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1727 flags |= MS_RDONLY;
1728
d6337a5f 1729 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1730 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1731 if (!controllers)
1732 return -ENOMEM;
1733 fstype = "cgroup";
b635e92d
CB
1734 }
1735
a760603e 1736 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
b635e92d 1737 if (ret < 0) {
6812d833 1738 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1739 return -1;
1740 }
1741
6812d833 1742 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1743 return 0;
1744}
1745
6812d833
CB
/* Mount hierarchy @h directly onto @controllerpath; used when cgroup
 * namespaces are available. Returns the result of __cg_mount_direct().
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	int ret = __cg_mount_direct(type, h, controllerpath);

	return ret;
}
1751
1752static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1753 const char *controllerpath)
1754{
1755 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1756 return 0;
1757
1758 return __cg_mount_direct(type, h, controllerpath);
1759}
1760
b857f4be 1761__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
fb55e009
CB
1762 struct lxc_handler *handler,
1763 const char *root, int type)
ccb4cabe 1764{
d97919ab 1765 __do_free char *tmpfspath = NULL;
dfa835ac 1766 int ret;
affd10fa 1767 bool has_cgns = false, retval = false, wants_force_mount = false;
8aa1044f 1768
69b4a4bb
CB
1769 if (!ops->hierarchies)
1770 return true;
1771
8aa1044f
SH
1772 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1773 return true;
1774
3f69fb12
SY
1775 if (type & LXC_AUTO_CGROUP_FORCE) {
1776 type &= ~LXC_AUTO_CGROUP_FORCE;
1777 wants_force_mount = true;
1778 }
b635e92d 1779
3f69fb12
SY
1780 if (!wants_force_mount){
1781 if (!lxc_list_empty(&handler->conf->keepcaps))
1782 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1783 else
1784 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1785 }
8aa1044f 1786
3f69fb12
SY
1787 has_cgns = cgns_supported();
1788 if (has_cgns && !wants_force_mount)
1789 return true;
8aa1044f
SH
1790
1791 if (type == LXC_AUTO_CGROUP_NOSPEC)
1792 type = LXC_AUTO_CGROUP_MIXED;
1793 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1794 type = LXC_AUTO_CGROUP_FULL_MIXED;
1795
1796 /* Mount tmpfs */
3f69fb12 1797 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
6812d833 1798 ret = safe_mount(NULL, tmpfspath, "tmpfs",
3f69fb12
SY
1799 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1800 "size=10240k,mode=755", root);
1801 if (ret < 0)
1802 goto on_error;
8aa1044f 1803
dfa835ac 1804 for (int i = 0; ops->hierarchies[i]; i++) {
d97919ab 1805 __do_free char *controllerpath = NULL, *path2 = NULL;
2202afc9 1806 struct hierarchy *h = ops->hierarchies[i];
8aa1044f 1807 char *controller = strrchr(h->mountpoint, '/');
8aa1044f
SH
1808
1809 if (!controller)
1810 continue;
1811 controller++;
affd10fa 1812
8aa1044f 1813 controllerpath = must_make_path(tmpfspath, controller, NULL);
d97919ab 1814 if (dir_exists(controllerpath))
8aa1044f 1815 continue;
affd10fa 1816
3f69fb12
SY
1817 ret = mkdir(controllerpath, 0755);
1818 if (ret < 0) {
8aa1044f 1819 SYSERROR("Error creating cgroup path: %s", controllerpath);
3f69fb12 1820 goto on_error;
8aa1044f 1821 }
b635e92d 1822
3f69fb12 1823 if (has_cgns && wants_force_mount) {
b635e92d
CB
1824 /* If cgroup namespaces are supported but the container
1825 * will not have CAP_SYS_ADMIN after it has started we
1826 * need to mount the cgroups manually.
1827 */
3f69fb12 1828 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
3f69fb12
SY
1829 if (ret < 0)
1830 goto on_error;
1831
b635e92d
CB
1832 continue;
1833 }
1834
6812d833 1835 ret = cg_mount_cgroup_full(type, h, controllerpath);
d97919ab 1836 if (ret < 0)
3f69fb12 1837 goto on_error;
3f69fb12 1838
d97919ab 1839 if (!cg_mount_needs_subdirs(type))
8aa1044f 1840 continue;
3f69fb12 1841
bb221ad1 1842 path2 = must_make_path(controllerpath, h->container_base_path,
2202afc9 1843 ops->container_cgroup, NULL);
3f69fb12 1844 ret = mkdir_p(path2, 0755);
d97919ab 1845 if (ret < 0)
3f69fb12 1846 goto on_error;
2f62fb00 1847
6812d833 1848 ret = cg_legacy_mount_controllers(type, h, controllerpath,
2202afc9 1849 path2, ops->container_cgroup);
3f69fb12
SY
1850 if (ret < 0)
1851 goto on_error;
8aa1044f
SH
1852 }
1853 retval = true;
1854
3f69fb12 1855on_error:
8aa1044f 1856 return retval;
ccb4cabe
SH
1857}
1858
1859static int recursive_count_nrtasks(char *dirname)
1860{
d97919ab 1861 __do_free char *path = NULL;
88396101 1862 __do_closedir DIR *dir = NULL;
74f96976 1863 struct dirent *direntp;
ccb4cabe 1864 int count = 0, ret;
ccb4cabe
SH
1865
1866 dir = opendir(dirname);
1867 if (!dir)
1868 return 0;
1869
74f96976 1870 while ((direntp = readdir(dir))) {
ccb4cabe
SH
1871 struct stat mystat;
1872
ccb4cabe
SH
1873 if (!strcmp(direntp->d_name, ".") ||
1874 !strcmp(direntp->d_name, ".."))
1875 continue;
1876
1877 path = must_make_path(dirname, direntp->d_name, NULL);
1878
1879 if (lstat(path, &mystat))
d97919ab 1880 continue;
ccb4cabe
SH
1881
1882 if (!S_ISDIR(mystat.st_mode))
d97919ab 1883 continue;
ccb4cabe
SH
1884
1885 count += recursive_count_nrtasks(path);
ccb4cabe
SH
1886 }
1887
1888 path = must_make_path(dirname, "cgroup.procs", NULL);
1889 ret = lxc_count_file_lines(path);
1890 if (ret != -1)
1891 count += ret;
ccb4cabe
SH
1892
1893 return count;
1894}
1895
b857f4be 1896__cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
3135c5d4 1897{
d97919ab 1898 __do_free char *path = NULL;
ccb4cabe 1899
2202afc9 1900 if (!ops->container_cgroup || !ops->hierarchies)
ccb4cabe 1901 return -1;
a3926f6a 1902
eb697136 1903 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
3312a94f 1904 return recursive_count_nrtasks(path);
ccb4cabe
SH
1905}
1906
11c23867 1907/* Only root needs to escape to the cgroup of its init. */
b857f4be 1908__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
fb55e009 1909 struct lxc_conf *conf)
ccb4cabe 1910{
69b4a4bb 1911 if (conf->cgroup_meta.relative || geteuid() || !ops->hierarchies)
ccb4cabe
SH
1912 return true;
1913
779b3d82 1914 for (int i = 0; ops->hierarchies[i]; i++) {
11c23867 1915 int ret;
88396101 1916 __do_free char *fullpath = NULL;
11c23867 1917
2202afc9 1918 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
bb221ad1 1919 ops->hierarchies[i]->container_base_path,
11c23867 1920 "cgroup.procs", NULL);
7cea5905 1921 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
11c23867
CB
1922 if (ret != 0) {
1923 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
6df334d1 1924 return false;
ccb4cabe 1925 }
ccb4cabe
SH
1926 }
1927
6df334d1 1928 return true;
ccb4cabe
SH
1929}
1930
b857f4be 1931__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
36662416 1932{
69b4a4bb
CB
1933 int i = 0;
1934
1935 if (!ops->hierarchies)
1936 return 0;
36662416 1937
69b4a4bb 1938 for (; ops->hierarchies[i]; i++)
36662416
TA
1939 ;
1940
1941 return i;
1942}
1943
b857f4be 1944__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
36662416
TA
1945{
1946 int i;
1947
69b4a4bb
CB
1948 if (!ops->hierarchies)
1949 return false;
1950
36662416 1951 /* sanity check n */
6b38e644 1952 for (i = 0; i < n; i++)
2202afc9 1953 if (!ops->hierarchies[i])
36662416 1954 return false;
36662416 1955
2202afc9 1956 *out = ops->hierarchies[i]->controllers;
36662416
TA
1957
1958 return true;
1959}
1960
ccb4cabe
SH
1961#define THAWED "THAWED"
1962#define THAWED_LEN (strlen(THAWED))
1963
d6337a5f
CB
1964/* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1965 * to be adapted.
1966 */
b857f4be 1967__cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
ccb4cabe 1968{
d6337a5f 1969 int ret;
d97919ab 1970 __do_free char *fullpath = NULL;
d6337a5f 1971 struct hierarchy *h;
ccb4cabe 1972
2202afc9 1973 h = get_hierarchy(ops, "freezer");
457ca9aa 1974 if (!h)
ccb4cabe 1975 return false;
d6337a5f 1976
eb697136 1977 fullpath = must_make_path(h->container_full_path, "freezer.state", NULL);
7cea5905 1978 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
d6337a5f
CB
1979 if (ret < 0)
1980 return false;
1981
ccb4cabe
SH
1982 return true;
1983}
1984
b857f4be 1985__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
fb55e009 1986 const char *controller)
ccb4cabe 1987{
d6337a5f
CB
1988 struct hierarchy *h;
1989
2202afc9 1990 h = get_hierarchy(ops, controller);
106f1f38 1991 if (!h) {
2202afc9
CB
1992 WARN("Failed to find hierarchy for controller \"%s\"",
1993 controller ? controller : "(null)");
ccb4cabe 1994 return NULL;
106f1f38 1995 }
ccb4cabe 1996
eb697136 1997 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
371f834d
SH
1998}
1999
c40c8209
CB
2000/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2001 * which must be freed by the caller.
371f834d 2002 */
c40c8209
CB
2003static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2004 const char *inpath,
2005 const char *filename)
371f834d 2006{
371f834d 2007 return must_make_path(h->mountpoint, inpath, filename, NULL);
ccb4cabe
SH
2008}
2009
25f66a8f
CB
2010/* Technically, we're always at a delegation boundary here (This is especially
2011 * true when cgroup namespaces are available.). The reasoning is that in order
c2aed66d 2012 * for us to have been able to start a container in the first place the root
25f66a8f 2013 * cgroup must have been a leaf node. Now, either the container's init system
c2aed66d
CB
2014 * has populated the cgroup and kept it as a leaf node or it has created
2015 * subtrees. In the former case we will simply attach to the leaf node we
2016 * created when we started the container in the latter case we create our own
2017 * cgroup for the attaching process.
2018 */
a3926f6a
CB
2019static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2020 const char *lxcpath, const char *pidstr,
2021 size_t pidstr_len, const char *controller)
c2aed66d 2022{
d97919ab
CB
2023 __do_free char *base_path = NULL, *container_cgroup = NULL,
2024 *full_path = NULL;
c2aed66d
CB
2025 int ret;
2026 size_t len;
2027 int fret = -1, idx = 0;
c2aed66d
CB
2028
2029 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2030 /* not running */
2031 if (!container_cgroup)
2032 return 0;
2033
2034 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2035 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2036 /* cgroup is populated */
7cea5905 2037 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
c2aed66d
CB
2038 if (ret < 0 && errno != EBUSY)
2039 goto on_error;
2040
2041 if (ret == 0)
2042 goto on_success;
2043
6333c915
CB
2044 len = strlen(base_path) + STRLITERALLEN("/lxc-1000") +
2045 STRLITERALLEN("/cgroup-procs");
f25a2044 2046 full_path = must_realloc(NULL, len + 1);
c2aed66d
CB
2047 do {
2048 if (idx)
2049 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2050 base_path, idx);
2051 else
2052 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2053 if (ret < 0 || (size_t)ret >= len + 1)
2054 goto on_error;
2055
2056 ret = mkdir_p(full_path, 0755);
2057 if (ret < 0 && errno != EEXIST)
2058 goto on_error;
2059
3ebe2fbd 2060 (void)strlcat(full_path, "/cgroup.procs", len + 1);
7cea5905 2061 ret = lxc_write_to_file(full_path, pidstr, len, false, 0666);
c2aed66d
CB
2062 if (ret == 0)
2063 goto on_success;
2064
2065 /* this is a non-leaf node */
2066 if (errno != EBUSY)
2067 goto on_error;
2068
edae86e9
CB
2069 idx++;
2070 } while (idx < 1000);
c2aed66d
CB
2071
2072on_success:
2073 if (idx < 1000)
2074 fret = 0;
2075
2076on_error:
c2aed66d
CB
2077 return fret;
2078}
2079
b857f4be 2080__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
fb55e009 2081 const char *lxcpath, pid_t pid)
ccb4cabe 2082{
81b5d48a 2083 int len, ret;
a3650c0c 2084 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 2085
69b4a4bb
CB
2086 if (!ops->hierarchies)
2087 return true;
2088
a3650c0c
CB
2089 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2090 if (len < 0 || (size_t)len >= sizeof(pidstr))
ccb4cabe
SH
2091 return false;
2092
81b5d48a 2093 for (int i = 0; ops->hierarchies[i]; i++) {
c05b17bd 2094 __do_free char *fullpath = NULL, *path = NULL;
2202afc9 2095 struct hierarchy *h = ops->hierarchies[i];
ccb4cabe 2096
c2aed66d 2097 if (h->version == CGROUP2_SUPER_MAGIC) {
a3926f6a
CB
2098 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2099 h->controllers[0]);
c2aed66d
CB
2100 if (ret < 0)
2101 return false;
2102
2103 continue;
2104 }
2105
ccb4cabe 2106 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2107 /* not running */
2108 if (!path)
ccb4cabe
SH
2109 continue;
2110
371f834d 2111 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
7cea5905 2112 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
c2aed66d 2113 if (ret < 0) {
ccb4cabe 2114 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
ccb4cabe
SH
2115 return false;
2116 }
ccb4cabe
SH
2117 }
2118
ccb4cabe
SH
2119 return true;
2120}
2121
e2bd2b13
CB
2122/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2123 * don't have a cgroup_data set up, so we ask the running container through the
2124 * commands API for the cgroup path.
ccb4cabe 2125 */
b857f4be 2126__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2127 char *value, size_t len, const char *name,
2128 const char *lxcpath)
ccb4cabe 2129{
d97919ab 2130 __do_free char *path = NULL;
88396101 2131 __do_free char *controller = NULL;
d97919ab 2132 char *p;
0069cc61 2133 struct hierarchy *h;
861cb8c2 2134 int ret = -1;
ccb4cabe 2135
861cb8c2 2136 controller = must_copy_string(filename);
0069cc61
CB
2137 p = strchr(controller, '.');
2138 if (p)
ccb4cabe
SH
2139 *p = '\0';
2140
0069cc61
CB
2141 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2142 /* not running */
2143 if (!path)
ccb4cabe
SH
2144 return -1;
2145
2202afc9 2146 h = get_hierarchy(ops, controller);
ccb4cabe 2147 if (h) {
88396101 2148 __do_free char *fullpath = NULL;
0069cc61
CB
2149
2150 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2151 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2152 }
ccb4cabe
SH
2153
2154 return ret;
2155}
2156
eec533e3
CB
2157/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2158 * don't have a cgroup_data set up, so we ask the running container through the
2159 * commands API for the cgroup path.
ccb4cabe 2160 */
b857f4be 2161__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
fb55e009
CB
2162 const char *filename, const char *value,
2163 const char *name, const char *lxcpath)
ccb4cabe 2164{
d97919ab 2165 __do_free char *path = NULL;
88396101 2166 __do_free char *controller = NULL;
d97919ab 2167 char *p;
87777968 2168 struct hierarchy *h;
861cb8c2 2169 int ret = -1;
ccb4cabe 2170
861cb8c2 2171 controller = must_copy_string(filename);
87777968
CB
2172 p = strchr(controller, '.');
2173 if (p)
ccb4cabe
SH
2174 *p = '\0';
2175
87777968
CB
2176 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2177 /* not running */
2178 if (!path)
ccb4cabe
SH
2179 return -1;
2180
2202afc9 2181 h = get_hierarchy(ops, controller);
ccb4cabe 2182 if (h) {
88396101 2183 __do_free char *fullpath = NULL;
87777968
CB
2184
2185 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
7cea5905 2186 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe 2187 }
ccb4cabe
SH
2188
2189 return ret;
2190}
2191
91d1a13a 2192/* take devices cgroup line
72add155
SH
2193 * /dev/foo rwx
2194 * and convert it to a valid
2195 * type major:minor mode
91d1a13a
CB
2196 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2197 * the output.
72add155
SH
2198 */
2199static int convert_devpath(const char *invalue, char *dest)
2200{
88396101 2201 __do_free char *path = NULL;
2a06d041 2202 int n_parts;
d97919ab 2203 char *p, type;
72add155 2204 unsigned long minor, major;
91d1a13a 2205 struct stat sb;
2a06d041
CB
2206 int ret = -EINVAL;
2207 char *mode = NULL;
72add155
SH
2208
2209 path = must_copy_string(invalue);
2210
91d1a13a
CB
2211 /* Read path followed by mode. Ignore any trailing text.
2212 * A ' # comment' would be legal. Technically other text is not
2213 * legal, we could check for that if we cared to.
72add155 2214 */
0dbdb99e 2215 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2216 if (*p != ' ')
2217 continue;
2218 *p = '\0';
91d1a13a 2219
2c2d6c49
SH
2220 if (n_parts != 1)
2221 break;
2222 p++;
2223 n_parts++;
91d1a13a 2224
2c2d6c49
SH
2225 while (*p == ' ')
2226 p++;
91d1a13a 2227
2c2d6c49 2228 mode = p;
91d1a13a 2229
2c2d6c49
SH
2230 if (*p == '\0')
2231 goto out;
72add155 2232 }
2c2d6c49
SH
2233
2234 if (n_parts == 1)
72add155 2235 goto out;
72add155
SH
2236
2237 ret = stat(path, &sb);
2238 if (ret < 0)
2239 goto out;
2240
72add155
SH
2241 mode_t m = sb.st_mode & S_IFMT;
2242 switch (m) {
2243 case S_IFBLK:
2244 type = 'b';
2245 break;
2246 case S_IFCHR:
2247 type = 'c';
2248 break;
2c2d6c49 2249 default:
91d1a13a 2250 ERROR("Unsupported device type %i for \"%s\"", m, path);
72add155
SH
2251 ret = -EINVAL;
2252 goto out;
2253 }
2c2d6c49
SH
2254
2255 major = MAJOR(sb.st_rdev);
2256 minor = MINOR(sb.st_rdev);
2257 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
72add155 2258 if (ret < 0 || ret >= 50) {
2a06d041
CB
2259 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2260 "chars)", type, major, minor, mode);
72add155
SH
2261 ret = -ENAMETOOLONG;
2262 goto out;
2263 }
2264 ret = 0;
2265
2266out:
72add155
SH
2267 return ret;
2268}
2269
90e97284
CB
2270/* Called from setup_limits - here we have the container's cgroup_data because
2271 * we created the cgroups.
ccb4cabe 2272 */
2202afc9
CB
2273static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2274 const char *value)
ccb4cabe 2275{
88396101 2276 __do_free char *controller = NULL;
d97919ab
CB
2277 __do_free char *fullpath = NULL;
2278 char *p;
1a0e70ac
CB
2279 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2280 char converted_value[50];
b3646d7e
CB
2281 struct hierarchy *h;
2282 int ret = 0;
64e82f8b 2283
861cb8c2 2284 controller = must_copy_string(filename);
ab1a6cac
CB
2285 p = strchr(controller, '.');
2286 if (p)
ccb4cabe
SH
2287 *p = '\0';
2288
c8bf519d 2289 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
72add155
SH
2290 ret = convert_devpath(value, converted_value);
2291 if (ret < 0)
c8bf519d 2292 return ret;
72add155 2293 value = converted_value;
c8bf519d 2294 }
2295
2202afc9 2296 h = get_hierarchy(ops, controller);
b3646d7e
CB
2297 if (!h) {
2298 ERROR("Failed to setup limits for the \"%s\" controller. "
2299 "The controller seems to be unused by \"cgfsng\" cgroup "
2300 "driver or not enabled on the cgroup hierarchy",
2301 controller);
d1953b26 2302 errno = ENOENT;
ab1a6cac 2303 return -ENOENT;
ccb4cabe 2304 }
b3646d7e 2305
eb697136 2306 fullpath = must_make_path(h->container_full_path, filename, NULL);
7cea5905 2307 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe
SH
2308 return ret;
2309}
2310
2202afc9 2311static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
a3926f6a
CB
2312 struct lxc_list *cgroup_settings,
2313 bool do_devices)
ccb4cabe 2314{
d97919ab
CB
2315 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2316 struct lxc_list *iterator, *next;
ccb4cabe 2317 struct lxc_cgroup *cg;
ccb4cabe
SH
2318 bool ret = false;
2319
2320 if (lxc_list_empty(cgroup_settings))
2321 return true;
2322
69b4a4bb
CB
2323 if (!ops->hierarchies)
2324 return false;
2325
ccb4cabe 2326 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
6b38e644 2327 if (!sorted_cgroup_settings)
ccb4cabe 2328 return false;
ccb4cabe 2329
ccb4cabe
SH
2330 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2331 cg = iterator->elem;
2332
2333 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2202afc9 2334 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
ccb4cabe 2335 if (do_devices && (errno == EACCES || errno == EPERM)) {
c347df58
CB
2336 WARN("Failed to set \"%s\" to \"%s\"",
2337 cg->subsystem, cg->value);
ccb4cabe
SH
2338 continue;
2339 }
c347df58
CB
2340 WARN("Failed to set \"%s\" to \"%s\"",
2341 cg->subsystem, cg->value);
ccb4cabe
SH
2342 goto out;
2343 }
c347df58
CB
2344 DEBUG("Set controller \"%s\" set to \"%s\"",
2345 cg->subsystem, cg->value);
ccb4cabe 2346 }
ccb4cabe
SH
2347 }
2348
2349 ret = true;
6b38e644 2350 INFO("Limits for the legacy cgroup hierarchies have been setup");
ccb4cabe 2351out:
ccb4cabe
SH
2352 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2353 lxc_list_del(iterator);
2354 free(iterator);
2355 }
d97919ab 2356
ccb4cabe
SH
2357 return ret;
2358}
2359
2202afc9 2360static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
a3926f6a 2361 struct lxc_list *cgroup_settings)
6b38e644
CB
2362{
2363 struct lxc_list *iterator;
2202afc9 2364 struct hierarchy *h = ops->unified;
6b38e644
CB
2365
2366 if (lxc_list_empty(cgroup_settings))
2367 return true;
2368
2369 if (!h)
2370 return false;
2371
2372 lxc_list_for_each(iterator, cgroup_settings) {
88396101 2373 __do_free char *fullpath = NULL;
6b38e644 2374 int ret;
6b38e644
CB
2375 struct lxc_cgroup *cg = iterator->elem;
2376
eb697136 2377 fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
7cea5905 2378 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
6b38e644 2379 if (ret < 0) {
b2ac2cb7
CB
2380 SYSERROR("Failed to set \"%s\" to \"%s\"",
2381 cg->subsystem, cg->value);
6b38e644
CB
2382 return false;
2383 }
2384 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2385 }
2386
2387 INFO("Limits for the unified cgroup hierarchy have been setup");
2388 return true;
2389}
2390
b857f4be 2391__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
6280d4c9
CB
2392 struct lxc_conf *conf,
2393 bool do_devices)
6b38e644 2394{
6280d4c9 2395 if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices))
6b38e644
CB
2396 return false;
2397
2202afc9
CB
2398 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2399}
2400
b7b18fc5
CB
2401static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2402 char **controllers)
2403{
b7b18fc5
CB
2404 if (!ops->cgroup_use)
2405 return true;
2406
431e2c54 2407 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
2408 bool found = false;
2409
431e2c54 2410 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
b7b18fc5
CB
2411 if (strcmp(*cur_use, *cur_ctrl) != 0)
2412 continue;
2413
2414 found = true;
2415 break;
2416 }
2417
2418 if (found)
2419 continue;
2420
2421 return false;
2422 }
2423
2424 return true;
2425}
2426
a6ca2ed8
CB
2427static void cg_unified_delegate(char ***delegate)
2428{
88396101 2429 __do_free char *tmp = NULL;
a6ca2ed8
CB
2430 int idx;
2431 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
2432
2433 tmp = read_file("/sys/kernel/cgroup/delegate");
2434 if (!tmp) {
2435 for (char **p = standard; p && *p; p++) {
2436 idx = append_null_to_list((void ***)delegate);
2437 (*delegate)[idx] = must_copy_string(*p);
2438 }
2439 } else {
2440 char *token;
2441 lxc_iterate_parts (token, tmp, " \t\n") {
2442 /*
2443 * We always need to chown this for both cgroup and
2444 * cgroup2.
2445 */
2446 if (strcmp(token, "cgroup.procs") == 0)
2447 continue;
2448
2449 idx = append_null_to_list((void ***)delegate);
2450 (*delegate)[idx] = must_copy_string(token);
2451 }
a6ca2ed8
CB
2452 }
2453}
2454
2202afc9
CB
2455/* At startup, parse_hierarchies finds all the info we need about cgroup
2456 * mountpoints and current cgroups, and stores it in @d.
2457 */
a6ca2ed8
CB
2458static bool cg_hybrid_init(struct cgroup_ops *ops, bool relative,
2459 bool unprivileged)
2202afc9 2460{
88396101 2461 __do_free char *basecginfo = NULL;
d97919ab
CB
2462 __do_free char *line = NULL;
2463 __do_fclose FILE *f = NULL;
2202afc9 2464 int ret;
2202afc9 2465 size_t len = 0;
2202afc9
CB
2466 char **klist = NULL, **nlist = NULL;
2467
2468 /* Root spawned containers escape the current cgroup, so use init's
2469 * cgroups as our base in that case.
2470 */
9caee129 2471 if (!relative && (geteuid() == 0))
2202afc9
CB
2472 basecginfo = read_file("/proc/1/cgroup");
2473 else
2474 basecginfo = read_file("/proc/self/cgroup");
2475 if (!basecginfo)
2476 return false;
2477
2478 ret = get_existing_subsystems(&klist, &nlist);
2479 if (ret < 0) {
2480 ERROR("Failed to retrieve available legacy cgroup controllers");
2202afc9
CB
2481 return false;
2482 }
2483
2484 f = fopen("/proc/self/mountinfo", "r");
2485 if (!f) {
2486 ERROR("Failed to open \"/proc/self/mountinfo\"");
2202afc9
CB
2487 return false;
2488 }
2489
2490 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2491
2492 while (getline(&line, &len, f) != -1) {
2493 int type;
2494 bool writeable;
2495 struct hierarchy *new;
2496 char *base_cgroup = NULL, *mountpoint = NULL;
2497 char **controller_list = NULL;
2498
2499 type = get_cgroup_version(line);
2500 if (type == 0)
2501 continue;
2502
2503 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2504 continue;
2505
2506 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2507 if (type == CGROUP2_SUPER_MAGIC)
2508 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2509 else if (type == CGROUP_SUPER_MAGIC)
2510 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2511 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2512 if (type == CGROUP_SUPER_MAGIC)
2513 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2514 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2515 if (type == CGROUP2_SUPER_MAGIC)
2516 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2517 }
2518
2519 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2520 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2521 continue;
2522
2523 if (type == CGROUP_SUPER_MAGIC)
2524 if (controller_list_is_dup(ops->hierarchies, controller_list))
2525 goto next;
2526
2527 mountpoint = cg_hybrid_get_mountpoint(line);
2528 if (!mountpoint) {
2529 ERROR("Failed parsing mountpoint from \"%s\"", line);
2530 goto next;
2531 }
2532
2533 if (type == CGROUP_SUPER_MAGIC)
2534 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2535 else
2536 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2537 if (!base_cgroup) {
2538 ERROR("Failed to find current cgroup");
2539 goto next;
2540 }
2541
2542 trim(base_cgroup);
2543 prune_init_scope(base_cgroup);
2544 if (type == CGROUP2_SUPER_MAGIC)
2545 writeable = test_writeable_v2(mountpoint, base_cgroup);
2546 else
2547 writeable = test_writeable_v1(mountpoint, base_cgroup);
2548 if (!writeable)
2549 goto next;
2550
2551 if (type == CGROUP2_SUPER_MAGIC) {
2552 char *cgv2_ctrl_path;
2553
2554 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2555 "cgroup.controllers",
2556 NULL);
2557
2558 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2559 free(cgv2_ctrl_path);
2560 if (!controller_list) {
2561 controller_list = cg_unified_make_empty_controller();
2562 TRACE("No controllers are enabled for "
2563 "delegation in the unified hierarchy");
2564 }
2565 }
2566
b7b18fc5
CB
2567 /* Exclude all controllers that cgroup use does not want. */
2568 if (!cgroup_use_wants_controllers(ops, controller_list))
2569 goto next;
2570
2202afc9 2571 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
a6ca2ed8
CB
2572 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
2573 if (unprivileged)
2574 cg_unified_delegate(&new->cgroup2_chown);
2202afc9 2575 ops->unified = new;
a6ca2ed8 2576 }
2202afc9
CB
2577
2578 continue;
2579
2580 next:
2581 free_string_list(controller_list);
2582 free(mountpoint);
2583 free(base_cgroup);
2584 }
2585
2586 free_string_list(klist);
2587 free_string_list(nlist);
2588
2202afc9
CB
2589 TRACE("Writable cgroup hierarchies:");
2590 lxc_cgfsng_print_hierarchies(ops);
2591
2592 /* verify that all controllers in cgroup.use and all crucial
2593 * controllers are accounted for
2594 */
2595 if (!all_controllers_found(ops))
2596 return false;
2597
2598 return true;
2599}
2600
2601static int cg_is_pure_unified(void)
2602{
2603
2604 int ret;
2605 struct statfs fs;
2606
2607 ret = statfs("/sys/fs/cgroup", &fs);
2608 if (ret < 0)
2609 return -ENOMEDIUM;
2610
2611 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2612 return CGROUP2_SUPER_MAGIC;
2613
2614 return 0;
2615}
2616
2617/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 2618static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 2619{
88396101 2620 __do_free char *basecginfo = NULL;
d97919ab 2621 char *base_cgroup;
2202afc9
CB
2622 char *copy = NULL;
2623
9caee129 2624 if (!relative && (geteuid() == 0))
2202afc9
CB
2625 basecginfo = read_file("/proc/1/cgroup");
2626 else
2627 basecginfo = read_file("/proc/self/cgroup");
2628 if (!basecginfo)
2629 return NULL;
2630
2631 base_cgroup = strstr(basecginfo, "0::/");
2632 if (!base_cgroup)
2633 goto cleanup_on_err;
2634
2635 base_cgroup = base_cgroup + 3;
2636 copy = copy_to_eol(base_cgroup);
2637 if (!copy)
2638 goto cleanup_on_err;
2639
2640cleanup_on_err:
2202afc9
CB
2641 if (copy)
2642 trim(copy);
2643
2644 return copy;
2645}
2646
a6ca2ed8
CB
2647static int cg_unified_init(struct cgroup_ops *ops, bool relative,
2648 bool unprivileged)
2202afc9 2649{
d97919ab 2650 __do_free char *subtree_path = NULL;
2202afc9 2651 int ret;
7717e175 2652 char *mountpoint;
2202afc9 2653 char **delegatable;
a6ca2ed8 2654 struct hierarchy *new;
2202afc9
CB
2655 char *base_cgroup = NULL;
2656
2657 ret = cg_is_pure_unified();
2658 if (ret == -ENOMEDIUM)
2659 return -ENOMEDIUM;
2660
2661 if (ret != CGROUP2_SUPER_MAGIC)
2662 return 0;
2663
9caee129 2664 base_cgroup = cg_unified_get_current_cgroup(relative);
2202afc9
CB
2665 if (!base_cgroup)
2666 return -EINVAL;
2667 prune_init_scope(base_cgroup);
2668
2669 /* We assume that we have already been given controllers to delegate
2670 * further down the hierarchy. If not it is up to the user to delegate
2671 * them to us.
2672 */
2673 mountpoint = must_copy_string("/sys/fs/cgroup");
2674 subtree_path = must_make_path(mountpoint, base_cgroup,
2675 "cgroup.subtree_control", NULL);
2676 delegatable = cg_unified_get_controllers(subtree_path);
2202afc9
CB
2677 if (!delegatable)
2678 delegatable = cg_unified_make_empty_controller();
2679 if (!delegatable[0])
2680 TRACE("No controllers are enabled for delegation");
2681
2682 /* TODO: If the user requested specific controllers via lxc.cgroup.use
2683 * we should verify here. The reason I'm not doing it right is that I'm
2684 * not convinced that lxc.cgroup.use will be the future since it is a
2685 * global property. I much rather have an option that lets you request
2686 * controllers per container.
2687 */
2688
a6ca2ed8
CB
2689 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2690 if (!unprivileged)
2691 cg_unified_delegate(&new->cgroup2_chown);
2202afc9
CB
2692
2693 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
908e0ee5 2694 ops->unified = new;
2202afc9
CB
2695 return CGROUP2_SUPER_MAGIC;
2696}
2697
5a087e05 2698static bool cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
2699{
2700 int ret;
2701 const char *tmp;
9caee129 2702 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
2703
2704 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 2705 if (tmp) {
88396101 2706 __do_free char *pin = NULL;
d97919ab 2707 char *chop, *cur;
b7b18fc5
CB
2708
2709 pin = must_copy_string(tmp);
2710 chop = pin;
2711
d97919ab 2712 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 2713 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 2714 }
2202afc9 2715
a6ca2ed8 2716 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
2717 if (ret < 0)
2718 return false;
2719
2720 if (ret == CGROUP2_SUPER_MAGIC)
2721 return true;
2722
a6ca2ed8 2723 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
2724}
2725
b857f4be 2726__cgfsng_ops static bool cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
2727{
2728 const char *cgroup_pattern;
2729
2730 /* copy system-wide cgroup information */
2731 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2732 if (!cgroup_pattern) {
2733 /* lxc.cgroup.pattern is only NULL on error. */
2734 ERROR("Failed to retrieve cgroup pattern");
2735 return false;
2736 }
2737 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
625ad37b 2738 ops->monitor_pattern = MONITOR_CGROUP;
2202afc9
CB
2739
2740 return true;
2741}
2742
5a087e05 2743struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 2744{
a64edc1c 2745 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9
CB
2746
2747 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2748 if (!cgfsng_ops)
2749 return NULL;
2750
2751 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2752 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2753
a64edc1c 2754 if (!cg_init(cgfsng_ops, conf))
2202afc9 2755 return NULL;
2202afc9
CB
2756
2757 cgfsng_ops->data_init = cgfsng_data_init;
434c8e15
CB
2758 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
2759 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
72068e74 2760 cgfsng_ops->monitor_create = cgfsng_monitor_create;
eeef32bb 2761 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
e8b181f5
CB
2762 cgfsng_ops->payload_create = cgfsng_payload_create;
2763 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2202afc9
CB
2764 cgfsng_ops->escape = cgfsng_escape;
2765 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2766 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2767 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2768 cgfsng_ops->get = cgfsng_get;
2769 cgfsng_ops->set = cgfsng_set;
2770 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2771 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2772 cgfsng_ops->driver = "cgfsng";
2773 cgfsng_ops->version = "1.0.0";
2774 cgfsng_ops->attach = cgfsng_attach;
2775 cgfsng_ops->chown = cgfsng_chown;
2776 cgfsng_ops->mount = cgfsng_mount;
2777 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2778
a64edc1c 2779 return move_ptr(cgfsng_ops);
2202afc9 2780}