]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfsng.c
Merge pull request #3352 from Blub/readd-cgroup-ops-check
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
ccb4cabe
SH
2
3/*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
0e7ff52c 8 * each controller.
ccb4cabe
SH
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
18406e5a 12 * a comma-separated list of controllers.
ccb4cabe 13 */
a54694f8 14
d38dd64a
CB
15#ifndef _GNU_SOURCE
16#define _GNU_SOURCE 1
17#endif
a54694f8
CB
18#include <ctype.h>
19#include <dirent.h>
20#include <errno.h>
21#include <grp.h>
d38dd64a
CB
22#include <linux/kdev_t.h>
23#include <linux/types.h>
942e193e
CB
24#include <poll.h>
25#include <signal.h>
a54694f8 26#include <stdint.h>
ccb4cabe
SH
27#include <stdio.h>
28#include <stdlib.h>
a54694f8 29#include <string.h>
438c4581 30#include <sys/types.h>
d38dd64a 31#include <unistd.h>
c8bf519d 32
d1783ef4 33#include "af_unix.h"
b635e92d 34#include "caps.h"
ccb4cabe 35#include "cgroup.h"
bf651989 36#include "cgroup2_devices.h"
6328fd9c 37#include "cgroup_utils.h"
ccb4cabe 38#include "commands.h"
43654d34 39#include "conf.h"
d38dd64a 40#include "config.h"
a54694f8 41#include "log.h"
c19ad94b 42#include "macro.h"
018051e3 43#include "mainloop.h"
861cb8c2 44#include "memory_utils.h"
43654d34 45#include "storage/storage.h"
a54694f8 46#include "utils.h"
ccb4cabe 47
64e82f8b
DJ
48#ifndef HAVE_STRLCPY
49#include "include/strlcpy.h"
50#endif
51
3ebe2fbd
DJ
52#ifndef HAVE_STRLCAT
53#include "include/strlcat.h"
54#endif
55
ac2cecc4 56lxc_log_define(cgfsng, cgroup);
ccb4cabe 57
8b8db2f6
CB
/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 *
 * must_realloc() aborts on OOM, hence "do not fail". @list may point to NULL
 * on the first call; a fresh two-slot array is then allocated.
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	/* Walk to the current terminating NULL to find the append index. */
	if (*list)
		for (; (*list)[newentry]; newentry++)
			;

	/* +2: one slot for the new entry, one for the NULL terminator. */
	*list = must_realloc(*list, (newentry + 2) * sizeof(void **));
	(*list)[newentry + 1] = NULL;
	return newentry;
}
75
8073018d
CB
/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings. A NULL @list is treated as empty.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (!list)
		return false;

	for (cur = list; *cur; cur++) {
		if (strcmp(*cur, entry) == 0)
			return true;
	}

	return false;
}
90
ac010944
CB
/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail (must_realloc aborts on OOM). Caller owns and
 * frees the returned string.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t len;
	char *prefixed;

	len = strlen(entry);
	/* +6: 5 bytes for "name=" plus the NUL terminator. */
	prefixed = must_realloc(NULL, len + 6);

	memcpy(prefixed, "name=", STRLITERALLEN("name="));
	memcpy(prefixed + STRLITERALLEN("name="), entry, len);
	prefixed[len + 5] = '\0';

	return prefixed;
}
108
42a993b4
CB
/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int newentry;
	char *copy;

	/* Ambiguous: same name exists as both kernel and named subsystem. */
	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	newentry = append_null_to_list((void ***)clist);

	/* Already a named entry, or a kernel controller: copy verbatim;
	 * otherwise it is a named subsystem that needs the "name=" prefix.
	 */
	if (strncmp(entry, "name=", 5) == 0)
		copy = must_copy_string(entry);
	else if (string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[newentry] = copy;
}
143
5ae0207c
CB
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 *
 * @controller == NULL requests the empty unified hierarchy. On a pure
 * cgroup2 layout, "devices" maps to the unified hierarchy when the bpf
 * device controller is available. On failure NULL is returned with errno
 * set to ENOENT.
 */
struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers &&
			    !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];
			continue;
		} else if (pure_unified_layout(ops) &&
			   strcmp(controller, "devices") == 0) {
			/* cgroup2 has no legacy devices controller; it is
			 * emulated via bpf on the unified hierarchy.
			 */
			if (ops->unified->bpf_device_controller)
				return ops->unified;
			break;
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
177
a54694f8
CB
#define BATCH_SIZE 50
/* Grow *@mem in BATCH_SIZE chunks; only reallocates when @newlen crosses
 * into a new batch, to avoid a realloc per appended line.
 */
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}

/* Append @new (length @newlen, assumed NUL-terminated — the +1 below copies
 * the terminator) onto *@dest, which currently holds @oldlen bytes.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t full = oldlen + newlen;

	batch_realloc(dest, oldlen, full + 1);

	memcpy(*dest + oldlen, new, newlen + 1);
}
196
/* Slurp in a whole file.
 *
 * Returns a heap-allocated, NUL-terminated copy of the file contents, or
 * NULL if the file cannot be opened. NOTE(review): an empty file also
 * yields NULL (buf is never allocated), indistinguishable from an open
 * failure — callers must not rely on errno alone here.
 */
static char *read_file(const char *fnam)
{
	__do_free char *buf = NULL, *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0, fulllen = 0;
	int linelen;

	/* "e" sets O_CLOEXEC so the fd does not leak across exec. */
	f = fopen(fnam, "re");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}

	return move_ptr(buf);
}
216
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in the uint32_t bit array @bitarr.
 * Use an unsigned constant for the shift: "1 << 31" shifts a signed int
 * into the sign bit, which is undefined behavior (CERT INT34-C).
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1U << (bit % NBITS));
}

/* Clear bit @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1U << (bit % NBITS));
}

/* Return whether bit @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1U << (bit % NBITS))) != 0;
}
236
/* Create cpumask from cpulist aka turn:
 *
 * 0,2-3
 *
 * into bit array
 *
 * 1 0 1 1
 *
 * @buf is parsed destructively (lxc_iterate_parts inserts NULs). Valid bit
 * indices are 0..@nbits-1; a range end >= @nbits is rejected with EINVAL.
 * Returns a calloc'd array of BITS_TO_LONGS(nbits) words (caller frees) or
 * NULL with errno set.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		/* Token is either "N" or "N-M". */
		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}
279
a54694f8
CB
280/* Turn cpumask into simple, comma-separated cpulist. */
281static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
282{
f761d24d 283 __do_free_string_list char **cpulist = NULL;
c19ad94b 284 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
77c3e9a2 285 int ret;
a54694f8 286
77c3e9a2 287 for (size_t i = 0; i <= nbits; i++) {
414c6719
CB
288 if (!is_set(i, bitarr))
289 continue;
290
979a0d93 291 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
f761d24d 292 if (ret < 0 || (size_t)ret >= sizeof(numstr))
414c6719 293 return NULL;
414c6719
CB
294
295 ret = lxc_append_string(&cpulist, numstr);
f761d24d 296 if (ret < 0)
c5b8049e 297 return ret_set_errno(NULL, ENOMEM);
a54694f8 298 }
414c6719
CB
299
300 if (!cpulist)
c5b8049e 301 return ret_set_errno(NULL, ENOMEM);
414c6719 302
f761d24d 303 return lxc_string_join(",", (const char **)cpulist, false);
a54694f8
CB
304}
305
/* Return the highest cpu number mentioned in @cpulist (e.g. "0,2-3" -> 3),
 * or -1 on parse failure.
 *
 * The last element of the list is either after the final ',' or after the
 * final '-', whichever comes later. The previous implementation compared
 * the two candidate pointers with '>' even when one of them was NULL —
 * a relational comparison on a null/unrelated pointer is undefined
 * behavior — and carried an unreachable "else if (!c1 && c2)" branch.
 * Handle the NULL cases explicitly instead.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2, *last;
	size_t cpus = 0;

	c1 = strrchr(cpulist, ',');
	if (c1)
		c1++;

	c2 = strrchr(cpulist, '-');
	if (c2)
		c2++;

	if (!c1 && !c2)
		last = cpulist;	/* single cpu, e.g. "5" */
	else if (!c1)
		last = c2;	/* only a range, e.g. "0-7" */
	else if (!c2)
		last = c1;	/* only commas, e.g. "1,3,9" */
	else
		last = (c1 > c2) ? c1 : c2; /* both within the same string */

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
336
6f9584d8 337#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
36f70181 338#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
c5b8049e
CB
339static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
340 char *child_cgroup, bool am_initialized)
a54694f8 341{
d97919ab 342 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
36f70181
CB
343 *offlinecpus = NULL, *posscpus = NULL;
344 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
345 *possmask = NULL;
a54694f8
CB
346 int ret;
347 ssize_t i;
36f70181 348 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
c5b8049e 349 bool flipped_bit = false;
a54694f8 350
c5b8049e 351 fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
a54694f8 352 posscpus = read_file(fpath);
c5b8049e
CB
353 if (!posscpus)
354 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
a54694f8
CB
355
356 /* Get maximum number of cpus found in possible cpuset. */
357 maxposs = get_max_cpus(posscpus);
92d5ea57 358 if (maxposs < 0 || maxposs >= INT_MAX - 1)
d97919ab 359 return false;
a54694f8 360
36f70181
CB
361 if (file_exists(__ISOL_CPUS)) {
362 isolcpus = read_file(__ISOL_CPUS);
c5b8049e
CB
363 if (!isolcpus)
364 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
6f9584d8 365
36f70181
CB
366 if (isdigit(isolcpus[0])) {
367 /* Get maximum number of cpus found in isolated cpuset. */
368 maxisol = get_max_cpus(isolcpus);
369 if (maxisol < 0 || maxisol >= INT_MAX - 1)
370 return false;
6f9584d8 371 }
36f70181
CB
372
373 if (maxposs < maxisol)
374 maxposs = maxisol;
375 maxposs++;
376 } else {
377 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
a54694f8
CB
378 }
379
36f70181
CB
380 if (file_exists(__OFFLINE_CPUS)) {
381 offlinecpus = read_file(__OFFLINE_CPUS);
c5b8049e
CB
382 if (!offlinecpus)
383 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
36f70181
CB
384
385 if (isdigit(offlinecpus[0])) {
386 /* Get maximum number of cpus found in offline cpuset. */
387 maxoffline = get_max_cpus(offlinecpus);
388 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
389 return false;
390 }
391
392 if (maxposs < maxoffline)
393 maxposs = maxoffline;
394 maxposs++;
395 } else {
396 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
397 }
a54694f8 398
dcd14a3d
CB
399 if ((maxisol == 0) && (maxoffline == 0)) {
400 cpulist = move_ptr(posscpus);
36f70181 401 goto copy_parent;
dcd14a3d 402 }
a54694f8
CB
403
404 possmask = lxc_cpumask(posscpus, maxposs);
c5b8049e
CB
405 if (!possmask)
406 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
a54694f8 407
36f70181
CB
408 if (maxisol > 0) {
409 isolmask = lxc_cpumask(isolcpus, maxposs);
c5b8049e
CB
410 if (!isolmask)
411 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
36f70181
CB
412 }
413
414 if (maxoffline > 0) {
415 offlinemask = lxc_cpumask(offlinecpus, maxposs);
c5b8049e
CB
416 if (!offlinemask)
417 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
6f9584d8 418 }
a54694f8
CB
419
420 for (i = 0; i <= maxposs; i++) {
36f70181
CB
421 if ((isolmask && !is_set(i, isolmask)) ||
422 (offlinemask && !is_set(i, offlinemask)) ||
423 !is_set(i, possmask))
59ac3b88
CB
424 continue;
425
426 flipped_bit = true;
427 clear_bit(i, possmask);
a54694f8
CB
428 }
429
6f9584d8 430 if (!flipped_bit) {
b31d62b8
CB
431 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
432 TRACE("No isolated or offline cpus present in cpuset");
433 } else {
434 cpulist = move_ptr(posscpus);
435 TRACE("Removed isolated or offline cpus from cpuset");
6f9584d8 436 }
c5b8049e
CB
437 if (!cpulist)
438 return log_error_errno(false, errno, "Failed to create cpu list");
a54694f8
CB
439
440copy_parent:
36f70181 441 if (!am_initialized) {
c5b8049e 442 ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
c04a6d4e
CB
443 if (ret < 0)
444 return log_error_errno(false,
445 errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
c5b8049e 446 child_cgroup);
36f70181
CB
447
448 TRACE("Copied cpu settings of parent cgroup");
6f9584d8
CB
449 }
450
d97919ab 451 return true;
a54694f8
CB
452}
453
e3a3fecf 454/* Copy contents of parent(@path)/@file to @path/@file */
c5b8049e
CB
455static bool copy_parent_file(const char *parent_cgroup,
456 const char *child_cgroup, const char *file)
e3a3fecf 457{
c5b8049e 458 __do_free char *parent_file = NULL, *value = NULL;
b095a8eb 459 int len = 0;
fe70edee 460 int ret;
e3a3fecf 461
c5b8049e
CB
462 parent_file = must_make_path(parent_cgroup, file, NULL);
463 len = lxc_read_from_file(parent_file, NULL, 0);
fe70edee 464 if (len <= 0)
77c3e9a2 465 return log_error_errno(false, errno, "Failed to determine buffer size");
b095a8eb 466
f25a2044 467 value = must_realloc(NULL, len + 1);
fe70edee 468 value[len] = '\0';
c5b8049e 469 ret = lxc_read_from_file(parent_file, value, len);
fe70edee 470 if (ret != len)
77c3e9a2 471 return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);
b095a8eb 472
c5b8049e 473 ret = lxc_write_openat(child_cgroup, file, value, len);
fe70edee 474 if (ret < 0 && errno != EACCES)
77c3e9a2 475 return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
c5b8049e 476 value, child_cgroup, file);
fe70edee 477 return true;
e3a3fecf
SH
478}
479
77c3e9a2 480static inline bool is_unified_hierarchy(const struct hierarchy *h)
c04a6d4e
CB
481{
482 return h->version == CGROUP2_SUPER_MAGIC;
483}
484
f990d3bf
CB
/*
 * Initialize the cpuset hierarchy in first directory of @cgroup_leaf and set
 * cgroup.clone_children so that children inherit settings. Since the
 * h->base_path is populated by init or ourselves, we know it is already
 * initialized.
 *
 * returns -1 on error, 0 when we didn't created a cgroup, 1 if we created a
 * cgroup.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
					     const char *cgroup_leaf)
{
	__do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
	__do_close int cgroup_fd = -EBADF;
	int fret = -1;
	int ret;
	char v;
	char *leaf, *slash;

	/* Only the legacy cpuset controller needs this dance. */
	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	if (!cgroup_leaf)
		return ret_set_errno(-1, EINVAL);

	dup = strdup(cgroup_leaf);
	if (!dup)
		return ret_set_errno(-1, ENOMEM);

	parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);

	/* Isolate the first path component of the leaf: temporarily NUL the
	 * first '/' after any leading slashes, build the child path, restore.
	 */
	leaf = dup;
	leaf += strspn(leaf, "/");
	slash = strchr(leaf, '/');
	if (slash)
		*slash = '\0';
	child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
	if (slash)
		*slash = '/';

	fret = 1;
	ret = mkdir(child_cgroup, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);

		/* Already existed: report "did not create". */
		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(child_cgroup);
	if (cgroup_fd < 0)
		return -1;

	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* copy parent's settings */
	if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);

	return fret;
}
564
5c0089ae
CB
/* Given two null-terminated lists of strings, return true if any string is in
 * both. Either list being NULL means no intersection.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (!l1 || !l2)
		return false;

	for (cur = l1; *cur; cur++) {
		if (string_in_list(l2, *cur))
			return true;
	}

	return false;
}
579
258449e5
CB
/* For a null-terminated list of controllers @clist, return true if any of those
 * controllers is already listed the null-terminated list of hierarchies @hlist.
 * Realistically, if one is present, all must be present.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (controller_lists_intersect(hlist[i]->controllers, clist))
			return true;

	return false;
}
595
f57ac67f
CB
596/* Return true if the controller @entry is found in the null-terminated list of
597 * hierarchies @hlist.
ccb4cabe
SH
598 */
599static bool controller_found(struct hierarchy **hlist, char *entry)
600{
ccb4cabe
SH
601 if (!hlist)
602 return false;
603
77c3e9a2 604 for (int i = 0; hlist[i]; i++)
ccb4cabe
SH
605 if (string_in_list(hlist[i]->controllers, entry))
606 return true;
d6337a5f 607
ccb4cabe
SH
608 return false;
609}
610
e1c27ab0
CB
/* Return true if all of the controllers which we require have been found. The
 * required list is freezer and anything in lxc.cgroup.use.
 *
 * NOTE(review): the code below only checks entries from lxc.cgroup.use
 * (ops->cgroup_use); no explicit freezer check is visible here — the
 * comment above may be stale. Confirm against upstream.
 */
static bool all_controllers_found(struct cgroup_ops *ops)
{
	struct hierarchy **hlist;

	/* No lxc.cgroup.use configured: nothing is required. */
	if (!ops->cgroup_use)
		return true;

	hlist = ops->hierarchies;
	for (char **cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_found(hlist, *cur))
			return log_error(false, "No %s controller mountpoint found", *cur);

	return true;
}
628
f205f10c
CB
/* Get the controllers from a mountinfo line There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list
 *
 * Returns a newly allocated NULL-terminated controller list for legacy (v1)
 * hierarchies, or NULL for v2/unrecognized lines. @line is modified in place
 * only temporarily (the space after the mountpoint is restored).
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	__do_free_string_list char **aret = NULL;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";

	/* Skip the first four space-separated mountinfo fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return log_error(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

	/* 15 == strlen("/sys/fs/cgroup/"); tied to DEFAULT_CGROUP_MOUNTPOINT. */
	p += 15;
	p2 = strchr(p, ' ');
	if (!p2)
		return log_error(NULL, "Corrupt mountinfo");
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts (tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	*p2 = ' ';

	return move_ptr(aret);
}
411ac6d8 682
d6337a5f
CB
/* Build the controller list for the unified hierarchy: a single-element
 * array containing only the NULL terminator (cgroup2 carries no per-name
 * controller entries here).
 */
static char **cg_unified_make_empty_controller(void)
{
	__do_free_string_list char **aret = NULL;
	int newentry;

	newentry = append_null_to_list((void ***)&aret);
	aret[newentry] = NULL;
	return move_ptr(aret);
}
692
/* Parse a cgroup2 "cgroup.controllers"-style @file (whitespace-separated
 * controller names) into a newly allocated NULL-terminated string array.
 * Returns NULL if the file cannot be read.
 */
static char **cg_unified_get_controllers(const char *file)
{
	__do_free char *buf = NULL;
	__do_free_string_list char **aret = NULL;
	char *sep = " \t\n";
	char *tok;

	buf = read_file(file);
	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, sep) {
		int newentry;
		char *copy;

		newentry = append_null_to_list((void ***)&aret);
		copy = must_copy_string(tok);
		aret[newentry] = copy;
	}

	return move_ptr(aret);
}
715
2202afc9 716static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
bb221ad1 717 char *container_base_path, int type)
ccb4cabe
SH
718{
719 struct hierarchy *new;
720 int newentry;
721
1973b62a 722 new = zalloc(sizeof(*new));
ccb4cabe
SH
723 new->controllers = clist;
724 new->mountpoint = mountpoint;
bb221ad1 725 new->container_base_path = container_base_path;
d6337a5f 726 new->version = type;
1973b62a
CB
727 new->cgfd_con = -EBADF;
728 new->cgfd_mon = -EBADF;
6328fd9c 729
2202afc9
CB
730 newentry = append_null_to_list((void ***)h);
731 (*h)[newentry] = new;
d6337a5f 732 return new;
ccb4cabe
SH
733}
734
798c3b33
CB
/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * Only mountpoints under DEFAULT_CGROUP_MOUNTPOINT ("/sys/fs/cgroup/",
 * 15 chars — the hard-coded offset below) are accepted. Returns a
 * heap-allocated copy (caller frees) or NULL.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	char *p = line, *sret = NULL;
	size_t len;
	char *p2;

	/* Skip to the fifth space-separated field (the mount point). */
	for (int i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;
	*p2 = '\0';

	len = strlen(p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}
766
f523291e 767/* Given a multi-line string, return a null-terminated copy of the current line. */
ccb4cabe
SH
768static char *copy_to_eol(char *p)
769{
77c3e9a2 770 char *p2, *sret;
ccb4cabe
SH
771 size_t len;
772
77c3e9a2 773 p2 = strchr(p, '\n');
ccb4cabe
SH
774 if (!p2)
775 return NULL;
776
777 len = p2 - p;
f25a2044 778 sret = must_realloc(NULL, len + 1);
ccb4cabe
SH
779 memcpy(sret, p, len);
780 sret[len] = '\0';
77c3e9a2 781
ccb4cabe
SH
782 return sret;
783}
784
/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 *
 * The controller field runs up to the next ':' and may itself be a
 * comma-separated list (e.g. "cpu,cpuacct").
 */
static bool controller_in_clist(char *cgline, char *c)
{
	__do_free char *tmp = NULL;
	char *tok, *eol;
	size_t len;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	/* Copy the controller list so lxc_iterate_parts() can NUL-split it
	 * without mutating the caller's buffer.
	 */
	len = eol - cgline;
	tmp = must_realloc(NULL, len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	lxc_iterate_parts(tok, tmp, ",")
		if (strcmp(tok, c) == 0)
			return true;

	return false;
}
809
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 *
 * For @type == CGROUP2_SUPER_MAGIC the v2 entry ("0::/path") is matched
 * regardless of @controller. Returns a heap-allocated path (caller frees)
 * or NULL when no matching entry exists.
 */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
					  int type)
{
	char *p = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
			is_cgv2_base_cgroup = true;

		/* Move past the hierarchy-id field. */
		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
			/* Skip the controller-list field to reach the path. */
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		/* Advance to the next line. */
		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}
844
ccb4cabe
SH
/* Append a copy of @entry to the NULL-terminated string array *@list,
 * keeping it NULL-terminated. Do not fail (helpers abort on OOM).
 */
static void must_append_string(char ***list, char *entry)
{
	int newentry;
	char *copy;

	newentry = append_null_to_list((void ***)list);
	copy = must_copy_string(entry);
	(*list)[newentry] = copy;
}
854
d6337a5f 855static int get_existing_subsystems(char ***klist, char ***nlist)
ccb4cabe 856{
d97919ab
CB
857 __do_free char *line = NULL;
858 __do_fclose FILE *f = NULL;
ccb4cabe
SH
859 size_t len = 0;
860
4110345b 861 f = fopen("/proc/self/cgroup", "re");
d6337a5f
CB
862 if (!f)
863 return -1;
864
ccb4cabe 865 while (getline(&line, &len, f) != -1) {
0be0d78f 866 char *p, *p2, *tok;
235f1815 867 p = strchr(line, ':');
ccb4cabe
SH
868 if (!p)
869 continue;
870 p++;
235f1815 871 p2 = strchr(p, ':');
ccb4cabe
SH
872 if (!p2)
873 continue;
874 *p2 = '\0';
ff8d6ee9 875
6328fd9c
CB
876 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
877 * contains an entry of the form:
ff8d6ee9
CB
878 *
879 * 0::/some/path
880 *
6328fd9c 881 * In this case we use "cgroup2" as controller name.
ff8d6ee9 882 */
6328fd9c
CB
883 if ((p2 - p) == 0) {
884 must_append_string(klist, "cgroup2");
ff8d6ee9 885 continue;
6328fd9c 886 }
ff8d6ee9 887
0be0d78f 888 lxc_iterate_parts(tok, p, ",") {
ccb4cabe
SH
889 if (strncmp(tok, "name=", 5) == 0)
890 must_append_string(nlist, tok);
891 else
892 must_append_string(klist, tok);
893 }
894 }
895
d6337a5f 896 return 0;
ccb4cabe
SH
897}
898
d7314671 899static char *trim(char *s)
ccb4cabe 900{
7689dfd7
CB
901 size_t len;
902
903 len = strlen(s);
2c28d76b 904 while ((len > 1) && (s[len - 1] == '\n'))
ccb4cabe 905 s[--len] = '\0';
d7314671
CB
906
907 return s;
ccb4cabe
SH
908}
909
2202afc9 910static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
ccb4cabe
SH
911{
912 int i;
27d84737 913 struct hierarchy **it;
41c33dbe 914
2202afc9
CB
915 if (!ops->hierarchies) {
916 TRACE(" No hierarchies found");
ccb4cabe
SH
917 return;
918 }
27d84737 919
2202afc9
CB
920 TRACE(" Hierarchies:");
921 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
ccb4cabe 922 int j;
27d84737
CB
923 char **cit;
924
bb221ad1 925 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
2202afc9
CB
926 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
927 TRACE(" controllers:");
a7b0cc4c 928 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
2202afc9 929 TRACE(" %d: %s", j, *cit);
ccb4cabe
SH
930 }
931}
41c33dbe 932
a3926f6a
CB
/* TRACE-log the raw /proc/self/cgroup contents plus the parsed kernel and
 * named subsystem lists. Debug aid only.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}
ccb4cabe 948
de6fe132 949static int cgroup_tree_remove(struct hierarchy **hierarchies,
2202afc9 950 const char *container_cgroup)
c71d83e1 951{
2202afc9
CB
952 if (!container_cgroup || !hierarchies)
953 return 0;
d6337a5f 954
8e64b673 955 for (int i = 0; hierarchies[i]; i++) {
2202afc9 956 struct hierarchy *h = hierarchies[i];
77c3e9a2 957 int ret;
d6337a5f 958
eb697136 959 if (!h->container_full_path)
2202afc9
CB
960 continue;
961
8408a9cc 962 ret = lxc_rm_rf(h->container_full_path);
2202afc9 963 if (ret < 0)
eb697136 964 WARN("Failed to destroy \"%s\"", h->container_full_path);
2202afc9 965
77c3e9a2 966 free_disarm(h->container_full_path);
2202afc9 967 }
d6337a5f 968
c71d83e1 969 return 0;
d6337a5f
CB
970}
971
2202afc9
CB
/* Arguments marshalled into a child that userns_exec_1() runs inside the
 * container's user namespace.
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *container_cgroup;   /* container's cgroup name */
	struct lxc_conf *conf;          /* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path;    /* optional path argument for the callback */
};
d6337a5f 979
de6fe132 980static int cgroup_tree_remove_wrapper(void *data)
2202afc9 981{
2202afc9
CB
982 struct generic_userns_exec_data *arg = data;
983 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
984 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
8e64b673 985 int ret;
d6337a5f 986
b58214ac
CB
987 if (!lxc_setgroups(0, NULL) && errno != EPERM)
988 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
989
2202afc9 990 ret = setresgid(nsgid, nsgid, nsgid);
8e64b673 991 if (ret < 0)
77c3e9a2 992 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
8e64b673 993 (int)nsgid, (int)nsgid, (int)nsgid);
d6337a5f 994
2202afc9 995 ret = setresuid(nsuid, nsuid, nsuid);
8e64b673 996 if (ret < 0)
77c3e9a2 997 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
8e64b673 998 (int)nsuid, (int)nsuid, (int)nsuid);
d6337a5f 999
de6fe132 1000 return cgroup_tree_remove(arg->hierarchies, arg->container_cgroup);
d6337a5f
CB
1001}
1002
434c8e15
CB
1003__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
1004 struct lxc_handler *handler)
d6337a5f
CB
1005{
1006 int ret;
bd8ef4e4 1007
fc3b9533
CB
1008 if (!ops) {
1009 ERROR("Called with uninitialized cgroup operations");
1010 return;
1011 }
fc1c3af9 1012
69b4a4bb
CB
1013 if (!ops->hierarchies)
1014 return;
1015
fc3b9533
CB
1016 if (!handler) {
1017 ERROR("Called with uninitialized handler");
1018 return;
1019 }
fc1c3af9 1020
fc3b9533
CB
1021 if (!handler->conf) {
1022 ERROR("Called with uninitialized conf");
1023 return;
1024 }
fc1c3af9 1025
bf651989
CB
1026#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
1027 ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
1028 if (ret < 0)
1029 WARN("Failed to detach bpf program from cgroup");
1030#endif
1031
8e64b673
CB
1032 if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) {
1033 struct generic_userns_exec_data wrap = {
77c3e9a2
CB
1034 .conf = handler->conf,
1035 .container_cgroup = ops->container_cgroup,
1036 .hierarchies = ops->hierarchies,
1037 .origuid = 0,
8e64b673 1038 };
de6fe132
CB
1039 ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
1040 &wrap, "cgroup_tree_remove_wrapper");
8e64b673 1041 } else {
de6fe132 1042 ret = cgroup_tree_remove(ops->hierarchies, ops->container_cgroup);
ccb4cabe 1043 }
8e64b673 1044 if (ret < 0)
fc3b9533 1045 SYSWARN("Failed to destroy cgroups");
ccb4cabe
SH
1046}
1047
434c8e15
CB
/*
 * Destroy the monitor cgroups in all hierarchies. Because the monitor
 * process may still be running inside its cgroup, it is first moved into a
 * dedicated pivot cgroup (CGROUP_PIVOT) so the tree can be removed.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (!h->monitor_full_path)
			continue;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto try_lxc_rm_rf;
		}

		/* The pivot cgroup lives beside the configured cgroup dir
		 * (if any) under the hierarchy's base path.
		 */
		if (conf && conf->cgroup_meta.dir)
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    conf->cgroup_meta.dir,
						    CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    CGROUP_PIVOT, NULL);

		ret = mkdir_p(pivot_path, 0755);
		if (ret < 0 && errno != EEXIST) {
			ERROR("Failed to create %s", pivot_path);
			goto try_lxc_rm_rf;
		}

		/* Move the monitor out of the cgroup we are about to delete. */
		ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

try_lxc_rm_rf:
		ret = lxc_rm_rf(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}
1120
6099dd5a
CB
/*
 * Create @dir and all missing parent directories with mode @mode.
 * EEXIST is tolerated for every intermediate component but treated as an
 * error for the final component (hence the name): creating an
 * already-existing leaf cgroup must fail so callers can retry with a
 * different name.
 */
static int mkdir_eexist_on_last(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	size_t orig_len;

	orig_len = strlen(dir);
	do {
		__do_free char *makeme = NULL;
		int ret;
		size_t cur_len;

		/* Advance past leading slashes, then to the end of the next
		 * path component.
		 */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		/* Prefix up to the start of the current component; on the
		 * final iteration this is the whole path (cur_len == orig_len).
		 */
		cur_len = dir - orig;
		makeme = strndup(orig, cur_len);
		if (!makeme)
			return ret_set_errno(-1, ENOMEM);

		ret = mkdir(makeme, mode);
		/* Fail on any error except EEXIST on a non-final component. */
		if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
	} while (tmp != dir);

	return 0;
}
1148
/*
 * Create the cgroup @cgroup_leaf in hierarchy @h and cache an O_DIRECTORY
 * fd plus the full path on the hierarchy (payload slots when @payload is
 * true, monitor slots otherwise). Returns false if the cgroup already
 * exists or cannot be created, so callers can retry with another name.
 */
static bool cgroup_tree_create(struct hierarchy *h, const char *cgroup_tree,
			       const char *cgroup_leaf, bool payload)
{
	__do_free char *path = NULL;
	int ret, ret_cpuset;

	path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
	if (dir_exists(path))
		return log_warn_errno(false, errno, "The %s cgroup already existed", path);

	/* Legacy cpuset needs cpuset.cpus/cpuset.mems initialized; the helper
	 * may create the target directory itself (returns 1 in that case).
	 */
	ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
	if (ret_cpuset < 0)
		return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");

	ret = mkdir_eexist_on_last(path, 0755);
	if (ret < 0) {
		/*
		 * This is the cpuset controller and
		 * cg_legacy_handle_cpuset_hierarchy() has created our target
		 * directory for us to ensure correct initialization.
		 */
		if (ret_cpuset != 1 || cgroup_tree)
			return log_error_errno(false, errno, "Failed to create %s cgroup", path);
	}

	/* Cache dirfd + path for later writes/cleanup. */
	if (payload) {
		h->cgfd_con = lxc_open_dirfd(path);
		if (h->cgfd_con < 0)
			return log_error_errno(false, errno, "Failed to open %s", path);
		h->container_full_path = move_ptr(path);
	} else {
		h->cgfd_mon = lxc_open_dirfd(path);
		if (h->cgfd_mon < 0)
			return log_error_errno(false, errno, "Failed to open %s", path);
		h->monitor_full_path = move_ptr(path);
	}

	return true;
}
1188
de6fe132 1189static void cgroup_tree_leaf_remove(struct hierarchy *h, bool payload)
ccb4cabe 1190{
fe70edee 1191 __do_free char *full_path = NULL;
72068e74 1192
1973b62a 1193 if (payload) {
f62cf1d4 1194 __lxc_unused __do_close int fd = move_fd(h->cgfd_con);
d6bdd182 1195 full_path = move_ptr(h->container_full_path);
1973b62a 1196 } else {
f62cf1d4 1197 __lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
d6bdd182 1198 full_path = move_ptr(h->monitor_full_path);
1973b62a 1199 }
e56639fb 1200
d6bdd182 1201 if (full_path && rmdir(full_path))
fe70edee 1202 SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
72068e74
CB
1203}
1204
/*
 * Create the monitor process cgroup in every hierarchy. The name is derived
 * from (in order of precedence) lxc.cgroup.dir, the configured cgroup
 * pattern with "%n" replaced by the container name, or the default monitor
 * prefix plus the container name. On a collision the CGROUP_CREATE_RETRY
 * suffix is rewritten to "-<idx>" and creation is retried up to 1000 times.
 */
__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	/* The name ends in CGROUP_CREATE_RETRY so there is room to append a
	 * "-<idx>" retry suffix in place.
	 */
	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* Truncate the retry placeholder; sprintf() below rewrites it. */
	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
				continue;

			/* Roll back the hierarchies created so far, then retry
			 * with the next suffix.
			 */
			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_tree_leaf_remove(ops->hierarchies[j], false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}
1279
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
/*
 * Create the container (payload) cgroup in every hierarchy. Naming and
 * retry logic mirror cgfsng_monitor_create(): lxc.cgroup.dir takes
 * precedence, then the cgroup pattern with "%n" substituted, then the
 * default payload prefix plus container name; collisions are retried with a
 * "-<idx>" suffix up to 1000 times.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	/* The name ends in CGROUP_CREATE_RETRY so there is room to append a
	 * "-<idx>" retry suffix in place.
	 */
	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP,
					       CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	}
	if (!container_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* Truncate the retry placeholder; sprintf() below rewrites it. */
	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
				continue;

			/* Roll back the hierarchies created so far, then retry
			 * with the next suffix.
			 */
			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_tree_leaf_remove(ops->hierarchies[j], true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->container_cgroup = move_ptr(container_cgroup);
	INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
	return true;
}
1359
c581d2a6
CB
1360__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1361 struct lxc_handler *handler)
ccb4cabe 1362{
fdb0b8ab 1363 int monitor_len, transient_len = 0;
c581d2a6
CB
1364 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1365 transient[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 1366
797fa65e
CB
1367 if (!ops)
1368 return ret_set_errno(false, ENOENT);
1369
69b4a4bb
CB
1370 if (!ops->hierarchies)
1371 return true;
1372
797fa65e
CB
1373 if (!ops->monitor_cgroup)
1374 return ret_set_errno(false, ENOENT);
1375
1376 if (!handler || !handler->conf)
1377 return ret_set_errno(false, EINVAL);
1378
c581d2a6
CB
1379 monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1380 if (handler->transient_pid > 0)
1973b62a 1381 transient_len = snprintf(transient, sizeof(transient), "%d", handler->transient_pid);
ccb4cabe 1382
eeef32bb 1383 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1384 struct hierarchy *h = ops->hierarchies[i];
c581d2a6 1385 int ret;
08768001 1386
1973b62a
CB
1387 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
1388 if (ret)
1389 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
c581d2a6 1390
34683042 1391 if (handler->transient_pid <= 0)
c581d2a6
CB
1392 return true;
1393
1973b62a
CB
1394 ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
1395 if (ret)
1396 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);
1397
1398 /*
78eb6aa6 1399 * we don't keep the fds for non-unified hierarchies around
1973b62a 1400 * mainly because we don't make use of them anymore after the
78eb6aa6 1401 * core cgroup setup is done but also because there are quite a
1973b62a
CB
1402 * lot of them.
1403 */
1404 if (!is_unified_hierarchy(h))
1405 close_prot_errno_disarm(h->cgfd_mon);
ccb4cabe 1406 }
c581d2a6 1407 handler->transient_pid = -1;
ccb4cabe
SH
1408
1409 return true;
1410}
1411
c581d2a6
CB
1412__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1413 struct lxc_handler *handler)
eeef32bb 1414{
c581d2a6
CB
1415 int len;
1416 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
eeef32bb 1417
4490328e
CB
1418 if (!ops)
1419 return ret_set_errno(false, ENOENT);
1420
c581d2a6
CB
1421 if (!ops->hierarchies)
1422 return true;
1423
4490328e
CB
1424 if (!ops->container_cgroup)
1425 return ret_set_errno(false, ENOENT);
1426
1427 if (!handler || !handler->conf)
1428 return ret_set_errno(false, EINVAL);
1429
c581d2a6
CB
1430 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1431
1432 for (int i = 0; ops->hierarchies[i]; i++) {
1973b62a 1433 struct hierarchy *h = ops->hierarchies[i];
c581d2a6
CB
1434 int ret;
1435
1973b62a 1436 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
c581d2a6 1437 if (ret != 0)
1973b62a 1438 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
c581d2a6
CB
1439 }
1440
1441 return true;
eeef32bb
CB
1442}
1443
1973b62a
CB
1444static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1445 gid_t chown_gid, mode_t chmod_mode)
6efacf80
CB
1446{
1447 int ret;
1448
1973b62a
CB
1449 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1450 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1451 if (ret < 0)
1452 return log_warn_errno(-1,
1453 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1454 dirfd, path, (int)chown_uid,
1455 (int)chown_gid);
6efacf80 1456
1973b62a
CB
1457 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1458 if (ret < 0)
1459 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1460 dirfd, path, (int)chmod_mode);
6efacf80
CB
1461
1462 return 0;
1463}
1464
/* chgrp the container cgroups to the container group. We leave
 * the container owner as cgroup owner, so we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on the kernel version.
 */
ccb4cabe
SH
/*
 * userns_exec_1() helper: from inside the container's user namespace, chown
 * the payload cgroup directories and key control files so container root
 * can manage its own sub-cgroups (see the comment above for the policy).
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* EPERM is tolerated: unprivileged callers may not drop groups. */
	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original (parent-namespace) uid into this namespace. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->cgfd_con;

		/* Empty path: operate on the cgroup directory itself. */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		/* "tasks" only exists on legacy (v1) hierarchies. */
		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		/* Additional cgroup2 files marked for chowning. */
		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1524
b857f4be 1525__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
c98bbf71 1526 struct lxc_conf *conf)
ccb4cabe 1527{
4160c3a0 1528 struct generic_userns_exec_data wrap;
ccb4cabe 1529
c98bbf71
CB
1530 if (!ops)
1531 return ret_set_errno(false, ENOENT);
ccb4cabe 1532
69b4a4bb
CB
1533 if (!ops->hierarchies)
1534 return true;
1535
c98bbf71
CB
1536 if (!ops->container_cgroup)
1537 return ret_set_errno(false, ENOENT);
1538
1539 if (!conf)
1540 return ret_set_errno(false, EINVAL);
1541
1542 if (lxc_list_empty(&conf->id_map))
1543 return true;
1544
ccb4cabe 1545 wrap.origuid = geteuid();
4160c3a0 1546 wrap.path = NULL;
2202afc9 1547 wrap.hierarchies = ops->hierarchies;
4160c3a0 1548 wrap.conf = conf;
ccb4cabe 1549
c98bbf71
CB
1550 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1551 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
ccb4cabe
SH
1552
1553 return true;
1554}
1555
78eb6aa6
CB
1556__cgfsng_ops void cgfsng_payload_finalize(struct cgroup_ops *ops)
1557{
1558 if (!ops)
1559 return;
1560
1561 if (!ops->hierarchies)
1562 return;
1563
1564 for (int i = 0; ops->hierarchies[i]; i++) {
1565 struct hierarchy *h = ops->hierarchies[i];
1566 /*
1567 * we don't keep the fds for non-unified hierarchies around
1568 * mainly because we don't make use of them anymore after the
1569 * core cgroup setup is done but also because there are quite a
1570 * lot of them.
1571 */
1572 if (!is_unified_hierarchy(h))
1573 close_prot_errno_disarm(h->cgfd_con);
1574 }
1575}
1576
8aa1044f 1577/* cgroup-full:* is done, no need to create subdirs */
77c3e9a2 1578static inline bool cg_mount_needs_subdirs(int type)
8aa1044f 1579{
77c3e9a2 1580 return !(type >= LXC_AUTO_CGROUP_FULL_RO);
8aa1044f
SH
1581}
1582
886cac86
CB
/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount the controller read-only if needed and bind-mount the cgroupfs
 * onto controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* For ro/mixed modes the controller mountpoint itself becomes
	 * read-only: bind it onto itself, then remount with MS_RDONLY.
	 */
	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind the container's own cgroup directory over the path visible
	 * inside the container.
	 */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* MS_RDONLY on a bind mount only takes effect on remount. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1636
6812d833
CB
1637/* __cg_mount_direct
1638 *
1639 * Mount cgroup hierarchies directly without using bind-mounts. The main
1640 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1641 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1642 */
1643static int __cg_mount_direct(int type, struct hierarchy *h,
1644 const char *controllerpath)
b635e92d 1645{
d97919ab 1646 __do_free char *controllers = NULL;
a760603e
CB
1647 char *fstype = "cgroup2";
1648 unsigned long flags = 0;
f6b54668 1649 int ret;
b635e92d 1650
a760603e
CB
1651 flags |= MS_NOSUID;
1652 flags |= MS_NOEXEC;
1653 flags |= MS_NODEV;
1654 flags |= MS_RELATIME;
1655
1656 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1657 flags |= MS_RDONLY;
1658
d6337a5f 1659 if (h->version != CGROUP2_SUPER_MAGIC) {
a760603e
CB
1660 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1661 if (!controllers)
1662 return -ENOMEM;
1663 fstype = "cgroup";
b635e92d
CB
1664 }
1665
a760603e 1666 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
77c3e9a2
CB
1667 if (ret < 0)
1668 return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s",
1669 controllerpath, fstype);
b635e92d 1670
6812d833 1671 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
b635e92d
CB
1672 return 0;
1673}
1674
6812d833
CB
/* Thin wrapper around __cg_mount_direct() used when the container runs in
 * a cgroup namespace (see cgfsng_mount()).
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}
1680
1681static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1682 const char *controllerpath)
1683{
1684 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1685 return 0;
1686
1687 return __cg_mount_direct(type, h, controllerpath);
1688}
1689
/*
 * Set up cgroup mounts for the container under <root>/sys/fs/cgroup
 * according to the LXC_AUTO_CGROUP_* flags in @type. If cgroup namespaces
 * are supported and no forced mount is requested, the mounting is left to
 * the container/kernel and this is a no-op.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler,
				      const char *root, int type)
{
	__do_free char *cgroup_root = NULL;
	bool has_cgns = false, wants_force_mount = false;
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
		return true;

	if (type & LXC_AUTO_CGROUP_FORCE) {
		type &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	/* If the container won't keep CAP_SYS_ADMIN it cannot mount cgroups
	 * itself, so we must do it here.
	 */
	if (!wants_force_mount){
		if (!lxc_list_empty(&handler->conf->keepcaps))
			wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
		else
			wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
	}

	has_cgns = cgns_supported();
	if (has_cgns && !wants_force_mount)
		return true;

	/* Resolve the "nospec" defaults to concrete mount modes. */
	if (type == LXC_AUTO_CGROUP_NOSPEC)
		type = LXC_AUTO_CGROUP_MIXED;
	else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
		type = LXC_AUTO_CGROUP_FULL_MIXED;

	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
		if (has_cgns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			return cg_mount_in_cgroup_namespace(type, ops->unified, cgroup_root) == 0;
		}

		return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
	}

	/* mount tmpfs */
	ret = safe_mount(NULL, cgroup_root, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *controllerpath = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		/* The per-controller directory name is the last component of
		 * the host mountpoint.
		 */
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(cgroup_root, controller, NULL);
		if (dir_exists(controllerpath))
			continue;

		ret = mkdir(controllerpath, 0755);
		if (ret < 0)
			return log_error_errno(false, errno, "Error creating cgroup path: %s", controllerpath);

		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
			if (ret < 0)
				return false;

			continue;
		}

		ret = cg_mount_cgroup_full(type, h, controllerpath);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(type))
			continue;

		/* Second stage: create the container's cgroup path inside the
		 * controller mount and bind the real cgroup over it.
		 */
		path2 = must_make_path(controllerpath, h->container_base_path,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0)
			return false;

		ret = cg_legacy_mount_controllers(type, h, controllerpath,
						  path2, ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}
1802
11c23867 1803/* Only root needs to escape to the cgroup of its init. */
b857f4be 1804__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
52d08ab0 1805 struct lxc_conf *conf)
ccb4cabe 1806{
52d08ab0
CB
1807 if (!ops)
1808 return ret_set_errno(false, ENOENT);
1809
1810 if (!ops->hierarchies)
1811 return true;
1812
1813 if (!conf)
1814 return ret_set_errno(false, EINVAL);
1815
1816 if (conf->cgroup_meta.relative || geteuid())
ccb4cabe
SH
1817 return true;
1818
779b3d82 1819 for (int i = 0; ops->hierarchies[i]; i++) {
88396101 1820 __do_free char *fullpath = NULL;
52d08ab0 1821 int ret;
11c23867 1822
52d08ab0
CB
1823 fullpath =
1824 must_make_path(ops->hierarchies[i]->mountpoint,
1825 ops->hierarchies[i]->container_base_path,
1826 "cgroup.procs", NULL);
7cea5905 1827 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
52d08ab0 1828 if (ret != 0)
77c3e9a2 1829 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
ccb4cabe
SH
1830 }
1831
6df334d1 1832 return true;
ccb4cabe
SH
1833}
1834
b857f4be 1835__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
36662416 1836{
69b4a4bb
CB
1837 int i = 0;
1838
e3ffb28b
CB
1839 if (!ops)
1840 return ret_set_errno(-1, ENOENT);
1841
69b4a4bb
CB
1842 if (!ops->hierarchies)
1843 return 0;
36662416 1844
69b4a4bb 1845 for (; ops->hierarchies[i]; i++)
36662416
TA
1846 ;
1847
1848 return i;
1849}
1850
aa48a34f
CB
1851__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
1852 char ***out)
36662416
TA
1853{
1854 int i;
1855
aa48a34f
CB
1856 if (!ops)
1857 return ret_set_errno(false, ENOENT);
1858
69b4a4bb 1859 if (!ops->hierarchies)
77c3e9a2 1860 return ret_set_errno(false, ENOENT);
69b4a4bb 1861
36662416 1862 /* sanity check n */
6b38e644 1863 for (i = 0; i < n; i++)
2202afc9 1864 if (!ops->hierarchies[i])
aa48a34f 1865 return ret_set_errno(false, ENOENT);
36662416 1866
2202afc9 1867 *out = ops->hierarchies[i]->controllers;
36662416
TA
1868
1869 return true;
1870}
1871
/* Freeze all container tasks by writing "FROZEN" to freezer.state of the
 * legacy freezer hierarchy.
 *
 * NOTE(review): the return type reads as bool here, yet the error path
 * returns ret_set_errno(-1, ENOENT) and the success path returns the int
 * result of lxc_write_openat() (0 on success) — both would be implicitly
 * converted if this really is bool. Upstream declares this function as
 * returning int; confirm against the callers before relying on the value.
 */
static bool cg_legacy_freeze(struct cgroup_ops *ops)
{
	struct hierarchy *h;

	h = get_hierarchy(ops, "freezer");
	if (!h)
		return ret_set_errno(-1, ENOENT);

	return lxc_write_openat(h->container_full_path, "freezer.state",
				"FROZEN", STRLITERALLEN("FROZEN"));
}
942e193e 1883
018051e3
CB
1884static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1885 struct lxc_epoll_descr *descr)
ee3a7775 1886{
f62cf1d4 1887 __do_close int duped_fd = -EBADF;
018051e3 1888 __do_free char *line = NULL;
ee3a7775 1889 __do_fclose FILE *f = NULL;
018051e3
CB
1890 int state = PTR_TO_INT(cbdata);
1891 size_t len;
1892 const char *state_string;
1893
1894 duped_fd = dup(fd);
1895 if (duped_fd < 0)
1896 return LXC_MAINLOOP_ERROR;
1897
1898 if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1)
1899 return LXC_MAINLOOP_ERROR;
1900
1901 f = fdopen(duped_fd, "re");
1902 if (!f)
1903 return LXC_MAINLOOP_ERROR;
1904 move_fd(duped_fd);
1905
1906 if (state == 1)
1907 state_string = "frozen 1";
1908 else
1909 state_string = "frozen 0";
1910
1911 while (getline(&line, &len, f) != -1)
1912 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
1913 return LXC_MAINLOOP_CLOSE;
1914
1915 return LXC_MAINLOOP_CONTINUE;
1916}
1917
/* Freeze the container on the unified (cgroup2) hierarchy by writing "1" to
 * cgroup.freeze. When @timeout is non-zero, an epoll handler on cgroup.events
 * waits until the "frozen 1" state appears or the timeout expires.
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	/* NOTE(review): EEXIST for a missing container path looks odd —
	 * presumably a long-standing convention; confirm before changing. */
	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* INT_TO_PTR(1): callback waits for the "frozen 1" line. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	/* Block until the freezer callback reports the frozen state. */
	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be frozen");

	return 0;
}
1962
018051e3 1963__cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
942e193e 1964{
81468ea7 1965 if (!ops->hierarchies)
d2203230 1966 return ret_set_errno(-1, ENOENT);
81468ea7 1967
ee3a7775
CB
1968 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1969 return cg_legacy_freeze(ops);
942e193e 1970
018051e3 1971 return cg_unified_freeze(ops, timeout);
ee3a7775
CB
1972}
1973
018051e3 1974static int cg_legacy_unfreeze(struct cgroup_ops *ops)
ee3a7775 1975{
ee3a7775
CB
1976 struct hierarchy *h;
1977
1978 h = get_hierarchy(ops, "freezer");
1979 if (!h)
d2203230 1980 return ret_set_errno(-1, ENOENT);
ee3a7775 1981
c04a6d4e
CB
1982 return lxc_write_openat(h->container_full_path, "freezer.state",
1983 "THAWED", STRLITERALLEN("THAWED"));
ee3a7775
CB
1984}
1985
/* Thaw the container on the unified (cgroup2) hierarchy by writing "0" to
 * cgroup.freeze. When @timeout is non-zero, an epoll handler on cgroup.events
 * waits until the "frozen 0" state appears or the timeout expires.
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	/* NOTE(review): EEXIST for a missing container path mirrors
	 * cg_unified_freeze(); confirm before changing. */
	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* INT_TO_PTR(0): callback waits for the "frozen 0" line. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	/* Block until the freezer callback reports the thawed state. */
	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");

	return 0;
}
2030
018051e3 2031__cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
ee3a7775
CB
2032{
2033 if (!ops->hierarchies)
d2203230 2034 return ret_set_errno(-1, ENOENT);
ee3a7775
CB
2035
2036 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2037 return cg_legacy_unfreeze(ops);
2038
018051e3 2039 return cg_unified_unfreeze(ops, timeout);
ccb4cabe
SH
2040}
2041
b857f4be 2042__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
6bdf9691 2043 const char *controller)
ccb4cabe 2044{
d6337a5f
CB
2045 struct hierarchy *h;
2046
2202afc9 2047 h = get_hierarchy(ops, controller);
6bdf9691 2048 if (!h)
77c3e9a2 2049 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
6bdf9691 2050 controller ? controller : "(null)");
ccb4cabe 2051
6bdf9691
CB
2052 return h->container_full_path
2053 ? h->container_full_path + strlen(h->mountpoint)
2054 : NULL;
371f834d
SH
2055}
2056
/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
 * which must be freed by the caller.
 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	/* Joins "<mountpoint>/<inpath>/<filename>"; must_make_path() aborts
	 * on allocation failure, so the result is never NULL. */
	return must_make_path(h->mountpoint, inpath, filename, NULL);
}
2066
4b86fefd 2067static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
c2aed66d 2068{
ad275c16 2069 int idx = 1;
c2aed66d 2070 int ret;
900b6606
CB
2071 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2072 size_t pidstr_len;
c2aed66d 2073
ad275c16 2074 /* Create leaf cgroup. */
275e8ef8 2075 ret = mkdirat(unified_fd, ".lxc", 0755);
ad275c16 2076 if (ret < 0 && errno != EEXIST)
275e8ef8 2077 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
ad275c16 2078
7581a82f 2079 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
275e8ef8 2080 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
ad275c16
CB
2081 if (ret < 0)
2082 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
c2aed66d 2083 if (ret == 0)
bad788b0 2084 return 0;
ad275c16 2085
bad788b0
CB
2086 /* this is a non-leaf node */
2087 if (errno != EBUSY)
d2203230 2088 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2089
c2aed66d 2090 do {
7581a82f 2091 bool rm = false;
275e8ef8 2092 char attach_cgroup[STRLITERALLEN(".lxc-1000/cgroup.procs") + 1];
bad788b0 2093 char *slash;
c2aed66d 2094
5045306b
CB
2095 ret = snprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2096 if (ret < 0 || (size_t)ret >= sizeof(attach_cgroup))
2097 return ret_errno(EIO);
2098
bad788b0
CB
2099 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs");
2100 *slash = '\0';
ad275c16 2101
bad788b0 2102 ret = mkdirat(unified_fd, attach_cgroup, 0755);
c2aed66d 2103 if (ret < 0 && errno != EEXIST)
d2203230 2104 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
7581a82f
CB
2105 if (ret == 0)
2106 rm = true;
c2aed66d 2107
bad788b0 2108 *slash = '/';
ad275c16 2109
bad788b0 2110 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
c2aed66d 2111 if (ret == 0)
bad788b0 2112 return 0;
c2aed66d 2113
7581a82f
CB
2114 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2115 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2116
c2aed66d
CB
2117 /* this is a non-leaf node */
2118 if (errno != EBUSY)
d2203230 2119 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d 2120
edae86e9
CB
2121 idx++;
2122 } while (idx < 1000);
c2aed66d 2123
ad275c16 2124 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
c2aed66d
CB
2125}
2126
d1783ef4
CB
2127static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2128 int unified_fd, int *sk_fd)
2129{
7d849163
CB
2130 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2131 int target_fds[2];
d1783ef4
CB
2132 ssize_t ret;
2133
2134 /* Create leaf cgroup. */
2135 ret = mkdirat(unified_fd, ".lxc", 0755);
2136 if (ret < 0 && errno != EEXIST)
2137 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2138
7d849163
CB
2139 target_fd0 = openat(unified_fd, ".lxc/cgroup.procs", O_WRONLY | O_CLOEXEC | O_NOFOLLOW);
2140 if (target_fd0 < 0)
d1783ef4 2141 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2142 target_fds[0] = target_fd0;
d1783ef4 2143
7d849163
CB
2144 target_fd1 = openat(unified_fd, "cgroup.procs", O_WRONLY | O_CLOEXEC | O_NOFOLLOW);
2145 if (target_fd1 < 0)
49df620b 2146 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
7d849163 2147 target_fds[1] = target_fd1;
49df620b
CB
2148
2149 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
d1783ef4 2150 if (ret <= 0)
49df620b 2151 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
7d849163 2152 target_fd0, target_fd1);
d1783ef4 2153
7d849163 2154 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
d1783ef4
CB
2155}
2156
/* Parent half of the userns attach: receive the two target cgroup.procs fds
 * from the child and write @pid into one of them — the ".lxc" leaf first,
 * the cgroup's own cgroup.procs as fallback. Returns 0 on success, -1 with
 * errno set when both writes fail.
 */
static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
					int *sk_fd, pid_t pid)
{
	__do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
	int target_fds[2];
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	size_t pidstr_len;
	ssize_t ret;

	ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
	if (ret <= 0)
		return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
	/* Adopt the received fds so __do_close cleans them up. */
	target_fd0 = target_fds[0];
	target_fd1 = target_fds[1];

	pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);

	/* Try the ".lxc" leaf first. */
	ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
	if (ret > 0 && ret == pidstr_len)
		return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);

	/* Fall back to the cgroup's own cgroup.procs. */
	ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
	if (ret > 0 && ret == pidstr_len)
		return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);

	return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
			       target_fd0, target_fd1);
}
2185
/* Argument bundle for the userns attach wrappers run via
 * userns_exec_minimal(): the child creates the leaf cgroup inside the user
 * namespace and ships the target fds to the parent over @sk_pair; the parent
 * then writes @pid into one of them.
 */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container configuration */
	int unified_fd;			/* fd of the container's unified cgroup dir */
	int sk_pair[2];			/* socketpair: [0] parent end, [1] child end */
	pid_t pid;			/* process to attach */
};
2192
d1783ef4
CB
2193static int cgroup_unified_attach_child_wrapper(void *data)
2194{
2195 struct userns_exec_unified_attach_data *args = data;
2196
2197 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2198 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2199 return ret_errno(EINVAL);
2200
2201 close_prot_errno_disarm(args->sk_pair[0]);
2202 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2203 &args->sk_pair[1]);
2204}
2205
2206static int cgroup_unified_attach_parent_wrapper(void *data)
4b86fefd
CB
2207{
2208 struct userns_exec_unified_attach_data *args = data;
4b86fefd 2209
d1783ef4
CB
2210 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2211 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
4b86fefd
CB
2212 return ret_errno(EINVAL);
2213
d1783ef4
CB
2214 close_prot_errno_disarm(args->sk_pair[1]);
2215 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2216 args->pid);
4b86fefd
CB
2217}
2218
/* Public entry point: attach @pid to the unified cgroup of the running
 * container @name under @lxcpath, using the monitor's command socket to
 * obtain the cgroup2 directory fd. Returns 0 on success, a negative
 * errno-style value on failure (-EBADF when the fd cannot be obtained,
 * letting callers fall back to path-based attach).
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	__do_close int unified_fd = -EBADF;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Ask the running container's monitor for its cgroup2 dir fd. */
	unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	if (!lxc_list_empty(&conf->id_map)) {
		/* Container runs in a user namespace: split the work between
		 * a child inside the userns (creates the leaf, sends fds) and
		 * the parent (writes the pid), over a socketpair. */
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2254
2255/* Technically, we're always at a delegation boundary here (This is especially
2256 * true when cgroup namespaces are available.). The reasoning is that in order
2257 * for us to have been able to start a container in the first place the root
2258 * cgroup must have been a leaf node. Now, either the container's init system
2259 * has populated the cgroup and kept it as a leaf node or it has created
2260 * subtrees. In the former case we will simply attach to the leaf node we
2261 * created when we started the container in the latter case we create our own
2262 * cgroup for the attaching process.
2263 */
/* Attach @pid to the container's unified hierarchy. First tries the command
 * handler fast path (cgroup_attach()); when that yields -EBADF, falls back to
 * resolving the cgroup path for @controller and opening the directory
 * ourselves. Returns 0 on success (including "container not running"),
 * negative errno-style value on failure.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Fast path via the monitor's command socket. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	if (ret != -EBADF)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = must_make_path(h->mountpoint, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	if (!lxc_list_empty(&conf->id_map)) {
		/* Userns container: same split as in cgroup_attach() — child
		 * creates the leaf inside the userns, parent writes the pid. */
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2316
7581a82f
CB
2317__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2318 const struct lxc_conf *conf,
2319 const char *name, const char *lxcpath,
2320 pid_t pid)
ccb4cabe 2321{
81b5d48a 2322 int len, ret;
a3650c0c 2323 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
ccb4cabe 2324
ab9a452d
CB
2325 if (!ops)
2326 return ret_set_errno(false, ENOENT);
2327
69b4a4bb
CB
2328 if (!ops->hierarchies)
2329 return true;
2330
a3650c0c
CB
2331 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2332 if (len < 0 || (size_t)len >= sizeof(pidstr))
ccb4cabe
SH
2333 return false;
2334
81b5d48a 2335 for (int i = 0; ops->hierarchies[i]; i++) {
c05b17bd 2336 __do_free char *fullpath = NULL, *path = NULL;
2202afc9 2337 struct hierarchy *h = ops->hierarchies[i];
ccb4cabe 2338
c2aed66d 2339 if (h->version == CGROUP2_SUPER_MAGIC) {
7581a82f 2340 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
a3926f6a 2341 h->controllers[0]);
c2aed66d
CB
2342 if (ret < 0)
2343 return false;
2344
2345 continue;
2346 }
2347
ccb4cabe 2348 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
c2aed66d
CB
2349 /* not running */
2350 if (!path)
e2cb2e74 2351 return false;
ccb4cabe 2352
371f834d 2353 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
7cea5905 2354 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
ab9a452d 2355 if (ret < 0)
77c3e9a2 2356 return log_error_errno(false, errno, "Failed to attach %d to %s",
ab9a452d 2357 (int)pid, fullpath);
ccb4cabe
SH
2358 }
2359
ccb4cabe
SH
2360 return true;
2361}
2362
e2bd2b13
CB
2363/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2364 * don't have a cgroup_data set up, so we ask the running container through the
2365 * commands API for the cgroup path.
ccb4cabe 2366 */
b857f4be 2367__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
fb55e009
CB
2368 char *value, size_t len, const char *name,
2369 const char *lxcpath)
ccb4cabe 2370{
d97919ab 2371 __do_free char *path = NULL;
88396101 2372 __do_free char *controller = NULL;
d97919ab 2373 char *p;
0069cc61 2374 struct hierarchy *h;
861cb8c2 2375 int ret = -1;
ccb4cabe 2376
a358028a
CB
2377 if (!ops)
2378 return ret_set_errno(-1, ENOENT);
2379
861cb8c2 2380 controller = must_copy_string(filename);
0069cc61
CB
2381 p = strchr(controller, '.');
2382 if (p)
ccb4cabe
SH
2383 *p = '\0';
2384
0069cc61
CB
2385 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2386 /* not running */
2387 if (!path)
ccb4cabe
SH
2388 return -1;
2389
2202afc9 2390 h = get_hierarchy(ops, controller);
ccb4cabe 2391 if (h) {
88396101 2392 __do_free char *fullpath = NULL;
0069cc61
CB
2393
2394 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
ccb4cabe 2395 ret = lxc_read_from_file(fullpath, value, len);
ccb4cabe 2396 }
ccb4cabe
SH
2397
2398 return ret;
2399}
2400
cb3fc90c
CB
2401static int device_cgroup_parse_access(struct device_item *device, const char *val)
2402{
2403 for (int count = 0; count < 3; count++, val++) {
2404 switch (*val) {
2405 case 'r':
2406 device->access[count] = *val;
2407 break;
2408 case 'w':
2409 device->access[count] = *val;
2410 break;
2411 case 'm':
2412 device->access[count] = *val;
2413 break;
2414 case '\n':
2415 case '\0':
2416 count = 3;
2417 break;
2418 default:
2419 return ret_errno(EINVAL);
2420 }
2421 }
2422
2423 return 0;
2424}
2425
/* Parse a cgroup1-style device rule from @key/@val into @device: either the
 * global rule "a", or "type major:minor access" (e.g. "c 1:3 rwm") where
 * major/minor may be the '*' wildcard. Returns 0 on success, negative on
 * malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	/* "devices.allow" grants access, "devices.deny" revokes it. */
	if (strcmp("devices.allow", key) == 0)
		device->allow = 1;
	else
		device->allow = 0;

	if (strcmp(val, "a") == 0) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		/* "allow all" maps to blacklist mode (default allow),
		 * "deny all" to whitelist mode (default deny). */
		device->global_rule = device->allow
					  ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
					  : LXC_BPF_DEVICE_CGROUP_WHITELIST;
		device->allow = -1;
		return 0;
	}

	/* local rule */
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	/* Device type: 'a' (all), 'b' (block), or 'c' (char). */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* Major number: '*' wildcard or a run of decimal digits. */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* The remaining token is the access string (subset of "rwm"). */
	return device_cgroup_parse_access(device, ++val);
}
2512
eec533e3
CB
2513/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2514 * don't have a cgroup_data set up, so we ask the running container through the
2515 * commands API for the cgroup path.
ccb4cabe 2516 */
b857f4be 2517__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2a63b5cb 2518 const char *key, const char *value,
fb55e009 2519 const char *name, const char *lxcpath)
ccb4cabe 2520{
d97919ab 2521 __do_free char *path = NULL;
88396101 2522 __do_free char *controller = NULL;
d97919ab 2523 char *p;
87777968 2524 struct hierarchy *h;
861cb8c2 2525 int ret = -1;
ccb4cabe 2526
a358028a
CB
2527 if (!ops)
2528 return ret_set_errno(-1, ENOENT);
2529
2a63b5cb 2530 controller = must_copy_string(key);
87777968
CB
2531 p = strchr(controller, '.');
2532 if (p)
ccb4cabe
SH
2533 *p = '\0';
2534
2a63b5cb
CB
2535 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2536 struct device_item device = {0};
2537
2538 ret = device_cgroup_rule_parse(&device, key, value);
2539 if (ret < 0)
d2203230 2540 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2a63b5cb
CB
2541 key, value);
2542
2543 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2544 if (ret < 0)
2545 return -1;
2546
2547 return 0;
2548 }
2549
87777968
CB
2550 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2551 /* not running */
2552 if (!path)
ccb4cabe
SH
2553 return -1;
2554
2202afc9 2555 h = get_hierarchy(ops, controller);
ccb4cabe 2556 if (h) {
88396101 2557 __do_free char *fullpath = NULL;
87777968 2558
2a63b5cb 2559 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
7cea5905 2560 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
ccb4cabe 2561 }
ccb4cabe
SH
2562
2563 return ret;
2564}
2565
91d1a13a 2566/* take devices cgroup line
72add155
SH
2567 * /dev/foo rwx
2568 * and convert it to a valid
2569 * type major:minor mode
91d1a13a
CB
2570 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2571 * the output.
72add155 2572 */
cb3fc90c
CB
2573static int device_cgroup_rule_parse_devpath(struct device_item *device,
2574 const char *devpath)
72add155 2575{
88396101 2576 __do_free char *path = NULL;
2a06d041 2577 char *mode = NULL;
cb3fc90c
CB
2578 int n_parts, ret;
2579 char *p;
2580 struct stat sb;
72add155 2581
cb3fc90c 2582 path = must_copy_string(devpath);
72add155 2583
cb3fc90c
CB
2584 /*
2585 * Read path followed by mode. Ignore any trailing text.
91d1a13a
CB
2586 * A ' # comment' would be legal. Technically other text is not
2587 * legal, we could check for that if we cared to.
72add155 2588 */
0dbdb99e 2589 for (n_parts = 1, p = path; *p; p++) {
2c2d6c49
SH
2590 if (*p != ' ')
2591 continue;
2592 *p = '\0';
91d1a13a 2593
2c2d6c49
SH
2594 if (n_parts != 1)
2595 break;
2596 p++;
2597 n_parts++;
91d1a13a 2598
2c2d6c49
SH
2599 while (*p == ' ')
2600 p++;
91d1a13a 2601
2c2d6c49 2602 mode = p;
91d1a13a 2603
2c2d6c49 2604 if (*p == '\0')
cb3fc90c 2605 return ret_set_errno(-1, EINVAL);
72add155 2606 }
2c2d6c49 2607
cb3fc90c
CB
2608 if (device_cgroup_parse_access(device, mode) < 0)
2609 return -1;
2610
2c2d6c49 2611 if (n_parts == 1)
cb3fc90c 2612 return ret_set_errno(-1, EINVAL);
72add155
SH
2613
2614 ret = stat(path, &sb);
2615 if (ret < 0)
cb3fc90c 2616 return ret_set_errno(-1, errno);
72add155 2617
72add155
SH
2618 mode_t m = sb.st_mode & S_IFMT;
2619 switch (m) {
2620 case S_IFBLK:
cb3fc90c 2621 device->type = 'b';
72add155
SH
2622 break;
2623 case S_IFCHR:
cb3fc90c 2624 device->type = 'c';
72add155 2625 break;
2c2d6c49 2626 default:
77c3e9a2 2627 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
72add155 2628 }
2c2d6c49 2629
cb3fc90c
CB
2630 device->major = MAJOR(sb.st_rdev);
2631 device->minor = MINOR(sb.st_rdev);
2632 device->allow = 1;
2633 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
72add155 2634
cb3fc90c
CB
2635 return 0;
2636}
2637
2638static int convert_devpath(const char *invalue, char *dest)
2639{
2640 struct device_item device = {0};
2641 int ret;
2642
2643 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2644 if (ret < 0)
2645 return -1;
2646
2647 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2648 device.minor, device.access);
2649 if (ret < 0 || ret >= 50)
77c3e9a2
CB
2650 return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2651 device.type, device.major, device.minor, device.access);
cb3fc90c
CB
2652
2653 return 0;
72add155
SH
2654}
2655
90e97284
CB
2656/* Called from setup_limits - here we have the container's cgroup_data because
2657 * we created the cgroups.
ccb4cabe 2658 */
2202afc9
CB
2659static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2660 const char *value)
ccb4cabe 2661{
88396101 2662 __do_free char *controller = NULL;
d97919ab 2663 char *p;
1a0e70ac
CB
2664 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2665 char converted_value[50];
b3646d7e 2666 struct hierarchy *h;
64e82f8b 2667
861cb8c2 2668 controller = must_copy_string(filename);
ab1a6cac
CB
2669 p = strchr(controller, '.');
2670 if (p)
ccb4cabe
SH
2671 *p = '\0';
2672
c8bf519d 2673 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
c04a6d4e
CB
2674 int ret;
2675
72add155
SH
2676 ret = convert_devpath(value, converted_value);
2677 if (ret < 0)
c8bf519d 2678 return ret;
72add155 2679 value = converted_value;
c8bf519d 2680 }
2681
2202afc9 2682 h = get_hierarchy(ops, controller);
77c3e9a2
CB
2683 if (!h)
2684 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
b3646d7e 2685
c04a6d4e 2686 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
ccb4cabe
SH
2687}
2688
/* Apply the lxc.cgroup.* (legacy cgroup1) limits from @conf. When @do_devices
 * is true only "devices" settings are applied, otherwise only the
 * non-devices ones — so callers can order devices setup separately.
 * Returns true on success.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	__do_free struct lxc_list *sorted_cgroup_settings = NULL;
	struct lxc_list *cgroup_settings = &conf->cgroup;
	struct lxc_list *iterator, *next;
	struct lxc_cgroup *cg;
	bool ret = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	/* Sort so settings are applied in a dependency-safe order. */
	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
	if (!sorted_cgroup_settings)
		return false;

	lxc_list_for_each(iterator, sorted_cgroup_settings) {
		cg = iterator->elem;

		/* Apply "devices" entries only when do_devices is set, and
		 * everything else only when it is not. */
		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
				/* Device rules may legitimately be refused for
				 * unprivileged containers: warn and carry on. */
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
				goto out;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
		}
	}

	ret = true;
	INFO("Limits for the legacy cgroup hierarchies have been setup");
out:
	/* Free only the sorted list nodes; the elements belong to
	 * conf->cgroup and stay alive. */
	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
		lxc_list_del(iterator);
		free(iterator);
	}

	return ret;
}
2742
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
/* Parse one lxc.cgroup2.devices.{allow,deny} entry and queue it on
 * conf->devices for the bpf device controller. Compiles to a no-op returning
 * 0 when the kernel headers lack bpf_cgroup_dev_ctx support. */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item device_item = {0};
	int ret;

	/* A value starting with '/' is a device path ("/dev/foo rwm");
	 * anything else is a "type major:minor access" rule. */
	if (strcmp("devices.allow", key) == 0 && *val == '/')
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);

	ret = bpf_list_add_device(conf, &device_item);
	if (ret < 0)
		return -1;
#endif
	return 0;
}
2768
c581d2a6
CB
2769__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2770 struct lxc_handler *handler)
6b38e644 2771{
7e31931f
CB
2772 struct lxc_list *cgroup_settings, *iterator;
2773 struct hierarchy *h;
2774 struct lxc_conf *conf;
6b38e644 2775
7e31931f
CB
2776 if (!ops)
2777 return ret_set_errno(false, ENOENT);
2778
2779 if (!ops->hierarchies)
6b38e644
CB
2780 return true;
2781
7e31931f
CB
2782 if (!ops->container_cgroup)
2783 return ret_set_errno(false, EINVAL);
2784
2785 if (!handler || !handler->conf)
2786 return ret_set_errno(false, EINVAL);
2787 conf = handler->conf;
2788
2789 if (lxc_list_empty(&conf->cgroup2))
2790 return true;
2791 cgroup_settings = &conf->cgroup2;
2792
2793 if (!ops->unified)
6b38e644 2794 return false;
7e31931f 2795 h = ops->unified;
6b38e644 2796
bf651989 2797 lxc_list_for_each (iterator, cgroup_settings) {
6b38e644 2798 struct lxc_cgroup *cg = iterator->elem;
c04a6d4e 2799 int ret;
6b38e644 2800
bf651989 2801 if (strncmp("devices", cg->subsystem, 7) == 0) {
4bfb655e 2802 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
bf651989
CB
2803 cg->value);
2804 } else {
c04a6d4e
CB
2805 ret = lxc_write_openat(h->container_full_path,
2806 cg->subsystem, cg->value,
2807 strlen(cg->value));
7e31931f 2808 if (ret < 0)
77c3e9a2 2809 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"",
7e31931f 2810 cg->subsystem, cg->value);
6b38e644
CB
2811 }
2812 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2813 }
2814
7e31931f 2815 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
6b38e644
CB
2816}
2817
bf651989
CB
/* cgfsng_devices_activate - Build and attach the bpf device-cgroup program.
 *
 * Compiles all recorded device rules (conf->devices) into a single
 * BPF_PROG_TYPE_CGROUP_DEVICE program, attaches it to the container's unified
 * cgroup with BPF_F_ALLOW_MULTI, and installs it as conf->cgroup2_devices.
 * A no-op (returning true) when bpf device control is unavailable or no rules
 * were configured. Compiled out entirely without HAVE_STRUCT_BPF_CGROUP_DEV_CTX.
 */
__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
					  struct lxc_handler *handler)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	__do_bpf_program_free struct bpf_program *devices = NULL;
	int ret;
	struct lxc_conf *conf;
	struct hierarchy *unified;
	struct lxc_list *it;
	struct bpf_program *devices_old;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	/* NOTE(review): EEXIST looks odd for a missing cgroup — presumably
	 * historical; confirm before changing.
	 */
	if (!ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	/* Without a unified hierarchy, bpf device-controller support, a
	 * container path, or any configured rules there is nothing to attach.
	 */
	unified = ops->unified;
	if (!unified || !unified->bpf_device_controller ||
	    !unified->container_full_path || lxc_list_empty(&conf->devices))
		return true;

	devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
	if (!devices)
		return log_error_errno(false, ENOMEM, "Failed to create new bpf program");

	ret = bpf_program_init(devices);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");

	/* Append one bpf instruction sequence per configured device rule. */
	lxc_list_for_each(it, &conf->devices) {
		struct device_item *cur = it->elem;

		ret = bpf_program_append_device(devices, cur);
		if (ret)
			return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
					       cur->type,
					       cur->major,
					       cur->minor,
					       cur->access,
					       cur->allow,
					       cur->global_rule);
		TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
		      cur->type,
		      cur->major,
		      cur->minor,
		      cur->access,
		      cur->allow,
		      cur->global_rule);
	}

	ret = bpf_program_finalize(devices);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");

	ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
					unified->container_full_path,
					BPF_F_ALLOW_MULTI);
	if (ret)
		return log_error_errno(false, ENOMEM, "Failed to attach bpf program");

	/* Replace old bpf program. Swapping the old program into the local
	 * `devices` pointer hands it to the __do_bpf_program_free cleanup on
	 * return, while the freshly attached program survives in conf.
	 */
	devices_old = move_ptr(conf->cgroup2_devices);
	conf->cgroup2_devices = move_ptr(devices);
	devices = move_ptr(devices_old);
#endif
	return true;
}
2893
c581d2a6 2894bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
6b38e644 2895{
c581d2a6 2896 __do_free char *add_controllers = NULL, *base_path = NULL;
f761d24d 2897 __do_free_string_list char **parts = NULL;
c581d2a6
CB
2898 struct hierarchy *unified = ops->unified;
2899 ssize_t parts_len;
2900 char **it;
2901 size_t full_len = 0;
6b38e644 2902
c581d2a6
CB
2903 if (!ops->hierarchies || !pure_unified_layout(ops) ||
2904 !unified->controllers[0])
bf651989
CB
2905 return true;
2906
c581d2a6
CB
2907 /* For now we simply enable all controllers that we have detected by
2908 * creating a string like "+memory +pids +cpu +io".
2909 * TODO: In the near future we might want to support "-<controller>"
2910 * etc. but whether supporting semantics like this make sense will need
2911 * some thinking.
2912 */
2913 for (it = unified->controllers; it && *it; it++) {
2914 full_len += strlen(*it) + 2;
2915 add_controllers = must_realloc(add_controllers, full_len + 1);
2916
2917 if (unified->controllers[0] == *it)
2918 add_controllers[0] = '\0';
2919
2920 (void)strlcat(add_controllers, "+", full_len + 1);
2921 (void)strlcat(add_controllers, *it, full_len + 1);
2922
2923 if ((it + 1) && *(it + 1))
2924 (void)strlcat(add_controllers, " ", full_len + 1);
2925 }
2926
2927 parts = lxc_string_split(cgroup, '/');
2928 if (!parts)
f761d24d 2929 return false;
c581d2a6
CB
2930
2931 parts_len = lxc_array_len((void **)parts);
2932 if (parts_len > 0)
2933 parts_len--;
2934
2935 base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
2936 for (ssize_t i = -1; i < parts_len; i++) {
2937 int ret;
2938 __do_free char *target = NULL;
2939
2940 if (i >= 0)
2941 base_path = must_append_path(base_path, parts[i], NULL);
2942 target = must_make_path(base_path, "cgroup.subtree_control", NULL);
2943 ret = lxc_writeat(-1, target, add_controllers, full_len);
61fbc369 2944 if (ret < 0)
f761d24d
CB
2945 return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
2946 add_controllers, target);
c581d2a6
CB
2947 TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
2948 }
2949
f761d24d 2950 return true;
c581d2a6
CB
2951}
2952
/* Delegate all detected controllers down to the monitor's cgroup. */
__cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
}
2960
/* Delegate all detected controllers down to the container's (payload) cgroup. */
__cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
}
2968
b7b18fc5
CB
2969static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2970 char **controllers)
2971{
b7b18fc5
CB
2972 if (!ops->cgroup_use)
2973 return true;
2974
431e2c54 2975 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
b7b18fc5
CB
2976 bool found = false;
2977
431e2c54 2978 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
b7b18fc5
CB
2979 if (strcmp(*cur_use, *cur_ctrl) != 0)
2980 continue;
2981
2982 found = true;
2983 break;
2984 }
2985
2986 if (found)
2987 continue;
2988
2989 return false;
2990 }
2991
2992 return true;
2993}
2994
a6ca2ed8
CB
/* cg_unified_delegate - Collect the cgroup files whose ownership must be
 * delegated to an unprivileged container.
 *
 * Reads the kernel-provided list from /sys/kernel/cgroup/delegate and appends
 * each entry (except cgroup.procs, which is always chowned separately) to
 * @delegate. Falls back to a hard-coded standard set when the file cannot be
 * read.
 */
static void cg_unified_delegate(char ***delegate)
{
	__do_free char *buf = NULL;
	char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
	char *token;
	int idx;

	buf = read_file("/sys/kernel/cgroup/delegate");
	if (!buf) {
		/* Kernel file unavailable: use the well-known default set. */
		for (char **p = standard; p && *p; p++) {
			idx = append_null_to_list((void ***)delegate);
			(*delegate)[idx] = must_copy_string(*p);
		}
		SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
		return;
	}

	lxc_iterate_parts (token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2.
		 */
		if (strcmp(token, "cgroup.procs") == 0)
			continue;

		idx = append_null_to_list((void ***)delegate);
		(*delegate)[idx] = must_copy_string(token);
	}
}
3024
2202afc9
CB
3025/* At startup, parse_hierarchies finds all the info we need about cgroup
3026 * mountpoints and current cgroups, and stores it in @d.
3027 */
/* At startup, parse_hierarchies finds all the info we need about cgroup
 * mountpoints and current cgroups, and stores it in @d.
 *
 * Walks /proc/self/mountinfo, classifies each cgroup mount as v1 or v2,
 * records every writable hierarchy in ops->hierarchies, and derives the
 * overall layout (legacy / unified / hybrid). Returns 0 on success, -1 with
 * errno set on failure.
 */
static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
{
	__do_free char *basecginfo = NULL, *line = NULL;
	__do_free_string_list char **klist = NULL, **nlist = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		basecginfo = read_file("/proc/1/cgroup");
	else
		basecginfo = read_file("/proc/self/cgroup");
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);

	while (getline(&line, &len, f) != -1) {
		__do_free char *base_cgroup = NULL, *mountpoint = NULL;
		__do_free_string_list char **controller_list = NULL;
		int type;
		bool writeable;
		struct hierarchy *new;

		/* 0 means not a cgroup mount at all: skip the line. */
		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only one unified hierarchy can exist. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		/* Layout state machine: seeing both v1 and v2 mounts makes
		 * the layout hybrid.
		 */
		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		/* A v1 controller set already recorded from another mount is
		 * skipped (co-mounts show up once per mountpoint).
		 */
		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list)) {
				TRACE("Skipping duplicating controller");
				continue;
			}

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint) {
			ERROR("Failed parsing mountpoint from \"%s\"", line);
			continue;
		}

		if (type == CGROUP_SUPER_MAGIC)
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		else
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		if (!base_cgroup) {
			ERROR("Failed to find current cgroup");
			continue;
		}

		/* Hierarchies we cannot write to are useless to us. */
		trim(base_cgroup);
		prune_init_scope(base_cgroup);
		if (type == CGROUP2_SUPER_MAGIC)
			writeable = test_writeable_v2(mountpoint, base_cgroup);
		else
			writeable = test_writeable_v1(mountpoint, base_cgroup);
		if (!writeable) {
			TRACE("The %s group is not writeable", base_cgroup);
			continue;
		}

		/* For v2 the controller list comes from cgroup.controllers,
		 * not from the mountinfo line.
		 */
		if (type == CGROUP2_SUPER_MAGIC) {
			char *cgv2_ctrl_path;

			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
							"cgroup.controllers",
							NULL);

			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
			free(cgv2_ctrl_path);
			if (!controller_list) {
				controller_list = cg_unified_make_empty_controller();
				TRACE("No controllers are enabled for "
				      "delegation in the unified hierarchy");
			}
		}

		/* Exclude all controllers that cgroup use does not want. */
		if (!cgroup_use_wants_controllers(ops, controller_list)) {
			TRACE("Skipping controller");
			continue;
		}

		/* add_hierarchy takes ownership of the moved arguments. */
		new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
		if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
			if (unprivileged)
				cg_unified_delegate(&new->cgroup2_chown);
			ops->unified = new;
		}
	}

	TRACE("Writable cgroup hierarchies:");
	lxc_cgfsng_print_hierarchies(ops);

	/* verify that all controllers in cgroup.use and all crucial
	 * controllers are accounted for
	 */
	if (!all_controllers_found(ops))
		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");

	return 0;
}
3160
2202afc9 3161/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
9caee129 3162static char *cg_unified_get_current_cgroup(bool relative)
2202afc9 3163{
88396101 3164 __do_free char *basecginfo = NULL;
d7314671 3165 char *copy;
d97919ab 3166 char *base_cgroup;
2202afc9 3167
9caee129 3168 if (!relative && (geteuid() == 0))
2202afc9
CB
3169 basecginfo = read_file("/proc/1/cgroup");
3170 else
3171 basecginfo = read_file("/proc/self/cgroup");
3172 if (!basecginfo)
3173 return NULL;
3174
3175 base_cgroup = strstr(basecginfo, "0::/");
3176 if (!base_cgroup)
d7314671 3177 return NULL;
2202afc9
CB
3178
3179 base_cgroup = base_cgroup + 3;
3180 copy = copy_to_eol(base_cgroup);
3181 if (!copy)
d7314671 3182 return NULL;
2202afc9 3183
d7314671 3184 return trim(copy);
2202afc9
CB
3185}
3186
a6ca2ed8
CB
/* cg_unified_init - Detect and register a pure cgroup2 (unified) layout.
 *
 * Returns CGROUP2_SUPER_MAGIC when a unified hierarchy was set up, 0 when the
 * system is not purely unified (caller then falls back to hybrid init), or a
 * negative errno-style value on error.
 */
static int cg_unified_init(struct cgroup_ops *ops, bool relative,
			   bool unprivileged)
{
	__do_free char *subtree_path = NULL;
	int ret;
	char *mountpoint;
	char **delegatable;
	struct hierarchy *new;
	char *base_cgroup = NULL;

	ret = unified_cgroup_hierarchy();
	if (ret == -ENOMEDIUM)
		return ret_errno(ENOMEDIUM);

	/* Not a pure unified layout: let the hybrid path handle it. */
	if (ret != CGROUP2_SUPER_MAGIC)
		return 0;

	base_cgroup = cg_unified_get_current_cgroup(relative);
	if (!base_cgroup)
		return ret_errno(EINVAL);
	if (!relative)
		prune_init_scope(base_cgroup);

	/*
	 * We assume that the cgroup we're currently in has been delegated to
	 * us and we are free to further delegate all of the controllers listed
	 * in cgroup.controllers further down the hierarchy.
	 */
	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
	delegatable = cg_unified_get_controllers(subtree_path);
	if (!delegatable)
		delegatable = cg_unified_make_empty_controller();
	if (!delegatable[0])
		TRACE("No controllers are enabled for delegation");

	/* TODO: If the user requested specific controllers via lxc.cgroup.use
	 * we should verify here. The reason I'm not doing it right is that I'm
	 * not convinced that lxc.cgroup.use will be the future since it is a
	 * global property. I much rather have an option that lets you request
	 * controllers per container.
	 */

	/* add_hierarchy takes ownership of delegatable, mountpoint and
	 * base_cgroup.
	 */
	new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
	if (unprivileged)
		cg_unified_delegate(&new->cgroup2_chown);

	if (bpf_devices_cgroup_supported())
		new->bpf_device_controller = 1;

	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
	ops->unified = new;

	return CGROUP2_SUPER_MAGIC;
}
3242
341e6516 3243static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2202afc9
CB
3244{
3245 int ret;
3246 const char *tmp;
9caee129 3247 bool relative = conf->cgroup_meta.relative;
2202afc9
CB
3248
3249 tmp = lxc_global_config_value("lxc.cgroup.use");
b7b18fc5 3250 if (tmp) {
88396101 3251 __do_free char *pin = NULL;
d97919ab 3252 char *chop, *cur;
b7b18fc5
CB
3253
3254 pin = must_copy_string(tmp);
3255 chop = pin;
3256
d97919ab 3257 lxc_iterate_parts(cur, chop, ",")
b7b18fc5 3258 must_append_string(&ops->cgroup_use, cur);
b7b18fc5 3259 }
2202afc9 3260
a6ca2ed8 3261 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9 3262 if (ret < 0)
341e6516 3263 return -1;
2202afc9
CB
3264
3265 if (ret == CGROUP2_SUPER_MAGIC)
341e6516 3266 return 0;
2202afc9 3267
a6ca2ed8 3268 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2202afc9
CB
3269}
3270
341e6516 3271__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
2202afc9
CB
3272{
3273 const char *cgroup_pattern;
3274
341e6516
CB
3275 if (!ops)
3276 return ret_set_errno(-1, ENOENT);
3277
2202afc9
CB
3278 /* copy system-wide cgroup information */
3279 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
b3ed2061
CB
3280 if (cgroup_pattern && strcmp(cgroup_pattern, "") != 0)
3281 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2202afc9 3282
341e6516 3283 return 0;
2202afc9
CB
3284}
3285
5a087e05 3286struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2202afc9 3287{
a64edc1c 3288 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2202afc9
CB
3289
3290 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3291 if (!cgfsng_ops)
341e6516 3292 return ret_set_errno(NULL, ENOMEM);
2202afc9
CB
3293
3294 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3295 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3296
341e6516 3297 if (cg_init(cgfsng_ops, conf))
2202afc9 3298 return NULL;
2202afc9
CB
3299
3300 cgfsng_ops->data_init = cgfsng_data_init;
434c8e15
CB
3301 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3302 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
72068e74 3303 cgfsng_ops->monitor_create = cgfsng_monitor_create;
eeef32bb 3304 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
c581d2a6
CB
3305 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3306 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
e8b181f5
CB
3307 cgfsng_ops->payload_create = cgfsng_payload_create;
3308 cgfsng_ops->payload_enter = cgfsng_payload_enter;
78eb6aa6 3309 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
2202afc9
CB
3310 cgfsng_ops->escape = cgfsng_escape;
3311 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3312 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3313 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3314 cgfsng_ops->get = cgfsng_get;
3315 cgfsng_ops->set = cgfsng_set;
942e193e 3316 cgfsng_ops->freeze = cgfsng_freeze;
2202afc9 3317 cgfsng_ops->unfreeze = cgfsng_unfreeze;
c581d2a6 3318 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
2202afc9
CB
3319 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3320 cgfsng_ops->driver = "cgfsng";
3321 cgfsng_ops->version = "1.0.0";
3322 cgfsng_ops->attach = cgfsng_attach;
3323 cgfsng_ops->chown = cgfsng_chown;
3324 cgfsng_ops->mount = cgfsng_mount;
bf651989 3325 cgfsng_ops->devices_activate = cgfsng_devices_activate;
2202afc9 3326
a64edc1c 3327 return move_ptr(cgfsng_ops);
2202afc9 3328}