[mirror_lxcfs.git] / cgroups / cgfsng.c

/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend.  The original cgfs.c was designed to be as flexible
 * as possible.  It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <unistd.h>

#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "macro.h"
#include "memory_utils.h"

/* Release a NULL-terminated array of heap-allocated strings: each element is
 * freed first, then the array itself. A NULL @clist is accepted and ignored.
 */
static void free_string_list(char **clist)
{
	char **cur;

	if (!clist)
		return;

	for (cur = clist; *cur; cur++)
		free(*cur);

	free(clist);
}

/* Grow the NULL-terminated pointer array at *@list by one slot. *@list may be
 * NULL, in which case a fresh two-slot array is allocated. The last slot of
 * the resized array is set to NULL so the list stays terminated; the slot
 * before it is NOT written here and must be filled by the caller.
 *
 * Relies on must_realloc() not failing. Returns the index of the slot the
 * caller should fill.
 */
static int append_null_to_list(void ***list)
{
	int count = 0;

	if (*list) {
		while ((*list)[count])
			count++;
	}

	*list = must_realloc(*list, (count + 2) * sizeof(void **));
	(*list)[count + 1] = NULL;

	return count;
}

/* Report whether @entry compares equal (strcmp) to any string in the
 * NULL-terminated array @list. A NULL @list never matches.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (!list)
		return false;

	for (cur = list; *cur; cur++) {
		if (!strcmp(*cur, entry))
			return true;
	}

	return false;
}

/* Build a newly allocated string consisting of "name=" followed by @entry,
 * e.g. "systemd" becomes "name=systemd". The allocation goes through
 * must_realloc(), so this does not fail. The caller owns the result.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	const size_t prefix_len = STRLITERALLEN("name=");
	size_t entry_len = strlen(entry);
	char *result;

	/* prefix + entry + terminating NUL */
	result = must_realloc(NULL, prefix_len + entry_len + 1);

	memcpy(result, "name=", prefix_len);
	memcpy(result + prefix_len, entry, entry_len);
	result[prefix_len + entry_len] = '\0';

	return result;
}

/* Append a copy of controller @entry to the NULL-terminated list *@clist.
 * *@clist must be NULL on the first call. Does not fail.
 *
 * Named subsystems are handled here as well:
 *  - an entry already starting with "name=" is copied as-is;
 *  - an entry found in the kernel subsystem list @klist is copied as-is;
 *  - anything else is treated as a named subsystem and gets "name=" prefixed;
 *  - an entry found in BOTH @klist and @nlist is silently skipped because we
 *    cannot tell which of the two this mount refers to.
 *    (TODO: We could work around this in some cases by just remounting to be
 *    unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The list remains NULL-terminated.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	bool is_kernel = string_in_list(klist, entry);
	int slot;

	if (is_kernel && string_in_list(nlist, entry))
		return;

	slot = append_null_to_list((void ***)clist);

	if (is_kernel || strncmp(entry, "name=", 5) == 0)
		(*clist)[slot] = must_copy_string(entry);
	else
		(*clist)[slot] = cg_legacy_must_prefix_named(entry);
}

/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops,
					      const char *controller)
{
	int i;

	errno = ENOENT;

	if (!ops->hierarchies)
		return NULL;

	for (i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers &&
			    !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];
			continue;
		} else if (pure_unified_layout(ops) &&
			   strcmp(controller, "devices") == 0) {
			if (ops->unified->bpf_device_controller)
				return ops->unified;
			break;
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	return NULL;
}

/* Thin forwarding wrapper around cgfsng_get_hierarchy(). */
static inline struct hierarchy *get_hierarchy(struct cgroup_ops *ops,
					      const char *controller)
{
	struct hierarchy *found = cgfsng_get_hierarchy(ops, controller);

	return found;
}

/* Return true if at least one string occurs in both NULL-terminated lists
 * @l1 and @l2. If either list is NULL there is no intersection.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	char **cur;

	if (!l1 || !l2)
		return false;

	for (cur = l1; *cur; cur++)
		if (string_in_list(l2, *cur))
			return true;

	return false;
}

/* Return true if any controller in the NULL-terminated list @clist is already
 * claimed by one of the hierarchies in the NULL-terminated list @hlist.
 * Realistically, if one is present, all must be present. A NULL @hlist has no
 * duplicates.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
	struct hierarchy **it;

	if (!hlist)
		return false;

	for (it = hlist; *it; it++) {
		if (controller_lists_intersect((*it)->controllers, clist))
			return true;
	}

	return false;
}

/* Get the controllers from a mountinfo line. There are other ways we could
 * get this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs,
 * we could parse the mount options. But we simply assume that the mountpoint
 * must be DEFAULT_CGROUP_MOUNTPOINT/controller-list.
 *
 * @klist, @nlist: kernel and named subsystem lists, passed through to
 *                 must_append_controller().
 * @line:          one line of /proc/self/mountinfo. It is modified temporarily
 *                 while parsing and restored before returning.
 * @type:          cgroup filesystem type of the mount; controllers are only
 *                 extracted for CGROUP_SUPER_MAGIC.
 * @controllers:   for CGROUP_SUPER_MAGIC mounts, receives ownership of the
 *                 private copy of the controller field that was tokenized.
 *
 * Returns a NULL-terminated list of controller names owned by the caller, or
 * NULL if the line has too few fields, the mountpoint is not below
 * DEFAULT_CGROUP_MOUNTPOINT, @type is not CGROUP_SUPER_MAGIC, or no controller
 * was accepted.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type, char **controllers)
{
	/* Length of "<mountpoint>/", derived from the macro instead of being
	 * hard-coded so the two cannot drift apart.
	 */
	const size_t prefix_len = sizeof(DEFAULT_CGROUP_MOUNTPOINT "/") - 1;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";
	char **aret = NULL;

	/* Skip four space-separated fields; the fifth one is the mountpoint,
	 * DEFAULT_CGROUP_MOUNTPOINT/comma-delimited-controller-list for legacy
	 * hierarchies.
	 */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify the mountpoint prefix in this field.
	 */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", prefix_len) != 0)
		return NULL;

	p += prefix_len;
	p2 = strchr(p, ' ');
	if (!p2)
		return NULL;
	/* Temporarily terminate the field; undone on every path below. */
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* Work on a copy for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup) {
			/* Do not hand a truncated line back to the caller. */
			*p2 = ' ';
			return NULL;
		}

		lxc_iterate_parts (tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);

		/* NOTE(review): dup has been tokenized in place at this point,
		 * so it presumably no longer reads as the full comma-separated
		 * list - confirm that users of *controllers expect this.
		 */
		*controllers = move_ptr(dup);
	}
	*p2 = ' ';

	return aret;
}

/* Allocate a controller list that contains no controllers, i.e. an array
 * whose first entry is already the NULL terminator. The caller owns it.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **empty = NULL;
	int slot = append_null_to_list((void ***)&empty);

	/* The slot handed back is not initialized by append_null_to_list(). */
	empty[slot] = NULL;

	return empty;
}

/* Read @file (e.g. a cgroup.controllers file) and split its contents on
 * spaces, tabs and newlines into a NULL-terminated list of copied strings.
 *
 * Returns NULL if the file cannot be read or yields no tokens; otherwise the
 * caller owns the returned list.
 */
static char **cg_unified_get_controllers(const char *file)
{
	__do_free char *buf = read_file(file);
	char **list = NULL;
	char *tok;

	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, " \t\n") {
		int slot = append_null_to_list((void ***)&list);

		list[slot] = must_copy_string(tok);
	}

	return list;
}

/* Create a hierarchy record and append it to the NULL-terminated list *@h.
 *
 * @h:                   list to extend; may point to NULL for the first entry.
 * @clist:               controller list stored in the new hierarchy.
 * @mountpoint:          mountpoint string stored in the new hierarchy.
 * @container_base_path: base cgroup path stored in the new hierarchy.
 * @type:                cgroup filesystem type, stored as the version.
 *
 * The three pointers are stored as passed, not copied. All other fields of
 * the new hierarchy start out zeroed. Returns the new hierarchy; does not
 * fail, since both allocations go through must_realloc().
 */
static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
				       char *container_base_path, int type)
{
	struct hierarchy *new;
	int newentry;

	/* The record is dereferenced unconditionally below, so allocate it
	 * with the allocator this file treats as infallible and zero it by
	 * hand rather than relying on an unchecked zalloc() result.
	 */
	new = must_realloc(NULL, sizeof(*new));
	memset(new, 0, sizeof(*new));

	new->controllers = clist;
	new->mountpoint = mountpoint;
	new->container_base_path = container_base_path;
	new->version = type;

	newentry = append_null_to_list((void ***)h);
	(*h)[newentry] = new;
	return new;
}

/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 *
 * The mountpoint is the fifth space-separated field and must start with
 * DEFAULT_CGROUP_MOUNTPOINT "/". @line is only read, never modified.
 *
 * Returns a newly allocated string owned by the caller, or NULL if the line
 * has too few fields or the mountpoint is not below DEFAULT_CGROUP_MOUNTPOINT.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	/* Length of "<mountpoint>/", derived from the macro instead of being
	 * hard-coded so the two cannot drift apart.
	 */
	const size_t prefix_len = sizeof(DEFAULT_CGROUP_MOUNTPOINT "/") - 1;
	int i;
	size_t len;
	char *p2;
	char *p = line, *sret = NULL;

	/* Skip the first four fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", prefix_len) != 0)
		return NULL;

	p2 = strchr(p + prefix_len, ' ');
	if (!p2)
		return NULL;

	/* Copy [p, p2) without writing a terminator into the caller's buffer,
	 * so the line is not left truncated after we return.
	 */
	len = (size_t)(p2 - p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';
	return sret;
}

/* Append a copy of @entry to the NULL-terminated string list *@list, which
 * may be NULL for the first element. Does not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}

/* Collect the controllers listed in /proc/self/cgroup.
 *
 * For each line the second colon-separated field is split on commas. Entries
 * starting with "name=" are appended to *@nlist, all others to *@klist. Lines
 * without two colons are skipped.
 *
 * Returns 0 on success and -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *ctrl_start, *ctrl_end, *tok;

		ctrl_start = strchr(line, ':');
		if (!ctrl_start)
			continue;
		ctrl_start++;

		ctrl_end = strchr(ctrl_start, ':');
		if (!ctrl_end)
			continue;
		*ctrl_end = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * The controller field is empty then and we record "cgroup2"
		 * as the controller name.
		 */
		if (ctrl_end == ctrl_start) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		lxc_iterate_parts(tok, ctrl_start, ",") {
			char ***target;

			target = (strncmp(tok, "name=", 5) == 0) ? nlist : klist;
			must_append_string(target, tok);
		}
	}

	return 0;
}

/* Strip trailing newline characters from @s in place. The first character is
 * never removed, so a string consisting only of newlines is reduced to a
 * single "\n" and an empty string is left untouched.
 */
static void trim(char *s)
{
	size_t len;

	for (len = strlen(s); len > 1 && s[len - 1] == '\n'; len--)
		s[len - 1] = '\0';
}

/* __cg_mount_direct
 *
 * Mount the cgroup hierarchy @h directly at @controllerpath without using
 * bind-mounts. The main use-cases are mounting cgroup hierarchies in cgroup
 * namespaces and mounting cgroups for the LXC_AUTO_CGROUP_FULL option.
 *
 * A hierarchy whose version is CGROUP2_SUPER_MAGIC is mounted as "cgroup2"
 * without mount data; any other hierarchy is mounted as "cgroup" with its
 * comma-joined controller list as mount data.
 *
 * Returns 0 on success, -ENOMEM if the controller list cannot be joined and
 * -1 if mount() fails.
 */
static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath)
{
	__do_free char *controllers = NULL;
	const unsigned long flags = MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RELATIME;
	const char *fstype;

	if (h->version == CGROUP2_SUPER_MAGIC) {
		fstype = "cgroup2";
	} else {
		controllers = lxc_string_join(",", (const char **)h->controllers, false);
		if (!controllers)
			return -ENOMEM;

		fstype = "cgroup";
	}

	if (mount("cgroup", controllerpath, fstype, flags, controllers) < 0)
		return -1;

	return 0;
}

/* Mount hierarchy @h at @controllerpath; forwards to __cg_mount_direct() and
 * returns its result unchanged.
 */
static inline int cg_mount_cgroup_full(struct hierarchy *h,
				       const char *controllerpath)
{
	int ret = __cg_mount_direct(h, controllerpath);

	return ret;
}

static bool cgfsng_mount(struct cgroup_ops *ops, const char *root)
{
	__do_free char *cgroup_root = NULL;
	int ret;
	bool retval = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
		return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;

	/* mount tmpfs */
	ret = safe_mount(NULL, cgroup_root, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		goto on_error;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *controllerpath = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(cgroup_root, controller, NULL);
		if (dir_exists(controllerpath))
			continue;

		ret = mkdir(controllerpath, 0755);
		if (ret < 0)
			log_error_errno(goto on_error, errno,
					"Error creating cgroup path: %s",
					controllerpath);

		ret = cg_mount_cgroup_full( h, controllerpath);
		if (ret < 0)
			goto on_error;
	}
	retval = true;

on_error:
	return retval;
}

static int recursive_count_nrtasks(char *dirname)
{
	__do_free char *path = NULL;
	__do_closedir DIR *dir = NULL;
	struct dirent *direntp;
	int count = 0, ret;

	dir = opendir(dirname);
	if (!dir)
		return 0;

	while ((direntp = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		path = must_make_path(dirname, direntp->d_name, NULL);

		if (lstat(path, &mystat))
			continue;

		if (!S_ISDIR(mystat.st_mode))
			continue;

		count += recursive_count_nrtasks(path);
	}

	path = must_make_path(dirname, "cgroup.procs", NULL);
	ret = lxc_count_file_lines(path);
	if (ret != -1)
		count += ret;

	return count;
}

/* Count the tasks below the container's cgroup in the first hierarchy,
 * starting at that hierarchy's container_full_path.
 *
 * A NULL @ops fails with errno ENOENT; a missing container_cgroup or
 * hierarchy list fails with errno EINVAL. Otherwise the result of
 * recursive_count_nrtasks() is returned.
 */
static int cgfsng_nrtasks(struct cgroup_ops *ops)
{
	__do_free char *path = NULL;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	if (!(ops->container_cgroup && ops->hierarchies))
		return ret_set_errno(-1, EINVAL);

	path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);

	return recursive_count_nrtasks(path);
}

/* Return the number of entries in the NULL-terminated hierarchy list of
 * @ops; 0 if there is no list. A NULL @ops fails with errno ENOENT.
 */
static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
{
	int count = 0;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	if (!ops->hierarchies)
		return 0;

	while (ops->hierarchies[count])
		count++;

	return count;
}

/* Fetch the controller list of the @n-th hierarchy (0-based).
 *
 * @ops: cgroup driver state.
 * @n:   index of the hierarchy to look up.
 * @out: on success, receives the hierarchy's controller list. The list is
 *       owned by the hierarchy and is not copied.
 *
 * Returns true on success. Returns false if there is no hierarchy list, with
 * errno ENOENT if @ops is NULL or @n is past the end of the list, and with
 * errno EINVAL if @n is negative.
 */
static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
{
	int i;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return false;

	if (n < 0)
		return ret_set_errno(false, EINVAL);

	/* Sanity check n. The list is only NULL-terminated, so walk it up to
	 * and INCLUDING index n: entry n itself must exist, otherwise we would
	 * dereference the terminator below.
	 */
	for (i = 0; i <= n; i++)
		if (!ops->hierarchies[i])
			return ret_set_errno(false, ENOENT);

	*out = ops->hierarchies[n]->controllers;

	return true;
}

/* Discover the mounted cgroup hierarchies for legacy and hybrid layouts and
 * record them in @ops.
 *
 * Every line of /proc/self/mountinfo that get_cgroup_version() identifies as
 * a cgroup mount is turned into a hierarchy: controllers and mountpoint are
 * parsed from the line, the base cgroup is looked up in /proc/1/cgroup, and
 * ops->cgroup_layout is updated to LEGACY, UNIFIED or HYBRID depending on
 * which filesystem types have been seen. Only the first cgroup2 mount is
 * used, and it becomes ops->unified. Mounts that cannot be parsed or that
 * duplicate already-known controllers are skipped.
 *
 * Returns 0 once the scan has completed. Fails with the value produced by
 * ret_set_errno(-1, ...) / log_error_errno(-1, ...) if /proc/1/cgroup,
 * /proc/self/cgroup or /proc/self/mountinfo cannot be read.
 */
static int cg_hybrid_init(struct cgroup_ops *ops)
{
	__do_free char *basecginfo = NULL;
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;
	char **klist = NULL, **nlist = NULL;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	basecginfo = read_file("/proc/1/cgroup");
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "r");
	if (!f) {
		/* free() may clobber errno, so keep fopen()'s value. */
		int saved_errno = errno;

		/* The subsystem lists are plain pointers without automatic
		 * cleanup; release them here or they leak on this path.
		 */
		free_string_list(klist);
		free_string_list(nlist);
		return log_error_errno(-1, saved_errno, "Failed to open \"/proc/self/mountinfo\"");
	}

	while (getline(&line, &len, f) != -1) {
		int type;
		struct hierarchy *new;
		char *base_cgroup = NULL, *mountpoint = NULL;
		char **controller_list = NULL;
		__do_free char *controllers = NULL;

		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only the first cgroup2 mount is used. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		controller_list = cg_hybrid_get_controllers(klist, nlist, line,
							    type, &controllers);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list))
				ret_set_errno(goto next, EEXIST);

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint)
			log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);

		if (type == CGROUP_SUPER_MAGIC) {
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		} else {
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		}
		if (!base_cgroup)
			log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint);

		trim(base_cgroup);
		prune_init_scope(base_cgroup);

		if (type == CGROUP2_SUPER_MAGIC) {
			char *cgv2_ctrl_path;

			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
							"cgroup.controllers",
							NULL);

			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
			free(cgv2_ctrl_path);
			if (!controller_list)
				controller_list = cg_unified_make_empty_controller();
		}

		/* From here on the hierarchy owns controller_list, mountpoint
		 * and base_cgroup.
		 */
		new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
		new->__controllers = move_ptr(controllers);
		if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
			ops->unified = new;

		continue;

	next:
		/* This mount is skipped: drop what was gathered for it. */
		free_string_list(controller_list);
		free(mountpoint);
		free(base_cgroup);
	}

	free_string_list(klist);
	free_string_list(nlist);

	return 0;
}

/* Set up @ops for a pure unified (cgroup2-only) layout, if that is what
 * unified_cgroup_hierarchy() reports.
 *
 * Returns CGROUP2_SUPER_MAGIC after registering the unified hierarchy, 0 if
 * the layout is not pure unified (nothing is registered), ret_errno(ENOMEDIUM)
 * if unified_cgroup_hierarchy() reports -ENOMEDIUM, and ret_errno(EINVAL) if
 * the current cgroup cannot be determined.
 */
static int cg_unified_init(struct cgroup_ops *ops)
{
	__do_free char *subtree_path = NULL;
	char *base_cgroup = NULL;
	char *mountpoint;
	char **delegatable;
	struct hierarchy *unified;
	int ret;

	ret = unified_cgroup_hierarchy();
	if (ret == -ENOMEDIUM)
		return ret_errno(ENOMEDIUM);

	if (ret != CGROUP2_SUPER_MAGIC)
		return 0;

	base_cgroup = cg_unified_get_current_cgroup(1);
	if (!base_cgroup)
		return ret_errno(EINVAL);

	prune_init_scope(base_cgroup);

	/*
	 * We assume that the cgroup we're currently in has been delegated to
	 * us and that we are free to delegate all of the controllers listed
	 * in cgroup.controllers further down the hierarchy.
	 */
	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);

	delegatable = cg_unified_get_controllers(subtree_path);
	if (!delegatable)
		delegatable = cg_unified_make_empty_controller();

	/* TODO: If the user requested specific controllers via lxc.cgroup.use
	 * we should verify here. The reason I'm not doing it right now is that
	 * I'm not convinced that lxc.cgroup.use will be the future since it is
	 * a global property. I would much rather have an option that lets you
	 * request controllers per container.
	 */

	unified = add_hierarchy(&ops->hierarchies, delegatable, mountpoint,
				base_cgroup, CGROUP2_SUPER_MAGIC);

	if (bpf_devices_cgroup_supported())
		unified->bpf_device_controller = 1;

	ops->unified = unified;
	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;

	return CGROUP2_SUPER_MAGIC;
}

/* Initialize @ops: try the pure unified layout first and fall back to the
 * hybrid/legacy scan if the system is not pure unified.
 *
 * Returns -1 if cg_unified_init() fails, 0 if it set up the unified layout,
 * and otherwise whatever cg_hybrid_init() returns.
 */
static int cg_init(struct cgroup_ops *ops)
{
	int ret = cg_unified_init(ops);

	if (ret < 0)
		return -1;

	return (ret == CGROUP2_SUPER_MAGIC) ? 0 : cg_hybrid_init(ops);
}

/* Allocate and initialize the "cgfsng" cgroup driver.
 *
 * The returned structure starts out zeroed, has its layout detected by
 * cg_init() and its callbacks wired to the implementations in this file.
 * Returns NULL if cg_init() fails or if the allocation fails (errno ENOMEM
 * in the latter case); otherwise the caller owns the returned object.
 */
struct cgroup_ops *cgfsng_ops_init(void)
{
	__do_free struct cgroup_ops *cgfsng_ops = NULL;

	cgfsng_ops = calloc(1, sizeof(*cgfsng_ops));
	if (!cgfsng_ops)
		return ret_set_errno(NULL, ENOMEM);

	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;

	if (cg_init(cgfsng_ops) != 0)
		return NULL;

	/* identification */
	cgfsng_ops->driver = "cgfsng";
	cgfsng_ops->version = "1.0.0";

	/* callbacks */
	cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
	cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
	cgfsng_ops->get_hierarchy = get_hierarchy;
	cgfsng_ops->mount = cgfsng_mount;
	cgfsng_ops->nrtasks = cgfsng_nrtasks;

	return move_ptr(cgfsng_ops);
}
Commit	Line	Data
5fbea8a6 CB	1	/* SPDX-License-Identifier: LGPL-2.1+ */
	2
	3	/*
	4	* cgfs-ng.c: this is a new, simplified implementation of a filesystem
	5	* cgroup backend. The original cgfs.c was designed to be as flexible
	6	* as possible. It would try to find cgroup filesystems no matter where
	7	* or how you had them mounted, and deduce the most usable mount for
	8	* each controller.
	9	*
	10	* This new implementation assumes that cgroup filesystems are mounted
	11	* under /sys/fs/cgroup/clist where clist is either the controller, or
	12	* a comma-separated list of controllers.
	13	*/
	14
	15	#ifndef _GNU_SOURCE
	16	#define _GNU_SOURCE 1
	17	#endif
	18	#include <ctype.h>
	19	#include <dirent.h>
	20	#include <errno.h>
	21	#include <grp.h>
	22	#include <linux/kdev_t.h>
	23	#include <linux/types.h>
	24	#include <poll.h>
	25	#include <signal.h>
	26	#include <stdint.h>
	27	#include <stdio.h>
	28	#include <stdlib.h>
	29	#include <string.h>
	30	#include <sys/mount.h>
	31	#include <sys/types.h>
	32	#include <unistd.h>
	33
	34	#include "cgroup.h"
	35	#include "cgroup2_devices.h"
	36	#include "cgroup_utils.h"
	37	#include "macro.h"
	38	#include "memory_utils.h"
	39
	40	static void free_string_list(char **clist)
	41	{
	42	int i;
	43
	44	if (!clist)
	45	return;
	46
	47	for (i = 0; clist[i]; i++)
	48	free(clist[i]);
	49
	50	free(clist);
	51	}
	52
	53	/* Given a pointer to a null-terminated array of pointers, realloc to add one
	54	* entry, and point the new entry to NULL. Do not fail. Return the index to the
	55	* second-to-last entry - that is, the one which is now available for use
	56	* (keeping the list null-terminated).
	57	*/
	58	static int append_null_to_list(void ***list)
	59	{
	60	int newentry = 0;
	61
	62	if (*list)
	63	for (; (*list)[newentry]; newentry++)
	64	;
65
66	list = must_realloc(list, (newentry + 2) * sizeof(void **));
67	(*list)[newentry + 1] = NULL;
68	return newentry;
69	}
70
71	/* Given a null-terminated array of strings, check whether @entry is one of the
72	* strings.
73	*/
74	static bool string_in_list(char *list, const char entry)
75	{
76	int i;
77
78	if (!list)
79	return false;
80
81	for (i = 0; list[i]; i++)
82	if (strcmp(list[i], entry) == 0)
83	return true;
84
85	return false;
86	}
87
88	/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
89	* "name=systemd". Do not fail.
90	*/
91	static char cg_legacy_must_prefix_named(char entry)
92	{
93	size_t len;
94	char *prefixed;
95
96	len = strlen(entry);
97	prefixed = must_realloc(NULL, len + 6);
98
99	memcpy(prefixed, "name=", STRLITERALLEN("name="));
100	memcpy(prefixed + STRLITERALLEN("name="), entry, len);
101	prefixed[len + 5] = '\0';
102
103	return prefixed;
104	}
105
106	/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
107	* we are called.
108	*
109	* We also handle named subsystems here. Any controller which is not a kernel
110	* subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
111	* we refuse to use because we're not sure which we have here.
112	* (TODO: We could work around this in some cases by just remounting to be
113	* unambiguous, or by comparing mountpoint contents with current cgroup.)
114	*
115	* The last entry will always be NULL.
116	*/
117	static void must_append_controller(char klist, char nlist, char ***clist,
118	char *entry)
119	{
120	int newentry;
121	char *copy;
122
123	if (string_in_list(klist, entry) && string_in_list(nlist, entry))
124	return;
125
126	newentry = append_null_to_list((void ***)clist);
127
128	if (strncmp(entry, "name=", 5) == 0)
129	copy = must_copy_string(entry);
130	else if (string_in_list(klist, entry))
131	copy = must_copy_string(entry);
132	else
133	copy = cg_legacy_must_prefix_named(entry);
134
135	(*clist)[newentry] = copy;
136	}
137
138	/* Given a handler's cgroup data, return the struct hierarchy for the controller
139	* @c, or NULL if there is none.
140	*/
141	static struct hierarchy cgfsng_get_hierarchy(struct cgroup_ops ops,
142	const char *controller)
143	{
144	int i;
145
146	errno = ENOENT;
147
148	if (!ops->hierarchies)
149	return NULL;
150
151	for (i = 0; ops->hierarchies[i]; i++) {
152	if (!controller) {
153	/* This is the empty unified hierarchy. */
154	if (ops->hierarchies[i]->controllers &&
155	!ops->hierarchies[i]->controllers[0])
156	return ops->hierarchies[i];
157	continue;
158	} else if (pure_unified_layout(ops) &&
159	strcmp(controller, "devices") == 0) {
160	if (ops->unified->bpf_device_controller)
161	return ops->unified;
162	break;
163	}
164
165	if (string_in_list(ops->hierarchies[i]->controllers, controller))
166	return ops->hierarchies[i];
167	}
168
169	return NULL;
170	}
171
172	static inline struct hierarchy get_hierarchy(struct cgroup_ops ops,
173	const char *controller)
174	{
175	return cgfsng_get_hierarchy(ops, controller);
176	}
177
178	/* Given two null-terminated lists of strings, return true if any string is in
179	* both.
180	*/
181	static bool controller_lists_intersect(char l1, char l2)
182	{
183	int i;
184
185	if (!l1 \|\| !l2)
186	return false;
187
188	for (i = 0; l1[i]; i++) {
189	if (string_in_list(l2, l1[i]))
190	return true;
191	}
192
193	return false;
194	}
195
196	/* For a null-terminated list of controllers @clist, return true if any of those
197	* controllers is already listed the null-terminated list of hierarchies @hlist.
198	* Realistically, if one is present, all must be present.
199	*/
200	static bool controller_list_is_dup(struct hierarchy hlist, char clist)
201	{
202	int i;
203
204	if (!hlist)
205	return false;
206
207	for (i = 0; hlist[i]; i++)
208	if (controller_lists_intersect(hlist[i]->controllers, clist))
209	return true;
210
211	return false;
212	}
213
214	/* Get the controllers from a mountinfo line There are other ways we could get
215	* this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
216	* could parse the mount options. But we simply assume that the mountpoint must
217	* be /sys/fs/cgroup/controller-list
218	*/
219	static char cg_hybrid_get_controllers(char klist, char *nlist, char line,
220	int type, char **controllers)
221	{
222	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
223	* for legacy hierarchies.
224	*/
225	int i;
226	char p2, tok;
227	char p = line, sep = ",";
228	char **aret = NULL;
229
230	for (i = 0; i < 4; i++) {
231	p = strchr(p, ' ');
232	if (!p)
233	return NULL;
234	p++;
235	}
236
237	/* Note, if we change how mountinfo works, then our caller will need to
238	* verify /sys/fs/cgroup/ in this field.
239	*/
240	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
241	return NULL;
242
243	p += 15;
244	p2 = strchr(p, ' ');
245	if (!p2)
246	return NULL;
247	*p2 = '\0';
248
249	if (type == CGROUP_SUPER_MAGIC) {
250	__do_free char *dup = NULL;
251
252	/* strdup() here for v1 hierarchies. Otherwise
253	* lxc_iterate_parts() will destroy mountpoints such as
254	* "/sys/fs/cgroup/cpu,cpuacct".
255	*/
256	dup = must_copy_string(p);
257	if (!dup)
258	return NULL;
259
260	lxc_iterate_parts (tok, dup, sep)
261	must_append_controller(klist, nlist, &aret, tok);
262	*controllers = move_ptr(dup);
263	}
264	*p2 = ' ';
265
266	return aret;
267	}
268
269	static char **cg_unified_make_empty_controller(void)
270	{
271	int newentry;
272	char **aret = NULL;
273
274	newentry = append_null_to_list((void ***)&aret);
275	aret[newentry] = NULL;
276	return aret;
277	}
278
279	static char *cg_unified_get_controllers(const char file)
280	{
281	__do_free char *buf = NULL;
282	char *sep = " \t\n";
283	char **aret = NULL;
284	char *tok;
285
286	buf = read_file(file);
287	if (!buf)
288	return NULL;
289
290	lxc_iterate_parts(tok, buf, sep) {
291	int newentry;
292	char *copy;
293
294	newentry = append_null_to_list((void ***)&aret);
295	copy = must_copy_string(tok);
296	aret[newentry] = copy;
297	}
298
299	return aret;
300	}
301
302	static struct hierarchy add_hierarchy(struct hierarchy h, char clist, char *mountpoint,
303	char *container_base_path, int type)
304	{
305	struct hierarchy *new;
306	int newentry;
307
308	new = zalloc(sizeof(*new));
309	new->controllers = clist;
310	new->mountpoint = mountpoint;
311	new->container_base_path = container_base_path;
312	new->version = type;
313
314	newentry = append_null_to_list((void ***)h);
315	(*h)[newentry] = new;
316	return new;
317	}
318
319	/* Get a copy of the mountpoint from @line, which is a line from
320	* /proc/self/mountinfo.
321	*/
322	static char cg_hybrid_get_mountpoint(char line)
323	{
324	int i;
325	size_t len;
326	char *p2;
327	char p = line, sret = NULL;
328
329	for (i = 0; i < 4; i++) {
330	p = strchr(p, ' ');
331	if (!p)
332	return NULL;
333	p++;
334	}
335
336	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
337	return NULL;
338
339	p2 = strchr(p + 15, ' ');
340	if (!p2)
341	return NULL;
342	*p2 = '\0';
343
344	len = strlen(p);
345	sret = must_realloc(NULL, len + 1);
346	memcpy(sret, p, len);
347	sret[len] = '\0';
348	return sret;
349	}
350
351	static void must_append_string(char **list, char entry)
352	{
353	int newentry;
354	char *copy;
355
356	newentry = append_null_to_list((void ***)list);
357	copy = must_copy_string(entry);
358	(*list)[newentry] = copy;
359	}
360
361	static int get_existing_subsystems(char *klist, char *nlist)
362	{
363	__do_free char *line = NULL;
364	__do_fclose FILE *f = NULL;
365	size_t len = 0;
366
367	f = fopen("/proc/self/cgroup", "r");
368	if (!f)
369	return -1;
370
371	while (getline(&line, &len, f) != -1) {
372	char p, p2, *tok;
373	p = strchr(line, ':');
374	if (!p)
375	continue;
376	p++;
377	p2 = strchr(p, ':');
378	if (!p2)
379	continue;
380	*p2 = '\0';
381
382	/* If the kernel has cgroup v2 support, then /proc/self/cgroup
383	* contains an entry of the form:
384	*
385	* 0::/some/path
386	*
387	* In this case we use "cgroup2" as controller name.
388	*/
389	if ((p2 - p) == 0) {
390	must_append_string(klist, "cgroup2");
391	continue;
392	}
393
394	lxc_iterate_parts(tok, p, ",") {
395	if (strncmp(tok, "name=", 5) == 0)
396	must_append_string(nlist, tok);
397	else
398	must_append_string(klist, tok);
399	}
400	}
401
402	return 0;
403	}
404
405	static void trim(char *s)
406	{
407	size_t len;
408
409	len = strlen(s);
410	while ((len > 1) && (s[len - 1] == '\n'))
411	s[--len] = '\0';
412	}
413
414	/* __cg_mount_direct
415	*
416	* Mount cgroup hierarchies directly without using bind-mounts. The main
417	* uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
418	* cgroups for the LXC_AUTO_CGROUP_FULL option.
419	*/
420	static int __cg_mount_direct(struct hierarchy h, const char controllerpath)
421	{
422	__do_free char *controllers = NULL;
423	char *fstype = "cgroup2";
424	unsigned long flags = 0;
425	int ret;
426
427	flags \|= MS_NOSUID;
428	flags \|= MS_NOEXEC;
429	flags \|= MS_NODEV;
430	flags \|= MS_RELATIME;
431
432	if (h->version != CGROUP2_SUPER_MAGIC) {
433	controllers = lxc_string_join(",", (const char **)h->controllers, false);
434	if (!controllers)
435	return -ENOMEM;
436	fstype = "cgroup";
437	}
438
439	ret = mount("cgroup", controllerpath, fstype, flags, controllers);
440	if (ret < 0)
441	return -1;
442
443	return 0;
444	}
445
446	static inline int cg_mount_cgroup_full(struct hierarchy *h,
447	const char *controllerpath)
448	{
449	return __cg_mount_direct(h, controllerpath);
450	}
451
452	static bool cgfsng_mount(struct cgroup_ops ops, const char root)
453	{
454	__do_free char *cgroup_root = NULL;
455	int ret;
456	bool retval = false;
457
458	if (!ops)
459	return ret_set_errno(false, ENOENT);
460
461	if (!ops->hierarchies)
462	return true;
463
464	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
465	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
466	return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;
467
468	/* mount tmpfs */
469	ret = safe_mount(NULL, cgroup_root, "tmpfs",
470	MS_NOSUID \| MS_NODEV \| MS_NOEXEC \| MS_RELATIME,
471	"size=10240k,mode=755", root);
472	if (ret < 0)
473	goto on_error;
474
475	for (int i = 0; ops->hierarchies[i]; i++) {
476	__do_free char *controllerpath = NULL;
477	struct hierarchy *h = ops->hierarchies[i];
478	char *controller = strrchr(h->mountpoint, '/');
479
480	if (!controller)
481	continue;
482	controller++;
483
484	controllerpath = must_make_path(cgroup_root, controller, NULL);
485	if (dir_exists(controllerpath))
486	continue;
487
488	ret = mkdir(controllerpath, 0755);
489	if (ret < 0)
490	log_error_errno(goto on_error, errno,
491	"Error creating cgroup path: %s",
492	controllerpath);
493
494	ret = cg_mount_cgroup_full( h, controllerpath);
495	if (ret < 0)
496	goto on_error;
497	}
498	retval = true;
499
500	on_error:
501	return retval;
502	}
503
504	static int recursive_count_nrtasks(char *dirname)
505	{
506	__do_free char *path = NULL;
507	__do_closedir DIR *dir = NULL;
508	struct dirent *direntp;
509	int count = 0, ret;
510
511	dir = opendir(dirname);
512	if (!dir)
513	return 0;
514
515	while ((direntp = readdir(dir))) {
516	struct stat mystat;
517
518	if (!strcmp(direntp->d_name, ".") \|\|
519	!strcmp(direntp->d_name, ".."))
520	continue;
521
522	path = must_make_path(dirname, direntp->d_name, NULL);
523
524	if (lstat(path, &mystat))
525	continue;
526
527	if (!S_ISDIR(mystat.st_mode))
528	continue;
529
530	count += recursive_count_nrtasks(path);
531	}
532
533	path = must_make_path(dirname, "cgroup.procs", NULL);
534	ret = lxc_count_file_lines(path);
535	if (ret != -1)
536	count += ret;
537
538	return count;
539	}
540
541	static int cgfsng_nrtasks(struct cgroup_ops *ops)
542	{
543	__do_free char *path = NULL;
544
545	if (!ops)
546	return ret_set_errno(-1, ENOENT);
547
548	if (!ops->container_cgroup \|\| !ops->hierarchies)
549	return ret_set_errno(-1, EINVAL);
550
551	path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
552	return recursive_count_nrtasks(path);
553	}
554
555	static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
556	{
557	int i = 0;
558
559	if (!ops)
560	return ret_set_errno(-1, ENOENT);
561
562	if (!ops->hierarchies)
563	return 0;
564
565	for (; ops->hierarchies[i]; i++)
566	;
567
568	return i;
569	}
570
571	static bool cgfsng_get_hierarchies(struct cgroup_ops ops, int n, char **out)
572	{
573	int i;
574
575	if (!ops)
576	return ret_set_errno(false, ENOENT);
577
578	if (!ops->hierarchies)
579	return false;
580
581	/* sanity check n */
582	for (i = 0; i < n; i++)
583	if (!ops->hierarchies[i])
584	return ret_set_errno(false, ENOENT);
585
586	*out = ops->hierarchies[i]->controllers;
587
588	return true;
589	}
590
591	/* At startup, parse_hierarchies finds all the info we need about cgroup
592	* mountpoints and current cgroups, and stores it in @d.
593	*/
594	static int cg_hybrid_init(struct cgroup_ops *ops)
595	{
596	__do_free char *basecginfo = NULL;
597	__do_free char *line = NULL;
598	__do_fclose FILE *f = NULL;
599	int ret;
600	size_t len = 0;
601	char klist = NULL, nlist = NULL;
602
603	/* Root spawned containers escape the current cgroup, so use init's
604	* cgroups as our base in that case.
605	*/
606	basecginfo = read_file("/proc/1/cgroup");
607	if (!basecginfo)
608	return ret_set_errno(-1, ENOMEM);
609
610	ret = get_existing_subsystems(&klist, &nlist);
611	if (ret < 0)
612	return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
613
614	f = fopen("/proc/self/mountinfo", "r");
615	if (!f)
616	return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
617
618	while (getline(&line, &len, f) != -1) {
619	int type;
620	struct hierarchy *new;
621	char base_cgroup = NULL, mountpoint = NULL;
622	char **controller_list = NULL;
623	__do_free char *controllers = NULL;
624
625	type = get_cgroup_version(line);
626	if (type == 0)
627	continue;
628
629	if (type == CGROUP2_SUPER_MAGIC && ops->unified)
630	continue;
631
632	if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
633	if (type == CGROUP2_SUPER_MAGIC)
634	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
635	else if (type == CGROUP_SUPER_MAGIC)
636	ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
637	} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
638	if (type == CGROUP_SUPER_MAGIC)
639	ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
640	} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
641	if (type == CGROUP2_SUPER_MAGIC)
642	ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
643	}
644
645	controller_list = cg_hybrid_get_controllers(klist, nlist, line,
646	type, &controllers);
647	if (!controller_list && type == CGROUP_SUPER_MAGIC)
648	continue;
649
650	if (type == CGROUP_SUPER_MAGIC)
651	if (controller_list_is_dup(ops->hierarchies, controller_list))
652	ret_set_errno(goto next, EEXIST);
653
654	mountpoint = cg_hybrid_get_mountpoint(line);
655	if (!mountpoint)
656	log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
657
658	if (type == CGROUP_SUPER_MAGIC) {
659	base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
660	} else {
661	base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
662	}
663	if (!base_cgroup)
664	log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint);
665
666	trim(base_cgroup);
667	prune_init_scope(base_cgroup);
668
669	if (type == CGROUP2_SUPER_MAGIC) {
670	char *cgv2_ctrl_path;
671
672	cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
673	"cgroup.controllers",
674	NULL);
675
676	controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
677	free(cgv2_ctrl_path);
678	if (!controller_list)
679	controller_list = cg_unified_make_empty_controller();
680	}
681
682	new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
683	new->__controllers = move_ptr(controllers);
684	if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
685	ops->unified = new;
686
687	continue;
688
689	next:
690	free_string_list(controller_list);
691	free(mountpoint);
692	free(base_cgroup);
693	}
694
695	free_string_list(klist);
696	free_string_list(nlist);
697
698	return 0;
699	}
700
701	static int cg_unified_init(struct cgroup_ops *ops)
702	{
703	__do_free char *subtree_path = NULL;
704	int ret;
705	char *mountpoint;
706	char **delegatable;
707	struct hierarchy *new;
708	char *base_cgroup = NULL;
709
710	ret = unified_cgroup_hierarchy();
711	if (ret == -ENOMEDIUM)
712	return ret_errno(ENOMEDIUM);
713
714	if (ret != CGROUP2_SUPER_MAGIC)
715	return 0;
716
717	base_cgroup = cg_unified_get_current_cgroup(1);
718	if (!base_cgroup)
719	return ret_errno(EINVAL);
720	prune_init_scope(base_cgroup);
721
722	/*
723	* We assume that the cgroup we're currently in has been delegated to
724	* us and we are free to further delege all of the controllers listed
725	* in cgroup.controllers further down the hierarchy.
726	*/
727	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
728	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
729	delegatable = cg_unified_get_controllers(subtree_path);
730	if (!delegatable)
731	delegatable = cg_unified_make_empty_controller();
732
733	/* TODO: If the user requested specific controllers via lxc.cgroup.use
734	* we should verify here. The reason I'm not doing it right is that I'm
735	* not convinced that lxc.cgroup.use will be the future since it is a
736	* global property. I much rather have an option that lets you request
737	* controllers per container.
738	*/
739
740	new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
741
742	if (bpf_devices_cgroup_supported())
743	new->bpf_device_controller = 1;
744
745	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
746	ops->unified = new;
747	return CGROUP2_SUPER_MAGIC;
748	}
749
750	static int cg_init(struct cgroup_ops *ops)
751	{
752	int ret;
753
754	ret = cg_unified_init(ops);
755	if (ret < 0)
756	return -1;
757
758	if (ret == CGROUP2_SUPER_MAGIC)
759	return 0;
760
761	return cg_hybrid_init(ops);
762	}
763
764	struct cgroup_ops *cgfsng_ops_init(void)
765	{
766	__do_free struct cgroup_ops *cgfsng_ops = NULL;
767
768	cgfsng_ops = malloc(sizeof(struct cgroup_ops));
769	if (!cgfsng_ops)
770	return ret_set_errno(NULL, ENOMEM);
771
772	memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
773	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
774
775	if (cg_init(cgfsng_ops))
776	return NULL;
777
778	cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
779	cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
780	cgfsng_ops->get_hierarchy = get_hierarchy;
781	cgfsng_ops->driver = "cgfsng";
782	cgfsng_ops->version = "1.0.0";
783	cgfsng_ops->mount = cgfsng_mount;
784	cgfsng_ops->nrtasks = cgfsng_nrtasks;
785
786	return move_ptr(cgfsng_ops);
787	}