1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Wolfgang Bumiller <w.bumiller@proxmox.com>
3 Date: Wed, 28 Mar 2018 13:37:28 +0200
4 Subject: [PATCH] PVE: [Up] separate the limiting from the namespaced cgroup
7 When cgroup namespaces are enabled, a privileged container
8 with mixed cgroups has full write access to its own root
9 cgroup, effectively allowing it to overwrite values written
10 from the outside or configured via lxc.cgroup.*.
12 This patch causes an additional 'cgns/' directory to be
13 created in all cgroups if cgroup namespaces and cgfsng are
14 being used in order to combat this.
16 Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
18 src/lxc/cgroups/cgfsng.c | 92 +++++++++++++++++++++++++++++++++++++++---------
19 src/lxc/cgroups/cgroup.h | 18 +++++++---
20 src/lxc/commands.c | 85 +++++++++++++++++++++++++++++++++-----------
21 src/lxc/commands.h | 2 ++
22 src/lxc/criu.c | 4 +--
23 src/lxc/start.c | 28 +++++++++++----
24 6 files changed, 180 insertions(+), 49 deletions(-)
26 diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
27 index 935b868b..9281cee0 100644
28 --- a/src/lxc/cgroups/cgfsng.c
29 +++ b/src/lxc/cgroups/cgfsng.c
30 @@ -818,6 +818,7 @@ static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char
31 new->mountpoint = mountpoint;
32 new->base_cgroup = base_cgroup;
33 new->fullcgpath = NULL;
34 + new->innercgpath = NULL;
37 newentry = append_null_to_list((void ***)h);
38 @@ -1060,6 +1061,9 @@ static int cgroup_rmdir(struct hierarchy **hierarchies,
43 + free(h->innercgpath);
44 + h->innercgpath = NULL;
48 @@ -1071,6 +1075,7 @@ struct generic_userns_exec_data {
49 struct lxc_conf *conf;
50 uid_t origuid; /* target uid in parent namespace */
55 static int cgroup_rmdir_wrapper(void *data)
56 @@ -1112,6 +1117,7 @@ static void cgfsng_destroy(struct cgroup_ops *ops, struct lxc_handler *handler)
57 wrap.container_cgroup = ops->container_cgroup;
58 wrap.hierarchies = ops->hierarchies;
59 wrap.conf = handler->conf;
62 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
63 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
64 @@ -1192,22 +1198,29 @@ on_error:
68 -static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
69 +static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname, bool inner)
74 - h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
75 - if (dir_exists(h->fullcgpath)) {
77 + path = must_make_path(h->fullcgpath, CGROUP_NAMESPACE_SUBDIR, NULL);
78 + h->innercgpath = path;
80 + path = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
81 + h->fullcgpath = path;
83 + if (dir_exists(path)) {
84 ERROR("The cgroup \"%s\" already existed", h->fullcgpath);
88 - if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
89 + if (!inner && !cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
90 ERROR("Failed to handle legacy cpuset controller");
94 - ret = mkdir_p(h->fullcgpath, 0755);
95 + ret = mkdir_p(path, 0755);
97 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
99 @@ -1228,11 +1241,29 @@ static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
100 h->fullcgpath = NULL;
103 +static inline bool cgfsng_create_inner(struct cgroup_ops *ops)
107 + char *cgname = must_make_path(ops->container_cgroup, CGROUP_NAMESPACE_SUBDIR, NULL);
108 + for (i = 0; ops->hierarchies[i]; i++) {
109 + if (!create_path_for_hierarchy(ops->hierarchies[i], cgname, true)) {
110 + SYSERROR("Failed to create %s namespace subdirectory: %s",
111 + ops->hierarchies[i]->fullcgpath, strerror(errno));
120 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
121 * next cgroup_pattern-1, -2, ..., -999.
123 static inline bool cgfsng_create(struct cgroup_ops *ops,
124 - struct lxc_handler *handler)
125 + struct lxc_handler *handler,
130 @@ -1241,10 +1272,17 @@ static inline bool cgfsng_create(struct cgroup_ops *ops,
131 struct lxc_conf *conf = handler->conf;
133 if (ops->container_cgroup) {
135 + return cgfsng_create_inner(ops);
136 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
141 + ERROR("cgfsng_create called twice for inner cgroup");
148 @@ -1285,7 +1323,7 @@ again:
151 for (i = 0; ops->hierarchies[i]; i++) {
152 - if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
153 + if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, false)) {
155 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->fullcgpath);
156 free(ops->hierarchies[i]->fullcgpath);
157 @@ -1307,7 +1345,7 @@ out_free:
161 -static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid)
162 +static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid, bool inner)
166 @@ -1320,8 +1358,13 @@ static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid)
170 - fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
171 - "cgroup.procs", NULL);
173 + fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
174 + CGROUP_NAMESPACE_SUBDIR,
175 + "cgroup.procs", NULL);
177 + fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
178 + "cgroup.procs", NULL);
179 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
181 SYSERROR("Failed to enter cgroup \"%s\"", fullpath);
182 @@ -1395,9 +1438,15 @@ static int chown_cgroup_wrapper(void *data)
184 char *path = arg->hierarchies[i]->fullcgpath;
187 + path = must_make_path(path, CGROUP_NAMESPACE_SUBDIR, NULL);
189 ret = chowmod(path, destuid, nsgid, 0775);
197 /* Failures to chown() these are inconvenient but not
198 * detrimental We leave these owned by the container launcher,
199 @@ -1416,8 +1465,11 @@ static int chown_cgroup_wrapper(void *data)
200 (void)chowmod(fullpath, destuid, nsgid, 0664);
203 - if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
204 + if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC) {
210 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
211 (void)chowmod(fullpath, destuid, nsgid, 0664);
212 @@ -1426,12 +1478,15 @@ static int chown_cgroup_wrapper(void *data)
213 fullpath = must_make_path(path, "cgroup.threads", NULL);
214 (void)chowmod(fullpath, destuid, nsgid, 0664);
224 -static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf)
225 +static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf, bool inner)
227 struct generic_userns_exec_data wrap;
229 @@ -1442,6 +1497,7 @@ static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf)
231 wrap.hierarchies = ops->hierarchies;
233 + wrap.inner = inner;
235 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
236 "chown_cgroup_wrapper") < 0) {
237 @@ -1821,7 +1877,8 @@ static bool cgfsng_unfreeze(struct cgroup_ops *ops)
240 static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
241 - const char *controller)
242 + const char *controller,
247 @@ -1832,6 +1889,9 @@ static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
252 + return h->innercgpath ? h->innercgpath + strlen(h->mountpoint) : NULL;
254 return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
257 @@ -1863,7 +1923,7 @@ static int __cg_unified_attach(const struct hierarchy *h, const char *name,
258 int fret = -1, idx = 0;
259 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
261 - container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
262 + container_cgroup = lxc_cmd_get_attach_cgroup_path(name, lxcpath, controller);
264 if (!container_cgroup)
266 @@ -1943,7 +2003,7 @@ static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
270 - path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
271 + path = lxc_cmd_get_attach_cgroup_path(name, lxcpath, h->controllers[0]);
275 diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
276 index 8f4af06c..b12c1f4c 100644
277 --- a/src/lxc/cgroups/cgroup.h
278 +++ b/src/lxc/cgroups/cgroup.h
281 #include <sys/types.h>
283 +/* When lxc.cgroup.protect_limits is in effect the container's cgroup namespace
284 + * will be moved into an additional subdirectory "cgns/" inside the cgroup in
285 + * order to prevent it from accessing the outer limiting cgroup.
287 +#define CGROUP_NAMESPACE_SUBDIR "cgns"
292 @@ -65,6 +71,9 @@ typedef enum {
294 * - The full path to the containers cgroup.
297 + * - The full path to the container's inner cgroup when protect_limits is used.
301 * If the hierarchy is a legacy hierarchy this will be set to
302 @@ -78,6 +87,7 @@ struct hierarchy {
310 @@ -124,9 +134,9 @@ struct cgroup_ops {
312 bool (*data_init)(struct cgroup_ops *ops);
313 void (*destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
314 - bool (*create)(struct cgroup_ops *ops, struct lxc_handler *handler);
315 - bool (*enter)(struct cgroup_ops *ops, pid_t pid);
316 - const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
317 + bool (*create)(struct cgroup_ops *ops, struct lxc_handler *handler, bool inner);
318 + bool (*enter)(struct cgroup_ops *ops, pid_t pid, bool inner);
319 + const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller, bool inner);
320 bool (*escape)(const struct cgroup_ops *ops);
321 int (*num_hierarchies)(struct cgroup_ops *ops);
322 bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
323 @@ -137,7 +147,7 @@ struct cgroup_ops {
324 bool (*unfreeze)(struct cgroup_ops *ops);
325 bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf,
327 - bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
328 + bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf, bool inner);
329 bool (*attach)(struct cgroup_ops *ops, const char *name,
330 const char *lxcpath, pid_t pid);
331 bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
332 diff --git a/src/lxc/commands.c b/src/lxc/commands.c
333 index 30d6b604..e1bad635 100644
334 --- a/src/lxc/commands.c
335 +++ b/src/lxc/commands.c
336 @@ -424,20 +424,8 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
337 return lxc_cmd_rsp_send(fd, &rsp);
341 - * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
342 - * particular subsystem. This is the cgroup path relative to the root
343 - * of the cgroup filesystem.
345 - * @name : name of container to connect to
346 - * @lxcpath : the lxcpath in which the container is running
347 - * @subsystem : the subsystem being asked about
349 - * Returns the path on success, NULL on failure. The caller must free() the
352 -char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
353 - const char *subsystem)
354 +char *do_lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
355 + const char *subsystem, bool inner)
358 struct lxc_cmd_rr cmd = {
359 @@ -450,8 +438,18 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
361 cmd.req.data = subsystem;
364 - cmd.req.datalen = strlen(subsystem) + 1;
366 + size_t subsyslen = strlen(subsystem);
368 + char *data = alloca(subsyslen+2);
369 + memcpy(data, subsystem, subsyslen+1);
370 + data[subsyslen+1] = 1;
371 + cmd.req.datalen = subsyslen+2,
372 + cmd.req.data = data;
374 + cmd.req.datalen = subsyslen+1;
378 ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL);
380 @@ -466,6 +464,42 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
385 + * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
386 + * particular subsystem. This is the cgroup path relative to the root
387 + * of the cgroup filesystem.
389 + * @name : name of container to connect to
390 + * @lxcpath : the lxcpath in which the container is running
391 + * @subsystem : the subsystem being asked about
393 + * Returns the path on success, NULL on failure. The caller must free() the
396 +char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
397 + const char *subsystem)
399 + return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, false);
403 + * lxc_cmd_get_attach_cgroup_path: Calculate a container's inner cgroup path
404 + * for a particular subsystem. This is the cgroup path relative to the root
405 + * of the cgroup filesystem.
407 + * @name : name of container to connect to
408 + * @lxcpath : the lxcpath in which the container is running
409 + * @subsystem : the subsystem being asked about
411 + * Returns the path on success, NULL on failure. The caller must free() the
414 +char *lxc_cmd_get_attach_cgroup_path(const char *name, const char *lxcpath,
415 + const char *subsystem)
417 + return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, true);
420 static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
421 struct lxc_handler *handler)
423 @@ -473,10 +507,21 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
424 struct lxc_cmd_rsp rsp;
425 struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
427 - if (req->datalen > 0)
428 - path = cgroup_ops->get_cgroup(cgroup_ops, req->data);
430 - path = cgroup_ops->get_cgroup(cgroup_ops, NULL);
431 + if (req->datalen > 0) {
432 + const char *subsystem;
434 + bool inner = false;
435 + subsystem = req->data;
436 + subsyslen = strlen(subsystem);
437 + if (req->datalen == subsyslen+2)
438 + inner = (subsystem[subsyslen+1] == 1);
440 + path = cgroup_ops->get_cgroup(cgroup_ops, req->data, inner);
442 + // FIXME: cgroup separation for cgroup v2 cannot be handled
443 + // like we used to do v1 here... need to figure this out...
444 + path = cgroup_ops->get_cgroup(cgroup_ops, NULL, false);
449 diff --git a/src/lxc/commands.h b/src/lxc/commands.h
450 index 816cd748..e16c0d79 100644
451 --- a/src/lxc/commands.h
452 +++ b/src/lxc/commands.h
453 @@ -93,6 +93,8 @@ extern int lxc_cmd_console(const char *name, int *ttynum, int *fd,
455 extern char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
456 const char *subsystem);
457 +extern char *lxc_cmd_get_attach_cgroup_path(const char *name,
458 + const char *lxcpath, const char *subsystem);
459 extern int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath);
460 extern char *lxc_cmd_get_config_item(const char *name, const char *item, const char *lxcpath);
461 extern char *lxc_cmd_get_name(const char *hashed_sock);
462 diff --git a/src/lxc/criu.c b/src/lxc/criu.c
463 index c3642162..456d19cf 100644
466 @@ -328,7 +328,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct criu_opts *opts)
470 - p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
471 + p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0], false);
473 ERROR("failed to get cgroup path for %s", controllers[0]);
475 @@ -971,7 +971,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
476 goto out_fini_handler;
477 handler->cgroup_ops = cgroup_ops;
479 - if (!cgroup_ops->create(cgroup_ops, handler)) {
480 + if (!cgroup_ops->create(cgroup_ops, handler, false)) {
481 ERROR("failed creating groups");
482 goto out_fini_handler;
484 diff --git a/src/lxc/start.c b/src/lxc/start.c
485 index 739866d8..6944b310 100644
486 --- a/src/lxc/start.c
487 +++ b/src/lxc/start.c
488 @@ -1597,7 +1597,7 @@ static int lxc_spawn(struct lxc_handler *handler)
492 - if (!cgroup_ops->create(cgroup_ops, handler)) {
493 + if (!cgroup_ops->create(cgroup_ops, handler, false)) {
494 ERROR("Failed creating cgroups");
497 @@ -1691,10 +1691,10 @@ static int lxc_spawn(struct lxc_handler *handler)
501 - if (!cgroup_ops->enter(cgroup_ops, handler->pid))
502 + if (!cgroup_ops->enter(cgroup_ops, handler->pid, false))
505 - if (!cgroup_ops->chown(cgroup_ops, handler->conf))
506 + if (!cgroup_ops->chown(cgroup_ops, handler->conf, false))
509 /* Now we're ready to preserve the network namespace */
510 @@ -1755,16 +1755,30 @@ static int lxc_spawn(struct lxc_handler *handler)
514 - ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);
516 - goto out_delete_net;
518 if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) {
519 ERROR("Failed to setup legacy device cgroup controller limits");
522 TRACE("Set up legacy device cgroup controller limits");
524 + if (cgns_supported()) {
525 + if (!cgroup_ops->create(cgroup_ops, handler, true)) {
526 + ERROR("failed to create inner cgroup separation layer");
527 + goto out_delete_net;
529 + if (!cgroup_ops->enter(cgroup_ops, handler->pid, true)) {
530 + ERROR("failed to enter inner cgroup separation layer");
531 + goto out_delete_net;
533 + if (!cgroup_ops->chown(cgroup_ops, handler->conf, true)) {
534 + ERROR("failed chown inner cgroup separation layer");
535 + goto out_delete_net;
539 + if (lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE))
540 + goto out_delete_net;
542 if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
543 /* Now we're ready to preserve the cgroup namespace */
544 ret = lxc_try_preserve_ns(handler->pid, "cgroup");