1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Wolfgang Bumiller <w.bumiller@proxmox.com>
3 Date: Wed, 28 Mar 2018 13:37:28 +0200
4 Subject: [PATCH] PVE: [Up] separate the limiting from the namespaced cgroup
7 When cgroup namespaces are enabled, a privileged container
8 with mixed cgroups has full write access to its own root
9 cgroup, effectively allowing it to overwrite values written
10 from the outside or configured via lxc.cgroup.*.
12 This patch causes an additional 'cgns/' directory to be
13 created in all cgroups if cgroup namespaces and cgfsng are
14 being used in order to combat this.
16 Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
18 src/lxc/cgroups/cgfsng.c | 94 +++++++++++++++++++++++++++++++++-------
19 src/lxc/cgroups/cgroup.h | 18 ++++++--
20 src/lxc/commands.c | 87 ++++++++++++++++++++++++++++---------
21 src/lxc/commands.h | 2 +
23 src/lxc/start.c | 28 +++++++++---
24 6 files changed, 183 insertions(+), 50 deletions(-)
26 diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
27 index ab99b47c5..ac8f469bb 100644
28 --- a/src/lxc/cgroups/cgfsng.c
29 +++ b/src/lxc/cgroups/cgfsng.c
30 @@ -818,6 +818,7 @@ static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char
31 new->mountpoint = mountpoint;
32 new->container_base_path = container_base_path;
33 new->container_full_path = NULL;
34 + new->container_inner_path = NULL;
35 new->monitor_full_path = NULL;
38 @@ -1059,6 +1060,9 @@ static int cgroup_rmdir(struct hierarchy **hierarchies,
40 free(h->container_full_path);
41 h->container_full_path = NULL;
43 + free(h->container_inner_path);
44 + h->container_inner_path = NULL;
48 @@ -1070,6 +1074,7 @@ struct generic_userns_exec_data {
49 struct lxc_conf *conf;
50 uid_t origuid; /* target uid in parent namespace */
55 static int cgroup_rmdir_wrapper(void *data)
56 @@ -1112,6 +1117,7 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
57 wrap.container_cgroup = ops->container_cgroup;
58 wrap.hierarchies = ops->hierarchies;
59 wrap.conf = handler->conf;
62 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
63 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
64 @@ -1323,17 +1329,26 @@ static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
65 return cg_unified_create_cgroup(h, cgname);
68 -static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
69 +static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname, bool inner)
74 - if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
75 + if (!inner && !cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
76 ERROR("Failed to handle legacy cpuset controller");
80 - h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
81 - ret = mkdir_eexist_on_last(h->container_full_path, 0755);
83 + path = must_make_path(h->container_full_path, CGROUP_NAMESPACE_SUBDIR, NULL);
84 + h->container_inner_path = path;
85 + ret = mkdir(path, 0755);
87 + path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
88 + h->container_full_path = path;
89 + ret = mkdir_eexist_on_last(path, 0755);
93 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
95 @@ -1425,11 +1440,29 @@ on_error:
99 +static inline bool cgfsng_create_inner(struct cgroup_ops *ops)
103 + char *cgname = must_make_path(ops->container_cgroup, CGROUP_NAMESPACE_SUBDIR, NULL);
104 + for (i = 0; ops->hierarchies[i]; i++) {
105 + if (!container_create_path_for_hierarchy(ops->hierarchies[i], cgname, true)) {
106 + SYSERROR("Failed to create %s namespace subdirectory: %s",
107 + ops->hierarchies[i]->container_full_path, strerror(errno));
116 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
117 * next cgroup_pattern-1, -2, ..., -999.
119 __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
120 - struct lxc_handler *handler)
121 + struct lxc_handler *handler,
126 @@ -1438,10 +1471,17 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
127 struct lxc_conf *conf = handler->conf;
129 if (ops->container_cgroup) {
131 + return cgfsng_create_inner(ops);
132 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
137 + ERROR("cgfsng_create called twice for inner cgroup");
144 @@ -1482,7 +1522,7 @@ again:
147 for (i = 0; ops->hierarchies[i]; i++) {
148 - if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
149 + if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, false)) {
150 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path);
151 free(ops->hierarchies[i]->container_full_path);
152 ops->hierarchies[i]->container_full_path = NULL;
153 @@ -1505,7 +1545,8 @@ out_free:
156 __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
162 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
163 @@ -1521,6 +1562,9 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
165 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
166 "cgroup.procs", NULL);
168 + path = must_make_path(ops->hierarchies[i]->container_inner_path,
169 + "cgroup.procs", NULL);
171 path = must_make_path(ops->hierarchies[i]->container_full_path,
172 "cgroup.procs", NULL);
173 @@ -1538,12 +1582,12 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
175 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
177 - return __do_cgroup_enter(ops, pid, true);
178 + return __do_cgroup_enter(ops, pid, true, false);
181 -static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
182 +static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid, bool inner)
184 - return __do_cgroup_enter(ops, pid, false);
185 + return __do_cgroup_enter(ops, pid, false, inner);
188 static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
189 @@ -1609,9 +1653,15 @@ static int chown_cgroup_wrapper(void *data)
191 char *path = arg->hierarchies[i]->container_full_path;
194 + path = must_make_path(path, CGROUP_NAMESPACE_SUBDIR, NULL);
196 ret = chowmod(path, destuid, nsgid, 0775);
204 /* Failures to chown() these are inconvenient but not
205 * detrimental We leave these owned by the container launcher,
206 @@ -1630,8 +1680,11 @@ static int chown_cgroup_wrapper(void *data)
207 (void)chowmod(fullpath, destuid, nsgid, 0664);
210 - if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
211 + if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC) {
217 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
218 (void)chowmod(fullpath, destuid, nsgid, 0664);
219 @@ -1640,13 +1693,17 @@ static int chown_cgroup_wrapper(void *data)
220 fullpath = must_make_path(path, "cgroup.threads", NULL);
221 (void)chowmod(fullpath, destuid, nsgid, 0664);
231 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
232 - struct lxc_conf *conf)
233 + struct lxc_conf *conf,
236 struct generic_userns_exec_data wrap;
238 @@ -1657,6 +1714,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
240 wrap.hierarchies = ops->hierarchies;
242 + wrap.inner = inner;
244 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
245 "chown_cgroup_wrapper") < 0) {
246 @@ -2038,7 +2096,8 @@ __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
249 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
250 - const char *controller)
251 + const char *controller,
256 @@ -2049,6 +2108,9 @@ __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
261 + return h->container_inner_path ? h->container_inner_path + strlen(h->mountpoint) : NULL;
263 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
266 @@ -2080,7 +2142,7 @@ static int __cg_unified_attach(const struct hierarchy *h, const char *name,
267 int fret = -1, idx = 0;
268 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
270 - container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
271 + container_cgroup = lxc_cmd_get_attach_cgroup_path(name, lxcpath, controller);
273 if (!container_cgroup)
275 @@ -2161,7 +2223,7 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
279 - path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
280 + path = lxc_cmd_get_attach_cgroup_path(name, lxcpath, h->controllers[0]);
284 diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
285 index d4dcd506b..59445b5a5 100644
286 --- a/src/lxc/cgroups/cgroup.h
287 +++ b/src/lxc/cgroups/cgroup.h
289 #define MONITOR_CGROUP "lxc.monitor"
290 #define PIVOT_CGROUP "lxc.pivot"
292 +/* When lxc.cgroup.protect_limits is in effect the container's cgroup namespace
293 + * will be moved into an additional subdirectory "cgns/" inside the cgroup in
294 + * order to prevent it from accessing the outer limiting cgroup.
296 +#define CGROUP_NAMESPACE_SUBDIR "cgns"
301 @@ -72,6 +78,9 @@ typedef enum {
303 * - The full path to the monitor's cgroup.
305 + * @container_inner_path
306 + * - The full path to the container's inner cgroup when protect_limits is used.
310 * If the hierarchy is a legacy hierarchy this will be set to
311 @@ -85,6 +94,7 @@ struct hierarchy {
313 char *container_base_path;
314 char *container_full_path;
315 + char *container_inner_path;
316 char *monitor_full_path;
319 @@ -139,9 +149,9 @@ struct cgroup_ops {
320 void (*monitor_destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
321 bool (*monitor_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
322 bool (*monitor_enter)(struct cgroup_ops *ops, pid_t pid);
323 - bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
324 - bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid);
325 - const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
326 + bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler, bool inner);
327 + bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid, bool inner);
328 + const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller, bool inner);
329 bool (*escape)(const struct cgroup_ops *ops, struct lxc_conf *conf);
330 int (*num_hierarchies)(struct cgroup_ops *ops);
331 bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
332 @@ -152,7 +162,7 @@ struct cgroup_ops {
333 bool (*unfreeze)(struct cgroup_ops *ops);
334 bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf,
336 - bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
337 + bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf, bool inner);
338 bool (*attach)(struct cgroup_ops *ops, const char *name,
339 const char *lxcpath, pid_t pid);
340 bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
341 diff --git a/src/lxc/commands.c b/src/lxc/commands.c
342 index 133384d72..b41a76000 100644
343 --- a/src/lxc/commands.c
344 +++ b/src/lxc/commands.c
345 @@ -427,20 +427,8 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
346 return lxc_cmd_rsp_send(fd, &rsp);
350 - * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
351 - * particular subsystem. This is the cgroup path relative to the root
352 - * of the cgroup filesystem.
354 - * @name : name of container to connect to
355 - * @lxcpath : the lxcpath in which the container is running
356 - * @subsystem : the subsystem being asked about
358 - * Returns the path on success, NULL on failure. The caller must free() the
361 -char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
362 - const char *subsystem)
363 +char *do_lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
364 + const char *subsystem, bool inner)
367 struct lxc_cmd_rr cmd = {
368 @@ -453,8 +441,18 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
370 cmd.req.data = subsystem;
373 - cmd.req.datalen = strlen(subsystem) + 1;
375 + size_t subsyslen = strlen(subsystem);
377 + char *data = alloca(subsyslen+2);
378 + memcpy(data, subsystem, subsyslen+1);
379 + data[subsyslen+1] = 1;
380 + cmd.req.datalen = subsyslen+2,
381 + cmd.req.data = data;
383 + cmd.req.datalen = subsyslen+1;
387 ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL);
389 @@ -469,6 +467,42 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
394 + * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
395 + * particular subsystem. This is the cgroup path relative to the root
396 + * of the cgroup filesystem.
398 + * @name : name of container to connect to
399 + * @lxcpath : the lxcpath in which the container is running
400 + * @subsystem : the subsystem being asked about
402 + * Returns the path on success, NULL on failure. The caller must free() the
405 +char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
406 + const char *subsystem)
408 + return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, false);
412 + * lxc_cmd_get_attach_cgroup_path: Calculate a container's inner cgroup path
413 + * for a particular subsystem. This is the cgroup path relative to the root
414 + * of the cgroup filesystem.
416 + * @name : name of container to connect to
417 + * @lxcpath : the lxcpath in which the container is running
418 + * @subsystem : the subsystem being asked about
420 + * Returns the path on success, NULL on failure. The caller must free() the
423 +char *lxc_cmd_get_attach_cgroup_path(const char *name, const char *lxcpath,
424 + const char *subsystem)
426 + return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, true);
429 static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
430 struct lxc_handler *handler)
432 @@ -476,10 +510,21 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
433 struct lxc_cmd_rsp rsp;
434 struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
436 - if (req->datalen > 0)
437 - path = cgroup_ops->get_cgroup(cgroup_ops, req->data);
439 - path = cgroup_ops->get_cgroup(cgroup_ops, NULL);
440 + if (req->datalen > 0) {
441 + const char *subsystem;
443 + bool inner = false;
444 + subsystem = req->data;
445 + subsyslen = strlen(subsystem);
446 + if (req->datalen == subsyslen+2)
447 + inner = (subsystem[subsyslen+1] == 1);
449 + path = cgroup_ops->get_cgroup(cgroup_ops, req->data, inner);
451 + // FIXME: cgroup separation for cgroup v2 cannot be handled the
452 + // way it is handled for v1 here; this still needs to be figured out.
453 + path = cgroup_ops->get_cgroup(cgroup_ops, NULL, false);
458 @@ -651,7 +696,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req,
459 * lxc_unfreeze() would do another cmd (GET_CGROUP) which would
462 - if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer"))
463 + if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer", false))
466 if (cgroup_ops->unfreeze(cgroup_ops))
467 diff --git a/src/lxc/commands.h b/src/lxc/commands.h
468 index 2c024b65d..7c4c00b1e 100644
469 --- a/src/lxc/commands.h
470 +++ b/src/lxc/commands.h
471 @@ -88,6 +88,8 @@ extern int lxc_cmd_console(const char *name, int *ttynum, int *fd,
473 extern char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
474 const char *subsystem);
475 +extern char *lxc_cmd_get_attach_cgroup_path(const char *name,
476 + const char *lxcpath, const char *subsystem);
477 extern int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath);
478 extern char *lxc_cmd_get_config_item(const char *name, const char *item, const char *lxcpath);
479 extern char *lxc_cmd_get_name(const char *hashed_sock);
480 diff --git a/src/lxc/criu.c b/src/lxc/criu.c
481 index 3d857b541..ec9bcb7e4 100644
484 @@ -332,7 +332,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
488 - p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
489 + p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0], false);
491 ERROR("failed to get cgroup path for %s", controllers[0]);
493 @@ -976,7 +976,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
494 goto out_fini_handler;
495 handler->cgroup_ops = cgroup_ops;
497 - if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
498 + if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) {
499 ERROR("failed creating groups");
500 goto out_fini_handler;
502 diff --git a/src/lxc/start.c b/src/lxc/start.c
503 index dae3bcfe5..f3b29d6cd 100644
504 --- a/src/lxc/start.c
505 +++ b/src/lxc/start.c
506 @@ -1649,7 +1649,7 @@ static int lxc_spawn(struct lxc_handler *handler)
510 - if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
511 + if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) {
512 ERROR("Failed creating cgroups");
515 @@ -1743,10 +1743,10 @@ static int lxc_spawn(struct lxc_handler *handler)
519 - if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid))
520 + if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, false))
523 - if (!cgroup_ops->chown(cgroup_ops, handler->conf))
524 + if (!cgroup_ops->chown(cgroup_ops, handler->conf, false))
527 /* Now we're ready to preserve the network namespace */
528 @@ -1813,16 +1813,30 @@ static int lxc_spawn(struct lxc_handler *handler)
532 - ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);
534 - goto out_delete_net;
536 if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) {
537 ERROR("Failed to setup legacy device cgroup controller limits");
540 TRACE("Set up legacy device cgroup controller limits");
542 + if (cgns_supported()) {
543 + if (!cgroup_ops->payload_create(cgroup_ops, handler, true)) {
544 + ERROR("failed to create inner cgroup separation layer");
545 + goto out_delete_net;
547 + if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, true)) {
548 + ERROR("failed to enter inner cgroup separation layer");
549 + goto out_delete_net;
551 + if (!cgroup_ops->chown(cgroup_ops, handler->conf, true)) {
552 + ERROR("failed chown inner cgroup separation layer");
553 + goto out_delete_net;
557 + if (lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE))
558 + goto out_delete_net;
560 if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
561 /* Now we're ready to preserve the cgroup namespace */
562 ret = lxc_try_preserve_ns(handler->pid, "cgroup");