debian/patches/pve/0004-PVE-Up-separate-the-limiting-from-the-namespaced-cgr.patch

   1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
   2 From: Wolfgang Bumiller <w.bumiller@proxmox.com>
   3 Date: Wed, 28 Mar 2018 13:37:28 +0200
   4 Subject: [PATCH] PVE: [Up] separate the limiting from the namespaced cgroup
   5  root
   6
   7 When cgroup namespaces are enabled, a privileged container
   8 with mixed cgroups has full write access to its own root
   9 cgroup, effectively allowing it to overwrite values written
  10 from the outside or configured via lxc.cgroup.*.
  11
  12 This patch causes an additional 'cgns/' directory to be
  13 created in all cgroups if cgroup namespaces and cgfsng are
  14 being used in order to combat this.
  15
  16 Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
  17 ---
  18  src/lxc/cgroups/cgfsng.c | 94 +++++++++++++++++++++++++++++++++-------
  19  src/lxc/cgroups/cgroup.h | 18 ++++++--
  20  src/lxc/commands.c       | 87 ++++++++++++++++++++++++++++---------
  21  src/lxc/commands.h       |  2 +
  22  src/lxc/criu.c           |  4 +-
  23  src/lxc/start.c          | 28 +++++++++---
  24  6 files changed, 183 insertions(+), 50 deletions(-)
  25
  26 diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
  27 index ab99b47c5..ac8f469bb 100644
  28 --- a/src/lxc/cgroups/cgfsng.c
  29 +++ b/src/lxc/cgroups/cgfsng.c
  30 @@ -818,6 +818,7 @@ static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char
  31         new->mountpoint = mountpoint;
  32         new->container_base_path = container_base_path;
  33         new->container_full_path = NULL;
  34 +       new->container_inner_path = NULL;
  35         new->monitor_full_path = NULL;
  36         new->version = type;
  37
  38 @@ -1059,6 +1060,9 @@ static int cgroup_rmdir(struct hierarchy **hierarchies,
  39
  40                 free(h->container_full_path);
  41                 h->container_full_path = NULL;
  42 +
  43 +               free(h->container_inner_path);
  44 +               h->container_inner_path = NULL;
  45         }
  46
  47         return 0;
  48 @@ -1070,6 +1074,7 @@ struct generic_userns_exec_data {
  49         struct lxc_conf *conf;
  50         uid_t origuid; /* target uid in parent namespace */
  51         char *path;
  52 +       bool inner;
  53  };
  54
  55  static int cgroup_rmdir_wrapper(void *data)
  56 @@ -1112,6 +1117,7 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
  57         wrap.container_cgroup = ops->container_cgroup;
  58         wrap.hierarchies = ops->hierarchies;
  59         wrap.conf = handler->conf;
  60 +       wrap.inner = false;
  61
  62         if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
  63                 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
  64 @@ -1323,17 +1329,26 @@ static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
  65         return cg_unified_create_cgroup(h, cgname);
  66  }
  67
  68 -static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
  69 +static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname, bool inner)
  70  {
  71         int ret;
  72 +       char *path;
  73
  74 -       if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
  75 +       if (!inner && !cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
  76                 ERROR("Failed to handle legacy cpuset controller");
  77                 return false;
  78         }
  79
  80 -       h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
  81 -       ret = mkdir_eexist_on_last(h->container_full_path, 0755);
  82 +       if (inner) {
  83 +               path = must_make_path(h->container_full_path, CGROUP_NAMESPACE_SUBDIR, NULL);
  84 +               h->container_inner_path = path;
  85 +               ret = mkdir(path, 0755);
  86 +       } else {
  87 +               path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
  88 +               h->container_full_path = path;
  89 +               ret = mkdir_eexist_on_last(path, 0755);
  90 +       }
  91 +
  92         if (ret < 0) {
  93                 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
  94                 return false;
  95 @@ -1425,11 +1440,29 @@ on_error:
  96         return bret;
  97  }
  98
  99 +static inline bool cgfsng_create_inner(struct cgroup_ops *ops)
 100 +{
 101 +       size_t i;
 102 +       bool ret = true;
 103 +       char *cgname = must_make_path(ops->container_cgroup, CGROUP_NAMESPACE_SUBDIR, NULL);
 104 +       for (i = 0; ops->hierarchies[i]; i++) {
 105 +               if (!container_create_path_for_hierarchy(ops->hierarchies[i], cgname, true)) {
 106 +                       SYSERROR("Failed to create %s namespace subdirectory: %s",
 107 +                                ops->hierarchies[i]->container_full_path, strerror(errno));
 108 +                       ret = false;
 109 +                       break;
 110 +               }
 111 +       }
 112 +       free(cgname);
 113 +       return ret;
 114 +}
 115 +
 116  /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 117   * next cgroup_pattern-1, -2, ..., -999.
 118   */
 119  __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 120 -                                                       struct lxc_handler *handler)
 121 +                                                       struct lxc_handler *handler,
 122 +                                                       bool inner)
 123  {
 124         int i;
 125         size_t len;
 126 @@ -1438,10 +1471,17 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 127         struct lxc_conf *conf = handler->conf;
 128
 129         if (ops->container_cgroup) {
 130 +               if (inner)
 131 +                       return cgfsng_create_inner(ops);
 132                 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
 133                 return false;
 134         }
 135
 136 +       if (inner) {
 137 +               ERROR("cgfsng_create called twice for inner cgroup");
 138 +               return false;
 139 +       }
 140 +
 141         if (!conf)
 142                 return false;
 143
 144 @@ -1482,7 +1522,7 @@ again:
 145         }
 146
 147         for (i = 0; ops->hierarchies[i]; i++) {
 148 -               if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
 149 +               if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, false)) {
 150                         ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path);
 151                         free(ops->hierarchies[i]->container_full_path);
 152                         ops->hierarchies[i]->container_full_path = NULL;
 153 @@ -1505,7 +1545,8 @@ out_free:
 154  }
 155
 156  __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
 157 -                                            bool monitor)
 158 +                                            bool monitor,
 159 +                                            bool inner)
 160  {
 161         int len;
 162         char pidstr[INTTYPE_TO_STRLEN(pid_t)];
 163 @@ -1521,6 +1562,9 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
 164                 if (monitor)
 165                         path = must_make_path(ops->hierarchies[i]->monitor_full_path,
 166                                               "cgroup.procs", NULL);
 167 +               else if (inner)
 168 +                       path = must_make_path(ops->hierarchies[i]->container_inner_path,
 169 +                                             "cgroup.procs", NULL);
 170                 else
 171                         path = must_make_path(ops->hierarchies[i]->container_full_path,
 172                                               "cgroup.procs", NULL);
 173 @@ -1538,12 +1582,12 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
 174
 175  __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
 176  {
 177 -       return __do_cgroup_enter(ops, pid, true);
 178 +       return __do_cgroup_enter(ops, pid, true, false);
 179  }
 180
 181 -static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
 182 +static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid, bool inner)
 183  {
 184 -       return __do_cgroup_enter(ops, pid, false);
 185 +       return __do_cgroup_enter(ops, pid, false, inner);
 186  }
 187
 188  static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
 189 @@ -1609,9 +1653,15 @@ static int chown_cgroup_wrapper(void *data)
 190                 char *fullpath;
 191                 char *path = arg->hierarchies[i]->container_full_path;
 192
 193 +               if (arg->inner)
 194 +                       path = must_make_path(path, CGROUP_NAMESPACE_SUBDIR, NULL);
 195 +
 196                 ret = chowmod(path, destuid, nsgid, 0775);
 197 -               if (ret < 0)
 198 +               if (ret < 0) {
 199 +                       if (arg->inner)
 200 +                               free(path);
 201                         return -1;
 202 +               }
 203
 204                 /* Failures to chown() these are inconvenient but not
 205                  * detrimental We leave these owned by the container launcher,
 206 @@ -1630,8 +1680,11 @@ static int chown_cgroup_wrapper(void *data)
 207                 (void)chowmod(fullpath, destuid, nsgid, 0664);
 208                 free(fullpath);
 209
 210 -               if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
 211 +               if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC) {
 212 +                       if (arg->inner)
 213 +                               free(path);
 214                         continue;
 215 +               }
 216
 217                 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
 218                 (void)chowmod(fullpath, destuid, nsgid, 0664);
 219 @@ -1640,13 +1693,17 @@ static int chown_cgroup_wrapper(void *data)
 220                 fullpath = must_make_path(path, "cgroup.threads", NULL);
 221                 (void)chowmod(fullpath, destuid, nsgid, 0664);
 222                 free(fullpath);
 223 +
 224 +               if (arg->inner)
 225 +                       free(path);
 226         }
 227
 228         return 0;
 229  }
 230
 231  __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
 232 -                                       struct lxc_conf *conf)
 233 +                                       struct lxc_conf *conf,
 234 +                                       bool inner)
 235  {
 236         struct generic_userns_exec_data wrap;
 237
 238 @@ -1657,6 +1714,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
 239         wrap.path = NULL;
 240         wrap.hierarchies = ops->hierarchies;
 241         wrap.conf = conf;
 242 +       wrap.inner = inner;
 243
 244         if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
 245                           "chown_cgroup_wrapper") < 0) {
 246 @@ -2038,7 +2096,8 @@ __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
 247  }
 248
 249  __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
 250 -                                                   const char *controller)
 251 +                                                   const char *controller,
 252 +                                                   bool inner)
 253  {
 254         struct hierarchy *h;
 255
 256 @@ -2049,6 +2108,9 @@ __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
 257                 return NULL;
 258         }
 259
 260 +       if (inner)
 261 +               return h->container_inner_path ? h->container_inner_path + strlen(h->mountpoint) : NULL;
 262 +
 263         return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
 264  }
 265
 266 @@ -2080,7 +2142,7 @@ static int __cg_unified_attach(const struct hierarchy *h, const char *name,
 267         int fret = -1, idx = 0;
 268         char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
 269
 270 -       container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
 271 +       container_cgroup = lxc_cmd_get_attach_cgroup_path(name, lxcpath, controller);
 272         /* not running */
 273         if (!container_cgroup)
 274                 return 0;
 275 @@ -2161,7 +2223,7 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
 276                         continue;
 277                 }
 278
 279 -               path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
 280 +               path = lxc_cmd_get_attach_cgroup_path(name, lxcpath, h->controllers[0]);
 281                 /* not running */
 282                 if (!path)
 283                         continue;
 284 diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
 285 index d4dcd506b..59445b5a5 100644
 286 --- a/src/lxc/cgroups/cgroup.h
 287 +++ b/src/lxc/cgroups/cgroup.h
 288 @@ -32,6 +32,12 @@
 289  #define MONITOR_CGROUP "lxc.monitor"
 290  #define PIVOT_CGROUP "lxc.pivot"
 291
 292 +/* When lxc.cgroup.protect_limits is in effect the container's cgroup namespace
 293 + * will be moved into an additional subdirectory "cgns/" inside the cgroup in
 294 + * order to prevent it from accessing the outer limiting cgroup.
 295 + */
 296 +#define CGROUP_NAMESPACE_SUBDIR "cgns"
 297 +
 298  struct lxc_handler;
 299  struct lxc_conf;
 300  struct lxc_list;
 301 @@ -72,6 +78,9 @@ typedef enum {
 302   * @monitor_full_path
 303   * - The full path to the monitor's cgroup.
 304   *
 305 + * @container_inner_path
 306 + * - The full path to the container's inner cgroup when protect_limits is used.
 307 + *
 308   * @version
 309   * - legacy hierarchy
 310   *   If the hierarchy is a legacy hierarchy this will be set to
 311 @@ -85,6 +94,7 @@ struct hierarchy {
 312         char *mountpoint;
 313         char *container_base_path;
 314         char *container_full_path;
 315 +       char *container_inner_path;
 316         char *monitor_full_path;
 317         int version;
 318  };
 319 @@ -139,9 +149,9 @@ struct cgroup_ops {
 320         void (*monitor_destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
 321         bool (*monitor_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
 322         bool (*monitor_enter)(struct cgroup_ops *ops, pid_t pid);
 323 -       bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
 324 -       bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid);
 325 -       const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
 326 +       bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler, bool inner);
 327 +       bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid, bool inner);
 328 +       const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller, bool inner);
 329         bool (*escape)(const struct cgroup_ops *ops, struct lxc_conf *conf);
 330         int (*num_hierarchies)(struct cgroup_ops *ops);
 331         bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
 332 @@ -152,7 +162,7 @@ struct cgroup_ops {
 333         bool (*unfreeze)(struct cgroup_ops *ops);
 334         bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf,
 335                              bool with_devices);
 336 -       bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
 337 +       bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf, bool inner);
 338         bool (*attach)(struct cgroup_ops *ops, const char *name,
 339                        const char *lxcpath, pid_t pid);
 340         bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
 341 diff --git a/src/lxc/commands.c b/src/lxc/commands.c
 342 index 133384d72..b41a76000 100644
 343 --- a/src/lxc/commands.c
 344 +++ b/src/lxc/commands.c
 345 @@ -427,20 +427,8 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
 346         return lxc_cmd_rsp_send(fd, &rsp);
 347  }
 348
 349 -/*
 350 - * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
 351 - * particular subsystem. This is the cgroup path relative to the root
 352 - * of the cgroup filesystem.
 353 - *
 354 - * @name      : name of container to connect to
 355 - * @lxcpath   : the lxcpath in which the container is running
 356 - * @subsystem : the subsystem being asked about
 357 - *
 358 - * Returns the path on success, NULL on failure. The caller must free() the
 359 - * returned path.
 360 - */
 361 -char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 362 -                             const char *subsystem)
 363 +char *do_lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 364 +                             const char *subsystem, bool inner)
 365  {
 366         int ret, stopped;
 367         struct lxc_cmd_rr cmd = {
 368 @@ -453,8 +441,18 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 369
 370         cmd.req.data = subsystem;
 371         cmd.req.datalen = 0;
 372 -       if (subsystem)
 373 -               cmd.req.datalen = strlen(subsystem) + 1;
 374 +       if (subsystem) {
 375 +               size_t subsyslen = strlen(subsystem);
 376 +               if (inner) {
 377 +                       char *data = alloca(subsyslen+2);
 378 +                       memcpy(data, subsystem, subsyslen+1);
 379 +                       data[subsyslen+1] = 1;
 380 +                       cmd.req.datalen = subsyslen+2,
 381 +                       cmd.req.data = data;
 382 +               } else {
 383 +                       cmd.req.datalen = subsyslen+1;
 384 +               }
 385 +       }
 386
 387         ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL);
 388         if (ret < 0)
 389 @@ -469,6 +467,42 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 390         return cmd.rsp.data;
 391  }
 392
 393 +/*
 394 + * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
 395 + * particular subsystem. This is the cgroup path relative to the root
 396 + * of the cgroup filesystem.
 397 + *
 398 + * @name      : name of container to connect to
 399 + * @lxcpath   : the lxcpath in which the container is running
 400 + * @subsystem : the subsystem being asked about
 401 + *
 402 + * Returns the path on success, NULL on failure. The caller must free() the
 403 + * returned path.
 404 + */
 405 +char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 406 +       const char *subsystem)
 407 +{
 408 +       return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, false);
 409 +}
 410 +
 411 +/*
 412 + * lxc_cmd_get_attach_cgroup_path: Calculate a container's inner cgroup path
 413 + * for a particular subsystem. This is the cgroup path relative to the root
 414 + * of the cgroup filesystem.
 415 + *
 416 + * @name      : name of container to connect to
 417 + * @lxcpath   : the lxcpath in which the container is running
 418 + * @subsystem : the subsystem being asked about
 419 + *
 420 + * Returns the path on success, NULL on failure. The caller must free() the
 421 + * returned path.
 422 + */
 423 +char *lxc_cmd_get_attach_cgroup_path(const char *name, const char *lxcpath,
 424 +       const char *subsystem)
 425 +{
 426 +       return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, true);
 427 +}
 428 +
 429  static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 430                                        struct lxc_handler *handler)
 431  {
 432 @@ -476,10 +510,21 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 433         struct lxc_cmd_rsp rsp;
 434         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
 435
 436 -       if (req->datalen > 0)
 437 -               path = cgroup_ops->get_cgroup(cgroup_ops, req->data);
 438 -       else
 439 -               path = cgroup_ops->get_cgroup(cgroup_ops, NULL);
 440 +       if (req->datalen > 0) {
 441 +               const char *subsystem;
 442 +               size_t subsyslen;
 443 +               bool inner = false;
 444 +               subsystem = req->data;
 445 +               subsyslen = strlen(subsystem);
 446 +               if (req->datalen == subsyslen+2)
 447 +                       inner = (subsystem[subsyslen+1] == 1);
 448 +
 449 +               path = cgroup_ops->get_cgroup(cgroup_ops, req->data, inner);
 450 +       } else {
 451 +               // FIXME: cgroup separation for cgroup v2 cannot be handled
 452 +               // like we used to do v1 here... need to figure this out...
 453 +               path = cgroup_ops->get_cgroup(cgroup_ops, NULL, false);
 454 +       }
 455         if (!path)
 456                 return -1;
 457
 458 @@ -651,7 +696,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req,
 459                  * lxc_unfreeze() would do another cmd (GET_CGROUP) which would
 460                  * deadlock us.
 461                  */
 462 -               if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer"))
 463 +               if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer", false))
 464                         return 0;
 465
 466                 if (cgroup_ops->unfreeze(cgroup_ops))
 467 diff --git a/src/lxc/commands.h b/src/lxc/commands.h
 468 index 2c024b65d..7c4c00b1e 100644
 469 --- a/src/lxc/commands.h
 470 +++ b/src/lxc/commands.h
 471 @@ -88,6 +88,8 @@ extern int lxc_cmd_console(const char *name, int *ttynum, int *fd,
 472   */
 473  extern char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 474                         const char *subsystem);
 475 +extern char *lxc_cmd_get_attach_cgroup_path(const char *name,
 476 +                       const char *lxcpath, const char *subsystem);
 477  extern int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath);
 478  extern char *lxc_cmd_get_config_item(const char *name, const char *item, const char *lxcpath);
 479  extern char *lxc_cmd_get_name(const char *hashed_sock);
 480 diff --git a/src/lxc/criu.c b/src/lxc/criu.c
 481 index 3d857b541..ec9bcb7e4 100644
 482 --- a/src/lxc/criu.c
 483 +++ b/src/lxc/criu.c
 484 @@ -332,7 +332,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
 485                 } else {
 486                         const char *p;
 487
 488 -                       p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
 489 +                       p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0], false);
 490                         if (!p) {
 491                                 ERROR("failed to get cgroup path for %s", controllers[0]);
 492                                 goto err;
 493 @@ -976,7 +976,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
 494                 goto out_fini_handler;
 495         handler->cgroup_ops = cgroup_ops;
 496
 497 -       if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
 498 +       if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) {
 499                 ERROR("failed creating groups");
 500                 goto out_fini_handler;
 501         }
 502 diff --git a/src/lxc/start.c b/src/lxc/start.c
 503 index dae3bcfe5..f3b29d6cd 100644
 504 --- a/src/lxc/start.c
 505 +++ b/src/lxc/start.c
 506 @@ -1649,7 +1649,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 507                 }
 508         }
 509
 510 -       if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
 511 +       if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) {
 512                 ERROR("Failed creating cgroups");
 513                 goto out_delete_net;
 514         }
 515 @@ -1743,10 +1743,10 @@ static int lxc_spawn(struct lxc_handler *handler)
 516                 goto out_delete_net;
 517         }
 518
 519 -       if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid))
 520 +       if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, false))
 521                 goto out_delete_net;
 522
 523 -       if (!cgroup_ops->chown(cgroup_ops, handler->conf))
 524 +       if (!cgroup_ops->chown(cgroup_ops, handler->conf, false))
 525                 goto out_delete_net;
 526
 527         /* Now we're ready to preserve the network namespace */
 528 @@ -1813,16 +1813,30 @@ static int lxc_spawn(struct lxc_handler *handler)
 529                 }
 530         }
 531
 532 -       ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);
 533 -       if (ret < 0)
 534 -               goto out_delete_net;
 535 -
 536         if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) {
 537                 ERROR("Failed to setup legacy device cgroup controller limits");
 538                 goto out_delete_net;
 539         }
 540         TRACE("Set up legacy device cgroup controller limits");
 541
 542 +       if (cgns_supported()) {
 543 +               if (!cgroup_ops->payload_create(cgroup_ops, handler, true)) {
 544 +                       ERROR("failed to create inner cgroup separation layer");
 545 +                       goto out_delete_net;
 546 +               }
 547 +               if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, true)) {
 548 +                       ERROR("failed to enter inner cgroup separation layer");
 549 +                       goto out_delete_net;
 550 +               }
 551 +               if (!cgroup_ops->chown(cgroup_ops, handler->conf, true)) {
 552 +                       ERROR("failed chown inner cgroup separation layer");
 553 +                       goto out_delete_net;
 554 +               }
 555 +       }
 556 +
 557 +       if (lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE))
 558 +               goto out_delete_net;
 559 +
 560         if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
 561                 /* Now we're ready to preserve the cgroup namespace */
 562                 ret = lxc_try_preserve_ns(handler->pid, "cgroup");
 563 --
 564 2.20.1
 565