]> git.proxmox.com Git - mirror_lxc.git/commitdiff
use systemd dbus StartTransientUnit for unpriv cgroup2
authorSerge Hallyn <serge@hallyn.com>
Tue, 21 Jun 2022 12:50:53 +0000 (14:50 +0200)
committerChristian Brauner (Microsoft) <christian.brauner@ubuntu.com>
Tue, 21 Jun 2022 14:01:13 +0000 (16:01 +0200)
If, when initializing cgroups for a container start, we detect that we
are an unprivileged user on a unified-hierarchy-only system, then we
ask systemd, through its D-Bus API, to create a new scope for us with
delegation.  Call the cgroup it creates for us P1.  We then create
P1/init and move ourselves into it, so that we can enable the controllers
for delegation to P1's children through P1/cgroup.subtree_control.

On attach, we ask systemd to attach us to the container's scope.
We can't do that ourselves in the normal case, as root owns our
login cgroups.

Create a new command api for the lxc monitor to tell lxc-attach the
systemd scope to which to attach.

Changelog:
 * free cgroup_meta.systemd_scope in lxc_conf_free (Thanks Tycho)
 * fix some indent
 * address some (not all) of brauner's feedback

Signed-off-by: Serge Hallyn <serge@hallyn.com>
12 files changed:
.github/workflows/build.yml
.github/workflows/coverity.yml
.github/workflows/sanitizers.sh
.github/workflows/sanitizers.yml
meson.build
meson_options.txt
src/lxc/cgroups/cgfsng.c
src/lxc/commands.c
src/lxc/commands.h
src/lxc/conf.c
src/lxc/conf.h
src/tests/oss-fuzz.sh

index f01fdb3c9da632aa529890c95cf2a2960cecadda..0a6f406cab9bde986f04e114ca6c6a25feebe916 100644 (file)
@@ -26,7 +26,7 @@ jobs:
         run: |
           sudo apt-get update -qq
           sudo apt-get install -qq gcc clang meson llvm
-          sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x
+          sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x libsystemd-dev
 
       - name: Compiler version
         env:
index 4457474b79c6f023435d718223f31c793cf83db6..52d7cac724a5170105aea5a5c51319265e324d2a 100644 (file)
@@ -25,7 +25,7 @@ jobs:
         run: |
           sudo apt-get update -qq
           sudo apt-get install -qq gcc clang
-          sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev docbook2x
+          sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev docbook2x libsystemd-dev
 
       - name: Compiler version
         run: |
index 061061c0ad50716a5a4fb7d0704b4bc81c83b8a8..0144f153eb0fd46c891453ea11c1667ff6fe3732 100755 (executable)
@@ -18,7 +18,7 @@ apt-get install --yes --no-install-recommends \
     libpam0g-dev libseccomp-dev libselinux1-dev libtool linux-libc-dev \
     llvm lsb-release make openssl pkg-config python3-all-dev \
     python3-setuptools rsync squashfs-tools uidmap unzip uuid-runtime \
-    wget xz-utils systemd-coredump
+    wget xz-utils systemd-coredump libsystemd-dev
 apt-get remove --yes lxc-utils liblxc-common liblxc1 liblxc-dev
 
 ARGS="-Dprefix=/usr -Dtests=true -Dpam-cgroup=false -Dwerror=true -Dio-uring-event-loop=false -Db_lto_mode=default -Db_lundef=false"
index 4a28c8e1cf3569e4571738fd5f85cae9c33bb2ca..ce50dfaecb04bcd9ced9a32727bda7268c052f06 100644 (file)
@@ -22,7 +22,7 @@ jobs:
         run: |
           sudo apt-get update -qq
           sudo apt-get install -qq gcc clang meson llvm
-          sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x
+          sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x libsystemd-dev
 
       - name: Compiler version
         env:
index 4cef7c9fc1802129692edf61b65d496c9d5c7e1a..21eba6d1ebf2155a8296bb0a9ee1690819ee597c 100644 (file)
@@ -151,6 +151,7 @@ want_oss_fuzz = get_option('oss-fuzz')
 want_seccomp = get_option('seccomp')
 want_thread_safety = get_option('thread-safety')
 want_memfd_rexec = get_option('memfd-rexec')
+want_sd_bus = get_option('sd-bus')
 
 srcconf.set_quoted('DEFAULT_CGROUP_PATTERN', cgrouppattern)
 if coverity
@@ -256,6 +257,49 @@ else
     srcconf.set10('HAVE_LIBURING', false)
 endif
 
+if not want_sd_bus.disabled()  # feature is 'enabled' or 'auto': probe for libsystemd/sd-bus
+    has_sd_bus = true
+    sd_bus_optional = want_sd_bus.auto()  # 'auto': a failed probe only turns the feature off; otherwise it is a hard error
+
+    libsystemd = dependency('libsystemd', required: not sd_bus_optional)
+    if not libsystemd.found()
+        if not sd_bus_optional
+            error('missing required libsystemd dependency')
+        endif
+
+        has_sd_bus = false
+    endif
+
+    if not cc.has_header('systemd/sd-bus.h')
+        if not sd_bus_optional
+            error('libsystemd misses required systemd/sd-bus.h header')
+        endif
+
+        has_sd_bus = false
+    endif
+
+    if not cc.has_header('systemd/sd-event.h')
+        if not sd_bus_optional
+            error('libsystemd misses required systemd/sd-event.h header')
+        endif
+
+        has_sd_bus = false
+    endif
+
+    if not cc.has_function('sd_bus_call_method_asyncv', prefix: '#include <systemd/sd-bus.h>', dependencies: libsystemd) 
+        if not sd_bus_optional
+            error('libsystemd misses required sd_bus_call_method_asyncv function')
+        endif
+
+        has_sd_bus = false
+    endif
+
+    srcconf.set10('HAVE_LIBSYSTEMD', has_sd_bus)  # 1 only if every probe above succeeded
+else  # feature explicitly disabled: no probing at all
+    has_sd_bus = false
+    srcconf.set10('HAVE_LIBSYSTEMD', false)
+endif
+
 ## Time EPOCH.
 sh = find_program('sh')
 date = find_program('date')
@@ -639,6 +683,8 @@ endforeach
 found_headers = []
 missing_headers = []
 foreach tuple: [
+    ['systemd/sd-bus.h'],
+    ['systemd/sd-event.h'],
     ['sys/resource.h'],
     ['sys/memfd.h'],
     ['sys/personality.h'],
@@ -676,6 +722,7 @@ foreach tuple: [
     ['pam'],
     ['openssl'],
     ['liburing'],
+    ['libsystemd'],
 ]
 
     if tuple.length() >= 2
@@ -750,6 +797,10 @@ if want_io_uring
     liblxc_dependencies += [liburing]
 endif
 
+if has_sd_bus
+    liblxc_dependencies += [libsystemd]
+endif
+
 liblxc_link_whole = [liblxc_static]
 
 liblxc = shared_library(
index d82ae3486e0648c1aefa403bb5b8192d888d1838..c14dacf27fa09bf0cb0c94715dcbececb8aac4c1 100644 (file)
@@ -22,6 +22,9 @@ option('init-script', type : 'array',
 option('io-uring-event-loop', type: 'boolean', value: 'false',
        description: 'Enable io-uring based event loop')
 
+option('sd-bus', type: 'feature', value: 'auto',
+       description: 'Enable linking against sd-bus')
+
 # was --{disable,enable}-doc in autotools
 option('man', type: 'boolean', value: 'true',
        description: 'build and install manpages')
index e39bde8df9cb21b89027d78303719e3d8a6f1df8..ee4fc052fd610ef8b45c85644bfde22828d3112e 100644 (file)
@@ -20,6 +20,7 @@
 #include <grp.h>
 #include <linux/kdev_t.h>
 #include <linux/types.h>
+#include <libgen.h>
 #include <poll.h>
 #include <signal.h>
 #include <stdint.h>
 #include "strlcat.h"
 #endif
 
+#if HAVE_LIBSYSTEMD
+#include <systemd/sd-bus.h>
+#include <systemd/sd-event.h>
+#endif
+
 lxc_log_define(cgfsng, cgroup);
 
 /*
@@ -947,6 +953,354 @@ static bool check_cgroup_dir_config(struct lxc_conf *conf)
        return true;
 }
 
+#define SYSTEMD_SCOPE_FAILED 2 /* a scope was attempted but bus setup, scope creation or delegation failed */
+#define SYSTEMD_SCOPE_UNSUPP 1 /* not attempted: privileged, not pure unified, or built without libsystemd */
+#define SYSTEMD_SCOPE_SUCCESS 0 /* scope created and controllers delegated */
+
+#if HAVE_LIBSYSTEMD
+struct sd_callback_data { /* state shared between the scope creator and the JobRemoved callback */
+       char *scope_name; /* unit name the callback compares against; NULL skips the comparison */
+       bool job_complete; /* set to true by the callback when the job result is "done" */
+};
+
+/*
+ * Signal handler registered for the systemd manager's "JobRemoved" signal.
+ * @userdata is a struct sd_callback_data; its job_complete flag is raised
+ * when the removed job belongs to the expected unit and finished as "done".
+ *
+ * Returns -1 for an unreadable message or a foreign unit, 1 when the job
+ * is done, and 0 for any other job result.
+ */
+static int systemd_jobremoved_callback(sd_bus_message *m, void *userdata, sd_bus_error *error)
+{
+       struct sd_callback_data *cb = userdata;
+       char *job_path, *unit_name, *job_result;
+       uint32_t job_id;
+       int ret;
+
+       ret = sd_bus_message_read(m, "uoss", &job_id, &job_path, &unit_name, &job_result);
+       if (ret < 0)
+               return log_error(-1, "bad message received in callback: %s", strerror(-ret));
+
+       /* Jobs of other units are not the one we are waiting for. */
+       if (cb->scope_name && strcmp(unit_name, cb->scope_name))
+               return log_trace(-1, "unit was '%s' not '%s'", unit_name, cb->scope_name);
+
+       if (strcmp(job_result, "done"))
+               return log_debug(0, "result was '%s', not 'done'", job_result);
+
+       cb->job_complete = true;
+       return log_info(1, "job is done");
+}
+
+#define DESTINATION "org.freedesktop.systemd1"
+#define PATH "/org/freedesktop/systemd1"
+#define INTERFACE "org.freedesktop.systemd1.Manager"
+#define MEMBER "StartTransientUnit"
+/*
+ * Ask systemd, through StartTransientUnit, to create the transient unit named
+ * data->scope_name containing the calling process, with Delegate enabled and
+ * CollectMode set to "inactive-or-failed". Afterwards run @event until the
+ * JobRemoved callback raises data->job_complete.
+ *
+ * Returns true once the job is flagged complete. Returns false when the
+ * request cannot be built or sent, the reply cannot be parsed, or the job
+ * does not complete within the wait budget.
+ */
+static bool start_scope(sd_bus *bus, struct sd_callback_data *data, struct sd_event *event)
+{
+       __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;
+       __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL;
+       __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL;
+       char *path = NULL;
+       int tries = 0;
+       int r;
+
+       r = sd_bus_message_new_method_call(bus, &m,
+               DESTINATION, PATH, INTERFACE, MEMBER);
+       if (r < 0)
+               return log_error(false, "Failed creating sdbus message");
+
+       /* Unit name and job mode. */
+       r = sd_bus_message_append(m, "ss", data->scope_name, "fail");
+       if (r < 0)
+               return log_error(false, "Failed setting systemd scope name");
+
+       r = sd_bus_message_open_container(m, 'a', "(sv)");
+       if (r < 0)
+               return log_error(false, "Failed allocating sdbus msg properties");
+
+       r = sd_bus_message_append(m, "(sv)(sv)(sv)",
+               "PIDs", "au", 1, getpid(),
+               "Delegate", "b", 1,
+               "CollectMode", "s", "inactive-or-failed");
+       if (r < 0)
+               return log_error(false, "Failed setting properties on sdbus message");
+
+       r = sd_bus_message_close_container(m);
+       if (r < 0)
+               return log_error(false, "Failed closing sdbus message properties");
+
+       /* Empty list of auxiliary units. */
+       r = sd_bus_message_append(m, "a(sa(sv))", 0);
+       if (r < 0)
+               return log_error(false, "Failed appending aux boilerplate\n");
+
+       r = sd_bus_call(NULL, m, 0, &error, &reply);
+       if (r < 0)
+               return log_error(false,  "Failed sending sdbus message: %s",
+                                error.message ? error.message : strerror(-r));
+
+       /* Parse the response message */
+       r = sd_bus_message_read(reply, "o", &path);
+       if (r < 0)
+               return log_error(false, "Failed to parse response message: %s", strerror(-r));
+
+       /*
+        * Now spin up a mini-event-loop to wait for the "job completed" message.
+        * Timeouts and sd_event_run() failures both count against the budget,
+        * so a persistently failing event loop cannot spin forever.
+        */
+       while (!data->job_complete) {
+               r = sd_event_run(event, 1000 * 1000);
+               if (data->job_complete || tries == 5)
+                       break;
+               if (r < 0) {
+                       DEBUG("Error waiting for JobRemoved: %s\n", strerror(-r));
+                       tries++;
+                       continue;
+               }
+               if (r > 0) {
+                       TRACE("Debug: we processed an event (%d), but not the one we wanted\n", r);
+                       continue;
+               }
+               tries++; // r == 0: timeout
+       }
+       if (!data->job_complete)
+               return log_error(false, "Error: %s job was never removed", data->scope_name);
+
+       return true;
+}
+
+/*
+ * Decide from the text of a /proc/<pid>/cgroup file whether the system is
+ * pure unified: the text must consist of exactly one line, and that line
+ * must start with "0:" and be longer than three characters.
+ *
+ * NOTE(review): @contents is handed to lxc_iterate_parts(), which presumably
+ * tokenizes it in place - callers should not rely on it being unchanged.
+ */
+static bool string_pure_unified_system(char *contents)
+{
+       char *line;
+       bool seen_line = false;
+       bool unified = false;
+
+       lxc_iterate_parts(line, contents, "\n") {
+               if (seen_line) // if >1 line, this is not pure unified
+                       return false;
+               seen_line = true;
+
+               unified = strlen(line) > 3 && strncmp(line, "0:", 2) == 0;
+               if (!unified)
+                       return false;
+
+               // Keep iterating: a further line still disqualifies the system.
+       }
+
+       return unified;
+}
+
+/*
+ * Only call get_current_unified_cgroup() when we are in a pure
+ * unified (v2-only) cgroup
+ *
+ * Returns a newly allocated copy of our cgroup path as listed in
+ * /proc/self/cgroup, without the leading "0::". Returns NULL when the file
+ * cannot be read, the system is not pure unified, or allocation fails.
+ * The caller owns the returned string.
+ */
+static char *get_current_unified_cgroup(void)
+{
+       __do_free char *buf = NULL;
+       char *p;
+
+       buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
+       if (!buf)
+               return NULL;
+
+       if (!string_pure_unified_system(buf))
+               return NULL;
+
+       // 0::/user.slice/user-1000.slice/session-136.scope
+       // Get past the "0::"
+       p = buf;
+       if (strnequal(p, "0::", STRLITERALLEN("0::")))
+               p += STRLITERALLEN("0::");
+
+       // Cut at the end of the line ourselves rather than depending on the
+       // check above having terminated the line as a side effect.
+       p[strcspn(p, "\n")] = '\0';
+
+       return strdup(p);
+}
+
+/*
+ * Read /proc/self/cgroup and report whether it describes a pure unified
+ * cgroup layout. An unreadable file counts as "not pure unified".
+ */
+static bool pure_unified_system(void)
+{
+       __do_free char *cgroup_file = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
+
+       return cgroup_file ? string_pure_unified_system(cgroup_file) : false;
+}
+
+#define MEMBER_JOIN "AttachProcessesToUnit"
+/*
+ * Ask the systemd user instance, through AttachProcessesToUnit, to move
+ * process @pid into the "/init" sub-cgroup of the unit @scope_name.
+ *
+ * Returns true when the call succeeded, false when the user bus could not
+ * be opened or the request could not be built or sent.
+ */
+static bool enter_scope(char *scope_name, pid_t pid)
+{
+       __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL;
+       __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;
+       __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL;
+       __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL;
+       int r;
+
+       r = sd_bus_open_user(&bus);
+       if (r < 0)
+               return log_error(false, "Failed to connect to user bus: %s", strerror(-r));
+
+       r = sd_bus_message_new_method_call(bus, &m,
+               DESTINATION, PATH, INTERFACE, MEMBER_JOIN);
+       if (r < 0)
+               return log_error(false, "Failed creating sdbus message");
+
+       /* Unit name, sub-cgroup inside the unit, and a one-element pid array. */
+       r = sd_bus_message_append(m, "ssau", scope_name, "/init", 1, pid);
+       if (r < 0)
+               return log_error(false, "Failed setting systemd scope name");
+
+       r = sd_bus_call(NULL, m, 0, &error, &reply);
+       if (r < 0)
+               return log_error(false,  "Failed sending sdbus message: %s",
+                                error.message ? error.message : strerror(-r));
+
+       return true;
+}
+
+/*
+ * Enable every controller listed in @fd_dir's cgroup.controllers for the
+ * children of that cgroup by writing "+<name> +<name> ..." to its
+ * cgroup.subtree_control. @cg is only used in the error message.
+ *
+ * Returns true on success or when there is nothing to delegate, false when
+ * cgroup.controllers cannot be read or the write fails.
+ */
+static bool enable_controllers_delegation(int fd_dir, char *cg)
+{
+       __do_free char *rbuf = NULL;
+       __do_free char *wbuf = NULL;
+       char *controller;
+       size_t full_len = 0;
+       bool first = true;
+       int ret;
+
+       rbuf = read_file_at(fd_dir, "cgroup.controllers", PROTECT_OPEN, 0);
+       if (!rbuf)
+               return false;
+
+       // Split on newlines as well as spaces, so a line terminator is neither
+       // glued to the last controller name nor mistaken for a controller when
+       // the list is empty.
+       lxc_iterate_parts(controller, rbuf, " \n") {
+               // Each entry adds its name, a '+', and either a separating
+               // space or (for the first entry) the terminating NUL.
+               full_len += strlen(controller) + 2;
+               wbuf = must_realloc(wbuf, full_len);
+               if (first) {
+                       wbuf[0] = '\0';
+                       first = false;
+               } else {
+                       (void)strlcat(wbuf, " ", full_len);
+               }
+               (void)strlcat(wbuf, "+", full_len);
+               (void)strlcat(wbuf, controller, full_len);
+       }
+       if (!wbuf)
+               return log_debug(true, "No controllers to delegate!");
+
+       ret = lxc_writeat(fd_dir, "cgroup.subtree_control", wbuf, strlen(wbuf));
+       if (ret < 0)
+               return log_error_errno(false, errno, "Failed to write \"%s\" to %s/cgroup.subtree_control", wbuf, cg);
+
+       return true;
+}
+
+/*
+ * systemd places us in say .../lxc-1.scope.  We create lxc-1.scope/init,
+ * move ourselves to there, then enable controllers in lxc-1.scope
+ *
+ * @parent_cgroup is the filesystem path of the scope's cgroup directory.
+ * Returns true when every listed process was moved and the controllers were
+ * delegated, false on any failure.
+ */
+static bool move_and_delegate_unified(char *parent_cgroup)
+{
+       __do_free char *buf = NULL;
+       __do_close int fd_parent = -EBADF;
+       char *pid;
+       int ret;
+
+       fd_parent = open_at(-EBADF, parent_cgroup, O_DIRECTORY, 0, 0);
+       if (fd_parent < 0)
+               return syserror_ret(false, "Failed opening cgroup dir \"%s\"", parent_cgroup);
+
+       ret = mkdirat(fd_parent, "init", 0755);
+       if (ret < 0 && errno != EEXIST)
+               return syserror_ret(false, "Failed to create \"%s/init\" cgroup", parent_cgroup);
+
+       buf = read_file_at(fd_parent, "cgroup.procs", PROTECT_OPEN, 0);
+       if (!buf)
+               return false;
+
+       // cgroup.procs lists one pid per line and only one process can be
+       // migrated per write, so move the listed processes one at a time.
+       lxc_iterate_parts(pid, buf, "\n") {
+               ret = lxc_writeat(fd_parent, "init/cgroup.procs", pid, strlen(pid));
+               if (ret)
+                       return syserror_ret(false, "Failed to escape to cgroup \"init/cgroup.procs\"");
+       }
+
+       /* enable controllers in parent_cgroup */
+       return enable_controllers_delegation(fd_parent, parent_cgroup);
+}
+
+/*
+ * When running unprivileged on a pure unified cgroup system, ask the systemd
+ * user instance for a delegated transient scope named
+ * "lxc-<conf->name>-<idx>.scope", trying idx 0..98 until one can be started.
+ * On success the resulting cgroup path is stored in
+ * conf->cgroup_meta.systemd_scope, and we move into its "init" sub-cgroup so
+ * the controllers can be delegated.
+ *
+ * Returns SYSTEMD_SCOPE_SUCCESS on success, SYSTEMD_SCOPE_UNSUPP when running
+ * privileged or not on a pure unified layout, and SYSTEMD_SCOPE_FAILED for
+ * every other failure (including allocation failure).
+ */
+static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf)
+{
+       __do_free char *full_scope_name = NULL;
+       __do_free char *fs_cg_path = NULL;
+       // sd_event_new() gives us a reference of our own and the bus holds its
+       // own once attached, so ours has to be dropped on every return path.
+       __attribute__((__cleanup__(sd_event_unrefp))) sd_event *event = NULL;
+       __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; // free the bus before the names it references, just to be sure
+       struct sd_callback_data sd_data;
+       int idx = 0;
+       size_t len;
+       int r;
+
+       if (geteuid() == 0)
+               return log_info(SYSTEMD_SCOPE_UNSUPP, "Running privileged, not using a systemd unit");
+       // Pure_unified_layout() can't be used as that info is not yet setup.  At
+       // the same time, we don't want to calculate current cgroups until after
+       // we optionally enter a new systemd user scope.  So let's just do a quick
+       // check for pure unified cgroup system: single line /proc/self/cgroup with
+       // only index '0:'
+       if (!pure_unified_system())
+               return log_info(SYSTEMD_SCOPE_UNSUPP, "Not in unified layout, not using a systemd unit");
+
+       r = sd_bus_open_user(&bus);
+       if (r < 0)
+               return log_error(SYSTEMD_SCOPE_FAILED, "Failed to connect to user bus: %s", strerror(-r));
+
+       r = sd_bus_call_method_asyncv(bus, NULL, DESTINATION, PATH, INTERFACE, "Subscribe", NULL, NULL, NULL, NULL);
+       if (r < 0)
+               return log_error(SYSTEMD_SCOPE_FAILED, "Failed to subscribe to signals: %s", strerror(-r));
+
+       sd_data.job_complete = false;
+       sd_data.scope_name = NULL;
+       r = sd_bus_match_signal(bus,
+               NULL, // no slot
+               DESTINATION, PATH, INTERFACE, "JobRemoved",
+               systemd_jobremoved_callback, &sd_data);
+       if (r < 0)
+               return log_error(SYSTEMD_SCOPE_FAILED, "Failed to register systemd event loop signal handler: %s", strerror(-r));
+
+       // NEXT: create and attach event
+       r = sd_event_new(&event);
+       if (r < 0)
+               return log_error(SYSTEMD_SCOPE_FAILED, "Failed allocating new event: %s\n", strerror(-r));
+       r = sd_bus_attach_event(bus, event, SD_EVENT_PRIORITY_NORMAL);
+       if (r < 0)
+               return log_error(SYSTEMD_SCOPE_FAILED, "Failed attaching event: %s\n", strerror(-r));
+
+       // "lxc-" + (conf->name) + "-NN" + ".scope" + '\0'
+       len = STRLITERALLEN("lxc-") + strlen(conf->name) + 3 + STRLITERALLEN(".scope") + 1;
+       full_scope_name = malloc(len);
+       if (!full_scope_name)
+               return log_error(SYSTEMD_SCOPE_FAILED, "Out of memory");
+
+       do {
+               snprintf(full_scope_name, len, "lxc-%s-%d.scope", conf->name, idx);
+               sd_data.scope_name = full_scope_name;
+               if (start_scope(bus, &sd_data, event)) {
+                       conf->cgroup_meta.systemd_scope = get_current_unified_cgroup();
+                       if (!conf->cgroup_meta.systemd_scope)
+                               return log_trace(SYSTEMD_SCOPE_FAILED, "Out of memory");
+                       fs_cg_path = must_make_path("/sys/fs/cgroup", conf->cgroup_meta.systemd_scope, NULL);
+                       if (!move_and_delegate_unified(fs_cg_path))
+                               return log_error(SYSTEMD_SCOPE_FAILED, "Failed delegating the controllers to our cgroup");
+                       return log_trace(SYSTEMD_SCOPE_SUCCESS, "Created systemd scope %s", full_scope_name);
+               }
+               idx++;
+       } while (idx < 99);
+
+       return SYSTEMD_SCOPE_FAILED; // failed, let's try old-school after all
+}
+#else /* !HAVE_LIBSYSTEMD */
+/* Variant used when built without libsystemd: no scope is ever requested. */
+static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf)
+{
+       // not supported
+       return log_trace(SYSTEMD_SCOPE_UNSUPP, "unpriv_systemd_create_scope: no systemd support");
+}
+#endif /* HAVE_LIBSYSTEMD */
+
+// Hand back a heap copy of cgroup path @cg with deabs() applied, so the
+// caller owns (and frees) it and never sees an absolute path.
+// Yields NULL when @cg is NULL or just "/", and ERR_PTR(-ENOMEM) when the
+// copy cannot be allocated.
+static char *cgroup_relpath(char *cg)
+{
+       char *rel = NULL;
+
+       if (cg && !strequal(cg, "/")) {
+               rel = strdup(deabs(cg));
+               if (!rel)
+                       rel = ERR_PTR(-ENOMEM);
+       }
+
+       return rel;
+}
+
 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
 {
        __do_free char *monitor_cgroup = NULL;
@@ -1176,14 +1530,19 @@ __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
                if (ret)
                        return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
 
-               TRACE("Moved monitor into cgroup %d", h->dfd_mon);
+               TRACE("Moved monitor (%d) into cgroup %d", handler->monitor_pid, h->dfd_mon);
 
                if (handler->transient_pid <= 0)
                        continue;
 
                ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
-               if (ret)
-                       return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
+               if (ret) {
+                       // TODO: probably ask systemd to do the move for us instead
+                       if (!handler->conf->cgroup_meta.systemd_scope)
+                               return log_error_errno(false, errno, "Failed to enter pid %d into cgroup %d", handler->transient_pid, h->dfd_mon);
+                       else
+                               TRACE("Failed moving transient process into cgroup %d", h->dfd_mon);
+               }
 
                TRACE("Moved transient process into cgroup %d", h->dfd_mon);
 
@@ -2184,14 +2543,30 @@ static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
 }
 
 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
+                                       const char *lxcpath,
                                        int unified_fd, int *sk_fd, pid_t pid,
                                        bool unprivileged)
 {
        __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
        char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
        size_t pidstr_len;
+#if HAVE_LIBSYSTEMD
+       __do_free char *scope = NULL;
+#endif
        ssize_t ret;
 
+#if HAVE_LIBSYSTEMD
+       scope = lxc_cmd_get_systemd_scope(conf->name, lxcpath);
+       if (scope) {
+               TRACE("%s:%s is running under systemd-created scope '%s'.  Attaching...", lxcpath, conf->name, scope);
+               if (enter_scope(scope, pid))
+                       TRACE("Successfully entered scope '%s'", scope);
+               else
+                       ERROR("Failed entering scope '%s'", scope);
+       } else {
+               TRACE("%s:%s is not running under a systemd-created scope", lxcpath, conf->name);
+       }
+#endif
        if (unprivileged) {
                ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
                if (ret < 0)
@@ -2229,6 +2604,7 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
 
 struct userns_exec_unified_attach_data {
        const struct lxc_conf *conf;
+       const char *lxcpath;
        int unified_fd;
        int sk_pair[2];
        pid_t pid;
@@ -2239,8 +2615,8 @@ static int cgroup_unified_attach_child_wrapper(void *data)
 {
        struct userns_exec_unified_attach_data *args = data;
 
-       if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
-           args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
+       if (!args->conf || !args->lxcpath || args->unified_fd < 0 ||
+           args->pid <= 0 || args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
                return ret_errno(EINVAL);
 
        close_prot_errno_disarm(args->sk_pair[0]);
@@ -2257,7 +2633,8 @@ static int cgroup_unified_attach_parent_wrapper(void *data)
                return ret_errno(EINVAL);
 
        close_prot_errno_disarm(args->sk_pair[1]);
-       return cgroup_attach_move_into_leaf(args->conf, args->unified_fd,
+       return cgroup_attach_move_into_leaf(args->conf, args->lxcpath,
+                                           args->unified_fd,
                                            &args->sk_pair[0], args->pid,
                                            args->unprivileged);
 }
@@ -2286,6 +2663,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
        ret = cgroup_attach(conf, name, lxcpath, pid);
        if (ret == 0)
                return log_trace(0, "Attached to unified cgroup via command handler");
+       TRACE("__cg_unified_attach: cgroup_attach returned %d", ret);
        if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
                return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
 
@@ -2294,6 +2672,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
        /* not running */
        if (!cgroup)
                return 0;
+       TRACE("lxc_cmd_get_cgroup_path returned %s", cgroup);
 
        path = make_cgroup_path(h, cgroup, NULL);
 
@@ -2307,6 +2686,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
                        .unified_fd     = unified_fd,
                        .pid            = pid,
                        .unprivileged   = am_guest_unpriv(),
+                       .lxcpath        = lxcpath,
                };
 
                ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
@@ -3152,12 +3532,19 @@ static const char *stable_order(const char *controllers)
 #define CGFSNG_LAYOUT_UNIFIED  BIT(1)
 
 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
-                               bool unprivileged)
+                               bool unprivileged, struct lxc_conf *conf)
 {
        __do_free char *cgroup_info = NULL;
        unsigned int layout_mask = 0;
+       int ret;
        char *it;
 
+       ret = unpriv_systemd_create_scope(ops, conf);
+       if (ret < 0)
+               return ret_set_errno(false, ret);
+       else if (ret == 0)
+               TRACE("Entered an unpriv systemd scope");
+
        /*
         * Root spawned containers escape the current cgroup, so use init's
         * cgroups as our base in that case.
@@ -3175,7 +3562,7 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
                __do_free_string_list char **controller_list = NULL,
                                           **delegate = NULL;
                char *line;
-               int dfd, ret, type;
+               int dfd, type;
 
                /* Handle the unified cgroup hierarchy. */
                line = it;
@@ -3185,7 +3572,10 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
                        type = UNIFIED_HIERARCHY;
                        layout_mask |= CGFSNG_LAYOUT_UNIFIED;
 
-                       current_cgroup = current_unified_cgroup(relative, line);
+                       if (conf->cgroup_meta.systemd_scope)
+                               current_cgroup = cgroup_relpath(conf->cgroup_meta.systemd_scope);
+                       if (IS_ERR_OR_NULL(current_cgroup))
+                               current_cgroup = current_unified_cgroup(relative, line);
                        if (IS_ERR(current_cgroup))
                                return PTR_ERR(current_cgroup);
 
@@ -3429,7 +3819,7 @@ static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
         */
        ops->dfd_mnt = dfd;
 
-       ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
+       ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map), conf);
        if (ret < 0)
                return syserror_ret(ret, "Failed to initialize cgroups");
 
@@ -3502,7 +3892,7 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
        return move_ptr(cgfsng_ops);
 }
 
-static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
+static int __unified_attach_fd(const struct lxc_conf *conf, const char *lxcpath, int fd_unified, pid_t pid)
 {
        int ret;
 
@@ -3512,6 +3902,7 @@ static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_
                        .unified_fd     = fd_unified,
                        .pid            = pid,
                        .unprivileged   = am_guest_unpriv(),
+                       .lxcpath        = lxcpath,
                };
 
                ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
@@ -3555,7 +3946,7 @@ static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
                int dfd_con = ctx->fd[idx];
 
                if (unified_cgroup_fd(dfd_con))
-                       ret = __unified_attach_fd(conf, dfd_con, pid);
+                       ret = __unified_attach_fd(conf, lxcpath, dfd_con, pid);
                else
                        ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
                if (ret)
@@ -3580,7 +3971,7 @@ static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name
        if (dfd_unified < 0)
                return ret_errno(ENOSYS);
 
-       return __unified_attach_fd(conf, dfd_unified, pid);
+       return __unified_attach_fd(conf, lxcpath, dfd_unified, pid);
 }
 
 int cgroup_attach(const struct lxc_conf *conf, const char *name,
index 27861f25dbe9d3e5cc00e914069ee1430bb9db9e..946c72e959478ebc022a6cc8f838c1b7a3f373ff 100644 (file)
@@ -89,6 +89,7 @@ static const char *lxc_cmd_str(lxc_cmd_t cmd)
                [LXC_CMD_GET_CGROUP_CTX]                = "get_cgroup_ctx",
                [LXC_CMD_GET_CGROUP_FD]                 = "get_cgroup_fd",
                [LXC_CMD_GET_LIMIT_CGROUP_FD]           = "get_limit_cgroup_fd",
+               [LXC_CMD_GET_SYSTEMD_SCOPE]             = "get_systemd_scope",
        };
 
        if (cmd >= LXC_CMD_MAX)
@@ -1316,6 +1317,55 @@ static int lxc_cmd_get_lxcpath_callback(int fd, struct lxc_cmd_req *req,
        return lxc_cmd_rsp_send_reap(fd, &rsp);
 }
 
+/*
+ * Send LXC_CMD_GET_SYSTEMD_SCOPE to the container @name under @lxcpath.
+ * Returns the response data when the command was delivered and answered
+ * with ret == 0, NULL otherwise.
+ */
+char *lxc_cmd_get_systemd_scope(const char *name, const char *lxcpath)
+{
+       struct lxc_cmd_rr cmd;
+       bool stopped = false;
+
+       lxc_cmd_init(&cmd, LXC_CMD_GET_SYSTEMD_SCOPE);
+
+       if (lxc_cmd(name, &cmd, &stopped, lxcpath, NULL) < 0)
+               return NULL;
+
+       return cmd.rsp.ret == 0 ? cmd.rsp.data : NULL;
+}
+
+/*
+ * Command handler for LXC_CMD_GET_SYSTEMD_SCOPE: reply with the last path
+ * component of conf->cgroup_meta.systemd_scope, i.e. the scope unit name.
+ * The reply carries ret = -EINVAL and no data when no scope is recorded,
+ * the path contains no '/', nothing follows the last '/', or the copy cannot
+ * be allocated.
+ */
+static int lxc_cmd_get_systemd_scope_callback(int fd, struct lxc_cmd_req *req,
+                                            struct lxc_handler *handler,
+                                            struct lxc_async_descr *descr)
+{
+       __do_free char *scope = NULL;
+       const char *slash;
+       struct lxc_cmd_rsp rsp = {
+               .ret = -EINVAL,
+       };
+
+       // cgroup_meta.systemd_scope is the full cgroup path to the scope.
+       // The caller just wants the actual scope name, that is, basename().
+       // (XXX - or do we want the caller to massage it?  I'm undecided)
+       if (handler->conf->cgroup_meta.systemd_scope) {
+               slash = strrchr(handler->conf->cgroup_meta.systemd_scope, '/');
+               // Only a strdup()ed copy may be stored in the auto-freed
+               // pointer; a pointer into the conf-owned string must never
+               // be freed here.
+               if (slash && slash[1] != '\0')
+                       scope = strdup(slash + 1);
+       }
+
+       if (scope) {
+               rsp.ret = 0;
+               rsp.data = scope;
+               rsp.datalen = strlen(scope) + 1;
+       }
+
+       return lxc_cmd_rsp_send_reap(fd, &rsp);
+}
+
 int lxc_cmd_add_state_client(const char *name, const char *lxcpath,
                             lxc_state_t states[static MAX_STATE],
                             int *state_client_fd)
@@ -1900,6 +1950,7 @@ static int lxc_cmd_process(int fd, struct lxc_cmd_req *req,
                [LXC_CMD_GET_CGROUP_CTX]                = lxc_cmd_get_cgroup_ctx_callback,
                [LXC_CMD_GET_CGROUP_FD]                 = lxc_cmd_get_cgroup_fd_callback,
                [LXC_CMD_GET_LIMIT_CGROUP_FD]           = lxc_cmd_get_limit_cgroup_fd_callback,
+               [LXC_CMD_GET_SYSTEMD_SCOPE]             = lxc_cmd_get_systemd_scope_callback,
        };
 
        if (req->cmd >= LXC_CMD_MAX)
index b4aac93a0db089aa663c4d81dba0bc8440043600..2a39748075cdfbf0cac9792198b97fb8d1f107b2 100644 (file)
@@ -52,6 +52,7 @@ typedef enum {
        LXC_CMD_GET_CGROUP_CTX                  = 23,
        LXC_CMD_GET_CGROUP_FD                   = 24,
        LXC_CMD_GET_LIMIT_CGROUP_FD             = 25,
+       LXC_CMD_GET_SYSTEMD_SCOPE               = 26,
        LXC_CMD_MAX,
 } lxc_cmd_t;
 
@@ -115,6 +116,7 @@ __hidden extern char *lxc_cmd_get_config_item(const char *name, const char *item
                                              const char *lxcpath);
 __hidden extern char *lxc_cmd_get_name(const char *hashed_sock);
 __hidden extern char *lxc_cmd_get_lxcpath(const char *hashed_sock);
+__hidden extern char *lxc_cmd_get_systemd_scope(const char *name, const char *lxcpath);
 __hidden extern pid_t lxc_cmd_get_init_pid(const char *name, const char *lxcpath);
 __hidden extern int lxc_cmd_get_init_pidfd(const char *name, const char *lxcpath);
 __hidden extern int lxc_cmd_get_state(const char *name, const char *lxcpath);
index a3293a5315f0c6e4548edc3a05668a4f300e6912..a24fdcc8f43ab43141b79ba6567cb976843fb7a0 100644 (file)
@@ -4831,6 +4831,7 @@ void lxc_conf_free(struct lxc_conf *conf)
        free(conf->cgroup_meta.container_dir);
        free(conf->cgroup_meta.namespace_dir);
        free(conf->cgroup_meta.controllers);
+       free(conf->cgroup_meta.systemd_scope);
        free(conf->shmount.path_host);
        free(conf->shmount.path_cont);
        free(conf);
index ccf59b47efac0934cc646fe50be07fb4ef425190..7dc2f15b603cf98b7514b6aa4203ad8770724114 100644 (file)
@@ -74,6 +74,13 @@ struct lxc_cgroup {
                        char *container_dir;
                        char *namespace_dir;
                        bool relative;
+                       /* If an unpriv user in pure unified-only hierarchy
+                        * starts a container, then we ask systemd to create
+                        * a scope for us, and create the monitor and container
+                        * cgroups under that.
+                        * This will ignore the above things like monitor_dir
+                        */
+                       char *systemd_scope;
                };
        };
 
index 4a3920a7756f5ccd0360cbc23cb0d7ce22a89838..2f95d34e53b836c580d872b70de563946b98e380 100755 (executable)
@@ -24,7 +24,7 @@ mkdir -p $OUT
 apt-get update -qq
 apt-get install --yes --no-install-recommends \
     build-essential docbook2x doxygen git \
-    wget xz-utils systemd-coredump pkgconf
+    wget xz-utils systemd-coredump pkgconf libsystemd-dev
 apt-get remove --yes lxc-utils liblxc-common liblxc1 liblxc-dev
 
 # make sure we have a new enough meson version