/* SPDX-License-Identifier: LGPL-2.1+ */
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE 1
-#endif
+#include "config.h"
+
#include <arpa/inet.h>
#include <dirent.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
+#include "conf.h"
#include "af_unix.h"
#include "caps.h"
#include "cgroups/cgroup.h"
#include "compiler.h"
-#include "conf.h"
-#include "config.h"
#include "confile.h"
#include "confile_utils.h"
#include "error.h"
#include "mount_utils.h"
#include "namespace.h"
#include "network.h"
+#include "open_utils.h"
#include "parse.h"
#include "process_utils.h"
#include "ringbuf.h"
#if HAVE_OPENPTY
#include <pty.h>
#else
-#include <../include/openpty.h>
+#include "openpty.h"
#endif
#if HAVE_LIBCAP
#include <sys/capability.h>
#endif
-#ifndef HAVE_STRLCAT
-#include "include/strlcat.h"
+#if !HAVE_STRLCAT
+#include "strlcat.h"
#endif
#if IS_BIONIC
-#include <../include/lxcmntent.h>
+#include "lxcmntent.h"
#else
#include <mntent.h>
#endif
-#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
-#include <../include/prlimit.h>
+#if !HAVE_PRLIMIT && HAVE_PRLIMIT64
+#include "prlimit.h"
#endif
-#ifndef HAVE_STRLCPY
-#include "include/strlcpy.h"
+#if !HAVE_STRLCPY
+#include "strlcpy.h"
#endif
-#ifndef HAVE_STRCHRNUL
-#include "include/strchrnul.h"
+#if !HAVE_STRCHRNUL
+#include "strchrnul.h"
#endif
lxc_log_define(conf, lxc);
struct caps_opt {
char *name;
- int value;
+ __u32 value;
};
struct limit_opt {
{ "rshared", 0, true, MS_SHARED, MS_SHARED | MS_REC },
{ "rslave", 0, true, MS_SLAVE, MS_SLAVE | MS_REC },
{ "runbindable", 0, true, MS_UNBINDABLE, MS_UNBINDABLE | MS_REC },
- { NULL, 0, 0 },
+ { NULL, 0, false, 0, 0 },
};
static struct caps_opt caps_opt[] = {
PROTECT_LOOKUP_BENEATH,
S_IWUSR | S_IRUSR);
if (fd_pin < 0) {
- if (errno == EROFS) {
+ if (errno == EROFS)
return log_trace_errno(0, EROFS, "Not pinning on read-only filesystem");
- }
return syserror("Failed to pin rootfs");
}
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL, false },
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL, false },
+ /* /proc/sys is used as a temporary staging directory for the read-write sysfs mount and unmounted after binding net */
+ { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/proc/sys", "sysfs", MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
- { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
- { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
+ { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/proc/sys/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
+ { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/proc/sys", NULL, NULL, 0, NULL, false },
{ 0, 0, NULL, NULL, NULL, 0, NULL, false }
};
struct lxc_conf *conf = handler->conf;
return syserror_set(-ENOMEM, "Failed to create source path");
}
- if (!default_mounts[i].destination)
- return syserror_set(-EINVAL, "BUG: auto mounts destination %d was NULL", i);
-
if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
continue;
}
+ if (!default_mounts[i].destination) {
+ ret = umount2(source, MNT_DETACH);
+ if (ret < 0)
+ return log_error_errno(-1, errno,
+ "Failed to unmount \"%s\"",
+ source);
+ TRACE("Unmounted automount \"%s\"", source);
+ continue;
+ }
+
/* will act like strdup if %r is not present */
destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
if (!destination)
* container can't remount it read-write.
*/
if ((cg_flags == LXC_AUTO_CGROUP_NOSPEC) || (cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC)) {
- int has_sys_admin = 0;
-
- if (!lxc_list_empty(&conf->keepcaps))
- has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
- else
- has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
-
if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
- cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
+ cg_flags = has_cap(CAP_SYS_ADMIN, conf)
+ ? LXC_AUTO_CGROUP_RW
+ : LXC_AUTO_CGROUP_MIXED;
else
- cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
+ cg_flags = has_cap(CAP_SYS_ADMIN, conf)
+ ? LXC_AUTO_CGROUP_FULL_RW
+ : LXC_AUTO_CGROUP_FULL_MIXED;
}
if (flags & LXC_AUTO_CGROUP_FORCE)
static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
{
- for (int i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
+ for (size_t i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
int ret;
struct stat s;
const struct dev_symlinks *d = &dev_symlinks[i];
}
/* Build a space-separate list of ptys to pass to systemd. */
-static bool append_ttyname(char **pp, char *name)
+static bool append_ttyname(struct lxc_tty_info *ttys, char *tty_name) /* Append @tty_name to ttys->tty_names, separated by a space when a list already exists; returns false on a NULL name or failed realloc(). */
{
- char *p;
+ char *tty_names, *buf;
size_t size;
- if (!*pp) {
- *pp = zalloc(strlen(name) + strlen("container_ttys=") + 1);
- if (!*pp)
- return false;
+ if (!tty_name)
+ return false;
- sprintf(*pp, "container_ttys=%s", name);
- return true;
- }
+ size = strlen(tty_name) + 1; /* the new name plus the NUL terminator */
+ if (ttys->tty_names)
+ size += strlen(ttys->tty_names) + 1; /* the existing list plus the separating space */
- size = strlen(*pp) + strlen(name) + 2;
- p = realloc(*pp, size);
- if (!p)
+ buf = realloc(ttys->tty_names, size);
+ if (!buf)
return false;
+ tty_names = buf;
- *pp = p;
- (void)strlcat(p, " ", size);
- (void)strlcat(p, name, size);
-
+ if (ttys->tty_names) /* only tests whether a list existed before realloc(); the old pointer is not dereferenced */
+ (void)strlcat(buf, " ", size);
+ else
+ buf[0] = '\0'; /* fresh buffer: start from an empty string so strlcat() has something to append to */
+ (void)strlcat(buf, tty_name, size);
+ ttys->tty_names = tty_names; /* publish the resized buffer only after it has been filled */
return true;
}
PROTECT_LOOKUP_BENEATH,
0);
if (fd < 0) {
- if (!IN_SET(errno, ENXIO, EEXIST))
+ if (errno != ENXIO && errno != EEXIST)
return syserror("Failed to create \"%d/\%s\"", dfd, path);
SYSINFO("Failed to create \"%d/\%s\"", dfd, path);
if (!conf->rootfs.path)
return 0;
- for (int i = 0; i < ttys->max; i++) {
+ for (size_t i = 0; i < ttys->max; i++) {
__do_close int fd_to = -EBADF;
struct lxc_terminal_info *tty = &ttys->tty[i];
char *tty_name, *tty_path;
ret = strnprintf(rootfs->buf, sizeof(rootfs->buf),
- "/dev/%s/tty%d", ttydir, i + 1);
+ "/dev/%s/tty%zu", ttydir, i + 1);
if (ret < 0)
return ret_errno(-EIO);
rootfs->dfd_dev, tty_name,
rootfs->dfd_dev, tty_path);
} else {
- ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "tty%d", i + 1);
+ ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "tty%zu", i + 1);
if (ret < 0)
return ret_errno(-EIO);
DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
}
- if (!append_ttyname(&conf->ttys.tty_names, tty->name))
+ if (!append_ttyname(&conf->ttys, tty->name))
return log_error(-1, "Error setting up container_ttys string");
}
return -ENOMEM;
for (size_t i = 0; i < conf->ttys.max; i++) {
- int pty_nr = -1;
struct lxc_terminal_info *tty = &ttys->tty[i];
ret = lxc_devpts_terminal(conf->devpts_fd, &tty->ptx,
- &tty->pty, &pty_nr, false);
+ &tty->pty, &tty->pty_nr, false);
if (ret < 0) {
conf->ttys.max = i;
return syserror_set(-ENOTTY, "Failed to create tty %zu", i);
}
+ ret = strnprintf(tty->name, sizeof(tty->name), "pts/%d", tty->pty_nr);
+ if (ret < 0)
+ return syserror("Failed to create tty %zu", i);
+
DEBUG("Created tty with ptx fd %d and pty fd %d and index %d",
- tty->ptx, tty->pty, pty_nr);
+ tty->ptx, tty->pty, tty->pty_nr);
tty->busy = -1;
}
if (!ttys || !ttys->tty)
return;
- for (int i = 0; i < ttys->max; i++) {
+ for (size_t i = 0; i < ttys->max; i++) {
struct lxc_terminal_info *tty = &ttys->tty[i];
close_prot_errno_disarm(tty->ptx);
close_prot_errno_disarm(tty->pty);
static int __lxc_send_ttys_to_parent(struct lxc_handler *handler)
{
- int i;
int ret = -1;
struct lxc_conf *conf = handler->conf;
struct lxc_tty_info *ttys = &conf->ttys;
if (ttys->max == 0)
return 0;
- for (i = 0; i < ttys->max; i++) {
+ for (size_t i = 0; i < ttys->max; i++) {
int ttyfds[2];
struct lxc_terminal_info *tty = &ttys->tty[i];
SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
goto on_error;
}
+ TRACE("Set \"container_ttys=%s\"", conf->ttys.tty_names);
}
return 0;
static int lxc_fill_autodev(struct lxc_rootfs *rootfs)
{
- int i, ret;
+ int ret;
mode_t cmask;
int use_mknod = LXC_DEVNODE_MKNOD;
INFO("Populating \"/dev\"");
cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
- for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
+ for (size_t i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
const struct lxc_device_node *device = &lxc_devices[i];
if (use_mknod >= LXC_DEVNODE_MKNOD) {
return log_trace(0, "Container uses separate rootfs. Opened container's rootfs");
}
+static bool lxc_rootfs_overmounted(struct lxc_rootfs *rootfs) /* Re-open the rootfs location by path and report true when it no longer matches rootfs->dfd_mnt. */
+{
+ __do_close int fd_rootfs = -EBADF;
+
+ if (!rootfs->path) /* no rootfs path configured: the slash directory is re-opened instead of rootfs->mount */
+ fd_rootfs = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
+ else
+ fd_rootfs = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
+ if (fd_rootfs < 0)
+ return true; /* a failed re-open is reported the same way as a changed rootfs */
+
+ if (!same_file_lax(rootfs->dfd_mnt, fd_rootfs)) /* presumably compares the identity of the files behind both descriptors; same_file_lax() is defined elsewhere - TODO confirm */
+ return syswarn_ret(true, "Rootfs seems to have changed after setting up mounts");
+
+ return false;
+}
+
static int lxc_chroot(const struct lxc_rootfs *rootfs)
{
__do_free char *nroot = NULL;
return log_error_errno(-errno, errno, "Failed to enter old root directory");
/*
- * Make fd_oldroot a depedent mount to make sure our umounts don't
- * propagate to the host.
+ * Unprivileged containers will have had all their mounts turned into
+ * dependent mounts when the container was created. But for privileged
+ * containers we need to turn the old root mount tree into a dependent
+ * mount tree to prevent propagating mounts and umounts into the host
+ * mount namespace.
*/
ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
if (ret < 0)
if (ret < 0)
return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
+ /*
+ * Finally, we turn the rootfs into a shared mount. Note that this
+ * doesn't reestablish mount propagation with the host's mount
+ * namespace. Instead we'll create a new peer group.
+ *
+ * We're doing this because most workloads do rely on the rootfs being
+ * a shared mount. For example, systemd daemons like systemd-udevd run in
+ * their own mount namespace. Their mount namespace has been made a
+ * dependent mount (MS_SLAVE) with the host rootfs as its dominating
+ * mount. This means new mounts on the host propagate into the
+ * respective services.
+ *
+ * This is broken if we leave the container's rootfs a dependent mount.
+ * In which case both the container's rootfs and the service's rootfs
+ * will be dependent mounts with the host's rootfs as their dominating
+ * mount. So if you were to mount over the rootfs from the host it
+ * would not just propagate into the container's mount namespace it
+ * would also propagate into the service. That's nonsense semantics for
+ * nearly all relevant use-cases. Instead, establish the container's
+ * rootfs as a separate peer group mirroring the behavior on the host.
+ */
+ ret = mount("", ".", "", MS_SHARED | MS_REC, NULL);
+ if (ret < 0)
+ return log_error_errno(-errno, errno, "Failed to turn new root mount tree into shared mount tree");
+
TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
return 0;
}
return syserror("Failed to create path");
close_prot_errno_disarm(conf->devpts_fd);
- return umount2(rootfs->buf, MNT_DETACH);
+ (void)umount2(rootfs->buf, MNT_DETACH);
+ return 0;
}
static int lxc_send_devpts_to_parent(struct lxc_handler *handler)
static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
{
- ssize_t ret;
+ size_t ret;
/* If '=' is contained in opt, the option must go into data. */
if (!strchr(opt, '=')) {
if (strlen(*data)) {
ret = strlcat(*data, ",", size);
- if (ret < 0)
+ if (ret >= size)
return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
}
ret = strlcat(*data, opt, size);
- if (ret < 0)
+ if (ret >= size)
return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
return 0;
return 0;
}
- if (mo->flag == ~0)
+ if (mo->flag == (__u64)~0)
return log_info(0, "Ignoring %s mount option", mo->name);
if (mo->clear) {
"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
-FILE *make_anonymous_mount_file(struct lxc_list *mount,
+FILE *make_anonymous_mount_file(const struct list_head *mount_entries,
bool include_nesting_helpers)
{
__do_close int fd = -EBADF;
FILE *f;
int ret;
- char *mount_entry;
- struct lxc_list *iterator;
+ struct string_entry *entry;
fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
if (fd < 0) {
TRACE("Created temporary mount file");
}
- lxc_list_for_each (iterator, mount) {
+ list_for_each_entry(entry, mount_entries, head) {
size_t len;
- mount_entry = iterator->elem;
- len = strlen(mount_entry);
+ len = strlen(entry->val);
- ret = lxc_write_nointr(fd, mount_entry, len);
- if (ret != len)
+ ret = lxc_write_nointr(fd, entry->val, len);
+ if (ret < 0 || (size_t)ret != len)
return NULL;
ret = lxc_write_nointr(fd, "\n", 1);
}
static int setup_mount_entries(const struct lxc_conf *conf,
- struct lxc_rootfs *rootfs, struct lxc_list *mount,
+ struct lxc_rootfs *rootfs,
const char *lxc_name, const char *lxc_path)
{
__do_fclose FILE *f = NULL;
- f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
+ f = make_anonymous_mount_file(&conf->mount_entries, conf->lsm_aa_allow_nesting);
if (!f)
return -1;
struct lxc_mount_options opts = {};
int dfd_from;
const char *source_relative, *target_relative;
- struct lxc_mount_attr attr = {};
+ struct mount_attr attr = {};
ret = parse_lxc_mount_attrs(&opts, mntent.mnt_opts);
if (ret < 0)
/* Set propagation mount options. */
if (opts.attr.propagation) {
- attr = (struct lxc_mount_attr) {
- attr.propagation = opts.attr.propagation,
+ attr = (struct mount_attr) {
+ .propagation = opts.attr.propagation,
};
ret = mount_setattr(fd_from,
dfd_from = rootfs->dfd_mnt;
else
dfd_from = rootfs->dfd_host;
- fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_WITH_SYMLINKS, 0);
+ fd_to = open_at(dfd_from, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_XDEV, 0);
if (fd_to < 0) {
if (opts.optional) {
TRACE("Skipping optional idmapped mount");
int fret = -1;
struct lxc_conf *conf = handler->conf;
const char *fstab = conf->fstab;
- struct lxc_list *mount = &conf->mount_list;
int ret;
- f_entries = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
+ f_entries = make_anonymous_mount_file(&conf->mount_entries,
+ conf->lsm_aa_allow_nesting);
if (!f_entries) {
SYSERROR("Failed to create anonymous mount file");
goto out;
return fret;
}
-static int parse_cap(const char *cap)
+int parse_cap(const char *cap_name, __u32 *cap) /* Translate @cap_name into a capability number stored through @cap: 0 on success, -1 if it cannot be resolved, -2 for the special name none. */
{
- size_t i;
- int capid = -1;
size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
- char *ptr = NULL;
+ int ret;
+ unsigned int res;
+ __u32 last_cap;
- if (strequal(cap, "none"))
+ if (strequal(cap_name, "none")) /* signalled with -2 below; the output parameter is left untouched */
return -2;
- for (i = 0; i < end; i++) {
- if (!strequal(cap, caps_opt[i].name))
+ for (size_t i = 0; i < end; i++) {
+ if (!strequal(cap_name, caps_opt[i].name))
continue;
- capid = caps_opt[i].value;
- break;
+ *cap = caps_opt[i].value; /* symbolic name matched an entry of the caps_opt table */
+ return 0;
}
- if (capid < 0) {
- /* Try to see if it's numeric, so the user may specify
- * capabilities that the running kernel knows about but we
- * don't
- */
- errno = 0;
- capid = strtol(cap, &ptr, 10);
- if (!ptr || *ptr != '\0' || errno != 0)
- /* not a valid number */
- capid = -1;
- else if (capid > lxc_caps_last_cap())
- /* we have a number but it's not a valid
- * capability */
- capid = -1;
- }
-
- return capid;
+ /*
+ * Try to see if it's numeric, so the user may specify
+ * capabilities that the running kernel knows about but we
+ * don't.
+ */
+ ret = lxc_safe_uint(cap_name, &res);
+ if (ret < 0)
+ return -1;
+
+ ret = lxc_caps_last_cap(&last_cap); /* upper bound used below to reject out-of-range numbers */
+ if (ret)
+ return -1;
+
+ if ((__u32)res > last_cap)
+ return -1;
+
+ *cap = (__u32)res; /* numeric input accepted as-is once it passed the range check */
+ return 0;
}
-int in_caplist(int cap, struct lxc_list *caps)
+bool has_cap(__u32 cap, struct lxc_conf *conf) /* Return true if @cap is listed while conf->caps.keep is set, or is not listed while it is unset. */
{
- int capid;
- struct lxc_list *iterator;
+ bool cap_in_list = false;
+ struct cap_entry *cap_entry;
+
+ list_for_each_entry(cap_entry, &conf->caps.list, head) {
+ if (cap_entry->cap != cap)
+ continue;
- lxc_list_for_each (iterator, caps) {
- capid = parse_cap(iterator->elem);
- if (capid == cap)
- return 1;
+ cap_in_list = true; /* the loop keeps scanning after a match; the result cannot change any more */
}
- return 0;
+ /* Keep mode: the capability is retained only if it is listed. */
+ if (conf->caps.keep)
+ return cap_in_list;
+
+ /* Drop mode: the capability is retained unless it is listed. */
+ return !cap_in_list;
}
-static int setup_caps(struct lxc_list *caps)
+static int capabilities_deny(struct lxc_conf *conf) /* Issue one PR_CAPBSET_DROP prctl() for every entry of conf->caps.list; returns 0 once all calls succeeded. */
{
- int capid;
- char *drop_entry;
- struct lxc_list *iterator;
+ struct cap_entry *cap;
- lxc_list_for_each (iterator, caps) {
+ list_for_each_entry(cap, &conf->caps.list, head) { /* entries already carry a resolved number (cap->cap) and a name (cap->cap_name), so nothing is parsed here */
int ret;
- drop_entry = iterator->elem;
-
- capid = parse_cap(drop_entry);
- if (capid < 0)
- return log_error(-1, "unknown capability %s", drop_entry);
-
- ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
+ ret = prctl(PR_CAPBSET_DROP, prctl_arg(cap->cap), prctl_arg(0),
prctl_arg(0), prctl_arg(0));
if (ret < 0)
- return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
- DEBUG("Dropped %s (%d) capability", drop_entry, capid);
+ return syserror("Failed to remove %s capability", cap->cap_name); /* the first failure aborts the loop; earlier drops are not undone */
+
+ DEBUG("Dropped %s (%d) capability", cap->cap_name, cap->cap); /* NOTE(review): cap->cap is presumably a __u32 but is printed with %d - confirm against struct cap_entry */
}
DEBUG("Capabilities have been setup");
return 0;
}
-static int dropcaps_except(struct lxc_list *caps)
+static int capabilities_allow(struct lxc_conf *conf)
{
- __do_free int *caplist = NULL;
- int i, capid, numcaps;
- char *keep_entry;
- struct lxc_list *iterator;
+ __do_free __u32 *keep_bits = NULL;
+ int ret;
+ struct cap_entry *cap;
+ __u32 last_cap, nr_u32;
- numcaps = lxc_caps_last_cap() + 1;
- if (numcaps <= 0 || numcaps > 200)
- return -1;
- TRACE("Found %d capabilities", numcaps);
+ ret = lxc_caps_last_cap(&last_cap);
+ if (ret || last_cap > 200)
+ return ret_errno(EINVAL);
- /* caplist[i] is 1 if we keep capability i */
- caplist = must_realloc(NULL, numcaps * sizeof(int));
- memset(caplist, 0, numcaps * sizeof(int));
+ TRACE("Found %d capabilities", last_cap);
- lxc_list_for_each (iterator, caps) {
- keep_entry = iterator->elem;
+ nr_u32 = BITS_TO_LONGS(last_cap);
+ keep_bits = zalloc(nr_u32 * sizeof(__u32));
+ if (!keep_bits)
+ return ret_errno(ENOMEM);
- capid = parse_cap(keep_entry);
- if (capid == -2)
+ list_for_each_entry(cap, &conf->caps.list, head) {
+ if (cap->cap > last_cap)
continue;
- if (capid < 0)
- return log_error(-1, "Unknown capability %s", keep_entry);
-
- DEBUG("Keep capability %s (%d)", keep_entry, capid);
- caplist[capid] = 1;
+ set_bit(cap->cap, keep_bits);
+ DEBUG("Keeping %s (%d) capability", cap->cap_name, cap->cap);
}
- for (i = 0; i < numcaps; i++) {
- int ret;
-
- if (caplist[i])
+ for (__u32 cap_bit = 0; cap_bit <= last_cap; cap_bit++) {
+ if (is_set(cap_bit, keep_bits))
continue;
- ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
+ ret = prctl(PR_CAPBSET_DROP, prctl_arg(cap_bit), prctl_arg(0),
prctl_arg(0), prctl_arg(0));
if (ret < 0)
- return log_error_errno(-1, errno, "Failed to remove capability %d", i);
+ return syserror("Failed to remove capability %d", cap_bit);
+
+ TRACE("Dropped capability %d", cap_bit);
}
DEBUG("Capabilities have been setup");
char filename[PATH_MAX] = {0};
struct lxc_sysctl *sysctl, *nsysctl;
- if (!list_empty(&conf->sysctls))
+ if (list_empty(&conf->sysctls))
return 0;
list_for_each_entry_safe(sysctl, nsysctl, &conf->sysctls, head) {
if (ret < 0)
return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
sysctl->key, sysctl->value);
+
+ TRACE("Setting %s to %s", filename, sysctl->value);
}
+ TRACE("Setup /proc/sys settings");
return 0;
}
char filename[PATH_MAX] = {0};
struct lxc_proc *proc;
- if (!list_empty(&conf->procs))
+ if (list_empty(&conf->procs))
return 0;
list_for_each_entry(proc, &conf->procs, head) {
if (ret < 0)
return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s",
proc->filename, proc->value);
+
+ TRACE("Setting %s to %s", filename, proc->value);
}
TRACE("Setup /proc/%d settings", pid);
/* Block ("allowlist") all devices by default. */
new->bpf_devices.list_type = LXC_BPF_DEVICE_CGROUP_ALLOWLIST;
INIT_LIST_HEAD(&(new->bpf_devices).devices);
- lxc_list_init(&new->mount_list);
- lxc_list_init(&new->caps);
- lxc_list_init(&new->keepcaps);
+ INIT_LIST_HEAD(&new->mount_entries);
+ INIT_LIST_HEAD(&new->caps.list);
INIT_LIST_HEAD(&new->id_map);
new->root_nsuid_map = NULL;
new->root_nsgid_map = NULL;
INIT_LIST_HEAD(&new->procs);
new->hooks_version = 0;
for (i = 0; i < NUM_LXC_HOOKS; i++)
- lxc_list_init(&new->hooks[i]);
- lxc_list_init(&new->groups);
+ INIT_LIST_HEAD(&new->hooks[i]);
+ INIT_LIST_HEAD(&new->groups);
INIT_LIST_HEAD(&new->state_clients);
new->lsm_aa_profile = NULL;
- lxc_list_init(&new->lsm_aa_raw);
+ INIT_LIST_HEAD(&new->lsm_aa_raw);
new->lsm_se_context = NULL;
new->lsm_se_keyring_context = NULL;
new->keyring_disable_session = false;
new->transient_procfs_mnt = false;
new->shmount.path_host = NULL;
new->shmount.path_cont = NULL;
+ new->sched_core = false;
+ new->sched_core_cookie = INVALID_SCHED_CORE_COOKIE;
/* if running in a new user namespace, init and COMMAND
* default to running as UID/GID 0 when using lxc-execute */
return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
ret = lxc_write_nointr(fd, buf, buf_size);
- if (ret != buf_size)
+ if (ret < 0 || (size_t)ret != buf_size)
return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
idtype == ID_TYPE_UID ? 'u' : 'g', path);
int lxc_map_ids(struct list_head *idmap, pid_t pid)
{
- int hostuid, hostgid, fill, left;
+ int fill, left;
+ uid_t hostuid;
+ gid_t hostgid;
char u_or_g;
char *pos;
char cmd_output[PATH_MAX];
/* Check if we really need to use newuidmap and newgidmap.
* If the user is only remapping their own {g,u}id, we don't need it.
*/
- if (use_shadow && list_len(idmap) == 2) {
+ if (use_shadow && list_len(map, idmap, head) == 2) {
use_shadow = false;
list_for_each_entry(map, idmap, head) {
if (map->idtype == ID_TYPE_UID && map->range == 1 &&
return log_error_errno(-errno, errno, "Failed to create %d(proc)", rootfs->dfd_mnt);
goto domount;
- } else if (link_len >= sizeof(link)) {
+ } else if ((size_t)link_len >= sizeof(link)) {
return log_error_errno(-EIO, EIO, "Truncated link target");
}
link[link_len] = '\0';
static bool verify_start_hooks(struct lxc_conf *conf)
{
char path[PATH_MAX];
- struct lxc_list *it;
+ struct string_entry *hook;
- lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
+ list_for_each_entry(hook, &conf->hooks[LXCHOOK_START], head) {
int ret;
- char *hookname = it->elem;
+ char *hookname = hook->val;
ret = strnprintf(path, sizeof(path), "%s%s",
conf->rootfs.path ? conf->rootfs.mount : "",
for (;;) {
__do_close int fd_from = -EBADF, fd_userns = -EBADF;
- struct lxc_mount_attr attr = {};
+ struct mount_attr attr = {};
struct lxc_mount_options opts = {};
ssize_t ret;
return syserror("Failed to receive idmapped mount file descriptors from child");
if (fd_from < 0 || fd_userns < 0)
- return log_trace(0, "Finished receiving idmapped mount file descriptors from child");
+ return log_trace(0, "Finished receiving idmapped mount file descriptors (%d | %d) from child", fd_from, fd_userns);
attr.attr_set = MOUNT_ATTR_IDMAP;
attr.userns_fd = fd_userns;
if (!info_new->tty)
return ret_errno(ENOMEM);
- for (int i = 0; i < ttys_max; i++) {
+ for (size_t i = 0; i < ttys_max; i++) {
terminal_info = &info_new->tty[i];
terminal_info->busy = -1;
+ terminal_info->pty_nr = -1;
terminal_info->ptx = -EBADF;
terminal_info->pty = -EBADF;
}
- for (int i = 0; i < ttys_max; i++) {
+ for (size_t i = 0; i < ttys_max; i++) {
int ptx = -EBADF, pty = -EBADF;
ret = lxc_abstract_unix_recv_two_fds(sock, &ptx, &pty);
return 0;
}
+static int setup_capabilities(struct lxc_conf *conf) /* Apply the configured capability list through capabilities_allow() or capabilities_deny(), selected by conf->caps.keep. */
+{
+ int ret;
+
+ if (conf->caps.keep)
+ ret = capabilities_allow(conf); /* keep mode */
+ else
+ ret = capabilities_deny(conf); /* drop mode */
+ if (ret < 0)
+ return syserror_ret(ret, "Failed to %s capabilities", conf->caps.keep ? "allow" : "deny"); /* the helper's negative return value is handed on to the caller */
+
+ return 0;
+}
+
+static int make_shmount_dependent_mount(const struct lxc_conf *conf) /* Recursively mark conf->shmount.path_cont as MS_SLAVE when conf->auto_mounts has a bit of LXC_AUTO_SHMOUNTS_MASK set. */
+{
+ if (!(conf->auto_mounts & LXC_AUTO_SHMOUNTS_MASK)) /* no bit of the mask is set: nothing to do */
+ return 0;
+
+ return mount(NULL, conf->shmount.path_cont, NULL, MS_REC | MS_SLAVE, 0); /* the result of mount() is returned unchanged */
+}
+
int lxc_setup(struct lxc_handler *handler)
{
int ret;
if (ret < 0)
return log_error(-1, "Failed to setup mounts");
- if (!lxc_list_empty(&lxc_conf->mount_list)) {
- ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
- &lxc_conf->mount_list, name, lxcpath);
+ if (!list_empty(&lxc_conf->mount_entries)) {
+ ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs, name, lxcpath);
if (ret < 0)
return log_error(-1, "Failed to setup mount entries");
}
if (ret < 0)
return log_error(-1, "Failed to run mount hooks");
+ if (lxc_rootfs_overmounted(&lxc_conf->rootfs))
+ return log_error(-1, "Rootfs overmounted");
+
if (lxc_conf->autodev > 0) {
ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
if (ret < 0)
if (ret < 0)
return log_error(-1, "Failed to pivot root into rootfs");
+ ret = make_shmount_dependent_mount(lxc_conf);
+ if (ret < 0)
+ return log_error(-1, "Failed to turn mount tunnel \"%s\" into dependent mount",
+ lxc_conf->shmount.path_cont);
+
/* Setting the boot-id is best-effort for now. */
if (lxc_conf->autodev > 0)
(void)lxc_setup_boot_id();
if (ret < 0)
return log_error(-1, "Failed to setup sysctl parameters");
- if (!lxc_list_empty(&lxc_conf->keepcaps)) {
- if (!lxc_list_empty(&lxc_conf->caps))
- return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
-
- if (dropcaps_except(&lxc_conf->keepcaps))
- return log_error(-1, "Failed to keep capabilities");
- } else if (setup_caps(&lxc_conf->caps)) {
- return log_error(-1, "Failed to drop capabilities");
- }
+ ret = setup_capabilities(lxc_conf);
+ if (ret < 0)
+ return log_error(-1, "Failed to setup capabilities");
put_lxc_rootfs(&handler->conf->rootfs, true);
NOTICE("The container \"%s\" is set up", name);
int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
char *argv[])
{
- struct lxc_list *it;
int which;
+ struct string_entry *entry;
for (which = 0; which < NUM_LXC_HOOKS; which ++) {
if (strequal(hookname, lxchook_names[which]))
if (which >= NUM_LXC_HOOKS)
return -1;
- lxc_list_for_each (it, &conf->hooks[which]) {
+ list_for_each_entry(entry, &conf->hooks[which], head) {
int ret;
- char *hook = it->elem;
+ char *hook = entry->val;
ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
hookname, argv);
int lxc_clear_config_caps(struct lxc_conf *c)
{
- struct lxc_list *it, *next;
+ struct cap_entry *cap, *ncap; /* ncap is the second cursor of the _safe iterator, needed because the current entry is freed inside the loop */
- lxc_list_for_each_safe (it, &c->caps, next) {
- lxc_list_del(it);
- free(it->elem);
- free(it);
+ list_for_each_entry_safe(cap, ncap, &c->caps.list, head) {
+ list_del(&cap->head);
+ free(cap->cap_name); /* the entry owns its name string; release it before the entry itself */
+ free(cap);
}
- lxc_list_init(&c->caps);
+ c->caps.keep = false; /* clearing the list also switches the configuration back to drop mode */
+ INIT_LIST_HEAD(&c->caps.list); /* re-initialise the head after all entries were unlinked and freed */
return 0;
}
return lxc_free_idmap(&c->id_map);
}
-int lxc_clear_config_keepcaps(struct lxc_conf *c)
-{
- struct lxc_list *it, *next;
-
- lxc_list_for_each_safe (it, &c->keepcaps, next) {
- lxc_list_del(it);
- free(it->elem);
- free(it);
- }
-
- lxc_list_init(&c->keepcaps);
- return 0;
-}
-
int lxc_clear_namespace(struct lxc_conf *c)
{
for (int i = 0; i < LXC_NS_MAX; i++)
int lxc_clear_groups(struct lxc_conf *c)
{
- struct lxc_list *it, *next;
+ struct string_entry *entry, *nentry; /* nentry is the second cursor of the _safe iterator, needed because entry is freed inside the loop */
- lxc_list_for_each_safe (it, &c->groups, next) {
- lxc_list_del(it);
- free(it->elem);
- free(it);
+ list_for_each_entry_safe(entry, nentry, &c->groups, head) {
+ list_del(&entry->head);
+ free(entry->val); /* release the stored value first, then the entry that held it */
+ free(entry);
}
- lxc_list_init(&c->groups);
+ INIT_LIST_HEAD(&c->groups); /* re-initialise the head after all entries were unlinked and freed */
return 0;
}
int lxc_clear_mount_entries(struct lxc_conf *c)
{
- struct lxc_list *it, *next;
+ struct string_entry *entry, *nentry; /* nentry is the second cursor of the _safe iterator, needed because entry is freed inside the loop */
- lxc_list_for_each_safe (it, &c->mount_list, next) {
- lxc_list_del(it);
- free(it->elem);
- free(it);
+ list_for_each_entry_safe(entry, nentry, &c->mount_entries, head) {
+ list_del(&entry->head);
+ free(entry->val); /* release the stored mount entry string first, then the entry that held it */
+ free(entry);
}
- lxc_list_init(&c->mount_list);
+ INIT_LIST_HEAD(&c->mount_entries); /* re-initialise the head after all entries were unlinked and freed */
return 0;
}
int lxc_clear_hooks(struct lxc_conf *c, const char *key)
{
- struct lxc_list *it, *next;
const char *k = NULL;
bool all = false, done = false;
+ struct string_entry *entry, *nentry;
if (strequal(key, "lxc.hook"))
all = true;
for (int i = 0; i < NUM_LXC_HOOKS; i++) {
if (all || strequal(k, lxchook_names[i])) {
- lxc_list_for_each_safe (it, &c->hooks[i], next) {
- lxc_list_del(it);
- free(it->elem);
- free(it);
+ list_for_each_entry_safe(entry, nentry, &c->hooks[i], head) {
+ list_del(&entry->head);
+ free(entry->val);
+ free(entry);
}
- lxc_list_init(&c->hooks[i]);
-
+ INIT_LIST_HEAD(&c->hooks[i]);
done = true;
}
}
int lxc_clear_apparmor_raw(struct lxc_conf *c)
{
- struct lxc_list *it, *next;
+ struct string_entry *entry, *nentry; /* nentry is the second cursor of the _safe iterator, needed because entry is freed inside the loop */
- lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
- lxc_list_del(it);
- free(it->elem);
- free(it);
+ list_for_each_entry_safe(entry, nentry, &c->lsm_aa_raw, head) {
+ list_del(&entry->head);
+ free(entry->val); /* release the stored value first, then the entry that held it */
+ free(entry);
}
- lxc_list_init(&c->lsm_aa_raw);
+ INIT_LIST_HEAD(&c->lsm_aa_raw); /* re-initialise the head after all entries were unlinked and freed */
return 0;
}
free(conf->lsm_se_keyring_context);
lxc_seccomp_free(&conf->seccomp);
lxc_clear_config_caps(conf);
- lxc_clear_config_keepcaps(conf);
lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
lxc_clear_cgroups_devices(conf);
free(conf->cgroup_meta.container_dir);
free(conf->cgroup_meta.namespace_dir);
free(conf->cgroup_meta.controllers);
+ free(conf->cgroup_meta.systemd_scope);
free(conf->shmount.path_host);
free(conf->shmount.path_cont);
free(conf);
close_prot_errno_disarm(sock_fds[0]);
- if (!lxc_switch_uid_gid(0, 0))
+ if (!lxc_drop_groups() && errno != EPERM)
+ _exit(EXIT_FAILURE);
+
+ ret = setresgid(0, 0, 0);
+ if (ret < 0) {
+ SYSERROR("Failed to setresgid(0, 0, 0)");
_exit(EXIT_FAILURE);
+ }
- if (!lxc_drop_groups())
+ ret = setresuid(0, 0, 0);
+ if (ret < 0) {
+ SYSERROR("Failed to setresuid(0, 0, 0)");
_exit(EXIT_FAILURE);
+ }
ret = fchown(target_fd, 0, st.st_gid);
if (ret) {
/* Wait for child to finish. */
if (pid < 0)
+ return log_error(-1, "Failed to create child process");
+
+ if (!wait_exited(pid))
return -1;
- return wait_for_pid(pid);
+ return 0;
}
/* not thread-safe, do not use from api without first forking */
__do_free char *buf = NULL;
struct passwd pwent;
struct passwd *pwentp = NULL;
- size_t bufsize;
+ ssize_t bufsize;
int ret;
bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
- if (bufsize == -1)
+ if (bufsize < 0)
bufsize = 1024;
buf = zalloc(bufsize);
__do_free char *buf = NULL;
struct group grent;
struct group *grentp = NULL;
- size_t bufsize;
+ ssize_t bufsize;
int ret;
bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
- if (bufsize == -1)
+ if (bufsize < 0)
bufsize = 1024;
buf = zalloc(bufsize);
ERROR("lxc.idmap = g 0 %u %u", gid, grange);
}
-/* Return the list of cgroup_settings sorted according to the following rules
- * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
- */
-void sort_cgroup_settings(struct lxc_conf *conf)
-{
- struct lxc_cgroup *cgroup, *memsw_limit, *ncgroup;
-
- /* Iterate over the cgroup settings and copy them to the output list. */
- list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
- if (strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes")) {
- /* Store the memsw_limit location */
- memsw_limit = cgroup;
- } else if (memsw_limit && strequal(cgroup->subsystem, "memory.limit_in_bytes")) {
- /*
- * lxc.cgroup.memory.memsw.limit_in_bytes is found
- * before lxc.cgroup.memory.limit_in_bytes, swap these
- * two items.
- */
- list_swap(&memsw_limit->head, &cgroup->head);
- }
- }
-}
-
int lxc_set_environment(const struct lxc_conf *conf)
{
struct environment_entry *env;