/* SPDX-License-Identifier: LGPL-2.1+ */
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE 1
-#endif
+#include "config.h"
+
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/wait.h>
#include <unistd.h>
+#include "lxc.h"
+
#include "af_unix.h"
+#include "attach_options.h"
#include "caps.h"
-#include "cgroup.h"
+#include "cgroups/cgroup.h"
+#include "cgroups/cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "compiler.h"
#include "conf.h"
-#include "config.h"
#include "confile_utils.h"
#include "error.h"
#include "file_utils.h"
#include "list.h"
#include "log.h"
#include "lsm/lsm.h"
-#include "lxccontainer.h"
#include "lxclock.h"
#include "lxcseccomp.h"
#include "macro.h"
#include <sys/capability.h>
#endif
-#ifndef HAVE_STRLCPY
-#include "include/strlcpy.h"
+#if !HAVE_STRLCPY
+#include "strlcpy.h"
#endif
lxc_log_define(start, lxc);
}
}
-static int lxc_try_preserve_ns(const int pid, const char *ns)
+static int lxc_try_preserve_namespace(struct lxc_handler *handler,
+ lxc_namespace_t idx, const char *ns)
{
- int fd;
+ __do_close int fd = -EBADF;
+ int ret;
- fd = lxc_preserve_ns(pid, ns);
- if (fd < 0) {
- if (errno != ENOENT)
- return log_error_errno(-EINVAL,
- errno, "Failed to preserve %s namespace",
- ns);
+ fd = lxc_preserve_ns(handler->pid, ns);
+ if (fd < 0)
+ return -errno;
- return log_warn_errno(-EOPNOTSUPP,
- errno, "Kernel does not support preserving %s namespaces",
- ns);
- }
+ ret = strnprintf(handler->nsfd_paths[idx],
+ sizeof(handler->nsfd_paths[idx]), "%s:/proc/%d/fd/%d",
+ ns_info[idx].proc_name, handler->monitor_pid, fd);
+ if (ret < 0)
+ return ret_errno(EIO);
- return fd;
+ /*
+ * In case LXC is configured for exposing information to hooks as
+ * argv-style arguments prepare an argv array we can use.
+ */
+ handler->hook_argv[handler->hook_argc] = handler->nsfd_paths[idx];
+ handler->hook_argc++;
+
+ DEBUG("Preserved %s namespace via fd %d and stashed path as %s",
+ ns_info[idx].proc_name, fd, handler->nsfd_paths[idx]);
+
+ handler->nsfd[idx] = move_fd(fd);
+ return 0;
}
/* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
* Return true on success, false on failure.
*/
static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
- int ns_clone_flags, pid_t pid)
+ int ns_clone_flags)
{
- int i;
+ for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++)
+ handler->nsfd[ns_idx] = -EBADF;
- for (i = 0; i < LXC_NS_MAX; i++)
- handler->nsfd[i] = -EBADF;
-
- for (i = 0; i < LXC_NS_MAX; i++) {
- int fd;
+ for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
+ int ret;
+ const char *ns = ns_info[ns_idx].proc_name;
- if ((ns_clone_flags & ns_info[i].clone_flag) == 0)
+ if ((ns_clone_flags & ns_info[ns_idx].clone_flag) == 0)
continue;
- fd = lxc_try_preserve_ns(pid, ns_info[i].proc_name);
- if (fd < 0) {
- /* Do not fail to start container on kernels that do
- * not support interacting with namespaces through
- * /proc.
- */
- if (fd == -EOPNOTSUPP)
+ ret = lxc_try_preserve_namespace(handler, ns_idx,
+ ns_info[ns_idx].proc_name);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ SYSERROR("Kernel does not support preserving %s namespaces", ns);
continue;
+ }
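+			/*
+			 * Kernels that do not support interacting with this
+			 * namespace through procfs were already skipped above
+			 * (-ENOENT). Any other error is fatal: hand the
+			 * namespace fds collected so far to lxc_put_nsfds()
+			 * and fail.
+			 */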
lxc_put_nsfds(handler);
- return false;
+ return log_error_errno(false, errno, "Failed to preserve %s namespace", ns);
}
-
- handler->nsfd[i] = fd;
- DEBUG("Preserved %s namespace via fd %d", ns_info[i].proc_name,
- handler->nsfd[i]);
}
return true;
ssize_t linklen;
int ret;
- ret = snprintf(path, PATH_MAX, "/proc/self/fd/%s", direntp->d_name);
- if (ret < 0 || ret >= PATH_MAX)
+ ret = strnprintf(path, sizeof(path), "/proc/self/fd/%s", direntp->d_name);
+ if (ret < 0)
return log_error(false, "Failed to create file descriptor name");
linklen = readlink(path, link, PATH_MAX);
else if (linklen >= PATH_MAX)
return log_error(false, "The name of link path is too long - \"%s\"", path);
- if (strcmp(link, "/dev/log_main") == 0 ||
- strcmp(link, "/dev/log_system") == 0 ||
- strcmp(link, "/dev/log_radio") == 0)
+ if (strequal(link, "/dev/log_main") ||
+ strequal(link, "/dev/log_system") ||
+ strequal(link, "/dev/log_radio"))
return true;
return false;
}
#endif
+/* Parses the LISTEN_FDS environment variable value.
+ *
+ * LISTEN_FDS holds the *number* of socket-activation file descriptors that
+ * were handed to this process. They are numbered consecutively and start
+ * right after the standard streams, i.e. at STDERR_FILENO + 1.
+ *
+ * The returned value is the highest fd number up to which the
+ * file descriptors must be passed to the container process.
+ *
+ * For example, if LISTEN_FDS=2 then 4 is returned and file descriptors 3 and 4
+ * MUST be passed to the container process (in addition to the standard streams)
+ * to support [socket activation][systemd-listen-fds].
+ *
+ * Returns 0 if LISTEN_FDS is unset, cannot be parsed, or is 0, meaning no
+ * additional file descriptors need to be kept.
+ */
+static unsigned int get_listen_fds_max(void)
+{
+	int ret;
+	unsigned int num_fds;
+	unsigned int highest_fd = 0;
+	const char *val;
+
+	val = getenv("LISTEN_FDS");
+	if (!val)
+		return 0;
+
+	ret = lxc_safe_uint(val, &num_fds);
+	if (ret < 0)
+		return syserror_ret(0, "Failed to parse \"LISTEN_FDS=%s\" environment variable", val);
+
+	/*
+	 * Turn the fd count into the highest fd number. The caller compares
+	 * each open fd against this value, so returning the bare count would
+	 * only ever cover the standard streams.
+	 */
+	if (num_fds > 0) {
+		highest_fd = num_fds + STDERR_FILENO;
+
+		/* Clamp instead of wrapping around on absurdly large values. */
+		if (highest_fd < num_fds)
+			highest_fd = num_fds;
+	}
+
+	return log_trace(highest_fd, "Parsed \"LISTEN_FDS=%s\" environment variable", val);
+}
+
int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
int *fds_to_ignore, size_t len_fds)
{
size_t i;
DIR *dir;
struct dirent *direntp;
+ unsigned int listen_fds_max;
+ struct lxc_state_client *client, *nclient;
if (conf && conf->close_all_fds)
closeall = true;
+ listen_fds_max = get_listen_fds_max();
+
/*
* Disable syslog at this point to avoid the above logging
* function to open a new fd and make the check_inherited function
while ((direntp = readdir(dir))) {
int ret;
- struct lxc_list *cur;
bool matched = false;
- if (strcmp(direntp->d_name, ".") == 0)
+ if (strequal(direntp->d_name, "."))
continue;
- if (strcmp(direntp->d_name, "..") == 0)
+ if (strequal(direntp->d_name, ".."))
continue;
ret = lxc_safe_int(direntp->d_name, &fd);
/* Keep state clients that wait on reboots. */
if (conf) {
- lxc_list_for_each(cur, &conf->state_clients) {
- struct lxc_state_client *client = cur->elem;
-
+ list_for_each_entry_safe(client, nclient, &conf->state_clients, head) {
if (client->clientfd != fd)
continue;
continue;
#endif
+
+ if ((size_t)fd <= listen_fds_max) {
+ INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
+ continue;
+ }
+
if (closeall) {
if (close(fd))
SYSINFO("Closed inherited fd %d", fd);
if (ret < 0)
return -EBADF;
- for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
+ for (size_t sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
ret = sigdelset(&mask, signals[sig]);
if (ret < 0)
return -EBADF;
}
static int signal_handler(int fd, uint32_t events, void *data,
- struct lxc_epoll_descr *descr)
+ struct lxc_async_descr *descr)
{
int ret;
siginfo_t info;
if (ret == 0 && info.si_pid == hdlr->pid)
hdlr->init_died = true;
+ TRACE("Received signal ssi_signo(%d) for ssi_pid(%d), si_signo(%d), si_pid(%d)",
+ siginfo.ssi_signo, siginfo.ssi_pid, info.si_signo, info.si_pid);
+
/* Try to figure out a reasonable exit status to report. */
if (hdlr->init_died) {
switch (info.si_code) {
/* More robustness, protect ourself from a SIGCHLD sent
* by a process different from the container init.
*/
- if (siginfo.ssi_pid != hdlr->pid) {
+ if ((__u64)siginfo.ssi_pid != (__u64)hdlr->pid) {
NOTICE("Received %d from pid %d instead of container init %d",
siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
return hdlr->init_died ? LXC_MAINLOOP_CLOSE
int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
lxc_state_t state)
{
+ struct lxc_msg msg = {
+ .type = lxc_msg_state,
+ .value = state,
+ };
size_t retlen;
ssize_t ret;
- struct lxc_list *cur, *next;
- struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
+ struct lxc_state_client *client, *nclient;
if (state == THAWED)
handler->state = RUNNING;
TRACE("Set container state to %s", lxc_state2str(state));
- if (lxc_list_empty(&handler->conf->state_clients))
+ if (list_empty(&handler->conf->state_clients))
return log_trace(0, "No state clients registered");
retlen = strlcpy(msg.name, name, sizeof(msg.name));
if (retlen >= sizeof(msg.name))
return -E2BIG;
- lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
- struct lxc_state_client *client = cur->elem;
-
+ list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
if (client->states[state] == 0) {
TRACE("State %s not registered for state client %d",
lxc_state2str(state), client->clientfd);
SYSERROR("Failed to send message to client");
/* kick client from list */
- lxc_list_del(cur);
+ list_del(&client->head);
close(client->clientfd);
- free(cur->elem);
- free(cur);
+ free(client);
}
return 0;
int lxc_poll(const char *name, struct lxc_handler *handler)
{
int ret;
- bool has_console = true;
- struct lxc_epoll_descr descr, descr_console;
+ struct lxc_terminal *console = &handler->conf->console;
+ struct lxc_async_descr descr, descr_console;
- if (handler->conf->console.path &&
- strcmp(handler->conf->console.path, "none") == 0)
- has_console = false;
+ if (!wants_console(console))
+ console = NULL;
ret = lxc_mainloop_open(&descr);
if (ret < 0) {
goto out_sigfd;
}
- if (has_console) {
+ if (console) {
ret = lxc_mainloop_open(&descr_console);
if (ret < 0) {
ERROR("Failed to create console mainloop");
}
}
- ret = lxc_mainloop_add_handler(&descr, handler->sigfd, signal_handler, handler);
+ ret = lxc_mainloop_add_handler(&descr, handler->sigfd,
+ signal_handler,
+ default_cleanup_handler,
+ handler, "signal_handler");
if (ret < 0) {
ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
goto out_mainloop_console;
goto out_mainloop_console;
}
- if (has_console) {
- struct lxc_terminal *console = &handler->conf->console;
-
+ if (console) {
ret = lxc_terminal_mainloop_add(&descr, console);
if (ret < 0) {
ERROR("Failed to add console handlers to mainloop");
goto out_mainloop_console;
}
-
- ret = lxc_terminal_mainloop_add(&descr_console, console);
- if (ret < 0) {
- ERROR("Failed to add console handlers to console mainloop");
- goto out_mainloop_console;
- }
-
- handler->conf->console.descr = &descr;
}
ret = lxc_cmd_mainloop_add(name, &descr, handler);
TRACE("Mainloop is ready");
ret = lxc_mainloop(&descr, -1);
- close_prot_errno_disarm(descr.epfd);
+ if (descr.type == LXC_MAINLOOP_EPOLL)
+ close_prot_errno_disarm(descr.epfd);
if (ret < 0 || !handler->init_died)
goto out_mainloop_console;
- if (has_console)
- ret = lxc_mainloop(&descr_console, 0);
+ if (console) {
+ ret = lxc_terminal_mainloop_add(&descr_console, console);
+ if (ret == 0)
+ ret = lxc_mainloop(&descr_console, 0);
+ }
out_mainloop_console:
- if (has_console) {
+ if (console) {
lxc_mainloop_close(&descr_console);
TRACE("Closed console mainloop");
}
handler->state_socket_pair[0] = -EBADF;
handler->state_socket_pair[1] = -EBADF;
if (handler->conf->reboot == REBOOT_NONE)
- lxc_list_init(&handler->conf->state_clients);
+ INIT_LIST_HEAD(&handler->conf->state_clients);
+
+ for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
+ handler->nsfd[idx] = -EBADF;
+
+ if (handler->conf->reboot == REBOOT_NONE)
+ continue;
- for (int i = 0; i < LXC_NS_MAX; i++)
- handler->nsfd[i] = -EBADF;
+ handler->nsfd_paths[idx][0] = '\0';
+ handler->hook_argv[idx] = NULL;
+
+ if (handler->hook_argc != 0)
+ handler->hook_argc = 0;
+ }
handler->name = name;
if (daemonize)
}
if (handler->conf->reboot == REBOOT_NONE) {
- handler->conf->maincmd_fd = lxc_cmd_init(name, lxcpath, "command");
+ handler->conf->maincmd_fd = lxc_server_init(name, lxcpath, "command");
if (handler->conf->maincmd_fd < 0) {
ERROR("Failed to set up command socket");
goto on_error;
return log_error(-1, "Failed to run lxc.hook.pre-start for container \"%s\"", name);
TRACE("Ran pre-start hooks");
+ ret = lxc_terminal_parent(conf);
+ if (ret < 0)
+ return log_error(-1, "Failed to allocate terminal");
+
/* The signal fd has to be created before forking otherwise if the child
* process exits before we setup the signal fd, the event will be lost
* and the command will be stuck.
return log_error(-1, "Failed to setup SIGCHLD fd handler.");
TRACE("Set up signal fd");
- /* Do this after setting up signals since it might unblock SIGWINCH. */
- ret = lxc_terminal_setup(conf);
- if (ret < 0) {
- ERROR("Failed to create console");
- goto out_restore_sigmask;
- }
- TRACE("Created console");
-
handler->cgroup_ops = cgroup_init(handler->conf);
if (!handler->cgroup_ops) {
ERROR("Failed to initialize cgroup driver");
- goto out_delete_terminal;
+ goto out_restore_sigmask;
}
TRACE("Initialized cgroup driver");
ret = lxc_read_seccomp_config(conf);
- if (ret < 0)
- return log_error(-1, "Failed loading seccomp policy");
+ if (ret < 0) {
+ ERROR("Failed to read seccomp policy");
+ goto out_restore_sigmask;
+ }
TRACE("Read seccomp policy");
ret = handler->lsm_ops->prepare(handler->lsm_ops, conf, handler->lxcpath);
if (ret < 0) {
ERROR("Failed to initialize LSM");
- goto out_delete_terminal;
+ goto out_restore_sigmask;
}
TRACE("Initialized LSM");
handler->monitor_status_fd = move_fd(status_fd);
return 0;
-out_delete_terminal:
- lxc_terminal_delete(&handler->conf->console);
-
out_restore_sigmask:
(void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
return -1;
}
+/*
+ * lxc_expose_namespace_environment: export preserved namespaces to hooks.
+ *
+ * For every namespace index whose fd in @handler->nsfd is valid, set the
+ * environment variable named by ns_info[].env_name to the path stashed in
+ * @handler->nsfd_paths, starting at its first '/' (anything in front of that,
+ * such as a "<name>:" prefix, is dropped). Existing variables are overwritten.
+ *
+ * Failures to set a variable are logged and otherwise ignored.
+ */
+void lxc_expose_namespace_environment(const struct lxc_handler *handler)
+{
+	lxc_namespace_t ns_idx;
+
+	for (ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
+		const char *stashed, *env_key, *proc_path;
+
+		/* Nothing was preserved for this namespace. */
+		if (handler->nsfd[ns_idx] < 0)
+			continue;
+
+		stashed = handler->nsfd_paths[ns_idx];
+		env_key = ns_info[ns_idx].env_name;
+
+		/* Skip ahead to the first '/' of the stashed string. */
+		proc_path = &stashed[strcspn(stashed, "/")];
+
+		if (setenv(env_key, proc_path, 1) < 0) {
+			SYSERROR("Failed to set environment variable %s=%s",
+				 env_key, proc_path);
+			continue;
+		}
+
+		TRACE("Set environment variable %s=%s",
+		      env_key, proc_path);
+	}
+}
+
void lxc_end(struct lxc_handler *handler)
{
int ret;
- pid_t self;
- struct lxc_list *cur, *next;
- char *namespaces[LXC_NS_MAX + 1];
- size_t namespace_count = 0;
const char *name = handler->name;
struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
+ struct lxc_state_client *client, *nclient;
/* The STOPPING state is there for future cleanup code which can take
* awhile.
*/
lxc_set_state(name, handler, STOPPING);
- self = lxc_raw_getpid();
- for (int i = 0; i < LXC_NS_MAX; i++) {
- if (handler->nsfd[i] < 0)
- continue;
-
- if (handler->conf->hooks_version == 0)
- ret = asprintf(&namespaces[namespace_count],
- "%s:/proc/%d/fd/%d", ns_info[i].proc_name,
- self, handler->nsfd[i]);
- else
- ret = asprintf(&namespaces[namespace_count],
- "/proc/%d/fd/%d", self, handler->nsfd[i]);
- if (ret < 0) {
- SYSERROR("Failed to allocate memory");
- break;
- }
-
- if (handler->conf->hooks_version == 0) {
- namespace_count++;
- continue;
- }
-
- ret = setenv(ns_info[i].env_name, namespaces[namespace_count], 1);
- if (ret < 0)
- SYSERROR("Failed to set environment variable %s=%s",
- ns_info[i].env_name, namespaces[namespace_count]);
- else
- TRACE("Set environment variable %s=%s",
- ns_info[i].env_name, namespaces[namespace_count]);
-
- namespace_count++;
- }
- namespaces[namespace_count] = NULL;
+ /* Passing information to hooks via environment variables. */
+ if (handler->conf->hooks_version > 0)
+ lxc_expose_namespace_environment(handler);
if (handler->conf->reboot > REBOOT_NONE) {
ret = setenv("LXC_TARGET", "reboot", 1);
}
if (handler->conf->hooks_version == 0)
- ret = run_lxc_hooks(name, "stop", handler->conf, namespaces);
+ ret = run_lxc_hooks(name, "stop", handler->conf, handler->hook_argv);
else
ret = run_lxc_hooks(name, "stop", handler->conf, NULL);
if (ret < 0)
ERROR("Failed to run \"lxc.hook.stop\" hook");
- while (namespace_count--)
- free(namespaces[namespace_count]);
-
handler->lsm_ops->cleanup(handler->lsm_ops, handler->conf, handler->lxcpath);
if (cgroup_ops) {
/* The command socket is now closed, no more state clients can register
* themselves from now on. So free the list of state clients.
*/
- lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
- struct lxc_state_client *client = cur->elem;
-
+ list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
/* Keep state clients that want to be notified about reboots. */
if ((handler->conf->reboot > REBOOT_NONE) &&
(client->states[RUNNING] == 2))
continue;
/* close state client socket */
- lxc_list_del(cur);
+ list_del(&client->head);
close(client->clientfd);
- free(cur->elem);
- free(cur);
+ free(client);
}
if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
{
struct lxc_handler *handler = data;
__lxc_unused __do_close int data_sock0 = handler->data_sock[0],
- data_sock1 = handler->data_sock[1];
+ data_sock1 = handler->data_sock[1];
__do_close int devnull_fd = -EBADF, status_fd = -EBADF;
int ret;
uid_t new_uid;
gid_t new_gid;
- struct lxc_list *iterator;
uid_t nsuid = 0;
gid_t nsgid = 0;
lxc_sync_fini_parent(handler);
- if (lxc_abstract_unix_recv_fds(data_sock1, &status_fd, 1, NULL, 0) < 0) {
- ERROR("Failed to receive status file descriptor to child process");
+ if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
+ ERROR("Failed to receive status file descriptor from parent process");
goto out_warn_father;
}
INFO("Unshared CLONE_NEWNET");
}
- /* Tell the parent task it can begin to configure the container and wait
- * for it to finish.
- */
- if (!lxc_sync_barrier_parent(handler, START_SYNC_CONFIGURE))
- goto out_error;
-
- if (handler->ns_clone_flags & CLONE_NEWNET) {
- ret = lxc_network_recv_from_parent(handler);
- if (ret < 0) {
- ERROR("Failed to receive veth names from parent");
- goto out_warn_father;
- }
- }
-
/* If we are in a new user namespace, become root there to have
* privilege over our namespace.
*/
- if (!lxc_list_empty(&handler->conf->id_map)) {
+ if (!list_empty(&handler->conf->id_map)) {
if (!handler->conf->root_nsuid_map)
nsuid = handler->conf->init_uid;
if (handler->daemonize && !handler->conf->autodev) {
char path[PATH_MAX];
- ret = snprintf(path, sizeof(path), "%s/dev/null",
- handler->conf->rootfs.mount);
- if (ret < 0 || ret >= sizeof(path))
+ ret = strnprintf(path, sizeof(path), "%s/dev/null",
+ handler->conf->rootfs.mount);
+ if (ret < 0)
goto out_warn_father;
ret = access(path, F_OK);
}
}
- /* Ask father to setup cgroups and wait for him to finish. */
- if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP))
+ /*
+ * Tell the parent task it can begin to configure the container and wait
+ * for it to finish.
+ */
+ if (!lxc_sync_wake_parent(handler, START_SYNC_CONFIGURE))
goto out_error;
/* Unshare cgroup namespace after we have setup our cgroups. If we do it
}
ret = setns(timens_fd, CLONE_NEWTIME);
- if (ret) {
+ if (ret) {
SYSERROR("Failed to setns(%d(\"/proc/self/ns/time_for_children\"))", timens_fd);
goto out_warn_father;
}
}
}
- /* Add the requested environment variables to the current environment to
- * allow them to be used by the various hooks, such as the start hook
- * below.
+ /*
+ * Add the requested environment variables to the current environment
+ * to allow them to be used by the various hooks, such as the start
+ * hook below.
*/
- lxc_list_for_each(iterator, &handler->conf->environment) {
- ret = putenv((char *)iterator->elem);
- if (ret < 0) {
- SYSERROR("Failed to set environment variable: %s",
- (char *)iterator->elem);
- goto out_warn_father;
- }
- }
+ ret = lxc_set_environment(handler->conf);
+ if (ret < 0)
+ goto out_warn_father;
+
+ if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
+ goto out_warn_father;
/* Setup the container, ip, names, utsname, ... */
ret = lxc_setup(handler);
DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
}
- /* Some init's such as busybox will set sane tty settings on stdin,
- * stdout, stderr which it thinks is the console. We already set them
- * the way we wanted on the real terminal, and we want init to do its
- * setup on its console ie. the pty allocated in lxc_terminal_setup() so
- * make sure that that pty is stdin,stdout,stderr.
- */
- if (handler->conf->console.pty >= 0) {
- if (handler->daemonize || !handler->conf->is_execute)
- ret = set_stdfds(handler->conf->console.pty);
- else
- ret = lxc_terminal_set_stdfds(handler->conf->console.pty);
- if (ret < 0) {
- ERROR("Failed to redirect std{in,out,err} to pty file descriptor %d",
- handler->conf->console.pty);
- goto out_warn_father;
- }
- }
-
/* If we mounted a temporary proc, then unmount it now. */
tmp_proc_unmount(handler->conf);
if (ret < 0)
goto out_warn_father;
- ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
- if (ret < 0) {
- SYSERROR("Failed to send seccomp notify fd to parent");
- goto out_warn_father;
- }
-
ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
if (ret < 0) {
ERROR("Failed to run lxc.hook.start for container \"%s\"",
if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
goto out_warn_father;
+ ret = lxc_sync_fds_child(handler);
+ if (ret < 0) {
+ SYSERROR("Failed to sync file descriptors with parent");
+ goto out_warn_father;
+ }
+
+ if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
+ goto out_warn_father;
+
/* Reset the environment variables the user requested in a clear
* environment.
*/
if (ret < 0)
SYSERROR("Failed to clear environment.");
- lxc_list_for_each(iterator, &handler->conf->environment) {
- ret = putenv((char *)iterator->elem);
- if (ret < 0) {
- SYSERROR("Failed to set environment variable: %s",
- (char *)iterator->elem);
- goto out_warn_father;
- }
- }
+ ret = lxc_set_environment(handler->conf);
+ if (ret < 0)
+ goto out_warn_father;
ret = putenv("container=lxc");
if (ret < 0) {
* we switched to root in the new user namespace further above. Only
* drop groups if we can, so ensure that we have necessary privilege.
*/
- if (lxc_list_empty(&handler->conf->id_map)) {
+ if (list_empty(&handler->conf->id_map)) {
#if HAVE_LIBCAP
if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
#endif
return -1;
}
-static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
-{
- int i;
- struct lxc_terminal_info *tty;
- int ret = -1;
- int sock = handler->data_sock[1];
- struct lxc_conf *conf = handler->conf;
- struct lxc_tty_info *ttys = &conf->ttys;
-
- if (!conf->ttys.max)
- return 0;
-
- ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
- if (!ttys->tty)
- return -1;
-
- for (i = 0; i < conf->ttys.max; i++) {
- int ttyfds[2];
-
- ret = lxc_abstract_unix_recv_fds(sock, ttyfds, 2, NULL, 0);
- if (ret < 0)
- break;
-
- tty = &ttys->tty[i];
- tty->busy = -1;
- tty->ptx = ttyfds[0];
- tty->pty = ttyfds[1];
- TRACE("Received pty with ptx fd %d and pty fd %d from child", tty->ptx, tty->pty);
- }
-
- if (ret < 0)
- SYSERROR("Failed to receive %zu ttys from child", ttys->max);
- else
- TRACE("Received %zu ttys from child", ttys->max);
-
- return ret;
-}
-
int resolve_clone_flags(struct lxc_handler *handler)
{
int i;
if ((conf->ns_clone & ns_info[i].clone_flag))
handler->ns_clone_flags |= ns_info[i].clone_flag;
} else {
- if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
+ if (i == LXC_NS_USER && list_empty(&handler->conf->id_map))
continue;
if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
* newer glibc versions where the getpid() cache is removed and the pid/tid is
* not reset anymore.
* However, if for whatever reason you - dear committer - somehow need to get the
- * pid of the dummy intermediate process for do_share_ns() you need to call
- * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
- * will be fine.
+ * pid of the placeholder intermediate process for do_share_ns() you need to
+ * call lxc_raw_getpid(). The next lxc_raw_clone() call does not employ
+ * CLONE_VM and will be fine.
*/
static inline int do_share_ns(void *arg)
{
return 0;
}
+/*
+ * core_scheduling: put the container into its own core scheduling domain.
+ *
+ * Does nothing unless @handler->conf->sched_core is set. Otherwise a new
+ * cookie is created for the thread-group of @handler->pid and stored in
+ * conf->sched_core_cookie.
+ *
+ * Returns 0 on success, when no domain was requested, or when cookie creation
+ * reports -ENODEV (sched_core is then switched off). Returns a negative value
+ * on every other failure.
+ */
+static int core_scheduling(struct lxc_handler *handler)
+{
+	int rc;
+	struct lxc_conf *cfg = handler->conf;
+
+	if (!cfg->sched_core)
+		return log_trace(0, "No new core scheduling domain requested");
+
+	if ((handler->ns_clone_flags & CLONE_NEWPID) == 0)
+		return syserror_set(-EINVAL, "Core scheduling currently requires a separate pid namespace");
+
+	rc = core_scheduling_cookie_create_threadgroup(handler->pid);
+	if (rc < 0) {
+		switch (rc) {
+		case -ENODEV:
+			/* Not an error: continue without core scheduling. */
+			INFO("The kernel doesn't support or doesn't use simultaneous multithreading (SMT)");
+			cfg->sched_core = false;
+			return 0;
+		case -EINVAL:
+			return syserror("The kernel does not support core scheduling");
+		default:
+			return syserror("Failed to create new core scheduling domain");
+		}
+	}
+
+	/* The cookie is only usable if the lookup worked and it validates. */
+	rc = core_scheduling_cookie_get(handler->pid, &cfg->sched_core_cookie);
+	if (!(rc == 0 && core_scheduling_cookie_valid(cfg->sched_core_cookie)))
+		return syserror("Failed to retrieve core scheduling domain cookie");
+
+	TRACE("Created new core scheduling domain with cookie %llu",
+	      (llu)cfg->sched_core_cookie);
+
+	return 0;
+}
+
/* lxc_spawn() performs crucial setup tasks and clone()s the new process which
* exec()s the requested container binary.
* Note that lxc_spawn() runs in the parent namespaces. Any operations performed
int i, ret;
char pidstr[20];
bool wants_to_map_ids;
- struct lxc_list *id_map;
+ struct list_head *id_map;
const char *name = handler->name;
const char *lxcpath = handler->lxcpath;
bool share_ns = false;
struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
id_map = &conf->id_map;
- wants_to_map_ids = !lxc_list_empty(id_map);
+ wants_to_map_ids = !list_empty(id_map);
for (i = 0; i < LXC_NS_MAX; i++) {
if (!conf->ns_share[i])
goto out_delete_net;
}
- /* If the rootfs is not a blockdev, prevent the container from marking
- * it readonly.
- * If the container is unprivileged then skip rootfs pinning.
- */
- ret = lxc_rootfs_prepare(&conf->rootfs, wants_to_map_ids);
- if (ret) {
- ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
- goto out_delete_net;
- }
-
/* Create a process in a new set of namespaces. */
if (share_ns) {
pid_t attacher_pid;
/* Kernel might be too old for clone3(). */
if (handler->pid < 0) {
SYSTRACE("Failed to spawn container via clone3()");
+
+ /*
+ * In contrast to all other architectures arm64 verifies that
+ * the argument we use to retrieve the pidfd with is
+ * initialized to 0. But we need to be able to initialize it to
+ * a negative value such as our customary -EBADF so we can
+ * detect whether this kernel supports pidfds. If the syscall
+ * returns and the pidfd variable is set to something >= 0 then
+ * we know this is a kernel supporting pidfds. But if we can't
+ * set it to -EBADF then this won't work since 0 is a valid
+ * file descriptor too. And since legacy clone silently ignores
+ * unknown flags we are left without any way to detect support
+ * for pidfds. So let's special-case arm64 to not fail starting
+ * containers.
+ */
+ #if defined(__aarch64__)
+ handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
+ #else
handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
+ #endif
}
if (handler->pid < 0) {
handler->clone_flags &= ~CLONE_PIDFD;
TRACE("Cloned child process %d", handler->pid);
+ ret = core_scheduling(handler);
+ if (ret < 0)
+ goto out_delete_net;
+
/* Verify that we can actually make use of pidfds. */
if (!lxc_can_use_pidfd(handler->pidfd))
close_prot_errno_disarm(handler->pidfd);
- ret = snprintf(pidstr, 20, "%d", handler->pid);
- if (ret < 0 || ret >= 20)
+ ret = strnprintf(pidstr, 20, "%d", handler->pid);
+ if (ret < 0)
goto out_delete_net;
ret = setenv("LXC_PID", pidstr, 1);
if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
INFO("Cloned %s", ns_info[i].flag_name);
- if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags, handler->pid)) {
+ if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags)) {
ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
goto out_delete_net;
}
}
}
- if (!lxc_sync_wake_child(handler, START_SYNC_STARTUP))
- goto out_delete_net;
-
- if (!lxc_sync_wait_child(handler, START_SYNC_CONFIGURE))
- goto out_delete_net;
-
if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
ERROR("Failed to setup cgroup limits for container \"%s\"", name);
goto out_delete_net;
}
- if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
- ERROR("Failed to enter cgroups");
+ if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
+ ERROR("Failed to delegate controllers to payload cgroup");
goto out_delete_net;
}
- if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
- ERROR("Failed to delegate controllers to payload cgroup");
+ if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
+ ERROR("Failed to enter cgroups");
goto out_delete_net;
}
if (!cgroup_ops->chown(cgroup_ops, handler->conf))
goto out_delete_net;
+ if (!lxc_sync_barrier_child(handler, START_SYNC_STARTUP))
+ goto out_delete_net;
+
/* If not done yet, we're now ready to preserve the network namespace */
if (handler->nsfd[LXC_NS_NET] < 0) {
- ret = lxc_try_preserve_ns(handler->pid, "net");
+ ret = lxc_try_preserve_namespace(handler, LXC_NS_NET, "net");
if (ret < 0) {
- if (ret != -EOPNOTSUPP) {
+ if (ret != -ENOENT) {
SYSERROR("Failed to preserve net namespace");
goto out_delete_net;
}
- } else {
- handler->nsfd[LXC_NS_NET] = ret;
- DEBUG("Preserved net namespace via fd %d", ret);
}
}
ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
ERROR("Failed to create the network");
goto out_delete_net;
}
+ }
- ret = lxc_network_send_to_child(handler);
- if (ret < 0) {
- ERROR("Failed to send veth names to child");
- goto out_delete_net;
- }
+ ret = setup_proc_filesystem(conf, handler->pid);
+ if (ret < 0) {
+ ERROR("Failed to setup procfs limits");
+ goto out_delete_net;
}
- if (!lxc_list_empty(&conf->procs)) {
- ret = setup_proc_filesystem(&conf->procs, handler->pid);
- if (ret < 0)
- goto out_delete_net;
+ ret = setup_resource_limits(conf, handler->pid);
+ if (ret < 0) {
+ ERROR("Failed to setup resource limits");
+ goto out_delete_net;
}
- /* Tell the child to continue its initialization. We'll get
- * START_SYNC_CGROUP when it is ready for us to setup cgroups.
- */
- if (!lxc_sync_barrier_child(handler, START_SYNC_POST_CONFIGURE))
+ /* Tell the child to continue its initialization. */
+ if (!lxc_sync_wake_child(handler, START_SYNC_POST_CONFIGURE))
goto out_delete_net;
- if (!lxc_list_empty(&conf->limits)) {
- ret = setup_resource_limits(&conf->limits, handler->pid);
+ ret = lxc_rootfs_prepare_parent(handler);
+ if (ret) {
+ ERROR("Failed to prepare rootfs");
+ goto out_delete_net;
+ }
+
+ if (handler->ns_clone_flags & CLONE_NEWNET) {
+ ret = lxc_network_send_to_child(handler);
if (ret < 0) {
- ERROR("Failed to setup resource limits");
+ SYSERROR("Failed to send veth names to child");
goto out_delete_net;
}
}
- if (!lxc_sync_barrier_child(handler, START_SYNC_CGROUP_UNSHARE))
+ if (!lxc_sync_wait_child(handler, START_SYNC_IDMAPPED_MOUNTS))
+ goto out_delete_net;
+
+ ret = lxc_idmapped_mounts_parent(handler);
+ if (ret) {
+ ERROR("Failed to setup mount entries");
+ goto out_delete_net;
+ }
+
+ if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
goto out_delete_net;
/*
- * with isolation the limiting devices cgroup was already setup, so
- * only setup devices here if we have no namespace directory
+ * With isolation the limiting devices cgroup was already set up, so
+ * only set up devices here if we have no namespace directory.
*/
if (!handler->conf->cgroup_meta.namespace_dir &&
!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
}
TRACE("Set up cgroup2 device controller limits");
+ cgroup_ops->finalize(cgroup_ops);
+ TRACE("Finished setting up cgroups");
+
+ /* Run any host-side start hooks */
+ ret = run_lxc_hooks(name, "start-host", conf, NULL);
+ if (ret < 0) {
+ ERROR("Failed to run lxc.hook.start-host");
+ goto out_delete_net;
+ }
+
+ if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
+ goto out_delete_net;
+
if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
/* Now we're ready to preserve the cgroup namespace */
- ret = lxc_try_preserve_ns(handler->pid, "cgroup");
+ ret = lxc_try_preserve_namespace(handler, LXC_NS_CGROUP, "cgroup");
if (ret < 0) {
- if (ret != -EOPNOTSUPP) {
+ if (ret != -ENOENT) {
SYSERROR("Failed to preserve cgroup namespace");
goto out_delete_net;
}
- } else {
- handler->nsfd[LXC_NS_CGROUP] = ret;
- DEBUG("Preserved cgroup namespace via fd %d", ret);
}
}
- cgroup_ops->payload_finalize(cgroup_ops);
- TRACE("Finished setting up cgroups");
-
if (handler->ns_unshare_flags & CLONE_NEWTIME) {
- /* Now we're ready to preserve the cgroup namespace */
- ret = lxc_try_preserve_ns(handler->pid, "time");
+ /* Now we're ready to preserve the time namespace */
+ ret = lxc_try_preserve_namespace(handler, LXC_NS_TIME, "time");
if (ret < 0) {
- if (ret != -EOPNOTSUPP) {
+ if (ret != -ENOENT) {
SYSERROR("Failed to preserve time namespace");
goto out_delete_net;
}
- } else {
- handler->nsfd[LXC_NS_TIME] = ret;
- DEBUG("Preserved time namespace via fd %d", ret);
}
}
- /* Run any host-side start hooks */
- ret = run_lxc_hooks(name, "start-host", conf, NULL);
+ ret = lxc_sync_fds_parent(handler);
if (ret < 0) {
- ERROR("Failed to run lxc.hook.start-host");
+ SYSERROR("Failed to sync file descriptors with child");
goto out_delete_net;
}
- /* Tell the child to complete its initialization and wait for it to exec
- * or return an error. (The child will never return
- * START_SYNC_READY_START+1. It will either close the sync pipe, causing
- * lxc_sync_barrier_child to return success, or return a different
- * value, causing us to error out).
- */
- if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
+ ret = lxc_terminal_setup(conf);
+ if (ret < 0) {
+ SYSERROR("Failed to create console");
goto out_delete_net;
-
- if (handler->ns_clone_flags & CLONE_NEWNET) {
- ret = lxc_network_recv_name_and_ifindex_from_child(handler);
- if (ret < 0) {
- ERROR("Failed to receive names and ifindices for network devices from child");
- goto out_delete_net;
- }
}
- ret = lxc_setup_devpts_parent(handler);
- if (ret < 0) {
- SYSERROR("Failed to receive devpts fd from child");
+ /*
+ * Tell the child to complete its initialization and wait for it to
+ * exec or return an error. (The child will never return
+ * START_SYNC_READY_START+1. It will either close the sync pipe,
+ * causing lxc_sync_barrier_child to return success, or return a
+ * different value, causing us to error out).
+ */
+ if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
goto out_delete_net;
- }
/* Now all networks are created, network devices are moved into place,
* and the correct names and ifindices in the respective namespaces have
*/
lxc_log_configured_netdevs(conf);
- /* Read tty fds allocated by child. */
- ret = lxc_recv_ttys_from_child(handler);
- if (ret < 0) {
- ERROR("Failed to receive tty info from child process");
- goto out_delete_net;
- }
-
- ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
- if (ret < 0) {
- SYSERROR("Failed to receive seccomp notify fd from child");
- goto out_delete_net;
- }
-
ret = handler->ops->post_start(handler, handler->data);
if (ret < 0)
goto out_abort;
goto out_abort;
}
+ if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
+ ERROR("Failed to delegate controllers to monitor cgroup");
+ ret = -1;
+ goto out_abort;
+ }
+
if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
ERROR("Failed to enter monitor cgroup");
ret = -1;
goto out_abort;
}
- if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
- ERROR("Failed to delegate controllers to monitor cgroup");
+ /* If the rootfs is not a blockdev, prevent the container from marking
+ * it readonly.
+ * If the container is unprivileged then skip rootfs pinning.
+ */
+ ret = lxc_rootfs_init(conf, !list_empty(&conf->id_map));
+ if (ret) {
+ ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
ret = -1;
goto out_abort;
}
- if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
- /* If the backing store is a device, mount it here and now. */
+ if (geteuid() == 0 && !list_empty(&conf->id_map)) {
+ /*
+ * Most filesystems can't be mounted inside a userns so handle them here.
+ */
if (rootfs_is_blockdev(conf)) {
ret = unshare(CLONE_NEWNS);
if (ret < 0) {
}
INFO("Unshared CLONE_NEWNS");
- turn_into_dependent_mounts();
ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
if (ret < 0) {
ERROR("Error setting up rootfs mount as root before spawn");
}
INFO("Destroyed rootfs for container \"%s\"", name);
- ret = snprintf(destroy, PATH_MAX, "%s/%s", handler->lxcpath, name);
- if (ret < 0 || ret >= PATH_MAX) {
+ ret = strnprintf(destroy, sizeof(destroy), "%s/%s", handler->lxcpath, name);
+ if (ret < 0) {
ERROR("Error destroying directory for container \"%s\"", name);
return;
}