/* SPDX-License-Identifier: LGPL-2.1+ */
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE 1
-#endif
+#include "config.h"
+
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/wait.h>
#include <unistd.h>
+#include "lxc.h"
+
#include "af_unix.h"
+#include "attach_options.h"
#include "caps.h"
#include "cgroups/cgroup.h"
#include "cgroups/cgroup_utils.h"
#include "commands_utils.h"
#include "compiler.h"
#include "conf.h"
-#include "config.h"
#include "confile_utils.h"
#include "error.h"
#include "file_utils.h"
#include "list.h"
#include "log.h"
#include "lsm/lsm.h"
-#include "lxccontainer.h"
#include "lxclock.h"
#include "lxcseccomp.h"
#include "macro.h"
#include <sys/capability.h>
#endif
-#ifndef HAVE_STRLCPY
-#include "include/strlcpy.h"
+#if !HAVE_STRLCPY
+#include "strlcpy.h"
#endif
lxc_log_define(start, lxc);
static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
int ns_clone_flags)
{
- for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++)
- handler->nsfd[ns_idx] = -EBADF;
-
for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
int ret;
const char *ns = ns_info[ns_idx].proc_name;
DIR *dir;
struct dirent *direntp;
unsigned int listen_fds_max;
+ struct lxc_state_client *client, *nclient;
if (conf && conf->close_all_fds)
closeall = true;
while ((direntp = readdir(dir))) {
int ret;
- struct lxc_list *cur;
bool matched = false;
if (strequal(direntp->d_name, "."))
/* Keep state clients that wait on reboots. */
if (conf) {
- lxc_list_for_each(cur, &conf->state_clients) {
- struct lxc_state_client *client = cur->elem;
-
+ list_for_each_entry_safe(client, nclient, &conf->state_clients, head) {
if (client->clientfd != fd)
continue;
#endif
- if (fd <= listen_fds_max) {
+ if ((size_t)fd <= listen_fds_max) {
INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
continue;
}
if (ret < 0)
return -EBADF;
- for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
+ for (size_t sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
ret = sigdelset(&mask, signals[sig]);
if (ret < 0)
return -EBADF;
/* More robustness, protect ourself from a SIGCHLD sent
* by a process different from the container init.
*/
- if (siginfo.ssi_pid != hdlr->pid) {
+ if ((__u64)siginfo.ssi_pid != (__u64)hdlr->pid) {
NOTICE("Received %d from pid %d instead of container init %d",
siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
return hdlr->init_died ? LXC_MAINLOOP_CLOSE
int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
lxc_state_t state)
{
+ struct lxc_msg msg = {
+ .type = lxc_msg_state,
+ .value = state,
+ };
size_t retlen;
ssize_t ret;
- struct lxc_list *cur, *next;
- struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
+ struct lxc_state_client *client, *nclient;
if (state == THAWED)
handler->state = RUNNING;
TRACE("Set container state to %s", lxc_state2str(state));
- if (lxc_list_empty(&handler->conf->state_clients))
+ if (list_empty(&handler->conf->state_clients))
return log_trace(0, "No state clients registered");
retlen = strlcpy(msg.name, name, sizeof(msg.name));
if (retlen >= sizeof(msg.name))
return -E2BIG;
- lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
- struct lxc_state_client *client = cur->elem;
-
+ list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
if (client->states[state] == 0) {
TRACE("State %s not registered for state client %d",
lxc_state2str(state), client->clientfd);
SYSERROR("Failed to send message to client");
/* kick client from list */
- lxc_list_del(cur);
+ list_del(&client->head);
close(client->clientfd);
- free(cur->elem);
- free(cur);
+ free(client);
}
return 0;
TRACE("Mainloop is ready");
ret = lxc_mainloop(&descr, -1);
- close_prot_errno_disarm(descr.epfd);
+ if (descr.type == LXC_MAINLOOP_EPOLL)
+ close_prot_errno_disarm(descr.epfd);
if (ret < 0 || !handler->init_died)
goto out_mainloop_console;
handler->state_socket_pair[0] = -EBADF;
handler->state_socket_pair[1] = -EBADF;
if (handler->conf->reboot == REBOOT_NONE)
- lxc_list_init(&handler->conf->state_clients);
+ INIT_LIST_HEAD(&handler->conf->state_clients);
for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
handler->nsfd[idx] = -EBADF;
void lxc_end(struct lxc_handler *handler)
{
int ret;
- struct lxc_list *cur, *next;
const char *name = handler->name;
struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
+ struct lxc_state_client *client, *nclient;
/* The STOPPING state is there for future cleanup code which can take
* awhile.
/* The command socket is now closed, no more state clients can register
* themselves from now on. So free the list of state clients.
*/
- lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
- struct lxc_state_client *client = cur->elem;
-
+ list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
/* Keep state clients that want to be notified about reboots. */
if ((handler->conf->reboot > REBOOT_NONE) &&
(client->states[RUNNING] == 2))
continue;
/* close state client socket */
- lxc_list_del(cur);
+ list_del(&client->head);
close(client->clientfd);
- free(cur->elem);
- free(cur);
+ free(client);
}
if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
{
struct lxc_handler *handler = data;
__lxc_unused __do_close int data_sock0 = handler->data_sock[0],
- data_sock1 = handler->data_sock[1];
+ data_sock1 = handler->data_sock[1];
__do_close int devnull_fd = -EBADF, status_fd = -EBADF;
int ret;
uid_t new_uid;
gid_t new_gid;
- struct lxc_list *iterator;
uid_t nsuid = 0;
gid_t nsgid = 0;
/* If we are in a new user namespace, become root there to have
* privilege over our namespace.
*/
- if (!lxc_list_empty(&handler->conf->id_map)) {
+ if (!list_empty(&handler->conf->id_map)) {
if (!handler->conf->root_nsuid_map)
nsuid = handler->conf->init_uid;
}
}
- /* Add the requested environment variables to the current environment to
- * allow them to be used by the various hooks, such as the start hook
- * below.
+ /*
+ * Add the requested environment variables to the current environment
+ * to allow them to be used by the various hooks, such as the start
+ * hook below.
*/
- lxc_list_for_each(iterator, &handler->conf->environment) {
- ret = putenv((char *)iterator->elem);
- if (ret < 0) {
- SYSERROR("Failed to set environment variable: %s",
- (char *)iterator->elem);
- goto out_warn_father;
- }
- }
+ ret = lxc_set_environment(handler->conf);
+ if (ret < 0)
+ goto out_warn_father;
if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
goto out_warn_father;
if (ret < 0)
SYSERROR("Failed to clear environment.");
- lxc_list_for_each(iterator, &handler->conf->environment) {
- ret = putenv((char *)iterator->elem);
- if (ret < 0) {
- SYSERROR("Failed to set environment variable: %s",
- (char *)iterator->elem);
- goto out_warn_father;
- }
- }
+ ret = lxc_set_environment(handler->conf);
+ if (ret < 0)
+ goto out_warn_father;
ret = putenv("container=lxc");
if (ret < 0) {
* we switched to root in the new user namespace further above. Only
* drop groups if we can, so ensure that we have necessary privilege.
*/
- if (lxc_list_empty(&handler->conf->id_map)) {
+ if (list_empty(&handler->conf->id_map)) {
#if HAVE_LIBCAP
if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
#endif
if ((conf->ns_clone & ns_info[i].clone_flag))
handler->ns_clone_flags |= ns_info[i].clone_flag;
} else {
- if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
+ if (i == LXC_NS_USER && list_empty(&handler->conf->id_map))
continue;
if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
return 0;
}
+/*
+ * Create a new core scheduling domain for the container's init process when
+ * the configuration requests one, and record the resulting cookie in the
+ * configuration.
+ *
+ * Nothing is done when conf->sched_core is unset. The request is rejected
+ * with -EINVAL when the container is not started with CLONE_NEWPID. If domain
+ * creation reports -ENODEV the request is cleared from the configuration and
+ * the function succeeds; every other creation or cookie lookup failure is
+ * reported through syserror().
+ */
+static int core_scheduling(struct lxc_handler *handler)
+{
+	struct lxc_conf *lxc_conf = handler->conf;
+	int rc;
+
+	if (!lxc_conf->sched_core)
+		return log_trace(0, "No new core scheduling domain requested");
+
+	if ((handler->ns_clone_flags & CLONE_NEWPID) == 0)
+		return syserror_set(-EINVAL, "Core scheduling currently requires a separate pid namespace");
+
+	rc = core_scheduling_cookie_create_threadgroup(handler->pid);
+
+	/* Not fatal: simply drop the request and carry on without a domain. */
+	if (rc == -ENODEV) {
+		INFO("The kernel doesn't support or doesn't use simultaneous multithreading (SMT)");
+		lxc_conf->sched_core = false;
+		return 0;
+	}
+
+	if (rc == -EINVAL)
+		return syserror("The kernel does not support core scheduling");
+
+	if (rc < 0)
+		return syserror("Failed to create new core scheduling domain");
+
+	/* The cookie is only checked for validity when the lookup succeeded. */
+	rc = core_scheduling_cookie_get(handler->pid, &lxc_conf->sched_core_cookie);
+	if (rc != 0 || !core_scheduling_cookie_valid(lxc_conf->sched_core_cookie))
+		return syserror("Failed to retrieve core scheduling domain cookie");
+
+	TRACE("Created new core scheduling domain with cookie %llu",
+	      (llu)lxc_conf->sched_core_cookie);
+
+	return 0;
+}
+
+/*
+ * Report whether the container configuration has a ns_share entry set for at
+ * least one namespace type.
+ */
+static bool inherits_namespaces(const struct lxc_handler *handler)
+{
+	lxc_namespace_t idx = 0;
+
+	while (idx < LXC_NS_MAX) {
+		if (handler->conf->ns_share[idx])
+			return true;
+
+		idx++;
+	}
+
+	return false;
+}
+
/* lxc_spawn() performs crucial setup tasks and clone()s the new process which
* exec()s the requested container binary.
* Note that lxc_spawn() runs in the parent namespaces. Any operations performed
int i, ret;
char pidstr[20];
bool wants_to_map_ids;
- struct lxc_list *id_map;
+ struct list_head *id_map;
const char *name = handler->name;
- const char *lxcpath = handler->lxcpath;
- bool share_ns = false;
struct lxc_conf *conf = handler->conf;
struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
id_map = &conf->id_map;
- wants_to_map_ids = !lxc_list_empty(id_map);
-
- for (i = 0; i < LXC_NS_MAX; i++) {
- if (!conf->ns_share[i])
- continue;
-
- handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i], lxcpath, ns_info[i].proc_name);
- if (handler->nsfd[i] < 0)
- return -1;
-
- share_ns = true;
- }
+ wants_to_map_ids = !list_empty(id_map);
if (!lxc_sync_init(handler))
return -1;
data_sock0 = handler->data_sock[0];
data_sock1 = handler->data_sock[1];
- ret = resolve_clone_flags(handler);
- if (ret < 0)
- goto out_sync_fini;
-
if (handler->ns_clone_flags & CLONE_NEWNET) {
ret = lxc_find_gateway_addresses(handler);
if (ret) {
}
/* Create a process in a new set of namespaces. */
- if (share_ns) {
+ if (inherits_namespaces(handler)) {
pid_t attacher_pid;
attacher_pid = lxc_clone(do_share_ns, handler,
handler->clone_flags &= ~CLONE_PIDFD;
TRACE("Cloned child process %d", handler->pid);
+ ret = core_scheduling(handler);
+ if (ret < 0)
+ goto out_delete_net;
+
/* Verify that we can actually make use of pidfds. */
if (!lxc_can_use_pidfd(handler->pidfd))
close_prot_errno_disarm(handler->pidfd);
}
}
- if (!lxc_list_empty(&conf->procs)) {
- ret = setup_proc_filesystem(&conf->procs, handler->pid);
- if (ret < 0)
- goto out_delete_net;
+ ret = setup_proc_filesystem(conf, handler->pid);
+ if (ret < 0) {
+ ERROR("Failed to setup procfs limits");
+ goto out_delete_net;
}
- if (!lxc_list_empty(&conf->limits)) {
- ret = setup_resource_limits(&conf->limits, handler->pid);
- if (ret < 0) {
- ERROR("Failed to setup resource limits");
- goto out_delete_net;
- }
+ ret = setup_resource_limits(conf, handler->pid);
+ if (ret < 0) {
+ ERROR("Failed to setup resource limits");
+ goto out_delete_net;
}
/* Tell the child to continue its initialization. */
return -1;
}
+/*
+ * For every namespace type that has a ns_share entry in the configuration,
+ * call lxc_inherit_namespace() and store its result in handler->nsfd[].
+ *
+ * Returns 0 once all configured entries have been processed, or -1 as soon
+ * as one lookup yields a negative value (that value is left in the
+ * corresponding handler->nsfd[] slot).
+ */
+static int lxc_inherit_namespaces(struct lxc_handler *handler)
+{
+	struct lxc_conf *lxc_conf = handler->conf;
+
+	for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
+		if (lxc_conf->ns_share[idx]) {
+			const char *ns_name = ns_info[idx].proc_name;
+			int fd;
+
+			fd = lxc_inherit_namespace(lxc_conf->ns_share[idx],
+						   handler->lxcpath, ns_name);
+			handler->nsfd[idx] = fd;
+			if (fd < 0)
+				return -1;
+
+			TRACE("Recording inherited %s namespace with fd %d",
+			      ns_name, fd);
+		}
+	}
+
+	return 0;
+}
+
int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
void *data, const char *lxcpath, bool daemonize, int *error_num)
{
goto out_abort;
}
+ ret = resolve_clone_flags(handler);
+ if (ret < 0) {
+ ERROR("Failed to resolve clone flags");
+ ret = -1;
+ goto out_abort;
+ }
+
+ ret = lxc_inherit_namespaces(handler);
+ if (ret) {
+ SYSERROR("Failed to record inherited namespaces");
+ ret = -1;
+ goto out_abort;
+ }
+
/* If the rootfs is not a blockdev, prevent the container from marking
* it readonly.
* If the container is unprivileged then skip rootfs pinning.
*/
- ret = lxc_rootfs_init(conf, !lxc_list_empty(&conf->id_map));
+ ret = lxc_rootfs_init(conf, !list_empty(&conf->id_map));
if (ret) {
ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
ret = -1;
goto out_abort;
}
- if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
+ if (geteuid() == 0 && !list_empty(&conf->id_map)) {
/*
* Most filesystems can't be mounted inside a userns so handle them here.
*/
* In any case, treat it as a 'halt'.
*/
if (WIFSIGNALED(status)) {
- switch(WTERMSIG(status)) {
+ int signal_nr = WTERMSIG(status);
+ switch(signal_nr) {
case SIGINT: /* halt */
- DEBUG("Container \"%s\" is halting", name);
+ DEBUG("%s(%d) - Container \"%s\" is halting", signal_name(signal_nr), signal_nr, name);
break;
case SIGHUP: /* reboot */
- DEBUG("Container \"%s\" is rebooting", name);
+ DEBUG("%s(%d) - Container \"%s\" is rebooting", signal_name(signal_nr), signal_nr, name);
handler->conf->reboot = REBOOT_REQ;
break;
case SIGSYS: /* seccomp */
- DEBUG("Container \"%s\" violated its seccomp policy", name);
+ DEBUG("%s(%d) - Container \"%s\" violated its seccomp policy", signal_name(signal_nr), signal_nr, name);
break;
default:
- DEBUG("Unknown exit status for container \"%s\" init %d", name, WTERMSIG(status));
+ DEBUG("%s(%d) - Container \"%s\" init exited", signal_name(signal_nr), signal_nr, name);
break;
}
}