Merge pull request #4157 from brauner/2022-06-30.fixes

[mirror_lxc.git] / src / lxc / start.c
diff --git a/src/lxc/start.c b/src/lxc/start.c

index 247dd1a18c2ae129c04de5afedaee639ae9e61a6..bc6b2252b01cae1e949f26130dbd7480b441dfa1 100644 (file)
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1,8 +1,7 @@
  /* SPDX-License-Identifier: LGPL-2.1+ */
  
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE 1
-#endif
+#include "config.h"
+
  #include <dirent.h>
  #include <errno.h>
  #include <fcntl.h>
@@ -25,7 +24,10 @@
  #include <sys/wait.h>
  #include <unistd.h>
  
+#include "lxc.h"
+
  #include "af_unix.h"
+#include "attach_options.h"
  #include "caps.h"
  #include "cgroups/cgroup.h"
  #include "cgroups/cgroup_utils.h"
@@ -33,14 +35,12 @@
  #include "commands_utils.h"
  #include "compiler.h"
  #include "conf.h"
-#include "config.h"
  #include "confile_utils.h"
  #include "error.h"
  #include "file_utils.h"
  #include "list.h"
  #include "log.h"
  #include "lsm/lsm.h"
-#include "lxccontainer.h"
  #include "lxclock.h"
  #include "lxcseccomp.h"
  #include "macro.h"
@@ -62,8 +62,8 @@
  #include <sys/capability.h>
  #endif
  
-#ifndef HAVE_STRLCPY
-#include "include/strlcpy.h"
+#if !HAVE_STRLCPY
+#include "strlcpy.h"
  #endif
  
  lxc_log_define(start, lxc);
@@ -150,9 +150,6 @@ static int lxc_try_preserve_namespace(struct lxc_handler *handler,
  static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
                                         int ns_clone_flags)
  {
-       for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++)
-               handler->nsfd[ns_idx] = -EBADF;
-
         for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
                 int ret;
                 const char *ns = ns_info[ns_idx].proc_name;
@@ -245,6 +242,7 @@ int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
         DIR *dir;
         struct dirent *direntp;
         unsigned int listen_fds_max;
+       struct lxc_state_client *client, *nclient;
  
         if (conf && conf->close_all_fds)
                 closeall = true;
@@ -267,7 +265,6 @@ restart:
  
         while ((direntp = readdir(dir))) {
                 int ret;
-               struct lxc_list *cur;
                 bool matched = false;
  
                 if (strequal(direntp->d_name, "."))
@@ -292,9 +289,7 @@ restart:
  
                 /* Keep state clients that wait on reboots. */
                 if (conf) {
-                       lxc_list_for_each(cur, &conf->state_clients) {
-                               struct lxc_state_client *client = cur->elem;
-
+                       list_for_each_entry_safe(client, nclient, &conf->state_clients, head) {
                                 if (client->clientfd != fd)
                                         continue;
  
@@ -318,7 +313,7 @@ restart:
  
  #endif
  
-               if (fd <= listen_fds_max) {
+               if ((size_t)fd <= listen_fds_max) {
                         INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
                         continue;
                 }
@@ -356,7 +351,7 @@ static int setup_signal_fd(sigset_t *oldmask)
         if (ret < 0)
                 return -EBADF;
  
-       for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
+       for (size_t sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
                 ret = sigdelset(&mask, signals[sig]);
                 if (ret < 0)
                         return -EBADF;
@@ -447,7 +442,7 @@ static int signal_handler(int fd, uint32_t events, void *data,
         /* More robustness, protect ourself from a SIGCHLD sent
          * by a process different from the container init.
          */
-       if (siginfo.ssi_pid != hdlr->pid) {
+       if ((__u64)siginfo.ssi_pid != (__u64)hdlr->pid) {
                 NOTICE("Received %d from pid %d instead of container init %d",
                        siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
@@ -472,10 +467,13 @@ static int signal_handler(int fd, uint32_t events, void *data,
  int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
                             lxc_state_t state)
  {
+       struct lxc_msg msg = {
+               .type   = lxc_msg_state,
+               .value  = state,
+       };
         size_t retlen;
         ssize_t ret;
-       struct lxc_list *cur, *next;
-       struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
+       struct lxc_state_client *client, *nclient;
  
         if (state == THAWED)
                 handler->state = RUNNING;
@@ -484,16 +482,14 @@ int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
  
         TRACE("Set container state to %s", lxc_state2str(state));
  
-       if (lxc_list_empty(&handler->conf->state_clients))
+       if (list_empty(&handler->conf->state_clients))
                 return log_trace(0, "No state clients registered");
  
         retlen = strlcpy(msg.name, name, sizeof(msg.name));
         if (retlen >= sizeof(msg.name))
                 return -E2BIG;
  
-       lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
-               struct lxc_state_client *client = cur->elem;
-
+       list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
                 if (client->states[state] == 0) {
                         TRACE("State %s not registered for state client %d",
                               lxc_state2str(state), client->clientfd);
@@ -508,10 +504,9 @@ int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
                         SYSERROR("Failed to send message to client");
  
                 /* kick client from list */
-               lxc_list_del(cur);
+               list_del(&client->head);
                 close(client->clientfd);
-               free(cur->elem);
-               free(cur);
+               free(client);
         }
  
         return 0;
@@ -631,7 +626,8 @@ int lxc_poll(const char *name, struct lxc_handler *handler)
         TRACE("Mainloop is ready");
  
         ret = lxc_mainloop(&descr, -1);
-       close_prot_errno_disarm(descr.epfd);
+       if (descr.type == LXC_MAINLOOP_EPOLL)
+               close_prot_errno_disarm(descr.epfd);
         if (ret < 0 || !handler->init_died)
                 goto out_mainloop_console;
  
@@ -706,7 +702,7 @@ struct lxc_handler *lxc_init_handler(struct lxc_handler *old,
         handler->state_socket_pair[0] = -EBADF;
         handler->state_socket_pair[1] = -EBADF;
         if (handler->conf->reboot == REBOOT_NONE)
-               lxc_list_init(&handler->conf->state_clients);
+               INIT_LIST_HEAD(&handler->conf->state_clients);
  
         for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
                 handler->nsfd[idx] = -EBADF;
@@ -915,9 +911,9 @@ void lxc_expose_namespace_environment(const struct lxc_handler *handler)
  void lxc_end(struct lxc_handler *handler)
  {
         int ret;
-       struct lxc_list *cur, *next;
         const char *name = handler->name;
         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
+       struct lxc_state_client *client, *nclient;
  
         /* The STOPPING state is there for future cleanup code which can take
          * awhile.
@@ -1009,19 +1005,16 @@ void lxc_end(struct lxc_handler *handler)
         /* The command socket is now closed, no more state clients can register
          * themselves from now on. So free the list of state clients.
          */
-       lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
-               struct lxc_state_client *client = cur->elem;
-
+       list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
                 /* Keep state clients that want to be notified about reboots. */
                 if ((handler->conf->reboot > REBOOT_NONE) &&
                     (client->states[RUNNING] == 2))
                         continue;
  
                 /* close state client socket */
-               lxc_list_del(cur);
+               list_del(&client->head);
                 close(client->clientfd);
-               free(cur->elem);
-               free(cur);
+               free(client);
         }
  
         if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
@@ -1057,12 +1050,11 @@ static int do_start(void *data)
  {
         struct lxc_handler *handler = data;
         __lxc_unused __do_close int data_sock0 = handler->data_sock[0],
-                                          data_sock1 = handler->data_sock[1];
+                                   data_sock1 = handler->data_sock[1];
         __do_close int devnull_fd = -EBADF, status_fd = -EBADF;
         int ret;
         uid_t new_uid;
         gid_t new_gid;
-       struct lxc_list *iterator;
         uid_t nsuid = 0;
         gid_t nsgid = 0;
  
@@ -1115,7 +1107,7 @@ static int do_start(void *data)
         /* If we are in a new user namespace, become root there to have
          * privilege over our namespace.
          */
-       if (!lxc_list_empty(&handler->conf->id_map)) {
+       if (!list_empty(&handler->conf->id_map)) {
                 if (!handler->conf->root_nsuid_map)
                         nsuid = handler->conf->init_uid;
  
@@ -1262,18 +1254,14 @@ static int do_start(void *data)
                 }
         }
  
-       /* Add the requested environment variables to the current environment to
-        * allow them to be used by the various hooks, such as the start hook
-        * below.
+       /*
+        * Add the requested environment variables to the current environment
+        * to allow them to be used by the various hooks, such as the start
+        * hook below.
          */
-       lxc_list_for_each(iterator, &handler->conf->environment) {
-               ret = putenv((char *)iterator->elem);
-               if (ret < 0) {
-                       SYSERROR("Failed to set environment variable: %s",
-                                (char *)iterator->elem);
-                       goto out_warn_father;
-               }
-       }
+       ret = lxc_set_environment(handler->conf);
+       if (ret < 0)
+               goto out_warn_father;
  
         if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
                 goto out_warn_father;
@@ -1366,14 +1354,9 @@ static int do_start(void *data)
         if (ret < 0)
                 SYSERROR("Failed to clear environment.");
  
-       lxc_list_for_each(iterator, &handler->conf->environment) {
-               ret = putenv((char *)iterator->elem);
-               if (ret < 0) {
-                       SYSERROR("Failed to set environment variable: %s",
-                                (char *)iterator->elem);
-                       goto out_warn_father;
-               }
-       }
+       ret = lxc_set_environment(handler->conf);
+       if (ret < 0)
+               goto out_warn_father;
  
         ret = putenv("container=lxc");
         if (ret < 0) {
@@ -1411,7 +1394,7 @@ static int do_start(void *data)
          * we switched to root in the new user namespace further above. Only
          * drop groups if we can, so ensure that we have necessary privilege.
          */
-       if (lxc_list_empty(&handler->conf->id_map)) {
+       if (list_empty(&handler->conf->id_map)) {
                 #if HAVE_LIBCAP
                 if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
                 #endif
@@ -1478,7 +1461,7 @@ int resolve_clone_flags(struct lxc_handler *handler)
                         if ((conf->ns_clone & ns_info[i].clone_flag))
                                 handler->ns_clone_flags |= ns_info[i].clone_flag;
                 } else {
-                       if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
+                       if (i == LXC_NS_USER && list_empty(&handler->conf->id_map))
                                 continue;
  
                         if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
@@ -1568,6 +1551,52 @@ static inline int do_share_ns(void *arg)
         return 0;
  }
  
+/*
+ * core_scheduling() - Optionally place the container's init process into a new
+ * core scheduling domain.
+ *
+ * @handler: handler of the container being started; reads handler->conf,
+ *           handler->ns_clone_flags and handler->pid.
+ *
+ * Does nothing unless conf->sched_core is set. When it is set, a new core
+ * scheduling cookie is created for handler->pid and then read back into
+ * conf->sched_core_cookie.
+ *
+ * Returns 0 when no domain was requested, when the kernel reports -ENODEV (in
+ * which case conf->sched_core is cleared), or when the cookie was created and
+ * retrieved. Otherwise returns whatever syserror()/syserror_set() yield;
+ * NOTE(review): presumably a negative value, since the caller in lxc_spawn()
+ * only checks for ret < 0 - confirm against the macro definitions.
+ */
+static int core_scheduling(struct lxc_handler *handler)
+{
+       struct lxc_conf *conf = handler->conf;
+       int ret;
+
+       if (!conf->sched_core)
+               return log_trace(0, "No new core scheduling domain requested");
+
+       /* Refuse the request when the container does not get its own pid namespace. */
+       if (!(handler->ns_clone_flags & CLONE_NEWPID))
+               return syserror_set(-EINVAL, "Core scheduling currently requires a separate pid namespace");
+
+       ret = core_scheduling_cookie_create_threadgroup(handler->pid);
+       if (ret < 0) {
+               /*
+                * -ENODEV is treated as non-fatal: the request is dropped and
+                * container startup continues without core scheduling.
+                */
+               if (ret == -ENODEV) {
+                       INFO("The kernel doesn't support or doesn't use simultaneous multithreading (SMT)");
+                       conf->sched_core = false;
+                       return 0;
+               }
+               if (ret == -EINVAL)
+                       return syserror("The kernel does not support core scheduling");
+
+               return syserror("Failed to create new core scheduling domain");
+       }
+
+       /* Read the new cookie back and reject it if it does not validate. */
+       ret = core_scheduling_cookie_get(handler->pid, &conf->sched_core_cookie);
+       if (ret || !core_scheduling_cookie_valid(conf->sched_core_cookie))
+               return syserror("Failed to retrieve core scheduling domain cookie");
+
+       TRACE("Created new core scheduling domain with cookie %llu",
+             (llu)conf->sched_core_cookie);
+
+       return 0;
+}
+
+/*
+ * inherits_namespaces() - Report whether the container configuration requests
+ * sharing at least one namespace.
+ *
+ * @handler: handler whose conf->ns_share[] table is inspected.
+ *
+ * Returns true as soon as any of the LXC_NS_MAX entries in conf->ns_share[] is
+ * set, false if none is.
+ */
+static bool inherits_namespaces(const struct lxc_handler *handler)
+{
+       struct lxc_conf *conf = handler->conf;
+
+       for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
+               if (conf->ns_share[i])
+                       return true;
+       }
+
+       return false;
+}
+
  /* lxc_spawn() performs crucial setup tasks and clone()s the new process which
   * exec()s the requested container binary.
   * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
@@ -1581,26 +1610,13 @@ static int lxc_spawn(struct lxc_handler *handler)
         int i, ret;
         char pidstr[20];
         bool wants_to_map_ids;
-       struct lxc_list *id_map;
+       struct list_head *id_map;
         const char *name = handler->name;
-       const char *lxcpath = handler->lxcpath;
-       bool share_ns = false;
         struct lxc_conf *conf = handler->conf;
         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
  
         id_map = &conf->id_map;
-       wants_to_map_ids = !lxc_list_empty(id_map);
-
-       for (i = 0; i < LXC_NS_MAX; i++) {
-               if (!conf->ns_share[i])
-                       continue;
-
-               handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i], lxcpath, ns_info[i].proc_name);
-               if (handler->nsfd[i] < 0)
-                       return -1;
-
-               share_ns = true;
-       }
+       wants_to_map_ids = !list_empty(id_map);
  
         if (!lxc_sync_init(handler))
                 return -1;
@@ -1612,10 +1628,6 @@ static int lxc_spawn(struct lxc_handler *handler)
         data_sock0 = handler->data_sock[0];
         data_sock1 = handler->data_sock[1];
  
-       ret = resolve_clone_flags(handler);
-       if (ret < 0)
-               goto out_sync_fini;
-
         if (handler->ns_clone_flags & CLONE_NEWNET) {
                 ret = lxc_find_gateway_addresses(handler);
                 if (ret) {
@@ -1630,7 +1642,7 @@ static int lxc_spawn(struct lxc_handler *handler)
         }
  
         /* Create a process in a new set of namespaces. */
-       if (share_ns) {
+       if (inherits_namespaces(handler)) {
                 pid_t attacher_pid;
  
                 attacher_pid = lxc_clone(do_share_ns, handler,
@@ -1724,6 +1736,10 @@ static int lxc_spawn(struct lxc_handler *handler)
                 handler->clone_flags &= ~CLONE_PIDFD;
         TRACE("Cloned child process %d", handler->pid);
  
+       ret = core_scheduling(handler);
+       if (ret < 0)
+               goto out_delete_net;
+
         /* Verify that we can actually make use of pidfds. */
         if (!lxc_can_use_pidfd(handler->pidfd))
                 close_prot_errno_disarm(handler->pidfd);
@@ -1821,18 +1837,16 @@ static int lxc_spawn(struct lxc_handler *handler)
                 }
         }
  
-       if (!lxc_list_empty(&conf->procs)) {
-               ret = setup_proc_filesystem(&conf->procs, handler->pid);
-               if (ret < 0)
-                       goto out_delete_net;
+       ret = setup_proc_filesystem(conf, handler->pid);
+       if (ret < 0) {
+               ERROR("Failed to setup procfs limits");
+               goto out_delete_net;
         }
  
-       if (!lxc_list_empty(&conf->limits)) {
-               ret = setup_resource_limits(&conf->limits, handler->pid);
-               if (ret < 0) {
-                       ERROR("Failed to setup resource limits");
-                       goto out_delete_net;
-               }
+       ret = setup_resource_limits(conf, handler->pid);
+       if (ret < 0) {
+               ERROR("Failed to setup resource limits");
+               goto out_delete_net;
         }
  
         /* Tell the child to continue its initialization. */
@@ -1973,6 +1987,28 @@ out_sync_fini:
         return -1;
  }
  
+/*
+ * lxc_inherit_namespaces() - Record a file descriptor for every namespace the
+ * configuration asks to share.
+ *
+ * @handler: handler providing lxcpath and conf->ns_share[]; the results are
+ *           stored in handler->nsfd[].
+ *
+ * For each namespace index with conf->ns_share[i] set, the value returned by
+ * lxc_inherit_namespace() is stored in handler->nsfd[i]. Entries without a
+ * share request are left untouched.
+ *
+ * Returns 0 on success and -1 as soon as one lookup yields a negative value.
+ * Descriptors recorded before the failure stay in handler->nsfd[]; this
+ * function does not close them itself.
+ */
+static int lxc_inherit_namespaces(struct lxc_handler *handler)
+{
+       const char *lxcpath = handler->lxcpath;
+       struct lxc_conf *conf = handler->conf;
+
+       for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
+               /* Nothing to inherit for this namespace type. */
+               if (!conf->ns_share[i])
+                       continue;
+
+               handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i],
+                                                       lxcpath,
+                                                       ns_info[i].proc_name);
+               if (handler->nsfd[i] < 0)
+                       return -1;
+
+               TRACE("Recording inherited %s namespace with fd %d",
+                     ns_info[i].proc_name, handler->nsfd[i]);
+       }
+
+       return 0;
+}
+
  int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
                 void *data, const char *lxcpath, bool daemonize, int *error_num)
  {
@@ -2015,18 +2051,32 @@ int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
                 goto out_abort;
         }
  
+       ret = resolve_clone_flags(handler);
+       if (ret < 0) {
+               ERROR("Failed to resolve clone flags");
+               ret = -1;
+               goto out_abort;
+       }
+
+       ret = lxc_inherit_namespaces(handler);
+       if (ret) {
+               SYSERROR("Failed to record inherited namespaces");
+               ret = -1;
+               goto out_abort;
+       }
+
         /* If the rootfs is not a blockdev, prevent the container from marking
          * it readonly.
          * If the container is unprivileged then skip rootfs pinning.
          */
-       ret = lxc_rootfs_init(conf, !lxc_list_empty(&conf->id_map));
+       ret = lxc_rootfs_init(conf, !list_empty(&conf->id_map));
         if (ret) {
                 ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
                 ret = -1;
                 goto out_abort;
         }
  
-       if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
+       if (geteuid() == 0 && !list_empty(&conf->id_map)) {
                 /*
                  * Most filesystems can't be mounted inside a userns so handle them here.
                  */
@@ -2076,19 +2126,20 @@ int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
          * In any case, treat it as a 'halt'.
          */
         if (WIFSIGNALED(status)) {
-               switch(WTERMSIG(status)) {
+               int signal_nr = WTERMSIG(status);
+               switch(signal_nr) {
                 case SIGINT: /* halt */
-                       DEBUG("Container \"%s\" is halting", name);
+                       DEBUG("%s(%d) - Container \"%s\" is halting", signal_name(signal_nr), signal_nr, name);
                         break;
                 case SIGHUP: /* reboot */
-                       DEBUG("Container \"%s\" is rebooting", name);
+                       DEBUG("%s(%d) - Container \"%s\" is rebooting", signal_name(signal_nr), signal_nr, name);
                         handler->conf->reboot = REBOOT_REQ;
                         break;
                 case SIGSYS: /* seccomp */
-                       DEBUG("Container \"%s\" violated its seccomp policy", name);
+                       DEBUG("%s(%d) - Container \"%s\" violated its seccomp policy", signal_name(signal_nr), signal_nr, name);
                         break;
                 default:
-                       DEBUG("Unknown exit status for container \"%s\" init %d", name, WTERMSIG(status));
+                       DEBUG("%s(%d) - Container \"%s\" init exited", signal_name(signal_nr), signal_nr, name);
                         break;
                 }
         }