src/lxc/start.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include "config.h"
   4
   5 #include <dirent.h>
   6 #include <errno.h>
   7 #include <fcntl.h>
   8 #include <grp.h>
   9 #include <poll.h>
  10 #include <pthread.h>
  11 #include <signal.h>
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/file.h>
  16 #include <sys/mount.h>
  17 #include <sys/param.h>
  18 #include <sys/prctl.h>
  19 #include <sys/socket.h>
  20 #include <sys/stat.h>
  21 #include <sys/syscall.h>
  22 #include <sys/types.h>
  23 #include <sys/un.h>
  24 #include <sys/wait.h>
  25 #include <unistd.h>
  26
  27 #include "lxc.h"
  28
  29 #include "af_unix.h"
  30 #include "attach_options.h"
  31 #include "caps.h"
  32 #include "cgroups/cgroup.h"
  33 #include "cgroups/cgroup_utils.h"
  34 #include "commands.h"
  35 #include "commands_utils.h"
  36 #include "compiler.h"
  37 #include "conf.h"
  38 #include "confile_utils.h"
  39 #include "error.h"
  40 #include "file_utils.h"
  41 #include "list.h"
  42 #include "log.h"
  43 #include "lsm/lsm.h"
  44 #include "lxclock.h"
  45 #include "lxcseccomp.h"
  46 #include "macro.h"
  47 #include "mainloop.h"
  48 #include "memory_utils.h"
  49 #include "monitor.h"
  50 #include "namespace.h"
  51 #include "network.h"
  52 #include "process_utils.h"
  53 #include "start.h"
  54 #include "storage/storage.h"
  55 #include "storage/storage_utils.h"
  56 #include "sync.h"
  57 #include "syscall_wrappers.h"
  58 #include "terminal.h"
  59 #include "utils.h"
  60
  61 #if HAVE_LIBCAP
  62 #include <sys/capability.h>
  63 #endif
  64
  65 #if !HAVE_STRLCPY
  66 #include "strlcpy.h"
  67 #endif
  68
  69 lxc_log_define(start, lxc);
  70
  71 extern void mod_all_rdeps(struct lxc_container *c, bool inc);
  72 static bool do_destroy_container(struct lxc_handler *handler);
  73 static int lxc_rmdir_onedev_wrapper(void *data);
  74 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
  75                                             const char *name);
  76
  77 static void print_top_failing_dir(const char *path)
  78 {
  79         __do_free char *copy = NULL;
  80         int ret;
  81         char *e, *p, saved;
  82
  83         copy = must_copy_string(path);
  84         p = copy;
  85         e = copy + strlen(path);
  86
  87         while (p < e) {
  88                 while (p < e && *p == '/')
  89                         p++;
  90
  91                 while (p < e && *p != '/')
  92                         p++;
  93
  94                 saved = *p;
  95                 *p = '\0';
  96
  97                 ret = access(copy, X_OK);
  98                 if (ret != 0) {
  99                         SYSERROR("Could not access %s. Please grant it x access, or add an ACL for the container " "root", copy);
 100                         return;
 101                 }
 102                 *p = saved;
 103         }
 104 }
 105
 106 static void lxc_put_nsfds(struct lxc_handler *handler)
 107 {
 108         for (int i = 0; i < LXC_NS_MAX; i++) {
 109                 if (handler->nsfd[i] < 0)
 110                         continue;
 111
 112                 close_prot_errno_disarm(handler->nsfd[i]);
 113         }
 114 }
 115
 116 static int lxc_try_preserve_namespace(struct lxc_handler *handler,
 117                                       lxc_namespace_t idx, const char *ns)
 118 {
 119         __do_close int fd = -EBADF;
 120         int ret;
 121
 122         fd = lxc_preserve_ns(handler->pid, ns);
 123         if (fd < 0)
 124                 return -errno;
 125
 126         ret = strnprintf(handler->nsfd_paths[idx],
 127                          sizeof(handler->nsfd_paths[idx]), "%s:/proc/%d/fd/%d",
 128                          ns_info[idx].proc_name, handler->monitor_pid, fd);
 129         if (ret < 0)
 130                 return ret_errno(EIO);
 131
 132         /*
 133          * In case LXC is configured for exposing information to hooks as
 134          * argv-style arguments prepare an argv array we can use.
 135          */
 136         handler->hook_argv[handler->hook_argc] = handler->nsfd_paths[idx];
 137         handler->hook_argc++;
 138
 139         DEBUG("Preserved %s namespace via fd %d and stashed path as %s",
 140               ns_info[idx].proc_name, fd, handler->nsfd_paths[idx]);
 141
 142         handler->nsfd[idx] = move_fd(fd);
 143         return 0;
 144 }
 145
 146 /* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
 147  * specified in ns_clone_flags.
 148  * Return true on success, false on failure.
 149  */
 150 static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
 151                                         int ns_clone_flags)
 152 {
 153         for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++)
 154                 handler->nsfd[ns_idx] = -EBADF;
 155
 156         for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
 157                 int ret;
 158                 const char *ns = ns_info[ns_idx].proc_name;
 159
 160                 if ((ns_clone_flags & ns_info[ns_idx].clone_flag) == 0)
 161                         continue;
 162
 163                 ret = lxc_try_preserve_namespace(handler, ns_idx,
 164                                                  ns_info[ns_idx].proc_name);
 165                 if (ret < 0) {
 166                         if (ret == -ENOENT) {
 167                                 SYSERROR("Kernel does not support preserving %s namespaces", ns);
 168                                 continue;
 169                         }
 170
 171                         /*
 172                          * Handle kernels that do not support interacting with
 173                          * namespaces through procfs.
 174                          */
 175                         lxc_put_nsfds(handler);
 176                         return log_error_errno(false, errno, "Failed to preserve %s namespace", ns);
 177                 }
 178         }
 179
 180         return true;
 181 }
 182
 183 static inline bool match_stdfds(int fd)
 184 {
 185         return (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO);
 186 }
 187
 188 #ifdef HAVE_DLOG
 189 static bool match_dlog_fds(struct dirent *direntp)
 190 {
 191         char path[PATH_MAX] = {0};
 192         char link[PATH_MAX] = {0};
 193         ssize_t linklen;
 194         int ret;
 195
 196         ret = strnprintf(path, sizeof(path), "/proc/self/fd/%s", direntp->d_name);
 197         if (ret < 0)
 198                 return log_error(false, "Failed to create file descriptor name");
 199
 200         linklen = readlink(path, link, PATH_MAX);
 201         if (linklen < 0)
 202                 return log_error(false, "Failed to read link path - \"%s\"", path);
 203         else if (linklen >= PATH_MAX)
 204                 return log_error(false, "The name of link path is too long - \"%s\"", path);
 205
 206         if (strequal(link, "/dev/log_main") ||
 207             strequal(link, "/dev/log_system") ||
 208             strequal(link, "/dev/log_radio"))
 209                 return true;
 210
 211         return false;
 212 }
 213 #endif
 214
 215 /* Parses the LISTEN_FDS environment variable value.
 216  * The returned value is the highest fd number up to which the
 217  * file descriptors must be passed to the container process.
 218  *
 219  * For example, if LISTEN_FDS=2 then 4 is returned and file descriptors 3 and 4
 220  * MUST be passed to the container process (in addition to the standard streams)
 221  * to support [socket activation][systemd-listen-fds].
 222  */
 223 static unsigned int get_listen_fds_max(void)
 224 {
 225         int ret;
 226         unsigned int num_fds;
 227         const char *val;
 228
 229         val = getenv("LISTEN_FDS");
 230         if (!val)
 231                 return 0;
 232
 233         ret = lxc_safe_uint(val, &num_fds);
 234         if (ret < 0)
 235                 return syserror_ret(0, "Failed to parse \"LISTEN_FDS=%s\" environment variable", val);
 236
 237         return log_trace(num_fds, "Parsed \"LISTEN_FDS=%s\" environment variable", val);
 238 }
 239
 240 int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
 241                         int *fds_to_ignore, size_t len_fds)
 242 {
 243         int fd, fddir;
 244         size_t i;
 245         DIR *dir;
 246         struct dirent *direntp;
 247         unsigned int listen_fds_max;
 248         struct lxc_state_client *client, *nclient;
 249
 250         if (conf && conf->close_all_fds)
 251                 closeall = true;
 252
 253         listen_fds_max = get_listen_fds_max();
 254
 255         /*
 256          * Disable syslog at this point to avoid the above logging
 257          * function to open a new fd and make the check_inherited function
 258          * enter an infinite loop.
 259          */
 260         lxc_log_syslog_disable();
 261
 262 restart:
 263         dir = opendir("/proc/self/fd");
 264         if (!dir)
 265                 return log_warn(-1, "Failed to open directory");
 266
 267         fddir = dirfd(dir);
 268
 269         while ((direntp = readdir(dir))) {
 270                 int ret;
 271                 bool matched = false;
 272
 273                 if (strequal(direntp->d_name, "."))
 274                         continue;
 275
 276                 if (strequal(direntp->d_name, ".."))
 277                         continue;
 278
 279                 ret = lxc_safe_int(direntp->d_name, &fd);
 280                 if (ret < 0) {
 281                         INFO("Could not parse file descriptor for \"%s\"", direntp->d_name);
 282                         continue;
 283                 }
 284
 285                 for (i = 0; i < len_fds; i++)
 286                         if (fds_to_ignore[i] == fd)
 287                                 break;
 288
 289                 if (fd == fddir || fd == lxc_log_fd ||
 290                     (i < len_fds && fd == fds_to_ignore[i]))
 291                         continue;
 292
 293                 /* Keep state clients that wait on reboots. */
 294                 if (conf) {
 295                         list_for_each_entry_safe(client, nclient, &conf->state_clients, head) {
 296                                 if (client->clientfd != fd)
 297                                         continue;
 298
 299                                 matched = true;
 300                                 break;
 301                         }
 302                 }
 303
 304                 if (matched)
 305                         continue;
 306
 307                 if (current_config && fd == current_config->logfd)
 308                         continue;
 309
 310                 if (match_stdfds(fd))
 311                         continue;
 312
 313 #ifdef HAVE_DLOG
 314                 if (match_dlog_fds(direntp))
 315                         continue;
 316
 317 #endif
 318
 319                 if ((size_t)fd <= listen_fds_max) {
 320                         INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
 321                         continue;
 322                 }
 323
 324                 if (closeall) {
 325                         if (close(fd))
 326                                 SYSINFO("Closed inherited fd %d", fd);
 327                         else
 328                                 INFO("Closed inherited fd %d", fd);
 329                         closedir(dir);
 330                         goto restart;
 331                 }
 332                 WARN("Inherited fd %d", fd);
 333         }
 334         closedir(dir);
 335
 336         /*
 337          * Only enable syslog at this point to avoid the above logging
 338          * function to open a new fd and make the check_inherited function
 339          * enter an infinite loop.
 340          */
 341         lxc_log_syslog_enable();
 342
 343         return 0;
 344 }
 345
 346 static int setup_signal_fd(sigset_t *oldmask)
 347 {
 348         int ret;
 349         sigset_t mask;
 350         const int signals[] = {SIGBUS, SIGILL, SIGSEGV, SIGWINCH};
 351
 352         /* Block everything except serious error signals. */
 353         ret = sigfillset(&mask);
 354         if (ret < 0)
 355                 return -EBADF;
 356
 357         for (size_t sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
 358                 ret = sigdelset(&mask, signals[sig]);
 359                 if (ret < 0)
 360                         return -EBADF;
 361         }
 362
 363         ret = pthread_sigmask(SIG_BLOCK, &mask, oldmask);
 364         if (ret < 0)
 365                 return log_error_errno(-EBADF, errno,
 366                                        "Failed to set signal mask");
 367
 368         ret = signalfd(-1, &mask, SFD_CLOEXEC);
 369         if (ret < 0)
 370                 return log_error_errno(-EBADF,
 371                                        errno, "Failed to create signal file descriptor");
 372
 373         TRACE("Created signal file descriptor %d", ret);
 374
 375         return ret;
 376 }
 377
 378 static int signal_handler(int fd, uint32_t events, void *data,
 379                           struct lxc_async_descr *descr)
 380 {
 381         int ret;
 382         siginfo_t info;
 383         struct signalfd_siginfo siginfo;
 384         struct lxc_handler *hdlr = data;
 385
 386         ret = lxc_read_nointr(fd, &siginfo, sizeof(siginfo));
 387         if (ret < 0)
 388                 return log_error(LXC_MAINLOOP_ERROR, "Failed to read signal info from signal file descriptor %d", fd);
 389
 390         if (ret != sizeof(siginfo))
 391                 return log_error(LXC_MAINLOOP_ERROR, "Unexpected size for struct signalfd_siginfo");
 392
 393         /* Check whether init is running. */
 394         info.si_pid = 0;
 395         ret = waitid(P_PID, hdlr->pid, &info, WEXITED | WNOWAIT | WNOHANG);
 396         if (ret == 0 && info.si_pid == hdlr->pid)
 397                 hdlr->init_died = true;
 398
 399         TRACE("Received signal ssi_signo(%d) for ssi_pid(%d), si_signo(%d), si_pid(%d)",
 400               siginfo.ssi_signo, siginfo.ssi_pid, info.si_signo, info.si_pid);
 401
 402         /* Try to figure out a reasonable exit status to report. */
 403         if (hdlr->init_died) {
 404                 switch (info.si_code) {
 405                 case CLD_EXITED:
 406                         hdlr->exit_status = info.si_status << 8;
 407                         break;
 408                 case CLD_KILLED:
 409                 case CLD_DUMPED:
 410                 case CLD_STOPPED:
 411                         hdlr->exit_status = info.si_status << 8 | 0x7f;
 412                         break;
 413                 case CLD_CONTINUED:
 414                         /* Huh? The waitid() told us it's dead *and* continued? */
 415                         WARN("Init %d dead and continued?", hdlr->pid);
 416                         hdlr->exit_status = 1;
 417                         break;
 418                 default:
 419                         ERROR("Unknown si_code: %d", info.si_code);
 420                         hdlr->exit_status = 1;
 421                 }
 422         }
 423
 424         if (siginfo.ssi_signo == SIGHUP) {
 425                 if (hdlr->pidfd >= 0)
 426                         lxc_raw_pidfd_send_signal(hdlr->pidfd, SIGTERM, NULL, 0);
 427                 else
 428                         kill(hdlr->pid, SIGTERM);
 429                 INFO("Killing %d since terminal hung up", hdlr->pid);
 430                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 431                                        : LXC_MAINLOOP_CONTINUE;
 432         }
 433
 434         if (siginfo.ssi_signo != SIGCHLD) {
 435                 if (hdlr->pidfd >= 0)
 436                         lxc_raw_pidfd_send_signal(hdlr->pidfd,
 437                                                   siginfo.ssi_signo, NULL, 0);
 438                 else
 439                         kill(hdlr->pid, siginfo.ssi_signo);
 440                 INFO("Forwarded signal %d to pid %d", siginfo.ssi_signo, hdlr->pid);
 441                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 442                                        : LXC_MAINLOOP_CONTINUE;
 443         }
 444
 445         /* More robustness, protect ourself from a SIGCHLD sent
 446          * by a process different from the container init.
 447          */
 448         if ((__u64)siginfo.ssi_pid != (__u64)hdlr->pid) {
 449                 NOTICE("Received %d from pid %d instead of container init %d",
 450                        siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
 451                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 452                                        : LXC_MAINLOOP_CONTINUE;
 453         }
 454
 455         if (siginfo.ssi_code == CLD_STOPPED) {
 456                 INFO("Container init process was stopped");
 457                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 458                                        : LXC_MAINLOOP_CONTINUE;
 459         }
 460
 461         if (siginfo.ssi_code == CLD_CONTINUED) {
 462                 INFO("Container init process was continued");
 463                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 464                                        : LXC_MAINLOOP_CONTINUE;
 465         }
 466
 467         return log_debug(LXC_MAINLOOP_CLOSE, "Container init process %d exited", hdlr->pid);
 468 }
 469
 470 int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
 471                             lxc_state_t state)
 472 {
 473         struct lxc_msg msg = {
 474                 .type   = lxc_msg_state,
 475                 .value  = state,
 476         };
 477         size_t retlen;
 478         ssize_t ret;
 479         struct lxc_state_client *client, *nclient;
 480
 481         if (state == THAWED)
 482                 handler->state = RUNNING;
 483         else
 484                 handler->state = state;
 485
 486         TRACE("Set container state to %s", lxc_state2str(state));
 487
 488         if (list_empty(&handler->conf->state_clients))
 489                 return log_trace(0, "No state clients registered");
 490
 491         retlen = strlcpy(msg.name, name, sizeof(msg.name));
 492         if (retlen >= sizeof(msg.name))
 493                 return -E2BIG;
 494
 495         list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
 496                 if (client->states[state] == 0) {
 497                         TRACE("State %s not registered for state client %d",
 498                               lxc_state2str(state), client->clientfd);
 499                         continue;
 500                 }
 501
 502                 TRACE("Sending state %s to state client %d",
 503                       lxc_state2str(state), client->clientfd);
 504
 505                 ret = lxc_send_nointr(client->clientfd, &msg, sizeof(msg), MSG_NOSIGNAL);
 506                 if (ret <= 0)
 507                         SYSERROR("Failed to send message to client");
 508
 509                 /* kick client from list */
 510                 list_del(&client->head);
 511                 close(client->clientfd);
 512                 free(client);
 513         }
 514
 515         return 0;
 516 }
 517
 518 static int lxc_serve_state_socket_pair(const char *name,
 519                                        struct lxc_handler *handler,
 520                                        lxc_state_t state)
 521 {
 522         ssize_t ret;
 523
 524         if (!handler->daemonize ||
 525             handler->state_socket_pair[1] < 0 ||
 526             state == STARTING)
 527                 return 0;
 528
 529         /* Close read end of the socket pair. */
 530         close_prot_errno_disarm(handler->state_socket_pair[0]);
 531
 532 again:
 533         ret = lxc_abstract_unix_send_credential(handler->state_socket_pair[1],
 534                                                 &(int){state}, sizeof(int));
 535         if (ret < 0) {
 536                 SYSERROR("Failed to send state to %d", handler->state_socket_pair[1]);
 537
 538                 if (errno == EINTR)
 539                         goto again;
 540
 541                 return -1;
 542         }
 543
 544         if (ret != sizeof(int))
 545                 return log_error(-1, "Message too long : %d", handler->state_socket_pair[1]);
 546
 547         TRACE("Sent container state \"%s\" to %d", lxc_state2str(state),
 548               handler->state_socket_pair[1]);
 549
 550         /* Close write end of the socket pair. */
 551         close_prot_errno_disarm(handler->state_socket_pair[1]);
 552
 553         return 0;
 554 }
 555
 556 int lxc_set_state(const char *name, struct lxc_handler *handler,
 557                   lxc_state_t state)
 558 {
 559         int ret;
 560
 561         ret = lxc_serve_state_socket_pair(name, handler, state);
 562         if (ret < 0)
 563                 return log_error(-1, "Failed to synchronize via anonymous pair of unix sockets");
 564
 565         ret = lxc_serve_state_clients(name, handler, state);
 566         if (ret < 0)
 567                 return -1;
 568
 569         /* This function will try to connect to the legacy lxc-monitord state
 570          * server and only exists for backwards compatibility.
 571          */
 572         lxc_monitor_send_state(name, state, handler->lxcpath);
 573
 574         return 0;
 575 }
 576
 577 int lxc_poll(const char *name, struct lxc_handler *handler)
 578 {
 579         int ret;
 580         struct lxc_terminal *console = &handler->conf->console;
 581         struct lxc_async_descr descr, descr_console;
 582
 583         if (!wants_console(console))
 584                 console = NULL;
 585
 586         ret = lxc_mainloop_open(&descr);
 587         if (ret < 0) {
 588                 ERROR("Failed to create mainloop");
 589                 goto out_sigfd;
 590         }
 591
 592         if (console) {
 593                 ret = lxc_mainloop_open(&descr_console);
 594                 if (ret < 0) {
 595                         ERROR("Failed to create console mainloop");
 596                         goto out_mainloop;
 597                 }
 598         }
 599
 600         ret = lxc_mainloop_add_handler(&descr, handler->sigfd,
 601                                        signal_handler,
 602                                        default_cleanup_handler,
 603                                        handler, "signal_handler");
 604         if (ret < 0) {
 605                 ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
 606                 goto out_mainloop_console;
 607         }
 608
 609         ret = lxc_seccomp_setup_proxy(&handler->conf->seccomp, &descr, handler);
 610         if (ret < 0) {
 611                 ERROR("Failed to setup seccomp proxy");
 612                 goto out_mainloop_console;
 613         }
 614
 615         if (console) {
 616                 ret = lxc_terminal_mainloop_add(&descr, console);
 617                 if (ret < 0) {
 618                         ERROR("Failed to add console handlers to mainloop");
 619                         goto out_mainloop_console;
 620                 }
 621         }
 622
 623         ret = lxc_cmd_mainloop_add(name, &descr, handler);
 624         if (ret < 0) {
 625                 ERROR("Failed to add command handler to mainloop");
 626                 goto out_mainloop_console;
 627         }
 628
 629         TRACE("Mainloop is ready");
 630
 631         ret = lxc_mainloop(&descr, -1);
 632         if (descr.type == LXC_MAINLOOP_EPOLL)
 633                 close_prot_errno_disarm(descr.epfd);
 634         if (ret < 0 || !handler->init_died)
 635                 goto out_mainloop_console;
 636
 637         if (console) {
 638                 ret = lxc_terminal_mainloop_add(&descr_console, console);
 639                 if (ret == 0)
 640                         ret = lxc_mainloop(&descr_console, 0);
 641         }
 642
 643 out_mainloop_console:
 644         if (console) {
 645                 lxc_mainloop_close(&descr_console);
 646                 TRACE("Closed console mainloop");
 647         }
 648
 649 out_mainloop:
 650         lxc_mainloop_close(&descr);
 651         TRACE("Closed mainloop");
 652
 653 out_sigfd:
 654         TRACE("Closed signal file descriptor %d", handler->sigfd);
 655         close_prot_errno_disarm(handler->sigfd);
 656
 657         return ret;
 658 }
 659
 660 void lxc_put_handler(struct lxc_handler *handler)
 661 {
 662         close_prot_errno_disarm(handler->pidfd);
 663         close_prot_errno_disarm(handler->sigfd);
 664         lxc_put_nsfds(handler);
 665         if (handler->conf && handler->conf->reboot == REBOOT_NONE)
 666                 close_prot_errno_disarm(handler->conf->maincmd_fd);
 667         close_prot_errno_disarm(handler->monitor_status_fd);
 668         close_prot_errno_disarm(handler->state_socket_pair[0]);
 669         close_prot_errno_disarm(handler->state_socket_pair[1]);
 670         cgroup_exit(handler->cgroup_ops);
 671         if (handler->conf && handler->conf->reboot == REBOOT_NONE)
 672                 free_disarm(handler);
 673         else
 674                 handler->conf = NULL;
 675 }
 676
 677 struct lxc_handler *lxc_init_handler(struct lxc_handler *old,
 678                                      const char *name, struct lxc_conf *conf,
 679                                      const char *lxcpath, bool daemonize)
 680 {
 681         int nr_keep_fds = 0;
 682         int ret;
 683         struct lxc_handler *handler;
 684
 685         if (!old)
 686                 handler = zalloc(sizeof(*handler));
 687         else
 688                 handler = old;
 689         if (!handler)
 690                 return NULL;
 691
 692         /* Note that am_guest_unpriv() checks the effective uid. We
 693          * probably don't care if we are real root only if we are running
 694          * as root so this should be fine.
 695          */
 696         handler->am_root = !am_guest_unpriv();
 697         handler->conf = conf;
 698         handler->lxcpath = lxcpath;
 699         handler->init_died = false;
 700         handler->data_sock[0] = -EBADF;
 701         handler->data_sock[1] = -EBADF;
 702         handler->monitor_status_fd = -EBADF;
 703         handler->pidfd = -EBADF;
 704         handler->sigfd = -EBADF;
 705         handler->state_socket_pair[0] = -EBADF;
 706         handler->state_socket_pair[1] = -EBADF;
 707         if (handler->conf->reboot == REBOOT_NONE)
 708                 INIT_LIST_HEAD(&handler->conf->state_clients);
 709
 710         for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
 711                 handler->nsfd[idx] = -EBADF;
 712
 713                 if (handler->conf->reboot == REBOOT_NONE)
 714                         continue;
 715
 716                 handler->nsfd_paths[idx][0] = '\0';
 717                 handler->hook_argv[idx] = NULL;
 718
 719                 if (handler->hook_argc != 0)
 720                         handler->hook_argc = 0;
 721         }
 722
 723         handler->name = name;
 724         if (daemonize)
 725                 handler->transient_pid = lxc_raw_getpid();
 726         else
 727                 handler->transient_pid = -1;
 728
 729         if (daemonize && handler->conf->reboot == REBOOT_NONE) {
 730                 /* Create socketpair() to synchronize on daemonized startup.
 731                  * When the container reboots we don't need to synchronize
 732                  * again currently so don't open another socketpair().
 733                  */
 734                 ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
 735                                  handler->state_socket_pair);
 736                 if (ret < 0) {
 737                         ERROR("Failed to create anonymous pair of unix sockets");
 738                         goto on_error;
 739                 }
 740
 741                 TRACE("Created anonymous pair {%d,%d} of unix sockets",
 742                       handler->state_socket_pair[0],
 743                       handler->state_socket_pair[1]);
 744                 handler->keep_fds[nr_keep_fds++] = handler->state_socket_pair[0];
 745                 handler->keep_fds[nr_keep_fds++] = handler->state_socket_pair[1];
 746         }
 747
 748         if (handler->conf->reboot == REBOOT_NONE) {
 749                 handler->conf->maincmd_fd = lxc_server_init(name, lxcpath, "command");
 750                 if (handler->conf->maincmd_fd < 0) {
 751                         ERROR("Failed to set up command socket");
 752                         goto on_error;
 753                 }
 754                 handler->keep_fds[nr_keep_fds++] = handler->conf->maincmd_fd;
 755         }
 756
 757         TRACE("Unix domain socket %d for command server is ready",
 758               handler->conf->maincmd_fd);
 759
 760         return handler;
 761
 762 on_error:
 763         lxc_put_handler(handler);
 764
 765         return NULL;
 766 }
 767
 768 int lxc_init(const char *name, struct lxc_handler *handler)
 769 {
 770         __do_close int status_fd = -EBADF;
 771         int ret;
 772         const char *loglevel;
 773         struct lxc_conf *conf = handler->conf;
 774
 775         handler->monitor_pid = lxc_raw_getpid();
 776         status_fd = open("/proc/self/status", O_RDONLY | O_CLOEXEC);
 777         if (status_fd < 0)
 778                 return log_error_errno(-1, errno, "Failed to open monitor status fd");
 779
 780         handler->lsm_ops = lsm_init_static();
 781         TRACE("Initialized LSM");
 782
 783         /* Begin by setting the state to STARTING. */
 784         ret = lxc_set_state(name, handler, STARTING);
 785         if (ret < 0)
 786                 return log_error(-1, "Failed to set state to \"%s\"", lxc_state2str(STARTING));
 787         TRACE("Set container state to \"STARTING\"");
 788
 789         /* Start of environment variable setup for hooks. */
 790         ret = setenv("LXC_NAME", name, 1);
 791         if (ret < 0)
 792                 SYSERROR("Failed to set environment variable: LXC_NAME=%s", name);
 793
 794         if (conf->rcfile) {
 795                 ret = setenv("LXC_CONFIG_FILE", conf->rcfile, 1);
 796                 if (ret < 0)
 797                         SYSERROR("Failed to set environment variable: LXC_CONFIG_FILE=%s", conf->rcfile);
 798         }
 799
 800         if (conf->rootfs.mount) {
 801                 ret = setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1);
 802                 if (ret < 0)
 803                         SYSERROR("Failed to set environment variable: LXC_ROOTFS_MOUNT=%s", conf->rootfs.mount);
 804         }
 805
 806         if (conf->rootfs.path) {
 807                 ret = setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1);
 808                 if (ret < 0)
 809                         SYSERROR("Failed to set environment variable: LXC_ROOTFS_PATH=%s", conf->rootfs.path);
 810         }
 811
 812         if (conf->console.path) {
 813                 ret = setenv("LXC_CONSOLE", conf->console.path, 1);
 814                 if (ret < 0)
 815                         SYSERROR("Failed to set environment variable: LXC_CONSOLE=%s", conf->console.path);
 816         }
 817
 818         if (conf->console.log_path) {
 819                 ret = setenv("LXC_CONSOLE_LOGPATH", conf->console.log_path, 1);
 820                 if (ret < 0)
 821                         SYSERROR("Failed to set environment variable: LXC_CONSOLE_LOGPATH=%s", conf->console.log_path);
 822         }
 823
 824         if (cgns_supported()) {
 825                 ret = setenv("LXC_CGNS_AWARE", "1", 1);
 826                 if (ret < 0)
 827                         SYSERROR("Failed to set environment variable LXC_CGNS_AWARE=1");
 828         }
 829
 830         loglevel = lxc_log_priority_to_string(lxc_log_get_level());
 831         ret = setenv("LXC_LOG_LEVEL", loglevel, 1);
 832         if (ret < 0)
 833                 SYSERROR("Set environment variable LXC_LOG_LEVEL=%s", loglevel);
 834
 835         if (conf->hooks_version == 0)
 836                 ret = setenv("LXC_HOOK_VERSION", "0", 1);
 837         else
 838                 ret = setenv("LXC_HOOK_VERSION", "1", 1);
 839         if (ret < 0)
 840                 SYSERROR("Failed to set environment variable LXC_HOOK_VERSION=%u", conf->hooks_version);
 841         /* End of environment variable setup for hooks. */
 842
 843         TRACE("Set environment variables");
 844
 845         ret = run_lxc_hooks(name, "pre-start", conf, NULL);
 846         if (ret < 0)
 847                 return log_error(-1, "Failed to run lxc.hook.pre-start for container \"%s\"", name);
 848         TRACE("Ran pre-start hooks");
 849
 850         ret = lxc_terminal_parent(conf);
 851         if (ret < 0)
 852                 return log_error(-1, "Failed to allocate terminal");
 853
 854         /* The signal fd has to be created before forking otherwise if the child
 855          * process exits before we setup the signal fd, the event will be lost
 856          * and the command will be stuck.
 857          */
 858         handler->sigfd = setup_signal_fd(&handler->oldmask);
 859         if (handler->sigfd < 0)
 860                 return log_error(-1, "Failed to setup SIGCHLD fd handler.");
 861         TRACE("Set up signal fd");
 862
 863         handler->cgroup_ops = cgroup_init(handler->conf);
 864         if (!handler->cgroup_ops) {
 865                 ERROR("Failed to initialize cgroup driver");
 866                 goto out_restore_sigmask;
 867         }
 868         TRACE("Initialized cgroup driver");
 869
 870         ret = lxc_read_seccomp_config(conf);
 871         if (ret < 0) {
 872                 ERROR("Failed to read seccomp policy");
 873                 goto out_restore_sigmask;
 874         }
 875         TRACE("Read seccomp policy");
 876
 877         ret = handler->lsm_ops->prepare(handler->lsm_ops, conf, handler->lxcpath);
 878         if (ret < 0) {
 879                 ERROR("Failed to initialize LSM");
 880                 goto out_restore_sigmask;
 881         }
 882         TRACE("Initialized LSM");
 883
 884         INFO("Container \"%s\" is initialized", name);
 885         handler->monitor_status_fd = move_fd(status_fd);
 886         return 0;
 887
 888 out_restore_sigmask:
 889         (void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
 890
 891         return -1;
 892 }
 893
 894 void lxc_expose_namespace_environment(const struct lxc_handler *handler)
 895 {
 896         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 897                 int ret;
 898                 const char *fd_path;
 899
 900                 if (handler->nsfd[i] < 0)
 901                         continue;
 902
 903                 fd_path = handler->nsfd_paths[i] + strcspn(handler->nsfd_paths[i], "/");
 904                 ret = setenv(ns_info[i].env_name, fd_path, 1);
 905                 if (ret < 0)
 906                         SYSERROR("Failed to set environment variable %s=%s",
 907                                  ns_info[i].env_name, fd_path);
 908                 else
 909                         TRACE("Set environment variable %s=%s",
 910                               ns_info[i].env_name, fd_path);
 911         }
 912 }
 913
 914 void lxc_end(struct lxc_handler *handler)
 915 {
 916         int ret;
 917         const char *name = handler->name;
 918         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
 919         struct lxc_state_client *client, *nclient;
 920
 921         /* The STOPPING state is there for future cleanup code which can take
 922          * awhile.
 923          */
 924         lxc_set_state(name, handler, STOPPING);
 925
 926         /* Passing information to hooks via environment variables. */
 927         if (handler->conf->hooks_version > 0)
 928                 lxc_expose_namespace_environment(handler);
 929
 930         if (handler->conf->reboot > REBOOT_NONE) {
 931                 ret = setenv("LXC_TARGET", "reboot", 1);
 932                 if (ret < 0)
 933                         SYSERROR("Failed to set environment variable: LXC_TARGET=reboot");
 934         }
 935
 936         if (handler->conf->reboot == REBOOT_NONE) {
 937                 ret = setenv("LXC_TARGET", "stop", 1);
 938                 if (ret < 0)
 939                         SYSERROR("Failed to set environment variable: LXC_TARGET=stop");
 940         }
 941
 942         if (handler->conf->hooks_version == 0)
 943                 ret = run_lxc_hooks(name, "stop", handler->conf, handler->hook_argv);
 944         else
 945                 ret = run_lxc_hooks(name, "stop", handler->conf, NULL);
 946         if (ret < 0)
 947                 ERROR("Failed to run \"lxc.hook.stop\" hook");
 948
 949         handler->lsm_ops->cleanup(handler->lsm_ops, handler->conf, handler->lxcpath);
 950
 951         if (cgroup_ops) {
 952                 cgroup_ops->payload_destroy(cgroup_ops, handler);
 953                 cgroup_ops->monitor_destroy(cgroup_ops, handler);
 954         }
 955
 956         put_lxc_rootfs(&handler->conf->rootfs, true);
 957
 958         if (handler->conf->reboot == REBOOT_NONE) {
 959                 /* For all new state clients simply close the command socket.
 960                  * This will inform all state clients that the container is
 961                  * STOPPED and also prevents a race between a open()/close() on
 962                  * the command socket causing a new process to get ECONNREFUSED
 963                  * because we haven't yet closed the command socket.
 964                  */
 965                 close_prot_errno_disarm(handler->conf->maincmd_fd);
 966                 TRACE("Closed command socket");
 967
 968                 /* This function will try to connect to the legacy lxc-monitord
 969                  * state server and only exists for backwards compatibility.
 970                  */
 971                 lxc_monitor_send_state(name, STOPPED, handler->lxcpath);
 972
 973                 /* The command socket is closed so no one can acces the command
 974                  * socket anymore so there's no need to lock it.
 975                  */
 976                 handler->state = STOPPED;
 977                 TRACE("Set container state to \"STOPPED\"");
 978         } else {
 979                 lxc_set_state(name, handler, STOPPED);
 980                 TRACE("Set container state to \"STOPPED\"");
 981         }
 982
 983         /* Avoid lingering namespace references. */
 984         lxc_put_nsfds(handler);
 985
 986         ret = run_lxc_hooks(name, "post-stop", handler->conf, NULL);
 987         if (ret < 0) {
 988                 ERROR("Failed to run lxc.hook.post-stop for container \"%s\"", name);
 989                 if (handler->conf->reboot > REBOOT_NONE) {
 990                         WARN("Container will be stopped instead of rebooted");
 991                         handler->conf->reboot = REBOOT_NONE;
 992
 993                         ret = setenv("LXC_TARGET", "stop", 1);
 994                         if (ret < 0)
 995                                 WARN("Failed to set environment variable: LXC_TARGET=stop");
 996                 }
 997         }
 998
 999         /* Reset mask set by setup_signal_fd. */
1000         ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
1001         if (ret < 0)
1002                 SYSWARN("Failed to restore signal mask");
1003
1004         lxc_terminal_delete(&handler->conf->console);
1005         lxc_delete_tty(&handler->conf->ttys);
1006         close_prot_errno_disarm(handler->conf->devpts_fd);
1007
1008         /* The command socket is now closed, no more state clients can register
1009          * themselves from now on. So free the list of state clients.
1010          */
1011         list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
1012                 /* Keep state clients that want to be notified about reboots. */
1013                 if ((handler->conf->reboot > REBOOT_NONE) &&
1014                     (client->states[RUNNING] == 2))
1015                         continue;
1016
1017                 /* close state client socket */
1018                 list_del(&client->head);
1019                 close(client->clientfd);
1020                 free(client);
1021         }
1022
1023         if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
1024                 lxc_destroy_container_on_signal(handler, name);
1025
1026         lxc_put_handler(handler);
1027 }
1028
1029 void lxc_abort(struct lxc_handler *handler)
1030 {
1031         int ret = 0;
1032         int status;
1033
1034         lxc_set_state(handler->name, handler, ABORTING);
1035
1036         if (handler->pidfd >= 0) {
1037                 ret = lxc_raw_pidfd_send_signal(handler->pidfd, SIGKILL, NULL, 0);
1038                 if (ret)
1039                         SYSWARN("Failed to send SIGKILL via pidfd %d for process %d",
1040                                 handler->pidfd, handler->pid);
1041         }
1042
1043         if ((!ret || errno != ESRCH) && handler->pid > 0)
1044                 if (kill(handler->pid, SIGKILL))
1045                         SYSWARN("Failed to send SIGKILL to %d", handler->pid);
1046
1047         do {
1048                 ret = waitpid(-1, &status, 0);
1049         } while (ret > 0);
1050 }
1051
1052 static int do_start(void *data)
1053 {
1054         struct lxc_handler *handler = data;
1055         __lxc_unused __do_close int data_sock0 = handler->data_sock[0],
1056                                     data_sock1 = handler->data_sock[1];
1057         __do_close int devnull_fd = -EBADF, status_fd = -EBADF;
1058         int ret;
1059         uid_t new_uid;
1060         gid_t new_gid;
1061         uid_t nsuid = 0;
1062         gid_t nsgid = 0;
1063
1064         lxc_sync_fini_parent(handler);
1065
1066         if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
1067                 ERROR("Failed to receive status file descriptor from parent process");
1068                 goto out_warn_father;
1069         }
1070
1071         /* This prctl must be before the synchro, so if the parent dies before
1072          * we set the parent death signal, we will detect its death with the
1073          * synchro right after, otherwise we have a window where the parent can
1074          * exit before we set the pdeath signal leading to a unsupervized
1075          * container.
1076          */
1077         ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
1078         if (ret < 0) {
1079                 SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
1080                 goto out_warn_father;
1081         }
1082
1083         ret = lxc_ambient_caps_up();
1084         if (ret < 0) {
1085                 ERROR("Failed to raise ambient capabilities");
1086                 goto out_warn_father;
1087         }
1088
1089         ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
1090         if (ret < 0) {
1091                 SYSERROR("Failed to set signal mask");
1092                 goto out_warn_father;
1093         }
1094
1095         if (!lxc_sync_wait_parent(handler, START_SYNC_STARTUP))
1096                 goto out_warn_father;
1097
1098         /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
1099          * https://github.com/lxc/lxd/issues/1978.
1100          */
1101         if (handler->ns_unshare_flags & CLONE_NEWNET) {
1102                 ret = unshare(CLONE_NEWNET);
1103                 if (ret < 0) {
1104                         SYSERROR("Failed to unshare CLONE_NEWNET");
1105                         goto out_warn_father;
1106                 }
1107                 INFO("Unshared CLONE_NEWNET");
1108         }
1109
1110         /* If we are in a new user namespace, become root there to have
1111          * privilege over our namespace.
1112          */
1113         if (!list_empty(&handler->conf->id_map)) {
1114                 if (!handler->conf->root_nsuid_map)
1115                         nsuid = handler->conf->init_uid;
1116
1117                 if (!handler->conf->root_nsgid_map)
1118                         nsgid = handler->conf->init_gid;
1119
1120                 /* Drop groups only after we switched to a valid gid in the new
1121                  * user namespace.
1122                  */
1123                 if (!lxc_drop_groups() &&
1124                     (handler->am_root || errno != EPERM))
1125                         goto out_warn_father;
1126
1127                 if (!lxc_switch_uid_gid(nsuid, nsgid))
1128                         goto out_warn_father;
1129
1130                 ret = prctl(PR_SET_DUMPABLE, prctl_arg(1), prctl_arg(0),
1131                             prctl_arg(0), prctl_arg(0));
1132                 if (ret < 0)
1133                         goto out_warn_father;
1134
1135                 /* set{g,u}id() clears deathsignal */
1136                 ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
1137                 if (ret < 0) {
1138                         SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
1139                         goto out_warn_father;
1140                 }
1141         }
1142
1143         ret = access(handler->lxcpath, X_OK);
1144         if (ret != 0) {
1145                 print_top_failing_dir(handler->lxcpath);
1146                 goto out_warn_father;
1147         }
1148
1149         /* In order to checkpoint restore, we need to have everything in the
1150          * same mount namespace. However, some containers may not have a
1151          * reasonable /dev (in particular, they may not have /dev/null), so we
1152          * can't set init's std fds to /dev/null by opening it from inside the
1153          * container.
1154          *
1155          * If that's the case, fall back to using the host's /dev/null. This
1156          * means that migration won't work, but at least we won't spew output
1157          * where it isn't wanted.
1158          */
1159         if (handler->daemonize && !handler->conf->autodev) {
1160                 char path[PATH_MAX];
1161
1162                 ret = strnprintf(path, sizeof(path), "%s/dev/null",
1163                                  handler->conf->rootfs.mount);
1164                 if (ret < 0)
1165                         goto out_warn_father;
1166
1167                 ret = access(path, F_OK);
1168                 if (ret != 0) {
1169                         devnull_fd = open_devnull();
1170
1171                         if (devnull_fd < 0)
1172                                 goto out_warn_father;
1173                         WARN("Using /dev/null from the host for container init's standard file descriptors. Migration will not work");
1174                 }
1175         }
1176
1177         /*
1178          * Tell the parent task it can begin to configure the container and wait
1179          * for it to finish.
1180          */
1181         if (!lxc_sync_wake_parent(handler, START_SYNC_CONFIGURE))
1182                 goto out_error;
1183
1184         /* Unshare cgroup namespace after we have setup our cgroups. If we do it
1185          * earlier we end up with a wrong view of /proc/self/cgroup. For
1186          * example, assume we unshare(CLONE_NEWCGROUP) first, and then create
1187          * the cgroup for the container, say /sys/fs/cgroup/cpuset/lxc/c, then
1188          * /proc/self/cgroup would show us:
1189          *
1190          *      8:cpuset:/lxc/c
1191          *
1192          * whereas it should actually show
1193          *
1194          *      8:cpuset:/
1195          */
1196         if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
1197                 ret = unshare(CLONE_NEWCGROUP);
1198                 if (ret < 0) {
1199                         if (errno != EINVAL) {
1200                                 SYSERROR("Failed to unshare CLONE_NEWCGROUP");
1201                                 goto out_warn_father;
1202                         }
1203
1204                         handler->ns_clone_flags &= ~CLONE_NEWCGROUP;
1205                         SYSINFO("Kernel does not support CLONE_NEWCGROUP");
1206                 } else {
1207                         INFO("Unshared CLONE_NEWCGROUP");
1208                 }
1209         }
1210
1211         if (handler->ns_unshare_flags & CLONE_NEWTIME) {
1212                 ret = unshare(CLONE_NEWTIME);
1213                 if (ret < 0) {
1214                         if (errno != EINVAL) {
1215                                 SYSERROR("Failed to unshare CLONE_NEWTIME");
1216                                 goto out_warn_father;
1217                         }
1218
1219                         handler->ns_clone_flags &= ~CLONE_NEWTIME;
1220                         SYSINFO("Kernel does not support CLONE_NEWTIME");
1221                 } else {
1222                         __do_close int timens_fd = -EBADF;
1223
1224                         INFO("Unshared CLONE_NEWTIME");
1225
1226                         if (handler->conf->timens.s_boot)
1227                                 ret = timens_offset_write(CLOCK_BOOTTIME, handler->conf->timens.s_boot, 0);
1228                         else if (handler->conf->timens.ns_boot)
1229                                 ret = timens_offset_write(CLOCK_BOOTTIME, 0, handler->conf->timens.ns_boot);
1230                         if (ret) {
1231                                 SYSERROR("Failed to write CLONE_BOOTTIME offset");
1232                                 goto out_warn_father;
1233                         }
1234                         TRACE("Wrote CLOCK_BOOTTIME offset");
1235
1236                         if (handler->conf->timens.s_monotonic)
1237                                 ret = timens_offset_write(CLOCK_MONOTONIC, handler->conf->timens.s_monotonic, 0);
1238                         else if (handler->conf->timens.ns_monotonic)
1239                                 ret = timens_offset_write(CLOCK_MONOTONIC, 0, handler->conf->timens.ns_monotonic);
1240                         if (ret) {
1241                                 SYSERROR("Failed to write CLONE_MONOTONIC offset");
1242                                 goto out_warn_father;
1243                         }
1244                         TRACE("Wrote CLOCK_MONOTONIC offset");
1245
1246                         timens_fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
1247                         if (timens_fd < 0) {
1248                                 SYSERROR("Failed to open \"/proc/self/ns/time_for_children\"");
1249                                 goto out_warn_father;
1250                         }
1251
1252                         ret = setns(timens_fd, CLONE_NEWTIME);
1253                         if (ret) {
1254                                 SYSERROR("Failed to setns(%d(\"/proc/self/ns/time_for_children\"))", timens_fd);
1255                                 goto out_warn_father;
1256                         }
1257                 }
1258         }
1259
1260         /*
1261          * Add the requested environment variables to the current environment
1262          * to allow them to be used by the various hooks, such as the start
1263          * hook below.
1264          */
1265         ret = lxc_set_environment(handler->conf);
1266         if (ret < 0)
1267                 goto out_warn_father;
1268
1269         if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
1270                 goto out_warn_father;
1271
1272         /* Setup the container, ip, names, utsname, ... */
1273         ret = lxc_setup(handler);
1274         if (ret < 0) {
1275                 ERROR("Failed to setup container \"%s\"", handler->name);
1276                 goto out_warn_father;
1277         }
1278
1279         /* Set the label to change to when we exec(2) the container's init. */
1280         ret = handler->lsm_ops->process_label_set(handler->lsm_ops, NULL, handler->conf, true);
1281         if (ret < 0)
1282                 goto out_warn_father;
1283
1284         /* Set PR_SET_NO_NEW_PRIVS after we changed the lsm label. If we do it
1285          * before we aren't allowed anymore.
1286          */
1287         if (handler->conf->no_new_privs) {
1288                 ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
1289                             prctl_arg(0), prctl_arg(0));
1290                 if (ret < 0) {
1291                         SYSERROR("Could not set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
1292                         goto out_warn_father;
1293                 }
1294                 DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
1295         }
1296
1297         /* If we mounted a temporary proc, then unmount it now. */
1298         tmp_proc_unmount(handler->conf);
1299
1300         ret = lxc_seccomp_load(handler->conf);
1301         if (ret < 0)
1302                 goto out_warn_father;
1303
1304         ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
1305         if (ret < 0) {
1306                 ERROR("Failed to run lxc.hook.start for container \"%s\"",
1307                       handler->name);
1308                 goto out_warn_father;
1309         }
1310
1311         close_prot_errno_disarm(handler->sigfd);
1312
1313         if (handler->conf->console.pty < 0 && handler->daemonize) {
1314                 if (devnull_fd < 0) {
1315                         devnull_fd = open_devnull();
1316                         if (devnull_fd < 0)
1317                                 goto out_warn_father;
1318                 }
1319
1320                 ret = set_stdfds(devnull_fd);
1321                 if (ret < 0) {
1322                         ERROR("Failed to redirect std{in,out,err} to \"/dev/null\"");
1323                         goto out_warn_father;
1324                 }
1325         }
1326
1327         close_prot_errno_disarm(devnull_fd);
1328
1329         setsid();
1330
1331         if (handler->conf->init_cwd) {
1332                 ret = chdir(handler->conf->init_cwd);
1333                 if (ret < 0) {
1334                         SYSERROR("Could not change directory to \"%s\"",
1335                                  handler->conf->init_cwd);
1336                         goto out_warn_father;
1337                 }
1338         }
1339
1340         if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
1341                 goto out_warn_father;
1342
1343         ret = lxc_sync_fds_child(handler);
1344         if (ret < 0) {
1345                 SYSERROR("Failed to sync file descriptors with parent");
1346                 goto out_warn_father;
1347         }
1348
1349         if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
1350                 goto out_warn_father;
1351
1352         /* Reset the environment variables the user requested in a clear
1353          * environment.
1354          */
1355         ret = clearenv();
1356         /* Don't error out though. */
1357         if (ret < 0)
1358                 SYSERROR("Failed to clear environment.");
1359
1360         ret = lxc_set_environment(handler->conf);
1361         if (ret < 0)
1362                 goto out_warn_father;
1363
1364         ret = putenv("container=lxc");
1365         if (ret < 0) {
1366                 SYSERROR("Failed to set environment variable: container=lxc");
1367                 goto out_warn_father;
1368         }
1369
1370         if (handler->conf->ttys.tty_names) {
1371                 ret = putenv(handler->conf->ttys.tty_names);
1372                 if (ret < 0) {
1373                         SYSERROR("Failed to set environment variable for container ptys");
1374                         goto out_warn_father;
1375                 }
1376         }
1377
1378         /* The container has been setup. We can now switch to an unprivileged
1379          * uid/gid.
1380          */
1381         new_uid = handler->conf->init_uid;
1382         new_gid = handler->conf->init_gid;
1383
1384         /* Avoid unnecessary syscalls. */
1385         if (new_uid == nsuid)
1386                 new_uid = LXC_INVALID_UID;
1387
1388         if (new_gid == nsgid)
1389                 new_gid = LXC_INVALID_GID;
1390
1391         /* Make sure that the processes STDIO is correctly owned by the user that we are switching to */
1392         ret = fix_stdio_permissions(new_uid);
1393         if (ret)
1394                 WARN("Failed to ajust stdio permissions");
1395
1396         /* If we are in a new user namespace we already dropped all groups when
1397          * we switched to root in the new user namespace further above. Only
1398          * drop groups if we can, so ensure that we have necessary privilege.
1399          */
1400         if (list_empty(&handler->conf->id_map)) {
1401                 #if HAVE_LIBCAP
1402                 if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
1403                 #endif
1404                 {
1405                         if (handler->conf->init_groups.size > 0) {
1406                                 if (!lxc_setgroups(handler->conf->init_groups.list,
1407                                                    handler->conf->init_groups.size))
1408                                         goto out_warn_father;
1409                         } else {
1410                                 if (!lxc_drop_groups())
1411                                         goto out_warn_father;
1412                         }
1413                 }
1414         }
1415
1416         if (!lxc_switch_uid_gid(new_uid, new_gid))
1417                 goto out_warn_father;
1418
1419         ret = lxc_ambient_caps_down();
1420         if (ret < 0) {
1421                 ERROR("Failed to clear ambient capabilities");
1422                 goto out_warn_father;
1423         }
1424
1425         if (handler->conf->monitor_signal_pdeath != SIGKILL) {
1426                 ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath,
1427                                            handler->monitor_pid, status_fd);
1428                 if (ret < 0) {
1429                         SYSERROR("Failed to set PR_SET_PDEATHSIG to %d",
1430                                  handler->conf->monitor_signal_pdeath);
1431                         goto out_warn_father;
1432                 }
1433         }
1434
1435         /*
1436          * After this call, we are in error because this ops should not return
1437          * as it execs.
1438          */
1439         handler->ops->start(handler, handler->data);
1440
1441 out_warn_father:
1442         /*
1443          * We want the parent to know something went wrong, so we return a
1444          * special error code.
1445          */
1446         lxc_sync_wake_parent(handler, SYNC_ERROR);
1447
1448 out_error:
1449         return -1;
1450 }
1451
1452 int resolve_clone_flags(struct lxc_handler *handler)
1453 {
1454         int i;
1455         struct lxc_conf *conf = handler->conf;
1456         bool wants_timens = conf->timens.s_boot || conf->timens.ns_boot ||
1457                             conf->timens.s_monotonic || conf->timens.ns_monotonic;
1458
1459         for (i = 0; i < LXC_NS_MAX; i++) {
1460                 if (conf->ns_keep) {
1461                         if (!(conf->ns_keep & ns_info[i].clone_flag))
1462                                 handler->ns_clone_flags |= ns_info[i].clone_flag;
1463                 } else if (conf->ns_clone) {
1464                         if ((conf->ns_clone & ns_info[i].clone_flag))
1465                                 handler->ns_clone_flags |= ns_info[i].clone_flag;
1466                 } else {
1467                         if (i == LXC_NS_USER && list_empty(&handler->conf->id_map))
1468                                 continue;
1469
1470                         if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
1471                                 continue;
1472
1473                         if (i == LXC_NS_CGROUP && !cgns_supported())
1474                                 continue;
1475
1476                         if (i == LXC_NS_TIME && !wants_timens)
1477                                 continue;
1478
1479                         handler->ns_clone_flags |= ns_info[i].clone_flag;
1480                 }
1481
1482                 if (!conf->ns_share[i])
1483                         continue;
1484
1485                 handler->ns_clone_flags &= ~ns_info[i].clone_flag;
1486                 TRACE("Sharing %s namespace", ns_info[i].proc_name);
1487         }
1488
1489         if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
1490                 return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
1491
1492         /* Deal with namespaces that are unshared. */
1493         if (handler->ns_clone_flags & CLONE_NEWTIME)
1494                 handler->ns_unshare_flags |= CLONE_NEWTIME;
1495
1496         if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
1497                 handler->ns_unshare_flags |= CLONE_NEWCGROUP;
1498
1499         if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
1500             (CLONE_NEWNET | CLONE_NEWUSER))
1501                 handler->ns_unshare_flags |= CLONE_NEWNET;
1502
1503         /* Deal with namespaces that are spawned. */
1504         handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
1505
1506         handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
1507
1508         return 0;
1509 }
1510
1511 /* Note that this function is used with clone(CLONE_VM). Some glibc versions
1512  * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD.
1513  * But since the memory between parent and child is shared on CLONE_VM this
1514  * would invalidate the getpid() cache that glibc used to maintain and so
1515  * getpid() in the child would return the parent's pid. This is all fixed in
1516  * newer glibc versions where the getpid() cache is removed and the pid/tid is
1517  * not reset anymore.
1518  * However, if for whatever reason you - dear committer - somehow need to get the
1519  * pid of the placeholder intermediate process for do_share_ns() you need to
1520  * call lxc_raw_getpid(). The next lxc_raw_clone() call does not employ
1521  * CLONE_VM and will be fine.
1522  */
1523 static inline int do_share_ns(void *arg)
1524 {
1525         int i, flags, ret;
1526         struct lxc_handler *handler = arg;
1527
1528         for (i = 0; i < LXC_NS_MAX; i++) {
1529                 if (handler->nsfd[i] < 0)
1530                         continue;
1531
1532                 ret = setns(handler->nsfd[i], 0);
1533                 if (ret < 0) {
1534                         /*
1535                          * Note that joining a user and/or mount namespace
1536                          * requires the process is not multithreaded otherwise
1537                          * setns() will fail here.
1538                          */
1539                         SYSERROR("Failed to inherit %s namespace",
1540                                  ns_info[i].proc_name);
1541                         return -1;
1542                 }
1543
1544                 DEBUG("Inherited %s namespace", ns_info[i].proc_name);
1545         }
1546
1547         flags = handler->ns_on_clone_flags;
1548         flags |= CLONE_PARENT;
1549         handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
1550                                         &handler->pidfd);
1551         if (handler->pid < 0)
1552                 return -1;
1553
1554         return 0;
1555 }
1556
1557 static int core_scheduling(struct lxc_handler *handler)
1558 {
1559         struct lxc_conf *conf = handler->conf;
1560         int ret;
1561
1562         if (!conf->sched_core)
1563                 return log_trace(0, "No new core scheduling domain requested");
1564
1565         if (!(handler->ns_clone_flags & CLONE_NEWPID))
1566                 return syserror_set(-EINVAL, "Core scheduling currently requires a separate pid namespace");
1567
1568         ret = core_scheduling_cookie_create_threadgroup(handler->pid);
1569         if (ret < 0) {
1570                 if (ret == -ENODEV) {
1571                         INFO("The kernel doesn't support or doesn't use simultaneous multithreading (SMT)");
1572                         conf->sched_core = false;
1573                         return 0;
1574                 }
1575                 if (ret == -EINVAL)
1576                         return syserror("The kernel does not support core scheduling");
1577
1578                 return syserror("Failed to create new core scheduling domain");
1579         }
1580
1581         ret = core_scheduling_cookie_get(handler->pid, &conf->sched_core_cookie);
1582         if (ret || !core_scheduling_cookie_valid(conf->sched_core_cookie))
1583                 return syserror("Failed to retrieve core scheduling domain cookie");
1584
1585         TRACE("Created new core scheduling domain with cookie %llu",
1586               (llu)conf->sched_core_cookie);
1587
1588         return 0;
1589 }
1590
/* lxc_spawn() performs crucial setup tasks and clone()s the new process which
 * exec()s the requested container binary.
 * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
 * right here should be double checked if they'd pose a security risk. (For
 * example, any {u}mount() operations performed here will be reflected on the
 * host!)
 *
 * Returns 0 once the START_SYNC_READY_START barrier succeeded, the
 * post_start() operation succeeded and the state was set to RUNNING. Returns
 * -1 on any failure.
 *
 * Failures that happen before lxc_sync_init() succeeded return directly.
 * Later failures unwind through the labels at the bottom, which fall through
 * into each other: out_delete_net (deletes the network when CLONE_NEWNET was
 * requested) -> out_abort (lxc_abort()) -> out_sync_fini (lxc_sync_fini()).
 */
static int lxc_spawn(struct lxc_handler *handler)
{
        /*
         * Mirrors of handler->data_sock[]. NOTE(review): __do_close is
         * presumably a cleanup attribute closing them when this function
         * returns - confirm in memory_utils.h.
         */
        __do_close int data_sock0 = -EBADF, data_sock1 = -EBADF;
        int i, ret;
        char pidstr[20];
        bool wants_to_map_ids;
        struct list_head *id_map;
        const char *name = handler->name;
        const char *lxcpath = handler->lxcpath;
        bool share_ns = false;
        struct lxc_conf *conf = handler->conf;
        struct cgroup_ops *cgroup_ops = handler->cgroup_ops;

        id_map = &conf->id_map;
        wants_to_map_ids = !list_empty(id_map);

        /*
         * Resolve every namespace the configuration wants to share into
         * handler->nsfd[]. If at least one is shared the container is cloned
         * through the do_share_ns() intermediate process below.
         */
        for (i = 0; i < LXC_NS_MAX; i++) {
                if (!conf->ns_share[i])
                        continue;

                handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i], lxcpath, ns_info[i].proc_name);
                if (handler->nsfd[i] < 0)
                        return -1;

                share_ns = true;
        }

        if (!lxc_sync_init(handler))
                return -1;

        /*
         * Socket pair between parent and child; further down the monitor
         * status fd is sent to the child over data_sock[0].
         */
        ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
                         handler->data_sock);
        if (ret < 0)
                goto out_sync_fini;
        data_sock0 = handler->data_sock[0];
        data_sock1 = handler->data_sock[1];

        ret = resolve_clone_flags(handler);
        if (ret < 0)
                goto out_sync_fini;

        if (handler->ns_clone_flags & CLONE_NEWNET) {
                ret = lxc_find_gateway_addresses(handler);
                if (ret) {
                        ERROR("Failed to find gateway addresses");
                        goto out_sync_fini;
                }
        }

        if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
                ERROR("Failed creating cgroups");
                goto out_delete_net;
        }

        /* Create a process in a new set of namespaces. */
        if (share_ns) {
                pid_t attacher_pid;

                /*
                 * The intermediate process runs do_share_ns() with CLONE_VM,
                 * i.e. in our address space, so the handler->pid and
                 * handler->pidfd it stores are visible here afterwards.
                 */
                attacher_pid = lxc_clone(do_share_ns, handler,
                                         CLONE_VFORK | CLONE_VM | CLONE_FILES, NULL);
                if (attacher_pid < 0) {
                        SYSERROR(LXC_CLONE_ERROR);
                        goto out_delete_net;
                }

                ret = wait_for_pid(attacher_pid);
                if (ret < 0) {
                        SYSERROR("Intermediate process failed");
                        goto out_delete_net;
                }

                /* Set by do_share_ns(); negative if its clone failed. */
                if (handler->pid < 0) {
                        SYSERROR(LXC_CLONE_ERROR);
                        goto out_delete_net;
                }
        } else {
                int cgroup_fd = -EBADF;

                struct lxc_clone_args clone_args = {
                        .flags = handler->clone_flags,
                        .pidfd = ptr_to_u64(&handler->pidfd),
                        .exit_signal = SIGCHLD,
                };

                /*
                 * Up to three attempts are made, newest interface first:
                 * clone3() with CLONE_INTO_CGROUP (only if a unified cgroup
                 * fd is available), clone3() without it, legacy clone().
                 */
                if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
                        cgroup_fd = cgroup_unified_fd(cgroup_ops);
                        if (cgroup_fd >= 0) {
                                handler->clone_flags    |= CLONE_INTO_CGROUP;
                                clone_args.flags        |= CLONE_INTO_CGROUP;
                                clone_args.cgroup       = cgroup_fd;
                        }
                }

                /* Try to spawn directly into target cgroup. */
                handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
                if (handler->pid < 0) {
                        SYSTRACE("Failed to spawn container directly into target cgroup");

                        /* Kernel might simply be too old for CLONE_INTO_CGROUP. */
                        /*
                         * The cgroup namespace is moved from the "on clone"
                         * set to the "unshare" set; it is preserved later in
                         * this function when ns_unshare_flags has
                         * CLONE_NEWCGROUP set.
                         */
                        handler->clone_flags            &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
                        handler->ns_on_clone_flags      &= ~CLONE_NEWCGROUP;
                        handler->ns_unshare_flags       |= CLONE_NEWCGROUP;

                        clone_args.flags                = handler->clone_flags;

                        handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
                } else if (cgroup_fd >= 0) {
                        TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
                }

                /* Kernel might be too old for clone3(). */
                if (handler->pid < 0) {
                        SYSTRACE("Failed to spawn container via clone3()");

                /*
                 * In contrast to all other architectures arm64 verifies that
                 * the argument we use to retrieve the pidfd with is
                 * initialized to 0. But we need to be able to initialize it to
                 * a negative value such as our customary -EBADF so we can
                 * detect whether this kernel supports pidfds. If the syscall
                 * returns and the pidfd variable is set to something >= 0 then
                 * we know this is a kernel supporting pidfds. But if we can't
                 * set it to -EBADF then this won't work since 0 is a valid
                 * file descriptor too. And since legacy clone silently ignores
                 * unknown flags we are left without any way to detect support
                 * for pidfds. So let's special-case arm64 to not fail starting
                 * containers.
                 */
                #if defined(__aarch64__)
                        handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
                #else
                        handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
                #endif
                }

                if (handler->pid < 0) {
                        SYSERROR(LXC_CLONE_ERROR);
                        goto out_delete_net;
                }

                /*
                 * A return value of 0 means we are the new child: run the
                 * container start routine and never return into the parent's
                 * code path.
                 */
                if (handler->pid == 0) {
                        (void)do_start(handler);
                        _exit(EXIT_FAILURE);
                }
        }
        /* No pidfd was obtained, so drop CLONE_PIDFD from the recorded flags. */
        if (handler->pidfd < 0)
                handler->clone_flags &= ~CLONE_PIDFD;
        TRACE("Cloned child process %d", handler->pid);

        ret = core_scheduling(handler);
        if (ret < 0)
                goto out_delete_net;

        /* Verify that we can actually make use of pidfds. */
        if (!lxc_can_use_pidfd(handler->pidfd))
                close_prot_errno_disarm(handler->pidfd);

        ret = strnprintf(pidstr, 20, "%d", handler->pid);
        if (ret < 0)
                goto out_delete_net;

        /* Export the container's pid; a failure here is logged but not fatal. */
        ret = setenv("LXC_PID", pidstr, 1);
        if (ret < 0)
                SYSERROR("Failed to set environment variable: LXC_PID=%s", pidstr);

        for (i = 0; i < LXC_NS_MAX; i++)
                if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
                        INFO("Cloned %s", ns_info[i].flag_name);

        if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags)) {
                ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
                goto out_delete_net;
        }

        /*
         * NOTE(review): presumably releases the child's end of the sync
         * channel in the parent - confirm in sync.c.
         */
        lxc_sync_fini_child(handler);

        if (lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->monitor_status_fd, 1, NULL, 0) < 0) {
                ERROR("Failed to send status file descriptor to child process");
                goto out_delete_net;
        }
        close_prot_errno_disarm(handler->monitor_status_fd);

        /* Map the container uids. The container became an invalid userid the
         * moment it was cloned with CLONE_NEWUSER. This call doesn't change
         * anything immediately, but allows the container to setuid(0) (0 being
         * mapped to something else on the host.) later to become a valid uid
         * again.
         */
        if (wants_to_map_ids) {
                /*
                 * Skipped when the user namespace is shared with another
                 * container or kept from the parent.
                 */
                if (!handler->conf->ns_share[LXC_NS_USER] &&
                    (handler->conf->ns_keep & CLONE_NEWUSER) == 0) {
                        ret = lxc_map_ids(id_map, handler->pid);
                        if (ret < 0) {
                                ERROR("Failed to set up id mapping.");
                                goto out_delete_net;
                        }
                }
        }

        /*
         * First pass over the legacy limits with the last argument false;
         * the pass with true happens further down and is the one reported
         * there as the legacy device controller limits.
         */
        if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
                ERROR("Failed to setup cgroup limits for container \"%s\"", name);
                goto out_delete_net;
        }

        if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
                ERROR("Failed to delegate controllers to payload cgroup");
                goto out_delete_net;
        }

        if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
                ERROR("Failed to enter cgroups");
                goto out_delete_net;
        }

        if (!cgroup_ops->setup_limits(cgroup_ops, handler)) {
                ERROR("Failed to setup cgroup limits for container \"%s\"", name);
                goto out_delete_net;
        }

        if (!cgroup_ops->chown(cgroup_ops, handler->conf))
                goto out_delete_net;

        /* First synchronization point with the child. */
        if (!lxc_sync_barrier_child(handler, START_SYNC_STARTUP))
                goto out_delete_net;

        /* If not done yet, we're now ready to preserve the network namespace */
        if (handler->nsfd[LXC_NS_NET] < 0) {
                ret = lxc_try_preserve_namespace(handler, LXC_NS_NET, "net");
                if (ret < 0) {
                        /* -ENOENT is tolerated; any other error is fatal. */
                        if (ret != -ENOENT) {
                                SYSERROR("Failed to preserve net namespace");
                                goto out_delete_net;
                        }
                }
        }
        /* Best effort: failing to allocate a netns id only produces a warning. */
        ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
        if (ret < 0)
                SYSWARN("Failed to allocate new network namespace id");
        else
                TRACE("Allocated new network namespace id");

        /* Create the network configuration. */
        if (handler->ns_clone_flags & CLONE_NEWNET) {
                ret = lxc_create_network(handler);
                if (ret < 0) {
                        ERROR("Failed to create the network");
                        goto out_delete_net;
                }
        }

        ret = setup_proc_filesystem(conf, handler->pid);
        if (ret < 0) {
                ERROR("Failed to setup procfs limits");
                goto out_delete_net;
        }

        ret = setup_resource_limits(conf, handler->pid);
        if (ret < 0) {
                ERROR("Failed to setup resource limits");
                goto out_delete_net;
        }

        /* Tell the child to continue its initialization. */
        if (!lxc_sync_wake_child(handler, START_SYNC_POST_CONFIGURE))
                goto out_delete_net;

        ret = lxc_rootfs_prepare_parent(handler);
        if (ret) {
                ERROR("Failed to prepare rootfs");
                goto out_delete_net;
        }

        if (handler->ns_clone_flags & CLONE_NEWNET) {
                ret = lxc_network_send_to_child(handler);
                if (ret < 0) {
                        SYSERROR("Failed to send veth names to child");
                        goto out_delete_net;
                }
        }

        if (!lxc_sync_wait_child(handler, START_SYNC_IDMAPPED_MOUNTS))
                goto out_delete_net;

        ret = lxc_idmapped_mounts_parent(handler);
        if (ret) {
                ERROR("Failed to setup mount entries");
                goto out_delete_net;
        }

        if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
                goto out_delete_net;

        /*
         * With isolation the limiting devices cgroup was already setup, so
         * only setup devices here if we have no namespace directory.
         */
        if (!handler->conf->cgroup_meta.namespace_dir &&
            !cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
                ERROR("Failed to setup legacy device cgroup controller limits");
                goto out_delete_net;
        }
        TRACE("Set up legacy device cgroup controller limits");

        if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
                ERROR("Failed to setup cgroup2 device controller limits");
                goto out_delete_net;
        }
        TRACE("Set up cgroup2 device controller limits");

        cgroup_ops->finalize(cgroup_ops);
        TRACE("Finished setting up cgroups");

        /* Run any host-side start hooks */
        ret = run_lxc_hooks(name, "start-host", conf, NULL);
        if (ret < 0) {
                ERROR("Failed to run lxc.hook.start-host");
                goto out_delete_net;
        }

        if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
                goto out_delete_net;

        /*
         * Namespaces listed in ns_unshare_flags were not part of the clone
         * flags, so they are preserved only at this late stage.
         */
        if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
                /* Now we're ready to preserve the cgroup namespace */
                ret = lxc_try_preserve_namespace(handler, LXC_NS_CGROUP, "cgroup");
                if (ret < 0) {
                        if (ret != -ENOENT) {
                                SYSERROR("Failed to preserve cgroup namespace");
                                goto out_delete_net;
                        }
                }
        }

        if (handler->ns_unshare_flags & CLONE_NEWTIME) {
                /* Now we're ready to preserve the time namespace */
                ret = lxc_try_preserve_namespace(handler, LXC_NS_TIME, "time");
                if (ret < 0) {
                        if (ret != -ENOENT) {
                                SYSERROR("Failed to preserve time namespace");
                                goto out_delete_net;
                        }
                }
        }

        ret = lxc_sync_fds_parent(handler);
        if (ret < 0) {
                SYSERROR("Failed to sync file descriptors with child");
                goto out_delete_net;
        }

        ret = lxc_terminal_setup(conf);
        if (ret < 0) {
                SYSERROR("Failed to create console");
                goto out_delete_net;
        }

        /*
         * Tell the child to complete its initialization and wait for it to
         * exec or return an error. (The child will never return
         * START_SYNC_READY_START+1. It will either close the sync pipe,
         * causing lxc_sync_barrier_child to return success, or return a
         * different value, causing us to error out).
         */
        if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
                goto out_delete_net;

        /* Now all networks are created, network devices are moved into place,
         * and the correct names and ifindices in the respective namespaces have
         * been recorded. The corresponding structs have now all been filled. So
         * log them for debugging purposes.
         */
        lxc_log_configured_netdevs(conf);

        /*
         * From here on failures jump to out_abort, i.e. they skip the network
         * deletion done under out_delete_net.
         */
        ret = handler->ops->post_start(handler, handler->data);
        if (ret < 0)
                goto out_abort;

        ret = lxc_set_state(name, handler, RUNNING);
        if (ret < 0) {
                ERROR("Failed to set state to \"%s\"", lxc_state2str(RUNNING));
                goto out_abort;
        }

        lxc_sync_fini(handler);

        return 0;

        /* Error unwinding: each label falls through into the next one. */
out_delete_net:
        if (handler->ns_clone_flags & CLONE_NEWNET)
                lxc_delete_network(handler);

out_abort:
        lxc_abort(handler);

out_sync_fini:
        lxc_sync_fini(handler);

        return -1;
}
1997
/*
 * Start a container and supervise it until its init process is gone.
 *
 * The sequence is: lxc_init(), attach the block device, create/delegate/enter
 * the monitor cgroup, initialize the rootfs, optionally prepare a block device
 * rootfs in a private mount namespace, lxc_spawn() the container, run the
 * mainloop via lxc_poll(), collect the exit status and tear everything down.
 *
 * @handler:   handler describing the container; ops, data and daemonize are
 *             stored in it after lxc_init() succeeded.
 * @ops:       start/post_start callbacks.
 * @data:      opaque pointer stored in handler->data.
 * @lxcpath:   passed to lxc_setup_rootfs_prepare_root().
 * @daemonize: stored in handler->daemonize.
 * @error_num: if non-NULL, receives handler->exit_status on the regular exit
 *             path (it is not written on the error paths).
 *
 * Returns the value of `ret` at the point of exit. On the regular exit path
 * that is the result of lxc_restore_phys_nics_to_netns().
 */
int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
                void *data, const char *lxcpath, bool daemonize, int *error_num)
{
        int ret, status;
        const char *name = handler->name;
        struct lxc_conf *conf = handler->conf;
        struct cgroup_ops *cgroup_ops;

        ret = lxc_init(name, handler);
        if (ret < 0) {
                ERROR("Failed to initialize container \"%s\"", name);
                goto out_abort;
        }
        handler->ops = ops;
        handler->data = data;
        handler->daemonize = daemonize;
        /* Read only after lxc_init() has run. */
        cgroup_ops = handler->cgroup_ops;

        if (!attach_block_device(handler->conf)) {
                ERROR("Failed to attach block device");
                ret = -1;
                goto out_abort;
        }

        if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
                ERROR("Failed to create monitor cgroup");
                ret = -1;
                goto out_abort;
        }

        if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
                ERROR("Failed to delegate controllers to monitor cgroup");
                ret = -1;
                goto out_abort;
        }

        if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
                ERROR("Failed to enter monitor cgroup");
                ret = -1;
                goto out_abort;
        }

        /* If the rootfs is not a blockdev, prevent the container from marking
         * it readonly.
         * If the container is unprivileged then skip rootfs pinning.
         */
        ret = lxc_rootfs_init(conf, !list_empty(&conf->id_map));
        if (ret) {
                ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
                ret = -1;
                goto out_abort;
        }

        /* Only for root starting a container that has an id mapping. */
        if (geteuid() == 0 && !list_empty(&conf->id_map)) {
                /*
                 * Most filesystems can't be mounted inside a userns so handle them here.
                 */
                if (rootfs_is_blockdev(conf)) {
                        ret = unshare(CLONE_NEWNS);
                        if (ret < 0) {
                                ERROR("Failed to unshare CLONE_NEWNS");
                                goto out_abort;
                        }
                        INFO("Unshared CLONE_NEWNS");

                        ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
                        if (ret < 0) {
                                ERROR("Error setting up rootfs mount as root before spawn");
                                goto out_abort;
                        }
                        INFO("Set up container rootfs as host root");
                }
        }

        /* From here on error paths also detach the block device. */
        ret = lxc_spawn(handler);
        if (ret < 0) {
                ERROR("Failed to spawn container \"%s\"", name);
                goto out_detach_blockdev;
        }

        /* Reset before the mainloop; set to REBOOT_REQ below on SIGHUP. */
        handler->conf->reboot = REBOOT_NONE;

        ret = lxc_poll(name, handler);
        if (ret) {
                ERROR("LXC mainloop exited with error: %d", ret);
                goto out_delete_network;
        }

        /* The mainloop ended although init was not recorded as dead. */
        if (!handler->init_died && handler->pid > 0) {
                ERROR("Child process is not killed");
                ret = -1;
                goto out_delete_network;
        }

        /* A failure to fetch the status is logged; processing continues. */
        status = lxc_wait_for_pid_status(handler->pid);
        if (status < 0)
                SYSERROR("Failed to retrieve status for %d", handler->pid);

        /* If the child process exited but was not signaled, it didn't call
         * reboot. This should mean it was an lxc-execute which simply exited.
         * In any case, treat it as a 'halt'.
         */
        if (WIFSIGNALED(status)) {
                int signal_nr = WTERMSIG(status);
                switch(signal_nr) {
                case SIGINT: /* halt */
                        DEBUG("%s(%d) - Container \"%s\" is halting", signal_name(signal_nr), signal_nr, name);
                        break;
                case SIGHUP: /* reboot */
                        DEBUG("%s(%d) - Container \"%s\" is rebooting", signal_name(signal_nr), signal_nr, name);
                        handler->conf->reboot = REBOOT_REQ;
                        break;
                case SIGSYS: /* seccomp */
                        DEBUG("%s(%d) - Container \"%s\" violated its seccomp policy", signal_name(signal_nr), signal_nr, name);
                        break;
                default:
                        DEBUG("%s(%d) - Container \"%s\" init exited", signal_name(signal_nr), signal_nr, name);
                        break;
                }
        }

        /* A failure is logged only, but its value is what gets returned below. */
        ret = lxc_restore_phys_nics_to_netns(handler);
        if (ret < 0)
                ERROR("Failed to move physical network devices back to parent network namespace");

        lxc_monitor_send_exit_code(name, status, handler->lxcpath);
        lxc_error_set_and_log(handler->pid, status);
        if (error_num)
                *error_num = handler->exit_status;

        /* Regular teardown; note that lxc_abort() is not called here. */
        lxc_delete_network(handler);
        detach_block_device(handler->conf);
        lxc_end(handler);
        return ret;

        /* Failure before lxc_spawn(): abort and end only. */
out_abort:
        lxc_abort(handler);
        lxc_end(handler);
        return ret;

        /* lxc_spawn() failed: additionally detach the block device. */
out_detach_blockdev:
        lxc_abort(handler);
        detach_block_device(handler->conf);
        lxc_end(handler);
        return ret;

        /* Failure after a successful spawn: also restore NICs and delete the network. */
out_delete_network:
        lxc_abort(handler);
        lxc_restore_phys_nics_to_netns(handler);
        lxc_delete_network(handler);
        detach_block_device(handler->conf);
        lxc_end(handler);
        return ret;
}
2152
/*
 * Payload handed to the start_ops callbacks through the opaque data pointer:
 * the argument vector of the command to exec (argv[0] is the program).
 */
struct start_args {
        char *const *argv;
};
2156
2157 static int start(struct lxc_handler *handler, void* data)
2158 {
2159         struct start_args *arg = data;
2160
2161         NOTICE("Exec'ing \"%s\"", arg->argv[0]);
2162
2163         execvp(arg->argv[0], arg->argv);
2164         SYSERROR("Failed to exec \"%s\"", arg->argv[0]);
2165         return 0;
2166 }
2167
2168 static int post_start(struct lxc_handler *handler, void* data)
2169 {
2170         struct start_args *arg = data;
2171
2172         NOTICE("Started \"%s\" with pid \"%d\"", arg->argv[0], handler->pid);
2173         return 0;
2174 }
2175
/*
 * Operations passed to __lxc_start() by lxc_start(): start() execs the
 * requested command, post_start() logs that it was started.
 */
static struct lxc_operations start_ops = {
        .start = start,
        .post_start = post_start
};
2180
2181 int lxc_start(char *const argv[], struct lxc_handler *handler,
2182               const char *lxcpath, bool daemonize, int *error_num)
2183 {
2184         struct start_args start_arg = {
2185                 .argv = argv,
2186         };
2187
2188         TRACE("Doing lxc_start");
2189         return __lxc_start(handler, &start_ops, &start_arg, lxcpath, daemonize, error_num);
2190 }
2191
2192 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
2193                                             const char *name)
2194 {
2195         char destroy[PATH_MAX];
2196         struct lxc_container *c;
2197         int ret = 0;
2198         bool bret = true;
2199
2200         if (handler->conf->rootfs.path && handler->conf->rootfs.mount) {
2201                 bret = do_destroy_container(handler);
2202                 if (!bret) {
2203                         ERROR("Error destroying rootfs for container \"%s\"", name);
2204                         return;
2205                 }
2206         }
2207         INFO("Destroyed rootfs for container \"%s\"", name);
2208
2209         ret = strnprintf(destroy, sizeof(destroy), "%s/%s", handler->lxcpath, name);
2210         if (ret < 0) {
2211                 ERROR("Error destroying directory for container \"%s\"", name);
2212                 return;
2213         }
2214
2215         c = lxc_container_new(name, handler->lxcpath);
2216         if (c) {
2217                 if (container_disk_lock(c)) {
2218                         INFO("Could not update lxc_snapshots file");
2219                         lxc_container_put(c);
2220                 } else {
2221                         mod_all_rdeps(c, false);
2222                         container_disk_unlock(c);
2223                         lxc_container_put(c);
2224                 }
2225         }
2226
2227         if (!handler->am_root)
2228                 ret = userns_exec_full(handler->conf, lxc_rmdir_onedev_wrapper,
2229                                        destroy, "lxc_rmdir_onedev_wrapper");
2230         else
2231                 ret = lxc_rmdir_onedev(destroy, NULL);
2232
2233         if (ret < 0) {
2234                 ERROR("Error destroying directory for container \"%s\"", name);
2235                 return;
2236         }
2237         INFO("Destroyed directory for container \"%s\"", name);
2238 }
2239
2240 static int lxc_rmdir_onedev_wrapper(void *data)
2241 {
2242         char *arg = (char *) data;
2243         return lxc_rmdir_onedev(arg, NULL);
2244 }
2245
2246 static bool do_destroy_container(struct lxc_handler *handler)
2247 {
2248         int ret;
2249
2250         if (!handler->am_root) {
2251                 ret = userns_exec_full(handler->conf, storage_destroy_wrapper,
2252                                        handler->conf, "storage_destroy_wrapper");
2253                 if (ret < 0)
2254                         return false;
2255
2256                 return true;
2257         }
2258
2259         return storage_destroy(handler->conf);
2260 }