src/lxc/start.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include "config.h"
   4
   5 #include <dirent.h>
   6 #include <errno.h>
   7 #include <fcntl.h>
   8 #include <grp.h>
   9 #include <poll.h>
  10 #include <pthread.h>
  11 #include <signal.h>
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/file.h>
  16 #include <sys/mount.h>
  17 #include <sys/param.h>
  18 #include <sys/prctl.h>
  19 #include <sys/socket.h>
  20 #include <sys/stat.h>
  21 #include <sys/syscall.h>
  22 #include <sys/types.h>
  23 #include <sys/un.h>
  24 #include <sys/wait.h>
  25 #include <unistd.h>
  26
  27 #include "lxc.h"
  28
  29 #include "af_unix.h"
  30 #include "attach_options.h"
  31 #include "caps.h"
  32 #include "cgroups/cgroup.h"
  33 #include "cgroups/cgroup_utils.h"
  34 #include "commands.h"
  35 #include "commands_utils.h"
  36 #include "compiler.h"
  37 #include "conf.h"
  38 #include "confile_utils.h"
  39 #include "error.h"
  40 #include "file_utils.h"
  41 #include "list.h"
  42 #include "log.h"
  43 #include "lsm/lsm.h"
  44 #include "lxclock.h"
  45 #include "lxcseccomp.h"
  46 #include "macro.h"
  47 #include "mainloop.h"
  48 #include "memory_utils.h"
  49 #include "monitor.h"
  50 #include "namespace.h"
  51 #include "network.h"
  52 #include "process_utils.h"
  53 #include "start.h"
  54 #include "storage/storage.h"
  55 #include "storage/storage_utils.h"
  56 #include "sync.h"
  57 #include "syscall_wrappers.h"
  58 #include "terminal.h"
  59 #include "utils.h"
  60
  61 #if HAVE_LIBCAP
  62 #include <sys/capability.h>
  63 #endif
  64
  65 #if !HAVE_STRLCPY
  66 #include "strlcpy.h"
  67 #endif
  68
  69 lxc_log_define(start, lxc);
  70
  71 extern void mod_all_rdeps(struct lxc_container *c, bool inc);
  72 static bool do_destroy_container(struct lxc_handler *handler);
  73 static int lxc_rmdir_onedev_wrapper(void *data);
  74 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
  75                                             const char *name);
  76
  77 static void print_top_failing_dir(const char *path)
  78 {
  79         __do_free char *copy = NULL;
  80         int ret;
  81         char *e, *p, saved;
  82
  83         copy = must_copy_string(path);
  84         p = copy;
  85         e = copy + strlen(path);
  86
  87         while (p < e) {
  88                 while (p < e && *p == '/')
  89                         p++;
  90
  91                 while (p < e && *p != '/')
  92                         p++;
  93
  94                 saved = *p;
  95                 *p = '\0';
  96
  97                 ret = access(copy, X_OK);
  98                 if (ret != 0) {
  99                         SYSERROR("Could not access %s. Please grant it x access, or add an ACL for the container " "root", copy);
 100                         return;
 101                 }
 102                 *p = saved;
 103         }
 104 }
 105
 106 static void lxc_put_nsfds(struct lxc_handler *handler)
 107 {
 108         for (int i = 0; i < LXC_NS_MAX; i++) {
 109                 if (handler->nsfd[i] < 0)
 110                         continue;
 111
 112                 close_prot_errno_disarm(handler->nsfd[i]);
 113         }
 114 }
 115
 116 static int lxc_try_preserve_namespace(struct lxc_handler *handler,
 117                                       lxc_namespace_t idx, const char *ns)
 118 {
 119         __do_close int fd = -EBADF;
 120         int ret;
 121
 122         fd = lxc_preserve_ns(handler->pid, ns);
 123         if (fd < 0)
 124                 return -errno;
 125
 126         ret = strnprintf(handler->nsfd_paths[idx],
 127                          sizeof(handler->nsfd_paths[idx]), "%s:/proc/%d/fd/%d",
 128                          ns_info[idx].proc_name, handler->monitor_pid, fd);
 129         if (ret < 0)
 130                 return ret_errno(EIO);
 131
 132         /*
 133          * In case LXC is configured for exposing information to hooks as
 134          * argv-style arguments prepare an argv array we can use.
 135          */
 136         handler->hook_argv[handler->hook_argc] = handler->nsfd_paths[idx];
 137         handler->hook_argc++;
 138
 139         DEBUG("Preserved %s namespace via fd %d and stashed path as %s",
 140               ns_info[idx].proc_name, fd, handler->nsfd_paths[idx]);
 141
 142         handler->nsfd[idx] = move_fd(fd);
 143         return 0;
 144 }
 145
 146 /* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
 147  * specified in ns_clone_flags.
 148  * Return true on success, false on failure.
 149  */
 150 static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
 151                                         int ns_clone_flags)
 152 {
 153         for (lxc_namespace_t ns_idx = 0; ns_idx < LXC_NS_MAX; ns_idx++) {
 154                 int ret;
 155                 const char *ns = ns_info[ns_idx].proc_name;
 156
 157                 if ((ns_clone_flags & ns_info[ns_idx].clone_flag) == 0)
 158                         continue;
 159
 160                 ret = lxc_try_preserve_namespace(handler, ns_idx,
 161                                                  ns_info[ns_idx].proc_name);
 162                 if (ret < 0) {
 163                         if (ret == -ENOENT) {
 164                                 SYSERROR("Kernel does not support preserving %s namespaces", ns);
 165                                 continue;
 166                         }
 167
 168                         /*
 169                          * Handle kernels that do not support interacting with
 170                          * namespaces through procfs.
 171                          */
 172                         lxc_put_nsfds(handler);
 173                         return log_error_errno(false, errno, "Failed to preserve %s namespace", ns);
 174                 }
 175         }
 176
 177         return true;
 178 }
 179
 180 static inline bool match_stdfds(int fd)
 181 {
 182         return (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO);
 183 }
 184
 185 #ifdef HAVE_DLOG
 186 static bool match_dlog_fds(struct dirent *direntp)
 187 {
 188         char path[PATH_MAX] = {0};
 189         char link[PATH_MAX] = {0};
 190         ssize_t linklen;
 191         int ret;
 192
 193         ret = strnprintf(path, sizeof(path), "/proc/self/fd/%s", direntp->d_name);
 194         if (ret < 0)
 195                 return log_error(false, "Failed to create file descriptor name");
 196
 197         linklen = readlink(path, link, PATH_MAX);
 198         if (linklen < 0)
 199                 return log_error(false, "Failed to read link path - \"%s\"", path);
 200         else if (linklen >= PATH_MAX)
 201                 return log_error(false, "The name of link path is too long - \"%s\"", path);
 202
 203         if (strequal(link, "/dev/log_main") ||
 204             strequal(link, "/dev/log_system") ||
 205             strequal(link, "/dev/log_radio"))
 206                 return true;
 207
 208         return false;
 209 }
 210 #endif
 211
 212 /* Parses the LISTEN_FDS environment variable value.
 213  * The returned value is the highest fd number up to which the
 214  * file descriptors must be passed to the container process.
 215  *
 216  * For example, if LISTEN_FDS=2 then 4 is returned and file descriptors 3 and 4
 217  * MUST be passed to the container process (in addition to the standard streams)
 218  * to support [socket activation][systemd-listen-fds].
 219  */
 220 static unsigned int get_listen_fds_max(void)
 221 {
 222         int ret;
 223         unsigned int num_fds;
 224         const char *val;
 225
 226         val = getenv("LISTEN_FDS");
 227         if (!val)
 228                 return 0;
 229
 230         ret = lxc_safe_uint(val, &num_fds);
 231         if (ret < 0)
 232                 return syserror_ret(0, "Failed to parse \"LISTEN_FDS=%s\" environment variable", val);
 233
 234         return log_trace(num_fds, "Parsed \"LISTEN_FDS=%s\" environment variable", val);
 235 }
 236
 237 int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
 238                         int *fds_to_ignore, size_t len_fds)
 239 {
 240         int fd, fddir;
 241         size_t i;
 242         DIR *dir;
 243         struct dirent *direntp;
 244         unsigned int listen_fds_max;
 245         struct lxc_state_client *client, *nclient;
 246
 247         if (conf && conf->close_all_fds)
 248                 closeall = true;
 249
 250         listen_fds_max = get_listen_fds_max();
 251
 252         /*
 253          * Disable syslog at this point to avoid the above logging
 254          * function to open a new fd and make the check_inherited function
 255          * enter an infinite loop.
 256          */
 257         lxc_log_syslog_disable();
 258
 259 restart:
 260         dir = opendir("/proc/self/fd");
 261         if (!dir)
 262                 return log_warn(-1, "Failed to open directory");
 263
 264         fddir = dirfd(dir);
 265
 266         while ((direntp = readdir(dir))) {
 267                 int ret;
 268                 bool matched = false;
 269
 270                 if (strequal(direntp->d_name, "."))
 271                         continue;
 272
 273                 if (strequal(direntp->d_name, ".."))
 274                         continue;
 275
 276                 ret = lxc_safe_int(direntp->d_name, &fd);
 277                 if (ret < 0) {
 278                         INFO("Could not parse file descriptor for \"%s\"", direntp->d_name);
 279                         continue;
 280                 }
 281
 282                 for (i = 0; i < len_fds; i++)
 283                         if (fds_to_ignore[i] == fd)
 284                                 break;
 285
 286                 if (fd == fddir || fd == lxc_log_fd ||
 287                     (i < len_fds && fd == fds_to_ignore[i]))
 288                         continue;
 289
 290                 /* Keep state clients that wait on reboots. */
 291                 if (conf) {
 292                         list_for_each_entry_safe(client, nclient, &conf->state_clients, head) {
 293                                 if (client->clientfd != fd)
 294                                         continue;
 295
 296                                 matched = true;
 297                                 break;
 298                         }
 299                 }
 300
 301                 if (matched)
 302                         continue;
 303
 304                 if (current_config && fd == current_config->logfd)
 305                         continue;
 306
 307                 if (match_stdfds(fd))
 308                         continue;
 309
 310 #ifdef HAVE_DLOG
 311                 if (match_dlog_fds(direntp))
 312                         continue;
 313
 314 #endif
 315
 316                 if ((size_t)fd <= listen_fds_max) {
 317                         INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
 318                         continue;
 319                 }
 320
 321                 if (closeall) {
 322                         if (close(fd))
 323                                 SYSINFO("Closed inherited fd %d", fd);
 324                         else
 325                                 INFO("Closed inherited fd %d", fd);
 326                         closedir(dir);
 327                         goto restart;
 328                 }
 329                 WARN("Inherited fd %d", fd);
 330         }
 331         closedir(dir);
 332
 333         /*
 334          * Only enable syslog at this point to avoid the above logging
 335          * function to open a new fd and make the check_inherited function
 336          * enter an infinite loop.
 337          */
 338         lxc_log_syslog_enable();
 339
 340         return 0;
 341 }
 342
 343 static int setup_signal_fd(sigset_t *oldmask)
 344 {
 345         int ret;
 346         sigset_t mask;
 347         const int signals[] = {SIGBUS, SIGILL, SIGSEGV, SIGWINCH};
 348
 349         /* Block everything except serious error signals. */
 350         ret = sigfillset(&mask);
 351         if (ret < 0)
 352                 return -EBADF;
 353
 354         for (size_t sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
 355                 ret = sigdelset(&mask, signals[sig]);
 356                 if (ret < 0)
 357                         return -EBADF;
 358         }
 359
 360         ret = pthread_sigmask(SIG_BLOCK, &mask, oldmask);
 361         if (ret < 0)
 362                 return log_error_errno(-EBADF, errno,
 363                                        "Failed to set signal mask");
 364
 365         ret = signalfd(-1, &mask, SFD_CLOEXEC);
 366         if (ret < 0)
 367                 return log_error_errno(-EBADF,
 368                                        errno, "Failed to create signal file descriptor");
 369
 370         TRACE("Created signal file descriptor %d", ret);
 371
 372         return ret;
 373 }
 374
 375 static int signal_handler(int fd, uint32_t events, void *data,
 376                           struct lxc_async_descr *descr)
 377 {
 378         int ret;
 379         siginfo_t info;
 380         struct signalfd_siginfo siginfo;
 381         struct lxc_handler *hdlr = data;
 382
 383         ret = lxc_read_nointr(fd, &siginfo, sizeof(siginfo));
 384         if (ret < 0)
 385                 return log_error(LXC_MAINLOOP_ERROR, "Failed to read signal info from signal file descriptor %d", fd);
 386
 387         if (ret != sizeof(siginfo))
 388                 return log_error(LXC_MAINLOOP_ERROR, "Unexpected size for struct signalfd_siginfo");
 389
 390         /* Check whether init is running. */
 391         info.si_pid = 0;
 392         ret = waitid(P_PID, hdlr->pid, &info, WEXITED | WNOWAIT | WNOHANG);
 393         if (ret == 0 && info.si_pid == hdlr->pid)
 394                 hdlr->init_died = true;
 395
 396         TRACE("Received signal ssi_signo(%d) for ssi_pid(%d), si_signo(%d), si_pid(%d)",
 397               siginfo.ssi_signo, siginfo.ssi_pid, info.si_signo, info.si_pid);
 398
 399         /* Try to figure out a reasonable exit status to report. */
 400         if (hdlr->init_died) {
 401                 switch (info.si_code) {
 402                 case CLD_EXITED:
 403                         hdlr->exit_status = info.si_status << 8;
 404                         break;
 405                 case CLD_KILLED:
 406                 case CLD_DUMPED:
 407                 case CLD_STOPPED:
 408                         hdlr->exit_status = info.si_status << 8 | 0x7f;
 409                         break;
 410                 case CLD_CONTINUED:
 411                         /* Huh? The waitid() told us it's dead *and* continued? */
 412                         WARN("Init %d dead and continued?", hdlr->pid);
 413                         hdlr->exit_status = 1;
 414                         break;
 415                 default:
 416                         ERROR("Unknown si_code: %d", info.si_code);
 417                         hdlr->exit_status = 1;
 418                 }
 419         }
 420
 421         if (siginfo.ssi_signo == SIGHUP) {
 422                 if (hdlr->pidfd >= 0)
 423                         lxc_raw_pidfd_send_signal(hdlr->pidfd, SIGTERM, NULL, 0);
 424                 else
 425                         kill(hdlr->pid, SIGTERM);
 426                 INFO("Killing %d since terminal hung up", hdlr->pid);
 427                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 428                                        : LXC_MAINLOOP_CONTINUE;
 429         }
 430
 431         if (siginfo.ssi_signo != SIGCHLD) {
 432                 if (hdlr->pidfd >= 0)
 433                         lxc_raw_pidfd_send_signal(hdlr->pidfd,
 434                                                   siginfo.ssi_signo, NULL, 0);
 435                 else
 436                         kill(hdlr->pid, siginfo.ssi_signo);
 437                 INFO("Forwarded signal %d to pid %d", siginfo.ssi_signo, hdlr->pid);
 438                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 439                                        : LXC_MAINLOOP_CONTINUE;
 440         }
 441
 442         /* More robustness, protect ourself from a SIGCHLD sent
 443          * by a process different from the container init.
 444          */
 445         if ((__u64)siginfo.ssi_pid != (__u64)hdlr->pid) {
 446                 NOTICE("Received %d from pid %d instead of container init %d",
 447                        siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
 448                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 449                                        : LXC_MAINLOOP_CONTINUE;
 450         }
 451
 452         if (siginfo.ssi_code == CLD_STOPPED) {
 453                 INFO("Container init process was stopped");
 454                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 455                                        : LXC_MAINLOOP_CONTINUE;
 456         }
 457
 458         if (siginfo.ssi_code == CLD_CONTINUED) {
 459                 INFO("Container init process was continued");
 460                 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
 461                                        : LXC_MAINLOOP_CONTINUE;
 462         }
 463
 464         return log_debug(LXC_MAINLOOP_CLOSE, "Container init process %d exited", hdlr->pid);
 465 }
 466
 467 int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
 468                             lxc_state_t state)
 469 {
 470         struct lxc_msg msg = {
 471                 .type   = lxc_msg_state,
 472                 .value  = state,
 473         };
 474         size_t retlen;
 475         ssize_t ret;
 476         struct lxc_state_client *client, *nclient;
 477
 478         if (state == THAWED)
 479                 handler->state = RUNNING;
 480         else
 481                 handler->state = state;
 482
 483         TRACE("Set container state to %s", lxc_state2str(state));
 484
 485         if (list_empty(&handler->conf->state_clients))
 486                 return log_trace(0, "No state clients registered");
 487
 488         retlen = strlcpy(msg.name, name, sizeof(msg.name));
 489         if (retlen >= sizeof(msg.name))
 490                 return -E2BIG;
 491
 492         list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
 493                 if (client->states[state] == 0) {
 494                         TRACE("State %s not registered for state client %d",
 495                               lxc_state2str(state), client->clientfd);
 496                         continue;
 497                 }
 498
 499                 TRACE("Sending state %s to state client %d",
 500                       lxc_state2str(state), client->clientfd);
 501
 502                 ret = lxc_send_nointr(client->clientfd, &msg, sizeof(msg), MSG_NOSIGNAL);
 503                 if (ret <= 0)
 504                         SYSERROR("Failed to send message to client");
 505
 506                 /* kick client from list */
 507                 list_del(&client->head);
 508                 close(client->clientfd);
 509                 free(client);
 510         }
 511
 512         return 0;
 513 }
 514
 515 static int lxc_serve_state_socket_pair(const char *name,
 516                                        struct lxc_handler *handler,
 517                                        lxc_state_t state)
 518 {
 519         ssize_t ret;
 520
 521         if (!handler->daemonize ||
 522             handler->state_socket_pair[1] < 0 ||
 523             state == STARTING)
 524                 return 0;
 525
 526         /* Close read end of the socket pair. */
 527         close_prot_errno_disarm(handler->state_socket_pair[0]);
 528
 529 again:
 530         ret = lxc_abstract_unix_send_credential(handler->state_socket_pair[1],
 531                                                 &(int){state}, sizeof(int));
 532         if (ret < 0) {
 533                 SYSERROR("Failed to send state to %d", handler->state_socket_pair[1]);
 534
 535                 if (errno == EINTR)
 536                         goto again;
 537
 538                 return -1;
 539         }
 540
 541         if (ret != sizeof(int))
 542                 return log_error(-1, "Message too long : %d", handler->state_socket_pair[1]);
 543
 544         TRACE("Sent container state \"%s\" to %d", lxc_state2str(state),
 545               handler->state_socket_pair[1]);
 546
 547         /* Close write end of the socket pair. */
 548         close_prot_errno_disarm(handler->state_socket_pair[1]);
 549
 550         return 0;
 551 }
 552
 553 int lxc_set_state(const char *name, struct lxc_handler *handler,
 554                   lxc_state_t state)
 555 {
 556         int ret;
 557
 558         ret = lxc_serve_state_socket_pair(name, handler, state);
 559         if (ret < 0)
 560                 return log_error(-1, "Failed to synchronize via anonymous pair of unix sockets");
 561
 562         ret = lxc_serve_state_clients(name, handler, state);
 563         if (ret < 0)
 564                 return -1;
 565
 566         /* This function will try to connect to the legacy lxc-monitord state
 567          * server and only exists for backwards compatibility.
 568          */
 569         lxc_monitor_send_state(name, state, handler->lxcpath);
 570
 571         return 0;
 572 }
 573
 574 int lxc_poll(const char *name, struct lxc_handler *handler)
 575 {
 576         int ret;
 577         struct lxc_terminal *console = &handler->conf->console;
 578         struct lxc_async_descr descr, descr_console;
 579
 580         if (!wants_console(console))
 581                 console = NULL;
 582
 583         ret = lxc_mainloop_open(&descr);
 584         if (ret < 0) {
 585                 ERROR("Failed to create mainloop");
 586                 goto out_sigfd;
 587         }
 588
 589         if (console) {
 590                 ret = lxc_mainloop_open(&descr_console);
 591                 if (ret < 0) {
 592                         ERROR("Failed to create console mainloop");
 593                         goto out_mainloop;
 594                 }
 595         }
 596
 597         ret = lxc_mainloop_add_handler(&descr, handler->sigfd,
 598                                        signal_handler,
 599                                        default_cleanup_handler,
 600                                        handler, "signal_handler");
 601         if (ret < 0) {
 602                 ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
 603                 goto out_mainloop_console;
 604         }
 605
 606         ret = lxc_seccomp_setup_proxy(&handler->conf->seccomp, &descr, handler);
 607         if (ret < 0) {
 608                 ERROR("Failed to setup seccomp proxy");
 609                 goto out_mainloop_console;
 610         }
 611
 612         if (console) {
 613                 ret = lxc_terminal_mainloop_add(&descr, console);
 614                 if (ret < 0) {
 615                         ERROR("Failed to add console handlers to mainloop");
 616                         goto out_mainloop_console;
 617                 }
 618         }
 619
 620         ret = lxc_cmd_mainloop_add(name, &descr, handler);
 621         if (ret < 0) {
 622                 ERROR("Failed to add command handler to mainloop");
 623                 goto out_mainloop_console;
 624         }
 625
 626         TRACE("Mainloop is ready");
 627
 628         ret = lxc_mainloop(&descr, -1);
 629         if (descr.type == LXC_MAINLOOP_EPOLL)
 630                 close_prot_errno_disarm(descr.epfd);
 631         if (ret < 0 || !handler->init_died)
 632                 goto out_mainloop_console;
 633
 634         if (console) {
 635                 ret = lxc_terminal_mainloop_add(&descr_console, console);
 636                 if (ret == 0)
 637                         ret = lxc_mainloop(&descr_console, 0);
 638         }
 639
 640 out_mainloop_console:
 641         if (console) {
 642                 lxc_mainloop_close(&descr_console);
 643                 TRACE("Closed console mainloop");
 644         }
 645
 646 out_mainloop:
 647         lxc_mainloop_close(&descr);
 648         TRACE("Closed mainloop");
 649
 650 out_sigfd:
 651         TRACE("Closed signal file descriptor %d", handler->sigfd);
 652         close_prot_errno_disarm(handler->sigfd);
 653
 654         return ret;
 655 }
 656
 657 void lxc_put_handler(struct lxc_handler *handler)
 658 {
 659         close_prot_errno_disarm(handler->pidfd);
 660         close_prot_errno_disarm(handler->sigfd);
 661         lxc_put_nsfds(handler);
 662         if (handler->conf && handler->conf->reboot == REBOOT_NONE)
 663                 close_prot_errno_disarm(handler->conf->maincmd_fd);
 664         close_prot_errno_disarm(handler->monitor_status_fd);
 665         close_prot_errno_disarm(handler->state_socket_pair[0]);
 666         close_prot_errno_disarm(handler->state_socket_pair[1]);
 667         cgroup_exit(handler->cgroup_ops);
 668         if (handler->conf && handler->conf->reboot == REBOOT_NONE)
 669                 free_disarm(handler);
 670         else
 671                 handler->conf = NULL;
 672 }
 673
 674 struct lxc_handler *lxc_init_handler(struct lxc_handler *old,
 675                                      const char *name, struct lxc_conf *conf,
 676                                      const char *lxcpath, bool daemonize)
 677 {
 678         int nr_keep_fds = 0;
 679         int ret;
 680         struct lxc_handler *handler;
 681
 682         if (!old)
 683                 handler = zalloc(sizeof(*handler));
 684         else
 685                 handler = old;
 686         if (!handler)
 687                 return NULL;
 688
 689         /* Note that am_guest_unpriv() checks the effective uid. We
 690          * probably don't care if we are real root only if we are running
 691          * as root so this should be fine.
 692          */
 693         handler->am_root = !am_guest_unpriv();
 694         handler->conf = conf;
 695         handler->lxcpath = lxcpath;
 696         handler->init_died = false;
 697         handler->data_sock[0] = -EBADF;
 698         handler->data_sock[1] = -EBADF;
 699         handler->monitor_status_fd = -EBADF;
 700         handler->pidfd = -EBADF;
 701         handler->sigfd = -EBADF;
 702         handler->state_socket_pair[0] = -EBADF;
 703         handler->state_socket_pair[1] = -EBADF;
 704         if (handler->conf->reboot == REBOOT_NONE)
 705                 INIT_LIST_HEAD(&handler->conf->state_clients);
 706
 707         for (lxc_namespace_t idx = 0; idx < LXC_NS_MAX; idx++) {
 708                 handler->nsfd[idx] = -EBADF;
 709
 710                 if (handler->conf->reboot == REBOOT_NONE)
 711                         continue;
 712
 713                 handler->nsfd_paths[idx][0] = '\0';
 714                 handler->hook_argv[idx] = NULL;
 715
 716                 if (handler->hook_argc != 0)
 717                         handler->hook_argc = 0;
 718         }
 719
 720         handler->name = name;
 721         if (daemonize)
 722                 handler->transient_pid = lxc_raw_getpid();
 723         else
 724                 handler->transient_pid = -1;
 725
 726         if (daemonize && handler->conf->reboot == REBOOT_NONE) {
 727                 /* Create socketpair() to synchronize on daemonized startup.
 728                  * When the container reboots we don't need to synchronize
 729                  * again currently so don't open another socketpair().
 730                  */
 731                 ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
 732                                  handler->state_socket_pair);
 733                 if (ret < 0) {
 734                         ERROR("Failed to create anonymous pair of unix sockets");
 735                         goto on_error;
 736                 }
 737
 738                 TRACE("Created anonymous pair {%d,%d} of unix sockets",
 739                       handler->state_socket_pair[0],
 740                       handler->state_socket_pair[1]);
 741                 handler->keep_fds[nr_keep_fds++] = handler->state_socket_pair[0];
 742                 handler->keep_fds[nr_keep_fds++] = handler->state_socket_pair[1];
 743         }
 744
 745         if (handler->conf->reboot == REBOOT_NONE) {
 746                 handler->conf->maincmd_fd = lxc_server_init(name, lxcpath, "command");
 747                 if (handler->conf->maincmd_fd < 0) {
 748                         ERROR("Failed to set up command socket");
 749                         goto on_error;
 750                 }
 751                 handler->keep_fds[nr_keep_fds++] = handler->conf->maincmd_fd;
 752         }
 753
 754         TRACE("Unix domain socket %d for command server is ready",
 755               handler->conf->maincmd_fd);
 756
 757         return handler;
 758
 759 on_error:
 760         lxc_put_handler(handler);
 761
 762         return NULL;
 763 }
 764
 765 int lxc_init(const char *name, struct lxc_handler *handler)
 766 {
 767         __do_close int status_fd = -EBADF;
 768         int ret;
 769         const char *loglevel;
 770         struct lxc_conf *conf = handler->conf;
 771
 772         handler->monitor_pid = lxc_raw_getpid();
 773         status_fd = open("/proc/self/status", O_RDONLY | O_CLOEXEC);
 774         if (status_fd < 0)
 775                 return log_error_errno(-1, errno, "Failed to open monitor status fd");
 776
 777         handler->lsm_ops = lsm_init_static();
 778         TRACE("Initialized LSM");
 779
 780         /* Begin by setting the state to STARTING. */
 781         ret = lxc_set_state(name, handler, STARTING);
 782         if (ret < 0)
 783                 return log_error(-1, "Failed to set state to \"%s\"", lxc_state2str(STARTING));
 784         TRACE("Set container state to \"STARTING\"");
 785
 786         /* Start of environment variable setup for hooks. */
 787         ret = setenv("LXC_NAME", name, 1);
 788         if (ret < 0)
 789                 SYSERROR("Failed to set environment variable: LXC_NAME=%s", name);
 790
 791         if (conf->rcfile) {
 792                 ret = setenv("LXC_CONFIG_FILE", conf->rcfile, 1);
 793                 if (ret < 0)
 794                         SYSERROR("Failed to set environment variable: LXC_CONFIG_FILE=%s", conf->rcfile);
 795         }
 796
 797         if (conf->rootfs.mount) {
 798                 ret = setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1);
 799                 if (ret < 0)
 800                         SYSERROR("Failed to set environment variable: LXC_ROOTFS_MOUNT=%s", conf->rootfs.mount);
 801         }
 802
 803         if (conf->rootfs.path) {
 804                 ret = setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1);
 805                 if (ret < 0)
 806                         SYSERROR("Failed to set environment variable: LXC_ROOTFS_PATH=%s", conf->rootfs.path);
 807         }
 808
 809         if (conf->console.path) {
 810                 ret = setenv("LXC_CONSOLE", conf->console.path, 1);
 811                 if (ret < 0)
 812                         SYSERROR("Failed to set environment variable: LXC_CONSOLE=%s", conf->console.path);
 813         }
 814
 815         if (conf->console.log_path) {
 816                 ret = setenv("LXC_CONSOLE_LOGPATH", conf->console.log_path, 1);
 817                 if (ret < 0)
 818                         SYSERROR("Failed to set environment variable: LXC_CONSOLE_LOGPATH=%s", conf->console.log_path);
 819         }
 820
 821         if (cgns_supported()) {
 822                 ret = setenv("LXC_CGNS_AWARE", "1", 1);
 823                 if (ret < 0)
 824                         SYSERROR("Failed to set environment variable LXC_CGNS_AWARE=1");
 825         }
 826
 827         loglevel = lxc_log_priority_to_string(lxc_log_get_level());
 828         ret = setenv("LXC_LOG_LEVEL", loglevel, 1);
 829         if (ret < 0)
 830                 SYSERROR("Set environment variable LXC_LOG_LEVEL=%s", loglevel);
 831
 832         if (conf->hooks_version == 0)
 833                 ret = setenv("LXC_HOOK_VERSION", "0", 1);
 834         else
 835                 ret = setenv("LXC_HOOK_VERSION", "1", 1);
 836         if (ret < 0)
 837                 SYSERROR("Failed to set environment variable LXC_HOOK_VERSION=%u", conf->hooks_version);
 838         /* End of environment variable setup for hooks. */
 839
 840         TRACE("Set environment variables");
 841
 842         ret = run_lxc_hooks(name, "pre-start", conf, NULL);
 843         if (ret < 0)
 844                 return log_error(-1, "Failed to run lxc.hook.pre-start for container \"%s\"", name);
 845         TRACE("Ran pre-start hooks");
 846
 847         ret = lxc_terminal_parent(conf);
 848         if (ret < 0)
 849                 return log_error(-1, "Failed to allocate terminal");
 850
 851         /* The signal fd has to be created before forking otherwise if the child
 852          * process exits before we setup the signal fd, the event will be lost
 853          * and the command will be stuck.
 854          */
 855         handler->sigfd = setup_signal_fd(&handler->oldmask);
 856         if (handler->sigfd < 0)
 857                 return log_error(-1, "Failed to setup SIGCHLD fd handler.");
 858         TRACE("Set up signal fd");
 859
 860         handler->cgroup_ops = cgroup_init(handler->conf);
 861         if (!handler->cgroup_ops) {
 862                 ERROR("Failed to initialize cgroup driver");
 863                 goto out_restore_sigmask;
 864         }
 865         TRACE("Initialized cgroup driver");
 866
 867         ret = lxc_read_seccomp_config(conf);
 868         if (ret < 0) {
 869                 ERROR("Failed to read seccomp policy");
 870                 goto out_restore_sigmask;
 871         }
 872         TRACE("Read seccomp policy");
 873
 874         ret = handler->lsm_ops->prepare(handler->lsm_ops, conf, handler->lxcpath);
 875         if (ret < 0) {
 876                 ERROR("Failed to initialize LSM");
 877                 goto out_restore_sigmask;
 878         }
 879         TRACE("Initialized LSM");
 880
 881         INFO("Container \"%s\" is initialized", name);
 882         handler->monitor_status_fd = move_fd(status_fd);
 883         return 0;
 884
 885 out_restore_sigmask:
 886         (void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
 887
 888         return -1;
 889 }
 890
 891 void lxc_expose_namespace_environment(const struct lxc_handler *handler)
 892 {
 893         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 894                 int ret;
 895                 const char *fd_path;
 896
 897                 if (handler->nsfd[i] < 0)
 898                         continue;
 899
 900                 fd_path = handler->nsfd_paths[i] + strcspn(handler->nsfd_paths[i], "/");
 901                 ret = setenv(ns_info[i].env_name, fd_path, 1);
 902                 if (ret < 0)
 903                         SYSERROR("Failed to set environment variable %s=%s",
 904                                  ns_info[i].env_name, fd_path);
 905                 else
 906                         TRACE("Set environment variable %s=%s",
 907                               ns_info[i].env_name, fd_path);
 908         }
 909 }
 910
 911 void lxc_end(struct lxc_handler *handler)
 912 {
 913         int ret;
 914         const char *name = handler->name;
 915         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
 916         struct lxc_state_client *client, *nclient;
 917
 918         /* The STOPPING state is there for future cleanup code which can take
 919          * awhile.
 920          */
 921         lxc_set_state(name, handler, STOPPING);
 922
 923         /* Passing information to hooks via environment variables. */
 924         if (handler->conf->hooks_version > 0)
 925                 lxc_expose_namespace_environment(handler);
 926
 927         if (handler->conf->reboot > REBOOT_NONE) {
 928                 ret = setenv("LXC_TARGET", "reboot", 1);
 929                 if (ret < 0)
 930                         SYSERROR("Failed to set environment variable: LXC_TARGET=reboot");
 931         }
 932
 933         if (handler->conf->reboot == REBOOT_NONE) {
 934                 ret = setenv("LXC_TARGET", "stop", 1);
 935                 if (ret < 0)
 936                         SYSERROR("Failed to set environment variable: LXC_TARGET=stop");
 937         }
 938
 939         if (handler->conf->hooks_version == 0)
 940                 ret = run_lxc_hooks(name, "stop", handler->conf, handler->hook_argv);
 941         else
 942                 ret = run_lxc_hooks(name, "stop", handler->conf, NULL);
 943         if (ret < 0)
 944                 ERROR("Failed to run \"lxc.hook.stop\" hook");
 945
 946         handler->lsm_ops->cleanup(handler->lsm_ops, handler->conf, handler->lxcpath);
 947
 948         if (cgroup_ops) {
 949                 cgroup_ops->payload_destroy(cgroup_ops, handler);
 950                 cgroup_ops->monitor_destroy(cgroup_ops, handler);
 951         }
 952
 953         put_lxc_rootfs(&handler->conf->rootfs, true);
 954
 955         if (handler->conf->reboot == REBOOT_NONE) {
 956                 /* For all new state clients simply close the command socket.
 957                  * This will inform all state clients that the container is
 958                  * STOPPED and also prevents a race between a open()/close() on
 959                  * the command socket causing a new process to get ECONNREFUSED
 960                  * because we haven't yet closed the command socket.
 961                  */
 962                 close_prot_errno_disarm(handler->conf->maincmd_fd);
 963                 TRACE("Closed command socket");
 964
 965                 /* This function will try to connect to the legacy lxc-monitord
 966                  * state server and only exists for backwards compatibility.
 967                  */
 968                 lxc_monitor_send_state(name, STOPPED, handler->lxcpath);
 969
 970                 /* The command socket is closed so no one can acces the command
 971                  * socket anymore so there's no need to lock it.
 972                  */
 973                 handler->state = STOPPED;
 974                 TRACE("Set container state to \"STOPPED\"");
 975         } else {
 976                 lxc_set_state(name, handler, STOPPED);
 977                 TRACE("Set container state to \"STOPPED\"");
 978         }
 979
 980         /* Avoid lingering namespace references. */
 981         lxc_put_nsfds(handler);
 982
 983         ret = run_lxc_hooks(name, "post-stop", handler->conf, NULL);
 984         if (ret < 0) {
 985                 ERROR("Failed to run lxc.hook.post-stop for container \"%s\"", name);
 986                 if (handler->conf->reboot > REBOOT_NONE) {
 987                         WARN("Container will be stopped instead of rebooted");
 988                         handler->conf->reboot = REBOOT_NONE;
 989
 990                         ret = setenv("LXC_TARGET", "stop", 1);
 991                         if (ret < 0)
 992                                 WARN("Failed to set environment variable: LXC_TARGET=stop");
 993                 }
 994         }
 995
 996         /* Reset mask set by setup_signal_fd. */
 997         ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
 998         if (ret < 0)
 999                 SYSWARN("Failed to restore signal mask");
1000
1001         lxc_terminal_delete(&handler->conf->console);
1002         lxc_delete_tty(&handler->conf->ttys);
1003         close_prot_errno_disarm(handler->conf->devpts_fd);
1004
1005         /* The command socket is now closed, no more state clients can register
1006          * themselves from now on. So free the list of state clients.
1007          */
1008         list_for_each_entry_safe(client, nclient, &handler->conf->state_clients, head) {
1009                 /* Keep state clients that want to be notified about reboots. */
1010                 if ((handler->conf->reboot > REBOOT_NONE) &&
1011                     (client->states[RUNNING] == 2))
1012                         continue;
1013
1014                 /* close state client socket */
1015                 list_del(&client->head);
1016                 close(client->clientfd);
1017                 free(client);
1018         }
1019
1020         if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
1021                 lxc_destroy_container_on_signal(handler, name);
1022
1023         lxc_put_handler(handler);
1024 }
1025
1026 void lxc_abort(struct lxc_handler *handler)
1027 {
1028         int ret = 0;
1029         int status;
1030
1031         lxc_set_state(handler->name, handler, ABORTING);
1032
1033         if (handler->pidfd >= 0) {
1034                 ret = lxc_raw_pidfd_send_signal(handler->pidfd, SIGKILL, NULL, 0);
1035                 if (ret)
1036                         SYSWARN("Failed to send SIGKILL via pidfd %d for process %d",
1037                                 handler->pidfd, handler->pid);
1038         }
1039
1040         if ((!ret || errno != ESRCH) && handler->pid > 0)
1041                 if (kill(handler->pid, SIGKILL))
1042                         SYSWARN("Failed to send SIGKILL to %d", handler->pid);
1043
1044         do {
1045                 ret = waitpid(-1, &status, 0);
1046         } while (ret > 0);
1047 }
1048
1049 static int do_start(void *data)
1050 {
1051         struct lxc_handler *handler = data;
1052         __lxc_unused __do_close int data_sock0 = handler->data_sock[0],
1053                                     data_sock1 = handler->data_sock[1];
1054         __do_close int devnull_fd = -EBADF, status_fd = -EBADF;
1055         int ret;
1056         uid_t new_uid;
1057         gid_t new_gid;
1058         uid_t nsuid = 0;
1059         gid_t nsgid = 0;
1060
1061         lxc_sync_fini_parent(handler);
1062
1063         if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
1064                 ERROR("Failed to receive status file descriptor from parent process");
1065                 goto out_warn_father;
1066         }
1067
1068         /* This prctl must be before the synchro, so if the parent dies before
1069          * we set the parent death signal, we will detect its death with the
1070          * synchro right after, otherwise we have a window where the parent can
1071          * exit before we set the pdeath signal leading to a unsupervized
1072          * container.
1073          */
1074         ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
1075         if (ret < 0) {
1076                 SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
1077                 goto out_warn_father;
1078         }
1079
1080         ret = lxc_ambient_caps_up();
1081         if (ret < 0) {
1082                 ERROR("Failed to raise ambient capabilities");
1083                 goto out_warn_father;
1084         }
1085
1086         ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
1087         if (ret < 0) {
1088                 SYSERROR("Failed to set signal mask");
1089                 goto out_warn_father;
1090         }
1091
1092         if (!lxc_sync_wait_parent(handler, START_SYNC_STARTUP))
1093                 goto out_warn_father;
1094
1095         /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
1096          * https://github.com/lxc/lxd/issues/1978.
1097          */
1098         if (handler->ns_unshare_flags & CLONE_NEWNET) {
1099                 ret = unshare(CLONE_NEWNET);
1100                 if (ret < 0) {
1101                         SYSERROR("Failed to unshare CLONE_NEWNET");
1102                         goto out_warn_father;
1103                 }
1104                 INFO("Unshared CLONE_NEWNET");
1105         }
1106
1107         /* If we are in a new user namespace, become root there to have
1108          * privilege over our namespace.
1109          */
1110         if (!list_empty(&handler->conf->id_map)) {
1111                 if (!handler->conf->root_nsuid_map)
1112                         nsuid = handler->conf->init_uid;
1113
1114                 if (!handler->conf->root_nsgid_map)
1115                         nsgid = handler->conf->init_gid;
1116
1117                 /* Drop groups only after we switched to a valid gid in the new
1118                  * user namespace.
1119                  */
1120                 if (!lxc_drop_groups() &&
1121                     (handler->am_root || errno != EPERM))
1122                         goto out_warn_father;
1123
1124                 if (!lxc_switch_uid_gid(nsuid, nsgid))
1125                         goto out_warn_father;
1126
1127                 ret = prctl(PR_SET_DUMPABLE, prctl_arg(1), prctl_arg(0),
1128                             prctl_arg(0), prctl_arg(0));
1129                 if (ret < 0)
1130                         goto out_warn_father;
1131
1132                 /* set{g,u}id() clears deathsignal */
1133                 ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
1134                 if (ret < 0) {
1135                         SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
1136                         goto out_warn_father;
1137                 }
1138         }
1139
1140         ret = access(handler->lxcpath, X_OK);
1141         if (ret != 0) {
1142                 print_top_failing_dir(handler->lxcpath);
1143                 goto out_warn_father;
1144         }
1145
1146         /* In order to checkpoint restore, we need to have everything in the
1147          * same mount namespace. However, some containers may not have a
1148          * reasonable /dev (in particular, they may not have /dev/null), so we
1149          * can't set init's std fds to /dev/null by opening it from inside the
1150          * container.
1151          *
1152          * If that's the case, fall back to using the host's /dev/null. This
1153          * means that migration won't work, but at least we won't spew output
1154          * where it isn't wanted.
1155          */
1156         if (handler->daemonize && !handler->conf->autodev) {
1157                 char path[PATH_MAX];
1158
1159                 ret = strnprintf(path, sizeof(path), "%s/dev/null",
1160                                  handler->conf->rootfs.mount);
1161                 if (ret < 0)
1162                         goto out_warn_father;
1163
1164                 ret = access(path, F_OK);
1165                 if (ret != 0) {
1166                         devnull_fd = open_devnull();
1167
1168                         if (devnull_fd < 0)
1169                                 goto out_warn_father;
1170                         WARN("Using /dev/null from the host for container init's standard file descriptors. Migration will not work");
1171                 }
1172         }
1173
1174         /*
1175          * Tell the parent task it can begin to configure the container and wait
1176          * for it to finish.
1177          */
1178         if (!lxc_sync_wake_parent(handler, START_SYNC_CONFIGURE))
1179                 goto out_error;
1180
1181         /* Unshare cgroup namespace after we have setup our cgroups. If we do it
1182          * earlier we end up with a wrong view of /proc/self/cgroup. For
1183          * example, assume we unshare(CLONE_NEWCGROUP) first, and then create
1184          * the cgroup for the container, say /sys/fs/cgroup/cpuset/lxc/c, then
1185          * /proc/self/cgroup would show us:
1186          *
1187          *      8:cpuset:/lxc/c
1188          *
1189          * whereas it should actually show
1190          *
1191          *      8:cpuset:/
1192          */
1193         if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
1194                 ret = unshare(CLONE_NEWCGROUP);
1195                 if (ret < 0) {
1196                         if (errno != EINVAL) {
1197                                 SYSERROR("Failed to unshare CLONE_NEWCGROUP");
1198                                 goto out_warn_father;
1199                         }
1200
1201                         handler->ns_clone_flags &= ~CLONE_NEWCGROUP;
1202                         SYSINFO("Kernel does not support CLONE_NEWCGROUP");
1203                 } else {
1204                         INFO("Unshared CLONE_NEWCGROUP");
1205                 }
1206         }
1207
1208         if (handler->ns_unshare_flags & CLONE_NEWTIME) {
1209                 ret = unshare(CLONE_NEWTIME);
1210                 if (ret < 0) {
1211                         if (errno != EINVAL) {
1212                                 SYSERROR("Failed to unshare CLONE_NEWTIME");
1213                                 goto out_warn_father;
1214                         }
1215
1216                         handler->ns_clone_flags &= ~CLONE_NEWTIME;
1217                         SYSINFO("Kernel does not support CLONE_NEWTIME");
1218                 } else {
1219                         __do_close int timens_fd = -EBADF;
1220
1221                         INFO("Unshared CLONE_NEWTIME");
1222
1223                         if (handler->conf->timens.s_boot)
1224                                 ret = timens_offset_write(CLOCK_BOOTTIME, handler->conf->timens.s_boot, 0);
1225                         else if (handler->conf->timens.ns_boot)
1226                                 ret = timens_offset_write(CLOCK_BOOTTIME, 0, handler->conf->timens.ns_boot);
1227                         if (ret) {
1228                                 SYSERROR("Failed to write CLONE_BOOTTIME offset");
1229                                 goto out_warn_father;
1230                         }
1231                         TRACE("Wrote CLOCK_BOOTTIME offset");
1232
1233                         if (handler->conf->timens.s_monotonic)
1234                                 ret = timens_offset_write(CLOCK_MONOTONIC, handler->conf->timens.s_monotonic, 0);
1235                         else if (handler->conf->timens.ns_monotonic)
1236                                 ret = timens_offset_write(CLOCK_MONOTONIC, 0, handler->conf->timens.ns_monotonic);
1237                         if (ret) {
1238                                 SYSERROR("Failed to write CLONE_MONOTONIC offset");
1239                                 goto out_warn_father;
1240                         }
1241                         TRACE("Wrote CLOCK_MONOTONIC offset");
1242
1243                         timens_fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
1244                         if (timens_fd < 0) {
1245                                 SYSERROR("Failed to open \"/proc/self/ns/time_for_children\"");
1246                                 goto out_warn_father;
1247                         }
1248
1249                         ret = setns(timens_fd, CLONE_NEWTIME);
1250                         if (ret) {
1251                                 SYSERROR("Failed to setns(%d(\"/proc/self/ns/time_for_children\"))", timens_fd);
1252                                 goto out_warn_father;
1253                         }
1254                 }
1255         }
1256
1257         /*
1258          * Add the requested environment variables to the current environment
1259          * to allow them to be used by the various hooks, such as the start
1260          * hook below.
1261          */
1262         ret = lxc_set_environment(handler->conf);
1263         if (ret < 0)
1264                 goto out_warn_father;
1265
1266         if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
1267                 goto out_warn_father;
1268
1269         /* Setup the container, ip, names, utsname, ... */
1270         ret = lxc_setup(handler);
1271         if (ret < 0) {
1272                 ERROR("Failed to setup container \"%s\"", handler->name);
1273                 goto out_warn_father;
1274         }
1275
1276         /* Set the label to change to when we exec(2) the container's init. */
1277         ret = handler->lsm_ops->process_label_set(handler->lsm_ops, NULL, handler->conf, true);
1278         if (ret < 0)
1279                 goto out_warn_father;
1280
1281         /* Set PR_SET_NO_NEW_PRIVS after we changed the lsm label. If we do it
1282          * before we aren't allowed anymore.
1283          */
1284         if (handler->conf->no_new_privs) {
1285                 ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
1286                             prctl_arg(0), prctl_arg(0));
1287                 if (ret < 0) {
1288                         SYSERROR("Could not set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
1289                         goto out_warn_father;
1290                 }
1291                 DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges");
1292         }
1293
1294         /* If we mounted a temporary proc, then unmount it now. */
1295         tmp_proc_unmount(handler->conf);
1296
1297         ret = lxc_seccomp_load(handler->conf);
1298         if (ret < 0)
1299                 goto out_warn_father;
1300
1301         ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
1302         if (ret < 0) {
1303                 ERROR("Failed to run lxc.hook.start for container \"%s\"",
1304                       handler->name);
1305                 goto out_warn_father;
1306         }
1307
1308         close_prot_errno_disarm(handler->sigfd);
1309
1310         if (handler->conf->console.pty < 0 && handler->daemonize) {
1311                 if (devnull_fd < 0) {
1312                         devnull_fd = open_devnull();
1313                         if (devnull_fd < 0)
1314                                 goto out_warn_father;
1315                 }
1316
1317                 ret = set_stdfds(devnull_fd);
1318                 if (ret < 0) {
1319                         ERROR("Failed to redirect std{in,out,err} to \"/dev/null\"");
1320                         goto out_warn_father;
1321                 }
1322         }
1323
1324         close_prot_errno_disarm(devnull_fd);
1325
1326         setsid();
1327
1328         if (handler->conf->init_cwd) {
1329                 ret = chdir(handler->conf->init_cwd);
1330                 if (ret < 0) {
1331                         SYSERROR("Could not change directory to \"%s\"",
1332                                  handler->conf->init_cwd);
1333                         goto out_warn_father;
1334                 }
1335         }
1336
1337         if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
1338                 goto out_warn_father;
1339
1340         ret = lxc_sync_fds_child(handler);
1341         if (ret < 0) {
1342                 SYSERROR("Failed to sync file descriptors with parent");
1343                 goto out_warn_father;
1344         }
1345
1346         if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
1347                 goto out_warn_father;
1348
1349         /* Reset the environment variables the user requested in a clear
1350          * environment.
1351          */
1352         ret = clearenv();
1353         /* Don't error out though. */
1354         if (ret < 0)
1355                 SYSERROR("Failed to clear environment.");
1356
1357         ret = lxc_set_environment(handler->conf);
1358         if (ret < 0)
1359                 goto out_warn_father;
1360
1361         ret = putenv("container=lxc");
1362         if (ret < 0) {
1363                 SYSERROR("Failed to set environment variable: container=lxc");
1364                 goto out_warn_father;
1365         }
1366
1367         if (handler->conf->ttys.tty_names) {
1368                 ret = putenv(handler->conf->ttys.tty_names);
1369                 if (ret < 0) {
1370                         SYSERROR("Failed to set environment variable for container ptys");
1371                         goto out_warn_father;
1372                 }
1373         }
1374
1375         /* The container has been setup. We can now switch to an unprivileged
1376          * uid/gid.
1377          */
1378         new_uid = handler->conf->init_uid;
1379         new_gid = handler->conf->init_gid;
1380
1381         /* Avoid unnecessary syscalls. */
1382         if (new_uid == nsuid)
1383                 new_uid = LXC_INVALID_UID;
1384
1385         if (new_gid == nsgid)
1386                 new_gid = LXC_INVALID_GID;
1387
1388         /* Make sure that the processes STDIO is correctly owned by the user that we are switching to */
1389         ret = fix_stdio_permissions(new_uid);
1390         if (ret)
1391                 WARN("Failed to ajust stdio permissions");
1392
1393         /* If we are in a new user namespace we already dropped all groups when
1394          * we switched to root in the new user namespace further above. Only
1395          * drop groups if we can, so ensure that we have necessary privilege.
1396          */
1397         if (list_empty(&handler->conf->id_map)) {
1398                 #if HAVE_LIBCAP
1399                 if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
1400                 #endif
1401                 {
1402                         if (handler->conf->init_groups.size > 0) {
1403                                 if (!lxc_setgroups(handler->conf->init_groups.list,
1404                                                    handler->conf->init_groups.size))
1405                                         goto out_warn_father;
1406                         } else {
1407                                 if (!lxc_drop_groups())
1408                                         goto out_warn_father;
1409                         }
1410                 }
1411         }
1412
1413         if (!lxc_switch_uid_gid(new_uid, new_gid))
1414                 goto out_warn_father;
1415
1416         ret = lxc_ambient_caps_down();
1417         if (ret < 0) {
1418                 ERROR("Failed to clear ambient capabilities");
1419                 goto out_warn_father;
1420         }
1421
1422         if (handler->conf->monitor_signal_pdeath != SIGKILL) {
1423                 ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath,
1424                                            handler->monitor_pid, status_fd);
1425                 if (ret < 0) {
1426                         SYSERROR("Failed to set PR_SET_PDEATHSIG to %d",
1427                                  handler->conf->monitor_signal_pdeath);
1428                         goto out_warn_father;
1429                 }
1430         }
1431
1432         /*
1433          * After this call, we are in error because this ops should not return
1434          * as it execs.
1435          */
1436         handler->ops->start(handler, handler->data);
1437
1438 out_warn_father:
1439         /*
1440          * We want the parent to know something went wrong, so we return a
1441          * special error code.
1442          */
1443         lxc_sync_wake_parent(handler, SYNC_ERROR);
1444
1445 out_error:
1446         return -1;
1447 }
1448
1449 int resolve_clone_flags(struct lxc_handler *handler)
1450 {
1451         int i;
1452         struct lxc_conf *conf = handler->conf;
1453         bool wants_timens = conf->timens.s_boot || conf->timens.ns_boot ||
1454                             conf->timens.s_monotonic || conf->timens.ns_monotonic;
1455
1456         for (i = 0; i < LXC_NS_MAX; i++) {
1457                 if (conf->ns_keep) {
1458                         if (!(conf->ns_keep & ns_info[i].clone_flag))
1459                                 handler->ns_clone_flags |= ns_info[i].clone_flag;
1460                 } else if (conf->ns_clone) {
1461                         if ((conf->ns_clone & ns_info[i].clone_flag))
1462                                 handler->ns_clone_flags |= ns_info[i].clone_flag;
1463                 } else {
1464                         if (i == LXC_NS_USER && list_empty(&handler->conf->id_map))
1465                                 continue;
1466
1467                         if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
1468                                 continue;
1469
1470                         if (i == LXC_NS_CGROUP && !cgns_supported())
1471                                 continue;
1472
1473                         if (i == LXC_NS_TIME && !wants_timens)
1474                                 continue;
1475
1476                         handler->ns_clone_flags |= ns_info[i].clone_flag;
1477                 }
1478
1479                 if (!conf->ns_share[i])
1480                         continue;
1481
1482                 handler->ns_clone_flags &= ~ns_info[i].clone_flag;
1483                 TRACE("Sharing %s namespace", ns_info[i].proc_name);
1484         }
1485
1486         if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
1487                 return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
1488
1489         /* Deal with namespaces that are unshared. */
1490         if (handler->ns_clone_flags & CLONE_NEWTIME)
1491                 handler->ns_unshare_flags |= CLONE_NEWTIME;
1492
1493         if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
1494                 handler->ns_unshare_flags |= CLONE_NEWCGROUP;
1495
1496         if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
1497             (CLONE_NEWNET | CLONE_NEWUSER))
1498                 handler->ns_unshare_flags |= CLONE_NEWNET;
1499
1500         /* Deal with namespaces that are spawned. */
1501         handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
1502
1503         handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
1504
1505         return 0;
1506 }
1507
1508 /* Note that this function is used with clone(CLONE_VM). Some glibc versions
1509  * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD.
1510  * But since the memory between parent and child is shared on CLONE_VM this
1511  * would invalidate the getpid() cache that glibc used to maintain and so
1512  * getpid() in the child would return the parent's pid. This is all fixed in
1513  * newer glibc versions where the getpid() cache is removed and the pid/tid is
1514  * not reset anymore.
1515  * However, if for whatever reason you - dear committer - somehow need to get the
1516  * pid of the placeholder intermediate process for do_share_ns() you need to
1517  * call lxc_raw_getpid(). The next lxc_raw_clone() call does not employ
1518  * CLONE_VM and will be fine.
1519  */
1520 static inline int do_share_ns(void *arg)
1521 {
1522         int i, flags, ret;
1523         struct lxc_handler *handler = arg;
1524
1525         for (i = 0; i < LXC_NS_MAX; i++) {
1526                 if (handler->nsfd[i] < 0)
1527                         continue;
1528
1529                 ret = setns(handler->nsfd[i], 0);
1530                 if (ret < 0) {
1531                         /*
1532                          * Note that joining a user and/or mount namespace
1533                          * requires the process is not multithreaded otherwise
1534                          * setns() will fail here.
1535                          */
1536                         SYSERROR("Failed to inherit %s namespace",
1537                                  ns_info[i].proc_name);
1538                         return -1;
1539                 }
1540
1541                 DEBUG("Inherited %s namespace", ns_info[i].proc_name);
1542         }
1543
1544         flags = handler->ns_on_clone_flags;
1545         flags |= CLONE_PARENT;
1546         handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
1547                                         &handler->pidfd);
1548         if (handler->pid < 0)
1549                 return -1;
1550
1551         return 0;
1552 }
1553
1554 static int core_scheduling(struct lxc_handler *handler)
1555 {
1556         struct lxc_conf *conf = handler->conf;
1557         int ret;
1558
1559         if (!conf->sched_core)
1560                 return log_trace(0, "No new core scheduling domain requested");
1561
1562         if (!(handler->ns_clone_flags & CLONE_NEWPID))
1563                 return syserror_set(-EINVAL, "Core scheduling currently requires a separate pid namespace");
1564
1565         ret = core_scheduling_cookie_create_threadgroup(handler->pid);
1566         if (ret < 0) {
1567                 if (ret == -ENODEV) {
1568                         INFO("The kernel doesn't support or doesn't use simultaneous multithreading (SMT)");
1569                         conf->sched_core = false;
1570                         return 0;
1571                 }
1572                 if (ret == -EINVAL)
1573                         return syserror("The kernel does not support core scheduling");
1574
1575                 return syserror("Failed to create new core scheduling domain");
1576         }
1577
1578         ret = core_scheduling_cookie_get(handler->pid, &conf->sched_core_cookie);
1579         if (ret || !core_scheduling_cookie_valid(conf->sched_core_cookie))
1580                 return syserror("Failed to retrieve core scheduling domain cookie");
1581
1582         TRACE("Created new core scheduling domain with cookie %llu",
1583               (llu)conf->sched_core_cookie);
1584
1585         return 0;
1586 }
1587
1588 static bool inherits_namespaces(const struct lxc_handler *handler)
1589 {
1590         struct lxc_conf *conf = handler->conf;
1591
1592         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
1593                 if (conf->ns_share[i])
1594                         return true;
1595         }
1596
1597         return false;
1598 }
1599
1600 static inline void resolve_cgroup_clone_flags(struct lxc_handler *handler)
1601 {
1602         handler->clone_flags            &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
1603         handler->ns_on_clone_flags      &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
1604         handler->ns_unshare_flags       |= CLONE_NEWCGROUP;
1605 }
1606
1607 /* lxc_spawn() performs crucial setup tasks and clone()s the new process which
1608  * exec()s the requested container binary.
1609  * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
1610  * right here should be double checked if they'd pose a security risk. (For
1611  * example, any {u}mount() operations performed here will be reflected on the
1612  * host!)
1613  */
1614 static int lxc_spawn(struct lxc_handler *handler)
1615 {
1616         __do_close int data_sock0 = -EBADF, data_sock1 = -EBADF;
1617         int i, ret;
1618         char pidstr[20];
1619         bool wants_to_map_ids;
1620         struct list_head *id_map;
1621         const char *name = handler->name;
1622         struct lxc_conf *conf = handler->conf;
1623         struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
1624
1625         id_map = &conf->id_map;
1626         wants_to_map_ids = !list_empty(id_map);
1627
1628         if (!lxc_sync_init(handler))
1629                 return -1;
1630
1631         ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
1632                          handler->data_sock);
1633         if (ret < 0)
1634                 goto out_sync_fini;
1635         data_sock0 = handler->data_sock[0];
1636         data_sock1 = handler->data_sock[1];
1637
1638         if (handler->ns_clone_flags & CLONE_NEWNET) {
1639                 ret = lxc_find_gateway_addresses(handler);
1640                 if (ret) {
1641                         ERROR("Failed to find gateway addresses");
1642                         goto out_sync_fini;
1643                 }
1644         }
1645
1646         if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
1647                 ERROR("Failed creating cgroups");
1648                 goto out_delete_net;
1649         }
1650
1651         /* Create a process in a new set of namespaces. */
1652         if (inherits_namespaces(handler)) {
1653                 pid_t attacher_pid;
1654
1655                 resolve_cgroup_clone_flags(handler);
1656                 attacher_pid = lxc_clone(do_share_ns, handler,
1657                                          CLONE_VFORK | CLONE_VM | CLONE_FILES, NULL);
1658                 if (attacher_pid < 0) {
1659                         SYSERROR(LXC_CLONE_ERROR);
1660                         goto out_delete_net;
1661                 }
1662
1663                 ret = wait_for_pid(attacher_pid);
1664                 if (ret < 0) {
1665                         SYSERROR("Intermediate process failed");
1666                         goto out_delete_net;
1667                 }
1668
1669                 if (handler->pid < 0) {
1670                         SYSERROR(LXC_CLONE_ERROR);
1671                         goto out_delete_net;
1672                 }
1673         } else {
1674                 int cgroup_fd = -EBADF;
1675
1676                 struct clone_args clone_args = {
1677                         .flags = handler->clone_flags,
1678                         .pidfd = ptr_to_u64(&handler->pidfd),
1679                         .exit_signal = SIGCHLD,
1680                 };
1681
1682                 if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
1683                         cgroup_fd = cgroup_unified_fd(cgroup_ops);
1684                         if (cgroup_fd >= 0) {
1685                                 handler->clone_flags    |= CLONE_INTO_CGROUP;
1686                                 clone_args.flags        |= CLONE_INTO_CGROUP;
1687                                 clone_args.cgroup       = cgroup_fd;
1688                         }
1689                 }
1690
1691                 /* Try to spawn directly into target cgroup. */
1692                 handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
1693                 if (handler->pid < 0) {
1694                         SYSTRACE("Failed to spawn container directly into target cgroup");
1695
1696                         /* Kernel might simply be too old for CLONE_INTO_CGROUP. */
1697                         resolve_cgroup_clone_flags(handler);
1698                         clone_args.flags = handler->clone_flags;
1699
1700                         handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
1701                 } else if (cgroup_fd >= 0) {
1702                         TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
1703                 }
1704
1705                 /* Kernel might be too old for clone3(). */
1706                 if (handler->pid < 0) {
1707                         SYSTRACE("Failed to spawn container via clone3()");
1708
1709                 /*
1710                  * In contrast to all other architectures arm64 verifies that
1711                  * the argument we use to retrieve the pidfd with is
1712                  * initialized to 0. But we need to be able to initialize it to
1713                  * a negative value such as our customary -EBADF so we can
1714                  * detect whether this kernel supports pidfds. If the syscall
1715                  * returns and the pidfd variable is set to something >= 0 then
1716                  * we know this is a kernel supporting pidfds. But if we can't
1717                  * set it to -EBADF then this won't work since 0 is a valid
1718                  * file descriptor too. And since legacy clone silently ignores
1719                  * unknown flags we are left without any way to detect support
1720                  * for pidfds. So let's special-case arm64 to not fail starting
1721                  * containers.
1722                  */
1723                 #if defined(__aarch64__)
1724                         handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
1725                 #else
1726                         handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
1727                 #endif
1728                 }
1729
1730                 if (handler->pid < 0) {
1731                         SYSERROR(LXC_CLONE_ERROR);
1732                         goto out_delete_net;
1733                 }
1734
1735                 if (handler->pid == 0) {
1736                         (void)do_start(handler);
1737                         _exit(EXIT_FAILURE);
1738                 }
1739         }
1740         if (handler->pidfd < 0)
1741                 handler->clone_flags &= ~CLONE_PIDFD;
1742         TRACE("Cloned child process %d", handler->pid);
1743
1744         ret = core_scheduling(handler);
1745         if (ret < 0)
1746                 goto out_delete_net;
1747
1748         /* Verify that we can actually make use of pidfds. */
1749         if (!lxc_can_use_pidfd(handler->pidfd))
1750                 close_prot_errno_disarm(handler->pidfd);
1751
1752         ret = strnprintf(pidstr, 20, "%d", handler->pid);
1753         if (ret < 0)
1754                 goto out_delete_net;
1755
1756         ret = setenv("LXC_PID", pidstr, 1);
1757         if (ret < 0)
1758                 SYSERROR("Failed to set environment variable: LXC_PID=%s", pidstr);
1759
1760         for (i = 0; i < LXC_NS_MAX; i++)
1761                 if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
1762                         INFO("Cloned %s", ns_info[i].flag_name);
1763
1764         if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags)) {
1765                 ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
1766                 goto out_delete_net;
1767         }
1768
1769         lxc_sync_fini_child(handler);
1770
1771         if (lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->monitor_status_fd, 1, NULL, 0) < 0) {
1772                 ERROR("Failed to send status file descriptor to child process");
1773                 goto out_delete_net;
1774         }
1775         close_prot_errno_disarm(handler->monitor_status_fd);
1776
1777         /* Map the container uids. The container became an invalid userid the
1778          * moment it was cloned with CLONE_NEWUSER. This call doesn't change
1779          * anything immediately, but allows the container to setuid(0) (0 being
1780          * mapped to something else on the host.) later to become a valid uid
1781          * again.
1782          */
1783         if (wants_to_map_ids) {
1784                 if (!handler->conf->ns_share[LXC_NS_USER] &&
1785                     (handler->conf->ns_keep & CLONE_NEWUSER) == 0) {
1786                         ret = lxc_map_ids(id_map, handler->pid);
1787                         if (ret < 0) {
1788                                 ERROR("Failed to set up id mapping.");
1789                                 goto out_delete_net;
1790                         }
1791                 }
1792         }
1793
1794         if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
1795                 ERROR("Failed to setup cgroup limits for container \"%s\"", name);
1796                 goto out_delete_net;
1797         }
1798
1799         if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
1800                 ERROR("Failed to delegate controllers to payload cgroup");
1801                 goto out_delete_net;
1802         }
1803
1804         if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
1805                 ERROR("Failed to enter cgroups");
1806                 goto out_delete_net;
1807         }
1808
1809         if (!cgroup_ops->setup_limits(cgroup_ops, handler)) {
1810                 ERROR("Failed to setup cgroup limits for container \"%s\"", name);
1811                 goto out_delete_net;
1812         }
1813
1814         if (!cgroup_ops->chown(cgroup_ops, handler->conf))
1815                 goto out_delete_net;
1816
1817         if (!lxc_sync_barrier_child(handler, START_SYNC_STARTUP))
1818                 goto out_delete_net;
1819
1820         /* If not done yet, we're now ready to preserve the network namespace */
1821         if (handler->nsfd[LXC_NS_NET] < 0) {
1822                 ret = lxc_try_preserve_namespace(handler, LXC_NS_NET, "net");
1823                 if (ret < 0) {
1824                         if (ret != -ENOENT) {
1825                                 SYSERROR("Failed to preserve net namespace");
1826                                 goto out_delete_net;
1827                         }
1828                 }
1829         }
1830         ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
1831         if (ret < 0)
1832                 SYSWARN("Failed to allocate new network namespace id");
1833         else
1834                 TRACE("Allocated new network namespace id");
1835
1836         /* Create the network configuration. */
1837         if (handler->ns_clone_flags & CLONE_NEWNET) {
1838                 ret = lxc_create_network(handler);
1839                 if (ret < 0) {
1840                         ERROR("Failed to create the network");
1841                         goto out_delete_net;
1842                 }
1843         }
1844
1845         ret = setup_proc_filesystem(conf, handler->pid);
1846         if (ret < 0) {
1847                 ERROR("Failed to setup procfs limits");
1848                 goto out_delete_net;
1849         }
1850
1851         ret = setup_resource_limits(conf, handler->pid);
1852         if (ret < 0) {
1853                 ERROR("Failed to setup resource limits");
1854                 goto out_delete_net;
1855         }
1856
1857         /* Tell the child to continue its initialization. */
1858         if (!lxc_sync_wake_child(handler, START_SYNC_POST_CONFIGURE))
1859                 goto out_delete_net;
1860
1861         ret = lxc_rootfs_prepare_parent(handler);
1862         if (ret) {
1863                 ERROR("Failed to prepare rootfs");
1864                 goto out_delete_net;
1865         }
1866
1867         if (handler->ns_clone_flags & CLONE_NEWNET) {
1868                 ret = lxc_network_send_to_child(handler);
1869                 if (ret < 0) {
1870                         SYSERROR("Failed to send veth names to child");
1871                         goto out_delete_net;
1872                 }
1873         }
1874
1875         if (!lxc_sync_wait_child(handler, START_SYNC_IDMAPPED_MOUNTS))
1876                 goto out_delete_net;
1877
1878         ret = lxc_idmapped_mounts_parent(handler);
1879         if (ret) {
1880                 ERROR("Failed to setup mount entries");
1881                 goto out_delete_net;
1882         }
1883
1884         if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
1885                 goto out_delete_net;
1886
1887         /*
1888          * With isolation the limiting devices cgroup was already setup, so
1889          * only setup devices here if we have no namespace directory.
1890          */
1891         if (!handler->conf->cgroup_meta.namespace_dir &&
1892             !cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
1893                 ERROR("Failed to setup legacy device cgroup controller limits");
1894                 goto out_delete_net;
1895         }
1896         TRACE("Set up legacy device cgroup controller limits");
1897
1898         if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
1899                 ERROR("Failed to setup cgroup2 device controller limits");
1900                 goto out_delete_net;
1901         }
1902         TRACE("Set up cgroup2 device controller limits");
1903
1904         cgroup_ops->finalize(cgroup_ops);
1905         TRACE("Finished setting up cgroups");
1906
1907         /* Run any host-side start hooks */
1908         ret = run_lxc_hooks(name, "start-host", conf, NULL);
1909         if (ret < 0) {
1910                 ERROR("Failed to run lxc.hook.start-host");
1911                 goto out_delete_net;
1912         }
1913
1914         if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
1915                 goto out_delete_net;
1916
1917         if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
1918                 /* Now we're ready to preserve the cgroup namespace */
1919                 ret = lxc_try_preserve_namespace(handler, LXC_NS_CGROUP, "cgroup");
1920                 if (ret < 0) {
1921                         if (ret != -ENOENT) {
1922                                 SYSERROR("Failed to preserve cgroup namespace");
1923                                 goto out_delete_net;
1924                         }
1925                 }
1926         }
1927
1928         if (handler->ns_unshare_flags & CLONE_NEWTIME) {
1929                 /* Now we're ready to preserve the time namespace */
1930                 ret = lxc_try_preserve_namespace(handler, LXC_NS_TIME, "time");
1931                 if (ret < 0) {
1932                         if (ret != -ENOENT) {
1933                                 SYSERROR("Failed to preserve time namespace");
1934                                 goto out_delete_net;
1935                         }
1936                 }
1937         }
1938
1939         ret = lxc_sync_fds_parent(handler);
1940         if (ret < 0) {
1941                 SYSERROR("Failed to sync file descriptors with child");
1942                 goto out_delete_net;
1943         }
1944
1945         ret = lxc_terminal_setup(conf);
1946         if (ret < 0) {
1947                 SYSERROR("Failed to create console");
1948                 goto out_delete_net;
1949         }
1950
1951         /*
1952          * Tell the child to complete its initialization and wait for it to
1953          * exec or return an error. (The child will never return
1954          * START_SYNC_READY_START+1. It will either close the sync pipe,
1955          * causing lxc_sync_barrier_child to return success, or return a
1956          * different value, causing us to error out).
1957          */
1958         if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
1959                 goto out_delete_net;
1960
1961         /* Now all networks are created, network devices are moved into place,
1962          * and the correct names and ifindices in the respective namespaces have
1963          * been recorded. The corresponding structs have now all been filled. So
1964          * log them for debugging purposes.
1965          */
1966         lxc_log_configured_netdevs(conf);
1967
1968         ret = handler->ops->post_start(handler, handler->data);
1969         if (ret < 0)
1970                 goto out_abort;
1971
1972         ret = lxc_set_state(name, handler, RUNNING);
1973         if (ret < 0) {
1974                 ERROR("Failed to set state to \"%s\"", lxc_state2str(RUNNING));
1975                 goto out_abort;
1976         }
1977
1978         lxc_sync_fini(handler);
1979
1980         return 0;
1981
1982 out_delete_net:
1983         if (handler->ns_clone_flags & CLONE_NEWNET)
1984                 lxc_delete_network(handler);
1985
1986 out_abort:
1987         lxc_abort(handler);
1988
1989 out_sync_fini:
1990         lxc_sync_fini(handler);
1991
1992         return -1;
1993 }
1994
1995 static int lxc_inherit_namespaces(struct lxc_handler *handler)
1996 {
1997         const char *lxcpath = handler->lxcpath;
1998         struct lxc_conf *conf = handler->conf;
1999
2000         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
2001                 if (!conf->ns_share[i])
2002                         continue;
2003
2004                 handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i],
2005                                                         lxcpath,
2006                                                         ns_info[i].proc_name);
2007                 if (handler->nsfd[i] < 0)
2008                         return -1;
2009
2010                 TRACE("Recording inherited %s namespace with fd %d",
2011                       ns_info[i].proc_name, handler->nsfd[i]);
2012         }
2013
2014         return 0;
2015 }
2016
2017 int __lxc_start(struct lxc_handler *handler, struct lxc_operations *ops,
2018                 void *data, const char *lxcpath, bool daemonize, int *error_num)
2019 {
2020         int ret, status;
2021         const char *name = handler->name;
2022         struct lxc_conf *conf = handler->conf;
2023         struct cgroup_ops *cgroup_ops;
2024
2025         ret = lxc_init(name, handler);
2026         if (ret < 0) {
2027                 ERROR("Failed to initialize container \"%s\"", name);
2028                 goto out_abort;
2029         }
2030         handler->ops = ops;
2031         handler->data = data;
2032         handler->daemonize = daemonize;
2033         cgroup_ops = handler->cgroup_ops;
2034
2035         if (!attach_block_device(handler->conf)) {
2036                 ERROR("Failed to attach block device");
2037                 ret = -1;
2038                 goto out_abort;
2039         }
2040
2041         if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
2042                 ERROR("Failed to create monitor cgroup");
2043                 ret = -1;
2044                 goto out_abort;
2045         }
2046
2047         if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
2048                 ERROR("Failed to delegate controllers to monitor cgroup");
2049                 ret = -1;
2050                 goto out_abort;
2051         }
2052
2053         if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
2054                 ERROR("Failed to enter monitor cgroup");
2055                 ret = -1;
2056                 goto out_abort;
2057         }
2058
2059         ret = resolve_clone_flags(handler);
2060         if (ret < 0) {
2061                 ERROR("Failed to resolve clone flags");
2062                 ret = -1;
2063                 goto out_abort;
2064         }
2065
2066         ret = lxc_inherit_namespaces(handler);
2067         if (ret) {
2068                 SYSERROR("Failed to record inherited namespaces");
2069                 ret = -1;
2070                 goto out_abort;
2071         }
2072
2073         /* If the rootfs is not a blockdev, prevent the container from marking
2074          * it readonly.
2075          * If the container is unprivileged then skip rootfs pinning.
2076          */
2077         ret = lxc_rootfs_init(conf, !list_empty(&conf->id_map));
2078         if (ret) {
2079                 ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
2080                 ret = -1;
2081                 goto out_abort;
2082         }
2083
2084         if (geteuid() == 0 && !list_empty(&conf->id_map)) {
2085                 /*
2086                  * Most filesystems can't be mounted inside a userns so handle them here.
2087                  */
2088                 if (rootfs_is_blockdev(conf)) {
2089                         ret = unshare(CLONE_NEWNS);
2090                         if (ret < 0) {
2091                                 ERROR("Failed to unshare CLONE_NEWNS");
2092                                 goto out_abort;
2093                         }
2094                         INFO("Unshared CLONE_NEWNS");
2095
2096                         ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
2097                         if (ret < 0) {
2098                                 ERROR("Error setting up rootfs mount as root before spawn");
2099                                 goto out_abort;
2100                         }
2101                         INFO("Set up container rootfs as host root");
2102                 }
2103         }
2104
2105         ret = lxc_spawn(handler);
2106         if (ret < 0) {
2107                 ERROR("Failed to spawn container \"%s\"", name);
2108                 goto out_detach_blockdev;
2109         }
2110
2111         handler->conf->reboot = REBOOT_NONE;
2112
2113         ret = lxc_poll(name, handler);
2114         if (ret) {
2115                 ERROR("LXC mainloop exited with error: %d", ret);
2116                 goto out_delete_network;
2117         }
2118
2119         if (!handler->init_died && handler->pid > 0) {
2120                 ERROR("Child process is not killed");
2121                 ret = -1;
2122                 goto out_delete_network;
2123         }
2124
2125         status = lxc_wait_for_pid_status(handler->pid);
2126         if (status < 0)
2127                 SYSERROR("Failed to retrieve status for %d", handler->pid);
2128
2129         /* If the child process exited but was not signaled, it didn't call
2130          * reboot. This should mean it was an lxc-execute which simply exited.
2131          * In any case, treat it as a 'halt'.
2132          */
2133         if (WIFSIGNALED(status)) {
2134                 int signal_nr = WTERMSIG(status);
2135                 switch(signal_nr) {
2136                 case SIGINT: /* halt */
2137                         DEBUG("%s(%d) - Container \"%s\" is halting", signal_name(signal_nr), signal_nr, name);
2138                         break;
2139                 case SIGHUP: /* reboot */
2140                         DEBUG("%s(%d) - Container \"%s\" is rebooting", signal_name(signal_nr), signal_nr, name);
2141                         handler->conf->reboot = REBOOT_REQ;
2142                         break;
2143                 case SIGSYS: /* seccomp */
2144                         DEBUG("%s(%d) - Container \"%s\" violated its seccomp policy", signal_name(signal_nr), signal_nr, name);
2145                         break;
2146                 default:
2147                         DEBUG("%s(%d) - Container \"%s\" init exited", signal_name(signal_nr), signal_nr, name);
2148                         break;
2149                 }
2150         }
2151
2152         ret = lxc_restore_phys_nics_to_netns(handler);
2153         if (ret < 0)
2154                 ERROR("Failed to move physical network devices back to parent network namespace");
2155
2156         lxc_monitor_send_exit_code(name, status, handler->lxcpath);
2157         lxc_error_set_and_log(handler->pid, status);
2158         if (error_num)
2159                 *error_num = handler->exit_status;
2160
2161         lxc_delete_network(handler);
2162         detach_block_device(handler->conf);
2163         lxc_end(handler);
2164         return ret;
2165
2166 out_abort:
2167         lxc_abort(handler);
2168         lxc_end(handler);
2169         return ret;
2170
2171 out_detach_blockdev:
2172         lxc_abort(handler);
2173         detach_block_device(handler->conf);
2174         lxc_end(handler);
2175         return ret;
2176
2177 out_delete_network:
2178         lxc_abort(handler);
2179         lxc_restore_phys_nics_to_netns(handler);
2180         lxc_delete_network(handler);
2181         detach_block_device(handler->conf);
2182         lxc_end(handler);
2183         return ret;
2184 }
2185
/* Argument bundle handed to the start() and post_start() callbacks. */
struct start_args {
        /* Argument vector; argv[0] is the file start() passes to execvp(). */
        char *const *argv;
};
2189
2190 static int start(struct lxc_handler *handler, void* data)
2191 {
2192         struct start_args *arg = data;
2193
2194         NOTICE("Exec'ing \"%s\"", arg->argv[0]);
2195
2196         execvp(arg->argv[0], arg->argv);
2197         SYSERROR("Failed to exec \"%s\"", arg->argv[0]);
2198         return 0;
2199 }
2200
2201 static int post_start(struct lxc_handler *handler, void* data)
2202 {
2203         struct start_args *arg = data;
2204
2205         NOTICE("Started \"%s\" with pid \"%d\"", arg->argv[0], handler->pid);
2206         return 0;
2207 }
2208
2209 static struct lxc_operations start_ops = {
2210         .start = start,
2211         .post_start = post_start
2212 };
2213
2214 int lxc_start(char *const argv[], struct lxc_handler *handler,
2215               const char *lxcpath, bool daemonize, int *error_num)
2216 {
2217         struct start_args start_arg = {
2218                 .argv = argv,
2219         };
2220
2221         TRACE("Doing lxc_start");
2222         return __lxc_start(handler, &start_ops, &start_arg, lxcpath, daemonize, error_num);
2223 }
2224
2225 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
2226                                             const char *name)
2227 {
2228         char destroy[PATH_MAX];
2229         struct lxc_container *c;
2230         int ret = 0;
2231         bool bret = true;
2232
2233         if (handler->conf->rootfs.path && handler->conf->rootfs.mount) {
2234                 bret = do_destroy_container(handler);
2235                 if (!bret) {
2236                         ERROR("Error destroying rootfs for container \"%s\"", name);
2237                         return;
2238                 }
2239         }
2240         INFO("Destroyed rootfs for container \"%s\"", name);
2241
2242         ret = strnprintf(destroy, sizeof(destroy), "%s/%s", handler->lxcpath, name);
2243         if (ret < 0) {
2244                 ERROR("Error destroying directory for container \"%s\"", name);
2245                 return;
2246         }
2247
2248         c = lxc_container_new(name, handler->lxcpath);
2249         if (c) {
2250                 if (container_disk_lock(c)) {
2251                         INFO("Could not update lxc_snapshots file");
2252                         lxc_container_put(c);
2253                 } else {
2254                         mod_all_rdeps(c, false);
2255                         container_disk_unlock(c);
2256                         lxc_container_put(c);
2257                 }
2258         }
2259
2260         if (!handler->am_root)
2261                 ret = userns_exec_full(handler->conf, lxc_rmdir_onedev_wrapper,
2262                                        destroy, "lxc_rmdir_onedev_wrapper");
2263         else
2264                 ret = lxc_rmdir_onedev(destroy, NULL);
2265
2266         if (ret < 0) {
2267                 ERROR("Error destroying directory for container \"%s\"", name);
2268                 return;
2269         }
2270         INFO("Destroyed directory for container \"%s\"", name);
2271 }
2272
2273 static int lxc_rmdir_onedev_wrapper(void *data)
2274 {
2275         char *arg = (char *) data;
2276         return lxc_rmdir_onedev(arg, NULL);
2277 }
2278
2279 static bool do_destroy_container(struct lxc_handler *handler)
2280 {
2281         int ret;
2282
2283         if (!handler->am_root) {
2284                 ret = userns_exec_full(handler->conf, storage_destroy_wrapper,
2285                                        handler->conf, "storage_destroy_wrapper");
2286                 if (ret < 0)
2287                         return false;
2288
2289                 return true;
2290         }
2291
2292         return storage_destroy(handler->conf);
2293 }