src/lxc/attach.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include "config.h"
   4
   5 #include <errno.h>
   6 #include <fcntl.h>
   7 #include <grp.h>
   8 #include <linux/unistd.h>
   9 #include <pwd.h>
  10 #include <pthread.h>
  11 #include <signal.h>
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/mount.h>
  16 #include <sys/param.h>
  17 #include <sys/prctl.h>
  18 #include <sys/socket.h>
  19 #include <sys/syscall.h>
  20 #include <sys/wait.h>
  21 #include <termios.h>
  22 #include <unistd.h>
  23
  24 #include "attach.h"
  25
  26 #include "af_unix.h"
  27 #include "attach.h"
  28 #include "caps.h"
  29 #include "cgroups/cgroup.h"
  30 #include "cgroups/cgroup_utils.h"
  31 #include "commands.h"
  32 #include "conf.h"
  33 #include "confile.h"
  34 #include "log.h"
  35 #include "lsm/lsm.h"
  36 #include "lxclock.h"
  37 #include "lxcseccomp.h"
  38 #include "macro.h"
  39 #include "mainloop.h"
  40 #include "memory_utils.h"
  41 #include "mount_utils.h"
  42 #include "namespace.h"
  43 #include "process_utils.h"
  44 #include "sync.h"
  45 #include "syscall_wrappers.h"
  46 #include "terminal.h"
  47 #include "utils.h"
  48
/*
 * Invoke the project's lxc_log_define() macro with (attach, lxc).
 * NOTE(review): presumably this declares the log category that the
 * TRACE()/INFO()/WARN()/SYSERROR() calls in this file report under --
 * confirm against log.h.
 */
lxc_log_define(attach, lxc);

/* Define default options if no options are supplied by the user. */
static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
  53
  54 /*
  55  * The context used to attach to the container.
  56  * @attach_flags        : the attach flags specified in lxc_attach_options_t
  57  * @init_pid            : the PID of the container's init process
  58  * @dfd_init_pid        : file descriptor to /proc/@init_pid
  59  *                        __Must be closed in attach_context_security_barrier()__!
  60  * @dfd_self_pid        : file descriptor to /proc/self
  61  *                        __Must be closed in attach_context_security_barrier()__!
  62  * @setup_ns_uid        : if CLONE_NEWUSER is specified will contain the uid used
  63  *                        during attach setup.
  64  * @setup_ns_gid        : if CLONE_NEWUSER is specified will contain the gid used
  65  *                        during attach setup.
  66  * @target_ns_uid       : if CLONE_NEWUSER is specified the uid that the final
  67  *                        program will be run with.
  68  * @target_ns_gid       : if CLONE_NEWUSER is specified the gid that the final
  69  *                        program will be run with.
  70  * @target_host_uid     : if CLONE_NEWUSER is specified the uid that the final
  71  *                        program will be run with on the host.
  72  * @target_host_gid     : if CLONE_NEWUSER is specified the gid that the final
  73  *                        program will be run with on the host.
  74  * @lsm_label           : LSM label to be used for the attaching process
  75  * @container           : the container we're attaching o
  76  * @personality         : the personality to use for the final program
  77  * @capability          : the capability mask of the @init_pid
  78  * @ns_inherited        : flags of namespaces that the final program will inherit
  79  *                        from @init_pid
  80  * @ns_fd               : file descriptors to @init_pid's namespaces
  81  * @core_sched_cookie   : core scheduling cookie
  82  */
  83 struct attach_context {
  84         unsigned int ns_clone_flags;
  85         unsigned int attach_flags;
  86         int init_pid;
  87         int init_pidfd;
  88         int dfd_init_pid;
  89         int dfd_self_pid;
  90         uid_t setup_ns_uid;
  91         gid_t setup_ns_gid;
  92         uid_t target_ns_uid;
  93         gid_t target_ns_gid;
  94         uid_t target_host_uid;
  95         uid_t target_host_gid;
  96         char *lsm_label;
  97         struct lxc_container *container;
  98         personality_t personality;
  99         unsigned long long capability_mask;
 100         int ns_inherited;
 101         int ns_fd[LXC_NS_MAX];
 102         struct lsm_ops *lsm_ops;
 103         __u64 core_sched_cookie;
 104 };
 105
 106 static pid_t pidfd_get_pid(int dfd_init_pid, int pidfd)
 107 {
 108         __do_free char *line = NULL;
 109         __do_fclose FILE *f = NULL;
 110         size_t len = 0;
 111         char path[STRLITERALLEN("fdinfo/") + INTTYPE_TO_STRLEN(int) + 1 ] = "fdinfo/";
 112         int ret;
 113
 114         if (dfd_init_pid < 0 || pidfd < 0)
 115                 return ret_errno(EBADF);
 116
 117         ret = strnprintf(path + STRLITERALLEN("fdinfo/"), INTTYPE_TO_STRLEN(int), "%d", pidfd);
 118         if (ret < 0)
 119                 return ret_errno(EIO);
 120
 121         f = fdopen_at(dfd_init_pid, path, "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
 122         if (!f)
 123                 return -errno;
 124
 125         while (getline(&line, &len, f) != -1) {
 126                 const char *prefix = "Pid:\t";
 127                 const size_t prefix_len = STRLITERALLEN("Pid:\t");
 128                 int pid = -ESRCH;
 129                 char *slider = line;
 130
 131                 if (!strnequal(slider, prefix, prefix_len))
 132                         continue;
 133
 134                 slider += prefix_len;
 135                 slider = lxc_trim_whitespace_in_place(slider);
 136
 137                 ret = lxc_safe_int(slider, &pid);
 138                 if (ret)
 139                         return -ret;
 140
 141                 return pid;
 142         }
 143
 144         return ret_errno(ENOENT);
 145 }
 146
 147 static inline bool sync_wake_pid(int fd, pid_t pid)
 148 {
 149         return lxc_write_nointr(fd, &pid, sizeof(pid_t)) == sizeof(pid_t);
 150 }
 151
 152 static inline bool sync_wait_pid(int fd, pid_t *pid)
 153 {
 154         return lxc_read_nointr(fd, pid, sizeof(pid_t)) == sizeof(pid_t);
 155 }
 156
 157 static inline bool sync_wake_fd(int fd, int fd_send)
 158 {
 159         return lxc_abstract_unix_send_fds(fd, &fd_send, 1, NULL, 0) > 0;
 160 }
 161
 162 static inline bool sync_wait_fd(int fd, int *fd_recv)
 163 {
 164         return lxc_abstract_unix_recv_one_fd(fd, fd_recv, NULL, 0) > 0;
 165 }
 166
 167 static inline bool attach_lsm(lxc_attach_options_t *options)
 168 {
 169         return (options->attach_flags & (LXC_ATTACH_LSM | LXC_ATTACH_LSM_LABEL));
 170 }
 171
 172 static struct attach_context *alloc_attach_context(void)
 173 {
 174         struct attach_context *ctx;
 175
 176         ctx = zalloc(sizeof(struct attach_context));
 177         if (!ctx)
 178                 return ret_set_errno(NULL, ENOMEM);
 179
 180         ctx->init_pid           = -ESRCH;
 181
 182         ctx->dfd_self_pid       = -EBADF;
 183         ctx->dfd_init_pid       = -EBADF;
 184         ctx->init_pidfd         = -EBADF;
 185
 186         ctx->setup_ns_uid       = LXC_INVALID_UID;
 187         ctx->setup_ns_gid       = LXC_INVALID_GID;
 188         ctx->target_ns_uid      = LXC_INVALID_UID;
 189         ctx->target_ns_gid      = LXC_INVALID_GID;
 190         ctx->target_host_uid    = LXC_INVALID_UID;
 191         ctx->target_host_gid    = LXC_INVALID_GID;
 192
 193         ctx->core_sched_cookie  = INVALID_SCHED_CORE_COOKIE;
 194
 195         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++)
 196                 ctx->ns_fd[i] = -EBADF;
 197
 198         return ctx;
 199 }
 200
 201 static int get_personality(const char *name, const char *lxcpath,
 202                            personality_t *personality)
 203 {
 204         __do_free char *p = NULL;
 205         int ret;
 206         signed long per;
 207
 208         p = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);
 209         if (!p) {
 210                 *personality = LXC_ARCH_UNCHANGED;
 211                 return 0;
 212         }
 213
 214         ret = lxc_config_parse_arch(p, &per);
 215         if (ret < 0)
 216                 return syserror("Failed to parse personality");
 217
 218         *personality = per;
 219         return 0;
 220 }
 221
 222 static int userns_setup_ids(struct attach_context *ctx,
 223                             lxc_attach_options_t *options)
 224 {
 225         __do_free char *line = NULL;
 226         __do_fclose FILE *f_gidmap = NULL, *f_uidmap = NULL;
 227         size_t len = 0;
 228         uid_t init_ns_uid = LXC_INVALID_UID;
 229         gid_t init_ns_gid = LXC_INVALID_GID;
 230         uid_t nsuid, hostuid, range_uid;
 231         gid_t nsgid, hostgid, range_gid;
 232
 233         if (!(options->namespaces & CLONE_NEWUSER))
 234                 return 0;
 235
 236         f_uidmap = fdopen_at(ctx->dfd_init_pid, "uid_map", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
 237         if (!f_uidmap)
 238                 return syserror("Failed to open uid_map");
 239
 240         while (getline(&line, &len, f_uidmap) != -1) {
 241                 if (sscanf(line, "%u %u %u", &nsuid, &hostuid, &range_uid) != 3)
 242                         continue;
 243
 244                 if (0 >= nsuid && 0 < nsuid + range_uid) {
 245                         ctx->setup_ns_uid = 0;
 246                         TRACE("Container has mapping for uid 0");
 247                         break;
 248                 }
 249
 250                 if (ctx->target_host_uid >= hostuid && ctx->target_host_uid < hostuid + range_uid) {
 251                         init_ns_uid = (ctx->target_host_uid - hostuid) + nsuid;
 252                         TRACE("Container runs with uid %d", init_ns_uid);
 253                 }
 254         }
 255
 256         f_gidmap = fdopen_at(ctx->dfd_init_pid, "gid_map", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
 257         if (!f_gidmap)
 258                 return syserror("Failed to open gid_map");
 259
 260         while (getline(&line, &len, f_gidmap) != -1) {
 261                 if (sscanf(line, "%u %u %u", &nsgid, &hostgid, &range_gid) != 3)
 262                         continue;
 263
 264                 if (0 >= nsgid && 0 < nsgid + range_gid) {
 265                         ctx->setup_ns_gid = 0;
 266                         TRACE("Container has mapping for gid 0");
 267                         break;
 268                 }
 269
 270                 if (ctx->target_host_gid >= hostgid && ctx->target_host_gid < hostgid + range_gid) {
 271                         init_ns_gid = (ctx->target_host_gid - hostgid) + nsgid;
 272                         TRACE("Container runs with gid %d", init_ns_gid);
 273                 }
 274         }
 275
 276         if (ctx->setup_ns_uid == LXC_INVALID_UID)
 277                 ctx->setup_ns_uid = init_ns_uid;
 278
 279         if (ctx->setup_ns_gid == LXC_INVALID_UID)
 280                 ctx->setup_ns_gid = init_ns_gid;
 281
 282         return 0;
 283 }
 284
 285 static void userns_target_ids(struct attach_context *ctx, lxc_attach_options_t *options)
 286 {
 287         if (options->uid != LXC_INVALID_UID)
 288                 ctx->target_ns_uid = options->uid;
 289         else if (options->namespaces & CLONE_NEWUSER)
 290                 ctx->target_ns_uid = ctx->setup_ns_uid;
 291         else
 292                 ctx->target_ns_uid = 0;
 293
 294         if (ctx->target_ns_uid == LXC_INVALID_UID)
 295                 WARN("Invalid uid specified");
 296
 297         if (options->gid != LXC_INVALID_GID)
 298                 ctx->target_ns_gid = options->gid;
 299         else if (options->namespaces & CLONE_NEWUSER)
 300                 ctx->target_ns_gid = ctx->setup_ns_gid;
 301         else
 302                 ctx->target_ns_gid = 0;
 303
 304         if (ctx->target_ns_gid == LXC_INVALID_GID)
 305                 WARN("Invalid gid specified");
 306 }
 307
 308 static int parse_init_status(struct attach_context *ctx, lxc_attach_options_t *options)
 309 {
 310         __do_free char *line = NULL;
 311         __do_fclose FILE *f = NULL;
 312         size_t len = 0;
 313         bool caps_found = false;
 314         int ret;
 315
 316         f = fdopen_at(ctx->dfd_init_pid, "status", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
 317         if (!f)
 318                 return syserror("Failed to open status file");
 319
 320         while (getline(&line, &len, f) != -1) {
 321                 signed long value = -1;
 322
 323                 /*
 324                  * Format is: real, effective, saved set user, fs we only care
 325                  * about real uid.
 326                  */
 327                 ret = sscanf(line, "Uid: %ld", &value);
 328                 if (ret != EOF && ret == 1) {
 329                         ctx->target_host_uid = (uid_t)value;
 330                         TRACE("Container's init process runs with hostuid %d", ctx->target_host_uid);
 331                         goto next;
 332                 }
 333
 334                 ret = sscanf(line, "Gid: %ld", &value);
 335                 if (ret != EOF && ret == 1) {
 336                         ctx->target_host_gid = (gid_t)value;
 337                         TRACE("Container's init process runs with hostgid %d", ctx->target_host_gid);
 338                         goto next;
 339                 }
 340
 341                 ret = sscanf(line, "CapBnd: %llx", &ctx->capability_mask);
 342                 if (ret != EOF && ret == 1) {
 343                         caps_found = true;
 344                         goto next;
 345                 }
 346
 347         next:
 348                 if (ctx->target_host_uid != LXC_INVALID_UID &&
 349                     ctx->target_host_gid != LXC_INVALID_GID &&
 350                     caps_found)
 351                         break;
 352
 353         }
 354
 355         ret = userns_setup_ids(ctx, options);
 356         if (ret)
 357                 return syserror_ret(ret, "Failed to get setup ids");
 358         userns_target_ids(ctx, options);
 359
 360         return 0;
 361 }
 362
 363 static bool pidfd_setns_supported(struct attach_context *ctx)
 364 {
 365         int ret;
 366
 367         /*
 368          * The ability to attach to time namespaces came after the introduction
 369          * of of using pidfds for attaching to namespaces. To avoid having to
 370          * special-case both CLONE_NEWUSER and CLONE_NEWTIME handling, let's
 371          * use CLONE_NEWTIME as gatekeeper.
 372          */
 373         if (ctx->init_pidfd >= 0)
 374                 ret = setns(ctx->init_pidfd, CLONE_NEWTIME);
 375         else
 376                 ret = -EOPNOTSUPP;
 377         TRACE("Attaching to namespaces via pidfds %s",
 378               ret ? "unsupported" : "supported");
 379         return ret == 0;
 380 }
 381
 382 static int get_attach_context(struct attach_context *ctx,
 383                               struct lxc_container *container,
 384                               lxc_attach_options_t *options)
 385 {
 386         __do_free char *lsm_label = NULL;
 387         int ret;
 388         char path[LXC_PROC_PID_LEN];
 389
 390         ctx->container = container;
 391         ctx->attach_flags = options->attach_flags;
 392
 393         ctx->dfd_self_pid = open_at(-EBADF, "/proc/self",
 394                                     PROTECT_OPATH_FILE & ~O_NOFOLLOW,
 395                                     (PROTECT_LOOKUP_ABSOLUTE_WITH_SYMLINKS & ~RESOLVE_NO_XDEV), 0);
 396         if (ctx->dfd_self_pid < 0)
 397                 return syserror("Failed to open /proc/self");
 398
 399         ctx->init_pidfd = lxc_cmd_get_init_pidfd(container->name, container->config_path);
 400         if (ctx->init_pidfd >= 0)
 401                 ctx->init_pid = pidfd_get_pid(ctx->dfd_self_pid, ctx->init_pidfd);
 402         else
 403                 ctx->init_pid = lxc_cmd_get_init_pid(container->name, container->config_path);
 404         if (ctx->init_pid < 0)
 405                 return syserror_ret(-1, "Failed to get init pid");
 406
 407         ret = lxc_cmd_get_clone_flags(container->name, container->config_path);
 408         if (ret < 0)
 409                 SYSERROR("Failed to retrieve namespace flags");
 410         ctx->ns_clone_flags = ret;
 411
 412         ret = core_scheduling_cookie_get(ctx->init_pid, &ctx->core_sched_cookie);
 413         if (ret || !core_scheduling_cookie_valid(ctx->core_sched_cookie))
 414                 INFO("Container does not run in a separate core scheduling domain");
 415         else
 416                 INFO("Container runs in separate core scheduling domain %llu",
 417                      (llu)ctx->core_sched_cookie);
 418
 419         ret = strnprintf(path, sizeof(path), "/proc/%d", ctx->init_pid);
 420         if (ret < 0)
 421                 return ret_errno(EIO);
 422
 423         ctx->dfd_init_pid = open_at(-EBADF, path,
 424                                     PROTECT_OPATH_DIRECTORY,
 425                                     (PROTECT_LOOKUP_ABSOLUTE & ~RESOLVE_NO_XDEV), 0);
 426         if (ctx->dfd_init_pid < 0)
 427                 return syserror("Failed to open /proc/%d", ctx->init_pid);
 428
 429         if (ctx->init_pidfd >= 0) {
 430                 ret = lxc_raw_pidfd_send_signal(ctx->init_pidfd, 0, NULL, 0);
 431                 if (ret)
 432                         return syserror("Container process exited or PID has been recycled");
 433                 else
 434                         TRACE("Container process still running and PID was not recycled");
 435
 436                 if (!pidfd_setns_supported(ctx)) {
 437                         /* We can't risk leaking file descriptors during attach. */
 438                         if (close(ctx->init_pidfd))
 439                                 return syserror("Failed to close pidfd");
 440
 441                         ctx->init_pidfd = -EBADF;
 442                         TRACE("Attaching to namespaces via pidfds not supported");
 443                 }
 444         }
 445
 446         /* Determine which namespaces the container was created with. */
 447         if (options->namespaces == -1) {
 448                 options->namespaces = ctx->ns_clone_flags;
 449                 if (options->namespaces == -1)
 450                         return syserror_set(-EINVAL, "Failed to automatically determine the namespaces which the container uses");
 451
 452                 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 453                         if (ns_info[i].clone_flag & CLONE_NEWCGROUP)
 454                                 if (!(options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) ||
 455                                     !cgns_supported())
 456                                         continue;
 457
 458                         if (ns_info[i].clone_flag & options->namespaces)
 459                                 continue;
 460
 461                         ctx->ns_inherited |= ns_info[i].clone_flag;
 462                 }
 463         }
 464
 465         ret = parse_init_status(ctx, options);
 466         if (ret)
 467                 return syserror("Failed to open parse file");
 468
 469         ctx->lsm_ops = lsm_init_static();
 470
 471         if (attach_lsm(options)) {
 472                 if (ctx->attach_flags & LXC_ATTACH_LSM_LABEL)
 473                         lsm_label = options->lsm_label;
 474                 else
 475                         lsm_label = ctx->lsm_ops->process_label_get_at(ctx->lsm_ops, ctx->dfd_init_pid);
 476                 if (!lsm_label)
 477                         WARN("No security context received");
 478                 else
 479                         INFO("Retrieved security context %s", lsm_label);
 480         }
 481
 482         ret = get_personality(container->name, container->config_path, &ctx->personality);
 483         if (ret)
 484                 return syserror_ret(ret, "Failed to get personality of the container");
 485
 486         if (!ctx->container->lxc_conf) {
 487                 ctx->container->lxc_conf = lxc_conf_init();
 488                 if (!ctx->container->lxc_conf)
 489                         return syserror_set(-ENOMEM, "Failed to allocate new lxc config");
 490         }
 491
 492         ctx->lsm_label = move_ptr(lsm_label);
 493         return 0;
 494 }
 495
 496 static int same_nsfd(int dfd_pid1, int dfd_pid2, const char *ns_path)
 497 {
 498         int ret;
 499         struct stat ns_st1, ns_st2;
 500
 501         ret = fstatat(dfd_pid1, ns_path, &ns_st1, 0);
 502         if (ret)
 503                 return -errno;
 504
 505         ret = fstatat(dfd_pid2, ns_path, &ns_st2, 0);
 506         if (ret)
 507                 return -errno;
 508
 509         /* processes are in the same namespace */
 510         if ((ns_st1.st_dev == ns_st2.st_dev) &&
 511             (ns_st1.st_ino == ns_st2.st_ino))
 512                 return 1;
 513
 514         return 0;
 515 }
 516
 517 static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path)
 518 {
 519         __do_close int ns_fd2 = -EBADF;
 520         int ret = -1;
 521
 522         ns_fd2 = open_at(dfd_pid2, ns_path, PROTECT_OPEN_WITH_TRAILING_SYMLINKS,
 523                          (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
 524                           ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)), 0);
 525         if (ns_fd2 < 0) {
 526                 if (errno == ENOENT)
 527                         return -ENOENT;
 528                 return syserror("Failed to open %d(%s)", dfd_pid2, ns_path);
 529         }
 530
 531         ret = same_nsfd(dfd_pid1, dfd_pid2, ns_path);
 532         switch (ret) {
 533         case -ENOENT:
 534                 __fallthrough;
 535         case 1:
 536                 return ret_errno(ENOENT);
 537         case 0:
 538                 /* processes are in different namespaces */
 539                 return move_fd(ns_fd2);
 540         }
 541
 542         return ret;
 543 }
 544
 545 static int __prepare_namespaces_pidfd(struct attach_context *ctx)
 546 {
 547         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 548                 int ret;
 549
 550                 ret = same_nsfd(ctx->dfd_self_pid,
 551                                 ctx->dfd_init_pid,
 552                                 ns_info[i].proc_path);
 553                 switch (ret) {
 554                 case -ENOENT:
 555                         __fallthrough;
 556                 case 1:
 557                         ctx->ns_inherited &= ~ns_info[i].clone_flag;
 558                         TRACE("Shared %s namespace doesn't need attach", ns_info[i].proc_name);
 559                         continue;
 560                 case 0:
 561                         TRACE("Different %s namespace needs attach", ns_info[i].proc_name);
 562                         continue;
 563                 }
 564
 565                 return syserror("Failed to determine whether %s namespace is shared",
 566                                 ns_info[i].proc_name);
 567         }
 568
 569         return 0;
 570 }
 571
 572 static int __prepare_namespaces_nsfd(struct attach_context *ctx,
 573                                      lxc_attach_options_t *options)
 574 {
 575         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 576                 lxc_namespace_t j;
 577
 578                 if (options->namespaces & ns_info[i].clone_flag)
 579                         ctx->ns_fd[i] = open_at(ctx->dfd_init_pid,
 580                                                 ns_info[i].proc_path,
 581                                                 PROTECT_OPEN_WITH_TRAILING_SYMLINKS,
 582                                                 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
 583                                                  ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)),
 584                                                 0);
 585                 else if (ctx->ns_inherited & ns_info[i].clone_flag)
 586                         ctx->ns_fd[i] = same_ns(ctx->dfd_self_pid,
 587                                                 ctx->dfd_init_pid,
 588                                                 ns_info[i].proc_path);
 589                 else
 590                         continue;
 591
 592                 if (ctx->ns_fd[i] >= 0)
 593                         continue;
 594
 595                 if (ctx->ns_fd[i] == -ENOENT) {
 596                         ctx->ns_inherited &= ~ns_info[i].clone_flag;
 597                         continue;
 598                 }
 599
 600                 /* We failed to preserve the namespace. */
 601                 SYSERROR("Failed to preserve %s namespace of %d",
 602                          ns_info[i].proc_name, ctx->init_pid);
 603
 604                 /* Close all already opened file descriptors before we return an
 605                  * error, so we don't leak them.
 606                  */
 607                 for (j = 0; j < i; j++)
 608                         close_prot_errno_disarm(ctx->ns_fd[j]);
 609
 610                 return ret_errno(EINVAL);
 611         }
 612
 613         return 0;
 614 }
 615
 616 static int prepare_namespaces(struct attach_context *ctx,
 617                               lxc_attach_options_t *options)
 618 {
 619         if (ctx->init_pidfd < 0)
 620                 return __prepare_namespaces_nsfd(ctx, options);
 621
 622         return __prepare_namespaces_pidfd(ctx);
 623 }
 624
 625 static inline void put_namespaces(struct attach_context *ctx)
 626 {
 627         if (ctx->init_pidfd < 0) {
 628                 for (int i = 0; i < LXC_NS_MAX; i++)
 629                         close_prot_errno_disarm(ctx->ns_fd[i]);
 630         }
 631 }
 632
 633 static int __attach_namespaces_pidfd(struct attach_context *ctx,
 634                                      lxc_attach_options_t *options)
 635 {
 636         unsigned int ns_flags = options->namespaces | ctx->ns_inherited;
 637         int ret;
 638
 639         /* The common case is to attach to all namespaces. */
 640         ret = setns(ctx->init_pidfd, ns_flags);
 641         if (ret)
 642                 return syserror("Failed to attach to namespaces via pidfd");
 643
 644         /* We can't risk leaking file descriptors into the container. */
 645         if (close(ctx->init_pidfd))
 646                 return syserror("Failed to close pidfd");
 647         ctx->init_pidfd = -EBADF;
 648
 649         return log_trace(0, "Attached to container namespaces via pidfd");
 650 }
 651
 652 static int __attach_namespaces_nsfd(struct attach_context *ctx,
 653                                     lxc_attach_options_t *options)
 654 {
 655         int fret = 0;
 656
 657         for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 658                 int ret;
 659
 660                 if (ctx->ns_fd[i] < 0)
 661                         continue;
 662
 663                 ret = setns(ctx->ns_fd[i], ns_info[i].clone_flag);
 664                 if (ret)
 665                         return syserror("Failed to attach to %s namespace of %d",
 666                                         ns_info[i].proc_name, ctx->init_pid);
 667
 668                 if (close(ctx->ns_fd[i])) {
 669                         fret = -errno;
 670                         SYSERROR("Failed to close file descriptor for %s namespace",
 671                                  ns_info[i].proc_name);
 672                 }
 673                 ctx->ns_fd[i] = -EBADF;
 674         }
 675
 676         return fret;
 677 }
 678
 679 static int attach_namespaces(struct attach_context *ctx,
 680                              lxc_attach_options_t *options)
 681 {
 682         if (lxc_log_trace()) {
 683                 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
 684                         if (ns_info[i].clone_flag & options->namespaces) {
 685                                 TRACE("Attaching to %s namespace", ns_info[i].proc_name);
 686                                 continue;
 687                         }
 688                         if (ns_info[i].clone_flag & ctx->ns_inherited) {
 689                                 TRACE("Sharing %s namespace", ns_info[i].proc_name);
 690                                 continue;
 691                         }
 692                         TRACE("Inheriting %s namespace", ns_info[i].proc_name);
 693                 }
 694         }
 695
 696         if (ctx->init_pidfd < 0)
 697                 return __attach_namespaces_nsfd(ctx, options);
 698
 699         return __attach_namespaces_pidfd(ctx, options);
 700 }
 701
 702 static void put_attach_context(struct attach_context *ctx)
 703 {
 704         if (ctx) {
 705                 if (!(ctx->attach_flags & LXC_ATTACH_LSM_LABEL))
 706                         free_disarm(ctx->lsm_label);
 707                 close_prot_errno_disarm(ctx->dfd_init_pid);
 708
 709                 if (ctx->container) {
 710                         lxc_container_put(ctx->container);
 711                         ctx->container = NULL;
 712                 }
 713
 714                 put_namespaces(ctx);
 715                 free(ctx);
 716         }
 717 }
 718
 719 /*
 720  * Place anything in here that needs to be get rid of before we move into the
 721  * container's context and fail hard if we can't.
 722  */
 723 static bool attach_context_security_barrier(struct attach_context *ctx)
 724 {
 725         if (ctx) {
 726                 if (close(ctx->dfd_self_pid))
 727                         return false;
 728                 ctx->dfd_self_pid = -EBADF;
 729
 730                 if (close(ctx->dfd_init_pid))
 731                         return false;
 732                 ctx->dfd_init_pid = -EBADF;
 733         }
 734
 735         return true;
 736 }
 737
 738 int lxc_attach_remount_sys_proc(void)
 739 {
 740         int ret;
 741
 742         ret = unshare(CLONE_NEWNS);
 743         if (ret < 0)
 744                 return syserror("Failed to unshare mount namespace");
 745
 746         if (detect_shared_rootfs() && mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL))
 747                 SYSERROR("Failed to recursively turn root mount tree into dependent mount. Continuing...");
 748
 749         /* Assume /proc is always mounted, so remount it. */
 750         ret = umount2("/proc", MNT_DETACH);
 751         if (ret < 0)
 752                 return syserror("Failed to unmount /proc");
 753
 754         ret = mount("none", "/proc", "proc", 0, NULL);
 755         if (ret < 0)
 756                 return syserror("Failed to remount /proc");
 757
 758         /*
 759          * Try to umount /sys. If it's not a mount point, we'll get EINVAL, then
 760          * we ignore it because it may not have been mounted in the first place.
 761          */
 762         ret = umount2("/sys", MNT_DETACH);
 763         if (ret < 0 && errno != EINVAL)
 764                 return syserror("Failed to unmount /sys");
 765
 766         /* Remount it. */
 767         if (ret == 0 && mount("none", "/sys", "sysfs", 0, NULL))
 768                 return syserror("Failed to remount /sys");
 769
 770         return 0;
 771 }
 772
 773 static int drop_capabilities(struct attach_context *ctx)
 774 {
 775         int ret;
 776         __u32 last_cap;
 777
 778         ret = lxc_caps_last_cap(&last_cap);
 779         if (ret)
 780                 return syserror_ret(ret, "%d - Failed to drop capabilities", ret);
 781
 782         for (__u32 cap = 0; cap <= last_cap; cap++) {
 783                 if (ctx->capability_mask & (1LL << cap))
 784                         continue;
 785
 786                 if (prctl(PR_CAPBSET_DROP, prctl_arg(cap), prctl_arg(0),
 787                           prctl_arg(0), prctl_arg(0)))
 788                         return syserror("Failed to drop capability %d", cap);
 789
 790                 TRACE("Dropped capability %d", cap);
 791         }
 792
 793         return 0;
 794 }
 795
 796 static int lxc_attach_set_environment(struct attach_context *ctx,
 797                                       enum lxc_attach_env_policy_t policy,
 798                                       char **extra_env, char **extra_keep)
 799 {
 800         int ret;
 801
 802         if (policy == LXC_ATTACH_CLEAR_ENV) {
 803                 int path_kept = 0;
 804                 char **extra_keep_store = NULL;
 805
 806                 if (extra_keep) {
 807                         size_t count, i;
 808
 809                         for (count = 0; extra_keep[count]; count++)
 810                                 ;
 811
 812                         extra_keep_store = zalloc(count * sizeof(char *));
 813                         if (!extra_keep_store)
 814                                 return -1;
 815
 816                         for (i = 0; i < count; i++) {
 817                                 char *v = getenv(extra_keep[i]);
 818                                 if (v) {
 819                                         extra_keep_store[i] = strdup(v);
 820                                         if (!extra_keep_store[i]) {
 821                                                 while (i > 0)
 822                                                         free(extra_keep_store[--i]);
 823
 824                                                 free(extra_keep_store);
 825                                                 return -1;
 826                                         }
 827
 828                                         if (strequal(extra_keep[i], "PATH"))
 829                                                 path_kept = 1;
 830                                 }
 831                         }
 832                 }
 833
 834                 if (clearenv()) {
 835                         if (extra_keep_store) {
 836                                 char **p;
 837
 838                                 for (p = extra_keep_store; *p; p++)
 839                                         free(*p);
 840
 841                                 free(extra_keep_store);
 842                         }
 843
 844                         return syserror("Failed to clear environment");
 845                 }
 846
 847                 if (extra_keep_store) {
 848                         size_t i;
 849
 850                         for (i = 0; extra_keep[i]; i++) {
 851                                 if (extra_keep_store[i]) {
 852                                         ret = setenv(extra_keep[i], extra_keep_store[i], 1);
 853                                         if (ret < 0)
 854                                                 SYSWARN("Failed to set environment variable");
 855                                 }
 856
 857                                 free(extra_keep_store[i]);
 858                         }
 859
 860                         free(extra_keep_store);
 861                 }
 862
 863                 /* Always set a default path; shells and execlp tend to be fine
 864                  * without it, but there is a disturbing number of C programs
 865                  * out there that just assume that getenv("PATH") is never NULL
 866                  * and then die a painful segfault death.
 867                  */
 868                 if (!path_kept) {
 869                         ret = setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
 870                         if (ret < 0)
 871                                 SYSWARN("Failed to set environment variable");
 872                 }
 873         }
 874
 875         ret = putenv("container=lxc");
 876         if (ret < 0)
 877                 return log_warn(-1, "Failed to set environment variable");
 878
 879         /* Set container environment variables.*/
 880         if (ctx->container->lxc_conf) {
 881                 ret = lxc_set_environment(ctx->container->lxc_conf);
 882                 if (ret < 0)
 883                         return -1;
 884         }
 885
 886         /* Set extra environment variables. */
 887         if (extra_env) {
 888                 for (; *extra_env; extra_env++) {
 889                         char *p;
 890
 891                         /* We just assume the user knows what they are doing, so
 892                          * we don't do any checks.
 893                          */
 894                         p = strdup(*extra_env);
 895                         if (!p)
 896                                 return -1;
 897
 898                         ret = putenv(p);
 899                         if (ret < 0)
 900                                 SYSWARN("Failed to set environment variable");
 901                 }
 902         }
 903
 904         return 0;
 905 }
 906
 907 static char *lxc_attach_getpwshell(uid_t uid)
 908 {
 909         __do_free char *line = NULL, *result = NULL;
 910         __do_fclose FILE *pipe_f = NULL;
 911         int fd, ret;
 912         pid_t pid;
 913         int pipes[2];
 914         bool found = false;
 915         size_t line_bufsz = 0;
 916
 917         /* We need to fork off a process that runs the getent program, and we
 918          * need to capture its output, so we use a pipe for that purpose.
 919          */
 920         ret = pipe2(pipes, O_CLOEXEC);
 921         if (ret < 0)
 922                 return NULL;
 923
 924         pid = fork();
 925         if (pid < 0) {
 926                 close(pipes[0]);
 927                 close(pipes[1]);
 928                 return NULL;
 929         }
 930
 931         if (!pid) {
 932                 char uid_buf[32];
 933                 char *arguments[] = {
 934                         "getent",
 935                         "passwd",
 936                         uid_buf,
 937                         NULL
 938                 };
 939
 940                 close(pipes[0]);
 941
 942                 /* We want to capture stdout. */
 943                 ret = dup2(pipes[1], STDOUT_FILENO);
 944                 close(pipes[1]);
 945                 if (ret < 0)
 946                         _exit(EXIT_FAILURE);
 947
 948                 /* Get rid of stdin/stderr, so we try to associate it with
 949                  * /dev/null.
 950                  */
 951                 fd = open_devnull();
 952                 if (fd < 0) {
 953                         close(STDIN_FILENO);
 954                         close(STDERR_FILENO);
 955                 } else {
 956                         (void)dup3(fd, STDIN_FILENO, O_CLOEXEC);
 957                         (void)dup3(fd, STDERR_FILENO, O_CLOEXEC);
 958                         close(fd);
 959                 }
 960
 961                 /* Finish argument list. */
 962                 ret = strnprintf(uid_buf, sizeof(uid_buf), "%ld", (long)uid);
 963                 if (ret <= 0)
 964                         _exit(EXIT_FAILURE);
 965
 966                 /* Try to run getent program. */
 967                 (void)execvp("getent", arguments);
 968                 _exit(EXIT_FAILURE);
 969         }
 970
 971         close(pipes[1]);
 972
 973         pipe_f = fdopen(pipes[0], "re");
 974         if (!pipe_f) {
 975                 close(pipes[0]);
 976                 goto reap_child;
 977         }
 978         /* Transfer ownership of pipes[0] to pipe_f. */
 979         move_fd(pipes[0]);
 980
 981         while (getline(&line, &line_bufsz, pipe_f) != -1) {
 982                 int i;
 983                 long value;
 984                 char *token;
 985                 char *endptr = NULL, *saveptr = NULL;
 986
 987                 /* If we already found something, just continue to read
 988                 * until the pipe doesn't deliver any more data, but
 989                 * don't modify the existing data structure.
 990                  */
 991                 if (found)
 992                         continue;
 993
 994                 if (!line)
 995                         continue;
 996
 997                 /* Trim line on the right hand side. */
 998                 for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
 999                         line[i - 1] = '\0';
1000
1001                 /* Split into tokens: first: user name. */
1002                 token = strtok_r(line, ":", &saveptr);
1003                 if (!token)
1004                         continue;
1005
1006                 /* next: placeholder password field */
1007                 token = strtok_r(NULL, ":", &saveptr);
1008                 if (!token)
1009                         continue;
1010
1011                 /* next: user id */
1012                 token = strtok_r(NULL, ":", &saveptr);
1013                 value = token ? strtol(token, &endptr, 10) : 0;
1014                 if (!token || !endptr || *endptr || value == LONG_MIN ||
1015                     value == LONG_MAX)
1016                         continue;
1017
1018                 /* placeholder conherence check: user id matches */
1019                 if ((uid_t)value != uid)
1020                         continue;
1021
1022                 /* skip fields: gid, gecos, dir, go to next field 'shell' */
1023                 for (i = 0; i < 4; i++) {
1024                         token = strtok_r(NULL, ":", &saveptr);
1025                         if (!token)
1026                                 continue;
1027                 }
1028
1029                 if (!token)
1030                         continue;
1031
1032                 free_disarm(result);
1033                 result = strdup(token);
1034
1035                 /* Sanity check that there are no fields after that. */
1036                 token = strtok_r(NULL, ":", &saveptr);
1037                 if (token)
1038                         continue;
1039
1040                 found = true;
1041         }
1042
1043 reap_child:
1044         ret = wait_for_pid(pid);
1045         if (ret < 0)
1046                 return NULL;
1047
1048         if (!found)
1049                 return NULL;
1050
1051         return move_ptr(result);
1052 }
1053
1054 static bool fetch_seccomp(struct lxc_container *c, lxc_attach_options_t *options)
1055 {
1056         __do_free char *path = NULL;
1057         int ret;
1058         bool bret;
1059
1060         if (!attach_lsm(options)) {
1061                 free_disarm(c->lxc_conf->seccomp.seccomp);
1062                 return true;
1063         }
1064
1065         /* Remove current setting. */
1066         if (!c->set_config_item(c, "lxc.seccomp.profile", "") &&
1067             !c->set_config_item(c, "lxc.seccomp", ""))
1068                 return false;
1069
1070         /* Fetch the current profile path over the cmd interface. */
1071         path = c->get_running_config_item(c, "lxc.seccomp.profile");
1072         if (!path) {
1073                 INFO("Failed to retrieve lxc.seccomp.profile");
1074
1075                 path = c->get_running_config_item(c, "lxc.seccomp");
1076                 if (!path)
1077                         return log_info(true, "Failed to retrieve lxc.seccomp");
1078         }
1079
1080         /* Copy the value into the new lxc_conf. */
1081         bret = c->set_config_item(c, "lxc.seccomp.profile", path);
1082         if (!bret)
1083                 return false;
1084
1085         /* Attempt to parse the resulting config. */
1086         ret = lxc_read_seccomp_config(c->lxc_conf);
1087         if (ret < 0)
1088                 return log_error(false, "Failed to retrieve seccomp policy");
1089
1090         return log_info(true, "Retrieved seccomp policy");
1091 }
1092
1093 static bool no_new_privs(struct lxc_container *c, lxc_attach_options_t *options)
1094 {
1095         __do_free char *val = NULL;
1096
1097         /* Remove current setting. */
1098         if (!c->set_config_item(c, "lxc.no_new_privs", ""))
1099                 return log_info(false, "Failed to unset lxc.no_new_privs");
1100
1101         /* Retrieve currently active setting. */
1102         val = c->get_running_config_item(c, "lxc.no_new_privs");
1103         if (!val)
1104                 return log_info(false, "Failed to retrieve lxc.no_new_privs");
1105
1106         /* Set currently active setting. */
1107         return c->set_config_item(c, "lxc.no_new_privs", val);
1108 }
1109
/*
 * Everything do_attach() needs to finish an attach.
 * @ipc_socket      : socket used by do_attach() to receive the LSM label fd
 *                    and to send the seccomp notifier fd
 * @terminal_pts_fd : terminal fd handed to lxc_terminal_prepare_login() when
 *                    LXC_ATTACH_TERMINAL is set
 * @options         : attach options supplied for this attach
 * @ctx             : attach context; released by put_attach_payload()
 * @exec_function   : function do_attach() calls as its last step
 * @exec_payload    : argument passed to @exec_function
 */
struct attach_payload {
        int ipc_socket;
        int terminal_pts_fd;
        lxc_attach_options_t *options;
        struct attach_context *ctx;
        lxc_attach_exec_t exec_function;
        void *exec_payload;
};
1118
1119 static void put_attach_payload(struct attach_payload *p)
1120 {
1121         if (p) {
1122                 close_prot_errno_disarm(p->ipc_socket);
1123                 close_prot_errno_disarm(p->terminal_pts_fd);
1124                 put_attach_context(p->ctx);
1125                 p->ctx = NULL;
1126         }
1127 }
1128
/*
 * Perform the final attach steps described by @ap and then run the user
 * supplied function. Never returns: every path ends in _exit().
 *
 * In order: optionally join the core scheduling domain, remount /proc and
 * /sys, set the personality, drop capabilities, set up the environment,
 * receive the LSM label fd, make stdin the controlling terminal, set or drop
 * supplementary groups, switch to the setup uid/gid, apply the LSM label, set
 * PR_SET_NO_NEW_PRIVS, install the requested stdio fds, prepare the terminal,
 * load seccomp, switch to the target uid/gid, release @ap and call
 * ap->exec_function(ap->exec_payload), whose return value becomes the exit
 * status. Any failure along the way releases @ap and exits with EXIT_FAILURE.
 */
__noreturn static void do_attach(struct attach_payload *ap)
{
        lxc_attach_exec_t attach_function = move_ptr(ap->exec_function);
        void *attach_function_args = move_ptr(ap->exec_payload);
        int fd_lsm, ret;
        lxc_attach_options_t* options = ap->options;
        struct attach_context *ctx = ap->ctx;
        struct lxc_conf *conf = ctx->container->lxc_conf;

        /*
         * We currently artificially restrict core scheduling to be a pid
         * namespace concept since this makes the code easier. We can revisit
         * this no problem and make this work with shared pid namespaces as
         * well. This check here makes sure that the container was created with
         * a separate pid namespace (ctx->ns_clone_flags) and whether we are
         * actually attaching to this pid namespace (options->namespaces).
         */
        if (core_scheduling_cookie_valid(ctx->core_sched_cookie) &&
            (ctx->ns_clone_flags & CLONE_NEWPID) &&
            (options->namespaces & CLONE_NEWPID)) {
                __u64 core_sched_cookie;

                ret = core_scheduling_cookie_share_with(1);
                if (ret < 0) {
                        SYSERROR("Failed to join core scheduling domain of %d",
                                 ctx->init_pid);
                        goto on_error;
                }

                /* Verify that we really ended up with the expected cookie. */
                ret = core_scheduling_cookie_get(getpid(), &core_sched_cookie);
                if (ret || !core_scheduling_cookie_valid(core_sched_cookie) ||
                    (ctx->core_sched_cookie != core_sched_cookie)) {
                        SYSERROR("Invalid core scheduling domain cookie %llu != %llu",
                                 (llu)core_sched_cookie,
                                 (llu)ctx->core_sched_cookie);
                        goto on_error;
                }

                INFO("Joined core scheduling domain of %d with cookie %lld",
                     ctx->init_pid, (llu)core_sched_cookie);
        }

        /* A description of the purpose of this functionality is provided in the
         * lxc-attach(1) manual page. We have to remount here and not in the
         * parent process, otherwise /proc may not properly reflect the new pid
         * namespace.
         */
        if (!(options->namespaces & CLONE_NEWNS) &&
            (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
                ret = lxc_attach_remount_sys_proc();
                if (ret < 0)
                        goto on_error;

                TRACE("Remounted \"/proc\" and \"/sys\"");
        }

        /* Now perform additional attachments. */
        if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
                long new_personality;

                /* Either use the personality stored in the context or the explicit one. */
                if (options->personality == LXC_ATTACH_DETECT_PERSONALITY)
                        new_personality = ctx->personality;
                else
                        new_personality = options->personality;

                if (new_personality != LXC_ARCH_UNCHANGED) {
                        ret = lxc_personality(new_personality);
                        if (ret < 0)
                                goto on_error;

                        TRACE("Set new personality");
                }
        }

        if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
                ret = drop_capabilities(ctx);
                if (ret < 0)
                        goto on_error;

                TRACE("Dropped capabilities");
        }

        /* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
         * if you want this to be a no-op).
         */
        ret = lxc_attach_set_environment(ctx,
                                         options->env_policy,
                                         options->extra_env_vars,
                                         options->extra_keep_env);
        if (ret < 0)
                goto on_error;

        TRACE("Set up environment");

        /*
         * This remark only affects fully unprivileged containers:
         * Receive fd for LSM security module before we set{g,u}id(). The reason
         * is that on set{g,u}id() the kernel will a) make us undumpable and b)
         * we will change our effective uid. This means our effective uid will
         * be different from the effective uid of the process that created us
         * which means that this process no longer has capabilities in our
         * namespace including CAP_SYS_PTRACE. This means we will not be able to
         * read any /proc/<pid> files for the process anymore when /proc is
         * mounted with hidepid={1,2}. So let's get the lsm label fd before the
         * set{g,u}id().
         */
        if (attach_lsm(options) && ctx->lsm_label) {
                if (!sync_wait_fd(ap->ipc_socket, &fd_lsm)) {
                        SYSERROR("Failed to receive lsm label fd");
                        goto on_error;
                }

                TRACE("Received LSM label file descriptor %d from parent", fd_lsm);
        }

        /* Only a terminal on a stdin fd greater than 0 is made the controlling terminal. */
        if (options->stdin_fd > 0 && isatty(options->stdin_fd)) {
                ret = lxc_make_controlling_terminal(options->stdin_fd);
                if (ret < 0)
                        goto on_error;
        }

        /*
         * Install the requested supplementary groups, or drop all of them.
         * An EPERM failure while dropping groups is tolerated.
         */
        if ((options->attach_flags & LXC_ATTACH_SETGROUPS) &&
            options->groups.size > 0) {
                if (!lxc_setgroups(options->groups.list, options->groups.size))
                        goto on_error;
        } else {
                if (!lxc_drop_groups() && errno != EPERM)
                        goto on_error;
        }

        /* When attaching to a user namespace, switch to the setup ids first. */
        if (options->namespaces & CLONE_NEWUSER)
                if (!lxc_switch_uid_gid(ctx->setup_ns_uid, ctx->setup_ns_gid))
                        goto on_error;

        /* fd_lsm is only valid here because the same condition guarded its receipt above. */
        if (attach_lsm(options) && ctx->lsm_label) {
                bool on_exec;

                /* Change into our new LSM profile. */
                on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
                ret = ctx->lsm_ops->process_label_set_at(ctx->lsm_ops, fd_lsm, ctx->lsm_label, on_exec);
                close_prot_errno_disarm(fd_lsm);
                if (ret < 0)
                        goto on_error;

                TRACE("Set %s LSM label to \"%s\"", ctx->lsm_ops->name, ctx->lsm_label);
        }

        if (conf->no_new_privs || (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
                ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
                            prctl_arg(0), prctl_arg(0));
                if (ret < 0)
                        goto on_error;

                TRACE("Set PR_SET_NO_NEW_PRIVS");
        }

        /* The following is done after the communication socket is shut down.
         * That way, all errors that might (though unlikely) occur up until this
         * point will have their messages printed to the original stderr (if
         * logging is so configured) and not the fd the user supplied, if any.
         * NOTE(review): ap->ipc_socket is still used further down to send the
         * seccomp notifier fd, so "shut down" looks outdated - confirm.
         */

        /* Fd handling for stdin, stdout and stderr; ignore errors here, user
         * may want to make sure the fds are closed, for example.
         */
        if (options->stdin_fd >= 0 && options->stdin_fd != STDIN_FILENO)
                if (dup2(options->stdin_fd, STDIN_FILENO) < 0)
                        SYSDEBUG("Failed to replace stdin with %d", options->stdin_fd);

        if (options->stdout_fd >= 0 && options->stdout_fd != STDOUT_FILENO)
                if (dup2(options->stdout_fd, STDOUT_FILENO) < 0)
                        SYSDEBUG("Failed to replace stdout with %d", options->stdout_fd);

        if (options->stderr_fd >= 0 && options->stderr_fd != STDERR_FILENO)
                if (dup2(options->stderr_fd, STDERR_FILENO) < 0)
                        SYSDEBUG("Failed to replace stderr with %d", options->stderr_fd);

        /* Close the old fds, but never one of the standard three. */
        if (options->stdin_fd > STDERR_FILENO)
                close(options->stdin_fd);

        if (options->stdout_fd > STDERR_FILENO)
                close(options->stdout_fd);

        if (options->stderr_fd > STDERR_FILENO)
                close(options->stderr_fd);

        /*
         * Remove the FD_CLOEXEC flag from stdin/stdout/stderr. Unlike the fd
         * replacement above, a failure here aborts the attach.
         */
        for (int fd = STDIN_FILENO; fd <= STDERR_FILENO; fd++) {
                ret = fd_cloexec(fd, false);
                if (ret < 0) {
                        SYSERROR("Failed to clear FD_CLOEXEC from file descriptor %d", fd);
                        goto on_error;
                }
        }

        if (options->attach_flags & LXC_ATTACH_TERMINAL) {
                ret = lxc_terminal_prepare_login(ap->terminal_pts_fd);
                if (ret < 0) {
                        SYSERROR("Failed to prepare terminal file descriptor %d", ap->terminal_pts_fd);
                        goto on_error;
                }

                TRACE("Prepared terminal file descriptor %d", ap->terminal_pts_fd);
        }

        /*
         * Avoid unnecessary syscalls: an id that already equals the setup id
         * is replaced by the invalid marker before the final switch.
         */
        if (ctx->setup_ns_uid == ctx->target_ns_uid)
                ctx->target_ns_uid = LXC_INVALID_UID;

        if (ctx->setup_ns_gid == ctx->target_ns_gid)
                ctx->target_ns_gid = LXC_INVALID_GID;

        /*
         * Make sure that the process's STDIO is correctly owned by the user
         * that we are switching to. A failure is only logged.
         */
        ret = fix_stdio_permissions(ctx->target_ns_uid);
        if (ret)
                INFO("Failed to adjust stdio permissions");

        /* Load the seccomp profile, then hand the notifier fd over the IPC socket. */
        if (conf->seccomp.seccomp) {
                ret = lxc_seccomp_load(conf);
                if (ret < 0)
                        goto on_error;

                TRACE("Loaded seccomp profile");

                ret = lxc_seccomp_send_notifier_fd(&conf->seccomp, ap->ipc_socket);
                if (ret < 0)
                        goto on_error;
                lxc_seccomp_close_notifier_fd(&conf->seccomp);
        }

        /* Final switch to the ids the user function runs with. */
        if (!lxc_switch_uid_gid(ctx->target_ns_uid, ctx->target_ns_gid))
                goto on_error;

        put_attach_payload(ap);

        /* We're done, so we can now do whatever the user intended us to do. */
        _exit(attach_function(attach_function_args));

on_error:
        ERROR("Failed to attach to container");
        put_attach_payload(ap);
        _exit(EXIT_FAILURE);
}
1379
/*
 * Initialize @terminal and create it for the container identified by @name
 * and @lxcpath using @conf.
 *
 * Returns 0 on success, or the result of syserror() if creation fails.
 */
static int lxc_attach_terminal(const char *name, const char *lxcpath, struct lxc_conf *conf,
                               struct lxc_terminal *terminal)
{
        lxc_terminal_init(terminal);

        if (lxc_terminal_create(name, lxcpath, conf, terminal) < 0)
                return syserror("Failed to create terminal");

        return 0;
}
1393
/*
 * Open the mainloop @descr and register the handlers of @terminal with it.
 * If the handlers cannot be added, the mainloop is closed again.
 *
 * Returns 0 on success, or the result of syserror() on failure.
 */
static int lxc_attach_terminal_mainloop_init(struct lxc_terminal *terminal,
                                             struct lxc_async_descr *descr)
{
        if (lxc_mainloop_open(descr) < 0)
                return syserror("Failed to create mainloop");

        if (lxc_terminal_mainloop_add(descr, terminal) >= 0)
                return 0;

        /* Registration failed: undo the open before reporting the error. */
        lxc_mainloop_close(descr);
        return syserror("Failed to add handlers to mainloop");
}
1411
/*
 * Close the ptx file descriptor of @terminal via close_prot_errno_disarm().
 * NOTE(review): the macro presumably also preserves errno and invalidates the
 * stored fd - confirm against its definition.
 */
static inline void lxc_attach_terminal_close_ptx(struct lxc_terminal *terminal)
{
        close_prot_errno_disarm(terminal->ptx);
}
1416
/*
 * Close the pts side of @terminal via close_prot_errno_disarm(). Note that
 * the descriptor is stored in the field named "pty".
 */
static inline void lxc_attach_terminal_close_pts(struct lxc_terminal *terminal)
{
        close_prot_errno_disarm(terminal->pty);
}
1421
/* Close the peer file descriptor of @terminal via close_prot_errno_disarm(). */
static inline void lxc_attach_terminal_close_peer(struct lxc_terminal *terminal)
{
        close_prot_errno_disarm(terminal->peer);
}
1426
/* Close the log file descriptor of @terminal via close_prot_errno_disarm(). */
static inline void lxc_attach_terminal_close_log(struct lxc_terminal *terminal)
{
        close_prot_errno_disarm(terminal->log_fd);
}
1431
1432 int lxc_attach(struct lxc_container *container, lxc_attach_exec_t exec_function,
1433                void *exec_payload, lxc_attach_options_t *options,
1434                pid_t *attached_process)
1435 {
1436         int ret_parent = -1;
1437         struct lxc_async_descr descr = {};
1438         int ret;
1439         char *name, *lxcpath;
1440         int ipc_sockets[2];
1441         pid_t attached_pid, pid, to_cleanup_pid;
1442         struct attach_context *ctx;
1443         struct lxc_terminal terminal;
1444         struct lxc_conf *conf;
1445
1446         if (!container)
1447                 return ret_errno(EINVAL);
1448
1449         if (!lxc_container_get(container))
1450                 return ret_errno(EINVAL);
1451
1452         name = container->name;
1453         lxcpath = container->config_path;
1454
1455         if (!options) {
1456                 options = &attach_static_default_options;
1457                 options->lsm_label = NULL;
1458         }
1459
1460         ctx = alloc_attach_context();
1461         if (!ctx) {
1462                 lxc_container_put(container);
1463                 return syserror_set(-ENOMEM, "Failed to allocate attach context");
1464         }
1465
1466         ret = get_attach_context(ctx, container, options);
1467         if (ret) {
1468                 put_attach_context(ctx);
1469                 return syserror("Failed to get attach context");
1470         }
1471
1472         conf = ctx->container->lxc_conf;
1473
1474         if (!fetch_seccomp(ctx->container, options))
1475                 WARN("Failed to get seccomp policy");
1476
1477         if (!no_new_privs(ctx->container, options))
1478                 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set");
1479
1480         ret = prepare_namespaces(ctx, options);
1481         if (ret) {
1482                 put_attach_context(ctx);
1483                 return syserror("Failed to get namespace file descriptors");
1484         }
1485
1486         if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1487                 ret = lxc_attach_terminal(name, lxcpath, conf, &terminal);
1488                 if (ret < 0) {
1489                         put_attach_context(ctx);
1490                         return syserror("Failed to setup new terminal");
1491                 }
1492
1493                 terminal.log_fd = options->log_fd;
1494         } else {
1495                 lxc_terminal_init(&terminal);
1496         }
1497
1498         /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
1499          * to make sure we don't irritate other threads that want to fork+exec
1500          * away
1501          *
1502          * IMPORTANT: if the initial process is multithreaded and another call
1503          * just fork()s away without exec'ing directly after, the socket fd will
1504          * exist in the forked process from the other thread and any close() in
1505          * our own child process will not really cause the socket to close
1506          * properly, potentially causing the parent to get stuck.
1507          *
1508          * For this reason, while IPC is still active, we have to use shutdown()
1509          * if the child exits prematurely in order to signal that the socket is
1510          * closed and cannot assume that the child exiting will automatically do
1511          * that.
1512          *
1513          * IPC mechanism: (X is receiver)
1514          *   initial process        transient process   attached process
1515          *        X           <---  send pid of
1516          *                          attached proc,
1517          *                          then exit
1518          *    send 0 ------------------------------------>    X
1519          *                                              [do initialization]
1520          *        X  <------------------------------------  send 1
1521          *   [add to cgroup, ...]
1522          *    send 2 ------------------------------------>    X
1523          *                                              [set LXC_ATTACH_NO_NEW_PRIVS]
1524          *        X  <------------------------------------  send 3
1525          *   [open LSM label fd]
1526          *    send 4 ------------------------------------>    X
1527          *                                              [set LSM label]
1528          *   close socket                                 close socket
1529          *                                                run program
1530          */
1531         ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
1532         if (ret < 0) {
1533                 put_attach_context(ctx);
1534                 return syserror("Could not set up required IPC mechanism for attaching");
1535         }
1536
1537         /* Create transient process, two reasons:
1538          *       1. We can't setns() in the child itself, since we want to make
1539          *          sure we are properly attached to the pidns.
1540          *       2. Also, the initial thread has to put the attached process
1541          *          into the cgroup, which we can only do if we didn't already
1542          *          setns() (otherwise, user namespaces will hate us).
1543          */
1544         pid = fork();
1545         if (pid < 0) {
1546                 put_attach_context(ctx);
1547                 return syserror("Failed to create first subprocess");
1548         }
1549
1550         if (pid == 0) {
1551                 char *cwd, *new_cwd;
1552
1553                 /* close unneeded file descriptors */
1554                 close_prot_errno_disarm(ipc_sockets[0]);
1555
1556                 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1557                         lxc_attach_terminal_close_ptx(&terminal);
1558                         lxc_attach_terminal_close_peer(&terminal);
1559                         lxc_attach_terminal_close_log(&terminal);
1560                 }
1561
1562                 /* Wait for the parent to have setup cgroups. */
1563                 if (!sync_wait(ipc_sockets[1], ATTACH_SYNC_CGROUP)) {
1564                         shutdown(ipc_sockets[1], SHUT_RDWR);
1565                         put_attach_context(ctx);
1566                         _exit(EXIT_FAILURE);
1567                 }
1568
1569                 if (!attach_context_security_barrier(ctx)) {
1570                         shutdown(ipc_sockets[1], SHUT_RDWR);
1571                         put_attach_context(ctx);
1572                         _exit(EXIT_FAILURE);
1573                 }
1574
1575                 cwd = getcwd(NULL, 0);
1576
1577                 /*
1578                  * Attach now, create another subprocess later, since pid
1579                  * namespaces only really affect the children of the current
1580                  * process.
1581                  *
1582                  * Note that this is a crucial barrier. We're now moving into
1583                  * the container's context so we need to make sure to not leak
1584                  * anything sensitive. That especially means things such as
1585                  * open file descriptors!
1586                  */
1587                 ret = attach_namespaces(ctx, options);
1588                 if (ret < 0) {
1589                         ERROR("Failed to enter namespaces");
1590                         shutdown(ipc_sockets[1], SHUT_RDWR);
1591                         put_attach_context(ctx);
1592                         _exit(EXIT_FAILURE);
1593                 }
1594
1595                 /* Attach succeeded, try to cwd. */
1596                 if (options->initial_cwd)
1597                         new_cwd = options->initial_cwd;
1598                 else
1599                         new_cwd = cwd;
1600                 if (new_cwd) {
1601                         ret = chdir(new_cwd);
1602                         if (ret < 0)
1603                                 WARN("Could not change directory to \"%s\"", new_cwd);
1604                 }
1605                 free_disarm(cwd);
1606
1607                 /* Create attached process. */
1608                 pid = lxc_raw_clone(CLONE_PARENT, NULL);
1609                 if (pid < 0) {
1610                         SYSERROR("Failed to clone attached process");
1611                         shutdown(ipc_sockets[1], SHUT_RDWR);
1612                         put_attach_context(ctx);
1613                         _exit(EXIT_FAILURE);
1614                 }
1615
1616                 if (pid == 0) {
1617                         struct attach_payload ap = {
1618                                 .ipc_socket             = ipc_sockets[1],
1619                                 .options                = options,
1620                                 .ctx                    = ctx,
1621                                 .terminal_pts_fd        = terminal.pty,
1622                                 .exec_function          = exec_function,
1623                                 .exec_payload           = exec_payload,
1624                         };
1625
1626                         if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1627                                 ret = lxc_terminal_signal_sigmask_safe_blocked(&terminal);
1628                                 if (ret < 0) {
1629                                         SYSERROR("Failed to reset signal mask");
1630                                         _exit(EXIT_FAILURE);
1631                                 }
1632                         }
1633
1634                         /* Does not return. */
1635                         do_attach(&ap);
1636                 }
1637                 TRACE("Attached process %d started initializing", pid);
1638
1639                 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1640                         lxc_attach_terminal_close_pts(&terminal);
1641
1642                 /* Tell grandparent the pid of the newly created child. */
1643                 if (!sync_wake_pid(ipc_sockets[1], pid)) {
1644                         /* If this really happens here, this is very unfortunate, since
1645                          * the parent will not know the pid of the attached process and
1646                          * will not be able to wait for it (and we won't either due to
1647                          * CLONE_PARENT) so the parent won't be able to reap it and the
1648                          * attached process will remain a zombie.
1649                          */
1650                         shutdown(ipc_sockets[1], SHUT_RDWR);
1651                         put_attach_context(ctx);
1652                         _exit(EXIT_FAILURE);
1653                 }
1654
1655                 /* The rest is in the hands of the initial and the attached process. */
1656                 put_attach_context(ctx);
1657                 _exit(EXIT_SUCCESS);
1658         }
1659         TRACE("Transient process %d started initializing", pid);
1660
1661         to_cleanup_pid = pid;
1662
1663         /* close unneeded file descriptors */
1664         close_prot_errno_disarm(ipc_sockets[1]);
1665         put_namespaces(ctx);
1666         if (options->attach_flags & LXC_ATTACH_TERMINAL)
1667                 lxc_attach_terminal_close_pts(&terminal);
1668
1669         /* Attach to cgroup, if requested. */
1670         if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
1671                 /*
1672                  * If this is the unified hierarchy cgroup_attach() is
1673                  * enough.
1674                  */
1675                 ret = cgroup_attach(conf, name, lxcpath, pid);
1676                 if (ret) {
1677                         call_cleaner(cgroup_exit) struct cgroup_ops *cgroup_ops = NULL;
1678                         if (!ERRNO_IS_NOT_SUPPORTED(ret)) {
1679                                 SYSERROR("Failed to attach cgroup");
1680                                 goto on_error;
1681                         }
1682
1683                         cgroup_ops = cgroup_init(conf);
1684                         if (!cgroup_ops)
1685                                 goto on_error;
1686
1687                         if (!cgroup_ops->attach(cgroup_ops, conf, name, lxcpath, pid))
1688                                 goto on_error;
1689                 }
1690
1691                 TRACE("Moved transient process %d into container cgroup", pid);
1692         }
1693
1694         /*
1695          * Close sensitive file descriptors we don't need anymore. Even if
1696          * we're the parent.
1697          */
1698         if (!attach_context_security_barrier(ctx))
1699                 goto on_error;
1700
1701         /* Setup /proc limits */
1702         ret = setup_proc_filesystem(conf, pid);
1703         if (ret < 0)
1704                 goto on_error;
1705
1706         /* Setup resource limits */
1707         ret = setup_resource_limits(conf, pid);
1708         if (ret < 0)
1709                 goto on_error;
1710
1711         if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1712                 ret = lxc_attach_terminal_mainloop_init(&terminal, &descr);
1713                 if (ret < 0)
1714                         goto on_error;
1715
1716                 TRACE("Initialized terminal mainloop");
1717         }
1718
1719         /* Let the child process know to go ahead. */
1720         if (!sync_wake(ipc_sockets[0], ATTACH_SYNC_CGROUP))
1721                 goto close_mainloop;
1722
1723         TRACE("Told transient process to start initializing");
1724
1725         /* Get pid of attached process from transient process. */
1726         if (!sync_wait_pid(ipc_sockets[0], &attached_pid))
1727                 goto close_mainloop;
1728
1729         TRACE("Received pid %d of attached process in parent pid namespace", attached_pid);
1730
1731         /* Ignore SIGINT (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
1732         if (options->stdin_fd == STDIN_FILENO) {
1733                 signal(SIGINT, SIG_IGN);
1734                 signal(SIGQUIT, SIG_IGN);
1735         }
1736
1737         /* Reap transient process. */
1738         ret = wait_for_pid(pid);
1739         if (ret < 0)
1740                 goto close_mainloop;
1741
1742         TRACE("Transient process %d exited", pid);
1743
1744         /* We will always have to reap the attached process now. */
1745         to_cleanup_pid = attached_pid;
1746
1747         /* Open LSM fd and send it to child. */
1748         if (attach_lsm(options) && ctx->lsm_label) {
1749                 __do_close int fd_lsm = -EBADF;
1750                 bool on_exec;
1751
1752                 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
1753                 fd_lsm = ctx->lsm_ops->process_label_fd_get(ctx->lsm_ops, attached_pid, on_exec);
1754                 if (fd_lsm < 0)
1755                         goto close_mainloop;
1756
1757                 TRACE("Opened LSM label file descriptor %d", fd_lsm);
1758
1759                 /* Send child fd of the LSM security module to write to. */
1760                 if (!sync_wake_fd(ipc_sockets[0], fd_lsm)) {
1761                         SYSERROR("Failed to send lsm label fd");
1762                         goto close_mainloop;
1763                 }
1764
1765                 TRACE("Sent LSM label file descriptor %d to child", fd_lsm);
1766         }
1767
1768         if (conf->seccomp.seccomp) {
1769                 ret = lxc_seccomp_recv_notifier_fd(&conf->seccomp, ipc_sockets[0]);
1770                 if (ret < 0)
1771                         goto close_mainloop;
1772
1773                 ret = lxc_seccomp_add_notifier(name, lxcpath, &conf->seccomp);
1774                 if (ret < 0)
1775                         goto close_mainloop;
1776         }
1777
1778         /* We're done, the child process should now execute whatever it
1779          * is that the user requested. The parent can now track it with
1780          * waitpid() or similar.
1781          */
1782
1783         *attached_process = attached_pid;
1784
1785         /* Now shut down communication with child, we're done. */
1786         shutdown(ipc_sockets[0], SHUT_RDWR);
1787         close_prot_errno_disarm(ipc_sockets[0]);
1788
1789         ret_parent = 0;
1790         to_cleanup_pid = -1;
1791
1792         if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1793                 ret = lxc_mainloop(&descr, -1);
1794                 if (ret < 0) {
1795                         ret_parent = -1;
1796                         to_cleanup_pid = attached_pid;
1797                 }
1798         }
1799
1800 close_mainloop:
1801         if (options->attach_flags & LXC_ATTACH_TERMINAL)
1802                 lxc_mainloop_close(&descr);
1803
1804 on_error:
1805         if (ipc_sockets[0] >= 0) {
1806                 shutdown(ipc_sockets[0], SHUT_RDWR);
1807                 close_prot_errno_disarm(ipc_sockets[0]);
1808         }
1809
1810         if (to_cleanup_pid > 0)
1811                 (void)wait_for_pid(to_cleanup_pid);
1812
1813         if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1814                 lxc_terminal_delete(&terminal);
1815                 lxc_terminal_conf_free(&terminal);
1816         }
1817
1818         put_attach_context(ctx);
1819         return ret_parent;
1820 }
1821
1822 int lxc_attach_run_command(void *payload)
1823 {
1824         int ret = -1;
1825         lxc_attach_command_t *cmd = payload;
1826
1827         ret = execvp(cmd->program, cmd->argv);
1828         if (ret < 0) {
1829                 switch (errno) {
1830                 case ENOEXEC:
1831                         ret = 126;
1832                         break;
1833                 case ENOENT:
1834                         ret = 127;
1835                         break;
1836                 }
1837         }
1838
1839         return syserror_ret(ret, "Failed to exec \"%s\"", cmd->program);
1840 }
1841
1842 int lxc_attach_run_shell(void* payload)
1843 {
1844         __do_free char *buf = NULL;
1845         uid_t uid;
1846         struct passwd pwent;
1847         struct passwd *pwentp = NULL;
1848         char *user_shell;
1849         ssize_t bufsize;
1850         int ret;
1851
1852         /* Ignore payload parameter. */
1853         (void)payload;
1854
1855         uid = getuid();
1856
1857         bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
1858         if (bufsize < 0)
1859                 bufsize = 1024;
1860
1861         buf = malloc(bufsize);
1862         if (buf) {
1863                 ret = getpwuid_r(uid, &pwent, buf, bufsize, &pwentp);
1864                 if (!pwentp) {
1865                         if (ret == 0)
1866                                 WARN("Could not find matched password record");
1867
1868                         WARN("Failed to get password record - %u", uid);
1869                 }
1870         }
1871
1872         /* This probably happens because of incompatible nss implementations in
1873          * host and container (remember, this code is still using the host's
1874          * glibc but our mount namespace is in the container) we may try to get
1875          * the information by spawning a [getent passwd uid] process and parsing
1876          * the result.
1877          */
1878         if (!pwentp)
1879                 user_shell = lxc_attach_getpwshell(uid);
1880         else
1881                 user_shell = pwent.pw_shell;
1882
1883         if (user_shell)
1884                 execlp(user_shell, user_shell, (char *)NULL);
1885
1886         /* Executed if either no passwd entry or execvp fails, we will fall back
1887          * on /bin/sh as a default shell.
1888          */
1889         execlp("/bin/sh", "/bin/sh", (char *)NULL);
1890
1891         SYSERROR("Failed to execute shell");
1892         if (!pwentp)
1893                 free(user_shell);
1894
1895         return -1;
1896 }