]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/attach.c
macro: s/rexit()/_exit()/g
[mirror_lxc.git] / src / lxc / attach.c
CommitLineData
e0732705
CS
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
e0732705
CS
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e0732705
CS
22 */
23
24#define _GNU_SOURCE
e0732705
CS
25#include <errno.h>
26#include <fcntl.h>
e5adb2b5 27#include <termios.h>
c476bdce 28#include <grp.h>
6f4f1937
CB
29#include <pwd.h>
30#include <signal.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#include <unistd.h>
35#include <linux/unistd.h>
36#include <sys/mount.h>
e0732705
CS
37#include <sys/param.h>
38#include <sys/prctl.h>
5ec27989 39#include <sys/socket.h>
1ba0013f 40#include <sys/syscall.h>
905022f7 41#include <sys/wait.h>
6f4f1937
CB
42
43#include <lxc/lxccontainer.h>
e0732705 44
955e2a02 45#ifndef HAVE_DECL_PR_CAPBSET_DROP
e0732705
CS
46#define PR_CAPBSET_DROP 24
47#endif
48
955e2a02
CB
49#ifndef HAVE_DECL_PR_SET_NO_NEW_PRIVS
50#define PR_SET_NO_NEW_PRIVS 38
51#endif
52
53#ifndef HAVE_DECL_PR_GET_NO_NEW_PRIVS
54#define PR_GET_NO_NEW_PRIVS 39
55#endif
56
81f466d0 57#include "af_unix.h"
e0732705
CS
58#include "attach.h"
59#include "caps.h"
9c4693b8 60#include "cgroup.h"
6f4f1937 61#include "commands.h"
2c4ea790 62#include "conf.h"
6f4f1937 63#include "config.h"
9b8e3c96 64#include "confile.h"
6f4f1937
CB
65#include "log.h"
66#include "lsm/lsm.h"
67#include "lxclock.h"
68#include "lxcseccomp.h"
ba2be1a8 69#include "mainloop.h"
6f4f1937 70#include "namespace.h"
0ed9b1bc 71#include "terminal.h"
6f4f1937 72#include "utils.h"
9c4693b8
CS
73
74#if HAVE_SYS_PERSONALITY_H
75#include <sys/personality.h>
76#endif
e0732705 77
a3da2f3b 78#ifndef SOCK_CLOEXEC
6f4f1937 79#define SOCK_CLOEXEC 02000000
a3da2f3b
SG
80#endif
81
d6a3c917
SG
82#ifndef MS_REC
83#define MS_REC 16384
84#endif
85
86#ifndef MS_SLAVE
6f4f1937 87#define MS_SLAVE (1 << 19)
d6a3c917
SG
88#endif
89
ac2cecc4 90lxc_log_define(attach, lxc);
e0732705 91
8ce83369 92/* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
eab15c1e 93#define __PROC_STATUS_LEN (5 + (LXC_NUMSTRLEN64) + 7 + 1)
74a3920a 94static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
e0732705 95{
6f4f1937
CB
96 int ret;
97 bool found;
e0732705 98 FILE *proc_file;
8ce83369 99 char proc_fn[__PROC_STATUS_LEN];
e0732705 100 size_t line_bufsz = 0;
6f4f1937 101 char *line = NULL;
8ce83369 102 struct lxc_proc_context_info *info = NULL;
e0732705 103
8ce83369
CB
104 /* Read capabilities. */
105 ret = snprintf(proc_fn, __PROC_STATUS_LEN, "/proc/%d/status", pid);
106 if (ret < 0 || ret >= __PROC_STATUS_LEN)
107 goto on_error;
e0732705
CS
108
109 proc_file = fopen(proc_fn, "r");
110 if (!proc_file) {
ea918412 111 SYSERROR("Could not open %s", proc_fn);
8ce83369 112 goto on_error;
e0732705
CS
113 }
114
8ce83369
CB
115 info = calloc(1, sizeof(*info));
116 if (!info) {
ea918412 117 SYSERROR("Could not allocate memory");
17ac5301 118 fclose(proc_file);
8ce83369
CB
119 return NULL;
120 }
121
122 found = false;
ea918412 123
e0732705
CS
124 while (getline(&line, &line_bufsz, proc_file) != -1) {
125 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
8ce83369
CB
126 if (ret != EOF && ret == 1) {
127 found = true;
e0732705
CS
128 break;
129 }
130 }
131
f10fad2f 132 free(line);
e0732705
CS
133 fclose(proc_file);
134
135 if (!found) {
ea918412 136 ERROR("Could not read capability bounding set from %s",
137 proc_fn);
8ce83369 138 goto on_error;
e0732705
CS
139 }
140
fe4de9a6 141 info->lsm_label = lsm_process_label_get(pid);
877f3a04
CB
142 info->ns_inherited = 0;
143 memset(info->ns_fd, -1, sizeof(int) * LXC_NS_MAX);
e0732705 144
e0732705
CS
145 return info;
146
8ce83369 147on_error:
460a1cf0 148 free(info);
e0732705
CS
149 return NULL;
150}
151
877f3a04
CB
152static inline void lxc_proc_close_ns_fd(struct lxc_proc_context_info *ctx)
153{
154 int i;
155
156 for (i = 0; i < LXC_NS_MAX; i++) {
157 if (ctx->ns_fd[i] < 0)
158 continue;
ea918412 159
877f3a04
CB
160 close(ctx->ns_fd[i]);
161 ctx->ns_fd[i] = -EBADF;
162 }
163}
164
fe4de9a6
DE
165static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
166{
f10fad2f 167 free(ctx->lsm_label);
08ea9270
CB
168 ctx->lsm_label = NULL;
169
170 if (ctx->container) {
2c4ea790 171 lxc_container_put(ctx->container);
08ea9270
CB
172 ctx->container = NULL;
173 }
174
877f3a04 175 lxc_proc_close_ns_fd(ctx);
fe4de9a6
DE
176 free(ctx);
177}
178
299d1198
CB
/**
 * in_same_namespace - Compare the namespace membership of two processes.
 * @pid1 - PID of the first process.
 * @pid2 - PID of the second process.
 * @ns   - Name of the namespace to compare, as it appears under
 *         /proc/<pid>/ns/.
 *
 * Returns an open fd referring to the namespace of @pid2 when the two
 * processes live in different namespaces. Returns -EINVAL when they share
 * the namespace, or when opening the namespace of @pid1 fails with ENOENT
 * (the kernel does not know this namespace; not treated as an error).
 * Returns -1 on any other failure. errno is preserved across the cleanup.
 */
static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
{
	struct stat st_first, st_second;
	int fd_first, fd_second;
	int result = -1;
	int err;

	fd_first = lxc_preserve_ns(pid1, ns);
	if (fd_first < 0)
		return errno == ENOENT ? -EINVAL : -1;

	fd_second = lxc_preserve_ns(pid2, ns);
	if (fd_second >= 0 &&
	    fstat(fd_first, &st_first) == 0 &&
	    fstat(fd_second, &st_second) == 0) {
		if (st_first.st_dev == st_second.st_dev &&
		    st_first.st_ino == st_second.st_ino) {
			/* Same device and inode: same namespace. */
			result = -EINVAL;
		} else {
			/* Different namespaces: hand the fd to the caller. */
			result = fd_second;
			fd_second = -1;
		}
	}

	err = errno;

	close(fd_first);
	if (fd_second >= 0)
		close(fd_second);

	errno = err;
	return result;
}
241
877f3a04 242static int lxc_attach_to_ns(pid_t pid, struct lxc_proc_context_info *ctx)
99d50954 243{
21d0acc2 244 int i, ret;
99d50954 245
26818618 246 for (i = 0; i < LXC_NS_MAX; i++) {
877f3a04 247 if (ctx->ns_fd[i] < 0)
26818618
CB
248 continue;
249
21d0acc2 250 ret = setns(ctx->ns_fd[i], ns_info[i].clone_flag);
251 if (ret < 0) {
299d1198 252 SYSERROR("Failed to attach to %s namespace of %d",
ea918412 253 ns_info[i].proc_name, pid);
99d50954
CS
254 return -1;
255 }
256
299d1198 257 DEBUG("Attached to %s namespace of %d", ns_info[i].proc_name, pid);
99d50954
CS
258 }
259
260 return 0;
261}
262
74a3920a 263static int lxc_attach_remount_sys_proc(void)
7a0b0b56
CS
264{
265 int ret;
266
267 ret = unshare(CLONE_NEWNS);
268 if (ret < 0) {
ea918412 269 SYSERROR("Failed to unshare mount namespace");
7a0b0b56
CS
270 return -1;
271 }
272
2c6f3fc9 273 if (detect_shared_rootfs()) {
6f4f1937 274 if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) {
ea918412 275 SYSERROR("Failed to make / rslave");
2c6f3fc9
SH
276 ERROR("Continuing...");
277 }
278 }
279
8ce83369 280 /* Assume /proc is always mounted, so remount it. */
7a0b0b56
CS
281 ret = umount2("/proc", MNT_DETACH);
282 if (ret < 0) {
ea918412 283 SYSERROR("Failed to unmount /proc");
7a0b0b56
CS
284 return -1;
285 }
286
287 ret = mount("none", "/proc", "proc", 0, NULL);
288 if (ret < 0) {
ea918412 289 SYSERROR("Failed to remount /proc");
7a0b0b56
CS
290 return -1;
291 }
292
8ce83369
CB
293 /* Try to umount /sys. If it's not a mount point, we'll get EINVAL, then
294 * we ignore it because it may not have been mounted in the first place.
7a0b0b56
CS
295 */
296 ret = umount2("/sys", MNT_DETACH);
297 if (ret < 0 && errno != EINVAL) {
ea918412 298 SYSERROR("Failed to unmount /sys");
7a0b0b56
CS
299 return -1;
300 } else if (ret == 0) {
8ce83369 301 /* Remount it. */
7a0b0b56
CS
302 ret = mount("none", "/sys", "sysfs", 0, NULL);
303 if (ret < 0) {
ea918412 304 SYSERROR("Failed to remount /sys");
7a0b0b56
CS
305 return -1;
306 }
307 }
308
309 return 0;
310}
311
74a3920a 312static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
e0732705 313{
6f4f1937 314 int cap, last_cap;
e0732705 315
6f4f1937 316 last_cap = lxc_caps_last_cap();
e0732705
CS
317 for (cap = 0; cap <= last_cap; cap++) {
318 if (ctx->capability_mask & (1LL << cap))
319 continue;
320
b81689a1
CB
321 if (prctl(PR_CAPBSET_DROP, prctl_arg(cap), prctl_arg(0),
322 prctl_arg(0), prctl_arg(0))) {
94ac256f 323 SYSERROR("Failed to drop capability %d", cap);
e0732705
CS
324 return -1;
325 }
ea918412 326
94ac256f 327 TRACE("Dropped capability %d", cap);
e0732705
CS
328 }
329
330 return 0;
331}
905022f7 332
7385273f 333static int lxc_attach_set_environment(struct lxc_proc_context_info *init_ctx,
334 enum lxc_attach_env_policy_t policy,
6f4f1937 335 char **extra_env, char **extra_keep)
b3a39ba6 336{
3d55242a 337 int ret;
7385273f 338 struct lxc_list *iterator;
339
799f96fd 340 if (policy == LXC_ATTACH_CLEAR_ENV) {
3d5e9f48 341 int path_kept = 0;
6f4f1937 342 char **extra_keep_store = NULL;
3d5e9f48
CS
343
344 if (extra_keep) {
345 size_t count, i;
346
3d55242a
CB
347 for (count = 0; extra_keep[count]; count++)
348 ;
3d5e9f48
CS
349
350 extra_keep_store = calloc(count, sizeof(char *));
3d55242a 351 if (!extra_keep_store)
3d5e9f48 352 return -1;
3d55242a 353
3d5e9f48
CS
354 for (i = 0; i < count; i++) {
355 char *v = getenv(extra_keep[i]);
356 if (v) {
357 extra_keep_store[i] = strdup(v);
358 if (!extra_keep_store[i]) {
3d5e9f48
CS
359 while (i > 0)
360 free(extra_keep_store[--i]);
ea918412 361
3d5e9f48
CS
362 free(extra_keep_store);
363 return -1;
364 }
3d55242a 365
3d5e9f48
CS
366 if (strcmp(extra_keep[i], "PATH") == 0)
367 path_kept = 1;
368 }
3d5e9f48
CS
369 }
370 }
371
799f96fd 372 if (clearenv()) {
a9cab7e3 373 if (extra_keep_store) {
3d55242a
CB
374 char **p;
375
a9cab7e3
CS
376 for (p = extra_keep_store; *p; p++)
377 free(*p);
3d55242a 378
a9cab7e3
CS
379 free(extra_keep_store);
380 }
3d55242a 381
ea918412 382 ERROR("Failed to clear environment");
3d5e9f48
CS
383 return -1;
384 }
385
386 if (extra_keep_store) {
387 size_t i;
6f4f1937 388
3d5e9f48 389 for (i = 0; extra_keep[i]; i++) {
acd4922e 390 if (extra_keep_store[i]) {
3d55242a
CB
391 ret = setenv(extra_keep[i], extra_keep_store[i], 1);
392 if (ret < 0)
a24c5678 393 SYSWARN("Failed to set environment variable");
acd4922e 394 }
ea918412 395
3d5e9f48
CS
396 free(extra_keep_store[i]);
397 }
ea918412 398
3d5e9f48
CS
399 free(extra_keep_store);
400 }
401
8ce83369
CB
402 /* Always set a default path; shells and execlp tend to be fine
403 * without it, but there is a disturbing number of C programs
404 * out there that just assume that getenv("PATH") is never NULL
405 * and then die a painful segfault death.
406 */
3d55242a
CB
407 if (!path_kept) {
408 ret = setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
409 if (ret < 0)
a24c5678 410 SYSWARN("Failed to set environment variable");
3d55242a 411 }
b3a39ba6
DW
412 }
413
3d55242a
CB
414 ret = putenv("container=lxc");
415 if (ret < 0) {
a24c5678 416 SYSWARN("Failed to set environment variable");
b3a39ba6
DW
417 return -1;
418 }
419
7385273f 420 /* Set container environment variables.*/
421 if (init_ctx && init_ctx->container && init_ctx->container->lxc_conf) {
422 lxc_list_for_each(iterator, &init_ctx->container->lxc_conf->environment) {
3d55242a
CB
423 char *env_tmp;
424
425 env_tmp = strdup((char *)iterator->elem);
426 if (!env_tmp)
7385273f 427 return -1;
7385273f 428
3d55242a
CB
429 ret = putenv(env_tmp);
430 if (ret < 0) {
431 SYSERROR("Failed to set environment variable: %s", (char *)iterator->elem);
7385273f 432 return -1;
433 }
434 }
435 }
436
8ce83369 437 /* Set extra environment variables. */
3d5e9f48
CS
438 if (extra_env) {
439 for (; *extra_env; extra_env++) {
3d55242a 440 char *p;
ea918412 441
8ce83369
CB
442 /* We just assume the user knows what they are doing, so
443 * we don't do any checks.
444 */
3d55242a
CB
445 p = strdup(*extra_env);
446 if (!p)
3d5e9f48 447 return -1;
3d55242a
CB
448
449 ret = putenv(p);
450 if (ret < 0)
a24c5678 451 SYSWARN("Failed to set environment variable");
3d5e9f48
CS
452 }
453 }
454
b3a39ba6
DW
455 return 0;
456}
457
74a3920a 458static char *lxc_attach_getpwshell(uid_t uid)
905022f7 459{
6f4f1937 460 int fd, ret;
905022f7
CS
461 pid_t pid;
462 int pipes[2];
3fa23ac3
CB
463 FILE *pipe_f;
464 bool found = false;
465 size_t line_bufsz = 0;
466 char *line = NULL, *result = NULL;
905022f7 467
8ce83369
CB
468 /* We need to fork off a process that runs the getent program, and we
469 * need to capture its output, so we use a pipe for that purpose.
905022f7 470 */
3fa23ac3 471 ret = pipe2(pipes, O_CLOEXEC);
905022f7
CS
472 if (ret < 0)
473 return NULL;
474
475 pid = fork();
476 if (pid < 0) {
477 close(pipes[0]);
478 close(pipes[1]);
479 return NULL;
480 }
481
3fa23ac3 482 if (!pid) {
905022f7
CS
483 char uid_buf[32];
484 char *arguments[] = {
485 "getent",
486 "passwd",
487 uid_buf,
488 NULL
489 };
490
491 close(pipes[0]);
492
8ce83369 493 /* We want to capture stdout. */
3fa23ac3 494 ret = dup2(pipes[1], STDOUT_FILENO);
905022f7 495 close(pipes[1]);
3fa23ac3 496 if (ret < 0)
ea918412 497 _exit(EXIT_FAILURE);
905022f7 498
8ce83369
CB
499 /* Get rid of stdin/stderr, so we try to associate it with
500 * /dev/null.
905022f7 501 */
3fa23ac3 502 fd = open_devnull();
905022f7 503 if (fd < 0) {
3fa23ac3
CB
504 close(STDIN_FILENO);
505 close(STDERR_FILENO);
905022f7 506 } else {
3fa23ac3
CB
507 (void)dup3(fd, STDIN_FILENO, O_CLOEXEC);
508 (void)dup3(fd, STDOUT_FILENO, O_CLOEXEC);
905022f7
CS
509 close(fd);
510 }
511
8ce83369 512 /* Finish argument list. */
3fa23ac3
CB
513 ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long)uid);
514 if (ret <= 0 || ret >= sizeof(uid_buf))
ea918412 515 _exit(EXIT_FAILURE);
905022f7 516
8ce83369 517 /* Try to run getent program. */
3fa23ac3 518 (void)execvp("getent", arguments);
ea918412 519 _exit(EXIT_FAILURE);
905022f7 520 }
3fa23ac3
CB
521
522 close(pipes[1]);
523
524 pipe_f = fdopen(pipes[0], "r");
525 while (getline(&line, &line_bufsz, pipe_f) != -1) {
526 int i;
527 long value;
528 char *token;
529 char *endptr = NULL, *saveptr = NULL;
530
531 /* If we already found something, just continue to read
532 * until the pipe doesn't deliver any more data, but
533 * don't modify the existing data structure.
534 */
535 if (found)
536 continue;
537
18d4ffde 538 if (!line)
539 continue;
540
3fa23ac3
CB
541 /* Trim line on the right hand side. */
542 for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
543 line[i - 1] = '\0';
544
545 /* Split into tokens: first: user name. */
546 token = strtok_r(line, ":", &saveptr);
547 if (!token)
548 continue;
549
550 /* next: dummy password field */
551 token = strtok_r(NULL, ":", &saveptr);
552 if (!token)
553 continue;
554
555 /* next: user id */
556 token = strtok_r(NULL, ":", &saveptr);
557 value = token ? strtol(token, &endptr, 10) : 0;
558 if (!token || !endptr || *endptr || value == LONG_MIN ||
ea918412 559 value == LONG_MAX)
3fa23ac3
CB
560 continue;
561
562 /* dummy sanity check: user id matches */
563 if ((uid_t)value != uid)
564 continue;
565
566 /* skip fields: gid, gecos, dir, go to next field 'shell' */
567 for (i = 0; i < 4; i++) {
568 token = strtok_r(NULL, ":", &saveptr);
569 if (!token)
570 continue;
571 }
ea918412 572
3fa23ac3
CB
573 if (!token)
574 continue;
ea918412 575
3fa23ac3
CB
576 free(result);
577 result = strdup(token);
578
579 /* Sanity check that there are no fields after that. */
580 token = strtok_r(NULL, ":", &saveptr);
581 if (token)
582 continue;
583
584 found = true;
585 }
ea918412 586
3fa23ac3
CB
587 free(line);
588 fclose(pipe_f);
589
590 ret = wait_for_pid(pid);
591 if (ret < 0) {
592 free(result);
593 return NULL;
594 }
595
596 if (!found) {
597 free(result);
598 return NULL;
599 }
600
601 return result;
905022f7 602}
cb3e61fa 603
6f4f1937 604static void lxc_attach_get_init_uidgid(uid_t *init_uid, gid_t *init_gid)
cb3e61fa
CS
605{
606 FILE *proc_file;
8ce83369
CB
607 char proc_fn[__PROC_STATUS_LEN];
608 int ret;
cb3e61fa
CS
609 char *line = NULL;
610 size_t line_bufsz = 0;
cb3e61fa
CS
611 long value = -1;
612 uid_t uid = (uid_t)-1;
613 gid_t gid = (gid_t)-1;
614
7fb45c93
CB
615 ret = snprintf(proc_fn, __PROC_STATUS_LEN, "/proc/%d/status", 1);
616 if (ret < 0 || ret >= __PROC_STATUS_LEN)
617 return;
cb3e61fa
CS
618
619 proc_file = fopen(proc_fn, "r");
620 if (!proc_file)
621 return;
622
623 while (getline(&line, &line_bufsz, proc_file) != -1) {
8ce83369
CB
624 /* Format is: real, effective, saved set user, fs we only care
625 * about real uid.
cb3e61fa
CS
626 */
627 ret = sscanf(line, "Uid: %ld", &value);
8ce83369 628 if (ret != EOF && ret == 1) {
6f4f1937 629 uid = (uid_t)value;
cb3e61fa
CS
630 } else {
631 ret = sscanf(line, "Gid: %ld", &value);
8ce83369 632 if (ret != EOF && ret == 1)
6f4f1937 633 gid = (gid_t)value;
cb3e61fa 634 }
ea918412 635
cb3e61fa
CS
636 if (uid != (uid_t)-1 && gid != (gid_t)-1)
637 break;
638 }
639
640 fclose(proc_file);
641 free(line);
642
8ce83369 643 /* Only override arguments if we found something. */
cb3e61fa
CS
644 if (uid != (uid_t)-1)
645 *init_uid = uid;
ea918412 646
cb3e61fa
CS
647 if (gid != (gid_t)-1)
648 *init_gid = gid;
649
650 /* TODO: we should also parse supplementary groups and use
8ce83369
CB
651 * setgroups() to set them.
652 */
cb3e61fa 653}
9c4693b8 654
8ce83369 655/* Define default options if no options are supplied by the user. */
9c4693b8
CS
656static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
657
d4db3d14 658static bool fetch_seccomp(struct lxc_container *c, lxc_attach_options_t *options)
2c4ea790 659{
d4db3d14
CB
660 int ret;
661 bool bret;
bd7b4e28 662 char *path;
2eef2bda 663
6f4f1937
CB
664 if (!(options->namespaces & CLONE_NEWNS) ||
665 !(options->attach_flags & LXC_ATTACH_LSM)) {
bd4307f0
CB
666 free(c->lxc_conf->seccomp);
667 c->lxc_conf->seccomp = NULL;
2c4ea790 668 return true;
bd4307f0 669 }
bd7b4e28 670
2e812c16 671 /* Remove current setting. */
d4db3d14 672 if (!c->set_config_item(c, "lxc.seccomp.profile", "") &&
ea918412 673 !c->set_config_item(c, "lxc.seccomp", ""))
2c4ea790 674 return false;
bd7b4e28 675
8ce83369 676 /* Fetch the current profile path over the cmd interface. */
0b427da0 677 path = c->get_running_config_item(c, "lxc.seccomp.profile");
bd7b4e28 678 if (!path) {
d4db3d14 679 INFO("Failed to retrieve lxc.seccomp.profile");
ea918412 680
0b427da0 681 path = c->get_running_config_item(c, "lxc.seccomp");
d4db3d14
CB
682 if (!path) {
683 INFO("Failed to retrieve lxc.seccomp");
684 return true;
685 }
bd7b4e28
SG
686 }
687
8ce83369 688 /* Copy the value into the new lxc_conf. */
d4db3d14 689 bret = c->set_config_item(c, "lxc.seccomp.profile", path);
bd7b4e28 690 free(path);
d4db3d14
CB
691 if (!bret)
692 return false;
bd7b4e28 693
8ce83369 694 /* Attempt to parse the resulting config. */
d4db3d14
CB
695 ret = lxc_read_seccomp_config(c->lxc_conf);
696 if (ret < 0) {
697 ERROR("Failed to retrieve seccomp policy");
2c4ea790
SH
698 return false;
699 }
700
d4db3d14 701 INFO("Retrieved seccomp policy");
2e812c16
CB
702 return true;
703}
704
6f4f1937 705static bool no_new_privs(struct lxc_container *c, lxc_attach_options_t *options)
2e812c16 706{
bcbef733 707 bool bret;
2e812c16
CB
708 char *val;
709
2e812c16 710 /* Remove current setting. */
bcbef733
CB
711 if (!c->set_config_item(c, "lxc.no_new_privs", "")) {
712 INFO("Failed to unset lxc.no_new_privs");
2e812c16 713 return false;
bcbef733 714 }
2e812c16
CB
715
716 /* Retrieve currently active setting. */
717 val = c->get_running_config_item(c, "lxc.no_new_privs");
718 if (!val) {
bcbef733 719 INFO("Failed to retrieve lxc.no_new_privs");
2e812c16
CB
720 return false;
721 }
722
723 /* Set currently active setting. */
bcbef733 724 bret = c->set_config_item(c, "lxc.no_new_privs", val);
2e812c16 725 free(val);
bcbef733 726 return bret;
2c4ea790
SH
727}
728
9b8e3c96
SH
/* Query "lxc.arch" of the container @name under @lxcpath and convert it
 * with lxc_config_parse_arch().
 *
 * Returns the converted value, or -1 when the item cannot be retrieved.
 */
static signed long get_personality(const char *name, const char *lxcpath)
{
	signed long persona;
	char *arch = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);

	if (arch == NULL)
		return -1;

	persona = lxc_config_parse_arch(arch);
	free(arch);

	return persona;
}
743
a998454a
CB
/* Everything attach_child_main() needs in order to set up and run the
 * attached process.
 */
struct attach_clone_payload {
	/* Socket on which the child receives the LSM label fd; it is shut
	 * down and closed before the user function runs.
	 */
	int ipc_socket;
	/* Handed to lxc_terminal_prepare_login() when LXC_ATTACH_TERMINAL is
	 * requested.
	 */
	int terminal_slave_fd;
	/* Attach options controlling which setup steps are performed. */
	lxc_attach_options_t *options;
	/* Process context (capability mask, LSM label, container, ...);
	 * released by the child once setup is complete.
	 * NOTE(review): presumably the context of the container's init
	 * process - confirm against lxc_attach().
	 */
	struct lxc_proc_context_info *init_ctx;
	/* Function to run once setup is done; its return value becomes the
	 * exit status of the child.
	 */
	lxc_attach_exec_t exec_function;
	/* Opaque argument passed to exec_function. */
	void *exec_payload;
};
752
ba2be1a8
CB
753static void lxc_put_attach_clone_payload(struct attach_clone_payload *p)
754{
755 if (p->ipc_socket >= 0) {
756 shutdown(p->ipc_socket, SHUT_RDWR);
757 close(p->ipc_socket);
758 p->ipc_socket = -EBADF;
759 }
760
9e84479f
CB
761 if (p->terminal_slave_fd >= 0) {
762 close(p->terminal_slave_fd);
763 p->terminal_slave_fd = -EBADF;
ba2be1a8
CB
764 }
765
b21da190 766 if (p->init_ctx) {
ba2be1a8 767 lxc_proc_put_context_info(p->init_ctx);
b21da190
CB
768 p->init_ctx = NULL;
769 }
ba2be1a8
CB
770}
771
a998454a
CB
/* Entry point of the attached child process.
 *
 * Performs, in this order and depending on @payload->options: remount of
 * /proc and /sys, personality change, capability drop, environment setup,
 * reception of the LSM label fd, controlling terminal setup, uid/gid switch,
 * PR_SET_NO_NEW_PRIVS, LSM label change, seccomp load, standard fd
 * redirection and terminal login preparation.
 *
 * This function does not return: it either calls _exit() with the return
 * value of @payload->exec_function, or, if any step fails, releases
 * @payload and calls _exit(EXIT_FAILURE).
 */
static int attach_child_main(struct attach_clone_payload *payload)
{
	int fd, lsm_fd, ret;
	uid_t new_uid;
	gid_t new_gid;
	lxc_attach_options_t* options = payload->options;
	struct lxc_proc_context_info* init_ctx = payload->init_ctx;
	/* An LSM label is only applied when the mount namespace is entered,
	 * LSM handling was requested and a label is available.
	 */
	bool needs_lsm = (options->namespaces & CLONE_NEWNS) &&
			 (options->attach_flags & LXC_ATTACH_LSM) &&
			 init_ctx->lsm_label;

	/* A description of the purpose of this functionality is provided in the
	 * lxc-attach(1) manual page. We have to remount here and not in the
	 * parent process, otherwise /proc may not properly reflect the new pid
	 * namespace.
	 */
	if (!(options->namespaces & CLONE_NEWNS) &&
	    (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
		ret = lxc_attach_remount_sys_proc();
		if (ret < 0)
			goto on_error;

		TRACE("Remounted \"/proc\" and \"/sys\"");
	}

/* Now perform additional attachments. */
#if HAVE_SYS_PERSONALITY_H
	if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
		long new_personality;

		/* A negative personality in the options means: use the one
		 * stored in the context.
		 */
		if (options->personality < 0)
			new_personality = init_ctx->personality;
		else
			new_personality = options->personality;

		ret = personality(new_personality);
		if (ret < 0)
			goto on_error;

		TRACE("Set new personality");
	}
#endif

	if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
		ret = lxc_attach_drop_privs(init_ctx);
		if (ret < 0)
			goto on_error;

		TRACE("Dropped capabilities");
	}

	/* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
	 * if you want this to be a no-op).
	 */
	ret = lxc_attach_set_environment(init_ctx,
					 options->env_policy,
					 options->extra_env_vars,
					 options->extra_keep_env);
	if (ret < 0)
		goto on_error;

	TRACE("Set up environment");

	/* This remark only affects fully unprivileged containers:
	 * Receive fd for LSM security module before we set{g,u}id(). The reason
	 * is that on set{g,u}id() the kernel will a) make us undumpable and b)
	 * we will change our effective uid. This means our effective uid will
	 * be different from the effective uid of the process that created us
	 * which means that this process no longer has capabilities in our
	 * namespace including CAP_SYS_PTRACE. This means we will not be able to
	 * read any /proc/<pid> files for the process anymore when /proc is
	 * mounted with hidepid={1,2}. So let's get the lsm label fd before the
	 * set{g,u}id().
	 */
	if (needs_lsm) {
		ret = lxc_abstract_unix_recv_fds(payload->ipc_socket, &lsm_fd, 1, NULL, 0);
		if (ret <= 0) {
			/* A return value of 0 is also treated as failure, but
			 * only a negative one is logged.
			 */
			if (ret < 0)
				SYSERROR("Failed to receive lsm label fd");

			goto on_error;
		}

		TRACE("Received LSM label file descriptor %d from parent", lsm_fd);
	}

	if (options->stdin_fd > 0 && isatty(options->stdin_fd)) {
		ret = lxc_make_controlling_terminal(options->stdin_fd);
		if (ret < 0)
			goto on_error;
	}

	/* Set {u,g}id. */
	new_uid = 0;
	new_gid = 0;

	/* Ignore errors, we will fall back to root in that case (/proc was not
	 * mounted etc.).
	 */
	if (options->namespaces & CLONE_NEWUSER)
		lxc_attach_get_init_uidgid(&new_uid, &new_gid);

	/* Explicitly requested ids take precedence; (uid_t)-1 and (gid_t)-1
	 * mean "not specified".
	 */
	if (options->uid != (uid_t)-1)
		new_uid = options->uid;

	if (options->gid != (gid_t)-1)
		new_gid = options->gid;

	/* Try to set the {u,g}id combination. */
	if (new_uid != 0 || new_gid != 0 || options->namespaces & CLONE_NEWUSER) {
		ret = lxc_switch_uid_gid(new_uid, new_gid);
		if (ret < 0)
			goto on_error;
	}

	/* Failing with EPERM is tolerated here. */
	ret = lxc_setgroups(0, NULL);
	if (ret < 0 && errno != EPERM)
		goto on_error;

	if ((init_ctx->container && init_ctx->container->lxc_conf &&
	     init_ctx->container->lxc_conf->no_new_privs) ||
	    (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
		ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
			    prctl_arg(0), prctl_arg(0));
		if (ret < 0)
			goto on_error;

		TRACE("Set PR_SET_NO_NEW_PRIVS");
	}

	if (needs_lsm) {
		bool on_exec;

		/* Change into our new LSM profile. */
		on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;

		ret = lsm_process_label_set_at(lsm_fd, init_ctx->lsm_label, on_exec);
		/* The fd is no longer needed, whether or not the call worked. */
		close(lsm_fd);
		if (ret < 0)
			goto on_error;

		TRACE("Set %s LSM label to \"%s\"", lsm_name(), init_ctx->lsm_label);
	}

	if (init_ctx->container && init_ctx->container->lxc_conf &&
	    init_ctx->container->lxc_conf->seccomp) {
		ret = lxc_seccomp_load(init_ctx->container->lxc_conf);
		if (ret < 0)
			goto on_error;

		TRACE("Loaded seccomp profile");
	}

	/* Setup is complete: tear down the IPC socket and the context. The
	 * payload members are reset so that the on_error path below does not
	 * release them a second time.
	 */
	shutdown(payload->ipc_socket, SHUT_RDWR);
	close(payload->ipc_socket);
	payload->ipc_socket = -EBADF;
	lxc_proc_put_context_info(init_ctx);
	payload->init_ctx = NULL;

	/* The following is done after the communication socket is shut down.
	 * That way, all errors that might (though unlikely) occur up until this
	 * point will have their messages printed to the original stderr (if
	 * logging is so configured) and not the fd the user supplied, if any.
	 */

	/* Fd handling for stdin, stdout and stderr; ignore errors here, user
	 * may want to make sure the fds are closed, for example.
	 */
	if (options->stdin_fd >= 0 && options->stdin_fd != STDIN_FILENO)
		(void)dup2(options->stdin_fd, STDIN_FILENO);

	if (options->stdout_fd >= 0 && options->stdout_fd != STDOUT_FILENO)
		(void)dup2(options->stdout_fd, STDOUT_FILENO);

	if (options->stderr_fd >= 0 && options->stderr_fd != STDERR_FILENO)
		(void)dup2(options->stderr_fd, STDERR_FILENO);

	/* Close the old fds, unless they are one of the standard fds. */
	if (options->stdin_fd > STDERR_FILENO)
		close(options->stdin_fd);

	if (options->stdout_fd > STDERR_FILENO)
		close(options->stdout_fd);

	if (options->stderr_fd > STDERR_FILENO)
		close(options->stderr_fd);

	/* Remove the FD_CLOEXEC flag from stdin/stdout/stderr. Unlike the
	 * dup2() calls above, a failure here is treated as fatal.
	 */
	for (fd = STDIN_FILENO; fd <= STDERR_FILENO; fd++) {
		ret = fd_cloexec(fd, false);
		if (ret < 0) {
			SYSERROR("Failed to clear FD_CLOEXEC from file descriptor %d", fd);
			goto on_error;
		}
	}

	if (options->attach_flags & LXC_ATTACH_TERMINAL) {
		ret = lxc_terminal_prepare_login(payload->terminal_slave_fd);
		if (ret < 0) {
			SYSERROR("Failed to prepare terminal file descriptor %d", payload->terminal_slave_fd);
			goto on_error;
		}

		TRACE("Prepared terminal file descriptor %d", payload->terminal_slave_fd);
	}

	/* We're done, so we can now do whatever the user intended us to do. */
	_exit(payload->exec_function(payload->exec_payload));

on_error:
	lxc_put_attach_clone_payload(payload);
	_exit(EXIT_FAILURE);
}
987
9e84479f
CB
/* Initialize and create @terminal, then map its ids according to @conf.
 *
 * When the id mapping fails the terminal is deleted and its configuration
 * freed again. Returns 0 on success, -1 on failure.
 */
static int lxc_attach_terminal(struct lxc_conf *conf,
			       struct lxc_terminal *terminal)
{
	lxc_terminal_init(terminal);

	if (lxc_terminal_create(terminal) < 0) {
		ERROR("Failed to create terminal");
		return -1;
	}

	/* Shift ttys to container. */
	if (lxc_terminal_map_ids(conf, terminal) < 0) {
		ERROR("Failed to chown terminal");
		lxc_terminal_delete(terminal);
		lxc_terminal_conf_free(terminal);
		return -1;
	}

	return 0;
}
1015
9e84479f
CB
/* Open the mainloop @descr and register the handlers of @terminal with it.
 *
 * The mainloop is closed again when registering fails. Returns 0 on
 * success, -1 on failure.
 */
static int lxc_attach_terminal_mainloop_init(struct lxc_terminal *terminal,
					     struct lxc_epoll_descr *descr)
{
	if (lxc_mainloop_open(descr) < 0) {
		ERROR("Failed to create mainloop");
		return -1;
	}

	if (lxc_terminal_mainloop_add(descr, terminal) < 0) {
		ERROR("Failed to add handlers to mainloop");
		lxc_mainloop_close(descr);
		return -1;
	}

	return 0;
}
1036
9e84479f 1037static inline void lxc_attach_terminal_close_master(struct lxc_terminal *terminal)
ba2be1a8 1038{
9e84479f 1039 if (terminal->master < 0)
ba2be1a8
CB
1040 return;
1041
9e84479f
CB
1042 close(terminal->master);
1043 terminal->master = -EBADF;
ba2be1a8
CB
1044}
1045
9e84479f 1046static inline void lxc_attach_terminal_close_slave(struct lxc_terminal *terminal)
ba2be1a8 1047{
9e84479f 1048 if (terminal->slave < 0)
ba2be1a8
CB
1049 return;
1050
9e84479f
CB
1051 close(terminal->slave);
1052 terminal->slave = -EBADF;
ba2be1a8
CB
1053}
1054
9e84479f 1055static inline void lxc_attach_terminal_close_peer(struct lxc_terminal *terminal)
ba2be1a8 1056{
9e84479f 1057 if (terminal->peer < 0)
ba2be1a8
CB
1058 return;
1059
9e84479f
CB
1060 close(terminal->peer);
1061 terminal->peer = -EBADF;
ba2be1a8
CB
1062}
1063
9e84479f 1064static inline void lxc_attach_terminal_close_log(struct lxc_terminal *terminal)
ba2be1a8 1065{
9e84479f 1066 if (terminal->log_fd < 0)
ba2be1a8
CB
1067 return;
1068
9e84479f
CB
1069 close(terminal->log_fd);
1070 terminal->log_fd = -EBADF;
ba2be1a8
CB
1071}
1072
6f4f1937
CB
1073int lxc_attach(const char *name, const char *lxcpath,
1074 lxc_attach_exec_t exec_function, void *exec_payload,
1075 lxc_attach_options_t *options, pid_t *attached_process)
9c4693b8 1076{
877f3a04 1077 int i, ret, status;
9c4693b8 1078 int ipc_sockets[2];
6f4f1937 1079 char *cwd, *new_cwd;
9b8e3c96 1080 signed long personality;
ba2be1a8 1081 pid_t attached_pid, init_pid, pid;
6f4f1937 1082 struct lxc_proc_context_info *init_ctx;
9e84479f 1083 struct lxc_terminal terminal;
1cce35e6 1084 struct lxc_conf *conf;
a998454a 1085 struct attach_clone_payload payload = {0};
9c4693b8 1086
877f3a04
CB
1087 ret = access("/proc/self/ns", X_OK);
1088 if (ret) {
ea918412 1089 SYSERROR("Does this kernel version support namespaces?");
877f3a04
CB
1090 return -1;
1091 }
1092
9c4693b8
CS
1093 if (!options)
1094 options = &attach_static_default_options;
1095
1096 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
1097 if (init_pid < 0) {
ae026f55 1098 ERROR("Failed to get init pid");
9c4693b8
CS
1099 return -1;
1100 }
1101
1102 init_ctx = lxc_proc_get_context_info(init_pid);
1103 if (!init_ctx) {
6f4f1937 1104 ERROR("Failed to get context of init process: %ld", (long)init_pid);
9c4693b8
CS
1105 return -1;
1106 }
1107
9b8e3c96
SH
1108 personality = get_personality(name, lxcpath);
1109 if (init_ctx->personality < 0) {
6f4f1937 1110 ERROR("Failed to get personality of the container");
9b8e3c96
SH
1111 lxc_proc_put_context_info(init_ctx);
1112 return -1;
1113 }
1114 init_ctx->personality = personality;
1115
ff07d7bb 1116 init_ctx->container = lxc_container_new(name, lxcpath);
62de1db6
CB
1117 if (!init_ctx->container) {
1118 lxc_proc_put_context_info(init_ctx);
ff07d7bb 1119 return -1;
62de1db6 1120 }
ff07d7bb 1121
ba773996
CB
1122 if (!init_ctx->container->lxc_conf) {
1123 init_ctx->container->lxc_conf = lxc_conf_init();
62de1db6
CB
1124 if (!init_ctx->container->lxc_conf) {
1125 lxc_proc_put_context_info(init_ctx);
ea918412 1126 return -1;
62de1db6 1127 }
ba773996 1128 }
1cce35e6 1129 conf = init_ctx->container->lxc_conf;
ba773996 1130
bd4307f0 1131 if (!fetch_seccomp(init_ctx->container, options))
ae026f55 1132 WARN("Failed to get seccomp policy");
2c4ea790 1133
bd4307f0 1134 if (!no_new_privs(init_ctx->container, options))
ae026f55 1135 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set");
2e812c16 1136
9c4693b8
CS
1137 cwd = getcwd(NULL, 0);
1138
8ce83369
CB
1139 /* Determine which namespaces the container was created with
1140 * by asking lxc-start, if necessary.
9c4693b8
CS
1141 */
1142 if (options->namespaces == -1) {
1143 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
1144 /* call failed */
1145 if (options->namespaces == -1) {
8ce83369 1146 ERROR("Failed to automatically determine the "
877f3a04 1147 "namespaces which the container uses");
9c4693b8 1148 free(cwd);
fe4de9a6 1149 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
1150 return -1;
1151 }
877f3a04
CB
1152
1153 for (i = 0; i < LXC_NS_MAX; i++) {
1154 if (ns_info[i].clone_flag & CLONE_NEWCGROUP)
1155 if (!(options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) ||
1156 !cgns_supported())
1157 continue;
1158
1159 if (ns_info[i].clone_flag & options->namespaces)
1160 continue;
1161
1162 init_ctx->ns_inherited |= ns_info[i].clone_flag;
1163 }
1164 }
1165
0059379f 1166 pid = lxc_raw_getpid();
ea918412 1167
877f3a04 1168 for (i = 0; i < LXC_NS_MAX; i++) {
ea918412 1169 int j;
877f3a04
CB
1170
1171 if (options->namespaces & ns_info[i].clone_flag)
1172 init_ctx->ns_fd[i] = lxc_preserve_ns(init_pid, ns_info[i].proc_name);
1173 else if (init_ctx->ns_inherited & ns_info[i].clone_flag)
1174 init_ctx->ns_fd[i] = in_same_namespace(pid, init_pid, ns_info[i].proc_name);
1175 else
1176 continue;
ea918412 1177
877f3a04
CB
1178 if (init_ctx->ns_fd[i] >= 0)
1179 continue;
1180
1181 if (init_ctx->ns_fd[i] == -EINVAL) {
1182 DEBUG("Inheriting %s namespace from %d",
1183 ns_info[i].proc_name, pid);
1184 init_ctx->ns_inherited &= ~ns_info[i].clone_flag;
1185 continue;
1186 }
1187
1188 /* We failed to preserve the namespace. */
ea918412 1189 SYSERROR("Failed to attach to %s namespace of %d",
1190 ns_info[i].proc_name, pid);
1191
877f3a04
CB
1192 /* Close all already opened file descriptors before we return an
1193 * error, so we don't leak them.
1194 */
1195 for (j = 0; j < i; j++)
1196 close(init_ctx->ns_fd[j]);
1197
877f3a04
CB
1198 free(cwd);
1199 lxc_proc_put_context_info(init_ctx);
1200 return -1;
9c4693b8
CS
1201 }
1202
9e84479f
CB
1203 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1204 ret = lxc_attach_terminal(conf, &terminal);
ba2be1a8 1205 if (ret < 0) {
9e84479f 1206 ERROR("Failed to setup new terminal");
ba2be1a8
CB
1207 free(cwd);
1208 lxc_proc_put_context_info(init_ctx);
1209 return -1;
1210 }
1211
9e84479f 1212 terminal.log_fd = options->log_fd;
c948657b 1213 } else {
9e84479f 1214 lxc_terminal_init(&terminal);
ba2be1a8
CB
1215 }
1216
8ce83369
CB
1217 /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
1218 * to make sure we don't irritate other threads that want to fork+exec
1219 * away
9c4693b8
CS
1220 *
1221 * IMPORTANT: if the initial process is multithreaded and another call
1222 * just fork()s away without exec'ing directly after, the socket fd will
1223 * exist in the forked process from the other thread and any close() in
8ce83369
CB
1224 * our own child process will not really cause the socket to close
1225 * properly, potentiall causing the parent to hang.
9c4693b8
CS
1226 *
1227 * For this reason, while IPC is still active, we have to use shutdown()
8ce83369
CB
1228 * if the child exits prematurely in order to signal that the socket is
1229 * closed and cannot assume that the child exiting will automatically do
1230 * that.
9c4693b8
CS
1231 *
1232 * IPC mechanism: (X is receiver)
1233 * initial process intermediate attached
1234 * X <--- send pid of
1235 * attached proc,
1236 * then exit
1237 * send 0 ------------------------------------> X
1238 * [do initialization]
1239 * X <------------------------------------ send 1
1240 * [add to cgroup, ...]
1241 * send 2 ------------------------------------> X
81f466d0
CB
1242 * [set LXC_ATTACH_NO_NEW_PRIVS]
1243 * X <------------------------------------ send 3
1244 * [open LSM label fd]
1245 * send 4 ------------------------------------> X
1246 * [set LSM label]
9c4693b8
CS
1247 * close socket close socket
1248 * run program
1249 */
1250 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
1251 if (ret < 0) {
ae026f55 1252 SYSERROR("Could not set up required IPC mechanism for attaching");
9c4693b8 1253 free(cwd);
fe4de9a6 1254 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
1255 return -1;
1256 }
1257
e3f0e436
CB
1258 /* Create intermediate subprocess, two reasons:
1259 * 1. We can't setns() in the child itself, since we want to make
8ce83369 1260 * sure we are properly attached to the pidns.
e3f0e436 1261 * 2. Also, the initial thread has to put the attached process
8ce83369
CB
1262 * into the cgroup, which we can only do if we didn't already
1263 * setns() (otherwise, user namespaces will hate us).
9c4693b8
CS
1264 */
1265 pid = fork();
9c4693b8 1266 if (pid < 0) {
ae026f55 1267 SYSERROR("Failed to create first subprocess");
9c4693b8 1268 free(cwd);
fe4de9a6 1269 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
1270 return -1;
1271 }
1272
1273 if (pid) {
ba2be1a8 1274 int ret_parent = -1;
9c4693b8 1275 pid_t to_cleanup_pid = pid;
ba2be1a8 1276 struct lxc_epoll_descr descr = {0};
9c4693b8 1277
ba2be1a8 1278 /* close unneeded file descriptors */
9c4693b8
CS
1279 close(ipc_sockets[1]);
1280 free(cwd);
ba2be1a8 1281 lxc_proc_close_ns_fd(init_ctx);
9e84479f
CB
1282 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1283 lxc_attach_terminal_close_slave(&terminal);
9c4693b8 1284
8ce83369 1285 /* Attach to cgroup, if requested. */
f4364484 1286 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
2202afc9
CB
1287 struct cgroup_ops *cgroup_ops;
1288
1289 cgroup_ops = cgroup_init(NULL);
1290 if (!cgroup_ops)
1291 goto on_error;
1292
1293 if (!cgroup_ops->attach(cgroup_ops, name, lxcpath, pid))
8ce83369 1294 goto on_error;
2202afc9
CB
1295
1296 cgroup_exit(cgroup_ops);
1297 TRACE("Moved intermediate process %d into container's cgroups", pid);
f4364484
SG
1298 }
1299
bb2ada6f 1300 /* Setup /proc limits */
1cce35e6
CB
1301 if (!lxc_list_empty(&conf->procs)) {
1302 ret = setup_proc_filesystem(&conf->procs, pid);
bb2ada6f
CB
1303 if (ret < 0)
1304 goto on_error;
1305 }
1306
c6d09e15 1307 /* Setup resource limits */
1cce35e6
CB
1308 if (!lxc_list_empty(&conf->limits)) {
1309 ret = setup_resource_limits(&conf->limits, pid);
ba2be1a8
CB
1310 if (ret < 0)
1311 goto on_error;
1312 }
1313
9e84479f
CB
1314 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1315 ret = lxc_attach_terminal_mainloop_init(&terminal, &descr);
ba2be1a8 1316 if (ret < 0)
6f4f1937 1317 goto on_error;
ea918412 1318
9e84479f 1319 TRACE("Initialized terminal mainloop");
ba2be1a8 1320 }
c6d09e15 1321
8ce83369 1322 /* Let the child process know to go ahead. */
f4364484
SG
1323 status = 0;
1324 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
94ac256f 1325 if (ret != sizeof(status))
ba2be1a8 1326 goto close_mainloop;
ea918412 1327
94ac256f 1328 TRACE("Told intermediate process to start initializing");
f4364484 1329
8ce83369 1330 /* Get pid of attached process from intermediate process. */
94ac256f
CB
1331 ret = lxc_read_nointr(ipc_sockets[0], &attached_pid, sizeof(attached_pid));
1332 if (ret != sizeof(attached_pid))
ba2be1a8 1333 goto close_mainloop;
ea918412 1334
94ac256f 1335 TRACE("Received pid %d of attached process in parent pid namespace", attached_pid);
9c4693b8 1336
8ce83369 1337 /* Ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
62183f1a
SH
1338 if (options->stdin_fd == 0) {
1339 signal(SIGINT, SIG_IGN);
1340 signal(SIGQUIT, SIG_IGN);
1341 }
2eef2bda 1342
8ce83369 1343 /* Reap intermediate process. */
9c4693b8
CS
1344 ret = wait_for_pid(pid);
1345 if (ret < 0)
ba2be1a8 1346 goto close_mainloop;
ea918412 1347
94ac256f 1348 TRACE("Intermediate process %d exited", pid);
9c4693b8 1349
8ce83369 1350 /* We will always have to reap the attached process now. */
9c4693b8
CS
1351 to_cleanup_pid = attached_pid;
1352
81f466d0 1353 /* Open LSM fd and send it to child. */
6f4f1937
CB
1354 if ((options->namespaces & CLONE_NEWNS) &&
1355 (options->attach_flags & LXC_ATTACH_LSM) &&
1356 init_ctx->lsm_label) {
94ac256f 1357 int ret = -1;
47ce2cb7
CB
1358 int labelfd;
1359 bool on_exec;
6f4f1937 1360
47ce2cb7
CB
1361 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
1362 labelfd = lsm_process_label_fd_get(attached_pid, on_exec);
81f466d0 1363 if (labelfd < 0)
ba2be1a8 1364 goto close_mainloop;
ea918412 1365
94ac256f 1366 TRACE("Opened LSM label file descriptor %d", labelfd);
81f466d0
CB
1367
1368 /* Send child fd of the LSM security module to write to. */
ae467c54 1369 ret = lxc_abstract_unix_send_fds(ipc_sockets[0], &labelfd, 1, NULL, 0);
81f466d0 1370 if (ret <= 0) {
9044b79e 1371 if (ret < 0)
1372 SYSERROR("Failed to send lsm label fd");
1373
1374 close(labelfd);
ba2be1a8 1375 goto close_mainloop;
81f466d0 1376 }
9044b79e 1377
1378 close(labelfd);
94ac256f 1379 TRACE("Sent LSM label file descriptor %d to child", labelfd);
81f466d0
CB
1380 }
1381
8ce83369
CB
1382 /* We're done, the child process should now execute whatever it
1383 * is that the user requested. The parent can now track it with
1384 * waitpid() or similar.
9c4693b8
CS
1385 */
1386
1387 *attached_process = attached_pid;
9c4693b8 1388
ba2be1a8 1389 /* Now shut down communication with child, we're done. */
9c4693b8
CS
1390 shutdown(ipc_sockets[0], SHUT_RDWR);
1391 close(ipc_sockets[0]);
ba2be1a8
CB
1392 ipc_sockets[0] = -1;
1393
1394 ret_parent = 0;
1395 to_cleanup_pid = -1;
ea918412 1396
9e84479f 1397 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
ba2be1a8
CB
1398 ret = lxc_mainloop(&descr, -1);
1399 if (ret < 0) {
1400 ret_parent = -1;
1401 to_cleanup_pid = attached_pid;
1402 }
1403 }
1404
1405 close_mainloop:
9e84479f 1406 if (options->attach_flags & LXC_ATTACH_TERMINAL)
ba2be1a8
CB
1407 lxc_mainloop_close(&descr);
1408
1409 on_error:
1410 if (ipc_sockets[0] >= 0) {
1411 shutdown(ipc_sockets[0], SHUT_RDWR);
1412 close(ipc_sockets[0]);
1413 }
1414
1415 if (to_cleanup_pid > 0)
6f4f1937 1416 (void)wait_for_pid(to_cleanup_pid);
ba2be1a8 1417
9e84479f
CB
1418 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1419 lxc_terminal_delete(&terminal);
1420 lxc_terminal_conf_free(&terminal);
ba2be1a8 1421 }
ea918412 1422
fe4de9a6 1423 lxc_proc_put_context_info(init_ctx);
ba2be1a8 1424 return ret_parent;
9c4693b8
CS
1425 }
1426
ba2be1a8 1427 /* close unneeded file descriptors */
9c4693b8 1428 close(ipc_sockets[0]);
ba2be1a8 1429 ipc_sockets[0] = -EBADF;
ea918412 1430
9e84479f
CB
1431 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1432 lxc_attach_terminal_close_master(&terminal);
1433 lxc_attach_terminal_close_peer(&terminal);
1434 lxc_attach_terminal_close_log(&terminal);
ba2be1a8 1435 }
9c4693b8 1436
8ce83369 1437 /* Wait for the parent to have setup cgroups. */
94ac256f 1438 ret = lxc_read_nointr(ipc_sockets[1], &status, sizeof(status));
ba2be1a8 1439 if (ret != sizeof(status)) {
f4364484 1440 shutdown(ipc_sockets[1], SHUT_RDWR);
62de1db6 1441 lxc_proc_put_context_info(init_ctx);
c7ac2e1c 1442 _exit(EXIT_FAILURE);
f4364484 1443 }
ea918412 1444
94ac256f 1445 TRACE("Intermediate process starting to initialize");
f4364484 1446
8ce83369
CB
1447 /* Attach now, create another subprocess later, since pid namespaces
1448 * only really affect the children of the current process.
9c4693b8 1449 */
877f3a04 1450 ret = lxc_attach_to_ns(init_pid, init_ctx);
9c4693b8 1451 if (ret < 0) {
94ac256f 1452 ERROR("Failed to enter namespaces");
9c4693b8 1453 shutdown(ipc_sockets[1], SHUT_RDWR);
62de1db6 1454 lxc_proc_put_context_info(init_ctx);
c7ac2e1c 1455 _exit(EXIT_FAILURE);
9c4693b8 1456 }
ea918412 1457
877f3a04
CB
1458 /* close namespace file descriptors */
1459 lxc_proc_close_ns_fd(init_ctx);
9c4693b8 1460
8ce83369 1461 /* Attach succeeded, try to cwd. */
9c4693b8
CS
1462 if (options->initial_cwd)
1463 new_cwd = options->initial_cwd;
1464 else
1465 new_cwd = cwd;
d6d979bc
CB
1466 if (new_cwd) {
1467 ret = chdir(new_cwd);
1468 if (ret < 0)
1469 WARN("Could not change directory to \"%s\"", new_cwd);
1470 }
9c4693b8
CS
1471 free(cwd);
1472
a998454a
CB
1473 /* Create attached process. */
1474 payload.ipc_socket = ipc_sockets[1];
1475 payload.options = options;
1476 payload.init_ctx = init_ctx;
9e84479f 1477 payload.terminal_slave_fd = terminal.slave;
a998454a
CB
1478 payload.exec_function = exec_function;
1479 payload.exec_payload = exec_payload;
9c4693b8 1480
a998454a
CB
1481 pid = lxc_raw_clone(CLONE_PARENT);
1482 if (pid < 0) {
94ac256f 1483 SYSERROR("Failed to clone attached process");
9c4693b8 1484 shutdown(ipc_sockets[1], SHUT_RDWR);
62de1db6 1485 lxc_proc_put_context_info(init_ctx);
c7ac2e1c 1486 _exit(EXIT_FAILURE);
9c4693b8 1487 }
a998454a
CB
1488
1489 if (pid == 0) {
1490 ret = attach_child_main(&payload);
1491 if (ret < 0)
1492 ERROR("Failed to exec");
ea918412 1493
a998454a
CB
1494 _exit(EXIT_FAILURE);
1495 }
ea918412 1496
9e84479f
CB
1497 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1498 lxc_attach_terminal_close_slave(&terminal);
9c4693b8 1499
8ce83369 1500 /* Tell grandparent the pid of the pid of the newly created child. */
9c4693b8
CS
1501 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
1502 if (ret != sizeof(pid)) {
8ce83369
CB
1503 /* If this really happens here, this is very unfortunate, since
1504 * the parent will not know the pid of the attached process and
1505 * will not be able to wait for it (and we won't either due to
1506 * CLONE_PARENT) so the parent won't be able to reap it and the
1507 * attached process will remain a zombie.
9c4693b8 1508 */
9c4693b8 1509 shutdown(ipc_sockets[1], SHUT_RDWR);
62de1db6 1510 lxc_proc_put_context_info(init_ctx);
c7ac2e1c 1511 _exit(EXIT_FAILURE);
9c4693b8 1512 }
ea918412 1513
94ac256f 1514 TRACE("Sending pid %d of attached process", pid);
9c4693b8 1515
8ce83369 1516 /* The rest is in the hands of the initial and the attached process. */
62de1db6 1517 lxc_proc_put_context_info(init_ctx);
c7ac2e1c 1518 _exit(0);
9c4693b8
CS
1519}
1520
9c4693b8
CS
1521int lxc_attach_run_command(void* payload)
1522{
1523 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1524
1525 execvp(cmd->program, cmd->argv);
ea918412 1526
1527 SYSERROR("Failed to exec \"%s\"", cmd->program);
9c4693b8
CS
1528 return -1;
1529}
1530
/* Attach callback that executes the login shell of the current user.
 *
 * The shell is taken from the passwd entry returned by getpwuid_r(); if no
 * entry can be obtained, lxc_attach_getpwshell() is asked instead. If that
 * shell is unknown or cannot be executed, /bin/sh is tried.
 *
 * @payload is ignored. Only returns (with -1) if no shell could be executed.
 */
int lxc_attach_run_shell(void *payload)
{
	struct passwd pw_storage;
	struct passwd *pw_found = NULL;
	char *shell;
	char *pw_buf;
	size_t pw_buf_len;
	long size_hint;
	uid_t uid;
	int rc;

	/* Ignore payload parameter. */
	(void)payload;

	uid = getuid();

	/* sysconf() reports "no fixed limit" as -1; use a default then. */
	size_hint = sysconf(_SC_GETPW_R_SIZE_MAX);
	pw_buf_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	pw_buf = malloc(pw_buf_len);
	if (pw_buf) {
		rc = getpwuid_r(uid, &pw_storage, pw_buf, pw_buf_len, &pw_found);
		if (!pw_found) {
			if (rc == 0)
				WARN("Could not find matched password record");

			WARN("Failed to get password record - %u", uid);
		}
	}

	/* This probably happens because of incompatible nss implementations in
	 * host and container (remember, this code is still using the host's
	 * glibc but our mount namespace is in the container) we may try to get
	 * the information by spawning a [getent passwd uid] process and parsing
	 * the result.
	 */
	shell = pw_found ? pw_storage.pw_shell : lxc_attach_getpwshell(uid);

	if (shell)
		execlp(shell, shell, (char *)NULL);

	/* Executed if either no passwd entry or execlp fails, we will fall back
	 * on /bin/sh as a default shell.
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);

	SYSERROR("Failed to execute shell");

	/* The shell string is only freed when it came from
	 * lxc_attach_getpwshell(); otherwise it points into pw_buf.
	 */
	if (!pw_found)
		free(shell);

	free(pw_buf);
	return -1;
}