]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/attach.c
pivot_root: switch to a new mechanism (v2)
[mirror_lxc.git] / src / lxc / attach.c
CommitLineData
e0732705
CS
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
e0732705
CS
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e0732705
CS
22 */
23
24#define _GNU_SOURCE
25#include <unistd.h>
26#include <stdio.h>
27#include <string.h>
28#include <stdlib.h>
2eef2bda 29#include <signal.h>
e0732705
CS
30#include <errno.h>
31#include <fcntl.h>
c476bdce 32#include <grp.h>
e0732705
CS
33#include <sys/param.h>
34#include <sys/prctl.h>
7a0b0b56 35#include <sys/mount.h>
5ec27989 36#include <sys/socket.h>
1ba0013f 37#include <sys/syscall.h>
905022f7 38#include <sys/wait.h>
910bb4fa 39#include <linux/unistd.h>
905022f7 40#include <pwd.h>
e0732705
CS
41
42#if !HAVE_DECL_PR_CAPBSET_DROP
43#define PR_CAPBSET_DROP 24
44#endif
45
46#include "namespace.h"
47#include "log.h"
48#include "attach.h"
49#include "caps.h"
e0732705 50#include "config.h"
6a44839f 51#include "utils.h"
9c4693b8
CS
52#include "commands.h"
53#include "cgroup.h"
025ed0f3 54#include "lxclock.h"
2c4ea790
SH
55#include "conf.h"
56#include "lxcseccomp.h"
57#include <lxc/lxccontainer.h>
fe4de9a6 58#include "lsm/lsm.h"
9b8e3c96 59#include "confile.h"
9c4693b8
CS
60
61#if HAVE_SYS_PERSONALITY_H
62#include <sys/personality.h>
63#endif
e0732705 64
a3da2f3b
SG
65#ifndef SOCK_CLOEXEC
66# define SOCK_CLOEXEC 02000000
67#endif
68
d6a3c917
SG
69#ifndef MS_REC
70#define MS_REC 16384
71#endif
72
73#ifndef MS_SLAVE
74#define MS_SLAVE (1<<19)
75#endif
76
e0732705
CS
77lxc_log_define(lxc_attach, lxc);
78
74a3920a 79static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
e0732705
CS
80{
81 struct lxc_proc_context_info *info = calloc(1, sizeof(*info));
82 FILE *proc_file;
83 char proc_fn[MAXPATHLEN];
460a1cf0 84 char *line = NULL;
e0732705 85 size_t line_bufsz = 0;
460a1cf0 86 int ret, found;
e0732705
CS
87
88 if (!info) {
89 SYSERROR("Could not allocate memory.");
90 return NULL;
91 }
92
93 /* read capabilities */
94 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/status", pid);
95
96 proc_file = fopen(proc_fn, "r");
97 if (!proc_file) {
98 SYSERROR("Could not open %s", proc_fn);
99 goto out_error;
100 }
101
102 found = 0;
103 while (getline(&line, &line_bufsz, proc_file) != -1) {
104 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
105 if (ret != EOF && ret > 0) {
106 found = 1;
107 break;
108 }
109 }
110
fa9ac567
SH
111 if (line)
112 free(line);
e0732705
CS
113 fclose(proc_file);
114
115 if (!found) {
116 SYSERROR("Could not read capability bounding set from %s", proc_fn);
117 errno = ENOENT;
118 goto out_error;
119 }
120
fe4de9a6 121 info->lsm_label = lsm_process_label_get(pid);
e0732705 122
e0732705
CS
123 return info;
124
125out_error:
460a1cf0 126 free(info);
e0732705
CS
127 return NULL;
128}
129
fe4de9a6
DE
130static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
131{
132 if (ctx->lsm_label)
133 free(ctx->lsm_label);
2c4ea790
SH
134 if (ctx->container)
135 lxc_container_put(ctx->container);
fe4de9a6
DE
136 free(ctx);
137}
138
/*
 * Move the calling process into the namespaces of process 'pid'.
 *
 * 'which' is a mask of CLONE_NEW* flags selecting the namespaces to enter;
 * -1 selects all namespaces known to this function. All namespace files are
 * opened first and only then entered one by one, in the order of the table
 * below.
 *
 * Returns 0 on success and -1 on failure (errno is preserved for the
 * failing open()/setns() call). No descriptor opened here is left open.
 */
static int lxc_attach_to_ns(pid_t pid, int which)
{
	/* according to <http://article.gmane.org/gmane.linux.kernel.containers.lxc.devel/1429>,
	 * the file for user namespaces in /proc/$pid/ns will be called
	 * 'user' once the kernel supports it
	 */
	static const char *const ns[] = { "user", "mnt", "pid", "uts", "ipc", "net" };
	static const int flags[] = {
		CLONE_NEWUSER, CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
		CLONE_NEWNET
	};
	/* a true compile-time constant, so fd[] below is not a VLA */
	enum { NS_COUNT = sizeof(ns) / sizeof(ns[0]) };
	char path[MAXPATHLEN];
	int fd[NS_COUNT];
	int i, j, saved_errno;

	snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
	if (access(path, X_OK)) {
		ERROR("Does this kernel version support 'attach' ?");
		return -1;
	}

	for (i = 0; i < NS_COUNT; i++) {
		/* ignore if we are not supposed to attach to that
		 * namespace; -1 marks the slot as "nothing to close"
		 */
		if (which != -1 && !(which & flags[i])) {
			fd[i] = -1;
			continue;
		}

		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns[i]);
		fd[i] = open(path, O_RDONLY | O_CLOEXEC);
		if (fd[i] < 0) {
			saved_errno = errno;

			/* close all already opened file descriptors before
			 * we return an error, so we don't leak them
			 */
			for (j = 0; j < i; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to open '%s'", path);
			return -1;
		}
	}

	for (i = 0; i < NS_COUNT; i++) {
		/* namespace was not requested */
		if (fd[i] < 0)
			continue;

		if (setns(fd[i], 0) != 0) {
			saved_errno = errno;

			/* release this descriptor and all not yet used ones */
			for (j = i; j < NS_COUNT; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to set namespace '%s'", ns[i]);
			return -1;
		}

		close(fd[i]);
	}

	return 0;
}
205
/*
 * Give the calling process a private mount namespace and mount fresh
 * instances of /proc and (if it was mounted) /sys in it.
 *
 * Returns 0 on success, -1 if any mandatory step fails.
 */
static int lxc_attach_remount_sys_proc(void)
{
	if (unshare(CLONE_NEWNS) < 0) {
		SYSERROR("failed to unshare mount namespace");
		return -1;
	}

	/* best effort only: a failure here is logged but not fatal */
	if (detect_shared_rootfs() &&
	    mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		SYSERROR("Failed to make / rslave");
		ERROR("Continuing...");
	}

	/* /proc is assumed to always be mounted: detach it, mount it anew */
	if (umount2("/proc", MNT_DETACH) < 0) {
		SYSERROR("failed to unmount /proc");
		return -1;
	}

	if (mount("none", "/proc", "proc", 0, NULL) < 0) {
		SYSERROR("failed to remount /proc");
		return -1;
	}

	/* /sys is optional: EINVAL from umount2() means it is not a mount
	 * point, in which case there is nothing to remount
	 */
	if (umount2("/sys", MNT_DETACH) < 0) {
		if (errno == EINVAL)
			return 0;

		SYSERROR("failed to unmount /sys");
		return -1;
	}

	/* /sys was mounted and is now detached: mount it anew */
	if (mount("none", "/sys", "sysfs", 0, NULL) < 0) {
		SYSERROR("failed to remount /sys");
		return -1;
	}

	return 0;
}
255
74a3920a 256static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
e0732705
CS
257{
258 int last_cap = lxc_caps_last_cap();
259 int cap;
260
261 for (cap = 0; cap <= last_cap; cap++) {
262 if (ctx->capability_mask & (1LL << cap))
263 continue;
264
265 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
266 SYSERROR("failed to remove capability id %d", cap);
267 return -1;
268 }
269 }
270
271 return 0;
272}
905022f7 273
74a3920a 274static int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
b3a39ba6 275{
799f96fd 276 if (policy == LXC_ATTACH_CLEAR_ENV) {
3d5e9f48 277 char **extra_keep_store = NULL;
3d5e9f48
CS
278 int path_kept = 0;
279
280 if (extra_keep) {
281 size_t count, i;
282
283 for (count = 0; extra_keep[count]; count++);
284
285 extra_keep_store = calloc(count, sizeof(char *));
286 if (!extra_keep_store) {
287 SYSERROR("failed to allocate memory for storing current "
288 "environment variable values that will be kept");
289 return -1;
290 }
291 for (i = 0; i < count; i++) {
292 char *v = getenv(extra_keep[i]);
293 if (v) {
294 extra_keep_store[i] = strdup(v);
295 if (!extra_keep_store[i]) {
296 SYSERROR("failed to allocate memory for storing current "
297 "environment variable values that will be kept");
298 while (i > 0)
299 free(extra_keep_store[--i]);
300 free(extra_keep_store);
301 return -1;
302 }
303 if (strcmp(extra_keep[i], "PATH") == 0)
304 path_kept = 1;
305 }
306 /* calloc sets entire array to zero, so we don't
307 * need an else */
308 }
309 }
310
799f96fd 311 if (clearenv()) {
a9cab7e3 312 char **p;
799f96fd 313 SYSERROR("failed to clear environment");
a9cab7e3
CS
314 if (extra_keep_store) {
315 for (p = extra_keep_store; *p; p++)
316 free(*p);
317 free(extra_keep_store);
318 }
3d5e9f48
CS
319 return -1;
320 }
321
322 if (extra_keep_store) {
323 size_t i;
324 for (i = 0; extra_keep[i]; i++) {
acd4922e
SG
325 if (extra_keep_store[i]) {
326 if (setenv(extra_keep[i], extra_keep_store[i], 1) < 0)
327 SYSERROR("Unable to set environment variable");
328 }
3d5e9f48
CS
329 free(extra_keep_store[i]);
330 }
331 free(extra_keep_store);
332 }
333
334 /* always set a default path; shells and execlp tend
335 * to be fine without it, but there is a disturbing
336 * number of C programs out there that just assume
337 * that getenv("PATH") is never NULL and then die a
338 * painful segfault death. */
339 if (!path_kept) {
511a6936
SG
340#ifdef HAVE_CONFSTR
341 size_t n;
342 char *path_env;
343
3d5e9f48
CS
344 n = confstr(_CS_PATH, NULL, 0);
345 path_env = malloc(n);
346 if (path_env) {
347 confstr(_CS_PATH, path_env, n);
348 setenv("PATH", path_env, 1);
349 free(path_env);
350 }
351 /* don't error out, this is just an extra service */
511a6936
SG
352#else
353 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
354#endif
799f96fd 355 }
b3a39ba6
DW
356 }
357
358 if (putenv("container=lxc")) {
359 SYSERROR("failed to set environment variable");
360 return -1;
361 }
362
3d5e9f48
CS
363 /* set extra environment variables */
364 if (extra_env) {
365 for (; *extra_env; extra_env++) {
366 /* duplicate the string, just to be on
367 * the safe side, because putenv does not
368 * do it for us */
369 char *p = strdup(*extra_env);
370 /* we just assume the user knows what they
371 * are doing, so we don't do any checks */
372 if (!p) {
373 SYSERROR("failed to allocate memory for additional environment "
374 "variables");
375 return -1;
376 }
377 putenv(p);
378 }
379 }
380
b3a39ba6
DW
381 return 0;
382}
383
/*
 * Return the next ':'-separated field of the string at *cursor and advance
 * *cursor past it. The separator is overwritten with '\0'. Unlike
 * strtok_r(), empty fields are returned as empty strings instead of being
 * skipped. Returns NULL once the string is exhausted.
 */
static char *lxc_attach_next_field(char **cursor)
{
	char *field = *cursor;
	char *sep;

	if (!field)
		return NULL;

	sep = strchr(field, ':');
	if (sep) {
		*sep = '\0';
		*cursor = sep + 1;
	} else {
		*cursor = NULL;
	}

	return field;
}

/*
 * Determine the login shell of user 'uid' by running "getent passwd <uid>"
 * in a child process and parsing its output.
 *
 * Returns a heap-allocated string the caller has to free(), or NULL if the
 * shell could not be determined (getent missing or failing, no matching
 * entry, malformed entry, empty shell field, out of memory).
 */
static char *lxc_attach_getpwshell(uid_t uid)
{
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	char *result = NULL;

	/* we need to fork off a process that runs the
	 * getent program, and we need to capture its
	 * output, so we use a pipe for that purpose
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid) {
		/* parent process */
		FILE *pipe_f;
		char *line = NULL;
		size_t line_bufsz = 0;
		int found = 0;
		int status;

		close(pipes[1]);

		pipe_f = fdopen(pipes[0], "r");
		if (!pipe_f) {
			/* nothing can be read; close our end so the child
			 * does not block on a full pipe, then reap it below
			 */
			close(pipes[0]);
		} else {
			while (getline(&line, &line_bufsz, pipe_f) != -1) {
				char *cursor;
				char *field;
				char *endptr = NULL;
				long value;
				size_t len;
				int i;

				/* if we already found something, just continue
				 * to read until the pipe doesn't deliver any more
				 * data, but don't modify the existing result
				 */
				if (found)
					continue;

				/* trim line on the right hand side */
				len = strlen(line);
				while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
					line[--len] = '\0';

				cursor = line;

				/* first: user name */
				if (!lxc_attach_next_field(&cursor))
					continue;
				/* next: dummy password field */
				if (!lxc_attach_next_field(&cursor))
					continue;
				/* next: user id */
				field = lxc_attach_next_field(&cursor);
				if (!field || !*field)
					continue;
				value = strtol(field, &endptr, 10);
				if (*endptr || value == LONG_MIN || value == LONG_MAX)
					continue;
				/* dummy sanity check: user id matches */
				if ((uid_t) value != uid)
					continue;
				/* skip fields: gid, gecos, dir, go to next field 'shell' */
				for (i = 0; i < 4; i++) {
					field = lxc_attach_next_field(&cursor);
					if (!field)
						break;
				}
				if (!field || !*field)
					continue;
				/* sanity check that there are no fields after that */
				if (lxc_attach_next_field(&cursor))
					continue;

				/* only copy once the whole line has been validated,
				 * so a rejected line never leaves an allocation behind
				 */
				result = strdup(field);
				if (!result)
					continue;

				found = 1;
			}

			free(line);
			fclose(pipe_f);
		}

		do {
			ret = waitpid(pid, &status, 0);
		} while (ret < 0 && errno == EINTR);

		/* some sanity checks: if anything even hinted at going
		 * wrong: we can't be sure we have a valid result, so
		 * we assume we don't
		 */
		if (ret < 0 || !WIFEXITED(status) || WEXITSTATUS(status) != 0 || !found) {
			free(result);
			return NULL;
		}

		return result;
	} else {
		/* child process */
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* we want to capture stdout */
		if (pipes[1] != 1) {
			if (dup2(pipes[1], 1) < 0)
				_exit(-1);
			close(pipes[1]);
		}

		/* get rid of stdin/stderr, so we try to associate it
		 * with /dev/null
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			close(fd);
		}

		/* finish argument list */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0)
			_exit(-1);

		/* try to run getent program */
		(void) execvp("getent", arguments);

		/* _exit() rather than exit(): this is a forked copy of the
		 * caller and must not run its atexit handlers or flush its
		 * stdio buffers
		 */
		_exit(-1);
	}
}
cb3e61fa 537
/*
 * Look up the real uid and gid of pid 1 as seen by the calling process by
 * scanning the "Uid:" and "Gid:" lines of /proc/1/status.
 *
 * *init_uid and *init_gid are only overwritten for values that were
 * actually found; if the status file cannot be opened both are left
 * untouched.
 */
static void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
{
	char status_path[MAXPATHLEN];
	FILE *status_file;
	char *buf = NULL;
	size_t buf_cap = 0;
	long parsed = -1;
	uid_t found_uid = (uid_t)-1;
	gid_t found_gid = (gid_t)-1;

	snprintf(status_path, MAXPATHLEN, "/proc/%d/status", 1);

	status_file = fopen(status_path, "r");
	if (!status_file)
		return;

	while (getline(&buf, &buf_cap, status_file) != -1) {
		/* each line lists: real, effective, saved set, fs;
		 * only the first (real) id is of interest
		 */
		if (sscanf(buf, "Uid: %ld", &parsed) > 0)
			found_uid = (uid_t) parsed;
		else if (sscanf(buf, "Gid: %ld", &parsed) > 0)
			found_gid = (gid_t) parsed;

		/* stop as soon as both ids are known */
		if (found_uid != (uid_t)-1 && found_gid != (gid_t)-1)
			break;
	}

	free(buf);
	fclose(status_file);

	/* only override arguments if we found something */
	if (found_uid != (uid_t)-1)
		*init_uid = found_uid;
	if (found_gid != (gid_t)-1)
		*init_gid = found_gid;

	/* TODO: we should also parse supplementary groups and use
	 * setgroups() to set them */
}
9c4693b8
CS
584
585struct attach_clone_payload {
586 int ipc_socket;
587 lxc_attach_options_t* options;
588 struct lxc_proc_context_info* init_ctx;
589 lxc_attach_exec_t exec_function;
590 void* exec_payload;
591};
592
593static int attach_child_main(void* data);
594
595/* help the optimizer along if it doesn't know that exit always exits */
596#define rexit(c) do { int __c = (c); exit(__c); return __c; } while(0)
597
598/* define default options if no options are supplied by the user */
599static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
600
2c4ea790
SH
601static bool fetch_seccomp(const char *name, const char *lxcpath,
602 struct lxc_proc_context_info *i, lxc_attach_options_t *options)
603{
604 struct lxc_container *c;
2eef2bda 605
2c4ea790
SH
606 if (!(options->namespaces & CLONE_NEWNS) || !(options->attach_flags & LXC_ATTACH_LSM))
607 return true;
608
609 c = lxc_container_new(name, lxcpath);
610 if (!c)
611 return false;
612 i->container = c;
613 if (!c->lxc_conf)
614 return false;
615 if (lxc_read_seccomp_config(c->lxc_conf) < 0) {
442f5c0f 616 ERROR("Error reading seccomp policy");
2c4ea790
SH
617 return false;
618 }
619
620 return true;
621}
622
9b8e3c96
SH
/*
 * Query the running container for its "lxc.arch" setting and translate it
 * with lxc_config_parse_arch().
 *
 * Returns the parsed value, or -1 if the setting could not be retrieved.
 */
static signed long get_personality(const char *name, const char *lxcpath)
{
	signed long persona = -1;
	char *arch = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);

	if (arch != NULL) {
		persona = lxc_config_parse_arch(arch);
		free(arch);
	}

	return persona;
}
634
9c4693b8
CS
635int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
636{
637 int ret, status;
f4364484 638 pid_t init_pid, pid, attached_pid, expected;
9c4693b8
CS
639 struct lxc_proc_context_info *init_ctx;
640 char* cwd;
641 char* new_cwd;
642 int ipc_sockets[2];
9b8e3c96 643 signed long personality;
9c4693b8
CS
644
645 if (!options)
646 options = &attach_static_default_options;
647
648 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
649 if (init_pid < 0) {
650 ERROR("failed to get the init pid");
651 return -1;
652 }
653
654 init_ctx = lxc_proc_get_context_info(init_pid);
655 if (!init_ctx) {
656 ERROR("failed to get context of the init process, pid = %ld", (long)init_pid);
657 return -1;
658 }
659
9b8e3c96
SH
660 personality = get_personality(name, lxcpath);
661 if (init_ctx->personality < 0) {
662 ERROR("Failed to get personality of the container");
663 lxc_proc_put_context_info(init_ctx);
664 return -1;
665 }
666 init_ctx->personality = personality;
667
2c4ea790
SH
668 if (!fetch_seccomp(name, lxcpath, init_ctx, options))
669 WARN("Failed to get seccomp policy");
670
9c4693b8
CS
671 cwd = getcwd(NULL, 0);
672
673 /* determine which namespaces the container was created with
674 * by asking lxc-start, if necessary
675 */
676 if (options->namespaces == -1) {
677 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
678 /* call failed */
679 if (options->namespaces == -1) {
680 ERROR("failed to automatically determine the "
681 "namespaces which the container unshared");
682 free(cwd);
fe4de9a6 683 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
684 return -1;
685 }
686 }
687
688 /* create a socket pair for IPC communication; set SOCK_CLOEXEC in order
689 * to make sure we don't irritate other threads that want to fork+exec away
690 *
691 * IMPORTANT: if the initial process is multithreaded and another call
692 * just fork()s away without exec'ing directly after, the socket fd will
693 * exist in the forked process from the other thread and any close() in
694 * our own child process will not really cause the socket to close properly,
695 * potentiall causing the parent to hang.
696 *
697 * For this reason, while IPC is still active, we have to use shutdown()
698 * if the child exits prematurely in order to signal that the socket
699 * is closed and cannot assume that the child exiting will automatically
700 * do that.
701 *
702 * IPC mechanism: (X is receiver)
703 * initial process intermediate attached
704 * X <--- send pid of
705 * attached proc,
706 * then exit
707 * send 0 ------------------------------------> X
708 * [do initialization]
709 * X <------------------------------------ send 1
710 * [add to cgroup, ...]
711 * send 2 ------------------------------------> X
712 * close socket close socket
713 * run program
714 */
715 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
716 if (ret < 0) {
717 SYSERROR("could not set up required IPC mechanism for attaching");
718 free(cwd);
fe4de9a6 719 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
720 return -1;
721 }
722
723 /* create intermediate subprocess, three reasons:
724 * 1. runs all pthread_atfork handlers and the
725 * child will no longer be threaded
726 * (we can't properly setns() in a threaded process)
727 * 2. we can't setns() in the child itself, since
728 * we want to make sure we are properly attached to
729 * the pidns
730 * 3. also, the initial thread has to put the attached
731 * process into the cgroup, which we can only do if
732 * we didn't already setns() (otherwise, user
733 * namespaces will hate us)
734 */
735 pid = fork();
736
737 if (pid < 0) {
738 SYSERROR("failed to create first subprocess");
739 free(cwd);
fe4de9a6 740 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
741 return -1;
742 }
743
744 if (pid) {
745 pid_t to_cleanup_pid = pid;
9c4693b8
CS
746
747 /* inital thread, we close the socket that is for the
748 * subprocesses
749 */
750 close(ipc_sockets[1]);
751 free(cwd);
752
f4364484
SG
753 /* attach to cgroup, if requested */
754 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
4fb3cba5 755 if (!cgroup_attach(name, lxcpath, pid))
f4364484 756 goto cleanup_error;
f4364484
SG
757 }
758
759 /* Let the child process know to go ahead */
760 status = 0;
761 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
762 if (ret <= 0) {
763 ERROR("error using IPC to notify attached process for initialization (0)");
764 goto cleanup_error;
765 }
766
9c4693b8
CS
767 /* get pid from intermediate process */
768 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
769 if (ret <= 0) {
770 if (ret != 0)
771 ERROR("error using IPC to receive pid of attached process");
772 goto cleanup_error;
773 }
774
2eef2bda
ÇO
775 /* ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313 */
776 signal(SIGINT, SIG_IGN);
777 signal(SIGQUIT, SIG_IGN);
778
9c4693b8
CS
779 /* reap intermediate process */
780 ret = wait_for_pid(pid);
781 if (ret < 0)
782 goto cleanup_error;
783
784 /* we will always have to reap the grandchild now */
785 to_cleanup_pid = attached_pid;
786
787 /* tell attached process it may start initializing */
788 status = 0;
789 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
790 if (ret <= 0) {
791 ERROR("error using IPC to notify attached process for initialization (0)");
792 goto cleanup_error;
793 }
794
795 /* wait for the attached process to finish initializing */
796 expected = 1;
797 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
798 if (ret <= 0) {
799 if (ret != 0)
800 ERROR("error using IPC to receive notification from attached process (1)");
801 goto cleanup_error;
802 }
803
9c4693b8
CS
804 /* tell attached process we're done */
805 status = 2;
806 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
807 if (ret <= 0) {
808 ERROR("error using IPC to notify attached process for initialization (2)");
809 goto cleanup_error;
810 }
811
812 /* now shut down communication with child, we're done */
813 shutdown(ipc_sockets[0], SHUT_RDWR);
814 close(ipc_sockets[0]);
fe4de9a6 815 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
816
817 /* we're done, the child process should now execute whatever
818 * it is that the user requested. The parent can now track it
819 * with waitpid() or similar.
820 */
821
822 *attached_process = attached_pid;
823 return 0;
824
825 cleanup_error:
826 /* first shut down the socket, then wait for the pid,
827 * otherwise the pid we're waiting for may never exit
828 */
829 shutdown(ipc_sockets[0], SHUT_RDWR);
830 close(ipc_sockets[0]);
831 if (to_cleanup_pid)
832 (void) wait_for_pid(to_cleanup_pid);
fe4de9a6 833 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
834 return -1;
835 }
836
837 /* first subprocess begins here, we close the socket that is for the
838 * initial thread
839 */
840 close(ipc_sockets[0]);
841
f4364484
SG
842 /* Wait for the parent to have setup cgroups */
843 expected = 0;
844 status = -1;
845 ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
846 if (ret <= 0) {
847 ERROR("error communicating with child process");
848 shutdown(ipc_sockets[1], SHUT_RDWR);
849 rexit(-1);
850 }
851
9c4693b8
CS
852 /* attach now, create another subprocess later, since pid namespaces
853 * only really affect the children of the current process
854 */
855 ret = lxc_attach_to_ns(init_pid, options->namespaces);
856 if (ret < 0) {
857 ERROR("failed to enter the namespace");
858 shutdown(ipc_sockets[1], SHUT_RDWR);
859 rexit(-1);
860 }
861
862 /* attach succeeded, try to cwd */
863 if (options->initial_cwd)
864 new_cwd = options->initial_cwd;
865 else
866 new_cwd = cwd;
867 ret = chdir(new_cwd);
868 if (ret < 0)
869 WARN("could not change directory to '%s'", new_cwd);
870 free(cwd);
871
872 /* now create the real child process */
873 {
874 struct attach_clone_payload payload = {
875 .ipc_socket = ipc_sockets[1],
876 .options = options,
877 .init_ctx = init_ctx,
878 .exec_function = exec_function,
879 .exec_payload = exec_payload
880 };
881 /* We use clone_parent here to make this subprocess a direct child of
882 * the initial process. Then this intermediate process can exit and
883 * the parent can directly track the attached process.
884 */
885 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
886 }
887
888 /* shouldn't happen, clone() should always return positive pid */
889 if (pid <= 0) {
890 SYSERROR("failed to create subprocess");
891 shutdown(ipc_sockets[1], SHUT_RDWR);
892 rexit(-1);
893 }
894
895 /* tell grandparent the pid of the pid of the newly created child */
896 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
897 if (ret != sizeof(pid)) {
898 /* if this really happens here, this is very unfortunate, since the
899 * parent will not know the pid of the attached process and will
900 * not be able to wait for it (and we won't either due to CLONE_PARENT)
901 * so the parent won't be able to reap it and the attached process
902 * will remain a zombie
903 */
904 ERROR("error using IPC to notify main process of pid of the attached process");
905 shutdown(ipc_sockets[1], SHUT_RDWR);
906 rexit(-1);
907 }
908
909 /* the rest is in the hands of the initial and the attached process */
910 rexit(0);
911}
912
74a3920a 913static int attach_child_main(void* data)
9c4693b8
CS
914{
915 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
916 int ipc_socket = payload->ipc_socket;
917 lxc_attach_options_t* options = payload->options;
918 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
1a2e58cf 919#if HAVE_SYS_PERSONALITY_H
9c4693b8 920 long new_personality;
1a2e58cf 921#endif
9c4693b8
CS
922 int ret;
923 int status;
924 int expected;
925 long flags;
926 int fd;
927 uid_t new_uid;
928 gid_t new_gid;
929
930 /* wait for the initial thread to signal us that it's ready
931 * for us to start initializing
932 */
933 expected = 0;
934 status = -1;
935 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
936 if (ret <= 0) {
937 ERROR("error using IPC to receive notification from initial process (0)");
938 shutdown(ipc_socket, SHUT_RDWR);
939 rexit(-1);
940 }
941
9c4693b8
CS
942 /* A description of the purpose of this functionality is
943 * provided in the lxc-attach(1) manual page. We have to
944 * remount here and not in the parent process, otherwise
945 * /proc may not properly reflect the new pid namespace.
946 */
947 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
948 ret = lxc_attach_remount_sys_proc();
949 if (ret < 0) {
950 shutdown(ipc_socket, SHUT_RDWR);
951 rexit(-1);
952 }
953 }
954
955 /* now perform additional attachments*/
956#if HAVE_SYS_PERSONALITY_H
957 if (options->personality < 0)
958 new_personality = init_ctx->personality;
959 else
960 new_personality = options->personality;
961
962 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
963 ret = personality(new_personality);
964 if (ret < 0) {
965 SYSERROR("could not ensure correct architecture");
966 shutdown(ipc_socket, SHUT_RDWR);
967 rexit(-1);
968 }
969 }
970#endif
971
972 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
973 ret = lxc_attach_drop_privs(init_ctx);
974 if (ret < 0) {
975 ERROR("could not drop privileges");
976 shutdown(ipc_socket, SHUT_RDWR);
977 rexit(-1);
978 }
979 }
980
981 /* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */
982 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
983 if (ret < 0) {
984 ERROR("could not set initial environment for attached process");
985 shutdown(ipc_socket, SHUT_RDWR);
986 rexit(-1);
987 }
988
989 /* set user / group id */
990 new_uid = 0;
991 new_gid = 0;
992 /* ignore errors, we will fall back to root in that case
993 * (/proc was not mounted etc.)
994 */
995 if (options->namespaces & CLONE_NEWUSER)
996 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
997
998 if (options->uid != (uid_t)-1)
999 new_uid = options->uid;
1000 if (options->gid != (gid_t)-1)
1001 new_gid = options->gid;
1002
1003 /* try to set the uid/gid combination */
c476bdce
SH
1004 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER)) {
1005 if (setgid(new_gid) || setgroups(0, NULL)) {
1006 SYSERROR("switching to container gid");
1007 shutdown(ipc_socket, SHUT_RDWR);
1008 rexit(-1);
1009 }
9c4693b8
CS
1010 }
1011 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
1012 SYSERROR("switching to container uid");
1013 shutdown(ipc_socket, SHUT_RDWR);
1014 rexit(-1);
1015 }
1016
1017 /* tell initial process it may now put us into the cgroups */
1018 status = 1;
1019 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1020 if (ret != sizeof(status)) {
1021 ERROR("error using IPC to notify initial process for initialization (1)");
1022 shutdown(ipc_socket, SHUT_RDWR);
1023 rexit(-1);
1024 }
1025
1026 /* wait for the initial thread to signal us that it has done
1027 * everything for us when it comes to cgroups etc.
1028 */
1029 expected = 2;
1030 status = -1;
1031 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1032 if (ret <= 0) {
1033 ERROR("error using IPC to receive final notification from initial process (2)");
1034 shutdown(ipc_socket, SHUT_RDWR);
1035 rexit(-1);
1036 }
1037
1038 shutdown(ipc_socket, SHUT_RDWR);
1039 close(ipc_socket);
72863294
DE
1040
1041 /* set new apparmor profile/selinux context */
1042 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM)) {
1043 int on_exec;
1044
1045 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
1046 ret = lsm_process_label_set(init_ctx->lsm_label, 0, on_exec);
1047 if (ret < 0) {
1048 rexit(-1);
1049 }
1050 }
2c4ea790
SH
1051
1052 if (init_ctx->container && init_ctx->container->lxc_conf &&
1053 lxc_seccomp_load(init_ctx->container->lxc_conf) != 0) {
1054 ERROR("Loading seccomp policy");
1055 rexit(-1);
1056 }
1057
fe4de9a6 1058 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
1059
1060 /* The following is done after the communication socket is
1061 * shut down. That way, all errors that might (though
1062 * unlikely) occur up until this point will have their messages
1063 * printed to the original stderr (if logging is so configured)
1064 * and not the fd the user supplied, if any.
1065 */
1066
1067 /* fd handling for stdin, stdout and stderr;
1068 * ignore errors here, user may want to make sure
1069 * the fds are closed, for example */
1070 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
1071 dup2(options->stdin_fd, 0);
1072 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
1073 dup2(options->stdout_fd, 1);
1074 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
1075 dup2(options->stderr_fd, 2);
1076
1077 /* close the old fds */
1078 if (options->stdin_fd > 2)
1079 close(options->stdin_fd);
1080 if (options->stdout_fd > 2)
1081 close(options->stdout_fd);
1082 if (options->stderr_fd > 2)
1083 close(options->stderr_fd);
1084
1085 /* try to remove CLOEXEC flag from stdin/stdout/stderr,
1086 * but also here, ignore errors */
1087 for (fd = 0; fd <= 2; fd++) {
1088 flags = fcntl(fd, F_GETFL);
1089 if (flags < 0)
1090 continue;
71b2940d
SG
1091 if (flags & FD_CLOEXEC) {
1092 if (fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC) < 0) {
1093 SYSERROR("Unable to clear CLOEXEC from fd");
1094 }
1095 }
9c4693b8
CS
1096 }
1097
1098 /* we're done, so we can now do whatever the user intended us to do */
1099 rexit(payload->exec_function(payload->exec_payload));
1100}
1101
1102int lxc_attach_run_command(void* payload)
1103{
1104 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1105
1106 execvp(cmd->program, cmd->argv);
1107 SYSERROR("failed to exec '%s'", cmd->program);
1108 return -1;
1109}
1110
/*
 * lxc_attach exec function that runs the login shell of the current user,
 * falling back to /bin/sh. 'payload' is ignored.
 *
 * Only returns (with -1) if no shell could be executed.
 */
int lxc_attach_run_shell(void* payload)
{
	uid_t uid;
	struct passwd *passwd;
	char *user_shell;

	/* ignore payload parameter */
	(void)payload;

	uid = getuid();
	passwd = getpwuid(uid);

	/* this probably happens because of incompatible nss
	 * implementations in host and container (remember, this
	 * code is still using the host's glibc but our mount
	 * namespace is in the container)
	 * we may try to get the information by spawning a
	 * [getent passwd uid] process and parsing the result
	 */
	if (!passwd)
		user_shell = lxc_attach_getpwshell(uid);
	else
		user_shell = passwd->pw_shell;

	/* The argument list of execlp() is variadic, so the terminator has
	 * to be passed as a real null pointer: a bare NULL may be a plain
	 * integer 0 of a different size.
	 */
	if (user_shell)
		execlp(user_shell, user_shell, (char *)NULL);

	/* executed if either no passwd entry or execlp fails,
	 * we will fall back on /bin/sh as a default shell
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);
	SYSERROR("failed to exec shell");
	return -1;
}
1144}