]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/attach.c
always check whether rootfs is shared
[mirror_lxc.git] / src / lxc / attach.c
CommitLineData
e0732705
CS
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
e0732705
CS
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e0732705
CS
22 */
23
24#define _GNU_SOURCE
25#include <unistd.h>
26#include <stdio.h>
27#include <string.h>
28#include <stdlib.h>
29#include <errno.h>
30#include <fcntl.h>
c476bdce 31#include <grp.h>
e0732705
CS
32#include <sys/param.h>
33#include <sys/prctl.h>
7a0b0b56 34#include <sys/mount.h>
5ec27989 35#include <sys/socket.h>
1ba0013f 36#include <sys/syscall.h>
905022f7 37#include <sys/wait.h>
910bb4fa 38#include <linux/unistd.h>
905022f7 39#include <pwd.h>
e0732705
CS
40
41#if !HAVE_DECL_PR_CAPBSET_DROP
42#define PR_CAPBSET_DROP 24
43#endif
44
45#include "namespace.h"
46#include "log.h"
47#include "attach.h"
48#include "caps.h"
e0732705 49#include "config.h"
6a44839f 50#include "utils.h"
9c4693b8
CS
51#include "commands.h"
52#include "cgroup.h"
025ed0f3 53#include "lxclock.h"
2c4ea790
SH
54#include "conf.h"
55#include "lxcseccomp.h"
56#include <lxc/lxccontainer.h>
fe4de9a6 57#include "lsm/lsm.h"
9c4693b8
CS
58
59#if HAVE_SYS_PERSONALITY_H
60#include <sys/personality.h>
61#endif
e0732705 62
a3da2f3b
SG
63#ifndef SOCK_CLOEXEC
64# define SOCK_CLOEXEC 02000000
65#endif
66
e0732705
CS
67lxc_log_define(lxc_attach, lxc);
68
74a3920a 69static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
e0732705
CS
70{
71 struct lxc_proc_context_info *info = calloc(1, sizeof(*info));
72 FILE *proc_file;
73 char proc_fn[MAXPATHLEN];
460a1cf0 74 char *line = NULL;
e0732705 75 size_t line_bufsz = 0;
460a1cf0 76 int ret, found;
e0732705
CS
77
78 if (!info) {
79 SYSERROR("Could not allocate memory.");
80 return NULL;
81 }
82
83 /* read capabilities */
84 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/status", pid);
85
86 proc_file = fopen(proc_fn, "r");
87 if (!proc_file) {
88 SYSERROR("Could not open %s", proc_fn);
89 goto out_error;
90 }
91
92 found = 0;
93 while (getline(&line, &line_bufsz, proc_file) != -1) {
94 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
95 if (ret != EOF && ret > 0) {
96 found = 1;
97 break;
98 }
99 }
100
fa9ac567
SH
101 if (line)
102 free(line);
e0732705
CS
103 fclose(proc_file);
104
105 if (!found) {
106 SYSERROR("Could not read capability bounding set from %s", proc_fn);
107 errno = ENOENT;
108 goto out_error;
109 }
110
111 /* read personality */
112 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/personality", pid);
113
114 proc_file = fopen(proc_fn, "r");
115 if (!proc_file) {
116 SYSERROR("Could not open %s", proc_fn);
117 goto out_error;
118 }
119
120 ret = fscanf(proc_file, "%lx", &info->personality);
121 fclose(proc_file);
122
123 if (ret == EOF || ret == 0) {
124 SYSERROR("Could not read personality from %s", proc_fn);
125 errno = ENOENT;
126 goto out_error;
127 }
fe4de9a6 128 info->lsm_label = lsm_process_label_get(pid);
e0732705 129
e0732705
CS
130 return info;
131
132out_error:
460a1cf0 133 free(info);
e0732705
CS
134 return NULL;
135}
136
fe4de9a6
DE
137static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
138{
139 if (ctx->lsm_label)
140 free(ctx->lsm_label);
2c4ea790
SH
141 if (ctx->container)
142 lxc_container_put(ctx->container);
fe4de9a6
DE
143 free(ctx);
144}
145
/*
 * Move the calling process into the namespaces of process 'pid'.
 *
 * 'which' is either -1 (attach to every namespace listed below) or a mask
 * of CLONE_NEW* flags selecting the namespaces to attach to.
 *
 * All namespace files are opened first and only then entered with setns(),
 * in the order of the ns[] table.
 *
 * Returns 0 on success, -1 on failure with errno describing the failing
 * open()/setns() call. No file descriptors are left open on either path.
 */
static int lxc_attach_to_ns(pid_t pid, int which)
{
	char path[MAXPATHLEN];
	/* according to <http://article.gmane.org/gmane.linux.kernel.containers.lxc.devel/1429>,
	 * the file for user namespaces in /proc/$pid/ns will be called
	 * 'user' once the kernel supports it
	 */
	static char *ns[] = { "user", "mnt", "pid", "uts", "ipc", "net" };
	static int flags[] = {
		CLONE_NEWUSER, CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
		CLONE_NEWNET
	};
	/* an enum constant is a true constant expression, so fd[] below is
	 * an ordinary fixed-size array rather than a VLA */
	enum { size = sizeof(ns) / sizeof(ns[0]) };
	int fd[size];
	int i, j, saved_errno;

	snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
	if (access(path, X_OK)) {
		ERROR("Does this kernel version support 'attach' ?");
		return -1;
	}

	for (i = 0; i < size; i++) {
		/* ignore if we are not supposed to attach to that
		 * namespace
		 */
		if (which != -1 && !(which & flags[i])) {
			fd[i] = -1;
			continue;
		}

		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns[i]);
		fd[i] = open(path, O_RDONLY | O_CLOEXEC);
		if (fd[i] < 0) {
			saved_errno = errno;

			/* close all already opened file descriptors before
			 * we return an error, so we don't leak them;
			 * skipped namespaces hold -1 and are not closed
			 */
			for (j = 0; j < i; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to open '%s'", path);
			return -1;
		}
	}

	for (i = 0; i < size; i++) {
		if (fd[i] < 0)
			continue;

		if (setns(fd[i], 0) != 0) {
			saved_errno = errno;

			/* close this descriptor and every one not yet
			 * consumed by the loop */
			for (j = i; j < size; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to set namespace '%s'", ns[i]);
			return -1;
		}

		close(fd[i]);
	}

	return 0;
}
212
/*
 * Give the calling process a private mount namespace and mount fresh
 * instances of /proc and (if it was mounted) /sys in it.
 *
 * Returns 0 on success and -1 on failure. A failure to make "/" rslave is
 * logged but does not abort the operation.
 */
static int lxc_attach_remount_sys_proc(void)
{
	int rc;

	if (unshare(CLONE_NEWNS) < 0) {
		SYSERROR("failed to unshare mount namespace");
		return -1;
	}

	/* best effort only: log and carry on if this fails */
	if (detect_shared_rootfs() &&
	    mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		SYSERROR("Failed to make / rslave");
		ERROR("Continuing...");
	}

	/* /proc is assumed to always be mounted, so replace it */
	if (umount2("/proc", MNT_DETACH) < 0) {
		SYSERROR("failed to unmount /proc");
		return -1;
	}

	if (mount("none", "/proc", "proc", 0, NULL) < 0) {
		SYSERROR("failed to remount /proc");
		return -1;
	}

	/* /sys may not have been mounted in the first place; umount2()
	 * reports that as EINVAL, which is not treated as an error
	 */
	rc = umount2("/sys", MNT_DETACH);
	if (rc < 0 && errno != EINVAL) {
		SYSERROR("failed to unmount /sys");
		return -1;
	}

	/* only mount a new sysfs if an old one was actually detached */
	if (rc == 0 && mount("none", "/sys", "sysfs", 0, NULL) < 0) {
		SYSERROR("failed to remount /sys");
		return -1;
	}

	return 0;
}
262
74a3920a 263static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
e0732705
CS
264{
265 int last_cap = lxc_caps_last_cap();
266 int cap;
267
268 for (cap = 0; cap <= last_cap; cap++) {
269 if (ctx->capability_mask & (1LL << cap))
270 continue;
271
272 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
273 SYSERROR("failed to remove capability id %d", cap);
274 return -1;
275 }
276 }
277
278 return 0;
279}
905022f7 280
74a3920a 281static int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
b3a39ba6 282{
799f96fd 283 if (policy == LXC_ATTACH_CLEAR_ENV) {
3d5e9f48 284 char **extra_keep_store = NULL;
3d5e9f48
CS
285 int path_kept = 0;
286
287 if (extra_keep) {
288 size_t count, i;
289
290 for (count = 0; extra_keep[count]; count++);
291
292 extra_keep_store = calloc(count, sizeof(char *));
293 if (!extra_keep_store) {
294 SYSERROR("failed to allocate memory for storing current "
295 "environment variable values that will be kept");
296 return -1;
297 }
298 for (i = 0; i < count; i++) {
299 char *v = getenv(extra_keep[i]);
300 if (v) {
301 extra_keep_store[i] = strdup(v);
302 if (!extra_keep_store[i]) {
303 SYSERROR("failed to allocate memory for storing current "
304 "environment variable values that will be kept");
305 while (i > 0)
306 free(extra_keep_store[--i]);
307 free(extra_keep_store);
308 return -1;
309 }
310 if (strcmp(extra_keep[i], "PATH") == 0)
311 path_kept = 1;
312 }
313 /* calloc sets entire array to zero, so we don't
314 * need an else */
315 }
316 }
317
799f96fd 318 if (clearenv()) {
a9cab7e3 319 char **p;
799f96fd 320 SYSERROR("failed to clear environment");
a9cab7e3
CS
321 if (extra_keep_store) {
322 for (p = extra_keep_store; *p; p++)
323 free(*p);
324 free(extra_keep_store);
325 }
3d5e9f48
CS
326 return -1;
327 }
328
329 if (extra_keep_store) {
330 size_t i;
331 for (i = 0; extra_keep[i]; i++) {
acd4922e
SG
332 if (extra_keep_store[i]) {
333 if (setenv(extra_keep[i], extra_keep_store[i], 1) < 0)
334 SYSERROR("Unable to set environment variable");
335 }
3d5e9f48
CS
336 free(extra_keep_store[i]);
337 }
338 free(extra_keep_store);
339 }
340
341 /* always set a default path; shells and execlp tend
342 * to be fine without it, but there is a disturbing
343 * number of C programs out there that just assume
344 * that getenv("PATH") is never NULL and then die a
345 * painful segfault death. */
346 if (!path_kept) {
511a6936
SG
347#ifdef HAVE_CONFSTR
348 size_t n;
349 char *path_env;
350
3d5e9f48
CS
351 n = confstr(_CS_PATH, NULL, 0);
352 path_env = malloc(n);
353 if (path_env) {
354 confstr(_CS_PATH, path_env, n);
355 setenv("PATH", path_env, 1);
356 free(path_env);
357 }
358 /* don't error out, this is just an extra service */
511a6936
SG
359#else
360 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
361#endif
799f96fd 362 }
b3a39ba6
DW
363 }
364
365 if (putenv("container=lxc")) {
366 SYSERROR("failed to set environment variable");
367 return -1;
368 }
369
3d5e9f48
CS
370 /* set extra environment variables */
371 if (extra_env) {
372 for (; *extra_env; extra_env++) {
373 /* duplicate the string, just to be on
374 * the safe side, because putenv does not
375 * do it for us */
376 char *p = strdup(*extra_env);
377 /* we just assume the user knows what they
378 * are doing, so we don't do any checks */
379 if (!p) {
380 SYSERROR("failed to allocate memory for additional environment "
381 "variables");
382 return -1;
383 }
384 putenv(p);
385 }
386 }
387
b3a39ba6
DW
388 return 0;
389}
390
/*
 * Determine the login shell of 'uid' by running "getent passwd <uid>" in a
 * child process and parsing the passwd-format line it prints.
 *
 * Returns a heap-allocated string the caller has to free(), or NULL if the
 * helper could not be run, did not exit with status 0, or printed no line
 * matching 'uid' with a shell as its last field.
 */
static char *lxc_attach_getpwshell(uid_t uid)
{
	/* local variables */
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	char *result = NULL;

	/* we need to fork off a process that runs the
	 * getent program, and we need to capture its
	 * output, so we use a pipe for that purpose
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid) {
		/* parent process */
		FILE *pipe_f;
		char *line = NULL;
		size_t line_bufsz = 0;
		int found = 0;
		int status;

		close(pipes[1]);

		pipe_f = fdopen(pipes[0], "r");
		if (!pipe_f) {
			/* no stream to read from: drop the read end and go
			 * straight to reaping the child; 'found' stays 0 */
			close(pipes[0]);
		} else {
			while (getline(&line, &line_bufsz, pipe_f) != -1) {
				char *token;
				char *saveptr = NULL;
				long value;
				char *endptr = NULL;
				size_t len;
				int i;

				/* if we already found something, just continue
				 * to read until the pipe doesn't deliver any more
				 * data, but don't modify the existing data
				 * structure
				 */
				if (found)
					continue;

				/* trim line on the right hand side */
				for (len = strlen(line); len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'); --len)
					line[len - 1] = '\0';

				/* split into tokens: first user name */
				token = strtok_r(line, ":", &saveptr);
				if (!token)
					continue;
				/* next: dummy password field */
				token = strtok_r(NULL, ":", &saveptr);
				if (!token)
					continue;
				/* next: user id */
				token = strtok_r(NULL, ":", &saveptr);
				value = token ? strtol(token, &endptr, 10) : 0;
				if (!token || !endptr || *endptr || value == LONG_MIN || value == LONG_MAX)
					continue;
				/* dummy sanity check: user id matches */
				if ((uid_t) value != uid)
					continue;
				/* skip fields: gid, gecos, dir, go to next field 'shell' */
				for (i = 0; i < 4; i++) {
					token = strtok_r(NULL, ":", &saveptr);
					if (!token)
						break;
				}
				if (!token)
					continue;
				/* drop a candidate left over from a rejected line */
				free(result);
				result = strdup(token);

				/* sanity check that there are no fields after that */
				token = strtok_r(NULL, ":", &saveptr);
				if (token)
					continue;

				found = 1;
			}

			free(line);
			fclose(pipe_f);
		}

		/* reap the helper, retrying when interrupted by a signal */
		while (waitpid(pid, &status, 0) < 0) {
			if (errno != EINTR)
				goto out_fail;
		}

		/* some sanity checks: if anything even hinted at going
		 * wrong: we can't be sure we have a valid result, so
		 * we assume we don't
		 */
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 || !found)
			goto out_fail;

		return result;

	out_fail:
		/* a candidate may have been stored before the failure was
		 * detected; do not leak it */
		free(result);
		return NULL;
	} else {
		/* child process */
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* we want to capture stdout */
		dup2(pipes[1], 1);
		close(pipes[1]);

		/* get rid of stdin/stderr, so we try to associate it
		 * with /dev/null
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			close(fd);
		}

		/* finish argument list */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0)
			exit(-1);

		/* try to run getent program */
		(void) execvp("getent", arguments);
		exit(-1);
	}
}
cb3e61fa 544
/*
 * Look up the real uid and gid of pid 1 as reported by the "Uid:" and
 * "Gid:" lines of /proc/1/status.
 *
 * *init_uid and *init_gid are only overwritten with values that were
 * actually found; if the file cannot be opened or a line is missing, the
 * corresponding argument is left untouched.
 */
static void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
{
	char status_path[MAXPATHLEN];
	FILE *status_file;
	char *buf = NULL;
	size_t buf_size = 0;
	long parsed = -1;
	uid_t found_uid = (uid_t)-1;
	gid_t found_gid = (gid_t)-1;

	/* status file of the init process */
	snprintf(status_path, MAXPATHLEN, "/proc/%d/status", 1);

	status_file = fopen(status_path, "r");
	if (!status_file)
		return;

	while (getline(&buf, &buf_size, status_file) != -1) {
		/* each line lists: real, effective, saved set, fs id;
		 * only the first (real) id is of interest. EOF is negative,
		 * so "> 0" alone identifies a successful conversion.
		 */
		if (sscanf(buf, "Uid: %ld", &parsed) > 0)
			found_uid = (uid_t) parsed;
		else if (sscanf(buf, "Gid: %ld", &parsed) > 0)
			found_gid = (gid_t) parsed;

		/* stop reading once both ids are known */
		if (found_uid != (uid_t)-1 && found_gid != (gid_t)-1)
			break;
	}

	fclose(status_file);
	free(buf);

	/* only override arguments if we found something */
	if (found_uid != (uid_t)-1)
		*init_uid = found_uid;
	if (found_gid != (gid_t)-1)
		*init_gid = found_gid;

	/* TODO: we should also parse supplementary groups and use
	 * setgroups() to set them */
}
9c4693b8
CS
591
592struct attach_clone_payload {
593 int ipc_socket;
594 lxc_attach_options_t* options;
595 struct lxc_proc_context_info* init_ctx;
596 lxc_attach_exec_t exec_function;
597 void* exec_payload;
598};
599
600static int attach_child_main(void* data);
601
602/* help the optimizer along if it doesn't know that exit always exits */
603#define rexit(c) do { int __c = (c); exit(__c); return __c; } while(0)
604
605/* define default options if no options are supplied by the user */
606static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
607
2c4ea790
SH
608static bool fetch_seccomp(const char *name, const char *lxcpath,
609 struct lxc_proc_context_info *i, lxc_attach_options_t *options)
610{
611 struct lxc_container *c;
612
613 if (!(options->namespaces & CLONE_NEWNS) || !(options->attach_flags & LXC_ATTACH_LSM))
614 return true;
615
616 c = lxc_container_new(name, lxcpath);
617 if (!c)
618 return false;
619 i->container = c;
620 if (!c->lxc_conf)
621 return false;
622 if (lxc_read_seccomp_config(c->lxc_conf) < 0) {
442f5c0f 623 ERROR("Error reading seccomp policy");
2c4ea790
SH
624 return false;
625 }
626
627 return true;
628}
629
9c4693b8
CS
630int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
631{
632 int ret, status;
f4364484 633 pid_t init_pid, pid, attached_pid, expected;
9c4693b8
CS
634 struct lxc_proc_context_info *init_ctx;
635 char* cwd;
636 char* new_cwd;
637 int ipc_sockets[2];
638
639 if (!options)
640 options = &attach_static_default_options;
641
642 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
643 if (init_pid < 0) {
644 ERROR("failed to get the init pid");
645 return -1;
646 }
647
648 init_ctx = lxc_proc_get_context_info(init_pid);
649 if (!init_ctx) {
650 ERROR("failed to get context of the init process, pid = %ld", (long)init_pid);
651 return -1;
652 }
653
2c4ea790
SH
654 if (!fetch_seccomp(name, lxcpath, init_ctx, options))
655 WARN("Failed to get seccomp policy");
656
9c4693b8
CS
657 cwd = getcwd(NULL, 0);
658
659 /* determine which namespaces the container was created with
660 * by asking lxc-start, if necessary
661 */
662 if (options->namespaces == -1) {
663 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
664 /* call failed */
665 if (options->namespaces == -1) {
666 ERROR("failed to automatically determine the "
667 "namespaces which the container unshared");
668 free(cwd);
fe4de9a6 669 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
670 return -1;
671 }
672 }
673
674 /* create a socket pair for IPC communication; set SOCK_CLOEXEC in order
675 * to make sure we don't irritate other threads that want to fork+exec away
676 *
677 * IMPORTANT: if the initial process is multithreaded and another call
678 * just fork()s away without exec'ing directly after, the socket fd will
679 * exist in the forked process from the other thread and any close() in
680 * our own child process will not really cause the socket to close properly,
681 * potentiall causing the parent to hang.
682 *
683 * For this reason, while IPC is still active, we have to use shutdown()
684 * if the child exits prematurely in order to signal that the socket
685 * is closed and cannot assume that the child exiting will automatically
686 * do that.
687 *
688 * IPC mechanism: (X is receiver)
689 * initial process intermediate attached
690 * X <--- send pid of
691 * attached proc,
692 * then exit
693 * send 0 ------------------------------------> X
694 * [do initialization]
695 * X <------------------------------------ send 1
696 * [add to cgroup, ...]
697 * send 2 ------------------------------------> X
698 * close socket close socket
699 * run program
700 */
701 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
702 if (ret < 0) {
703 SYSERROR("could not set up required IPC mechanism for attaching");
704 free(cwd);
fe4de9a6 705 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
706 return -1;
707 }
708
709 /* create intermediate subprocess, three reasons:
710 * 1. runs all pthread_atfork handlers and the
711 * child will no longer be threaded
712 * (we can't properly setns() in a threaded process)
713 * 2. we can't setns() in the child itself, since
714 * we want to make sure we are properly attached to
715 * the pidns
716 * 3. also, the initial thread has to put the attached
717 * process into the cgroup, which we can only do if
718 * we didn't already setns() (otherwise, user
719 * namespaces will hate us)
720 */
721 pid = fork();
722
723 if (pid < 0) {
724 SYSERROR("failed to create first subprocess");
725 free(cwd);
fe4de9a6 726 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
727 return -1;
728 }
729
730 if (pid) {
731 pid_t to_cleanup_pid = pid;
9c4693b8
CS
732
733 /* inital thread, we close the socket that is for the
734 * subprocesses
735 */
736 close(ipc_sockets[1]);
737 free(cwd);
738
f4364484
SG
739 /* attach to cgroup, if requested */
740 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
4fb3cba5 741 if (!cgroup_attach(name, lxcpath, pid))
f4364484 742 goto cleanup_error;
f4364484
SG
743 }
744
745 /* Let the child process know to go ahead */
746 status = 0;
747 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
748 if (ret <= 0) {
749 ERROR("error using IPC to notify attached process for initialization (0)");
750 goto cleanup_error;
751 }
752
9c4693b8
CS
753 /* get pid from intermediate process */
754 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
755 if (ret <= 0) {
756 if (ret != 0)
757 ERROR("error using IPC to receive pid of attached process");
758 goto cleanup_error;
759 }
760
761 /* reap intermediate process */
762 ret = wait_for_pid(pid);
763 if (ret < 0)
764 goto cleanup_error;
765
766 /* we will always have to reap the grandchild now */
767 to_cleanup_pid = attached_pid;
768
769 /* tell attached process it may start initializing */
770 status = 0;
771 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
772 if (ret <= 0) {
773 ERROR("error using IPC to notify attached process for initialization (0)");
774 goto cleanup_error;
775 }
776
777 /* wait for the attached process to finish initializing */
778 expected = 1;
779 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
780 if (ret <= 0) {
781 if (ret != 0)
782 ERROR("error using IPC to receive notification from attached process (1)");
783 goto cleanup_error;
784 }
785
9c4693b8
CS
786 /* tell attached process we're done */
787 status = 2;
788 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
789 if (ret <= 0) {
790 ERROR("error using IPC to notify attached process for initialization (2)");
791 goto cleanup_error;
792 }
793
794 /* now shut down communication with child, we're done */
795 shutdown(ipc_sockets[0], SHUT_RDWR);
796 close(ipc_sockets[0]);
fe4de9a6 797 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
798
799 /* we're done, the child process should now execute whatever
800 * it is that the user requested. The parent can now track it
801 * with waitpid() or similar.
802 */
803
804 *attached_process = attached_pid;
805 return 0;
806
807 cleanup_error:
808 /* first shut down the socket, then wait for the pid,
809 * otherwise the pid we're waiting for may never exit
810 */
811 shutdown(ipc_sockets[0], SHUT_RDWR);
812 close(ipc_sockets[0]);
813 if (to_cleanup_pid)
814 (void) wait_for_pid(to_cleanup_pid);
fe4de9a6 815 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
816 return -1;
817 }
818
819 /* first subprocess begins here, we close the socket that is for the
820 * initial thread
821 */
822 close(ipc_sockets[0]);
823
f4364484
SG
824 /* Wait for the parent to have setup cgroups */
825 expected = 0;
826 status = -1;
827 ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
828 if (ret <= 0) {
829 ERROR("error communicating with child process");
830 shutdown(ipc_sockets[1], SHUT_RDWR);
831 rexit(-1);
832 }
833
9c4693b8
CS
834 /* attach now, create another subprocess later, since pid namespaces
835 * only really affect the children of the current process
836 */
837 ret = lxc_attach_to_ns(init_pid, options->namespaces);
838 if (ret < 0) {
839 ERROR("failed to enter the namespace");
840 shutdown(ipc_sockets[1], SHUT_RDWR);
841 rexit(-1);
842 }
843
844 /* attach succeeded, try to cwd */
845 if (options->initial_cwd)
846 new_cwd = options->initial_cwd;
847 else
848 new_cwd = cwd;
849 ret = chdir(new_cwd);
850 if (ret < 0)
851 WARN("could not change directory to '%s'", new_cwd);
852 free(cwd);
853
854 /* now create the real child process */
855 {
856 struct attach_clone_payload payload = {
857 .ipc_socket = ipc_sockets[1],
858 .options = options,
859 .init_ctx = init_ctx,
860 .exec_function = exec_function,
861 .exec_payload = exec_payload
862 };
863 /* We use clone_parent here to make this subprocess a direct child of
864 * the initial process. Then this intermediate process can exit and
865 * the parent can directly track the attached process.
866 */
867 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
868 }
869
870 /* shouldn't happen, clone() should always return positive pid */
871 if (pid <= 0) {
872 SYSERROR("failed to create subprocess");
873 shutdown(ipc_sockets[1], SHUT_RDWR);
874 rexit(-1);
875 }
876
877 /* tell grandparent the pid of the pid of the newly created child */
878 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
879 if (ret != sizeof(pid)) {
880 /* if this really happens here, this is very unfortunate, since the
881 * parent will not know the pid of the attached process and will
882 * not be able to wait for it (and we won't either due to CLONE_PARENT)
883 * so the parent won't be able to reap it and the attached process
884 * will remain a zombie
885 */
886 ERROR("error using IPC to notify main process of pid of the attached process");
887 shutdown(ipc_sockets[1], SHUT_RDWR);
888 rexit(-1);
889 }
890
891 /* the rest is in the hands of the initial and the attached process */
892 rexit(0);
893}
894
74a3920a 895static int attach_child_main(void* data)
9c4693b8
CS
896{
897 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
898 int ipc_socket = payload->ipc_socket;
899 lxc_attach_options_t* options = payload->options;
900 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
1a2e58cf 901#if HAVE_SYS_PERSONALITY_H
9c4693b8 902 long new_personality;
1a2e58cf 903#endif
9c4693b8
CS
904 int ret;
905 int status;
906 int expected;
907 long flags;
908 int fd;
909 uid_t new_uid;
910 gid_t new_gid;
911
912 /* wait for the initial thread to signal us that it's ready
913 * for us to start initializing
914 */
915 expected = 0;
916 status = -1;
917 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
918 if (ret <= 0) {
919 ERROR("error using IPC to receive notification from initial process (0)");
920 shutdown(ipc_socket, SHUT_RDWR);
921 rexit(-1);
922 }
923
9c4693b8
CS
924 /* A description of the purpose of this functionality is
925 * provided in the lxc-attach(1) manual page. We have to
926 * remount here and not in the parent process, otherwise
927 * /proc may not properly reflect the new pid namespace.
928 */
929 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
930 ret = lxc_attach_remount_sys_proc();
931 if (ret < 0) {
932 shutdown(ipc_socket, SHUT_RDWR);
933 rexit(-1);
934 }
935 }
936
937 /* now perform additional attachments*/
938#if HAVE_SYS_PERSONALITY_H
939 if (options->personality < 0)
940 new_personality = init_ctx->personality;
941 else
942 new_personality = options->personality;
943
944 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
945 ret = personality(new_personality);
946 if (ret < 0) {
947 SYSERROR("could not ensure correct architecture");
948 shutdown(ipc_socket, SHUT_RDWR);
949 rexit(-1);
950 }
951 }
952#endif
953
954 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
955 ret = lxc_attach_drop_privs(init_ctx);
956 if (ret < 0) {
957 ERROR("could not drop privileges");
958 shutdown(ipc_socket, SHUT_RDWR);
959 rexit(-1);
960 }
961 }
962
963 /* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */
964 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
965 if (ret < 0) {
966 ERROR("could not set initial environment for attached process");
967 shutdown(ipc_socket, SHUT_RDWR);
968 rexit(-1);
969 }
970
971 /* set user / group id */
972 new_uid = 0;
973 new_gid = 0;
974 /* ignore errors, we will fall back to root in that case
975 * (/proc was not mounted etc.)
976 */
977 if (options->namespaces & CLONE_NEWUSER)
978 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
979
980 if (options->uid != (uid_t)-1)
981 new_uid = options->uid;
982 if (options->gid != (gid_t)-1)
983 new_gid = options->gid;
984
985 /* try to set the uid/gid combination */
c476bdce
SH
986 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER)) {
987 if (setgid(new_gid) || setgroups(0, NULL)) {
988 SYSERROR("switching to container gid");
989 shutdown(ipc_socket, SHUT_RDWR);
990 rexit(-1);
991 }
9c4693b8
CS
992 }
993 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
994 SYSERROR("switching to container uid");
995 shutdown(ipc_socket, SHUT_RDWR);
996 rexit(-1);
997 }
998
999 /* tell initial process it may now put us into the cgroups */
1000 status = 1;
1001 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1002 if (ret != sizeof(status)) {
1003 ERROR("error using IPC to notify initial process for initialization (1)");
1004 shutdown(ipc_socket, SHUT_RDWR);
1005 rexit(-1);
1006 }
1007
1008 /* wait for the initial thread to signal us that it has done
1009 * everything for us when it comes to cgroups etc.
1010 */
1011 expected = 2;
1012 status = -1;
1013 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1014 if (ret <= 0) {
1015 ERROR("error using IPC to receive final notification from initial process (2)");
1016 shutdown(ipc_socket, SHUT_RDWR);
1017 rexit(-1);
1018 }
1019
1020 shutdown(ipc_socket, SHUT_RDWR);
1021 close(ipc_socket);
72863294
DE
1022
1023 /* set new apparmor profile/selinux context */
1024 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM)) {
1025 int on_exec;
1026
1027 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
1028 ret = lsm_process_label_set(init_ctx->lsm_label, 0, on_exec);
1029 if (ret < 0) {
1030 rexit(-1);
1031 }
1032 }
2c4ea790
SH
1033
1034 if (init_ctx->container && init_ctx->container->lxc_conf &&
1035 lxc_seccomp_load(init_ctx->container->lxc_conf) != 0) {
1036 ERROR("Loading seccomp policy");
1037 rexit(-1);
1038 }
1039
fe4de9a6 1040 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
1041
1042 /* The following is done after the communication socket is
1043 * shut down. That way, all errors that might (though
1044 * unlikely) occur up until this point will have their messages
1045 * printed to the original stderr (if logging is so configured)
1046 * and not the fd the user supplied, if any.
1047 */
1048
1049 /* fd handling for stdin, stdout and stderr;
1050 * ignore errors here, user may want to make sure
1051 * the fds are closed, for example */
1052 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
1053 dup2(options->stdin_fd, 0);
1054 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
1055 dup2(options->stdout_fd, 1);
1056 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
1057 dup2(options->stderr_fd, 2);
1058
1059 /* close the old fds */
1060 if (options->stdin_fd > 2)
1061 close(options->stdin_fd);
1062 if (options->stdout_fd > 2)
1063 close(options->stdout_fd);
1064 if (options->stderr_fd > 2)
1065 close(options->stderr_fd);
1066
1067 /* try to remove CLOEXEC flag from stdin/stdout/stderr,
1068 * but also here, ignore errors */
1069 for (fd = 0; fd <= 2; fd++) {
1070 flags = fcntl(fd, F_GETFL);
1071 if (flags < 0)
1072 continue;
71b2940d
SG
1073 if (flags & FD_CLOEXEC) {
1074 if (fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC) < 0) {
1075 SYSERROR("Unable to clear CLOEXEC from fd");
1076 }
1077 }
9c4693b8
CS
1078 }
1079
1080 /* we're done, so we can now do whatever the user intended us to do */
1081 rexit(payload->exec_function(payload->exec_payload));
1082}
1083
1084int lxc_attach_run_command(void* payload)
1085{
1086 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1087
1088 execvp(cmd->program, cmd->argv);
1089 SYSERROR("failed to exec '%s'", cmd->program);
1090 return -1;
1091}
1092
/*
 * Exec function for lxc_attach(): start the login shell of the current
 * user, falling back to /bin/sh. 'payload' is ignored.
 *
 * Only returns (with -1) if neither shell could be executed.
 */
int lxc_attach_run_shell(void* payload)
{
	uid_t uid;
	struct passwd *passwd;
	char *user_shell;
	/* set only when the shell string was heap-allocated for us */
	char *allocated_shell = NULL;

	/* ignore payload parameter */
	(void)payload;

	uid = getuid();
	passwd = getpwuid(uid);

	/* this probably happens because of incompatible nss
	 * implementations in host and container (remember, this
	 * code is still using the host's glibc but our mount
	 * namespace is in the container)
	 * we may try to get the information by spawning a
	 * [getent passwd uid] process and parsing the result
	 */
	if (!passwd) {
		allocated_shell = lxc_attach_getpwshell(uid);
		user_shell = allocated_shell;
	} else {
		user_shell = passwd->pw_shell;
	}

	/* the argument list of a variadic exec call has to end in a null
	 * pointer of type char *, hence the cast */
	if (user_shell)
		execlp(user_shell, user_shell, (char *)NULL);

	/* still here: either no passwd entry or execlp failed; release the
	 * string from lxc_attach_getpwshell() before trying the fallback */
	free(allocated_shell);

	/* we will fall back on /bin/sh as a default shell */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);
	SYSERROR("failed to exec shell");
	return -1;
}