]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/attach.c
attach: call lxc_container_new() earlier
[mirror_lxc.git] / src / lxc / attach.c
CommitLineData
e0732705
CS
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
e0732705
CS
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
e0732705
CS
22 */
23
24#define _GNU_SOURCE
25#include <unistd.h>
26#include <stdio.h>
27#include <string.h>
28#include <stdlib.h>
2eef2bda 29#include <signal.h>
e0732705
CS
30#include <errno.h>
31#include <fcntl.h>
c476bdce 32#include <grp.h>
e0732705
CS
33#include <sys/param.h>
34#include <sys/prctl.h>
7a0b0b56 35#include <sys/mount.h>
5ec27989 36#include <sys/socket.h>
1ba0013f 37#include <sys/syscall.h>
905022f7 38#include <sys/wait.h>
910bb4fa 39#include <linux/unistd.h>
905022f7 40#include <pwd.h>
e0732705
CS
41
42#if !HAVE_DECL_PR_CAPBSET_DROP
43#define PR_CAPBSET_DROP 24
44#endif
45
46#include "namespace.h"
47#include "log.h"
48#include "attach.h"
49#include "caps.h"
e0732705 50#include "config.h"
6a44839f 51#include "utils.h"
9c4693b8
CS
52#include "commands.h"
53#include "cgroup.h"
025ed0f3 54#include "lxclock.h"
2c4ea790
SH
55#include "conf.h"
56#include "lxcseccomp.h"
57#include <lxc/lxccontainer.h>
fe4de9a6 58#include "lsm/lsm.h"
9b8e3c96 59#include "confile.h"
9c4693b8
CS
60
61#if HAVE_SYS_PERSONALITY_H
62#include <sys/personality.h>
63#endif
e0732705 64
a3da2f3b
SG
65#ifndef SOCK_CLOEXEC
66# define SOCK_CLOEXEC 02000000
67#endif
68
d6a3c917
SG
69#ifndef MS_REC
70#define MS_REC 16384
71#endif
72
73#ifndef MS_SLAVE
74#define MS_SLAVE (1<<19)
75#endif
76
e0732705
CS
77lxc_log_define(lxc_attach, lxc);
78
5c3fcae7
SG
/*
 * Write @lsm_label into the calling process' LSM attribute file, which is
 * opened relative to @procfd ("self/attr/exec" when @on_exec is set,
 * "self/attr/current" otherwise).
 *
 * AppArmor labels are written as "changeprofile <label>" and never use the
 * on-exec file; SELinux labels are written verbatim. When lsm_name() reports
 * "nop" or "none" the function succeeds without touching anything.
 *
 * Returns 0 on success and -1 on failure.
 */
int lsm_set_label_at(int procfd, int on_exec, char* lsm_label) {
	const char *lsm = lsm_name();
	const char *attr_path;
	char *request = NULL;
	int is_apparmor;
	int attr_fd;
	int rc = -1;

	/* no LSM in use: nothing to set */
	if (strcmp(lsm, "nop") == 0 || strcmp(lsm, "none") == 0)
		return 0;

	is_apparmor = (strcmp(lsm, "AppArmor") == 0);

	/* We don't support on-exec with AppArmor */
	if (on_exec && !is_apparmor)
		attr_path = "self/attr/exec";
	else
		attr_path = "self/attr/current";

	attr_fd = openat(procfd, attr_path, O_RDWR);
	if (attr_fd < 0) {
		SYSERROR("Unable to open LSM label");
		return -1;
	}

	if (is_apparmor) {
		int len;

		request = malloc(strlen(lsm_label) + strlen("changeprofile ") + 1);
		if (!request) {
			SYSERROR("Failed to write apparmor profile");
			goto done;
		}

		len = sprintf(request, "changeprofile %s", lsm_label);
		if (len < 0) {
			SYSERROR("Failed to write apparmor profile");
			goto done;
		}

		/* the terminating NUL byte is written as well */
		if (write(attr_fd, request, len + 1) < 0) {
			SYSERROR("Unable to set LSM label");
			goto done;
		}
	} else if (strcmp(lsm, "SELinux") == 0) {
		/* the terminating NUL byte is written as well */
		if (write(attr_fd, lsm_label, strlen(lsm_label) + 1) < 0) {
			SYSERROR("Unable to set LSM label");
			goto done;
		}
	} else {
		ERROR("Unable to restore label for unknown LSM: %s", lsm);
		goto done;
	}

	rc = 0;

done:
	free(request);
	close(attr_fd);

	return rc;
}
154
74a3920a 155static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
e0732705
CS
156{
157 struct lxc_proc_context_info *info = calloc(1, sizeof(*info));
158 FILE *proc_file;
159 char proc_fn[MAXPATHLEN];
460a1cf0 160 char *line = NULL;
e0732705 161 size_t line_bufsz = 0;
460a1cf0 162 int ret, found;
e0732705
CS
163
164 if (!info) {
165 SYSERROR("Could not allocate memory.");
166 return NULL;
167 }
168
169 /* read capabilities */
170 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/status", pid);
171
172 proc_file = fopen(proc_fn, "r");
173 if (!proc_file) {
174 SYSERROR("Could not open %s", proc_fn);
175 goto out_error;
176 }
177
178 found = 0;
179 while (getline(&line, &line_bufsz, proc_file) != -1) {
180 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
181 if (ret != EOF && ret > 0) {
182 found = 1;
183 break;
184 }
185 }
186
f10fad2f 187 free(line);
e0732705
CS
188 fclose(proc_file);
189
190 if (!found) {
191 SYSERROR("Could not read capability bounding set from %s", proc_fn);
192 errno = ENOENT;
193 goto out_error;
194 }
195
fe4de9a6 196 info->lsm_label = lsm_process_label_get(pid);
e0732705 197
e0732705
CS
198 return info;
199
200out_error:
460a1cf0 201 free(info);
e0732705
CS
202 return NULL;
203}
204
fe4de9a6
DE
205static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
206{
f10fad2f 207 free(ctx->lsm_label);
2c4ea790
SH
208 if (ctx->container)
209 lxc_container_put(ctx->container);
fe4de9a6
DE
210 free(ctx);
211}
212
/*
 * Move the calling process into the namespaces of process @pid.
 *
 * @which is a mask of CLONE_NEW* flags selecting the namespaces to enter;
 * -1 selects every namespace listed below. All namespace files are opened
 * first and only then entered one by one, in table order.
 *
 * Returns 0 on success, -1 on failure (errno is the one of the failing
 * open()/setns() call when that is what failed).
 */
static int lxc_attach_to_ns(pid_t pid, int which)
{
	/* according to <http://article.gmane.org/gmane.linux.kernel.containers.lxc.devel/1429>,
	 * the file for user namespaces in /proc/$pid/ns will be called
	 * 'user' once the kernel supports it
	 */
	static const char *const ns[] = { "user", "mnt", "pid", "uts", "ipc", "net", "cgroup" };
	static const int flags[] = {
		CLONE_NEWUSER, CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
		CLONE_NEWNET, CLONE_NEWCGROUP
	};
	/* compile-time element count, so fd[] is a plain array, not a VLA */
	enum { LXC_ATTACH_NS_COUNT = sizeof(ns) / sizeof(ns[0]) };
	char path[MAXPATHLEN];
	int fd[LXC_ATTACH_NS_COUNT];
	int i, j, saved_errno;

	snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
	if (access(path, X_OK)) {
		ERROR("Does this kernel version support 'attach' ?");
		return -1;
	}

	for (i = 0; i < LXC_ATTACH_NS_COUNT; i++) {
		/* ignore if we are not supposed to attach to that
		 * namespace; -1 marks the slot as "nothing opened"
		 */
		if (which != -1 && !(which & flags[i])) {
			fd[i] = -1;
			continue;
		}

		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns[i]);
		fd[i] = open(path, O_RDONLY | O_CLOEXEC);
		if (fd[i] < 0) {
			saved_errno = errno;

			/* close all already opened file descriptors before
			 * we return an error, so we don't leak them
			 */
			for (j = 0; j < i; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to open '%s'", path);
			return -1;
		}
	}

	for (i = 0; i < LXC_ATTACH_NS_COUNT; i++) {
		/* skipped namespace: there is no descriptor to use or close */
		if (fd[i] < 0)
			continue;

		if (setns(fd[i], 0) != 0) {
			saved_errno = errno;

			/* close this descriptor and every one not yet used */
			for (j = i; j < LXC_ATTACH_NS_COUNT; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to set namespace '%s'", ns[i]);
			return -1;
		}

		close(fd[i]);
	}

	return 0;
}
279
/*
 * Give the calling process a private mount namespace and mount fresh
 * instances of /proc and (if it was mounted) /sys in it.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_attach_remount_sys_proc(void)
{
	if (unshare(CLONE_NEWNS) < 0) {
		SYSERROR("failed to unshare mount namespace");
		return -1;
	}

	/* best effort only: a failure is logged but does not abort */
	if (detect_shared_rootfs() &&
	    mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		SYSERROR("Failed to make / rslave");
		ERROR("Continuing...");
	}

	/* assume /proc is always mounted, so remount it */
	if (umount2("/proc", MNT_DETACH) < 0) {
		SYSERROR("failed to unmount /proc");
		return -1;
	}

	if (mount("none", "/proc", "proc", 0, NULL) < 0) {
		SYSERROR("failed to remount /proc");
		return -1;
	}

	/* try to umount /sys - if it's not a mount point,
	 * we'll get EINVAL, then we ignore it because it
	 * may not have been mounted in the first place
	 */
	if (umount2("/sys", MNT_DETACH) < 0) {
		if (errno == EINVAL)
			return 0;

		SYSERROR("failed to unmount /sys");
		return -1;
	}

	/* /sys was mounted before, so put a fresh one back */
	if (mount("none", "/sys", "sysfs", 0, NULL) < 0) {
		SYSERROR("failed to remount /sys");
		return -1;
	}

	return 0;
}
329
74a3920a 330static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
e0732705
CS
331{
332 int last_cap = lxc_caps_last_cap();
333 int cap;
334
335 for (cap = 0; cap <= last_cap; cap++) {
336 if (ctx->capability_mask & (1LL << cap))
337 continue;
338
339 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
340 SYSERROR("failed to remove capability id %d", cap);
341 return -1;
342 }
343 }
344
345 return 0;
346}
905022f7 347
74a3920a 348static int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
b3a39ba6 349{
799f96fd 350 if (policy == LXC_ATTACH_CLEAR_ENV) {
3d5e9f48 351 char **extra_keep_store = NULL;
3d5e9f48
CS
352 int path_kept = 0;
353
354 if (extra_keep) {
355 size_t count, i;
356
357 for (count = 0; extra_keep[count]; count++);
358
359 extra_keep_store = calloc(count, sizeof(char *));
360 if (!extra_keep_store) {
361 SYSERROR("failed to allocate memory for storing current "
362 "environment variable values that will be kept");
363 return -1;
364 }
365 for (i = 0; i < count; i++) {
366 char *v = getenv(extra_keep[i]);
367 if (v) {
368 extra_keep_store[i] = strdup(v);
369 if (!extra_keep_store[i]) {
370 SYSERROR("failed to allocate memory for storing current "
371 "environment variable values that will be kept");
372 while (i > 0)
373 free(extra_keep_store[--i]);
374 free(extra_keep_store);
375 return -1;
376 }
377 if (strcmp(extra_keep[i], "PATH") == 0)
378 path_kept = 1;
379 }
380 /* calloc sets entire array to zero, so we don't
381 * need an else */
382 }
383 }
384
799f96fd 385 if (clearenv()) {
a9cab7e3 386 char **p;
799f96fd 387 SYSERROR("failed to clear environment");
a9cab7e3
CS
388 if (extra_keep_store) {
389 for (p = extra_keep_store; *p; p++)
390 free(*p);
391 free(extra_keep_store);
392 }
3d5e9f48
CS
393 return -1;
394 }
395
396 if (extra_keep_store) {
397 size_t i;
398 for (i = 0; extra_keep[i]; i++) {
acd4922e
SG
399 if (extra_keep_store[i]) {
400 if (setenv(extra_keep[i], extra_keep_store[i], 1) < 0)
401 SYSERROR("Unable to set environment variable");
402 }
3d5e9f48
CS
403 free(extra_keep_store[i]);
404 }
405 free(extra_keep_store);
406 }
407
408 /* always set a default path; shells and execlp tend
409 * to be fine without it, but there is a disturbing
410 * number of C programs out there that just assume
411 * that getenv("PATH") is never NULL and then die a
412 * painful segfault death. */
cfa70b88 413 if (!path_kept)
511a6936 414 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
b3a39ba6
DW
415 }
416
417 if (putenv("container=lxc")) {
418 SYSERROR("failed to set environment variable");
419 return -1;
420 }
421
3d5e9f48
CS
422 /* set extra environment variables */
423 if (extra_env) {
424 for (; *extra_env; extra_env++) {
425 /* duplicate the string, just to be on
426 * the safe side, because putenv does not
427 * do it for us */
428 char *p = strdup(*extra_env);
429 /* we just assume the user knows what they
430 * are doing, so we don't do any checks */
431 if (!p) {
432 SYSERROR("failed to allocate memory for additional environment "
433 "variables");
434 return -1;
435 }
436 putenv(p);
437 }
438 }
439
b3a39ba6
DW
440 return 0;
441}
442
/*
 * Detach and return the next ':'-separated field of a passwd line.
 * @cursor is advanced past the field and set to NULL once the last field
 * has been handed out. Unlike strtok_r(), empty fields are returned as
 * empty strings instead of being skipped. Returns NULL when exhausted.
 */
static char *lxc_attach_passwd_next_field(char **cursor)
{
	char *field = *cursor;
	char *sep;

	if (!field)
		return NULL;

	sep = strchr(field, ':');
	if (sep) {
		*sep = '\0';
		*cursor = sep + 1;
	} else {
		*cursor = NULL;
	}

	return field;
}

/*
 * Parse one line of "getent passwd" output (modified in place) and return a
 * malloc'ed copy of the shell field if the line is a well-formed, seven
 * field entry for @uid with a non-empty shell. Returns NULL otherwise.
 */
static char *lxc_attach_passwd_line_shell(char *line, uid_t uid)
{
	char *cursor = line;
	char *field = NULL;
	char *endptr = NULL;
	size_t len;
	long value;
	int i;

	/* trim line on the right hand side */
	len = strlen(line);
	while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
		line[--len] = '\0';

	/* user name, dummy password field, then the user id */
	for (i = 0; i < 3; i++) {
		field = lxc_attach_passwd_next_field(&cursor);
		if (!field)
			return NULL;
	}

	if (!*field)
		return NULL;
	value = strtol(field, &endptr, 10);
	if (*endptr || value == LONG_MIN || value == LONG_MAX)
		return NULL;

	/* dummy sanity check: user id matches */
	if ((uid_t) value != uid)
		return NULL;

	/* skip fields: gid, gecos, dir, go to next field 'shell' */
	for (i = 0; i < 4; i++) {
		field = lxc_attach_passwd_next_field(&cursor);
		if (!field)
			return NULL;
	}

	/* sanity check that there are no fields after that */
	if (cursor)
		return NULL;

	/* an empty shell field gives us nothing to run */
	if (!*field)
		return NULL;

	return strdup(field);
}

/*
 * Look up the login shell of @uid by running "getent passwd <uid>" in a
 * child process and parsing what it prints.
 *
 * Returns a malloc'ed string the caller must free, or NULL if the shell
 * could not be determined (including when getent fails or is missing).
 */
static char *lxc_attach_getpwshell(uid_t uid)
{
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	int status;
	FILE *pipe_f;
	char *line = NULL;
	size_t line_bufsz = 0;
	char *result = NULL;

	/* we need to fork off a process that runs the
	 * getent program, and we need to capture its
	 * output, so we use a pipe for that purpose
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid == 0) {
		/* child process */
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* we want to capture stdout; _exit() is used throughout the
		 * child so that stdio buffers inherited from the parent are
		 * never flushed into the pipe
		 */
		if (dup2(pipes[1], 1) < 0)
			_exit(-1);
		if (pipes[1] != 1)
			close(pipes[1]);

		/* get rid of stdin/stderr, so we try to associate it
		 * with /dev/null
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			/* don't close a standard descriptor we just set up */
			if (fd > 2)
				close(fd);
		}

		/* finish argument list */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0 || (size_t) ret >= sizeof(uid_buf))
			_exit(-1);

		/* try to run getent program */
		(void) execvp("getent", arguments);
		_exit(-1);
	}

	/* parent process */
	close(pipes[1]);

	pipe_f = fdopen(pipes[0], "r");
	if (pipe_f) {
		while (getline(&line, &line_bufsz, pipe_f) != -1) {
			/* if we already found something, just continue
			 * to read until the pipe doesn't deliver any more
			 * data, but don't modify the existing result
			 */
			if (result)
				continue;

			result = lxc_attach_passwd_line_shell(line, uid);
		}

		free(line);
		fclose(pipe_f);
	} else {
		/* nothing can be read, but the child still has to be reaped */
		close(pipes[0]);
	}

again:
	if (waitpid(pid, &status, 0) < 0) {
		if (errno == EINTR)
			goto again;
		goto out_fail;
	}

	/* some sanity checks: if anything even hinted at going
	 * wrong: we can't be sure we have a valid result, so
	 * we assume we don't
	 */
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		goto out_fail;

	return result;

out_fail:
	free(result);
	return NULL;
}
cb3e61fa 595
/*
 * Read the real uid and gid of pid 1 (as seen by the caller) from
 * /proc/1/status.
 *
 * @init_uid and @init_gid are only written for the ids that were actually
 * found; if the file cannot be opened both are left untouched.
 */
static void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
{
	char status_path[MAXPATHLEN];
	FILE *status_file;
	char *buf = NULL;
	size_t buf_cap = 0;
	uid_t found_uid = (uid_t)-1;
	gid_t found_gid = (gid_t)-1;

	/* locate the status file of init */
	snprintf(status_path, MAXPATHLEN, "/proc/%d/status", 1);

	status_file = fopen(status_path, "r");
	if (!status_file)
		return;

	while (getline(&buf, &buf_cap, status_file) != -1) {
		long parsed = -1;

		/* format is: real, effective, saved set user, fs
		 * we only care about the first (real) id
		 */
		if (sscanf(buf, "Uid: %ld", &parsed) > 0)
			found_uid = (uid_t) parsed;
		else if (sscanf(buf, "Gid: %ld", &parsed) > 0)
			found_gid = (gid_t) parsed;

		/* stop reading as soon as both ids are known */
		if (found_uid != (uid_t)-1 && found_gid != (gid_t)-1)
			break;
	}

	free(buf);
	fclose(status_file);

	/* only override arguments if we found something */
	if (found_uid != (uid_t)-1)
		*init_uid = found_uid;
	if (found_gid != (gid_t)-1)
		*init_gid = found_gid;

	/* TODO: we should also parse supplementary groups and use
	 * setgroups() to set them */
}
9c4693b8
CS
642
643struct attach_clone_payload {
644 int ipc_socket;
645 lxc_attach_options_t* options;
646 struct lxc_proc_context_info* init_ctx;
647 lxc_attach_exec_t exec_function;
648 void* exec_payload;
5c3fcae7 649 int procfd;
9c4693b8
CS
650};
651
652static int attach_child_main(void* data);
653
654/* help the optimizer along if it doesn't know that exit always exits */
5dcc1ca6 655#define rexit(c) do { int __c = (c); _exit(__c); return __c; } while(0)
9c4693b8
CS
656
657/* define default options if no options are supplied by the user */
658static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
659
ff07d7bb
CB
660static bool fetch_seccomp(struct lxc_proc_context_info *i,
661 lxc_attach_options_t *options)
2c4ea790
SH
662{
663 struct lxc_container *c;
bd7b4e28 664 char *path;
2eef2bda 665
2c4ea790
SH
666 if (!(options->namespaces & CLONE_NEWNS) || !(options->attach_flags & LXC_ATTACH_LSM))
667 return true;
668
ff07d7bb 669 c = i->container;
bd7b4e28
SG
670
671 /* Initialize an empty lxc_conf */
672 if (!c->set_config_item(c, "lxc.seccomp", "")) {
2c4ea790 673 return false;
bd7b4e28
SG
674 }
675
676 /* Fetch the current profile path over the cmd interface */
677 path = c->get_running_config_item(c, "lxc.seccomp");
678 if (!path) {
679 return true;
680 }
681
682 /* Copy the value into the new lxc_conf */
683 if (!c->set_config_item(c, "lxc.seccomp", path)) {
684 free(path);
685 return false;
686 }
687 free(path);
688
689 /* Attempt to parse the resulting config */
2c4ea790 690 if (lxc_read_seccomp_config(c->lxc_conf) < 0) {
442f5c0f 691 ERROR("Error reading seccomp policy");
2c4ea790
SH
692 return false;
693 }
694
695 return true;
696}
697
9b8e3c96
SH
/*
 * Ask the running container @name (under @lxcpath) for its "lxc.arch"
 * setting and translate it with lxc_config_parse_arch().
 *
 * Returns the parsed value, or -1 if the setting could not be retrieved.
 */
static signed long get_personality(const char *name, const char *lxcpath)
{
	signed long persona = -1;
	char *arch = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);

	if (arch) {
		persona = lxc_config_parse_arch(arch);
		free(arch);
	}

	return persona;
}
709
9c4693b8
CS
710int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
711{
712 int ret, status;
f4364484 713 pid_t init_pid, pid, attached_pid, expected;
9c4693b8
CS
714 struct lxc_proc_context_info *init_ctx;
715 char* cwd;
716 char* new_cwd;
717 int ipc_sockets[2];
5c3fcae7 718 int procfd;
9b8e3c96 719 signed long personality;
9c4693b8
CS
720
721 if (!options)
722 options = &attach_static_default_options;
723
724 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
725 if (init_pid < 0) {
726 ERROR("failed to get the init pid");
727 return -1;
728 }
729
730 init_ctx = lxc_proc_get_context_info(init_pid);
731 if (!init_ctx) {
732 ERROR("failed to get context of the init process, pid = %ld", (long)init_pid);
733 return -1;
734 }
735
9b8e3c96
SH
736 personality = get_personality(name, lxcpath);
737 if (init_ctx->personality < 0) {
738 ERROR("Failed to get personality of the container");
739 lxc_proc_put_context_info(init_ctx);
740 return -1;
741 }
742 init_ctx->personality = personality;
743
ff07d7bb
CB
744 init_ctx->container = lxc_container_new(name, lxcpath);
745 if (!init_ctx->container)
746 return -1;
747
748 if (!fetch_seccomp(init_ctx, options))
2c4ea790
SH
749 WARN("Failed to get seccomp policy");
750
9c4693b8
CS
751 cwd = getcwd(NULL, 0);
752
753 /* determine which namespaces the container was created with
754 * by asking lxc-start, if necessary
755 */
756 if (options->namespaces == -1) {
757 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
758 /* call failed */
759 if (options->namespaces == -1) {
760 ERROR("failed to automatically determine the "
761 "namespaces which the container unshared");
762 free(cwd);
fe4de9a6 763 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
764 return -1;
765 }
766 }
767
768 /* create a socket pair for IPC communication; set SOCK_CLOEXEC in order
769 * to make sure we don't irritate other threads that want to fork+exec away
770 *
771 * IMPORTANT: if the initial process is multithreaded and another call
772 * just fork()s away without exec'ing directly after, the socket fd will
773 * exist in the forked process from the other thread and any close() in
774 * our own child process will not really cause the socket to close properly,
775 * potentiall causing the parent to hang.
776 *
777 * For this reason, while IPC is still active, we have to use shutdown()
778 * if the child exits prematurely in order to signal that the socket
779 * is closed and cannot assume that the child exiting will automatically
780 * do that.
781 *
782 * IPC mechanism: (X is receiver)
783 * initial process intermediate attached
784 * X <--- send pid of
785 * attached proc,
786 * then exit
787 * send 0 ------------------------------------> X
788 * [do initialization]
789 * X <------------------------------------ send 1
790 * [add to cgroup, ...]
791 * send 2 ------------------------------------> X
792 * close socket close socket
793 * run program
794 */
795 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
796 if (ret < 0) {
797 SYSERROR("could not set up required IPC mechanism for attaching");
798 free(cwd);
fe4de9a6 799 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
800 return -1;
801 }
802
803 /* create intermediate subprocess, three reasons:
804 * 1. runs all pthread_atfork handlers and the
805 * child will no longer be threaded
806 * (we can't properly setns() in a threaded process)
807 * 2. we can't setns() in the child itself, since
808 * we want to make sure we are properly attached to
809 * the pidns
810 * 3. also, the initial thread has to put the attached
811 * process into the cgroup, which we can only do if
812 * we didn't already setns() (otherwise, user
813 * namespaces will hate us)
814 */
815 pid = fork();
816
817 if (pid < 0) {
818 SYSERROR("failed to create first subprocess");
819 free(cwd);
fe4de9a6 820 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
821 return -1;
822 }
823
824 if (pid) {
825 pid_t to_cleanup_pid = pid;
9c4693b8 826
ec64264d 827 /* initial thread, we close the socket that is for the
9c4693b8
CS
828 * subprocesses
829 */
830 close(ipc_sockets[1]);
831 free(cwd);
832
f4364484
SG
833 /* attach to cgroup, if requested */
834 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
4fb3cba5 835 if (!cgroup_attach(name, lxcpath, pid))
f4364484 836 goto cleanup_error;
f4364484
SG
837 }
838
839 /* Let the child process know to go ahead */
840 status = 0;
841 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
842 if (ret <= 0) {
843 ERROR("error using IPC to notify attached process for initialization (0)");
844 goto cleanup_error;
845 }
846
9c4693b8
CS
847 /* get pid from intermediate process */
848 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
849 if (ret <= 0) {
850 if (ret != 0)
851 ERROR("error using IPC to receive pid of attached process");
852 goto cleanup_error;
853 }
854
2eef2bda 855 /* ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313 */
62183f1a
SH
856 if (options->stdin_fd == 0) {
857 signal(SIGINT, SIG_IGN);
858 signal(SIGQUIT, SIG_IGN);
859 }
2eef2bda 860
9c4693b8
CS
861 /* reap intermediate process */
862 ret = wait_for_pid(pid);
863 if (ret < 0)
864 goto cleanup_error;
865
866 /* we will always have to reap the grandchild now */
867 to_cleanup_pid = attached_pid;
868
869 /* tell attached process it may start initializing */
870 status = 0;
871 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
872 if (ret <= 0) {
873 ERROR("error using IPC to notify attached process for initialization (0)");
874 goto cleanup_error;
875 }
876
877 /* wait for the attached process to finish initializing */
878 expected = 1;
879 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
880 if (ret <= 0) {
881 if (ret != 0)
882 ERROR("error using IPC to receive notification from attached process (1)");
883 goto cleanup_error;
884 }
885
9c4693b8
CS
886 /* tell attached process we're done */
887 status = 2;
888 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
889 if (ret <= 0) {
890 ERROR("error using IPC to notify attached process for initialization (2)");
891 goto cleanup_error;
892 }
893
894 /* now shut down communication with child, we're done */
895 shutdown(ipc_sockets[0], SHUT_RDWR);
896 close(ipc_sockets[0]);
fe4de9a6 897 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
898
899 /* we're done, the child process should now execute whatever
900 * it is that the user requested. The parent can now track it
901 * with waitpid() or similar.
902 */
903
904 *attached_process = attached_pid;
905 return 0;
906
907 cleanup_error:
908 /* first shut down the socket, then wait for the pid,
909 * otherwise the pid we're waiting for may never exit
910 */
911 shutdown(ipc_sockets[0], SHUT_RDWR);
912 close(ipc_sockets[0]);
913 if (to_cleanup_pid)
914 (void) wait_for_pid(to_cleanup_pid);
fe4de9a6 915 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
916 return -1;
917 }
918
919 /* first subprocess begins here, we close the socket that is for the
920 * initial thread
921 */
922 close(ipc_sockets[0]);
923
f4364484
SG
924 /* Wait for the parent to have setup cgroups */
925 expected = 0;
926 status = -1;
927 ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
928 if (ret <= 0) {
929 ERROR("error communicating with child process");
930 shutdown(ipc_sockets[1], SHUT_RDWR);
931 rexit(-1);
932 }
933
dac862c0 934 if ((options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) && cgns_supported())
935 options->namespaces |= CLONE_NEWCGROUP;
fe3c80af 936
5c3fcae7
SG
937 procfd = open("/proc", O_DIRECTORY | O_RDONLY);
938 if (procfd < 0) {
939 SYSERROR("Unable to open /proc");
940 shutdown(ipc_sockets[1], SHUT_RDWR);
941 rexit(-1);
942 }
943
9c4693b8
CS
944 /* attach now, create another subprocess later, since pid namespaces
945 * only really affect the children of the current process
946 */
947 ret = lxc_attach_to_ns(init_pid, options->namespaces);
948 if (ret < 0) {
949 ERROR("failed to enter the namespace");
950 shutdown(ipc_sockets[1], SHUT_RDWR);
951 rexit(-1);
952 }
953
954 /* attach succeeded, try to cwd */
955 if (options->initial_cwd)
956 new_cwd = options->initial_cwd;
957 else
958 new_cwd = cwd;
959 ret = chdir(new_cwd);
960 if (ret < 0)
961 WARN("could not change directory to '%s'", new_cwd);
962 free(cwd);
963
964 /* now create the real child process */
965 {
966 struct attach_clone_payload payload = {
967 .ipc_socket = ipc_sockets[1],
968 .options = options,
969 .init_ctx = init_ctx,
970 .exec_function = exec_function,
5c3fcae7
SG
971 .exec_payload = exec_payload,
972 .procfd = procfd
9c4693b8
CS
973 };
974 /* We use clone_parent here to make this subprocess a direct child of
975 * the initial process. Then this intermediate process can exit and
976 * the parent can directly track the attached process.
977 */
978 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
979 }
980
981 /* shouldn't happen, clone() should always return positive pid */
982 if (pid <= 0) {
983 SYSERROR("failed to create subprocess");
984 shutdown(ipc_sockets[1], SHUT_RDWR);
985 rexit(-1);
986 }
987
988 /* tell grandparent the pid of the pid of the newly created child */
989 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
990 if (ret != sizeof(pid)) {
991 /* if this really happens here, this is very unfortunate, since the
992 * parent will not know the pid of the attached process and will
993 * not be able to wait for it (and we won't either due to CLONE_PARENT)
994 * so the parent won't be able to reap it and the attached process
995 * will remain a zombie
996 */
997 ERROR("error using IPC to notify main process of pid of the attached process");
998 shutdown(ipc_sockets[1], SHUT_RDWR);
999 rexit(-1);
1000 }
1001
1002 /* the rest is in the hands of the initial and the attached process */
1003 rexit(0);
1004}
1005
74a3920a 1006static int attach_child_main(void* data)
9c4693b8
CS
1007{
1008 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
1009 int ipc_socket = payload->ipc_socket;
5c3fcae7 1010 int procfd = payload->procfd;
9c4693b8
CS
1011 lxc_attach_options_t* options = payload->options;
1012 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
1a2e58cf 1013#if HAVE_SYS_PERSONALITY_H
9c4693b8 1014 long new_personality;
1a2e58cf 1015#endif
9c4693b8
CS
1016 int ret;
1017 int status;
1018 int expected;
1019 long flags;
1020 int fd;
1021 uid_t new_uid;
1022 gid_t new_gid;
1023
1024 /* wait for the initial thread to signal us that it's ready
1025 * for us to start initializing
1026 */
1027 expected = 0;
1028 status = -1;
1029 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1030 if (ret <= 0) {
1031 ERROR("error using IPC to receive notification from initial process (0)");
1032 shutdown(ipc_socket, SHUT_RDWR);
1033 rexit(-1);
1034 }
1035
9c4693b8
CS
1036 /* A description of the purpose of this functionality is
1037 * provided in the lxc-attach(1) manual page. We have to
1038 * remount here and not in the parent process, otherwise
1039 * /proc may not properly reflect the new pid namespace.
1040 */
1041 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
1042 ret = lxc_attach_remount_sys_proc();
1043 if (ret < 0) {
1044 shutdown(ipc_socket, SHUT_RDWR);
1045 rexit(-1);
1046 }
1047 }
1048
1049 /* now perform additional attachments*/
1050#if HAVE_SYS_PERSONALITY_H
1051 if (options->personality < 0)
1052 new_personality = init_ctx->personality;
1053 else
1054 new_personality = options->personality;
1055
1056 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
1057 ret = personality(new_personality);
1058 if (ret < 0) {
1059 SYSERROR("could not ensure correct architecture");
1060 shutdown(ipc_socket, SHUT_RDWR);
1061 rexit(-1);
1062 }
1063 }
1064#endif
1065
1066 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
1067 ret = lxc_attach_drop_privs(init_ctx);
1068 if (ret < 0) {
1069 ERROR("could not drop privileges");
1070 shutdown(ipc_socket, SHUT_RDWR);
1071 rexit(-1);
1072 }
1073 }
1074
1075 /* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */
1076 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
1077 if (ret < 0) {
1078 ERROR("could not set initial environment for attached process");
1079 shutdown(ipc_socket, SHUT_RDWR);
1080 rexit(-1);
1081 }
1082
1083 /* set user / group id */
1084 new_uid = 0;
1085 new_gid = 0;
1086 /* ignore errors, we will fall back to root in that case
1087 * (/proc was not mounted etc.)
1088 */
1089 if (options->namespaces & CLONE_NEWUSER)
1090 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
1091
1092 if (options->uid != (uid_t)-1)
1093 new_uid = options->uid;
1094 if (options->gid != (gid_t)-1)
1095 new_gid = options->gid;
1096
82e28fe0 1097 /* setup the control tty */
d3b63011 1098 if (options->stdin_fd && isatty(options->stdin_fd)) {
82e28fe0
SG
1099 if (setsid() < 0) {
1100 SYSERROR("unable to setsid");
1101 shutdown(ipc_socket, SHUT_RDWR);
1102 rexit(-1);
1103 }
1104
1105 if (ioctl(options->stdin_fd, TIOCSCTTY, (char *)NULL) < 0) {
1106 SYSERROR("unable to TIOCSTTY");
1107 shutdown(ipc_socket, SHUT_RDWR);
1108 rexit(-1);
1109 }
1110 }
1111
9c4693b8 1112 /* try to set the uid/gid combination */
c476bdce
SH
1113 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER)) {
1114 if (setgid(new_gid) || setgroups(0, NULL)) {
1115 SYSERROR("switching to container gid");
1116 shutdown(ipc_socket, SHUT_RDWR);
1117 rexit(-1);
1118 }
9c4693b8
CS
1119 }
1120 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
1121 SYSERROR("switching to container uid");
1122 shutdown(ipc_socket, SHUT_RDWR);
1123 rexit(-1);
1124 }
1125
1126 /* tell initial process it may now put us into the cgroups */
1127 status = 1;
1128 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1129 if (ret != sizeof(status)) {
1130 ERROR("error using IPC to notify initial process for initialization (1)");
1131 shutdown(ipc_socket, SHUT_RDWR);
1132 rexit(-1);
1133 }
1134
1135 /* wait for the initial thread to signal us that it has done
1136 * everything for us when it comes to cgroups etc.
1137 */
1138 expected = 2;
1139 status = -1;
1140 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1141 if (ret <= 0) {
1142 ERROR("error using IPC to receive final notification from initial process (2)");
1143 shutdown(ipc_socket, SHUT_RDWR);
1144 rexit(-1);
1145 }
1146
1147 shutdown(ipc_socket, SHUT_RDWR);
1148 close(ipc_socket);
72863294
DE
1149
1150 /* set new apparmor profile/selinux context */
5c3fcae7 1151 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM) && init_ctx->lsm_label) {
72863294
DE
1152 int on_exec;
1153
1154 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
5c3fcae7 1155 if (lsm_set_label_at(procfd, on_exec, init_ctx->lsm_label) < 0) {
72863294
DE
1156 rexit(-1);
1157 }
1158 }
2c4ea790
SH
1159
1160 if (init_ctx->container && init_ctx->container->lxc_conf &&
1161 lxc_seccomp_load(init_ctx->container->lxc_conf) != 0) {
1162 ERROR("Loading seccomp policy");
1163 rexit(-1);
1164 }
1165
fe4de9a6 1166 lxc_proc_put_context_info(init_ctx);
9c4693b8
CS
1167
1168 /* The following is done after the communication socket is
1169 * shut down. That way, all errors that might (though
1170 * unlikely) occur up until this point will have their messages
1171 * printed to the original stderr (if logging is so configured)
1172 * and not the fd the user supplied, if any.
1173 */
1174
1175 /* fd handling for stdin, stdout and stderr;
1176 * ignore errors here, user may want to make sure
1177 * the fds are closed, for example */
1178 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
1179 dup2(options->stdin_fd, 0);
1180 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
1181 dup2(options->stdout_fd, 1);
1182 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
1183 dup2(options->stderr_fd, 2);
1184
1185 /* close the old fds */
1186 if (options->stdin_fd > 2)
1187 close(options->stdin_fd);
1188 if (options->stdout_fd > 2)
1189 close(options->stdout_fd);
1190 if (options->stderr_fd > 2)
1191 close(options->stderr_fd);
1192
1193 /* try to remove CLOEXEC flag from stdin/stdout/stderr,
1194 * but also here, ignore errors */
1195 for (fd = 0; fd <= 2; fd++) {
1196 flags = fcntl(fd, F_GETFL);
1197 if (flags < 0)
1198 continue;
71b2940d
SG
1199 if (flags & FD_CLOEXEC) {
1200 if (fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC) < 0) {
1201 SYSERROR("Unable to clear CLOEXEC from fd");
1202 }
1203 }
9c4693b8
CS
1204 }
1205
5c3fcae7
SG
1206 /* we don't need proc anymore */
1207 close(procfd);
1208
9c4693b8
CS
1209 /* we're done, so we can now do whatever the user intended us to do */
1210 rexit(payload->exec_function(payload->exec_payload));
1211}
1212
1213int lxc_attach_run_command(void* payload)
1214{
1215 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1216
1217 execvp(cmd->program, cmd->argv);
1218 SYSERROR("failed to exec '%s'", cmd->program);
1219 return -1;
1220}
1221
/*
 * lxc_attach_run_shell: exec the login shell of the calling user.
 *
 * @payload: unused.
 *
 * The shell is taken from the passwd entry of the current real uid. If
 * that exec is not possible, /bin/sh is tried instead. On success the
 * process image is replaced and this function does not return; if every
 * exec attempt fails, the failure is logged and -1 is returned.
 */
int lxc_attach_run_shell(void* payload)
{
	uid_t current_uid = getuid();
	struct passwd *pw_entry = getpwuid(current_uid);
	char *shell;

	/* the payload argument is intentionally not used */
	(void)payload;

	/* A missing passwd entry is probably caused by incompatible nss
	 * implementations in host and container (this code still runs
	 * with the host's glibc while the mount namespace is already the
	 * container's). In that case fall back to
	 * lxc_attach_getpwshell(), which is meant to obtain the shell by
	 * other means (a [getent passwd uid] process whose output is
	 * parsed).
	 */
	shell = pw_entry ? pw_entry->pw_shell
			 : lxc_attach_getpwshell(current_uid);

	if (shell)
		execlp(shell, shell, (char *)NULL);

	/* Reached if no shell could be determined or if execlp() of the
	 * user's shell failed: use /bin/sh as the default shell.
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);

	SYSERROR("failed to exec shell");
	return -1;
}