]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/attach.c
Merge git://github.com/lxc/lxc
[mirror_lxc.git] / src / lxc / attach.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include <unistd.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include <fcntl.h>
31 #include <sys/param.h>
32 #include <sys/prctl.h>
33 #include <sys/mount.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/wait.h>
37 #include <linux/unistd.h>
38 #include <pwd.h>
39
40 #if !HAVE_DECL_PR_CAPBSET_DROP
41 #define PR_CAPBSET_DROP 24
42 #endif
43
44 #include "namespace.h"
45 #include "log.h"
46 #include "attach.h"
47 #include "caps.h"
48 #include "config.h"
49 #include "apparmor.h"
50 #include "utils.h"
51 #include "commands.h"
52 #include "cgroup.h"
53
54 #if HAVE_SYS_PERSONALITY_H
55 #include <sys/personality.h>
56 #endif
57
58 #ifndef SOCK_CLOEXEC
59 # define SOCK_CLOEXEC 02000000
60 #endif
61
62 lxc_log_define(lxc_attach, lxc);
63
64 struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
65 {
66 struct lxc_proc_context_info *info = calloc(1, sizeof(*info));
67 FILE *proc_file;
68 char proc_fn[MAXPATHLEN];
69 char *line = NULL;
70 size_t line_bufsz = 0;
71 int ret, found;
72
73 if (!info) {
74 SYSERROR("Could not allocate memory.");
75 return NULL;
76 }
77
78 /* read capabilities */
79 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/status", pid);
80
81 proc_file = fopen(proc_fn, "r");
82 if (!proc_file) {
83 SYSERROR("Could not open %s", proc_fn);
84 goto out_error;
85 }
86
87 found = 0;
88 while (getline(&line, &line_bufsz, proc_file) != -1) {
89 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
90 if (ret != EOF && ret > 0) {
91 found = 1;
92 break;
93 }
94 }
95
96 if (line)
97 free(line);
98 fclose(proc_file);
99
100 if (!found) {
101 SYSERROR("Could not read capability bounding set from %s", proc_fn);
102 errno = ENOENT;
103 goto out_error;
104 }
105
106 /* read personality */
107 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/personality", pid);
108
109 proc_file = fopen(proc_fn, "r");
110 if (!proc_file) {
111 SYSERROR("Could not open %s", proc_fn);
112 goto out_error;
113 }
114
115 ret = fscanf(proc_file, "%lx", &info->personality);
116 fclose(proc_file);
117
118 if (ret == EOF || ret == 0) {
119 SYSERROR("Could not read personality from %s", proc_fn);
120 errno = ENOENT;
121 goto out_error;
122 }
123 info->aa_profile = aa_get_profile(pid);
124
125 return info;
126
127 out_error:
128 free(info);
129 return NULL;
130 }
131
/*
 * Enter the namespaces of process `pid`.
 *
 * which: bitmask of CLONE_NEW* flags selecting the namespaces to enter,
 *        or -1 to enter every namespace listed in ns[] below.
 *
 * All selected /proc/<pid>/ns/<name> files are opened first; setns() is
 * then called on them in table order.
 *
 * Returns 0 on success and -1 on failure. On failure every descriptor
 * that is still open is closed.
 */
int lxc_attach_to_ns(pid_t pid, int which)
{
	char path[MAXPATHLEN];
	/* according to <http://article.gmane.org/gmane.linux.kernel.containers.lxc.devel/1429>,
	 * the file for user namespaces in /proc/$pid/ns will be called
	 * 'user' once the kernel supports it
	 */
	static char *ns[] = { "mnt", "pid", "uts", "ipc", "user", "net" };
	static int flags[] = {
		CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
		CLONE_NEWUSER, CLONE_NEWNET
	};
	/* a true compile-time constant, so fd[] is a plain array, not a VLA */
	enum { size = sizeof(ns) / sizeof(ns[0]) };
	int fd[size];
	int i, j, saved_errno;

	snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
	if (access(path, X_OK)) {
		ERROR("Does this kernel version support 'attach' ?");
		return -1;
	}

	for (i = 0; i < size; i++) {
		/* ignore if we are not supposed to attach to that
		 * namespace
		 */
		if (which != -1 && !(which & flags[i])) {
			fd[i] = -1;
			continue;
		}

		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns[i]);
		fd[i] = open(path, O_RDONLY | O_CLOEXEC);
		if (fd[i] < 0) {
			saved_errno = errno;

			/* close all already opened file descriptors before
			 * we return an error, so we don't leak them;
			 * skipped namespaces hold -1 and must not be closed
			 */
			for (j = 0; j < i; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to open '%s'", path);
			return -1;
		}
	}

	for (i = 0; i < size; i++) {
		/* namespace was not selected */
		if (fd[i] < 0)
			continue;

		if (setns(fd[i], 0) != 0) {
			saved_errno = errno;

			/* descriptors before index i were closed already */
			for (j = i; j < size; j++) {
				if (fd[j] >= 0)
					close(fd[j]);
			}

			errno = saved_errno;
			SYSERROR("failed to set namespace '%s'", ns[i]);
			return -1;
		}

		close(fd[i]);
	}

	return 0;
}
198
/*
 * Unshare the mount namespace and mount fresh instances of /proc and,
 * if it was mounted, /sys.
 *
 * Returns 0 on success, -1 on failure.
 */
int lxc_attach_remount_sys_proc()
{
	if (unshare(CLONE_NEWNS) < 0) {
		SYSERROR("failed to unshare mount namespace");
		return -1;
	}

	/* /proc is assumed to be mounted: detach it and mount a new one */
	if (umount2("/proc", MNT_DETACH) < 0) {
		SYSERROR("failed to unmount /proc");
		return -1;
	}

	if (mount("none", "/proc", "proc", 0, NULL) < 0) {
		SYSERROR("failed to remount /proc");
		return -1;
	}

	/* /sys is optional: EINVAL from umount2() means it is not a
	 * mount point, in which case there is nothing to remount
	 */
	if (umount2("/sys", MNT_DETACH) < 0) {
		if (errno == EINVAL)
			return 0;

		SYSERROR("failed to unmount /sys");
		return -1;
	}

	if (mount("none", "/sys", "sysfs", 0, NULL) < 0) {
		SYSERROR("failed to remount /sys");
		return -1;
	}

	return 0;
}
241
242 int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
243 {
244 int last_cap = lxc_caps_last_cap();
245 int cap;
246
247 for (cap = 0; cap <= last_cap; cap++) {
248 if (ctx->capability_mask & (1LL << cap))
249 continue;
250
251 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
252 SYSERROR("failed to remove capability id %d", cap);
253 return -1;
254 }
255 }
256
257 return 0;
258 }
259
260 int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
261 {
262 if (policy == LXC_ATTACH_CLEAR_ENV) {
263 char **extra_keep_store = NULL;
264 int path_kept = 0;
265
266 if (extra_keep) {
267 size_t count, i;
268
269 for (count = 0; extra_keep[count]; count++);
270
271 extra_keep_store = calloc(count, sizeof(char *));
272 if (!extra_keep_store) {
273 SYSERROR("failed to allocate memory for storing current "
274 "environment variable values that will be kept");
275 return -1;
276 }
277 for (i = 0; i < count; i++) {
278 char *v = getenv(extra_keep[i]);
279 if (v) {
280 extra_keep_store[i] = strdup(v);
281 if (!extra_keep_store[i]) {
282 SYSERROR("failed to allocate memory for storing current "
283 "environment variable values that will be kept");
284 while (i > 0)
285 free(extra_keep_store[--i]);
286 free(extra_keep_store);
287 return -1;
288 }
289 if (strcmp(extra_keep[i], "PATH") == 0)
290 path_kept = 1;
291 }
292 /* calloc sets entire array to zero, so we don't
293 * need an else */
294 }
295 }
296
297 if (clearenv()) {
298 char **p;
299 SYSERROR("failed to clear environment");
300 if (extra_keep_store) {
301 for (p = extra_keep_store; *p; p++)
302 free(*p);
303 free(extra_keep_store);
304 }
305 return -1;
306 }
307
308 if (extra_keep_store) {
309 size_t i;
310 for (i = 0; extra_keep[i]; i++) {
311 if (extra_keep_store[i])
312 setenv(extra_keep[i], extra_keep_store[i], 1);
313 free(extra_keep_store[i]);
314 }
315 free(extra_keep_store);
316 }
317
318 /* always set a default path; shells and execlp tend
319 * to be fine without it, but there is a disturbing
320 * number of C programs out there that just assume
321 * that getenv("PATH") is never NULL and then die a
322 * painful segfault death. */
323 if (!path_kept) {
324 #ifdef HAVE_CONFSTR
325 size_t n;
326 char *path_env;
327
328 n = confstr(_CS_PATH, NULL, 0);
329 path_env = malloc(n);
330 if (path_env) {
331 confstr(_CS_PATH, path_env, n);
332 setenv("PATH", path_env, 1);
333 free(path_env);
334 }
335 /* don't error out, this is just an extra service */
336 #else
337 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
338 #endif
339 }
340 }
341
342 if (putenv("container=lxc")) {
343 SYSERROR("failed to set environment variable");
344 return -1;
345 }
346
347 /* set extra environment variables */
348 if (extra_env) {
349 for (; *extra_env; extra_env++) {
350 /* duplicate the string, just to be on
351 * the safe side, because putenv does not
352 * do it for us */
353 char *p = strdup(*extra_env);
354 /* we just assume the user knows what they
355 * are doing, so we don't do any checks */
356 if (!p) {
357 SYSERROR("failed to allocate memory for additional environment "
358 "variables");
359 return -1;
360 }
361 putenv(p);
362 }
363 }
364
365 return 0;
366 }
367
/*
 * Determine the login shell of `uid` by running "getent passwd <uid>" in a
 * child process and parsing the passwd line it prints.
 *
 * Returns a malloc()ed string the caller has to free(), or NULL if the
 * child could not be run, did not exit with status 0, or printed no
 * well-formed entry for `uid`.
 *
 * NOTE(review): fields are split with strtok_r(), which skips empty
 * fields; an entry with an empty field before the shell (e.g. an empty
 * gecos) is therefore not recognised.
 */
char *lxc_attach_getpwshell(uid_t uid)
{
	/* local variables */
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	char *result = NULL;

	/* we need to fork off a process that runs the
	 * getent program, and we need to capture its
	 * output, so we use a pipe for that purpose
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid) {
		/* parent process */
		FILE *pipe_f;
		char *line = NULL;
		size_t line_bufsz = 0;
		int found = 0;
		int status;

		close(pipes[1]);

		pipe_f = fdopen(pipes[0], "r");
		if (!pipe_f) {
			/* nothing can be read without a stream; close the
			 * read end ourselves and still reap the child below */
			close(pipes[0]);
		}

		while (pipe_f && getline(&line, &line_bufsz, pipe_f) != -1) {
			char *token;
			char *saveptr = NULL;
			long value;
			char *endptr = NULL;
			int i;

			/* if we already found something, just continue
			 * to read until the pipe doesn't deliver any more
			 * data, but don't modify the existing data
			 * structure
			 */
			if (found)
				continue;

			/* trim line on the right hand side */
			for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
				line[i - 1] = '\0';

			/* split into tokens: first user name */
			token = strtok_r(line, ":", &saveptr);
			if (!token)
				continue;
			/* next: dummy password field */
			token = strtok_r(NULL, ":", &saveptr);
			if (!token)
				continue;
			/* next: user id */
			token = strtok_r(NULL, ":", &saveptr);
			value = token ? strtol(token, &endptr, 10) : 0;
			if (!token || !endptr || *endptr || value == LONG_MIN || value == LONG_MAX)
				continue;
			/* dummy sanity check: user id matches */
			if ((uid_t) value != uid)
				continue;
			/* skip fields: gid, gecos, dir, go to next field 'shell' */
			for (i = 0; i < 4; i++) {
				token = strtok_r(NULL, ":", &saveptr);
				if (!token)
					break;
			}
			if (!token)
				continue;

			/* replace a candidate left over from an earlier,
			 * rejected line */
			free(result);
			result = strdup(token);
			if (!result)
				continue;

			/* sanity check that there are no fields after that */
			token = strtok_r(NULL, ":", &saveptr);
			if (token)
				continue;

			found = 1;
		}

		free(line);
		if (pipe_f)
			fclose(pipe_f);
	again:
		if (waitpid(pid, &status, 0) < 0) {
			if (errno == EINTR)
				goto again;
			free(result);
			return NULL;
		}

		/* some sanity checks: if anything even hinted at going
		 * wrong: we can't be sure we have a valid result, so
		 * we assume we don't - and release whatever was parsed
		 */
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 || !found) {
			free(result);
			return NULL;
		}

		return result;
	} else {
		/* child process */
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* we want to capture stdout */
		dup2(pipes[1], 1);
		close(pipes[1]);

		/* get rid of stdin/stderr, so we try to associate it
		 * with /dev/null
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			close(fd);
		}

		/* finish argument list */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0)
			exit(-1);

		/* try to run getent program */
		(void) execvp("getent", arguments);
		exit(-1);
	}
}
521
/*
 * Look up the uid and gid of pid 1 as seen from the calling process.
 *
 * /proc/1/status is scanned for its "Uid:" and "Gid:" lines and the
 * first number on each line is taken. *init_uid and *init_gid are only
 * written when the corresponding value was found; if the file cannot be
 * opened both are left untouched.
 */
void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
{
	const uid_t uid_unset = (uid_t)-1;
	const gid_t gid_unset = (gid_t)-1;
	char status_path[MAXPATHLEN];
	FILE *status_file;
	char *buf = NULL;
	size_t buf_size = 0;
	long parsed = -1;
	uid_t found_uid = uid_unset;
	gid_t found_gid = gid_unset;

	/* status file of the init process */
	snprintf(status_path, MAXPATHLEN, "/proc/%d/status", 1);

	status_file = fopen(status_path, "r");
	if (!status_file)
		return;

	for (;;) {
		if (getline(&buf, &buf_size, status_file) == -1)
			break;

		/* format is: real, effective, saved set user, fs
		 * only the first (real) id is of interest
		 */
		if (sscanf(buf, "Uid: %ld", &parsed) > 0)
			found_uid = (uid_t) parsed;
		else if (sscanf(buf, "Gid: %ld", &parsed) > 0)
			found_gid = (gid_t) parsed;

		/* stop reading once both ids are known */
		if (found_uid != uid_unset && found_gid != gid_unset)
			break;
	}

	free(buf);
	fclose(status_file);

	/* report only what was actually found */
	if (found_uid != uid_unset)
		*init_uid = found_uid;
	if (found_gid != gid_unset)
		*init_gid = found_gid;

	/* TODO: we should also parse supplementary groups and use
	 * setgroups() to set them */
}
568
569 struct attach_clone_payload {
570 int ipc_socket;
571 lxc_attach_options_t* options;
572 struct lxc_proc_context_info* init_ctx;
573 lxc_attach_exec_t exec_function;
574 void* exec_payload;
575 };
576
577 static int attach_child_main(void* data);
578
579 /* help the optimizer along if it doesn't know that exit always exits */
580 #define rexit(c) do { int __c = (c); exit(__c); return __c; } while(0)
581
582 /* define default options if no options are supplied by the user */
583 static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
584
585 int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
586 {
587 int ret, status;
588 pid_t init_pid, pid, attached_pid;
589 struct lxc_proc_context_info *init_ctx;
590 char* cwd;
591 char* new_cwd;
592 int ipc_sockets[2];
593
594 if (!options)
595 options = &attach_static_default_options;
596
597 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
598 if (init_pid < 0) {
599 ERROR("failed to get the init pid");
600 return -1;
601 }
602
603 init_ctx = lxc_proc_get_context_info(init_pid);
604 if (!init_ctx) {
605 ERROR("failed to get context of the init process, pid = %ld", (long)init_pid);
606 return -1;
607 }
608
609 cwd = getcwd(NULL, 0);
610
611 /* determine which namespaces the container was created with
612 * by asking lxc-start, if necessary
613 */
614 if (options->namespaces == -1) {
615 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
616 /* call failed */
617 if (options->namespaces == -1) {
618 ERROR("failed to automatically determine the "
619 "namespaces which the container unshared");
620 free(cwd);
621 free(init_ctx->aa_profile);
622 free(init_ctx);
623 return -1;
624 }
625 }
626
627 /* create a socket pair for IPC communication; set SOCK_CLOEXEC in order
628 * to make sure we don't irritate other threads that want to fork+exec away
629 *
630 * IMPORTANT: if the initial process is multithreaded and another call
631 * just fork()s away without exec'ing directly after, the socket fd will
632 * exist in the forked process from the other thread and any close() in
633 * our own child process will not really cause the socket to close properly,
634 * potentiall causing the parent to hang.
635 *
636 * For this reason, while IPC is still active, we have to use shutdown()
637 * if the child exits prematurely in order to signal that the socket
638 * is closed and cannot assume that the child exiting will automatically
639 * do that.
640 *
641 * IPC mechanism: (X is receiver)
642 * initial process intermediate attached
643 * X <--- send pid of
644 * attached proc,
645 * then exit
646 * send 0 ------------------------------------> X
647 * [do initialization]
648 * X <------------------------------------ send 1
649 * [add to cgroup, ...]
650 * send 2 ------------------------------------> X
651 * close socket close socket
652 * run program
653 */
654 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
655 if (ret < 0) {
656 SYSERROR("could not set up required IPC mechanism for attaching");
657 free(cwd);
658 free(init_ctx->aa_profile);
659 free(init_ctx);
660 return -1;
661 }
662
663 /* create intermediate subprocess, three reasons:
664 * 1. runs all pthread_atfork handlers and the
665 * child will no longer be threaded
666 * (we can't properly setns() in a threaded process)
667 * 2. we can't setns() in the child itself, since
668 * we want to make sure we are properly attached to
669 * the pidns
670 * 3. also, the initial thread has to put the attached
671 * process into the cgroup, which we can only do if
672 * we didn't already setns() (otherwise, user
673 * namespaces will hate us)
674 */
675 pid = fork();
676
677 if (pid < 0) {
678 SYSERROR("failed to create first subprocess");
679 free(cwd);
680 free(init_ctx->aa_profile);
681 free(init_ctx);
682 return -1;
683 }
684
685 if (pid) {
686 pid_t to_cleanup_pid = pid;
687 int expected = 0;
688
689 /* inital thread, we close the socket that is for the
690 * subprocesses
691 */
692 close(ipc_sockets[1]);
693 free(cwd);
694
695 /* get pid from intermediate process */
696 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
697 if (ret <= 0) {
698 if (ret != 0)
699 ERROR("error using IPC to receive pid of attached process");
700 goto cleanup_error;
701 }
702
703 /* reap intermediate process */
704 ret = wait_for_pid(pid);
705 if (ret < 0)
706 goto cleanup_error;
707
708 /* we will always have to reap the grandchild now */
709 to_cleanup_pid = attached_pid;
710
711 /* tell attached process it may start initializing */
712 status = 0;
713 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
714 if (ret <= 0) {
715 ERROR("error using IPC to notify attached process for initialization (0)");
716 goto cleanup_error;
717 }
718
719 /* wait for the attached process to finish initializing */
720 expected = 1;
721 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
722 if (ret <= 0) {
723 if (ret != 0)
724 ERROR("error using IPC to receive notification from attached process (1)");
725 goto cleanup_error;
726 }
727
728 /* attach to cgroup, if requested */
729 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
730 ret = lxc_cgroup_attach(attached_pid, name, lxcpath);
731 if (ret < 0) {
732 ERROR("could not move attached process %ld to cgroup of container", (long)attached_pid);
733 goto cleanup_error;
734 }
735 }
736
737 /* tell attached process we're done */
738 status = 2;
739 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
740 if (ret <= 0) {
741 ERROR("error using IPC to notify attached process for initialization (2)");
742 goto cleanup_error;
743 }
744
745 /* now shut down communication with child, we're done */
746 shutdown(ipc_sockets[0], SHUT_RDWR);
747 close(ipc_sockets[0]);
748 free(init_ctx->aa_profile);
749 free(init_ctx);
750
751 /* we're done, the child process should now execute whatever
752 * it is that the user requested. The parent can now track it
753 * with waitpid() or similar.
754 */
755
756 *attached_process = attached_pid;
757 return 0;
758
759 cleanup_error:
760 /* first shut down the socket, then wait for the pid,
761 * otherwise the pid we're waiting for may never exit
762 */
763 shutdown(ipc_sockets[0], SHUT_RDWR);
764 close(ipc_sockets[0]);
765 if (to_cleanup_pid)
766 (void) wait_for_pid(to_cleanup_pid);
767 free(init_ctx->aa_profile);
768 free(init_ctx);
769 return -1;
770 }
771
772 /* first subprocess begins here, we close the socket that is for the
773 * initial thread
774 */
775 close(ipc_sockets[0]);
776
777 /* attach now, create another subprocess later, since pid namespaces
778 * only really affect the children of the current process
779 */
780 ret = lxc_attach_to_ns(init_pid, options->namespaces);
781 if (ret < 0) {
782 ERROR("failed to enter the namespace");
783 shutdown(ipc_sockets[1], SHUT_RDWR);
784 rexit(-1);
785 }
786
787 /* attach succeeded, try to cwd */
788 if (options->initial_cwd)
789 new_cwd = options->initial_cwd;
790 else
791 new_cwd = cwd;
792 ret = chdir(new_cwd);
793 if (ret < 0)
794 WARN("could not change directory to '%s'", new_cwd);
795 free(cwd);
796
797 /* now create the real child process */
798 {
799 struct attach_clone_payload payload = {
800 .ipc_socket = ipc_sockets[1],
801 .options = options,
802 .init_ctx = init_ctx,
803 .exec_function = exec_function,
804 .exec_payload = exec_payload
805 };
806 /* We use clone_parent here to make this subprocess a direct child of
807 * the initial process. Then this intermediate process can exit and
808 * the parent can directly track the attached process.
809 */
810 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
811 }
812
813 /* shouldn't happen, clone() should always return positive pid */
814 if (pid <= 0) {
815 SYSERROR("failed to create subprocess");
816 shutdown(ipc_sockets[1], SHUT_RDWR);
817 rexit(-1);
818 }
819
820 /* tell grandparent the pid of the pid of the newly created child */
821 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
822 if (ret != sizeof(pid)) {
823 /* if this really happens here, this is very unfortunate, since the
824 * parent will not know the pid of the attached process and will
825 * not be able to wait for it (and we won't either due to CLONE_PARENT)
826 * so the parent won't be able to reap it and the attached process
827 * will remain a zombie
828 */
829 ERROR("error using IPC to notify main process of pid of the attached process");
830 shutdown(ipc_sockets[1], SHUT_RDWR);
831 rexit(-1);
832 }
833
834 /* the rest is in the hands of the initial and the attached process */
835 rexit(0);
836 }
837
838 int attach_child_main(void* data)
839 {
840 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
841 int ipc_socket = payload->ipc_socket;
842 lxc_attach_options_t* options = payload->options;
843 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
844 #if HAVE_SYS_PERSONALITY_H
845 long new_personality;
846 #endif
847 int ret;
848 int status;
849 int expected;
850 long flags;
851 int fd;
852 uid_t new_uid;
853 gid_t new_gid;
854
855 /* wait for the initial thread to signal us that it's ready
856 * for us to start initializing
857 */
858 expected = 0;
859 status = -1;
860 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
861 if (ret <= 0) {
862 ERROR("error using IPC to receive notification from initial process (0)");
863 shutdown(ipc_socket, SHUT_RDWR);
864 rexit(-1);
865 }
866
867 /* load apparmor profile */
868 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_APPARMOR)) {
869 ret = attach_apparmor(init_ctx->aa_profile);
870 if (ret < 0) {
871 shutdown(ipc_socket, SHUT_RDWR);
872 rexit(-1);
873 }
874 }
875
876 /* A description of the purpose of this functionality is
877 * provided in the lxc-attach(1) manual page. We have to
878 * remount here and not in the parent process, otherwise
879 * /proc may not properly reflect the new pid namespace.
880 */
881 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
882 ret = lxc_attach_remount_sys_proc();
883 if (ret < 0) {
884 shutdown(ipc_socket, SHUT_RDWR);
885 rexit(-1);
886 }
887 }
888
889 /* now perform additional attachments*/
890 #if HAVE_SYS_PERSONALITY_H
891 if (options->personality < 0)
892 new_personality = init_ctx->personality;
893 else
894 new_personality = options->personality;
895
896 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
897 ret = personality(new_personality);
898 if (ret < 0) {
899 SYSERROR("could not ensure correct architecture");
900 shutdown(ipc_socket, SHUT_RDWR);
901 rexit(-1);
902 }
903 }
904 #endif
905
906 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
907 ret = lxc_attach_drop_privs(init_ctx);
908 if (ret < 0) {
909 ERROR("could not drop privileges");
910 shutdown(ipc_socket, SHUT_RDWR);
911 rexit(-1);
912 }
913 }
914
915 /* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */
916 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
917 if (ret < 0) {
918 ERROR("could not set initial environment for attached process");
919 shutdown(ipc_socket, SHUT_RDWR);
920 rexit(-1);
921 }
922
923 /* set user / group id */
924 new_uid = 0;
925 new_gid = 0;
926 /* ignore errors, we will fall back to root in that case
927 * (/proc was not mounted etc.)
928 */
929 if (options->namespaces & CLONE_NEWUSER)
930 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
931
932 if (options->uid != (uid_t)-1)
933 new_uid = options->uid;
934 if (options->gid != (gid_t)-1)
935 new_gid = options->gid;
936
937 /* try to set the uid/gid combination */
938 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER) && setgid(new_gid)) {
939 SYSERROR("switching to container gid");
940 shutdown(ipc_socket, SHUT_RDWR);
941 rexit(-1);
942 }
943 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
944 SYSERROR("switching to container uid");
945 shutdown(ipc_socket, SHUT_RDWR);
946 rexit(-1);
947 }
948
949 /* tell initial process it may now put us into the cgroups */
950 status = 1;
951 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
952 if (ret != sizeof(status)) {
953 ERROR("error using IPC to notify initial process for initialization (1)");
954 shutdown(ipc_socket, SHUT_RDWR);
955 rexit(-1);
956 }
957
958 /* wait for the initial thread to signal us that it has done
959 * everything for us when it comes to cgroups etc.
960 */
961 expected = 2;
962 status = -1;
963 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
964 if (ret <= 0) {
965 ERROR("error using IPC to receive final notification from initial process (2)");
966 shutdown(ipc_socket, SHUT_RDWR);
967 rexit(-1);
968 }
969
970 shutdown(ipc_socket, SHUT_RDWR);
971 close(ipc_socket);
972 free(init_ctx->aa_profile);
973 free(init_ctx);
974
975 /* The following is done after the communication socket is
976 * shut down. That way, all errors that might (though
977 * unlikely) occur up until this point will have their messages
978 * printed to the original stderr (if logging is so configured)
979 * and not the fd the user supplied, if any.
980 */
981
982 /* fd handling for stdin, stdout and stderr;
983 * ignore errors here, user may want to make sure
984 * the fds are closed, for example */
985 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
986 dup2(options->stdin_fd, 0);
987 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
988 dup2(options->stdout_fd, 1);
989 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
990 dup2(options->stderr_fd, 2);
991
992 /* close the old fds */
993 if (options->stdin_fd > 2)
994 close(options->stdin_fd);
995 if (options->stdout_fd > 2)
996 close(options->stdout_fd);
997 if (options->stderr_fd > 2)
998 close(options->stderr_fd);
999
1000 /* try to remove CLOEXEC flag from stdin/stdout/stderr,
1001 * but also here, ignore errors */
1002 for (fd = 0; fd <= 2; fd++) {
1003 flags = fcntl(fd, F_GETFL);
1004 if (flags < 0)
1005 continue;
1006 if (flags & FD_CLOEXEC)
1007 fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC);
1008 }
1009
1010 /* we're done, so we can now do whatever the user intended us to do */
1011 rexit(payload->exec_function(payload->exec_payload));
1012 }
1013
1014 int lxc_attach_run_command(void* payload)
1015 {
1016 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1017
1018 execvp(cmd->program, cmd->argv);
1019 SYSERROR("failed to exec '%s'", cmd->program);
1020 return -1;
1021 }
1022
/*
 * lxc_attach exec function: replace the process with the login shell of
 * the current user, falling back to /bin/sh.
 *
 * payload is ignored. Only returns, with -1, when no shell could be
 * executed.
 */
int lxc_attach_run_shell(void* payload)
{
	uid_t uid;
	struct passwd *passwd;
	char *user_shell;
	/* set only when the shell string is owned by this function */
	char *allocated_shell = NULL;

	/* ignore payload parameter */
	(void)payload;

	uid = getuid();
	passwd = getpwuid(uid);

	/* this probably happens because of incompatible nss
	 * implementations in host and container (remember, this
	 * code is still using the host's glibc but our mount
	 * namespace is in the container)
	 * we may try to get the information by spawning a
	 * [getent passwd uid] process and parsing the result
	 */
	if (!passwd) {
		allocated_shell = lxc_attach_getpwshell(uid);
		user_shell = allocated_shell;
	} else {
		user_shell = passwd->pw_shell;
	}

	/* the terminating null pointer of a variadic exec call has to be
	 * passed as (char *)NULL */
	if (user_shell)
		execlp(user_shell, user_shell, (char *)NULL);

	/* executed if either no passwd entry or execlp fails,
	 * we will fall back on /bin/sh as a default shell
	 */
	free(allocated_shell);
	execlp("/bin/sh", "/bin/sh", (char *)NULL);
	SYSERROR("failed to exec shell");
	return -1;
}