]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/attach.c
attach, start: declare PR_{S,G}PR_GET_NO_NEW_PRIVS
[mirror_lxc.git] / src / lxc / attach.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include <unistd.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
29 #include <signal.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <grp.h>
33 #include <sys/param.h>
34 #include <sys/prctl.h>
35 #include <sys/mount.h>
36 #include <sys/socket.h>
37 #include <sys/syscall.h>
38 #include <sys/wait.h>
39 #include <linux/unistd.h>
40 #include <pwd.h>
41
42 #ifndef HAVE_DECL_PR_CAPBSET_DROP
43 #define PR_CAPBSET_DROP 24
44 #endif
45
46 #ifndef HAVE_DECL_PR_SET_NO_NEW_PRIVS
47 #define PR_SET_NO_NEW_PRIVS 38
48 #endif
49
50 #ifndef HAVE_DECL_PR_GET_NO_NEW_PRIVS
51 #define PR_GET_NO_NEW_PRIVS 39
52 #endif
53
54 #include "namespace.h"
55 #include "log.h"
56 #include "attach.h"
57 #include "caps.h"
58 #include "config.h"
59 #include "utils.h"
60 #include "commands.h"
61 #include "cgroup.h"
62 #include "lxclock.h"
63 #include "conf.h"
64 #include "lxcseccomp.h"
65 #include <lxc/lxccontainer.h>
66 #include "lsm/lsm.h"
67 #include "confile.h"
68
69 #if HAVE_SYS_PERSONALITY_H
70 #include <sys/personality.h>
71 #endif
72
73 #ifndef SOCK_CLOEXEC
74 # define SOCK_CLOEXEC 02000000
75 #endif
76
77 #ifndef MS_REC
78 #define MS_REC 16384
79 #endif
80
81 #ifndef MS_SLAVE
82 #define MS_SLAVE (1<<19)
83 #endif
84
85 lxc_log_define(lxc_attach, lxc);
86
/*
 * lsm_set_label_at: apply an LSM label to the calling process.
 *
 * @procfd    : directory fd relative to which "self/attr/..." is opened
 * @on_exec   : non-zero to write the label to attr/exec instead of
 *              attr/current (forced to 0 when the LSM is AppArmor)
 * @lsm_label : label / profile name to write
 *
 * Returns 0 on success or when the active LSM is "nop"/"none",
 * -1 on any failure.
 */
int lsm_set_label_at(int procfd, int on_exec, char* lsm_label) {
	const char *lsm = lsm_name();
	const char *attr_path;
	char *cmd = NULL;
	int is_apparmor;
	int fd;
	int rc = -1;

	/* nothing to apply for the dummy drivers */
	if (strcmp(lsm, "nop") == 0 || strcmp(lsm, "none") == 0)
		return 0;

	is_apparmor = (strcmp(lsm, "AppArmor") == 0);

	/* We don't support on-exec with AppArmor */
	attr_path = (on_exec && !is_apparmor) ? "self/attr/exec"
					      : "self/attr/current";

	fd = openat(procfd, attr_path, O_RDWR);
	if (fd < 0) {
		SYSERROR("Unable to open LSM label");
		return -1;
	}

	if (is_apparmor) {
		int len;

		cmd = malloc(strlen(lsm_label) + strlen("changeprofile ") + 1);
		if (!cmd) {
			SYSERROR("Failed to write apparmor profile");
			goto done;
		}

		len = sprintf(cmd, "changeprofile %s", lsm_label);
		if (len < 0) {
			SYSERROR("Failed to write apparmor profile");
			goto done;
		}

		/* the terminating NUL byte is written as well */
		if (write(fd, cmd, len + 1) < 0) {
			SYSERROR("Unable to set LSM label");
			goto done;
		}
	} else if (strcmp(lsm, "SELinux") == 0) {
		/* the terminating NUL byte is written as well */
		if (write(fd, lsm_label, strlen(lsm_label) + 1) < 0) {
			SYSERROR("Unable to set LSM label");
			goto done;
		}
	} else {
		ERROR("Unable to restore label for unknown LSM: %s", lsm);
		goto done;
	}

	rc = 0;

done:
	free(cmd);
	close(fd);
	return rc;
}
162
163 static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
164 {
165 struct lxc_proc_context_info *info = calloc(1, sizeof(*info));
166 FILE *proc_file;
167 char proc_fn[MAXPATHLEN];
168 char *line = NULL;
169 size_t line_bufsz = 0;
170 int ret, found;
171
172 if (!info) {
173 SYSERROR("Could not allocate memory.");
174 return NULL;
175 }
176
177 /* read capabilities */
178 snprintf(proc_fn, MAXPATHLEN, "/proc/%d/status", pid);
179
180 proc_file = fopen(proc_fn, "r");
181 if (!proc_file) {
182 SYSERROR("Could not open %s", proc_fn);
183 goto out_error;
184 }
185
186 found = 0;
187 while (getline(&line, &line_bufsz, proc_file) != -1) {
188 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
189 if (ret != EOF && ret > 0) {
190 found = 1;
191 break;
192 }
193 }
194
195 free(line);
196 fclose(proc_file);
197
198 if (!found) {
199 SYSERROR("Could not read capability bounding set from %s", proc_fn);
200 errno = ENOENT;
201 goto out_error;
202 }
203
204 info->lsm_label = lsm_process_label_get(pid);
205
206 return info;
207
208 out_error:
209 free(info);
210 return NULL;
211 }
212
213 static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
214 {
215 free(ctx->lsm_label);
216 if (ctx->container)
217 lxc_container_put(ctx->container);
218 free(ctx);
219 }
220
/* Close fds[from] .. fds[to - 1]; slots holding -1 are passed to close() too. */
static void lxc_attach_close_fds_range(int *fds, int from, int to)
{
	int k;

	for (k = from; k < to; k++)
		close(fds[k]);
}

/*
 * Enter the namespaces of process @pid.
 *
 * @which is a mask of CLONE_NEW* flags selecting the namespaces to join;
 * -1 selects all of them. All namespace files are opened first and only
 * then entered with setns(), in the order of the table below.
 *
 * Returns 0 on success, -1 on failure (errno is that of the failing call).
 */
static int lxc_attach_to_ns(pid_t pid, int which)
{
	/* according to <http://article.gmane.org/gmane.linux.kernel.containers.lxc.devel/1429>,
	 * the file for user namespaces in /proc/$pid/ns will be called
	 * 'user' once the kernel supports it
	 */
	static char *ns_names[] = { "user", "mnt", "pid", "uts", "ipc", "net", "cgroup" };
	static int ns_flags[] = {
		CLONE_NEWUSER, CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
		CLONE_NEWNET, CLONE_NEWCGROUP
	};
	const int count = sizeof(ns_names) / sizeof(ns_names[0]);
	int fds[sizeof(ns_names) / sizeof(ns_names[0])];
	char path[MAXPATHLEN];
	int idx, err;

	snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
	if (access(path, X_OK) != 0) {
		ERROR("Does this kernel version support 'attach' ?");
		return -1;
	}

	/* pass 1: open every requested namespace file */
	for (idx = 0; idx < count; idx++) {
		int wanted = (which == -1) || (which & ns_flags[idx]);

		if (!wanted) {
			fds[idx] = -1;
			continue;
		}

		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns_names[idx]);
		fds[idx] = open(path, O_RDONLY | O_CLOEXEC);
		if (fds[idx] >= 0)
			continue;

		/* don't leak what was opened so far; keep open()'s errno */
		err = errno;
		lxc_attach_close_fds_range(fds, 0, idx);
		errno = err;
		SYSERROR("failed to open '%s'", path);
		return -1;
	}

	/* pass 2: enter them, closing each fd as we go */
	for (idx = 0; idx < count; idx++) {
		if (fds[idx] >= 0 && setns(fds[idx], 0) != 0) {
			err = errno;
			lxc_attach_close_fds_range(fds, idx, count);
			errno = err;
			SYSERROR("failed to set namespace '%s'", ns_names[idx]);
			return -1;
		}

		close(fds[idx]);
	}

	return 0;
}
287
/*
 * Give the calling process a private mount namespace and mount fresh
 * instances of /proc and (if it was mounted) /sys in it.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_attach_remount_sys_proc(void)
{
	int rc;

	if (unshare(CLONE_NEWNS) < 0) {
		SYSERROR("failed to unshare mount namespace");
		return -1;
	}

	/* best effort: a failure here is reported but not fatal */
	if (detect_shared_rootfs() && mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		SYSERROR("Failed to make / rslave");
		ERROR("Continuing...");
	}

	/* /proc is assumed to always be mounted, so replace it */
	if (umount2("/proc", MNT_DETACH) < 0) {
		SYSERROR("failed to unmount /proc");
		return -1;
	}

	if (mount("none", "/proc", "proc", 0, NULL) < 0) {
		SYSERROR("failed to remount /proc");
		return -1;
	}

	/* /sys may not be a mount point at all: umount2() then fails with
	 * EINVAL, which is ignored and nothing is mounted in its place
	 */
	rc = umount2("/sys", MNT_DETACH);
	if (rc == 0) {
		if (mount("none", "/sys", "sysfs", 0, NULL) < 0) {
			SYSERROR("failed to remount /sys");
			return -1;
		}
	} else if (rc < 0 && errno != EINVAL) {
		SYSERROR("failed to unmount /sys");
		return -1;
	}

	return 0;
}
337
338 static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
339 {
340 int last_cap = lxc_caps_last_cap();
341 int cap;
342
343 for (cap = 0; cap <= last_cap; cap++) {
344 if (ctx->capability_mask & (1LL << cap))
345 continue;
346
347 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
348 SYSERROR("failed to remove capability id %d", cap);
349 return -1;
350 }
351 }
352
353 return 0;
354 }
355
356 static int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
357 {
358 if (policy == LXC_ATTACH_CLEAR_ENV) {
359 char **extra_keep_store = NULL;
360 int path_kept = 0;
361
362 if (extra_keep) {
363 size_t count, i;
364
365 for (count = 0; extra_keep[count]; count++);
366
367 extra_keep_store = calloc(count, sizeof(char *));
368 if (!extra_keep_store) {
369 SYSERROR("failed to allocate memory for storing current "
370 "environment variable values that will be kept");
371 return -1;
372 }
373 for (i = 0; i < count; i++) {
374 char *v = getenv(extra_keep[i]);
375 if (v) {
376 extra_keep_store[i] = strdup(v);
377 if (!extra_keep_store[i]) {
378 SYSERROR("failed to allocate memory for storing current "
379 "environment variable values that will be kept");
380 while (i > 0)
381 free(extra_keep_store[--i]);
382 free(extra_keep_store);
383 return -1;
384 }
385 if (strcmp(extra_keep[i], "PATH") == 0)
386 path_kept = 1;
387 }
388 /* calloc sets entire array to zero, so we don't
389 * need an else */
390 }
391 }
392
393 if (clearenv()) {
394 char **p;
395 SYSERROR("failed to clear environment");
396 if (extra_keep_store) {
397 for (p = extra_keep_store; *p; p++)
398 free(*p);
399 free(extra_keep_store);
400 }
401 return -1;
402 }
403
404 if (extra_keep_store) {
405 size_t i;
406 for (i = 0; extra_keep[i]; i++) {
407 if (extra_keep_store[i]) {
408 if (setenv(extra_keep[i], extra_keep_store[i], 1) < 0)
409 SYSERROR("Unable to set environment variable");
410 }
411 free(extra_keep_store[i]);
412 }
413 free(extra_keep_store);
414 }
415
416 /* always set a default path; shells and execlp tend
417 * to be fine without it, but there is a disturbing
418 * number of C programs out there that just assume
419 * that getenv("PATH") is never NULL and then die a
420 * painful segfault death. */
421 if (!path_kept)
422 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
423 }
424
425 if (putenv("container=lxc")) {
426 SYSERROR("failed to set environment variable");
427 return -1;
428 }
429
430 /* set extra environment variables */
431 if (extra_env) {
432 for (; *extra_env; extra_env++) {
433 /* duplicate the string, just to be on
434 * the safe side, because putenv does not
435 * do it for us */
436 char *p = strdup(*extra_env);
437 /* we just assume the user knows what they
438 * are doing, so we don't do any checks */
439 if (!p) {
440 SYSERROR("failed to allocate memory for additional environment "
441 "variables");
442 return -1;
443 }
444 putenv(p);
445 }
446 }
447
448 return 0;
449 }
450
/*
 * Look up the login shell of @uid by running "getent passwd <uid>" in a
 * child process and parsing its output.
 *
 * Returns a heap-allocated string the caller must free(), or NULL if the
 * shell could not be determined (pipe/fork failure, getent missing or
 * exiting non-zero, no matching well-formed line).
 *
 * NOTE(review): the line is split with strtok_r(), which collapses runs
 * of ':' - a passwd entry with an empty field (e.g. empty gecos) is
 * therefore not recognised. Left as is.
 */
static char *lxc_attach_getpwshell(uid_t uid)
{
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	char *result = NULL;

	/* we need to fork off a process that runs the
	 * getent program, and we need to capture its
	 * output, so we use a pipe for that purpose
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid) {
		/* parent process */
		FILE *pipe_f;
		char *line = NULL;
		size_t line_bufsz = 0;
		int found = 0;
		int status;

		close(pipes[1]);

		pipe_f = fdopen(pipes[0], "r");
		if (!pipe_f) {
			/* can't read the output; closing our end keeps the
			 * child from blocking on a full pipe, it is still
			 * reaped below
			 */
			close(pipes[0]);
		} else {
			while (getline(&line, &line_bufsz, pipe_f) != -1) {
				char *token;
				char *shell;
				char *saveptr = NULL;
				long value;
				char *endptr = NULL;
				int i;

				/* if we already found something, just continue
				 * to read until the pipe doesn't deliver any more
				 * data, but don't modify the existing data
				 * structure
				 */
				if (found)
					continue;

				/* trim line on the right hand side */
				for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
					line[i - 1] = '\0';

				/* split into tokens: first user name */
				token = strtok_r(line, ":", &saveptr);
				if (!token)
					continue;
				/* next: dummy password field */
				token = strtok_r(NULL, ":", &saveptr);
				if (!token)
					continue;
				/* next: user id */
				token = strtok_r(NULL, ":", &saveptr);
				value = token ? strtol(token, &endptr, 10) : 0;
				if (!token || !endptr || *endptr || value == LONG_MIN || value == LONG_MAX)
					continue;
				/* dummy sanity check: user id matches */
				if ((uid_t) value != uid)
					continue;
				/* skip fields: gid, gecos, dir, go to next field 'shell' */
				for (i = 0; i < 4; i++) {
					token = strtok_r(NULL, ":", &saveptr);
					if (!token)
						break;
				}
				if (!token)
					continue;
				shell = token;

				/* sanity check that there are no fields after that;
				 * done before copying so a malformed line never
				 * leaves a stale result behind
				 */
				token = strtok_r(NULL, ":", &saveptr);
				if (token)
					continue;

				free(result);
				result = strdup(shell);
				found = 1;
			}

			free(line);
			fclose(pipe_f);
		}

		/* reap the child, retrying when interrupted by a signal */
		do {
			ret = waitpid(pid, &status, 0);
		} while (ret < 0 && errno == EINTR);

		/* some sanity checks: if anything even hinted at going
		 * wrong: we can't be sure we have a valid result, so
		 * we assume we don't
		 */
		if (ret < 0 || !WIFEXITED(status) || WEXITSTATUS(status) != 0 || !found) {
			free(result);
			return NULL;
		}

		return result;
	} else {
		/* child process */
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* we want to capture stdout */
		dup2(pipes[1], 1);
		close(pipes[1]);

		/* get rid of stdin/stderr, so we try to associate it
		 * with /dev/null
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			close(fd);
		}

		/* finish argument list */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0)
			_exit(-1);

		/* try to run getent program; _exit() so the forked copy
		 * runs no atexit handlers and flushes no inherited stdio
		 * buffers
		 */
		(void) execvp("getent", arguments);
		_exit(-1);
	}
}
603
/*
 * Read the real uid and gid of pid 1 from /proc/1/status.
 *
 * *init_uid / *init_gid are only overwritten for the values that were
 * actually found; if the file can't be opened both are left untouched.
 */
static void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
{
	const uid_t no_uid = (uid_t)-1;
	const gid_t no_gid = (gid_t)-1;
	uid_t found_uid = no_uid;
	gid_t found_gid = no_gid;
	char status_path[MAXPATHLEN];
	char *buf = NULL;
	size_t buf_len = 0;
	FILE *fp;

	snprintf(status_path, MAXPATHLEN, "/proc/%d/status", 1);

	fp = fopen(status_path, "r");
	if (!fp)
		return;

	while (getline(&buf, &buf_len, fp) != -1) {
		long parsed = -1;

		/* format is: real, effective, saved set user, fs
		 * we only care about the first (real) id
		 */
		if (sscanf(buf, "Uid: %ld", &parsed) > 0)
			found_uid = (uid_t) parsed;
		else if (sscanf(buf, "Gid: %ld", &parsed) > 0)
			found_gid = (gid_t) parsed;

		if (found_uid != no_uid && found_gid != no_gid)
			break;
	}

	fclose(fp);
	free(buf);

	/* only override arguments if we found something */
	if (found_uid != no_uid)
		*init_uid = found_uid;
	if (found_gid != no_gid)
		*init_gid = found_gid;

	/* TODO: we should also parse supplementary groups and use
	 * setgroups() to set them */
}
650
651 struct attach_clone_payload {
652 int ipc_socket;
653 lxc_attach_options_t* options;
654 struct lxc_proc_context_info* init_ctx;
655 lxc_attach_exec_t exec_function;
656 void* exec_payload;
657 int procfd;
658 };
659
660 static int attach_child_main(void* data);
661
662 /* help the optimizer along if it doesn't know that exit always exits */
663 #define rexit(c) do { int __c = (c); _exit(__c); return __c; } while(0)
664
665 /* define default options if no options are supplied by the user */
666 static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
667
668 static bool fetch_seccomp(struct lxc_proc_context_info *i,
669 lxc_attach_options_t *options)
670 {
671 struct lxc_container *c;
672 char *path;
673
674 if (!(options->namespaces & CLONE_NEWNS) || !(options->attach_flags & LXC_ATTACH_LSM))
675 return true;
676
677 c = i->container;
678
679 /* Remove current setting. */
680 if (!c->set_config_item(c, "lxc.seccomp", "")) {
681 return false;
682 }
683
684 /* Fetch the current profile path over the cmd interface */
685 path = c->get_running_config_item(c, "lxc.seccomp");
686 if (!path) {
687 return true;
688 }
689
690 /* Copy the value into the new lxc_conf */
691 if (!c->set_config_item(c, "lxc.seccomp", path)) {
692 free(path);
693 return false;
694 }
695 free(path);
696
697 /* Attempt to parse the resulting config */
698 if (lxc_read_seccomp_config(c->lxc_conf) < 0) {
699 ERROR("Error reading seccomp policy");
700 return false;
701 }
702
703 INFO("Retrieved seccomp policy.");
704 return true;
705 }
706
707 static bool no_new_privs(struct lxc_proc_context_info *ctx,
708 lxc_attach_options_t *options)
709 {
710 struct lxc_container *c;
711 char *val;
712
713 c = ctx->container;
714
715 /* Remove current setting. */
716 if (!c->set_config_item(c, "lxc.no_new_privs", "")) {
717 return false;
718 }
719
720 /* Retrieve currently active setting. */
721 val = c->get_running_config_item(c, "lxc.no_new_privs");
722 if (!val) {
723 INFO("Failed to get running config item for lxc.no_new_privs.");
724 return false;
725 }
726
727 /* Set currently active setting. */
728 if (!c->set_config_item(c, "lxc.no_new_privs", val)) {
729 free(val);
730 return false;
731 }
732 free(val);
733
734 return true;
735 }
736
/*
 * Ask the running container @name (under @lxcpath) for its "lxc.arch"
 * setting and convert it with lxc_config_parse_arch().
 *
 * Returns the parsed value, or -1 if the item could not be fetched.
 */
static signed long get_personality(const char *name, const char *lxcpath)
{
	signed long arch = -1;
	char *arch_str = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);

	if (arch_str) {
		arch = lxc_config_parse_arch(arch_str);
		free(arch_str);
	}

	return arch;
}
748
749 int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
750 {
751 int ret, status;
752 pid_t init_pid, pid, attached_pid, expected;
753 struct lxc_proc_context_info *init_ctx;
754 char* cwd;
755 char* new_cwd;
756 int ipc_sockets[2];
757 int procfd;
758 signed long personality;
759
760 if (!options)
761 options = &attach_static_default_options;
762
763 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
764 if (init_pid < 0) {
765 ERROR("failed to get the init pid");
766 return -1;
767 }
768
769 init_ctx = lxc_proc_get_context_info(init_pid);
770 if (!init_ctx) {
771 ERROR("failed to get context of the init process, pid = %ld", (long)init_pid);
772 return -1;
773 }
774
775 personality = get_personality(name, lxcpath);
776 if (init_ctx->personality < 0) {
777 ERROR("Failed to get personality of the container");
778 lxc_proc_put_context_info(init_ctx);
779 return -1;
780 }
781 init_ctx->personality = personality;
782
783 init_ctx->container = lxc_container_new(name, lxcpath);
784 if (!init_ctx->container)
785 return -1;
786
787 if (!fetch_seccomp(init_ctx, options))
788 WARN("Failed to get seccomp policy");
789
790 if (!no_new_privs(init_ctx, options))
791 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set.");
792
793 cwd = getcwd(NULL, 0);
794
795 /* determine which namespaces the container was created with
796 * by asking lxc-start, if necessary
797 */
798 if (options->namespaces == -1) {
799 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
800 /* call failed */
801 if (options->namespaces == -1) {
802 ERROR("failed to automatically determine the "
803 "namespaces which the container unshared");
804 free(cwd);
805 lxc_proc_put_context_info(init_ctx);
806 return -1;
807 }
808 }
809
810 /* create a socket pair for IPC communication; set SOCK_CLOEXEC in order
811 * to make sure we don't irritate other threads that want to fork+exec away
812 *
813 * IMPORTANT: if the initial process is multithreaded and another call
814 * just fork()s away without exec'ing directly after, the socket fd will
815 * exist in the forked process from the other thread and any close() in
816 * our own child process will not really cause the socket to close properly,
817 * potentiall causing the parent to hang.
818 *
819 * For this reason, while IPC is still active, we have to use shutdown()
820 * if the child exits prematurely in order to signal that the socket
821 * is closed and cannot assume that the child exiting will automatically
822 * do that.
823 *
824 * IPC mechanism: (X is receiver)
825 * initial process intermediate attached
826 * X <--- send pid of
827 * attached proc,
828 * then exit
829 * send 0 ------------------------------------> X
830 * [do initialization]
831 * X <------------------------------------ send 1
832 * [add to cgroup, ...]
833 * send 2 ------------------------------------> X
834 * close socket close socket
835 * run program
836 */
837 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
838 if (ret < 0) {
839 SYSERROR("could not set up required IPC mechanism for attaching");
840 free(cwd);
841 lxc_proc_put_context_info(init_ctx);
842 return -1;
843 }
844
845 /* create intermediate subprocess, three reasons:
846 * 1. runs all pthread_atfork handlers and the
847 * child will no longer be threaded
848 * (we can't properly setns() in a threaded process)
849 * 2. we can't setns() in the child itself, since
850 * we want to make sure we are properly attached to
851 * the pidns
852 * 3. also, the initial thread has to put the attached
853 * process into the cgroup, which we can only do if
854 * we didn't already setns() (otherwise, user
855 * namespaces will hate us)
856 */
857 pid = fork();
858
859 if (pid < 0) {
860 SYSERROR("failed to create first subprocess");
861 free(cwd);
862 lxc_proc_put_context_info(init_ctx);
863 return -1;
864 }
865
866 if (pid) {
867 pid_t to_cleanup_pid = pid;
868
869 /* initial thread, we close the socket that is for the
870 * subprocesses
871 */
872 close(ipc_sockets[1]);
873 free(cwd);
874
875 /* attach to cgroup, if requested */
876 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
877 if (!cgroup_attach(name, lxcpath, pid))
878 goto cleanup_error;
879 }
880
881 /* Let the child process know to go ahead */
882 status = 0;
883 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
884 if (ret <= 0) {
885 ERROR("error using IPC to notify attached process for initialization (0)");
886 goto cleanup_error;
887 }
888
889 /* get pid from intermediate process */
890 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
891 if (ret <= 0) {
892 if (ret != 0)
893 ERROR("error using IPC to receive pid of attached process");
894 goto cleanup_error;
895 }
896
897 /* ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313 */
898 if (options->stdin_fd == 0) {
899 signal(SIGINT, SIG_IGN);
900 signal(SIGQUIT, SIG_IGN);
901 }
902
903 /* reap intermediate process */
904 ret = wait_for_pid(pid);
905 if (ret < 0)
906 goto cleanup_error;
907
908 /* we will always have to reap the grandchild now */
909 to_cleanup_pid = attached_pid;
910
911 /* tell attached process it may start initializing */
912 status = 0;
913 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
914 if (ret <= 0) {
915 ERROR("error using IPC to notify attached process for initialization (0)");
916 goto cleanup_error;
917 }
918
919 /* wait for the attached process to finish initializing */
920 expected = 1;
921 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
922 if (ret <= 0) {
923 if (ret != 0)
924 ERROR("error using IPC to receive notification from attached process (1)");
925 goto cleanup_error;
926 }
927
928 /* tell attached process we're done */
929 status = 2;
930 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
931 if (ret <= 0) {
932 ERROR("error using IPC to notify attached process for initialization (2)");
933 goto cleanup_error;
934 }
935
936 /* now shut down communication with child, we're done */
937 shutdown(ipc_sockets[0], SHUT_RDWR);
938 close(ipc_sockets[0]);
939 lxc_proc_put_context_info(init_ctx);
940
941 /* we're done, the child process should now execute whatever
942 * it is that the user requested. The parent can now track it
943 * with waitpid() or similar.
944 */
945
946 *attached_process = attached_pid;
947 return 0;
948
949 cleanup_error:
950 /* first shut down the socket, then wait for the pid,
951 * otherwise the pid we're waiting for may never exit
952 */
953 shutdown(ipc_sockets[0], SHUT_RDWR);
954 close(ipc_sockets[0]);
955 if (to_cleanup_pid)
956 (void) wait_for_pid(to_cleanup_pid);
957 lxc_proc_put_context_info(init_ctx);
958 return -1;
959 }
960
961 /* first subprocess begins here, we close the socket that is for the
962 * initial thread
963 */
964 close(ipc_sockets[0]);
965
966 /* Wait for the parent to have setup cgroups */
967 expected = 0;
968 status = -1;
969 ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
970 if (ret <= 0) {
971 ERROR("error communicating with child process");
972 shutdown(ipc_sockets[1], SHUT_RDWR);
973 rexit(-1);
974 }
975
976 if ((options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) && cgns_supported())
977 options->namespaces |= CLONE_NEWCGROUP;
978
979 procfd = open("/proc", O_DIRECTORY | O_RDONLY);
980 if (procfd < 0) {
981 SYSERROR("Unable to open /proc");
982 shutdown(ipc_sockets[1], SHUT_RDWR);
983 rexit(-1);
984 }
985
986 /* attach now, create another subprocess later, since pid namespaces
987 * only really affect the children of the current process
988 */
989 ret = lxc_attach_to_ns(init_pid, options->namespaces);
990 if (ret < 0) {
991 ERROR("failed to enter the namespace");
992 shutdown(ipc_sockets[1], SHUT_RDWR);
993 rexit(-1);
994 }
995
996 /* attach succeeded, try to cwd */
997 if (options->initial_cwd)
998 new_cwd = options->initial_cwd;
999 else
1000 new_cwd = cwd;
1001 ret = chdir(new_cwd);
1002 if (ret < 0)
1003 WARN("could not change directory to '%s'", new_cwd);
1004 free(cwd);
1005
1006 /* now create the real child process */
1007 {
1008 struct attach_clone_payload payload = {
1009 .ipc_socket = ipc_sockets[1],
1010 .options = options,
1011 .init_ctx = init_ctx,
1012 .exec_function = exec_function,
1013 .exec_payload = exec_payload,
1014 .procfd = procfd
1015 };
1016 /* We use clone_parent here to make this subprocess a direct child of
1017 * the initial process. Then this intermediate process can exit and
1018 * the parent can directly track the attached process.
1019 */
1020 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
1021 }
1022
1023 /* shouldn't happen, clone() should always return positive pid */
1024 if (pid <= 0) {
1025 SYSERROR("failed to create subprocess");
1026 shutdown(ipc_sockets[1], SHUT_RDWR);
1027 rexit(-1);
1028 }
1029
1030 /* tell grandparent the pid of the pid of the newly created child */
1031 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
1032 if (ret != sizeof(pid)) {
1033 /* if this really happens here, this is very unfortunate, since the
1034 * parent will not know the pid of the attached process and will
1035 * not be able to wait for it (and we won't either due to CLONE_PARENT)
1036 * so the parent won't be able to reap it and the attached process
1037 * will remain a zombie
1038 */
1039 ERROR("error using IPC to notify main process of pid of the attached process");
1040 shutdown(ipc_sockets[1], SHUT_RDWR);
1041 rexit(-1);
1042 }
1043
1044 /* the rest is in the hands of the initial and the attached process */
1045 rexit(0);
1046 }
1047
1048 static int attach_child_main(void* data)
1049 {
1050 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
1051 int ipc_socket = payload->ipc_socket;
1052 int procfd = payload->procfd;
1053 lxc_attach_options_t* options = payload->options;
1054 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
1055 #if HAVE_SYS_PERSONALITY_H
1056 long new_personality;
1057 #endif
1058 int ret;
1059 int status;
1060 int expected;
1061 long flags;
1062 int fd;
1063 uid_t new_uid;
1064 gid_t new_gid;
1065
1066 /* wait for the initial thread to signal us that it's ready
1067 * for us to start initializing
1068 */
1069 expected = 0;
1070 status = -1;
1071 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1072 if (ret <= 0) {
1073 ERROR("error using IPC to receive notification from initial process (0)");
1074 shutdown(ipc_socket, SHUT_RDWR);
1075 rexit(-1);
1076 }
1077
1078 /* A description of the purpose of this functionality is
1079 * provided in the lxc-attach(1) manual page. We have to
1080 * remount here and not in the parent process, otherwise
1081 * /proc may not properly reflect the new pid namespace.
1082 */
1083 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
1084 ret = lxc_attach_remount_sys_proc();
1085 if (ret < 0) {
1086 shutdown(ipc_socket, SHUT_RDWR);
1087 rexit(-1);
1088 }
1089 }
1090
1091 /* now perform additional attachments*/
1092 #if HAVE_SYS_PERSONALITY_H
1093 if (options->personality < 0)
1094 new_personality = init_ctx->personality;
1095 else
1096 new_personality = options->personality;
1097
1098 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
1099 ret = personality(new_personality);
1100 if (ret < 0) {
1101 SYSERROR("could not ensure correct architecture");
1102 shutdown(ipc_socket, SHUT_RDWR);
1103 rexit(-1);
1104 }
1105 }
1106 #endif
1107
1108 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
1109 ret = lxc_attach_drop_privs(init_ctx);
1110 if (ret < 0) {
1111 ERROR("could not drop privileges");
1112 shutdown(ipc_socket, SHUT_RDWR);
1113 rexit(-1);
1114 }
1115 }
1116
1117 /* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */
1118 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
1119 if (ret < 0) {
1120 ERROR("could not set initial environment for attached process");
1121 shutdown(ipc_socket, SHUT_RDWR);
1122 rexit(-1);
1123 }
1124
1125 /* set user / group id */
1126 new_uid = 0;
1127 new_gid = 0;
1128 /* ignore errors, we will fall back to root in that case
1129 * (/proc was not mounted etc.)
1130 */
1131 if (options->namespaces & CLONE_NEWUSER)
1132 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
1133
1134 if (options->uid != (uid_t)-1)
1135 new_uid = options->uid;
1136 if (options->gid != (gid_t)-1)
1137 new_gid = options->gid;
1138
1139 /* setup the control tty */
1140 if (options->stdin_fd && isatty(options->stdin_fd)) {
1141 if (setsid() < 0) {
1142 SYSERROR("unable to setsid");
1143 shutdown(ipc_socket, SHUT_RDWR);
1144 rexit(-1);
1145 }
1146
1147 if (ioctl(options->stdin_fd, TIOCSCTTY, (char *)NULL) < 0) {
1148 SYSERROR("unable to TIOCSTTY");
1149 shutdown(ipc_socket, SHUT_RDWR);
1150 rexit(-1);
1151 }
1152 }
1153
1154 /* try to set the uid/gid combination */
1155 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER)) {
1156 if (setgid(new_gid) || setgroups(0, NULL)) {
1157 SYSERROR("switching to container gid");
1158 shutdown(ipc_socket, SHUT_RDWR);
1159 rexit(-1);
1160 }
1161 }
1162 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
1163 SYSERROR("switching to container uid");
1164 shutdown(ipc_socket, SHUT_RDWR);
1165 rexit(-1);
1166 }
1167
1168 /* tell initial process it may now put us into the cgroups */
1169 status = 1;
1170 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1171 if (ret != sizeof(status)) {
1172 ERROR("error using IPC to notify initial process for initialization (1)");
1173 shutdown(ipc_socket, SHUT_RDWR);
1174 rexit(-1);
1175 }
1176
1177 /* wait for the initial thread to signal us that it has done
1178 * everything for us when it comes to cgroups etc.
1179 */
1180 expected = 2;
1181 status = -1;
1182 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1183 if (ret <= 0) {
1184 ERROR("error using IPC to receive final notification from initial process (2)");
1185 shutdown(ipc_socket, SHUT_RDWR);
1186 rexit(-1);
1187 }
1188
1189 shutdown(ipc_socket, SHUT_RDWR);
1190 close(ipc_socket);
1191
1192 if ((init_ctx->container && init_ctx->container->lxc_conf &&
1193 init_ctx->container->lxc_conf->no_new_privs) ||
1194 (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
1195 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
1196 SYSERROR("PR_SET_NO_NEW_PRIVS could not be set. "
1197 "Process can use execve() gainable "
1198 "privileges.");
1199 rexit(-1);
1200 }
1201 INFO("PR_SET_NO_NEW_PRIVS is set. Process cannot use execve() "
1202 "gainable privileges.");
1203 }
1204
1205 /* set new apparmor profile/selinux context */
1206 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM) && init_ctx->lsm_label) {
1207 int on_exec;
1208
1209 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
1210 if (lsm_set_label_at(procfd, on_exec, init_ctx->lsm_label) < 0) {
1211 rexit(-1);
1212 }
1213 }
1214
1215 if (init_ctx->container && init_ctx->container->lxc_conf &&
1216 lxc_seccomp_load(init_ctx->container->lxc_conf) != 0) {
1217 ERROR("Loading seccomp policy");
1218 rexit(-1);
1219 }
1220 lxc_proc_put_context_info(init_ctx);
1221
1222 /* The following is done after the communication socket is
1223 * shut down. That way, all errors that might (though
1224 * unlikely) occur up until this point will have their messages
1225 * printed to the original stderr (if logging is so configured)
1226 * and not the fd the user supplied, if any.
1227 */
1228
1229 /* fd handling for stdin, stdout and stderr;
1230 * ignore errors here, user may want to make sure
1231 * the fds are closed, for example */
1232 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
1233 dup2(options->stdin_fd, 0);
1234 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
1235 dup2(options->stdout_fd, 1);
1236 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
1237 dup2(options->stderr_fd, 2);
1238
1239 /* close the old fds */
1240 if (options->stdin_fd > 2)
1241 close(options->stdin_fd);
1242 if (options->stdout_fd > 2)
1243 close(options->stdout_fd);
1244 if (options->stderr_fd > 2)
1245 close(options->stderr_fd);
1246
1247 /* try to remove CLOEXEC flag from stdin/stdout/stderr,
1248 * but also here, ignore errors */
1249 for (fd = 0; fd <= 2; fd++) {
1250 flags = fcntl(fd, F_GETFL);
1251 if (flags < 0)
1252 continue;
1253 if (flags & FD_CLOEXEC) {
1254 if (fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC) < 0) {
1255 SYSERROR("Unable to clear CLOEXEC from fd");
1256 }
1257 }
1258 }
1259
1260 /* we don't need proc anymore */
1261 close(procfd);
1262
1263 /* we're done, so we can now do whatever the user intended us to do */
1264 rexit(payload->exec_function(payload->exec_payload));
1265 }
1266
1267 int lxc_attach_run_command(void* payload)
1268 {
1269 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1270
1271 execvp(cmd->program, cmd->argv);
1272 SYSERROR("failed to exec '%s'", cmd->program);
1273 return -1;
1274 }
1275
/*
 * Start a shell for the calling user.
 *
 * The shell is taken from the passwd entry of the current real uid; if
 * that shell is unknown or cannot be executed, /bin/sh is tried instead.
 * @payload is accepted for signature compatibility and is not used.
 *
 * Returns -1 (after logging) only when neither exec succeeded; on
 * success this function does not return.
 */
int lxc_attach_run_shell(void *payload)
{
	uid_t current_uid = getuid();
	struct passwd *pwent = getpwuid(current_uid);
	char *shell;

	/* the payload argument is intentionally unused */
	(void)payload;

	/* A failed lookup is most likely caused by mismatching nss
	 * implementations between host and container (we still run the
	 * host's glibc, but our mount namespace is already the
	 * container's). In that case ask lxc_attach_getpwshell() for the
	 * shell of this uid instead.
	 *
	 * NOTE(review): if lxc_attach_getpwshell() returns heap memory it
	 * is never released on the failure path below - confirm ownership.
	 */
	shell = pwent ? pwent->pw_shell : lxc_attach_getpwshell(current_uid);

	if (shell) {
		execlp(shell, shell, (char *)NULL);
	}

	/* Only reached when no shell could be determined or the execlp()
	 * above failed: fall back to /bin/sh as the default shell.
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);

	SYSERROR("failed to exec shell");
	return -1;
}