]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/attach.c
Merge pull request #1346 from brauner/2016-11-08/fix_attach_fd_leak_master
[mirror_lxc.git] / src / lxc / attach.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include <unistd.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
29 #include <signal.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <grp.h>
33 #include <sys/param.h>
34 #include <sys/prctl.h>
35 #include <sys/mount.h>
36 #include <sys/socket.h>
37 #include <sys/syscall.h>
38 #include <sys/wait.h>
39 #include <linux/unistd.h>
40 #include <pwd.h>
41
42 #ifndef HAVE_DECL_PR_CAPBSET_DROP
43 #define PR_CAPBSET_DROP 24
44 #endif
45
46 #ifndef HAVE_DECL_PR_SET_NO_NEW_PRIVS
47 #define PR_SET_NO_NEW_PRIVS 38
48 #endif
49
50 #ifndef HAVE_DECL_PR_GET_NO_NEW_PRIVS
51 #define PR_GET_NO_NEW_PRIVS 39
52 #endif
53
54 #include "namespace.h"
55 #include "log.h"
56 #include "af_unix.h"
57 #include "attach.h"
58 #include "caps.h"
59 #include "config.h"
60 #include "utils.h"
61 #include "commands.h"
62 #include "cgroup.h"
63 #include "lxclock.h"
64 #include "conf.h"
65 #include "lxcseccomp.h"
66 #include <lxc/lxccontainer.h>
67 #include "lsm/lsm.h"
68 #include "confile.h"
69
70 #if HAVE_SYS_PERSONALITY_H
71 #include <sys/personality.h>
72 #endif
73
74 #ifndef SOCK_CLOEXEC
75 # define SOCK_CLOEXEC 02000000
76 #endif
77
78 #ifndef MS_REC
79 #define MS_REC 16384
80 #endif
81
82 #ifndef MS_SLAVE
83 #define MS_SLAVE (1<<19)
84 #endif
85
86 lxc_log_define(lxc_attach, lxc);
87
88 /* /proc/pid-to-str/current\0 = (5 + 21 + 7 + 1) */
89 #define __LSMATTRLEN (5 + (LXC_NUMSTRLEN64) + 7 + 1)
90 static int lsm_openat(int procfd, pid_t pid, int on_exec)
91 {
92 int ret = -1;
93 int labelfd = -1;
94 const char *name;
95 char path[__LSMATTRLEN];
96
97 name = lsm_name();
98
99 if (strcmp(name, "nop") == 0)
100 return 0;
101
102 if (strcmp(name, "none") == 0)
103 return 0;
104
105 /* We don't support on-exec with AppArmor */
106 if (strcmp(name, "AppArmor") == 0)
107 on_exec = 0;
108
109 if (on_exec)
110 ret = snprintf(path, __LSMATTRLEN, "%d/attr/exec", pid);
111 else
112 ret = snprintf(path, __LSMATTRLEN, "%d/attr/current", pid);
113 if (ret < 0 || ret >= __LSMATTRLEN)
114 return -1;
115
116 labelfd = openat(procfd, path, O_RDWR);
117 if (labelfd < 0) {
118 SYSERROR("Unable to open file descriptor to set LSM label.");
119 return -1;
120 }
121
122 return labelfd;
123 }
124
/*
 * lsm_set_label_at: write an LSM label to an already opened attribute fd.
 *
 * @lsm_labelfd : file descriptor to write the label to
 * @on_exec     : requested on-exec behaviour (forced to 0 for AppArmor)
 * @lsm_label   : label / profile name to apply
 *
 * For AppArmor the string "changeprofile <label>" is written, for SELinux the
 * label itself; in both cases the terminating NUL byte is included in the
 * write. Any other LSM name is an error.
 *
 * Returns 0 on success and -1 on failure. The fd is closed on every path
 * except the early "nop"/"none" return, which leaves it untouched.
 */
static int lsm_set_label_at(int lsm_labelfd, int on_exec, char *lsm_label)
{
	const char *lsm = lsm_name();
	char *cmd = NULL;
	int rc = -1;

	if (!strcmp(lsm, "nop") || !strcmp(lsm, "none"))
		return 0;

	if (!strcmp(lsm, "AppArmor")) {
		int len;

		/* We don't support on-exec with AppArmor */
		on_exec = 0;

		/* sizeof() of the literal already accounts for the NUL byte. */
		cmd = malloc(strlen(lsm_label) + sizeof("changeprofile "));
		if (!cmd) {
			SYSERROR("Failed to write apparmor profile.");
			goto done;
		}

		len = sprintf(cmd, "changeprofile %s", lsm_label);
		if (len < 0) {
			SYSERROR("Failed to write apparmor profile.");
			goto done;
		}

		if (write(lsm_labelfd, cmd, len + 1) < 0) {
			SYSERROR("Unable to set LSM label: %s.", cmd);
			goto done;
		}
		INFO("Set LSM label to: %s.", cmd);
		rc = 0;
	} else if (!strcmp(lsm, "SELinux")) {
		if (write(lsm_labelfd, lsm_label, strlen(lsm_label) + 1) < 0) {
			SYSERROR("Unable to set LSM label: %s.", lsm_label);
			goto done;
		}
		INFO("Set LSM label to: %s.", lsm_label);
		rc = 0;
	} else {
		ERROR("Unable to restore label for unknown LSM: %s.", lsm);
	}

done:
	free(cmd);

	if (lsm_labelfd != -1)
		close(lsm_labelfd);

	return rc;
}
183
184 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
185 #define __PROC_STATUS_LEN (5 + (LXC_NUMSTRLEN64) + 7 + 1)
186 static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
187 {
188 FILE *proc_file;
189 char proc_fn[__PROC_STATUS_LEN];
190 bool found;
191 int ret;
192 char *line = NULL;
193 size_t line_bufsz = 0;
194 struct lxc_proc_context_info *info = NULL;
195
196 /* Read capabilities. */
197 ret = snprintf(proc_fn, __PROC_STATUS_LEN, "/proc/%d/status", pid);
198 if (ret < 0 || ret >= __PROC_STATUS_LEN)
199 goto on_error;
200
201 proc_file = fopen(proc_fn, "r");
202 if (!proc_file) {
203 SYSERROR("Could not open %s.", proc_fn);
204 goto on_error;
205 }
206
207 info = calloc(1, sizeof(*info));
208 if (!info) {
209 SYSERROR("Could not allocate memory.");
210 return NULL;
211 }
212
213 found = false;
214 while (getline(&line, &line_bufsz, proc_file) != -1) {
215 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
216 if (ret != EOF && ret == 1) {
217 found = true;
218 break;
219 }
220 }
221
222 free(line);
223 fclose(proc_file);
224
225 if (!found) {
226 SYSERROR("Could not read capability bounding set from %s.", proc_fn);
227 errno = ENOENT;
228 goto on_error;
229 }
230
231 info->lsm_label = lsm_process_label_get(pid);
232
233 return info;
234
235 on_error:
236 free(info);
237 return NULL;
238 }
239
240 static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
241 {
242 free(ctx->lsm_label);
243 if (ctx->container)
244 lxc_container_put(ctx->container);
245 free(ctx);
246 }
247
248 static int lxc_attach_to_ns(pid_t pid, int which)
249 {
250 int fd[LXC_NS_MAX];
251 int i, j, saved_errno;
252
253
254 if (access("/proc/self/ns", X_OK)) {
255 ERROR("Does this kernel version support namespaces?");
256 return -1;
257 }
258
259 for (i = 0; i < LXC_NS_MAX; i++) {
260 /* Ignore if we are not supposed to attach to that namespace. */
261 if (which != -1 && !(which & ns_info[i].clone_flag)) {
262 fd[i] = -1;
263 continue;
264 }
265
266 fd[i] = lxc_preserve_ns(pid, ns_info[i].proc_name);
267 if (fd[i] < 0) {
268 saved_errno = errno;
269
270 /* Close all already opened file descriptors before we
271 * return an error, so we don't leak them.
272 */
273 for (j = 0; j < i; j++)
274 close(fd[j]);
275
276 errno = saved_errno;
277 SYSERROR("Failed to open namespace: \"%s\".", ns_info[i].proc_name);
278 return -1;
279 }
280 }
281
282 for (i = 0; i < LXC_NS_MAX; i++) {
283 if (fd[i] < 0)
284 continue;
285
286 if (setns(fd[i], 0) < 0) {
287 saved_errno = errno;
288
289 for (j = i; j < LXC_NS_MAX; j++)
290 close(fd[j]);
291
292 errno = saved_errno;
293 SYSERROR("Failed to attach to namespace \"%s\".", ns_info[i].proc_name);
294 return -1;
295 }
296
297 DEBUG("Attached to namespace \"%s\".", ns_info[i].proc_name);
298
299 close(fd[i]);
300 }
301
302 return 0;
303 }
304
305 static int lxc_attach_remount_sys_proc(void)
306 {
307 int ret;
308
309 ret = unshare(CLONE_NEWNS);
310 if (ret < 0) {
311 SYSERROR("Failed to unshare mount namespace.");
312 return -1;
313 }
314
315 if (detect_shared_rootfs()) {
316 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
317 SYSERROR("Failed to make / rslave.");
318 ERROR("Continuing...");
319 }
320 }
321
322 /* Assume /proc is always mounted, so remount it. */
323 ret = umount2("/proc", MNT_DETACH);
324 if (ret < 0) {
325 SYSERROR("Failed to unmount /proc.");
326 return -1;
327 }
328
329 ret = mount("none", "/proc", "proc", 0, NULL);
330 if (ret < 0) {
331 SYSERROR("Failed to remount /proc.");
332 return -1;
333 }
334
335 /* Try to umount /sys. If it's not a mount point, we'll get EINVAL, then
336 * we ignore it because it may not have been mounted in the first place.
337 */
338 ret = umount2("/sys", MNT_DETACH);
339 if (ret < 0 && errno != EINVAL) {
340 SYSERROR("Failed to unmount /sys.");
341 return -1;
342 } else if (ret == 0) {
343 /* Remount it. */
344 ret = mount("none", "/sys", "sysfs", 0, NULL);
345 if (ret < 0) {
346 SYSERROR("Failed to remount /sys.");
347 return -1;
348 }
349 }
350
351 return 0;
352 }
353
354 static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
355 {
356 int last_cap = lxc_caps_last_cap();
357 int cap;
358
359 for (cap = 0; cap <= last_cap; cap++) {
360 if (ctx->capability_mask & (1LL << cap))
361 continue;
362
363 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
364 SYSERROR("Failed to remove capability id %d.", cap);
365 return -1;
366 }
367 }
368
369 return 0;
370 }
371
372 static int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
373 {
374 if (policy == LXC_ATTACH_CLEAR_ENV) {
375 char **extra_keep_store = NULL;
376 int path_kept = 0;
377
378 if (extra_keep) {
379 size_t count, i;
380
381 for (count = 0; extra_keep[count]; count++);
382
383 extra_keep_store = calloc(count, sizeof(char *));
384 if (!extra_keep_store) {
385 SYSERROR("Failed to allocate memory for storing current "
386 "environment variable values that will be kept.");
387 return -1;
388 }
389 for (i = 0; i < count; i++) {
390 char *v = getenv(extra_keep[i]);
391 if (v) {
392 extra_keep_store[i] = strdup(v);
393 if (!extra_keep_store[i]) {
394 SYSERROR("Failed to allocate memory for storing current "
395 "environment variable values that will be kept.");
396 while (i > 0)
397 free(extra_keep_store[--i]);
398 free(extra_keep_store);
399 return -1;
400 }
401 if (strcmp(extra_keep[i], "PATH") == 0)
402 path_kept = 1;
403 }
404 /* Calloc sets entire array to zero, so we don't
405 * need an else.
406 */
407 }
408 }
409
410 if (clearenv()) {
411 char **p;
412 SYSERROR("Failed to clear environment.");
413 if (extra_keep_store) {
414 for (p = extra_keep_store; *p; p++)
415 free(*p);
416 free(extra_keep_store);
417 }
418 return -1;
419 }
420
421 if (extra_keep_store) {
422 size_t i;
423 for (i = 0; extra_keep[i]; i++) {
424 if (extra_keep_store[i]) {
425 if (setenv(extra_keep[i], extra_keep_store[i], 1) < 0)
426 SYSERROR("Unable to set environment variable.");
427 }
428 free(extra_keep_store[i]);
429 }
430 free(extra_keep_store);
431 }
432
433 /* Always set a default path; shells and execlp tend to be fine
434 * without it, but there is a disturbing number of C programs
435 * out there that just assume that getenv("PATH") is never NULL
436 * and then die a painful segfault death.
437 */
438 if (!path_kept)
439 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
440 }
441
442 if (putenv("container=lxc")) {
443 SYSERROR("Failed to set environment variable.");
444 return -1;
445 }
446
447 /* Set extra environment variables. */
448 if (extra_env) {
449 for (; *extra_env; extra_env++) {
450 /* Duplicate the string, just to be on the safe side,
451 * because putenv does not do it for us.
452 */
453 char *p = strdup(*extra_env);
454 /* We just assume the user knows what they are doing, so
455 * we don't do any checks.
456 */
457 if (!p) {
458 SYSERROR("Failed to allocate memory for additional environment "
459 "variables.");
460 return -1;
461 }
462 putenv(p);
463 }
464 }
465
466 return 0;
467 }
468
/*
 * lxc_attach_getpwshell: look up the login shell of @uid via getent(1).
 *
 * Forks a child that runs "getent passwd <uid>" with stdout connected to a
 * pipe and stdin/stderr redirected to /dev/null (or closed if /dev/null
 * cannot be opened). The parent parses the passwd-formatted output and picks
 * the shell of the first well-formed entry (exactly seven ':'-separated
 * fields, non-empty shell) whose uid field equals @uid.
 *
 * Returns a newly allocated string the caller must free(), or NULL if the
 * lookup failed in any way (child could not be run, exited abnormally or
 * with a non-zero status, or no matching entry was found).
 */
static char *lxc_attach_getpwshell(uid_t uid)
{
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	int status;
	int found = 0;
	FILE *pipe_f;
	char *line = NULL;
	size_t line_bufsz = 0;
	char *result = NULL;

	/* We need to fork off a process that runs the getent program, and we
	 * need to capture its output, so we use a pipe for that purpose.
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid == 0) {
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* We want to capture stdout. Without it there is no point in
		 * running getent at all.
		 */
		if (dup2(pipes[1], 1) < 0)
			exit(-1);
		/* Do not close stdout again if the pipe already was fd 1. */
		if (pipes[1] != 1)
			close(pipes[1]);

		/* Get rid of stdin/stderr, so we try to associate it with
		 * /dev/null.
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			/* Only close the temporary fd if it is not one of the
			 * standard descriptors we just set up.
			 */
			if (fd > 2)
				close(fd);
		}

		/* Finish argument list. */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0 || (size_t) ret >= sizeof(uid_buf))
			exit(-1);

		/* Try to run getent program. */
		(void) execvp("getent", arguments);
		exit(-1);
	}

	/* Parent: read what the child prints. */
	close(pipes[1]);

	pipe_f = fdopen(pipes[0], "r");
	if (!pipe_f) {
		/* Closing the read end makes the child's writes fail so that it
		 * terminates and can be reaped below.
		 */
		close(pipes[0]);
	} else {
		while (getline(&line, &line_bufsz, pipe_f) != -1) {
			/* name:passwd:uid:gid:gecos:dir:shell */
			char *fields[7];
			char *cur;
			char *endptr = NULL;
			size_t len;
			int nfields = 0;
			long value;

			/* If we already found something, just continue to read
			 * until the pipe doesn't deliver any more data, but
			 * don't modify the existing data structure.
			 */
			if (found)
				continue;

			/* Trim line on the right hand side. */
			for (len = strlen(line); len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'); --len)
				line[len - 1] = '\0';

			/* Split on every single ':' so that empty fields (for
			 * example an empty gecos) keep their position; strtok_r()
			 * would silently merge adjacent separators and shift the
			 * remaining fields.
			 */
			cur = line;
			while (nfields < 7) {
				fields[nfields++] = cur;
				cur = strchr(cur, ':');
				if (!cur)
					break;
				*cur++ = '\0';
			}

			/* Too few fields, or data after the shell field. */
			if (nfields != 7 || cur)
				continue;

			/* user id */
			value = strtol(fields[2], &endptr, 10);
			if (endptr == fields[2] || *endptr || value == LONG_MIN || value == LONG_MAX)
				continue;

			/* dummy sanity check: user id matches */
			if ((uid_t) value != uid)
				continue;

			/* An entry without a shell is of no use to the caller. */
			if (fields[6][0] == '\0')
				continue;

			result = strdup(fields[6]);
			if (!result)
				continue;

			found = 1;
		}

		free(line);
		fclose(pipe_f);
	}

again:
	if (waitpid(pid, &status, 0) < 0) {
		if (errno == EINTR)
			goto again;
		goto on_error;
	}

	/* Some sanity checks. If anything even hinted at going wrong,
	 * we can't be sure we have a valid result, so we assume we
	 * don't.
	 */
	if (!WIFEXITED(status))
		goto on_error;

	if (WEXITSTATUS(status) != 0)
		goto on_error;

	if (!found)
		goto on_error;

	return result;

on_error:
	/* Do not leak a shell string that turned out to be untrustworthy. */
	free(result);
	return NULL;
}
616
617 static void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
618 {
619 FILE *proc_file;
620 char proc_fn[__PROC_STATUS_LEN];
621 int ret;
622 char *line = NULL;
623 size_t line_bufsz = 0;
624 long value = -1;
625 uid_t uid = (uid_t)-1;
626 gid_t gid = (gid_t)-1;
627
628 /* Read capabilities. */
629 snprintf(proc_fn, __PROC_STATUS_LEN, "/proc/%d/status", 1);
630
631 proc_file = fopen(proc_fn, "r");
632 if (!proc_file)
633 return;
634
635 while (getline(&line, &line_bufsz, proc_file) != -1) {
636 /* Format is: real, effective, saved set user, fs we only care
637 * about real uid.
638 */
639 ret = sscanf(line, "Uid: %ld", &value);
640 if (ret != EOF && ret == 1) {
641 uid = (uid_t) value;
642 } else {
643 ret = sscanf(line, "Gid: %ld", &value);
644 if (ret != EOF && ret == 1)
645 gid = (gid_t) value;
646 }
647 if (uid != (uid_t)-1 && gid != (gid_t)-1)
648 break;
649 }
650
651 fclose(proc_file);
652 free(line);
653
654 /* Only override arguments if we found something. */
655 if (uid != (uid_t)-1)
656 *init_uid = uid;
657 if (gid != (gid_t)-1)
658 *init_gid = gid;
659
660 /* TODO: we should also parse supplementary groups and use
661 * setgroups() to set them.
662 */
663 }
664
665 struct attach_clone_payload {
666 int ipc_socket;
667 lxc_attach_options_t* options;
668 struct lxc_proc_context_info* init_ctx;
669 lxc_attach_exec_t exec_function;
670 void* exec_payload;
671 };
672
673 static int attach_child_main(void* data);
674
675 /* Help the optimizer along if it doesn't know that exit always exits. */
676 #define rexit(c) do { int __c = (c); _exit(__c); return __c; } while(0)
677
678 /* Define default options if no options are supplied by the user. */
679 static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
680
681 static bool fetch_seccomp(struct lxc_container *c,
682 lxc_attach_options_t *options)
683 {
684 char *path;
685
686 if (!(options->namespaces & CLONE_NEWNS) || !(options->attach_flags & LXC_ATTACH_LSM)) {
687 free(c->lxc_conf->seccomp);
688 c->lxc_conf->seccomp = NULL;
689 return true;
690 }
691
692 /* Remove current setting. */
693 if (!c->set_config_item(c, "lxc.seccomp", "")) {
694 return false;
695 }
696
697 /* Fetch the current profile path over the cmd interface. */
698 path = c->get_running_config_item(c, "lxc.seccomp");
699 if (!path) {
700 INFO("Failed to get running config item for lxc.seccomp.");
701 return true;
702 }
703
704 /* Copy the value into the new lxc_conf. */
705 if (!c->set_config_item(c, "lxc.seccomp", path)) {
706 free(path);
707 return false;
708 }
709 free(path);
710
711 /* Attempt to parse the resulting config. */
712 if (lxc_read_seccomp_config(c->lxc_conf) < 0) {
713 ERROR("Error reading seccomp policy.");
714 return false;
715 }
716
717 INFO("Retrieved seccomp policy.");
718 return true;
719 }
720
721 static bool no_new_privs(struct lxc_container *c,
722 lxc_attach_options_t *options)
723 {
724 char *val;
725
726 /* Remove current setting. */
727 if (!c->set_config_item(c, "lxc.no_new_privs", "")) {
728 return false;
729 }
730
731 /* Retrieve currently active setting. */
732 val = c->get_running_config_item(c, "lxc.no_new_privs");
733 if (!val) {
734 INFO("Failed to get running config item for lxc.no_new_privs.");
735 return false;
736 }
737
738 /* Set currently active setting. */
739 if (!c->set_config_item(c, "lxc.no_new_privs", val)) {
740 free(val);
741 return false;
742 }
743 free(val);
744
745 return true;
746 }
747
/*
 * get_personality: query the "lxc.arch" setting of a running container and
 * convert it with lxc_config_parse_arch().
 *
 * Returns the parsed value, or -1 when the config item cannot be retrieved.
 */
static signed long get_personality(const char *name, const char *lxcpath)
{
	signed long arch;
	char *arch_str;

	arch_str = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);
	if (arch_str == NULL)
		return -1;

	arch = lxc_config_parse_arch(arch_str);
	free(arch_str);

	return arch;
}
759
760 int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
761 {
762 int ret, status;
763 pid_t init_pid, pid, attached_pid, expected;
764 struct lxc_proc_context_info *init_ctx;
765 char* cwd;
766 char* new_cwd;
767 int ipc_sockets[2];
768 signed long personality;
769
770 if (!options)
771 options = &attach_static_default_options;
772
773 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
774 if (init_pid < 0) {
775 ERROR("Failed to get init pid.");
776 return -1;
777 }
778
779 init_ctx = lxc_proc_get_context_info(init_pid);
780 if (!init_ctx) {
781 ERROR("Failed to get context of init process: %ld.",
782 (long)init_pid);
783 return -1;
784 }
785
786 personality = get_personality(name, lxcpath);
787 if (init_ctx->personality < 0) {
788 ERROR("Failed to get personality of the container.");
789 lxc_proc_put_context_info(init_ctx);
790 return -1;
791 }
792 init_ctx->personality = personality;
793
794 init_ctx->container = lxc_container_new(name, lxcpath);
795 if (!init_ctx->container)
796 return -1;
797
798 if (!fetch_seccomp(init_ctx->container, options))
799 WARN("Failed to get seccomp policy.");
800
801 if (!no_new_privs(init_ctx->container, options))
802 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set.");
803
804 cwd = getcwd(NULL, 0);
805
806 /* Determine which namespaces the container was created with
807 * by asking lxc-start, if necessary.
808 */
809 if (options->namespaces == -1) {
810 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
811 /* call failed */
812 if (options->namespaces == -1) {
813 ERROR("Failed to automatically determine the "
814 "namespaces which the container uses.");
815 free(cwd);
816 lxc_proc_put_context_info(init_ctx);
817 return -1;
818 }
819 }
820
821 /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
822 * to make sure we don't irritate other threads that want to fork+exec
823 * away
824 *
825 * IMPORTANT: if the initial process is multithreaded and another call
826 * just fork()s away without exec'ing directly after, the socket fd will
827 * exist in the forked process from the other thread and any close() in
828 * our own child process will not really cause the socket to close
829 * properly, potentiall causing the parent to hang.
830 *
831 * For this reason, while IPC is still active, we have to use shutdown()
832 * if the child exits prematurely in order to signal that the socket is
833 * closed and cannot assume that the child exiting will automatically do
834 * that.
835 *
836 * IPC mechanism: (X is receiver)
837 * initial process intermediate attached
838 * X <--- send pid of
839 * attached proc,
840 * then exit
841 * send 0 ------------------------------------> X
842 * [do initialization]
843 * X <------------------------------------ send 1
844 * [add to cgroup, ...]
845 * send 2 ------------------------------------> X
846 * [set LXC_ATTACH_NO_NEW_PRIVS]
847 * X <------------------------------------ send 3
848 * [open LSM label fd]
849 * send 4 ------------------------------------> X
850 * [set LSM label]
851 * close socket close socket
852 * run program
853 */
854 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
855 if (ret < 0) {
856 SYSERROR("Could not set up required IPC mechanism for attaching.");
857 free(cwd);
858 lxc_proc_put_context_info(init_ctx);
859 return -1;
860 }
861
862 /* Create intermediate subprocess, three reasons:
863 * 1. Runs all pthread_atfork handlers and the child will no
864 * longer be threaded (we can't properly setns() in a threaded
865 * process).
866 * 2. We can't setns() in the child itself, since we want to make
867 * sure we are properly attached to the pidns.
868 * 3. Also, the initial thread has to put the attached process
869 * into the cgroup, which we can only do if we didn't already
870 * setns() (otherwise, user namespaces will hate us).
871 */
872 pid = fork();
873
874 if (pid < 0) {
875 SYSERROR("Failed to create first subprocess.");
876 free(cwd);
877 lxc_proc_put_context_info(init_ctx);
878 return -1;
879 }
880
881 if (pid) {
882 int procfd = -1;
883 pid_t to_cleanup_pid = pid;
884
885 /* Initial thread, we close the socket that is for the
886 * subprocesses.
887 */
888 close(ipc_sockets[1]);
889 free(cwd);
890
891 /* Attach to cgroup, if requested. */
892 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
893 if (!cgroup_attach(name, lxcpath, pid))
894 goto on_error;
895 }
896
897 /* Open /proc before setns() to the containers namespace so we
898 * don't rely on any information from inside the container.
899 */
900 procfd = open("/proc", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
901 if (procfd < 0) {
902 SYSERROR("Unable to open /proc.");
903 goto on_error;
904 }
905
906 /* Let the child process know to go ahead. */
907 status = 0;
908 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
909 if (ret <= 0) {
910 ERROR("Intended to send sequence number 0: %s.",
911 strerror(errno));
912 goto on_error;
913 }
914
915 /* Get pid of attached process from intermediate process. */
916 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
917 if (ret <= 0) {
918 if (ret != 0)
919 ERROR("Expected to receive pid: %s.", strerror(errno));
920 goto on_error;
921 }
922
923 /* Ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
924 if (options->stdin_fd == 0) {
925 signal(SIGINT, SIG_IGN);
926 signal(SIGQUIT, SIG_IGN);
927 }
928
929 /* Reap intermediate process. */
930 ret = wait_for_pid(pid);
931 if (ret < 0)
932 goto on_error;
933
934 /* We will always have to reap the attached process now. */
935 to_cleanup_pid = attached_pid;
936
937 /* Tell attached process it may start initializing. */
938 status = 0;
939 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
940 if (ret <= 0) {
941 ERROR("Intended to send sequence number 0: %s.", strerror(errno));
942 goto on_error;
943 }
944
945 /* Wait for the attached process to finish initializing. */
946 expected = 1;
947 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
948 if (ret <= 0) {
949 if (ret != 0)
950 ERROR("Expected to receive sequence number 1: %s.", strerror(errno));
951 goto on_error;
952 }
953
954 /* Tell attached process we're done. */
955 status = 2;
956 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
957 if (ret <= 0) {
958 ERROR("Intended to send sequence number 2: %s.", strerror(errno));
959 goto on_error;
960 }
961
962 /* Wait for the (grand)child to tell us that it's ready to set
963 * up its LSM labels.
964 */
965 expected = 3;
966 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
967 if (ret <= 0) {
968 ERROR("Expected to receive sequence number 3: %s.",
969 strerror(errno));
970 goto on_error;
971 }
972
973 /* Open LSM fd and send it to child. */
974 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM) && init_ctx->lsm_label) {
975 int on_exec;
976 int labelfd = -1;
977 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
978 /* Open fd for the LSM security module. */
979 labelfd = lsm_openat(procfd, attached_pid, on_exec);
980 if (labelfd < 0)
981 goto on_error;
982
983 /* Send child fd of the LSM security module to write to. */
984 ret = lxc_abstract_unix_send_fd(ipc_sockets[0], labelfd, NULL, 0);
985 close(labelfd);
986 if (ret <= 0) {
987 ERROR("Intended to send file descriptor %d: %s.", labelfd, strerror(errno));
988 goto on_error;
989 }
990 }
991
992 if (procfd >= 0)
993 close(procfd);
994 /* Now shut down communication with child, we're done. */
995 shutdown(ipc_sockets[0], SHUT_RDWR);
996 close(ipc_sockets[0]);
997 lxc_proc_put_context_info(init_ctx);
998
999 /* We're done, the child process should now execute whatever it
1000 * is that the user requested. The parent can now track it with
1001 * waitpid() or similar.
1002 */
1003
1004 *attached_process = attached_pid;
1005 return 0;
1006
1007 on_error:
1008 /* First shut down the socket, then wait for the pid, otherwise
1009 * the pid we're waiting for may never exit.
1010 */
1011 if (procfd >= 0)
1012 close(procfd);
1013 shutdown(ipc_sockets[0], SHUT_RDWR);
1014 close(ipc_sockets[0]);
1015 if (to_cleanup_pid)
1016 (void) wait_for_pid(to_cleanup_pid);
1017 lxc_proc_put_context_info(init_ctx);
1018 return -1;
1019 }
1020
1021 /* First subprocess begins here, we close the socket that is for the
1022 * initial thread.
1023 */
1024 close(ipc_sockets[0]);
1025
1026 /* Wait for the parent to have setup cgroups. */
1027 expected = 0;
1028 status = -1;
1029 ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
1030 if (ret <= 0) {
1031 ERROR("Expected to receive sequence number 0: %s.", strerror(errno));
1032 shutdown(ipc_sockets[1], SHUT_RDWR);
1033 rexit(-1);
1034 }
1035
1036 if ((options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) && cgns_supported())
1037 options->namespaces |= CLONE_NEWCGROUP;
1038
1039 /* Attach now, create another subprocess later, since pid namespaces
1040 * only really affect the children of the current process.
1041 */
1042 ret = lxc_attach_to_ns(init_pid, options->namespaces);
1043 if (ret < 0) {
1044 ERROR("Failed to enter namespaces.");
1045 shutdown(ipc_sockets[1], SHUT_RDWR);
1046 rexit(-1);
1047 }
1048
1049 /* Attach succeeded, try to cwd. */
1050 if (options->initial_cwd)
1051 new_cwd = options->initial_cwd;
1052 else
1053 new_cwd = cwd;
1054 ret = chdir(new_cwd);
1055 if (ret < 0)
1056 WARN("Could not change directory to \"%s\".", new_cwd);
1057 free(cwd);
1058
1059 /* Now create the real child process. */
1060 {
1061 struct attach_clone_payload payload = {
1062 .ipc_socket = ipc_sockets[1],
1063 .options = options,
1064 .init_ctx = init_ctx,
1065 .exec_function = exec_function,
1066 .exec_payload = exec_payload,
1067 };
1068 /* We use clone_parent here to make this subprocess a direct
1069 * child of the initial process. Then this intermediate process
1070 * can exit and the parent can directly track the attached
1071 * process.
1072 */
1073 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
1074 }
1075
1076 /* Shouldn't happen, clone() should always return positive pid. */
1077 if (pid <= 0) {
1078 SYSERROR("Failed to create subprocess.");
1079 shutdown(ipc_sockets[1], SHUT_RDWR);
1080 rexit(-1);
1081 }
1082
1083 /* Tell grandparent the pid of the pid of the newly created child. */
1084 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
1085 if (ret != sizeof(pid)) {
1086 /* If this really happens here, this is very unfortunate, since
1087 * the parent will not know the pid of the attached process and
1088 * will not be able to wait for it (and we won't either due to
1089 * CLONE_PARENT) so the parent won't be able to reap it and the
1090 * attached process will remain a zombie.
1091 */
1092 ERROR("Intended to send pid %d: %s.", pid, strerror(errno));
1093 shutdown(ipc_sockets[1], SHUT_RDWR);
1094 rexit(-1);
1095 }
1096
1097 /* The rest is in the hands of the initial and the attached process. */
1098 rexit(0);
1099 }
1100
1101 static int attach_child_main(void* data)
1102 {
1103 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
1104 int ipc_socket = payload->ipc_socket;
1105 lxc_attach_options_t* options = payload->options;
1106 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
1107 #if HAVE_SYS_PERSONALITY_H
1108 long new_personality;
1109 #endif
1110 int ret;
1111 int status;
1112 int expected;
1113 long flags;
1114 int fd;
1115 int lsm_labelfd;
1116 uid_t new_uid;
1117 gid_t new_gid;
1118
1119 /* Wait for the initial thread to signal us that it's ready for us to
1120 * start initializing.
1121 */
1122 expected = 0;
1123 status = -1;
1124 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1125 if (ret <= 0) {
1126 ERROR("Expected to receive sequence number 0: %s.", strerror(errno));
1127 shutdown(ipc_socket, SHUT_RDWR);
1128 rexit(-1);
1129 }
1130
1131 /* A description of the purpose of this functionality is provided in the
1132 * lxc-attach(1) manual page. We have to remount here and not in the
1133 * parent process, otherwise /proc may not properly reflect the new pid
1134 * namespace.
1135 */
1136 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
1137 ret = lxc_attach_remount_sys_proc();
1138 if (ret < 0) {
1139 shutdown(ipc_socket, SHUT_RDWR);
1140 rexit(-1);
1141 }
1142 }
1143
1144 /* Now perform additional attachments. */
1145 #if HAVE_SYS_PERSONALITY_H
1146 if (options->personality < 0)
1147 new_personality = init_ctx->personality;
1148 else
1149 new_personality = options->personality;
1150
1151 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
1152 ret = personality(new_personality);
1153 if (ret < 0) {
1154 SYSERROR("Could not ensure correct architecture.");
1155 shutdown(ipc_socket, SHUT_RDWR);
1156 rexit(-1);
1157 }
1158 }
1159 #endif
1160
1161 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
1162 ret = lxc_attach_drop_privs(init_ctx);
1163 if (ret < 0) {
1164 ERROR("Could not drop privileges.");
1165 shutdown(ipc_socket, SHUT_RDWR);
1166 rexit(-1);
1167 }
1168 }
1169
1170 /* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
1171 * if you want this to be a no-op).
1172 */
1173 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
1174 if (ret < 0) {
1175 ERROR("Could not set initial environment for attached process.");
1176 shutdown(ipc_socket, SHUT_RDWR);
1177 rexit(-1);
1178 }
1179
1180 /* Set {u,g}id. */
1181 new_uid = 0;
1182 new_gid = 0;
1183 /* Ignore errors, we will fall back to root in that case (/proc was not
1184 * mounted etc.).
1185 */
1186 if (options->namespaces & CLONE_NEWUSER)
1187 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
1188
1189 if (options->uid != (uid_t)-1)
1190 new_uid = options->uid;
1191 if (options->gid != (gid_t)-1)
1192 new_gid = options->gid;
1193
1194 /* Setup the controlling tty. */
1195 if (options->stdin_fd && isatty(options->stdin_fd)) {
1196 if (setsid() < 0) {
1197 SYSERROR("Unable to setsid.");
1198 shutdown(ipc_socket, SHUT_RDWR);
1199 rexit(-1);
1200 }
1201
1202 if (ioctl(options->stdin_fd, TIOCSCTTY, (char *)NULL) < 0) {
1203 SYSERROR("Unable to set TIOCSTTY.");
1204 shutdown(ipc_socket, SHUT_RDWR);
1205 rexit(-1);
1206 }
1207 }
1208
1209 /* Try to set the {u,g}id combination. */
1210 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER)) {
1211 if (setgid(new_gid) || setgroups(0, NULL)) {
1212 SYSERROR("Switching to container gid.");
1213 shutdown(ipc_socket, SHUT_RDWR);
1214 rexit(-1);
1215 }
1216 }
1217 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
1218 SYSERROR("Switching to container uid.");
1219 shutdown(ipc_socket, SHUT_RDWR);
1220 rexit(-1);
1221 }
1222
1223 /* Tell initial process it may now put us into cgroups. */
1224 status = 1;
1225 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1226 if (ret != sizeof(status)) {
1227 ERROR("Intended to send sequence number 1: %s.", strerror(errno));
1228 shutdown(ipc_socket, SHUT_RDWR);
1229 rexit(-1);
1230 }
1231
1232 /* Wait for the initial thread to signal us that it has done everything
1233 * for us when it comes to cgroups etc.
1234 */
1235 expected = 2;
1236 status = -1;
1237 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1238 if (ret <= 0) {
1239 ERROR("Expected to receive sequence number 2: %s", strerror(errno));
1240 shutdown(ipc_socket, SHUT_RDWR);
1241 rexit(-1);
1242 }
1243
1244 if ((init_ctx->container && init_ctx->container->lxc_conf &&
1245 init_ctx->container->lxc_conf->no_new_privs) ||
1246 (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
1247 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
1248 SYSERROR("PR_SET_NO_NEW_PRIVS could not be set. "
1249 "Process can use execve() gainable "
1250 "privileges.");
1251 shutdown(ipc_socket, SHUT_RDWR);
1252 rexit(-1);
1253 }
1254 INFO("PR_SET_NO_NEW_PRIVS is set. Process cannot use execve() "
1255 "gainable privileges.");
1256 }
1257
1258 /* Tell the (grand)parent to send us LSM label fd. */
1259 status = 3;
1260 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1261 if (ret <= 0) {
1262 ERROR("Intended to send sequence number 3: %s.", strerror(errno));
1263 shutdown(ipc_socket, SHUT_RDWR);
1264 rexit(-1);
1265 }
1266
1267 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM) && init_ctx->lsm_label) {
1268 int on_exec;
1269 /* Receive fd for LSM security module. */
1270 ret = lxc_abstract_unix_recv_fd(ipc_socket, &lsm_labelfd, NULL, 0);
1271 if (ret <= 0) {
1272 ERROR("Expected to receive file descriptor: %s.", strerror(errno));
1273 shutdown(ipc_socket, SHUT_RDWR);
1274 rexit(-1);
1275 }
1276
1277 /* Change into our new LSM profile. */
1278 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
1279 if (lsm_set_label_at(lsm_labelfd, on_exec, init_ctx->lsm_label) < 0) {
1280 SYSERROR("Failed to set LSM label.");
1281 shutdown(ipc_socket, SHUT_RDWR);
1282 close(lsm_labelfd);
1283 rexit(-1);
1284 }
1285 close(lsm_labelfd);
1286 }
1287
1288 if (init_ctx->container && init_ctx->container->lxc_conf &&
1289 init_ctx->container->lxc_conf->seccomp &&
1290 (lxc_seccomp_load(init_ctx->container->lxc_conf) != 0)) {
1291 ERROR("Failed to load seccomp policy.");
1292 shutdown(ipc_socket, SHUT_RDWR);
1293 rexit(-1);
1294 }
1295
1296 shutdown(ipc_socket, SHUT_RDWR);
1297 close(ipc_socket);
1298 lxc_proc_put_context_info(init_ctx);
1299
1300 /* The following is done after the communication socket is shut down.
1301 * That way, all errors that might (though unlikely) occur up until this
1302 * point will have their messages printed to the original stderr (if
1303 * logging is so configured) and not the fd the user supplied, if any.
1304 */
1305
1306 /* Fd handling for stdin, stdout and stderr; ignore errors here, user
1307 * may want to make sure the fds are closed, for example.
1308 */
1309 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
1310 dup2(options->stdin_fd, 0);
1311 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
1312 dup2(options->stdout_fd, 1);
1313 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
1314 dup2(options->stderr_fd, 2);
1315
1316 /* close the old fds */
1317 if (options->stdin_fd > 2)
1318 close(options->stdin_fd);
1319 if (options->stdout_fd > 2)
1320 close(options->stdout_fd);
1321 if (options->stderr_fd > 2)
1322 close(options->stderr_fd);
1323
1324 /* Try to remove FD_CLOEXEC flag from stdin/stdout/stderr, but also
1325 * here, ignore errors.
1326 */
1327 for (fd = 0; fd <= 2; fd++) {
1328 flags = fcntl(fd, F_GETFL);
1329 if (flags < 0)
1330 continue;
1331 if (flags & FD_CLOEXEC)
1332 if (fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC) < 0)
1333 SYSERROR("Unable to clear FD_CLOEXEC from file descriptor.");
1334 }
1335
1336 /* We're done, so we can now do whatever the user intended us to do. */
1337 rexit(payload->exec_function(payload->exec_payload));
1338 }
1339
1340 int lxc_attach_run_command(void* payload)
1341 {
1342 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1343
1344 execvp(cmd->program, cmd->argv);
1345 SYSERROR("Failed to exec \"%s\".", cmd->program);
1346 return -1;
1347 }
1348
/* Exec function that runs a shell for the current user in the attached
 * process.
 *
 * The payload argument is not used. The shell is taken from the passwd entry
 * of the current uid; /bin/sh is tried when no shell could be determined or
 * when executing it failed. The function only returns (with -1) when no shell
 * could be executed at all.
 */
int lxc_attach_run_shell(void *payload)
{
	uid_t uid = getuid();
	struct passwd *pwent = getpwuid(uid);
	char *shell;

	/* Ignore payload parameter. */
	(void)payload;

	/* A missing passwd entry probably comes from incompatible nss
	 * implementations in host and container (this code still uses the
	 * host's glibc while our mount namespace is the container's). In that
	 * case fall back to lxc_attach_getpwshell(), which tries to get the
	 * information by spawning a [getent passwd uid] process and parsing
	 * the result.
	 */
	shell = pwent ? pwent->pw_shell : lxc_attach_getpwshell(uid);

	if (shell)
		execlp(shell, shell, (char *)NULL);

	/* Reached if there is no shell to run or execlp() failed for it; fall
	 * back on /bin/sh as a default shell.
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);

	SYSERROR("Failed to exec shell.");
	return -1;
}