]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/attach.c
Merge pull request #1539 from brauner/2017-05-06/fix_abstract_unix_sockets
[mirror_lxc.git] / src / lxc / attach.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include <unistd.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
29 #include <signal.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <grp.h>
33 #include <sys/param.h>
34 #include <sys/prctl.h>
35 #include <sys/mount.h>
36 #include <sys/socket.h>
37 #include <sys/syscall.h>
38 #include <sys/wait.h>
39 #include <linux/unistd.h>
40 #include <pwd.h>
41
42 #ifndef HAVE_DECL_PR_CAPBSET_DROP
43 #define PR_CAPBSET_DROP 24
44 #endif
45
46 #ifndef HAVE_DECL_PR_SET_NO_NEW_PRIVS
47 #define PR_SET_NO_NEW_PRIVS 38
48 #endif
49
50 #ifndef HAVE_DECL_PR_GET_NO_NEW_PRIVS
51 #define PR_GET_NO_NEW_PRIVS 39
52 #endif
53
54 #include "namespace.h"
55 #include "log.h"
56 #include "af_unix.h"
57 #include "attach.h"
58 #include "caps.h"
59 #include "config.h"
60 #include "utils.h"
61 #include "commands.h"
62 #include "cgroup.h"
63 #include "lxclock.h"
64 #include "conf.h"
65 #include "lxcseccomp.h"
66 #include <lxc/lxccontainer.h>
67 #include "lsm/lsm.h"
68 #include "confile.h"
69
70 #if HAVE_SYS_PERSONALITY_H
71 #include <sys/personality.h>
72 #endif
73
74 #ifndef SOCK_CLOEXEC
75 # define SOCK_CLOEXEC 02000000
76 #endif
77
78 #ifndef MS_REC
79 #define MS_REC 16384
80 #endif
81
82 #ifndef MS_SLAVE
83 #define MS_SLAVE (1<<19)
84 #endif
85
86 lxc_log_define(lxc_attach, lxc);
87
88 /* /proc/pid-to-str/current\0 = (5 + 21 + 7 + 1) */
89 #define __LSMATTRLEN (5 + (LXC_NUMSTRLEN64) + 7 + 1)
90 static int lsm_openat(int procfd, pid_t pid, int on_exec)
91 {
92 int ret = -1;
93 int labelfd = -1;
94 const char *name;
95 char path[__LSMATTRLEN];
96
97 name = lsm_name();
98
99 if (strcmp(name, "nop") == 0)
100 return 0;
101
102 if (strcmp(name, "none") == 0)
103 return 0;
104
105 /* We don't support on-exec with AppArmor */
106 if (strcmp(name, "AppArmor") == 0)
107 on_exec = 0;
108
109 if (on_exec)
110 ret = snprintf(path, __LSMATTRLEN, "%d/attr/exec", pid);
111 else
112 ret = snprintf(path, __LSMATTRLEN, "%d/attr/current", pid);
113 if (ret < 0 || ret >= __LSMATTRLEN)
114 return -1;
115
116 labelfd = openat(procfd, path, O_RDWR);
117 if (labelfd < 0) {
118 SYSERROR("Unable to open file descriptor to set LSM label.");
119 return -1;
120 }
121
122 return labelfd;
123 }
124
/* Write an LSM label to an already opened LSM attribute file descriptor.
 *
 * @lsm_labelfd : descriptor to write to; closed before returning unless the
 *                LSM is "nop" or "none"
 * @on_exec     : not consulted by this function
 * @lsm_label   : label to apply; for AppArmor it is sent as
 *                "changeprofile <label>", for SELinux it is sent verbatim
 *
 * Returns 0 on success (or when there is nothing to do), -1 on failure.
 */
static int lsm_set_label_at(int lsm_labelfd, int on_exec, char *lsm_label)
{
	const char *lsm = lsm_name();
	const char *payload;
	size_t payload_len;
	char *profile_cmd = NULL;
	int rc = -1;

	if (!strcmp(lsm, "nop") || !strcmp(lsm, "none"))
		return 0;

	/* The on-exec flag plays no role in how the label is written here. */
	(void)on_exec;

	if (!strcmp(lsm, "AppArmor")) {
		int len;

		profile_cmd = malloc(strlen("changeprofile ") + strlen(lsm_label) + 1);
		if (!profile_cmd) {
			SYSERROR("Failed to write apparmor profile.");
			goto done;
		}

		len = sprintf(profile_cmd, "changeprofile %s", lsm_label);
		if (len < 0) {
			SYSERROR("Failed to write apparmor profile.");
			goto done;
		}

		payload = profile_cmd;
		payload_len = len + 1;
	} else if (!strcmp(lsm, "SELinux")) {
		payload = lsm_label;
		payload_len = strlen(lsm_label) + 1;
	} else {
		ERROR("Unable to restore label for unknown LSM: %s.", lsm);
		goto done;
	}

	/* The terminating NUL byte is written along with the string. */
	if (write(lsm_labelfd, payload, payload_len) < 0) {
		SYSERROR("Unable to set LSM label: %s.", payload);
		goto done;
	}
	INFO("Set LSM label to: %s.", payload);
	rc = 0;

done:
	free(profile_cmd);

	if (lsm_labelfd != -1)
		close(lsm_labelfd);

	return rc;
}
183
184 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
185 #define __PROC_STATUS_LEN (5 + (LXC_NUMSTRLEN64) + 7 + 1)
186 static struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid)
187 {
188 FILE *proc_file;
189 char proc_fn[__PROC_STATUS_LEN];
190 bool found;
191 int ret;
192 char *line = NULL;
193 size_t line_bufsz = 0;
194 struct lxc_proc_context_info *info = NULL;
195
196 /* Read capabilities. */
197 ret = snprintf(proc_fn, __PROC_STATUS_LEN, "/proc/%d/status", pid);
198 if (ret < 0 || ret >= __PROC_STATUS_LEN)
199 goto on_error;
200
201 proc_file = fopen(proc_fn, "r");
202 if (!proc_file) {
203 SYSERROR("Could not open %s.", proc_fn);
204 goto on_error;
205 }
206
207 info = calloc(1, sizeof(*info));
208 if (!info) {
209 SYSERROR("Could not allocate memory.");
210 return NULL;
211 }
212
213 found = false;
214 while (getline(&line, &line_bufsz, proc_file) != -1) {
215 ret = sscanf(line, "CapBnd: %llx", &info->capability_mask);
216 if (ret != EOF && ret == 1) {
217 found = true;
218 break;
219 }
220 }
221
222 free(line);
223 fclose(proc_file);
224
225 if (!found) {
226 SYSERROR("Could not read capability bounding set from %s.", proc_fn);
227 errno = ENOENT;
228 goto on_error;
229 }
230
231 info->lsm_label = lsm_process_label_get(pid);
232
233 return info;
234
235 on_error:
236 free(info);
237 return NULL;
238 }
239
240 static void lxc_proc_put_context_info(struct lxc_proc_context_info *ctx)
241 {
242 free(ctx->lsm_label);
243 if (ctx->container)
244 lxc_container_put(ctx->container);
245 free(ctx);
246 }
247
248 static int lxc_attach_to_ns(pid_t pid, int which)
249 {
250 int fd[LXC_NS_MAX];
251 int i, j, saved_errno;
252
253
254 if (access("/proc/self/ns", X_OK)) {
255 ERROR("Does this kernel version support namespaces?");
256 return -1;
257 }
258
259 for (i = 0; i < LXC_NS_MAX; i++) {
260 /* Ignore if we are not supposed to attach to that namespace. */
261 if (which != -1 && !(which & ns_info[i].clone_flag)) {
262 fd[i] = -1;
263 continue;
264 }
265
266 fd[i] = lxc_preserve_ns(pid, ns_info[i].proc_name);
267 if (fd[i] < 0) {
268 saved_errno = errno;
269
270 /* Close all already opened file descriptors before we
271 * return an error, so we don't leak them.
272 */
273 for (j = 0; j < i; j++)
274 close(fd[j]);
275
276 errno = saved_errno;
277 SYSERROR("Failed to open namespace: \"%s\".", ns_info[i].proc_name);
278 return -1;
279 }
280 }
281
282 for (i = 0; i < LXC_NS_MAX; i++) {
283 if (fd[i] < 0)
284 continue;
285
286 if (setns(fd[i], 0) < 0) {
287 saved_errno = errno;
288
289 for (j = i; j < LXC_NS_MAX; j++)
290 close(fd[j]);
291
292 errno = saved_errno;
293 SYSERROR("Failed to attach to namespace \"%s\".", ns_info[i].proc_name);
294 return -1;
295 }
296
297 DEBUG("Attached to namespace \"%s\".", ns_info[i].proc_name);
298
299 close(fd[i]);
300 }
301
302 return 0;
303 }
304
/* Give the calling process a private mount namespace with freshly mounted
 * /proc and /sys.
 *
 * /proc is assumed to be mounted and is always replaced. /sys is only
 * remounted when detaching it succeeded; EINVAL from that umount is
 * tolerated because /sys may not have been a mount point at all.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_attach_remount_sys_proc(void)
{
	int rc;

	if (unshare(CLONE_NEWNS) < 0) {
		SYSERROR("Failed to unshare mount namespace.");
		return -1;
	}

	/* Failing to make "/" rslave is logged but not treated as fatal. */
	if (detect_shared_rootfs() && mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		SYSERROR("Failed to make / rslave.");
		ERROR("Continuing...");
	}

	if (umount2("/proc", MNT_DETACH) < 0) {
		SYSERROR("Failed to unmount /proc.");
		return -1;
	}

	if (mount("none", "/proc", "proc", 0, NULL) < 0) {
		SYSERROR("Failed to remount /proc.");
		return -1;
	}

	rc = umount2("/sys", MNT_DETACH);
	if (rc < 0 && errno != EINVAL) {
		SYSERROR("Failed to unmount /sys.");
		return -1;
	}

	/* Only mount sysfs again if there was something to detach. */
	if (rc == 0 && mount("none", "/sys", "sysfs", 0, NULL) < 0) {
		SYSERROR("Failed to remount /sys.");
		return -1;
	}

	return 0;
}
353
354 static int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx)
355 {
356 int last_cap = lxc_caps_last_cap();
357 int cap;
358
359 for (cap = 0; cap <= last_cap; cap++) {
360 if (ctx->capability_mask & (1LL << cap))
361 continue;
362
363 if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
364 SYSERROR("Failed to remove capability id %d.", cap);
365 return -1;
366 }
367 }
368
369 return 0;
370 }
371
372 static int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep)
373 {
374 if (policy == LXC_ATTACH_CLEAR_ENV) {
375 char **extra_keep_store = NULL;
376 int path_kept = 0;
377
378 if (extra_keep) {
379 size_t count, i;
380
381 for (count = 0; extra_keep[count]; count++);
382
383 extra_keep_store = calloc(count, sizeof(char *));
384 if (!extra_keep_store) {
385 SYSERROR("Failed to allocate memory for storing current "
386 "environment variable values that will be kept.");
387 return -1;
388 }
389 for (i = 0; i < count; i++) {
390 char *v = getenv(extra_keep[i]);
391 if (v) {
392 extra_keep_store[i] = strdup(v);
393 if (!extra_keep_store[i]) {
394 SYSERROR("Failed to allocate memory for storing current "
395 "environment variable values that will be kept.");
396 while (i > 0)
397 free(extra_keep_store[--i]);
398 free(extra_keep_store);
399 return -1;
400 }
401 if (strcmp(extra_keep[i], "PATH") == 0)
402 path_kept = 1;
403 }
404 /* Calloc sets entire array to zero, so we don't
405 * need an else.
406 */
407 }
408 }
409
410 if (clearenv()) {
411 char **p;
412 SYSERROR("Failed to clear environment.");
413 if (extra_keep_store) {
414 for (p = extra_keep_store; *p; p++)
415 free(*p);
416 free(extra_keep_store);
417 }
418 return -1;
419 }
420
421 if (extra_keep_store) {
422 size_t i;
423 for (i = 0; extra_keep[i]; i++) {
424 if (extra_keep_store[i]) {
425 if (setenv(extra_keep[i], extra_keep_store[i], 1) < 0)
426 SYSERROR("Unable to set environment variable.");
427 }
428 free(extra_keep_store[i]);
429 }
430 free(extra_keep_store);
431 }
432
433 /* Always set a default path; shells and execlp tend to be fine
434 * without it, but there is a disturbing number of C programs
435 * out there that just assume that getenv("PATH") is never NULL
436 * and then die a painful segfault death.
437 */
438 if (!path_kept)
439 setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
440 }
441
442 if (putenv("container=lxc")) {
443 SYSERROR("Failed to set environment variable.");
444 return -1;
445 }
446
447 /* Set extra environment variables. */
448 if (extra_env) {
449 for (; *extra_env; extra_env++) {
450 /* Duplicate the string, just to be on the safe side,
451 * because putenv does not do it for us.
452 */
453 char *p = strdup(*extra_env);
454 /* We just assume the user knows what they are doing, so
455 * we don't do any checks.
456 */
457 if (!p) {
458 SYSERROR("Failed to allocate memory for additional environment "
459 "variables.");
460 return -1;
461 }
462 putenv(p);
463 }
464 }
465
466 return 0;
467 }
468
/* Look up the login shell of a user by running "getent passwd <uid>".
 *
 * A child process executes getent with its stdout connected to a pipe; the
 * parent parses the passwd-style output and picks the shell field of the
 * entry whose uid field matches.
 *
 * @uid : user id to look up
 *
 * Returns a heap-allocated string the caller must free(), or NULL when the
 * lookup failed in any way (no matching entry, malformed output, getent
 * missing or exiting unsuccessfully, resource errors).
 */
static char *lxc_attach_getpwshell(uid_t uid)
{
	pid_t pid;
	int pipes[2];
	int ret;
	int fd;
	char *result = NULL;

	/* We need to fork off a process that runs the getent program, and we
	 * need to capture its output, so we use a pipe for that purpose.
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid) {
		FILE *pipe_f;
		char *line = NULL;
		size_t line_bufsz = 0;
		int found = 0;
		int status;

		close(pipes[1]);

		pipe_f = fdopen(pipes[0], "r");
		if (!pipe_f) {
			/* Without a stream nothing can be parsed. Close the
			 * read end and still reap the child below.
			 */
			close(pipes[0]);
			goto reap_child;
		}

		while (getline(&line, &line_bufsz, pipe_f) != -1) {
			char *token;
			char *saveptr = NULL;
			long value;
			char *endptr = NULL;
			int i;

			/* If we already found something, just continue to read
			 * until the pipe doesn't deliver any more data, but
			 * don't modify the existing data structure.
			 */
			if (found)
				continue;

			/* Trim line on the right hand side. */
			for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
				line[i - 1] = '\0';

			/* Split into tokens: first: user name. */
			token = strtok_r(line, ":", &saveptr);
			if (!token)
				continue;
			/* next: dummy password field */
			token = strtok_r(NULL, ":", &saveptr);
			if (!token)
				continue;
			/* next: user id */
			token = strtok_r(NULL, ":", &saveptr);
			value = token ? strtol(token, &endptr, 10) : 0;
			if (!token || !endptr || *endptr || value == LONG_MIN || value == LONG_MAX)
				continue;
			/* dummy sanity check: user id matches */
			if ((uid_t) value != uid)
				continue;
			/* skip fields: gid, gecos, dir, go to next field 'shell' */
			for (i = 0; i < 4; i++) {
				token = strtok_r(NULL, ":", &saveptr);
				if (!token)
					break;
			}
			if (!token)
				continue;
			free(result);
			result = strdup(token);

			/* Sanity check that there are no fields after that. */
			token = strtok_r(NULL, ":", &saveptr);
			if (token)
				continue;

			found = 1;
		}

		free(line);
		fclose(pipe_f);

reap_child:
		if (waitpid(pid, &status, 0) < 0) {
			if (errno == EINTR)
				goto reap_child;
			goto on_error;
		}

		/* Some sanity checks. If anything even hinted at going wrong,
		 * we can't be sure we have a valid result, so we assume we
		 * don't.
		 */
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 || !found)
			goto on_error;

		return result;

on_error:
		/* A candidate may have been duplicated before the failure was
		 * noticed; do not leak it.
		 */
		free(result);
		return NULL;
	} else {
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* We want to capture stdout. Use _exit() on every failure in
		 * this forked child so that it neither runs the parent's
		 * atexit handlers nor flushes inherited stdio buffers.
		 */
		if (dup2(pipes[1], 1) < 0)
			_exit(-1);
		close(pipes[1]);

		/* Get rid of stdin/stderr, so we try to associate it with
		 * /dev/null.
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			close(fd);
		}

		/* Finish argument list. */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0 || (size_t) ret >= sizeof(uid_buf))
			_exit(-1);

		/* Try to run getent program. */
		(void) execvp("getent", arguments);
		_exit(-1);
	}
}
616
617 static void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
618 {
619 FILE *proc_file;
620 char proc_fn[__PROC_STATUS_LEN];
621 int ret;
622 char *line = NULL;
623 size_t line_bufsz = 0;
624 long value = -1;
625 uid_t uid = (uid_t)-1;
626 gid_t gid = (gid_t)-1;
627
628 /* Read capabilities. */
629 snprintf(proc_fn, __PROC_STATUS_LEN, "/proc/%d/status", 1);
630
631 proc_file = fopen(proc_fn, "r");
632 if (!proc_file)
633 return;
634
635 while (getline(&line, &line_bufsz, proc_file) != -1) {
636 /* Format is: real, effective, saved set user, fs we only care
637 * about real uid.
638 */
639 ret = sscanf(line, "Uid: %ld", &value);
640 if (ret != EOF && ret == 1) {
641 uid = (uid_t) value;
642 } else {
643 ret = sscanf(line, "Gid: %ld", &value);
644 if (ret != EOF && ret == 1)
645 gid = (gid_t) value;
646 }
647 if (uid != (uid_t)-1 && gid != (gid_t)-1)
648 break;
649 }
650
651 fclose(proc_file);
652 free(line);
653
654 /* Only override arguments if we found something. */
655 if (uid != (uid_t)-1)
656 *init_uid = uid;
657 if (gid != (gid_t)-1)
658 *init_gid = gid;
659
660 /* TODO: we should also parse supplementary groups and use
661 * setgroups() to set them.
662 */
663 }
664
665 struct attach_clone_payload {
666 int ipc_socket;
667 lxc_attach_options_t* options;
668 struct lxc_proc_context_info* init_ctx;
669 lxc_attach_exec_t exec_function;
670 void* exec_payload;
671 };
672
673 static int attach_child_main(void* data);
674
675 /* Help the optimizer along if it doesn't know that exit always exits. */
676 #define rexit(c) do { int __c = (c); _exit(__c); return __c; } while(0)
677
678 /* Define default options if no options are supplied by the user. */
679 static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
680
681 static bool fetch_seccomp(struct lxc_container *c,
682 lxc_attach_options_t *options)
683 {
684 char *path;
685
686 if (!(options->namespaces & CLONE_NEWNS) || !(options->attach_flags & LXC_ATTACH_LSM)) {
687 free(c->lxc_conf->seccomp);
688 c->lxc_conf->seccomp = NULL;
689 return true;
690 }
691
692 /* Remove current setting. */
693 if (!c->set_config_item(c, "lxc.seccomp", "")) {
694 return false;
695 }
696
697 /* Fetch the current profile path over the cmd interface. */
698 path = c->get_running_config_item(c, "lxc.seccomp");
699 if (!path) {
700 INFO("Failed to get running config item for lxc.seccomp.");
701 return true;
702 }
703
704 /* Copy the value into the new lxc_conf. */
705 if (!c->set_config_item(c, "lxc.seccomp", path)) {
706 free(path);
707 return false;
708 }
709 free(path);
710
711 /* Attempt to parse the resulting config. */
712 if (lxc_read_seccomp_config(c->lxc_conf) < 0) {
713 ERROR("Error reading seccomp policy.");
714 return false;
715 }
716
717 INFO("Retrieved seccomp policy.");
718 return true;
719 }
720
721 static bool no_new_privs(struct lxc_container *c,
722 lxc_attach_options_t *options)
723 {
724 char *val;
725
726 /* Remove current setting. */
727 if (!c->set_config_item(c, "lxc.no_new_privs", "")) {
728 return false;
729 }
730
731 /* Retrieve currently active setting. */
732 val = c->get_running_config_item(c, "lxc.no_new_privs");
733 if (!val) {
734 INFO("Failed to get running config item for lxc.no_new_privs.");
735 return false;
736 }
737
738 /* Set currently active setting. */
739 if (!c->set_config_item(c, "lxc.no_new_privs", val)) {
740 free(val);
741 return false;
742 }
743 free(val);
744
745 return true;
746 }
747
/* Query the "lxc.arch" config item of a running container and convert it
 * with lxc_config_parse_arch().
 *
 * Returns -1 when the item cannot be retrieved, otherwise whatever
 * lxc_config_parse_arch() returns for the retrieved string.
 */
static signed long get_personality(const char *name, const char *lxcpath)
{
	signed long arch = -1;
	char *value = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);

	if (value) {
		arch = lxc_config_parse_arch(value);
		free(value);
	}

	return arch;
}
759
760 int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
761 {
762 int ret, status;
763 pid_t init_pid, pid, attached_pid, expected;
764 struct lxc_proc_context_info *init_ctx;
765 char* cwd;
766 char* new_cwd;
767 int ipc_sockets[2];
768 signed long personality;
769
770 if (!options)
771 options = &attach_static_default_options;
772
773 init_pid = lxc_cmd_get_init_pid(name, lxcpath);
774 if (init_pid < 0) {
775 ERROR("Failed to get init pid.");
776 return -1;
777 }
778
779 init_ctx = lxc_proc_get_context_info(init_pid);
780 if (!init_ctx) {
781 ERROR("Failed to get context of init process: %ld.",
782 (long)init_pid);
783 return -1;
784 }
785
786 personality = get_personality(name, lxcpath);
787 if (init_ctx->personality < 0) {
788 ERROR("Failed to get personality of the container.");
789 lxc_proc_put_context_info(init_ctx);
790 return -1;
791 }
792 init_ctx->personality = personality;
793
794 init_ctx->container = lxc_container_new(name, lxcpath);
795 if (!init_ctx->container)
796 return -1;
797
798 if (!fetch_seccomp(init_ctx->container, options))
799 WARN("Failed to get seccomp policy.");
800
801 if (!no_new_privs(init_ctx->container, options))
802 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set.");
803
804 cwd = getcwd(NULL, 0);
805
806 /* Determine which namespaces the container was created with
807 * by asking lxc-start, if necessary.
808 */
809 if (options->namespaces == -1) {
810 options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
811 /* call failed */
812 if (options->namespaces == -1) {
813 ERROR("Failed to automatically determine the "
814 "namespaces which the container uses.");
815 free(cwd);
816 lxc_proc_put_context_info(init_ctx);
817 return -1;
818 }
819 }
820
821 /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
822 * to make sure we don't irritate other threads that want to fork+exec
823 * away
824 *
825 * IMPORTANT: if the initial process is multithreaded and another call
826 * just fork()s away without exec'ing directly after, the socket fd will
827 * exist in the forked process from the other thread and any close() in
828 * our own child process will not really cause the socket to close
829 * properly, potentiall causing the parent to hang.
830 *
831 * For this reason, while IPC is still active, we have to use shutdown()
832 * if the child exits prematurely in order to signal that the socket is
833 * closed and cannot assume that the child exiting will automatically do
834 * that.
835 *
836 * IPC mechanism: (X is receiver)
837 * initial process intermediate attached
838 * X <--- send pid of
839 * attached proc,
840 * then exit
841 * send 0 ------------------------------------> X
842 * [do initialization]
843 * X <------------------------------------ send 1
844 * [add to cgroup, ...]
845 * send 2 ------------------------------------> X
846 * [set LXC_ATTACH_NO_NEW_PRIVS]
847 * X <------------------------------------ send 3
848 * [open LSM label fd]
849 * send 4 ------------------------------------> X
850 * [set LSM label]
851 * close socket close socket
852 * run program
853 */
854 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
855 if (ret < 0) {
856 SYSERROR("Could not set up required IPC mechanism for attaching.");
857 free(cwd);
858 lxc_proc_put_context_info(init_ctx);
859 return -1;
860 }
861
862 /* Create intermediate subprocess, three reasons:
863 * 1. Runs all pthread_atfork handlers and the child will no
864 * longer be threaded (we can't properly setns() in a threaded
865 * process).
866 * 2. We can't setns() in the child itself, since we want to make
867 * sure we are properly attached to the pidns.
868 * 3. Also, the initial thread has to put the attached process
869 * into the cgroup, which we can only do if we didn't already
870 * setns() (otherwise, user namespaces will hate us).
871 */
872 pid = fork();
873
874 if (pid < 0) {
875 SYSERROR("Failed to create first subprocess.");
876 free(cwd);
877 lxc_proc_put_context_info(init_ctx);
878 return -1;
879 }
880
881 if (pid) {
882 int procfd = -1;
883 pid_t to_cleanup_pid = pid;
884
885 /* Initial thread, we close the socket that is for the
886 * subprocesses.
887 */
888 close(ipc_sockets[1]);
889 free(cwd);
890
891 /* Attach to cgroup, if requested. */
892 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
893 if (!cgroup_attach(name, lxcpath, pid))
894 goto on_error;
895 }
896
897 /* Setup resource limits */
898 if (!lxc_list_empty(&init_ctx->container->lxc_conf->limits) && setup_resource_limits(&init_ctx->container->lxc_conf->limits, pid)) {
899 goto on_error;
900 }
901
902 /* Open /proc before setns() to the containers namespace so we
903 * don't rely on any information from inside the container.
904 */
905 procfd = open("/proc", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
906 if (procfd < 0) {
907 SYSERROR("Unable to open /proc.");
908 goto on_error;
909 }
910
911 /* Let the child process know to go ahead. */
912 status = 0;
913 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
914 if (ret <= 0) {
915 ERROR("Intended to send sequence number 0: %s.",
916 strerror(errno));
917 goto on_error;
918 }
919
920 /* Get pid of attached process from intermediate process. */
921 ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
922 if (ret <= 0) {
923 if (ret != 0)
924 ERROR("Expected to receive pid: %s.", strerror(errno));
925 goto on_error;
926 }
927
928 /* Ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
929 if (options->stdin_fd == 0) {
930 signal(SIGINT, SIG_IGN);
931 signal(SIGQUIT, SIG_IGN);
932 }
933
934 /* Reap intermediate process. */
935 ret = wait_for_pid(pid);
936 if (ret < 0)
937 goto on_error;
938
939 /* We will always have to reap the attached process now. */
940 to_cleanup_pid = attached_pid;
941
942 /* Tell attached process it may start initializing. */
943 status = 0;
944 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
945 if (ret <= 0) {
946 ERROR("Intended to send sequence number 0: %s.", strerror(errno));
947 goto on_error;
948 }
949
950 /* Wait for the attached process to finish initializing. */
951 expected = 1;
952 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
953 if (ret <= 0) {
954 if (ret != 0)
955 ERROR("Expected to receive sequence number 1: %s.", strerror(errno));
956 goto on_error;
957 }
958
959 /* Tell attached process we're done. */
960 status = 2;
961 ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
962 if (ret <= 0) {
963 ERROR("Intended to send sequence number 2: %s.", strerror(errno));
964 goto on_error;
965 }
966
967 /* Wait for the (grand)child to tell us that it's ready to set
968 * up its LSM labels.
969 */
970 expected = 3;
971 ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
972 if (ret <= 0) {
973 ERROR("Expected to receive sequence number 3: %s.",
974 strerror(errno));
975 goto on_error;
976 }
977
978 /* Open LSM fd and send it to child. */
979 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM) && init_ctx->lsm_label) {
980 int on_exec, saved_errno;
981 int labelfd = -1;
982 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
983 /* Open fd for the LSM security module. */
984 labelfd = lsm_openat(procfd, attached_pid, on_exec);
985 if (labelfd < 0)
986 goto on_error;
987
988 /* Send child fd of the LSM security module to write to. */
989 ret = lxc_abstract_unix_send_fd(ipc_sockets[0], labelfd, NULL, 0);
990 saved_errno = errno;
991 close(labelfd);
992 if (ret <= 0) {
993 ERROR("Intended to send file descriptor %d: %s.", labelfd, strerror(saved_errno));
994 goto on_error;
995 }
996 }
997
998 if (procfd >= 0)
999 close(procfd);
1000 /* Now shut down communication with child, we're done. */
1001 shutdown(ipc_sockets[0], SHUT_RDWR);
1002 close(ipc_sockets[0]);
1003 lxc_proc_put_context_info(init_ctx);
1004
1005 /* We're done, the child process should now execute whatever it
1006 * is that the user requested. The parent can now track it with
1007 * waitpid() or similar.
1008 */
1009
1010 *attached_process = attached_pid;
1011 return 0;
1012
1013 on_error:
1014 /* First shut down the socket, then wait for the pid, otherwise
1015 * the pid we're waiting for may never exit.
1016 */
1017 if (procfd >= 0)
1018 close(procfd);
1019 shutdown(ipc_sockets[0], SHUT_RDWR);
1020 close(ipc_sockets[0]);
1021 if (to_cleanup_pid)
1022 (void) wait_for_pid(to_cleanup_pid);
1023 lxc_proc_put_context_info(init_ctx);
1024 return -1;
1025 }
1026
1027 /* First subprocess begins here, we close the socket that is for the
1028 * initial thread.
1029 */
1030 close(ipc_sockets[0]);
1031
1032 /* Wait for the parent to have setup cgroups. */
1033 expected = 0;
1034 status = -1;
1035 ret = lxc_read_nointr_expect(ipc_sockets[1], &status, sizeof(status), &expected);
1036 if (ret <= 0) {
1037 ERROR("Expected to receive sequence number 0: %s.", strerror(errno));
1038 shutdown(ipc_sockets[1], SHUT_RDWR);
1039 rexit(-1);
1040 }
1041
1042 if ((options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) && cgns_supported())
1043 options->namespaces |= CLONE_NEWCGROUP;
1044
1045 /* Attach now, create another subprocess later, since pid namespaces
1046 * only really affect the children of the current process.
1047 */
1048 ret = lxc_attach_to_ns(init_pid, options->namespaces);
1049 if (ret < 0) {
1050 ERROR("Failed to enter namespaces.");
1051 shutdown(ipc_sockets[1], SHUT_RDWR);
1052 rexit(-1);
1053 }
1054
1055 /* Attach succeeded, try to cwd. */
1056 if (options->initial_cwd)
1057 new_cwd = options->initial_cwd;
1058 else
1059 new_cwd = cwd;
1060 ret = chdir(new_cwd);
1061 if (ret < 0)
1062 WARN("Could not change directory to \"%s\".", new_cwd);
1063 free(cwd);
1064
1065 /* Now create the real child process. */
1066 {
1067 struct attach_clone_payload payload = {
1068 .ipc_socket = ipc_sockets[1],
1069 .options = options,
1070 .init_ctx = init_ctx,
1071 .exec_function = exec_function,
1072 .exec_payload = exec_payload,
1073 };
1074 /* We use clone_parent here to make this subprocess a direct
1075 * child of the initial process. Then this intermediate process
1076 * can exit and the parent can directly track the attached
1077 * process.
1078 */
1079 pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
1080 }
1081
1082 /* Shouldn't happen, clone() should always return positive pid. */
1083 if (pid <= 0) {
1084 SYSERROR("Failed to create subprocess.");
1085 shutdown(ipc_sockets[1], SHUT_RDWR);
1086 rexit(-1);
1087 }
1088
1089 /* Tell grandparent the pid of the pid of the newly created child. */
1090 ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
1091 if (ret != sizeof(pid)) {
1092 /* If this really happens here, this is very unfortunate, since
1093 * the parent will not know the pid of the attached process and
1094 * will not be able to wait for it (and we won't either due to
1095 * CLONE_PARENT) so the parent won't be able to reap it and the
1096 * attached process will remain a zombie.
1097 */
1098 ERROR("Intended to send pid %d: %s.", pid, strerror(errno));
1099 shutdown(ipc_sockets[1], SHUT_RDWR);
1100 rexit(-1);
1101 }
1102
1103 /* The rest is in the hands of the initial and the attached process. */
1104 rexit(0);
1105 }
1106
1107 static int attach_child_main(void* data)
1108 {
1109 struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
1110 int ipc_socket = payload->ipc_socket;
1111 lxc_attach_options_t* options = payload->options;
1112 struct lxc_proc_context_info* init_ctx = payload->init_ctx;
1113 #if HAVE_SYS_PERSONALITY_H
1114 long new_personality;
1115 #endif
1116 int ret;
1117 int status;
1118 int expected;
1119 long flags;
1120 int fd;
1121 int lsm_labelfd;
1122 uid_t new_uid;
1123 gid_t new_gid;
1124
1125 /* Wait for the initial thread to signal us that it's ready for us to
1126 * start initializing.
1127 */
1128 expected = 0;
1129 status = -1;
1130 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1131 if (ret <= 0) {
1132 ERROR("Expected to receive sequence number 0: %s.", strerror(errno));
1133 shutdown(ipc_socket, SHUT_RDWR);
1134 rexit(-1);
1135 }
1136
1137 /* A description of the purpose of this functionality is provided in the
1138 * lxc-attach(1) manual page. We have to remount here and not in the
1139 * parent process, otherwise /proc may not properly reflect the new pid
1140 * namespace.
1141 */
1142 if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
1143 ret = lxc_attach_remount_sys_proc();
1144 if (ret < 0) {
1145 shutdown(ipc_socket, SHUT_RDWR);
1146 rexit(-1);
1147 }
1148 }
1149
1150 /* Now perform additional attachments. */
1151 #if HAVE_SYS_PERSONALITY_H
1152 if (options->personality < 0)
1153 new_personality = init_ctx->personality;
1154 else
1155 new_personality = options->personality;
1156
1157 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
1158 ret = personality(new_personality);
1159 if (ret < 0) {
1160 SYSERROR("Could not ensure correct architecture.");
1161 shutdown(ipc_socket, SHUT_RDWR);
1162 rexit(-1);
1163 }
1164 }
1165 #endif
1166
1167 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
1168 ret = lxc_attach_drop_privs(init_ctx);
1169 if (ret < 0) {
1170 ERROR("Could not drop privileges.");
1171 shutdown(ipc_socket, SHUT_RDWR);
1172 rexit(-1);
1173 }
1174 }
1175
1176 /* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
1177 * if you want this to be a no-op).
1178 */
1179 ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
1180 if (ret < 0) {
1181 ERROR("Could not set initial environment for attached process.");
1182 shutdown(ipc_socket, SHUT_RDWR);
1183 rexit(-1);
1184 }
1185
1186 /* Set {u,g}id. */
1187 new_uid = 0;
1188 new_gid = 0;
1189 /* Ignore errors, we will fall back to root in that case (/proc was not
1190 * mounted etc.).
1191 */
1192 if (options->namespaces & CLONE_NEWUSER)
1193 lxc_attach_get_init_uidgid(&new_uid, &new_gid);
1194
1195 if (options->uid != (uid_t)-1)
1196 new_uid = options->uid;
1197 if (options->gid != (gid_t)-1)
1198 new_gid = options->gid;
1199
1200 /* Setup the controlling tty. */
1201 if (options->stdin_fd && isatty(options->stdin_fd)) {
1202 if (setsid() < 0) {
1203 SYSERROR("Unable to setsid.");
1204 shutdown(ipc_socket, SHUT_RDWR);
1205 rexit(-1);
1206 }
1207
1208 if (ioctl(options->stdin_fd, TIOCSCTTY, (char *)NULL) < 0) {
1209 SYSERROR("Unable to set TIOCSTTY.");
1210 shutdown(ipc_socket, SHUT_RDWR);
1211 rexit(-1);
1212 }
1213 }
1214
1215 /* Try to set the {u,g}id combination. */
1216 if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER)) {
1217 if (setgid(new_gid) || setgroups(0, NULL)) {
1218 SYSERROR("Switching to container gid.");
1219 shutdown(ipc_socket, SHUT_RDWR);
1220 rexit(-1);
1221 }
1222 }
1223 if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
1224 SYSERROR("Switching to container uid.");
1225 shutdown(ipc_socket, SHUT_RDWR);
1226 rexit(-1);
1227 }
1228
1229 /* Tell initial process it may now put us into cgroups. */
1230 status = 1;
1231 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1232 if (ret != sizeof(status)) {
1233 ERROR("Intended to send sequence number 1: %s.", strerror(errno));
1234 shutdown(ipc_socket, SHUT_RDWR);
1235 rexit(-1);
1236 }
1237
1238 /* Wait for the initial thread to signal us that it has done everything
1239 * for us when it comes to cgroups etc.
1240 */
1241 expected = 2;
1242 status = -1;
1243 ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
1244 if (ret <= 0) {
1245 ERROR("Expected to receive sequence number 2: %s", strerror(errno));
1246 shutdown(ipc_socket, SHUT_RDWR);
1247 rexit(-1);
1248 }
1249
1250 if ((init_ctx->container && init_ctx->container->lxc_conf &&
1251 init_ctx->container->lxc_conf->no_new_privs) ||
1252 (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
1253 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
1254 SYSERROR("PR_SET_NO_NEW_PRIVS could not be set. "
1255 "Process can use execve() gainable "
1256 "privileges.");
1257 shutdown(ipc_socket, SHUT_RDWR);
1258 rexit(-1);
1259 }
1260 INFO("PR_SET_NO_NEW_PRIVS is set. Process cannot use execve() "
1261 "gainable privileges.");
1262 }
1263
1264 /* Tell the (grand)parent to send us LSM label fd. */
1265 status = 3;
1266 ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
1267 if (ret <= 0) {
1268 ERROR("Intended to send sequence number 3: %s.", strerror(errno));
1269 shutdown(ipc_socket, SHUT_RDWR);
1270 rexit(-1);
1271 }
1272
1273 if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_LSM) && init_ctx->lsm_label) {
1274 int on_exec;
1275 /* Receive fd for LSM security module. */
1276 ret = lxc_abstract_unix_recv_fd(ipc_socket, &lsm_labelfd, NULL, 0);
1277 if (ret <= 0) {
1278 ERROR("Expected to receive file descriptor: %s.", strerror(errno));
1279 shutdown(ipc_socket, SHUT_RDWR);
1280 rexit(-1);
1281 }
1282
1283 /* Change into our new LSM profile. */
1284 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? 1 : 0;
1285 if (lsm_set_label_at(lsm_labelfd, on_exec, init_ctx->lsm_label) < 0) {
1286 SYSERROR("Failed to set LSM label.");
1287 shutdown(ipc_socket, SHUT_RDWR);
1288 close(lsm_labelfd);
1289 rexit(-1);
1290 }
1291 close(lsm_labelfd);
1292 }
1293
1294 if (init_ctx->container && init_ctx->container->lxc_conf &&
1295 init_ctx->container->lxc_conf->seccomp &&
1296 (lxc_seccomp_load(init_ctx->container->lxc_conf) != 0)) {
1297 ERROR("Failed to load seccomp policy.");
1298 shutdown(ipc_socket, SHUT_RDWR);
1299 rexit(-1);
1300 }
1301
1302 shutdown(ipc_socket, SHUT_RDWR);
1303 close(ipc_socket);
1304 lxc_proc_put_context_info(init_ctx);
1305
1306 /* The following is done after the communication socket is shut down.
1307 * That way, all errors that might (though unlikely) occur up until this
1308 * point will have their messages printed to the original stderr (if
1309 * logging is so configured) and not the fd the user supplied, if any.
1310 */
1311
1312 /* Fd handling for stdin, stdout and stderr; ignore errors here, user
1313 * may want to make sure the fds are closed, for example.
1314 */
1315 if (options->stdin_fd >= 0 && options->stdin_fd != 0)
1316 dup2(options->stdin_fd, 0);
1317 if (options->stdout_fd >= 0 && options->stdout_fd != 1)
1318 dup2(options->stdout_fd, 1);
1319 if (options->stderr_fd >= 0 && options->stderr_fd != 2)
1320 dup2(options->stderr_fd, 2);
1321
1322 /* close the old fds */
1323 if (options->stdin_fd > 2)
1324 close(options->stdin_fd);
1325 if (options->stdout_fd > 2)
1326 close(options->stdout_fd);
1327 if (options->stderr_fd > 2)
1328 close(options->stderr_fd);
1329
1330 /* Try to remove FD_CLOEXEC flag from stdin/stdout/stderr, but also
1331 * here, ignore errors.
1332 */
1333 for (fd = 0; fd <= 2; fd++) {
1334 flags = fcntl(fd, F_GETFL);
1335 if (flags < 0)
1336 continue;
1337 if (flags & FD_CLOEXEC)
1338 if (fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC) < 0)
1339 SYSERROR("Unable to clear FD_CLOEXEC from file descriptor.");
1340 }
1341
1342 /* We're done, so we can now do whatever the user intended us to do. */
1343 rexit(payload->exec_function(payload->exec_payload));
1344 }
1345
1346 int lxc_attach_run_command(void* payload)
1347 {
1348 lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
1349
1350 execvp(cmd->program, cmd->argv);
1351 SYSERROR("Failed to exec \"%s\".", cmd->program);
1352 return -1;
1353 }
1354
/*
 * lxc_attach_run_shell - exec the login shell of the calling user.
 *
 * @payload: unused.
 *
 * The shell is taken from the passwd entry of the current uid. When no entry
 * can be obtained through getpwuid(), lxc_attach_getpwshell() is asked
 * instead. If no shell is known, or exec'ing it fails, /bin/sh is tried.
 *
 * Does not return when one of the execlp() calls succeeds. Otherwise the
 * error is logged and -1 is returned.
 */
int lxc_attach_run_shell(void* payload)
{
	uid_t caller_uid;
	struct passwd *pw_entry;
	char *shell;

	/* The payload carries no information for this runner. */
	(void)payload;

	caller_uid = getuid();
	pw_entry = getpwuid(caller_uid);

	if (pw_entry) {
		shell = pw_entry->pw_shell;
	} else {
		/* The lookup probably failed because of incompatible nss
		 * implementations in host and container (this code still
		 * uses the host's glibc while the mount namespace is the
		 * container's), so fall back to the helper that obtains the
		 * shell by other means.
		 */
		shell = lxc_attach_getpwshell(caller_uid);
	}

	if (shell != NULL)
		execlp(shell, shell, (char *)NULL);

	/* Reached if there is no shell to try or execlp() of the user's shell
	 * failed: use /bin/sh as the default shell.
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);

	SYSERROR("Failed to exec shell.");
	return -1;
}