1 /* SPDX-License-Identifier: LGPL-2.1+ */
8 #include <linux/unistd.h>
15 #include <sys/mount.h>
16 #include <sys/param.h>
17 #include <sys/prctl.h>
18 #include <sys/socket.h>
19 #include <sys/syscall.h>
29 #include "cgroups/cgroup.h"
30 #include "cgroups/cgroup_utils.h"
37 #include "lxcseccomp.h"
40 #include "memory_utils.h"
41 #include "mount_utils.h"
42 #include "namespace.h"
43 #include "process_utils.h"
45 #include "syscall_wrappers.h"
49 lxc_log_define(attach
, lxc
);
51 /* Define default options if no options are supplied by the user. */
52 static lxc_attach_options_t attach_static_default_options
= LXC_ATTACH_OPTIONS_DEFAULT
;
55 * The context used to attach to the container.
56 * @attach_flags : the attach flags specified in lxc_attach_options_t
57 * @init_pid : the PID of the container's init process
58 * @dfd_init_pid : file descriptor to /proc/@init_pid
59 * __Must be closed in attach_context_security_barrier()__!
60 * @dfd_self_pid : file descriptor to /proc/self
61 * __Must be closed in attach_context_security_barrier()__!
62 * @setup_ns_uid : if CLONE_NEWUSER is specified will contain the uid used
63 * during attach setup.
64 * @setup_ns_gid : if CLONE_NEWUSER is specified will contain the gid used
65 * during attach setup.
66 * @target_ns_uid : if CLONE_NEWUSER is specified the uid that the final
67 * program will be run with.
68 * @target_ns_gid : if CLONE_NEWUSER is specified the gid that the final
69 * program will be run with.
70 * @target_host_uid : if CLONE_NEWUSER is specified the uid that the final
71 * program will be run with on the host.
72 * @target_host_gid : if CLONE_NEWUSER is specified the gid that the final
73 * program will be run with on the host.
74 * @lsm_label : LSM label to be used for the attaching process
75 * @container : the container we're attaching to
76 * @personality : the personality to use for the final program
77 * @capability_mask : the capability mask of the @init_pid
78 * @ns_inherited : flags of namespaces that the final program will inherit
80 * @ns_fd : file descriptors to @init_pid's namespaces
81 * @core_sched_cookie : core scheduling cookie
83 struct attach_context
{
84 unsigned int ns_clone_flags
;
85 unsigned int attach_flags
;
94 uid_t target_host_uid
;
95 uid_t target_host_gid
;
97 struct lxc_container
*container
;
98 personality_t personality
;
99 unsigned long long capability_mask
;
101 int ns_fd
[LXC_NS_MAX
];
102 struct lsm_ops
*lsm_ops
;
103 __u64 core_sched_cookie
;
106 static pid_t
pidfd_get_pid(int dfd_init_pid
, int pidfd
)
108 __do_free
char *line
= NULL
;
109 __do_fclose
FILE *f
= NULL
;
111 char path
[STRLITERALLEN("fdinfo/") + INTTYPE_TO_STRLEN(int) + 1 ] = "fdinfo/";
114 if (dfd_init_pid
< 0 || pidfd
< 0)
115 return ret_errno(EBADF
);
117 ret
= strnprintf(path
+ STRLITERALLEN("fdinfo/"), INTTYPE_TO_STRLEN(int), "%d", pidfd
);
119 return ret_errno(EIO
);
121 f
= fdopen_at(dfd_init_pid
, path
, "re", PROTECT_OPEN
, PROTECT_LOOKUP_BENEATH
);
125 while (getline(&line
, &len
, f
) != -1) {
126 const char *prefix
= "Pid:\t";
127 const size_t prefix_len
= STRLITERALLEN("Pid:\t");
131 if (!strnequal(slider
, prefix
, prefix_len
))
134 slider
+= prefix_len
;
135 slider
= lxc_trim_whitespace_in_place(slider
);
137 ret
= lxc_safe_int(slider
, &pid
);
144 return ret_errno(ENOENT
);
/*
 * Hand @pid to lxc_write_nointr() on @fd.
 *
 * Returns true only when that call returns exactly sizeof(pid_t); any other
 * return value is reported as false.
 * NOTE(review): presumably the helper returns the number of bytes written and
 * the receiving side uses sync_wait_pid() - confirm against the helper.
 */
147 static inline bool sync_wake_pid(int fd
, pid_t pid
)
149 return lxc_write_nointr(fd
, &pid
, sizeof(pid_t
)) == sizeof(pid_t
);
/*
 * Ask lxc_read_nointr() to fill *@pid with sizeof(pid_t) bytes from @fd.
 *
 * Returns true only when that call returns exactly sizeof(pid_t); any other
 * return value is reported as false.
 * NOTE(review): presumably the helper returns the number of bytes read -
 * confirm against the helper.
 */
152 static inline bool sync_wait_pid(int fd
, pid_t
*pid
)
154 return lxc_read_nointr(fd
, pid
, sizeof(pid_t
)) == sizeof(pid_t
);
/*
 * Pass the single file descriptor @fd_send to lxc_abstract_unix_send_fds()
 * on @fd, with no additional payload (NULL buffer, size 0).
 *
 * Returns true when the helper returns a value greater than zero.
 */
157 static inline bool sync_wake_fd(int fd
, int fd_send
)
159 return lxc_abstract_unix_send_fds(fd
, &fd_send
, 1, NULL
, 0) > 0;
/*
 * Call lxc_abstract_unix_recv_one_fd() on @fd with @fd_recv as the
 * destination and no additional payload buffer (NULL, size 0).
 *
 * Returns true when the helper returns a value greater than zero.
 */
162 static inline bool sync_wait_fd(int fd
, int *fd_recv
)
164 return lxc_abstract_unix_recv_one_fd(fd
, fd_recv
, NULL
, 0) > 0;
/*
 * Report whether @options->attach_flags has at least one of LXC_ATTACH_LSM
 * or LXC_ATTACH_LSM_LABEL set. The masked value is converted to bool on
 * return, so any non-zero result yields true.
 */
167 static inline bool attach_lsm(lxc_attach_options_t
*options
)
169 return (options
->attach_flags
& (LXC_ATTACH_LSM
| LXC_ATTACH_LSM_LABEL
));
172 static struct attach_context
*alloc_attach_context(void)
174 struct attach_context
*ctx
;
176 ctx
= zalloc(sizeof(struct attach_context
));
178 return ret_set_errno(NULL
, ENOMEM
);
180 ctx
->init_pid
= -ESRCH
;
182 ctx
->dfd_self_pid
= -EBADF
;
183 ctx
->dfd_init_pid
= -EBADF
;
184 ctx
->init_pidfd
= -EBADF
;
186 ctx
->setup_ns_uid
= LXC_INVALID_UID
;
187 ctx
->setup_ns_gid
= LXC_INVALID_GID
;
188 ctx
->target_ns_uid
= LXC_INVALID_UID
;
189 ctx
->target_ns_gid
= LXC_INVALID_GID
;
190 ctx
->target_host_uid
= LXC_INVALID_UID
;
191 ctx
->target_host_gid
= LXC_INVALID_GID
;
193 ctx
->core_sched_cookie
= INVALID_SCHED_CORE_COOKIE
;
195 for (lxc_namespace_t i
= 0; i
< LXC_NS_MAX
; i
++)
196 ctx
->ns_fd
[i
] = -EBADF
;
201 static int get_personality(const char *name
, const char *lxcpath
,
202 personality_t
*personality
)
204 __do_free
char *p
= NULL
;
208 p
= lxc_cmd_get_config_item(name
, "lxc.arch", lxcpath
);
210 *personality
= LXC_ARCH_UNCHANGED
;
214 ret
= lxc_config_parse_arch(p
, &per
);
216 return syserror("Failed to parse personality");
222 static int userns_setup_ids(struct attach_context
*ctx
,
223 lxc_attach_options_t
*options
)
225 __do_free
char *line
= NULL
;
226 __do_fclose
FILE *f_gidmap
= NULL
, *f_uidmap
= NULL
;
228 uid_t init_ns_uid
= LXC_INVALID_UID
;
229 gid_t init_ns_gid
= LXC_INVALID_GID
;
230 uid_t nsuid
, hostuid
, range_uid
;
231 gid_t nsgid
, hostgid
, range_gid
;
233 if (!(options
->namespaces
& CLONE_NEWUSER
))
236 f_uidmap
= fdopen_at(ctx
->dfd_init_pid
, "uid_map", "re", PROTECT_OPEN
, PROTECT_LOOKUP_BENEATH
);
238 return syserror("Failed to open uid_map");
240 while (getline(&line
, &len
, f_uidmap
) != -1) {
241 if (sscanf(line
, "%u %u %u", &nsuid
, &hostuid
, &range_uid
) != 3)
244 if (0 >= nsuid
&& 0 < nsuid
+ range_uid
) {
245 ctx
->setup_ns_uid
= 0;
246 TRACE("Container has mapping for uid 0");
250 if (ctx
->target_host_uid
>= hostuid
&& ctx
->target_host_uid
< hostuid
+ range_uid
) {
251 init_ns_uid
= (ctx
->target_host_uid
- hostuid
) + nsuid
;
252 TRACE("Container runs with uid %d", init_ns_uid
);
256 f_gidmap
= fdopen_at(ctx
->dfd_init_pid
, "gid_map", "re", PROTECT_OPEN
, PROTECT_LOOKUP_BENEATH
);
258 return syserror("Failed to open gid_map");
260 while (getline(&line
, &len
, f_gidmap
) != -1) {
261 if (sscanf(line
, "%u %u %u", &nsgid
, &hostgid
, &range_gid
) != 3)
264 if (0 >= nsgid
&& 0 < nsgid
+ range_gid
) {
265 ctx
->setup_ns_gid
= 0;
266 TRACE("Container has mapping for gid 0");
270 if (ctx
->target_host_gid
>= hostgid
&& ctx
->target_host_gid
< hostgid
+ range_gid
) {
271 init_ns_gid
= (ctx
->target_host_gid
- hostgid
) + nsgid
;
272 TRACE("Container runs with gid %d", init_ns_gid
);
276 if (ctx
->setup_ns_uid
== LXC_INVALID_UID
)
277 ctx
->setup_ns_uid
= init_ns_uid
;
279 if (ctx
->setup_ns_gid
== LXC_INVALID_UID
)
280 ctx
->setup_ns_gid
= init_ns_gid
;
285 static void userns_target_ids(struct attach_context
*ctx
, lxc_attach_options_t
*options
)
287 if (options
->uid
!= LXC_INVALID_UID
)
288 ctx
->target_ns_uid
= options
->uid
;
289 else if (options
->namespaces
& CLONE_NEWUSER
)
290 ctx
->target_ns_uid
= ctx
->setup_ns_uid
;
292 ctx
->target_ns_uid
= 0;
294 if (ctx
->target_ns_uid
== LXC_INVALID_UID
)
295 WARN("Invalid uid specified");
297 if (options
->gid
!= LXC_INVALID_GID
)
298 ctx
->target_ns_gid
= options
->gid
;
299 else if (options
->namespaces
& CLONE_NEWUSER
)
300 ctx
->target_ns_gid
= ctx
->setup_ns_gid
;
302 ctx
->target_ns_gid
= 0;
304 if (ctx
->target_ns_gid
== LXC_INVALID_GID
)
305 WARN("Invalid gid specified");
308 static int parse_init_status(struct attach_context
*ctx
, lxc_attach_options_t
*options
)
310 __do_free
char *line
= NULL
;
311 __do_fclose
FILE *f
= NULL
;
313 bool caps_found
= false;
316 f
= fdopen_at(ctx
->dfd_init_pid
, "status", "re", PROTECT_OPEN
, PROTECT_LOOKUP_BENEATH
);
318 return syserror("Failed to open status file");
320 while (getline(&line
, &len
, f
) != -1) {
321 signed long value
= -1;
324 * Format is: real, effective, saved set user, fs we only care
327 ret
= sscanf(line
, "Uid: %ld", &value
);
328 if (ret
!= EOF
&& ret
== 1) {
329 ctx
->target_host_uid
= (uid_t
)value
;
330 TRACE("Container's init process runs with hostuid %d", ctx
->target_host_uid
);
334 ret
= sscanf(line
, "Gid: %ld", &value
);
335 if (ret
!= EOF
&& ret
== 1) {
336 ctx
->target_host_gid
= (gid_t
)value
;
337 TRACE("Container's init process runs with hostgid %d", ctx
->target_host_gid
);
341 ret
= sscanf(line
, "CapBnd: %llx", &ctx
->capability_mask
);
342 if (ret
!= EOF
&& ret
== 1) {
348 if (ctx
->target_host_uid
!= LXC_INVALID_UID
&&
349 ctx
->target_host_gid
!= LXC_INVALID_GID
&&
355 ret
= userns_setup_ids(ctx
, options
);
357 return syserror_ret(ret
, "Failed to get setup ids");
358 userns_target_ids(ctx
, options
);
363 static bool pidfd_setns_supported(struct attach_context
*ctx
)
368 * The ability to attach to time namespaces came after the introduction
369 * of using pidfds for attaching to namespaces. To avoid having to
370 * special-case both CLONE_NEWUSER and CLONE_NEWTIME handling, let's
371 * use CLONE_NEWTIME as gatekeeper.
373 if (ctx
->init_pidfd
>= 0)
374 ret
= setns(ctx
->init_pidfd
, CLONE_NEWTIME
);
377 TRACE("Attaching to namespaces via pidfds %s",
378 ret
? "unsupported" : "supported");
382 static int get_attach_context(struct attach_context
*ctx
,
383 struct lxc_container
*container
,
384 lxc_attach_options_t
*options
)
386 __do_free
char *lsm_label
= NULL
;
388 char path
[LXC_PROC_PID_LEN
];
390 ctx
->container
= container
;
391 ctx
->attach_flags
= options
->attach_flags
;
393 ctx
->dfd_self_pid
= open_at(-EBADF
, "/proc/self",
394 PROTECT_OPATH_FILE
& ~O_NOFOLLOW
,
395 (PROTECT_LOOKUP_ABSOLUTE_WITH_SYMLINKS
& ~RESOLVE_NO_XDEV
), 0);
396 if (ctx
->dfd_self_pid
< 0)
397 return syserror("Failed to open /proc/self");
399 ctx
->init_pidfd
= lxc_cmd_get_init_pidfd(container
->name
, container
->config_path
);
400 if (ctx
->init_pidfd
>= 0)
401 ctx
->init_pid
= pidfd_get_pid(ctx
->dfd_self_pid
, ctx
->init_pidfd
);
403 ctx
->init_pid
= lxc_cmd_get_init_pid(container
->name
, container
->config_path
);
404 if (ctx
->init_pid
< 0)
405 return syserror_ret(-1, "Failed to get init pid");
407 ret
= lxc_cmd_get_clone_flags(container
->name
, container
->config_path
);
409 SYSERROR("Failed to retrieve namespace flags");
410 ctx
->ns_clone_flags
= ret
;
412 ret
= core_scheduling_cookie_get(ctx
->init_pid
, &ctx
->core_sched_cookie
);
413 if (ret
|| !core_scheduling_cookie_valid(ctx
->core_sched_cookie
))
414 INFO("Container does not run in a separate core scheduling domain");
416 INFO("Container runs in separate core scheduling domain %llu",
417 (llu
)ctx
->core_sched_cookie
);
419 ret
= strnprintf(path
, sizeof(path
), "/proc/%d", ctx
->init_pid
);
421 return ret_errno(EIO
);
423 ctx
->dfd_init_pid
= open_at(-EBADF
, path
,
424 PROTECT_OPATH_DIRECTORY
,
425 (PROTECT_LOOKUP_ABSOLUTE
& ~RESOLVE_NO_XDEV
), 0);
426 if (ctx
->dfd_init_pid
< 0)
427 return syserror("Failed to open /proc/%d", ctx
->init_pid
);
429 if (ctx
->init_pidfd
>= 0) {
430 ret
= lxc_raw_pidfd_send_signal(ctx
->init_pidfd
, 0, NULL
, 0);
432 return syserror("Container process exited or PID has been recycled");
434 TRACE("Container process still running and PID was not recycled");
436 if (!pidfd_setns_supported(ctx
)) {
437 /* We can't risk leaking file descriptors during attach. */
438 if (close(ctx
->init_pidfd
))
439 return syserror("Failed to close pidfd");
441 ctx
->init_pidfd
= -EBADF
;
442 TRACE("Attaching to namespaces via pidfds not supported");
446 /* Determine which namespaces the container was created with. */
447 if (options
->namespaces
== -1) {
448 options
->namespaces
= ctx
->ns_clone_flags
;
449 if (options
->namespaces
== -1)
450 return syserror_set(-EINVAL
, "Failed to automatically determine the namespaces which the container uses");
452 for (lxc_namespace_t i
= 0; i
< LXC_NS_MAX
; i
++) {
453 if (ns_info
[i
].clone_flag
& CLONE_NEWCGROUP
)
454 if (!(options
->attach_flags
& LXC_ATTACH_MOVE_TO_CGROUP
) ||
458 if (ns_info
[i
].clone_flag
& options
->namespaces
)
461 ctx
->ns_inherited
|= ns_info
[i
].clone_flag
;
465 ret
= parse_init_status(ctx
, options
);
467 return syserror("Failed to open parse file");
469 ctx
->lsm_ops
= lsm_init_static();
471 if (attach_lsm(options
)) {
472 if (ctx
->attach_flags
& LXC_ATTACH_LSM_LABEL
)
473 lsm_label
= options
->lsm_label
;
475 lsm_label
= ctx
->lsm_ops
->process_label_get_at(ctx
->lsm_ops
, ctx
->dfd_init_pid
);
477 WARN("No security context received");
479 INFO("Retrieved security context %s", lsm_label
);
482 ret
= get_personality(container
->name
, container
->config_path
, &ctx
->personality
);
484 return syserror_ret(ret
, "Failed to get personality of the container");
486 if (!ctx
->container
->lxc_conf
) {
487 ctx
->container
->lxc_conf
= lxc_conf_init();
488 if (!ctx
->container
->lxc_conf
)
489 return syserror_set(-ENOMEM
, "Failed to allocate new lxc config");
492 ctx
->lsm_label
= move_ptr(lsm_label
);
496 static int same_nsfd(int dfd_pid1
, int dfd_pid2
, const char *ns_path
)
499 struct stat ns_st1
, ns_st2
;
501 ret
= fstatat(dfd_pid1
, ns_path
, &ns_st1
, 0);
505 ret
= fstatat(dfd_pid2
, ns_path
, &ns_st2
, 0);
509 /* processes are in the same namespace */
510 if ((ns_st1
.st_dev
== ns_st2
.st_dev
) &&
511 (ns_st1
.st_ino
== ns_st2
.st_ino
))
517 static int same_ns(int dfd_pid1
, int dfd_pid2
, const char *ns_path
)
519 __do_close
int ns_fd2
= -EBADF
;
522 ns_fd2
= open_at(dfd_pid2
, ns_path
, PROTECT_OPEN_WITH_TRAILING_SYMLINKS
,
523 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS
&
524 ~(RESOLVE_NO_XDEV
| RESOLVE_BENEATH
)), 0);
528 return syserror("Failed to open %d(%s)", dfd_pid2
, ns_path
);
531 ret
= same_nsfd(dfd_pid1
, dfd_pid2
, ns_path
);
536 return ret_errno(ENOENT
);
538 /* processes are in different namespaces */
539 return move_fd(ns_fd2
);
545 static int __prepare_namespaces_pidfd(struct attach_context
*ctx
)
547 for (lxc_namespace_t i
= 0; i
< LXC_NS_MAX
; i
++) {
550 ret
= same_nsfd(ctx
->dfd_self_pid
,
552 ns_info
[i
].proc_path
);
557 ctx
->ns_inherited
&= ~ns_info
[i
].clone_flag
;
558 TRACE("Shared %s namespace doesn't need attach", ns_info
[i
].proc_name
);
561 TRACE("Different %s namespace needs attach", ns_info
[i
].proc_name
);
565 return syserror("Failed to determine whether %s namespace is shared",
566 ns_info
[i
].proc_name
);
572 static int __prepare_namespaces_nsfd(struct attach_context
*ctx
,
573 lxc_attach_options_t
*options
)
575 for (lxc_namespace_t i
= 0; i
< LXC_NS_MAX
; i
++) {
578 if (options
->namespaces
& ns_info
[i
].clone_flag
)
579 ctx
->ns_fd
[i
] = open_at(ctx
->dfd_init_pid
,
580 ns_info
[i
].proc_path
,
581 PROTECT_OPEN_WITH_TRAILING_SYMLINKS
,
582 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS
&
583 ~(RESOLVE_NO_XDEV
| RESOLVE_BENEATH
)),
585 else if (ctx
->ns_inherited
& ns_info
[i
].clone_flag
)
586 ctx
->ns_fd
[i
] = same_ns(ctx
->dfd_self_pid
,
588 ns_info
[i
].proc_path
);
592 if (ctx
->ns_fd
[i
] >= 0)
595 if (ctx
->ns_fd
[i
] == -ENOENT
) {
596 ctx
->ns_inherited
&= ~ns_info
[i
].clone_flag
;
600 /* We failed to preserve the namespace. */
601 SYSERROR("Failed to preserve %s namespace of %d",
602 ns_info
[i
].proc_name
, ctx
->init_pid
);
604 /* Close all already opened file descriptors before we return an
605 * error, so we don't leak them.
607 for (j
= 0; j
< i
; j
++)
608 close_prot_errno_disarm(ctx
->ns_fd
[j
]);
610 return ret_errno(EINVAL
);
/*
 * Dispatch namespace preparation according to whether a pidfd is held.
 *
 * When @ctx->init_pidfd is negative the per-namespace file descriptor variant
 * __prepare_namespaces_nsfd() is used (it also needs @options); otherwise the
 * pidfd variant __prepare_namespaces_pidfd() is used. The chosen helper's
 * return value is passed through unchanged.
 */
616 static int prepare_namespaces(struct attach_context
*ctx
,
617 lxc_attach_options_t
*options
)
619 if (ctx
->init_pidfd
< 0)
620 return __prepare_namespaces_nsfd(ctx
, options
);
622 return __prepare_namespaces_pidfd(ctx
);
/*
 * Release the per-namespace file descriptors held in @ctx->ns_fd.
 *
 * Only acts when @ctx->init_pidfd is negative, in which case
 * close_prot_errno_disarm() is applied to each of the LXC_NS_MAX entries.
 * When a pidfd is held nothing is done here.
 */
625 static inline void put_namespaces(struct attach_context
*ctx
)
627 if (ctx
->init_pidfd
< 0) {
628 for (int i
= 0; i
< LXC_NS_MAX
; i
++)
629 close_prot_errno_disarm(ctx
->ns_fd
[i
]);
633 static int __attach_namespaces_pidfd(struct attach_context
*ctx
,
634 lxc_attach_options_t
*options
)
636 unsigned int ns_flags
= options
->namespaces
| ctx
->ns_inherited
;
639 /* The common case is to attach to all namespaces. */
640 ret
= setns(ctx
->init_pidfd
, ns_flags
);
642 return syserror("Failed to attach to namespaces via pidfd");
644 /* We can't risk leaking file descriptors into the container. */
645 if (close(ctx
->init_pidfd
))
646 return syserror("Failed to close pidfd");
647 ctx
->init_pidfd
= -EBADF
;
649 return log_trace(0, "Attached to container namespaces via pidfd");
652 static int __attach_namespaces_nsfd(struct attach_context
*ctx
,
653 lxc_attach_options_t
*options
)
657 for (lxc_namespace_t i
= 0; i
< LXC_NS_MAX
; i
++) {
660 if (ctx
->ns_fd
[i
] < 0)
663 ret
= setns(ctx
->ns_fd
[i
], ns_info
[i
].clone_flag
);
665 return syserror("Failed to attach to %s namespace of %d",
666 ns_info
[i
].proc_name
, ctx
->init_pid
);
668 if (close(ctx
->ns_fd
[i
])) {
670 SYSERROR("Failed to close file descriptor for %s namespace",
671 ns_info
[i
].proc_name
);
673 ctx
->ns_fd
[i
] = -EBADF
;
679 static int attach_namespaces(struct attach_context
*ctx
,
680 lxc_attach_options_t
*options
)
682 if (lxc_log_trace()) {
683 for (lxc_namespace_t i
= 0; i
< LXC_NS_MAX
; i
++) {
684 if (ns_info
[i
].clone_flag
& options
->namespaces
) {
685 TRACE("Attaching to %s namespace", ns_info
[i
].proc_name
);
688 if (ns_info
[i
].clone_flag
& ctx
->ns_inherited
) {
689 TRACE("Sharing %s namespace", ns_info
[i
].proc_name
);
692 TRACE("Inheriting %s namespace", ns_info
[i
].proc_name
);
696 if (ctx
->init_pidfd
< 0)
697 return __attach_namespaces_nsfd(ctx
, options
);
699 return __attach_namespaces_pidfd(ctx
, options
);
702 static void put_attach_context(struct attach_context
*ctx
)
705 if (!(ctx
->attach_flags
& LXC_ATTACH_LSM_LABEL
))
706 free_disarm(ctx
->lsm_label
);
707 close_prot_errno_disarm(ctx
->dfd_init_pid
);
709 if (ctx
->container
) {
710 lxc_container_put(ctx
->container
);
711 ctx
->container
= NULL
;
720 * Place anything in here that needs to be gotten rid of before we move into the
721 * container's context and fail hard if we can't.
723 static bool attach_context_security_barrier(struct attach_context
*ctx
)
726 if (close(ctx
->dfd_self_pid
))
728 ctx
->dfd_self_pid
= -EBADF
;
730 if (close(ctx
->dfd_init_pid
))
732 ctx
->dfd_init_pid
= -EBADF
;
738 int lxc_attach_remount_sys_proc(void)
742 ret
= unshare(CLONE_NEWNS
);
744 return syserror("Failed to unshare mount namespace");
746 if (detect_shared_rootfs() && mount(NULL
, "/", NULL
, MS_SLAVE
| MS_REC
, NULL
))
747 SYSERROR("Failed to recursively turn root mount tree into dependent mount. Continuing...");
749 /* Assume /proc is always mounted, so remount it. */
750 ret
= umount2("/proc", MNT_DETACH
);
752 return syserror("Failed to unmount /proc");
754 ret
= mount("none", "/proc", "proc", 0, NULL
);
756 return syserror("Failed to remount /proc");
759 * Try to umount /sys. If it's not a mount point, we'll get EINVAL, then
760 * we ignore it because it may not have been mounted in the first place.
762 ret
= umount2("/sys", MNT_DETACH
);
763 if (ret
< 0 && errno
!= EINVAL
)
764 return syserror("Failed to unmount /sys");
767 if (ret
== 0 && mount("none", "/sys", "sysfs", 0, NULL
))
768 return syserror("Failed to remount /sys");
773 static int drop_capabilities(struct attach_context
*ctx
)
778 ret
= lxc_caps_last_cap(&last_cap
);
780 return syserror_ret(ret
, "%d - Failed to drop capabilities", ret
);
782 for (__u32 cap
= 0; cap
<= last_cap
; cap
++) {
783 if (ctx
->capability_mask
& (1LL << cap
))
786 if (prctl(PR_CAPBSET_DROP
, prctl_arg(cap
), prctl_arg(0),
787 prctl_arg(0), prctl_arg(0)))
788 return syserror("Failed to drop capability %d", cap
);
790 TRACE("Dropped capability %d", cap
);
796 static int lxc_attach_set_environment(struct attach_context
*ctx
,
797 enum lxc_attach_env_policy_t policy
,
798 char **extra_env
, char **extra_keep
)
802 if (policy
== LXC_ATTACH_CLEAR_ENV
) {
804 char **extra_keep_store
= NULL
;
809 for (count
= 0; extra_keep
[count
]; count
++)
812 extra_keep_store
= zalloc(count
* sizeof(char *));
813 if (!extra_keep_store
)
816 for (i
= 0; i
< count
; i
++) {
817 char *v
= getenv(extra_keep
[i
]);
819 extra_keep_store
[i
] = strdup(v
);
820 if (!extra_keep_store
[i
]) {
822 free(extra_keep_store
[--i
]);
824 free(extra_keep_store
);
828 if (strequal(extra_keep
[i
], "PATH"))
835 if (extra_keep_store
) {
838 for (p
= extra_keep_store
; *p
; p
++)
841 free(extra_keep_store
);
844 return syserror("Failed to clear environment");
847 if (extra_keep_store
) {
850 for (i
= 0; extra_keep
[i
]; i
++) {
851 if (extra_keep_store
[i
]) {
852 ret
= setenv(extra_keep
[i
], extra_keep_store
[i
], 1);
854 SYSWARN("Failed to set environment variable");
857 free(extra_keep_store
[i
]);
860 free(extra_keep_store
);
863 /* Always set a default path; shells and execlp tend to be fine
864 * without it, but there is a disturbing number of C programs
865 * out there that just assume that getenv("PATH") is never NULL
866 * and then die a painful segfault death.
869 ret
= setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
871 SYSWARN("Failed to set environment variable");
875 ret
= putenv("container=lxc");
877 return log_warn(-1, "Failed to set environment variable");
879 /* Set container environment variables.*/
880 if (ctx
->container
->lxc_conf
) {
881 ret
= lxc_set_environment(ctx
->container
->lxc_conf
);
886 /* Set extra environment variables. */
888 for (; *extra_env
; extra_env
++) {
891 /* We just assume the user knows what they are doing, so
892 * we don't do any checks.
894 p
= strdup(*extra_env
);
900 SYSWARN("Failed to set environment variable");
907 static char *lxc_attach_getpwshell(uid_t uid
)
909 __do_free
char *line
= NULL
, *result
= NULL
;
910 __do_fclose
FILE *pipe_f
= NULL
;
915 size_t line_bufsz
= 0;
917 /* We need to fork off a process that runs the getent program, and we
918 * need to capture its output, so we use a pipe for that purpose.
920 ret
= pipe2(pipes
, O_CLOEXEC
);
933 char *arguments
[] = {
942 /* We want to capture stdout. */
943 ret
= dup2(pipes
[1], STDOUT_FILENO
);
948 /* Get rid of stdin/stderr, so we try to associate it with
954 close(STDERR_FILENO
);
956 (void)dup3(fd
, STDIN_FILENO
, O_CLOEXEC
);
957 (void)dup3(fd
, STDERR_FILENO
, O_CLOEXEC
);
961 /* Finish argument list. */
962 ret
= strnprintf(uid_buf
, sizeof(uid_buf
), "%ld", (long)uid
);
966 /* Try to run getent program. */
967 (void)execvp("getent", arguments
);
973 pipe_f
= fdopen(pipes
[0], "re");
978 /* Transfer ownership of pipes[0] to pipe_f. */
981 while (getline(&line
, &line_bufsz
, pipe_f
) != -1) {
985 char *endptr
= NULL
, *saveptr
= NULL
;
987 /* If we already found something, just continue to read
988 * until the pipe doesn't deliver any more data, but
989 * don't modify the existing data structure.
997 /* Trim line on the right hand side. */
998 for (i
= strlen(line
); i
> 0 && (line
[i
- 1] == '\n' || line
[i
- 1] == '\r'); --i
)
1001 /* Split into tokens: first: user name. */
1002 token
= strtok_r(line
, ":", &saveptr
);
1006 /* next: placeholder password field */
1007 token
= strtok_r(NULL
, ":", &saveptr
);
1012 token
= strtok_r(NULL
, ":", &saveptr
);
1013 value
= token
? strtol(token
, &endptr
, 10) : 0;
1014 if (!token
|| !endptr
|| *endptr
|| value
== LONG_MIN
||
1018 /* coherence check: user id matches */
1019 if ((uid_t
)value
!= uid
)
1022 /* skip fields: gid, gecos, dir, go to next field 'shell' */
1023 for (i
= 0; i
< 4; i
++) {
1024 token
= strtok_r(NULL
, ":", &saveptr
);
1032 free_disarm(result
);
1033 result
= strdup(token
);
1035 /* Sanity check that there are no fields after that. */
1036 token
= strtok_r(NULL
, ":", &saveptr
);
1044 ret
= wait_for_pid(pid
);
1051 return move_ptr(result
);
1054 static bool fetch_seccomp(struct lxc_container
*c
, lxc_attach_options_t
*options
)
1056 __do_free
char *path
= NULL
;
1060 if (!attach_lsm(options
)) {
1061 free_disarm(c
->lxc_conf
->seccomp
.seccomp
);
1065 /* Remove current setting. */
1066 if (!c
->set_config_item(c
, "lxc.seccomp.profile", "") &&
1067 !c
->set_config_item(c
, "lxc.seccomp", ""))
1070 /* Fetch the current profile path over the cmd interface. */
1071 path
= c
->get_running_config_item(c
, "lxc.seccomp.profile");
1073 INFO("Failed to retrieve lxc.seccomp.profile");
1075 path
= c
->get_running_config_item(c
, "lxc.seccomp");
1077 return log_info(true, "Failed to retrieve lxc.seccomp");
1080 /* Copy the value into the new lxc_conf. */
1081 bret
= c
->set_config_item(c
, "lxc.seccomp.profile", path
);
1085 /* Attempt to parse the resulting config. */
1086 ret
= lxc_read_seccomp_config(c
->lxc_conf
);
1088 return log_error(false, "Failed to retrieve seccomp policy");
1090 return log_info(true, "Retrieved seccomp policy");
1093 static bool no_new_privs(struct lxc_container
*c
, lxc_attach_options_t
*options
)
1095 __do_free
char *val
= NULL
;
1097 /* Remove current setting. */
1098 if (!c
->set_config_item(c
, "lxc.no_new_privs", ""))
1099 return log_info(false, "Failed to unset lxc.no_new_privs");
1101 /* Retrieve currently active setting. */
1102 val
= c
->get_running_config_item(c
, "lxc.no_new_privs");
1104 return log_info(false, "Failed to retrieve lxc.no_new_privs");
1106 /* Set currently active setting. */
1107 return c
->set_config_item(c
, "lxc.no_new_privs", val
);
1110 struct attach_payload
{
1112 int terminal_pts_fd
;
1113 lxc_attach_options_t
*options
;
1114 struct attach_context
*ctx
;
1115 lxc_attach_exec_t exec_function
;
1119 static void put_attach_payload(struct attach_payload
*p
)
1122 close_prot_errno_disarm(p
->ipc_socket
);
1123 close_prot_errno_disarm(p
->terminal_pts_fd
);
1124 put_attach_context(p
->ctx
);
/*
 * Final setup performed inside the attached process; never returns.
 *
 * Steps visible in this listing, in order:
 *  - join the core scheduling domain and verify the resulting cookie, when
 *    the context has a valid cookie and CLONE_NEWPID is set both in
 *    ctx->ns_clone_flags and options->namespaces;
 *  - remount /proc and /sys when LXC_ATTACH_REMOUNT_PROC_SYS is set and the
 *    mount namespace is not being attached to;
 *  - apply the requested personality, drop capabilities and set up the
 *    environment;
 *  - receive the LSM label fd over the IPC socket (before changing ids);
 *  - make stdin the controlling terminal if it is a tty;
 *  - set or drop supplementary groups and switch to the setup uid/gid when
 *    attaching to a user namespace;
 *  - apply the LSM label and PR_SET_NO_NEW_PRIVS;
 *  - dup the requested stdio fds into place, close the originals and clear
 *    FD_CLOEXEC on fds 0-2;
 *  - prepare the terminal, fix stdio ownership, load seccomp and send the
 *    notifier fd to the parent;
 *  - switch to the target uid/gid, release the payload and _exit() with the
 *    return value of the user supplied function.
 * The trailing statements log "Failed to attach to container", release the
 * payload and _exit(EXIT_FAILURE).
 *
 * NOTE(review): several lines (local declarations, error checks and jumps)
 * are not visible in this listing; confirm details against the full file.
 */
1129 __noreturn
static void do_attach(struct attach_payload
*ap
)
1131 lxc_attach_exec_t attach_function
= move_ptr(ap
->exec_function
);
1132 void *attach_function_args
= move_ptr(ap
->exec_payload
);
1134 lxc_attach_options_t
* options
= ap
->options
;
1135 struct attach_context
*ctx
= ap
->ctx
;
1136 struct lxc_conf
*conf
= ctx
->container
->lxc_conf
;
1139 * We currently artificially restrict core scheduling to be a pid
1140 * namespace concept since this makes the code easier. We can revisit
1141 * this no problem and make this work with shared pid namespaces as
1142 * well. This check here makes sure that the container was created with
1143 * a separate pid namespace (ctx->ns_clone_flags) and whether we are
1144 * actually attaching to this pid namespace (options->namespaces).
1146 if (core_scheduling_cookie_valid(ctx
->core_sched_cookie
) &&
1147 (ctx
->ns_clone_flags
& CLONE_NEWPID
) &&
1148 (options
->namespaces
& CLONE_NEWPID
)) {
1149 __u64 core_sched_cookie
;
1151 ret
= core_scheduling_cookie_share_with(1);
1153 SYSERROR("Failed to join core scheduling domain of %d",
1158 ret
= core_scheduling_cookie_get(getpid(), &core_sched_cookie
);
1159 if (ret
|| !core_scheduling_cookie_valid(core_sched_cookie
) ||
1160 (ctx
->core_sched_cookie
!= core_sched_cookie
)) {
1161 SYSERROR("Invalid core scheduling domain cookie %llu != %llu",
1162 (llu
)core_sched_cookie
,
1163 (llu
)ctx
->core_sched_cookie
);
1167 INFO("Joined core scheduling domain of %d with cookie %lld",
1168 ctx
->init_pid
, (llu
)core_sched_cookie
);
1171 /* A description of the purpose of this functionality is provided in the
1172 * lxc-attach(1) manual page. We have to remount here and not in the
1173 * parent process, otherwise /proc may not properly reflect the new pid
1176 if (!(options
->namespaces
& CLONE_NEWNS
) &&
1177 (options
->attach_flags
& LXC_ATTACH_REMOUNT_PROC_SYS
)) {
1178 ret
= lxc_attach_remount_sys_proc();
1182 TRACE("Remounted \"/proc\" and \"/sys\"");
1185 /* Now perform additional attachments. */
1186 if (options
->attach_flags
& LXC_ATTACH_SET_PERSONALITY
) {
1187 long new_personality
;
1189 if (options
->personality
== LXC_ATTACH_DETECT_PERSONALITY
)
1190 new_personality
= ctx
->personality
;
1192 new_personality
= options
->personality
;
1194 if (new_personality
!= LXC_ARCH_UNCHANGED
) {
1195 ret
= lxc_personality(new_personality
);
1199 TRACE("Set new personality");
1203 if (options
->attach_flags
& LXC_ATTACH_DROP_CAPABILITIES
) {
1204 ret
= drop_capabilities(ctx
);
1208 TRACE("Dropped capabilities");
1211 /* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
1212 * if you want this to be a no-op).
1214 ret
= lxc_attach_set_environment(ctx
,
1215 options
->env_policy
,
1216 options
->extra_env_vars
,
1217 options
->extra_keep_env
);
1221 TRACE("Set up environment");
1224 * This remark only affects fully unprivileged containers:
1225 * Receive fd for LSM security module before we set{g,u}id(). The reason
1226 * is that on set{g,u}id() the kernel will a) make us undumpable and b)
1227 * we will change our effective uid. This means our effective uid will
1228 * be different from the effective uid of the process that created us
1229 * which means that this processs no longer has capabilities in our
1230 * namespace including CAP_SYS_PTRACE. This means we will not be able to
1231 * read and /proc/<pid> files for the process anymore when /proc is
1232 * mounted with hidepid={1,2}. So let's get the lsm label fd before the
1235 if (attach_lsm(options
) && ctx
->lsm_label
) {
1236 if (!sync_wait_fd(ap
->ipc_socket
, &fd_lsm
)) {
1237 SYSERROR("Failed to receive lsm label fd");
1241 TRACE("Received LSM label file descriptor %d from parent", fd_lsm
);
1244 if (options
->stdin_fd
> 0 && isatty(options
->stdin_fd
)) {
1245 ret
= lxc_make_controlling_terminal(options
->stdin_fd
);
1250 if ((options
->attach_flags
& LXC_ATTACH_SETGROUPS
) &&
1251 options
->groups
.size
> 0) {
1252 if (!lxc_setgroups(options
->groups
.list
, options
->groups
.size
))
1255 if (!lxc_drop_groups() && errno
!= EPERM
)
1259 if (options
->namespaces
& CLONE_NEWUSER
)
1260 if (!lxc_switch_uid_gid(ctx
->setup_ns_uid
, ctx
->setup_ns_gid
))
1263 if (attach_lsm(options
) && ctx
->lsm_label
) {
1266 /* Change into our new LSM profile. */
1267 on_exec
= options
->attach_flags
& LXC_ATTACH_LSM_EXEC
? true : false;
1268 ret
= ctx
->lsm_ops
->process_label_set_at(ctx
->lsm_ops
, fd_lsm
, ctx
->lsm_label
, on_exec
);
1269 close_prot_errno_disarm(fd_lsm
);
1273 TRACE("Set %s LSM label to \"%s\"", ctx
->lsm_ops
->name
, ctx
->lsm_label
);
1276 if (conf
->no_new_privs
|| (options
->attach_flags
& LXC_ATTACH_NO_NEW_PRIVS
)) {
1277 ret
= prctl(PR_SET_NO_NEW_PRIVS
, prctl_arg(1), prctl_arg(0),
1278 prctl_arg(0), prctl_arg(0));
1282 TRACE("Set PR_SET_NO_NEW_PRIVS");
1285 /* The following is done after the communication socket is shut down.
1286 * That way, all errors that might (though unlikely) occur up until this
1287 * point will have their messages printed to the original stderr (if
1288 * logging is so configured) and not the fd the user supplied, if any.
1291 /* Fd handling for stdin, stdout and stderr; ignore errors here, user
1292 * may want to make sure the fds are closed, for example.
1294 if (options
->stdin_fd
>= 0 && options
->stdin_fd
!= STDIN_FILENO
)
1295 if (dup2(options
->stdin_fd
, STDIN_FILENO
) < 0)
1296 SYSDEBUG("Failed to replace stdin with %d", options
->stdin_fd
);
1298 if (options
->stdout_fd
>= 0 && options
->stdout_fd
!= STDOUT_FILENO
)
1299 if (dup2(options
->stdout_fd
, STDOUT_FILENO
) < 0)
1300 SYSDEBUG("Failed to replace stdout with %d", options
->stdout_fd
);
1302 if (options
->stderr_fd
>= 0 && options
->stderr_fd
!= STDERR_FILENO
)
1303 if (dup2(options
->stderr_fd
, STDERR_FILENO
) < 0)
1304 SYSDEBUG("Failed to replace stderr with %d", options
->stderr_fd
);
1306 /* close the old fds */
1307 if (options
->stdin_fd
> STDERR_FILENO
)
1308 close(options
->stdin_fd
);
1310 if (options
->stdout_fd
> STDERR_FILENO
)
1311 close(options
->stdout_fd
);
1313 if (options
->stderr_fd
> STDERR_FILENO
)
1314 close(options
->stderr_fd
);
1317 * Try to remove FD_CLOEXEC flag from stdin/stdout/stderr, but also
1318 * here, ignore errors.
1320 for (int fd
= STDIN_FILENO
; fd
<= STDERR_FILENO
; fd
++) {
1321 ret
= fd_cloexec(fd
, false);
1323 SYSERROR("Failed to clear FD_CLOEXEC from file descriptor %d", fd
);
1328 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1329 ret
= lxc_terminal_prepare_login(ap
->terminal_pts_fd
);
1331 SYSERROR("Failed to prepare terminal file descriptor %d", ap
->terminal_pts_fd
);
1335 TRACE("Prepared terminal file descriptor %d", ap
->terminal_pts_fd
);
1338 /* Avoid unnecessary syscalls. */
1339 if (ctx
->setup_ns_uid
== ctx
->target_ns_uid
)
1340 ctx
->target_ns_uid
= LXC_INVALID_UID
;
1342 if (ctx
->setup_ns_gid
== ctx
->target_ns_gid
)
1343 ctx
->target_ns_gid
= LXC_INVALID_GID
;
1346 * Make sure that the processes STDIO is correctly owned by the user
1347 * that we are switching to.
1349 ret
= fix_stdio_permissions(ctx
->target_ns_uid
);
1351 INFO("Failed to adjust stdio permissions");
1353 if (conf
->seccomp
.seccomp
) {
1354 ret
= lxc_seccomp_load(conf
);
1358 TRACE("Loaded seccomp profile");
1360 ret
= lxc_seccomp_send_notifier_fd(&conf
->seccomp
, ap
->ipc_socket
);
1363 lxc_seccomp_close_notifier_fd(&conf
->seccomp
);
1366 if (!lxc_switch_uid_gid(ctx
->target_ns_uid
, ctx
->target_ns_gid
))
1369 put_attach_payload(ap
);
1371 /* We're done, so we can now do whatever the user intended us to do. */
1372 _exit(attach_function(attach_function_args
));
1375 ERROR("Failed to attach to container");
1376 put_attach_payload(ap
);
1377 _exit(EXIT_FAILURE
);
/*
 * Initialise @terminal and create a new terminal for the container
 * identified by @name and @lxcpath using @conf.
 *
 * Returns 0 on success; on failure the error is logged and the value
 * produced by syserror() is returned.
 */
static int lxc_attach_terminal(const char *name, const char *lxcpath, struct lxc_conf *conf,
			       struct lxc_terminal *terminal)
{
	/* Start from a clean terminal state before creating anything. */
	lxc_terminal_init(terminal);

	if (lxc_terminal_create(name, lxcpath, conf, terminal) < 0)
		return syserror("Failed to create terminal");

	return 0;
}
/*
 * Open the mainloop @descr and register the handlers of @terminal with it.
 *
 * If registering the handlers fails the mainloop is closed again before the
 * error is reported. Returns 0 on success, otherwise the value produced by
 * syserror().
 */
static int lxc_attach_terminal_mainloop_init(struct lxc_terminal *terminal,
					     struct lxc_async_descr *descr)
{
	if (lxc_mainloop_open(descr) < 0)
		return syserror("Failed to create mainloop");

	if (lxc_terminal_mainloop_add(descr, terminal) >= 0)
		return 0;

	/* Do not leave a half-initialised mainloop behind. */
	lxc_mainloop_close(descr);
	return syserror("Failed to add handlers to mainloop");
}
1412 static inline void lxc_attach_terminal_close_ptx(struct lxc_terminal
*terminal
)
1414 close_prot_errno_disarm(terminal
->ptx
);
1417 static inline void lxc_attach_terminal_close_pts(struct lxc_terminal
*terminal
)
1419 close_prot_errno_disarm(terminal
->pty
);
1422 static inline void lxc_attach_terminal_close_peer(struct lxc_terminal
*terminal
)
1424 close_prot_errno_disarm(terminal
->peer
);
1427 static inline void lxc_attach_terminal_close_log(struct lxc_terminal
*terminal
)
1429 close_prot_errno_disarm(terminal
->log_fd
);
/*
 * Attach to a running container and run @exec_function(@exec_payload) in it.
 *
 * Flow visible in this listing:
 *  - take a reference on @container and set up the attach context, the
 *    seccomp policy, the no_new_privs setting (failures of the latter two
 *    only produce warnings) and the namespace file descriptors;
 *  - optionally create a terminal (LXC_ATTACH_TERMINAL) and create the IPC
 *    socket pair;
 *  - transient process: waits for ATTACH_SYNC_CGROUP, passes the security
 *    barrier, enters the namespaces, changes directory, clones the attached
 *    process with CLONE_PARENT, reports its pid over the socket and exits;
 *  - attached process: fills a struct attach_payload (and, with a terminal,
 *    resets the signal mask) before running the attach logic;
 *  - initial process: moves the transient process into the container cgroup
 *    (when LXC_ATTACH_MOVE_TO_CGROUP is set), passes the security barrier,
 *    applies /proc and resource limits, wakes the transient process,
 *    receives the attached pid, reaps the transient process, sends the LSM
 *    label fd, receives the seccomp notifier fd, stores the pid in
 *    *@attached_process and shuts the socket down;
 *  - with a terminal, runs the mainloop; finally closes the mainloop and
 *    socket, reaps any pid still recorded for cleanup and releases the
 *    terminal and the attach context.
 *
 * NOTE(review): a number of lines are not visible in this listing (local
 * declarations, the process creation call, most error checks and the final
 * return), so return values and exact error paths must be confirmed against
 * the full file.
 */
1432 int lxc_attach(struct lxc_container
*container
, lxc_attach_exec_t exec_function
,
1433 void *exec_payload
, lxc_attach_options_t
*options
,
1434 pid_t
*attached_process
)
1436 int ret_parent
= -1;
1437 struct lxc_async_descr descr
= {};
1439 char *name
, *lxcpath
;
1441 pid_t attached_pid
, pid
, to_cleanup_pid
;
1442 struct attach_context
*ctx
;
1443 struct lxc_terminal terminal
;
1444 struct lxc_conf
*conf
;
1447 return ret_errno(EINVAL
);
1449 if (!lxc_container_get(container
))
1450 return ret_errno(EINVAL
);
1452 name
= container
->name
;
1453 lxcpath
= container
->config_path
;
1456 options
= &attach_static_default_options
;
1457 options
->lsm_label
= NULL
;
1460 ctx
= alloc_attach_context();
1462 lxc_container_put(container
);
1463 return syserror_set(-ENOMEM
, "Failed to allocate attach context");
1466 ret
= get_attach_context(ctx
, container
, options
);
1468 put_attach_context(ctx
);
1469 return syserror("Failed to get attach context");
1472 conf
= ctx
->container
->lxc_conf
;
1474 if (!fetch_seccomp(ctx
->container
, options
))
1475 WARN("Failed to get seccomp policy");
1477 if (!no_new_privs(ctx
->container
, options
))
1478 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set");
1480 ret
= prepare_namespaces(ctx
, options
);
1482 put_attach_context(ctx
);
1483 return syserror("Failed to get namespace file descriptors");
1486 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1487 ret
= lxc_attach_terminal(name
, lxcpath
, conf
, &terminal
);
1489 put_attach_context(ctx
);
1490 return syserror("Failed to setup new terminal");
1493 terminal
.log_fd
= options
->log_fd
;
1495 lxc_terminal_init(&terminal
);
1498 /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
1499 * to make sure we don't irritate other threads that want to fork+exec
1502 * IMPORTANT: if the initial process is multithreaded and another call
1503 * just fork()s away without exec'ing directly after, the socket fd will
1504 * exist in the forked process from the other thread and any close() in
1505 * our own child process will not really cause the socket to close
1506 * properly, potentially causing the parent to get stuck.
1508 * For this reason, while IPC is still active, we have to use shutdown()
1509 * if the child exits prematurely in order to signal that the socket is
1510 * closed and cannot assume that the child exiting will automatically do
1513 * IPC mechanism: (X is receiver)
1514 * initial process transient process attached process
1515 * X <--- send pid of
1518 * send 0 ------------------------------------> X
1519 * [do initialization]
1520 * X <------------------------------------ send 1
1521 * [add to cgroup, ...]
1522 * send 2 ------------------------------------> X
1523 * [set LXC_ATTACH_NO_NEW_PRIVS]
1524 * X <------------------------------------ send 3
1525 * [open LSM label fd]
1526 * send 4 ------------------------------------> X
1528 * close socket close socket
1531 ret
= socketpair(PF_LOCAL
, SOCK_STREAM
| SOCK_CLOEXEC
, 0, ipc_sockets
);
1533 put_attach_context(ctx
);
1534 return syserror("Could not set up required IPC mechanism for attaching");
1537 /* Create transient process, two reasons:
1538 * 1. We can't setns() in the child itself, since we want to make
1539 * sure we are properly attached to the pidns.
1540 * 2. Also, the initial thread has to put the attached process
1541 * into the cgroup, which we can only do if we didn't already
1542 * setns() (otherwise, user namespaces will hate us).
1546 put_attach_context(ctx
);
1547 return syserror("Failed to create first subprocess");
1551 char *cwd
, *new_cwd
;
1553 /* close unneeded file descriptors */
1554 close_prot_errno_disarm(ipc_sockets
[0]);
1556 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1557 lxc_attach_terminal_close_ptx(&terminal
);
1558 lxc_attach_terminal_close_peer(&terminal
);
1559 lxc_attach_terminal_close_log(&terminal
);
1562 /* Wait for the parent to have setup cgroups. */
1563 if (!sync_wait(ipc_sockets
[1], ATTACH_SYNC_CGROUP
)) {
1564 shutdown(ipc_sockets
[1], SHUT_RDWR
);
1565 put_attach_context(ctx
);
1566 _exit(EXIT_FAILURE
);
1569 if (!attach_context_security_barrier(ctx
)) {
1570 shutdown(ipc_sockets
[1], SHUT_RDWR
);
1571 put_attach_context(ctx
);
1572 _exit(EXIT_FAILURE
);
1575 cwd
= getcwd(NULL
, 0);
1578 * Attach now, create another subprocess later, since pid
1579 * namespaces only really affect the children of the current
1582 * Note that this is a crucial barrier. We're no moving into
1583 * the container's context so we need to make sure to not leak
1584 * anything sensitive. That especially means things such as
1585 * open file descriptors!
1587 ret
= attach_namespaces(ctx
, options
);
1589 ERROR("Failed to enter namespaces");
1590 shutdown(ipc_sockets
[1], SHUT_RDWR
);
1591 put_attach_context(ctx
);
1592 _exit(EXIT_FAILURE
);
1595 /* Attach succeeded, try to cwd. */
1596 if (options
->initial_cwd
)
1597 new_cwd
= options
->initial_cwd
;
1601 ret
= chdir(new_cwd
);
1603 WARN("Could not change directory to \"%s\"", new_cwd
);
1607 /* Create attached process. */
1608 pid
= lxc_raw_clone(CLONE_PARENT
, NULL
);
1610 SYSERROR("Failed to clone attached process");
1611 shutdown(ipc_sockets
[1], SHUT_RDWR
);
1612 put_attach_context(ctx
);
1613 _exit(EXIT_FAILURE
);
1617 struct attach_payload ap
= {
1618 .ipc_socket
= ipc_sockets
[1],
1621 .terminal_pts_fd
= terminal
.pty
,
1622 .exec_function
= exec_function
,
1623 .exec_payload
= exec_payload
,
1626 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1627 ret
= lxc_terminal_signal_sigmask_safe_blocked(&terminal
);
1629 SYSERROR("Failed to reset signal mask");
1630 _exit(EXIT_FAILURE
);
1634 /* Does not return. */
1637 TRACE("Attached process %d started initializing", pid
);
1639 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
)
1640 lxc_attach_terminal_close_pts(&terminal
);
1642 /* Tell grandparent the pid of the newly created child. */
1643 if (!sync_wake_pid(ipc_sockets
[1], pid
)) {
1644 /* If this really happens here, this is very unfortunate, since
1645 * the parent will not know the pid of the attached process and
1646 * will not be able to wait for it (and we won't either due to
1647 * CLONE_PARENT) so the parent won't be able to reap it and the
1648 * attached process will remain a zombie.
1650 shutdown(ipc_sockets
[1], SHUT_RDWR
);
1651 put_attach_context(ctx
);
1652 _exit(EXIT_FAILURE
);
1655 /* The rest is in the hands of the initial and the attached process. */
1656 put_attach_context(ctx
);
1657 _exit(EXIT_SUCCESS
);
1659 TRACE("Transient process %d started initializing", pid
);
1661 to_cleanup_pid
= pid
;
1663 /* close unneeded file descriptors */
1664 close_prot_errno_disarm(ipc_sockets
[1]);
1665 put_namespaces(ctx
);
1666 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
)
1667 lxc_attach_terminal_close_pts(&terminal
);
1669 /* Attach to cgroup, if requested. */
1670 if (options
->attach_flags
& LXC_ATTACH_MOVE_TO_CGROUP
) {
1672 * If this is the unified hierarchy cgroup_attach() is
1675 ret
= cgroup_attach(conf
, name
, lxcpath
, pid
);
1677 call_cleaner(cgroup_exit
) struct cgroup_ops
*cgroup_ops
= NULL
;
1678 if (!ERRNO_IS_NOT_SUPPORTED(ret
)) {
1679 SYSERROR("Failed to attach cgroup");
1683 cgroup_ops
= cgroup_init(conf
);
1687 if (!cgroup_ops
->attach(cgroup_ops
, conf
, name
, lxcpath
, pid
))
1691 TRACE("Moved transient process %d into container cgroup", pid
);
1695 * Close sensitive file descriptors we don't need anymore. Even if
1698 if (!attach_context_security_barrier(ctx
))
1701 /* Setup /proc limits */
1702 ret
= setup_proc_filesystem(conf
, pid
);
1706 /* Setup resource limits */
1707 ret
= setup_resource_limits(conf
, pid
);
1711 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1712 ret
= lxc_attach_terminal_mainloop_init(&terminal
, &descr
);
1716 TRACE("Initialized terminal mainloop");
1719 /* Let the child process know to go ahead. */
1720 if (!sync_wake(ipc_sockets
[0], ATTACH_SYNC_CGROUP
))
1721 goto close_mainloop
;
1723 TRACE("Told transient process to start initializing");
1725 /* Get pid of attached process from transient process. */
1726 if (!sync_wait_pid(ipc_sockets
[0], &attached_pid
))
1727 goto close_mainloop
;
1729 TRACE("Received pid %d of attached process in parent pid namespace", attached_pid
);
1731 /* Ignore SIGINT (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
1732 if (options
->stdin_fd
== STDIN_FILENO
) {
1733 signal(SIGINT
, SIG_IGN
);
1734 signal(SIGQUIT
, SIG_IGN
);
1737 /* Reap transient process. */
1738 ret
= wait_for_pid(pid
);
1740 goto close_mainloop
;
1742 TRACE("Transient process %d exited", pid
);
1744 /* We will always have to reap the attached process now. */
1745 to_cleanup_pid
= attached_pid
;
1747 /* Open LSM fd and send it to child. */
1748 if (attach_lsm(options
) && ctx
->lsm_label
) {
1749 __do_close
int fd_lsm
= -EBADF
;
1752 on_exec
= options
->attach_flags
& LXC_ATTACH_LSM_EXEC
? true : false;
1753 fd_lsm
= ctx
->lsm_ops
->process_label_fd_get(ctx
->lsm_ops
, attached_pid
, on_exec
);
1755 goto close_mainloop
;
1757 TRACE("Opened LSM label file descriptor %d", fd_lsm
);
1759 /* Send child fd of the LSM security module to write to. */
1760 if (!sync_wake_fd(ipc_sockets
[0], fd_lsm
)) {
1761 SYSERROR("Failed to send lsm label fd");
1762 goto close_mainloop
;
1765 TRACE("Sent LSM label file descriptor %d to child", fd_lsm
);
1768 if (conf
->seccomp
.seccomp
) {
1769 ret
= lxc_seccomp_recv_notifier_fd(&conf
->seccomp
, ipc_sockets
[0]);
1771 goto close_mainloop
;
1773 ret
= lxc_seccomp_add_notifier(name
, lxcpath
, &conf
->seccomp
);
1775 goto close_mainloop
;
1778 /* We're done, the child process should now execute whatever it
1779 * is that the user requested. The parent can now track it with
1780 * waitpid() or similar.
1783 *attached_process
= attached_pid
;
1785 /* Now shut down communication with child, we're done. */
1786 shutdown(ipc_sockets
[0], SHUT_RDWR
);
1787 close_prot_errno_disarm(ipc_sockets
[0]);
1790 to_cleanup_pid
= -1;
1792 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1793 ret
= lxc_mainloop(&descr
, -1);
1796 to_cleanup_pid
= attached_pid
;
1801 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
)
1802 lxc_mainloop_close(&descr
);
1805 if (ipc_sockets
[0] >= 0) {
1806 shutdown(ipc_sockets
[0], SHUT_RDWR
);
1807 close_prot_errno_disarm(ipc_sockets
[0]);
1810 if (to_cleanup_pid
> 0)
1811 (void)wait_for_pid(to_cleanup_pid
);
1813 if (options
->attach_flags
& LXC_ATTACH_TERMINAL
) {
1814 lxc_terminal_delete(&terminal
);
1815 lxc_terminal_conf_free(&terminal
);
1818 put_attach_context(ctx
);
/*
 * Attach callback that executes a command.
 *
 * @payload is interpreted as an lxc_attach_command_t; its program and argv
 * members are handed to execvp(). The visible return statement is only
 * reached when the exec did not replace the process image and reports
 * "Failed to exec" through syserror_ret().
 *
 * NOTE(review): the lines between the execvp() call and the return are not
 * visible in this listing, so how the returned value is derived from the
 * failure must be confirmed against the full file.
 */
1822 int lxc_attach_run_command(void *payload
)
1825 lxc_attach_command_t
*cmd
= payload
;
1827 ret
= execvp(cmd
->program
, cmd
->argv
);
1839 return syserror_ret(ret
, "Failed to exec \"%s\"", cmd
->program
);
1842 int lxc_attach_run_shell(void* payload
)
1844 __do_free
char *buf
= NULL
;
1846 struct passwd pwent
;
1847 struct passwd
*pwentp
= NULL
;
1852 /* Ignore payload parameter. */
1857 bufsize
= sysconf(_SC_GETPW_R_SIZE_MAX
);
1861 buf
= malloc(bufsize
);
1863 ret
= getpwuid_r(uid
, &pwent
, buf
, bufsize
, &pwentp
);
1866 WARN("Could not find matched password record");
1868 WARN("Failed to get password record - %u", uid
);
1872 /* This probably happens because of incompatible nss implementations in
1873 * host and container (remember, this code is still using the host's
1874 * glibc but our mount namespace is in the container) we may try to get
1875 * the information by spawning a [getent passwd uid] process and parsing
1879 user_shell
= lxc_attach_getpwshell(uid
);
1881 user_shell
= pwent
.pw_shell
;
1884 execlp(user_shell
, user_shell
, (char *)NULL
);
1886 /* Executed if either no passwd entry or execvp fails, we will fall back
1887 * on /bin/sh as a default shell.
1889 execlp("/bin/sh", "/bin/sh", (char *)NULL
);
1891 SYSERROR("Failed to execute shell");