git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/attach.c
conf: handle kernels without or not using SMT
[mirror_lxc.git] / src / lxc / attach.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include "config.h"
4
5 #include <errno.h>
6 #include <fcntl.h>
7 #include <grp.h>
8 #include <linux/unistd.h>
9 #include <pwd.h>
10 #include <pthread.h>
11 #include <signal.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <sys/mount.h>
16 #include <sys/param.h>
17 #include <sys/prctl.h>
18 #include <sys/socket.h>
19 #include <sys/syscall.h>
20 #include <sys/wait.h>
21 #include <termios.h>
22 #include <unistd.h>
23
24 #include "attach.h"
25
26 #include "af_unix.h"
27 #include "attach.h"
28 #include "caps.h"
29 #include "cgroups/cgroup.h"
30 #include "cgroups/cgroup_utils.h"
31 #include "commands.h"
32 #include "conf.h"
33 #include "confile.h"
34 #include "log.h"
35 #include "lsm/lsm.h"
36 #include "lxclock.h"
37 #include "lxcseccomp.h"
38 #include "macro.h"
39 #include "mainloop.h"
40 #include "memory_utils.h"
41 #include "mount_utils.h"
42 #include "namespace.h"
43 #include "process_utils.h"
44 #include "sync.h"
45 #include "syscall_wrappers.h"
46 #include "terminal.h"
47 #include "utils.h"
48
49 lxc_log_define(attach, lxc);
50
51 /* Define default options if no options are supplied by the user. */
52 static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
53
54 /*
55 * The context used to attach to the container.
56 * @attach_flags : the attach flags specified in lxc_attach_options_t
57 * @init_pid : the PID of the container's init process
58 * @dfd_init_pid : file descriptor to /proc/@init_pid
59 * __Must be closed in attach_context_security_barrier()__!
60 * @dfd_self_pid : file descriptor to /proc/self
61 * __Must be closed in attach_context_security_barrier()__!
62 * @setup_ns_uid : if CLONE_NEWUSER is specified will contain the uid used
63 * during attach setup.
64 * @setup_ns_gid : if CLONE_NEWUSER is specified will contain the gid used
65 * during attach setup.
66 * @target_ns_uid : if CLONE_NEWUSER is specified the uid that the final
67 * program will be run with.
68 * @target_ns_gid : if CLONE_NEWUSER is specified the gid that the final
69 * program will be run with.
70 * @target_host_uid : if CLONE_NEWUSER is specified the uid that the final
71 * program will be run with on the host.
72 * @target_host_gid : if CLONE_NEWUSER is specified the gid that the final
73 * program will be run with on the host.
74 * @lsm_label : LSM label to be used for the attaching process
75 * @container : the container we're attaching o
76 * @personality : the personality to use for the final program
77 * @capability : the capability mask of the @init_pid
78 * @ns_inherited : flags of namespaces that the final program will inherit
79 * from @init_pid
80 * @ns_fd : file descriptors to @init_pid's namespaces
81 * @core_sched_cookie : core scheduling cookie
82 */
83 struct attach_context {
84 unsigned int ns_clone_flags;
85 unsigned int attach_flags;
86 int init_pid;
87 int init_pidfd;
88 int dfd_init_pid;
89 int dfd_self_pid;
90 uid_t setup_ns_uid;
91 gid_t setup_ns_gid;
92 uid_t target_ns_uid;
93 gid_t target_ns_gid;
94 uid_t target_host_uid;
95 uid_t target_host_gid;
96 char *lsm_label;
97 struct lxc_container *container;
98 personality_t personality;
99 unsigned long long capability_mask;
100 int ns_inherited;
101 int ns_fd[LXC_NS_MAX];
102 struct lsm_ops *lsm_ops;
103 __u64 core_sched_cookie;
104 };
105
106 static pid_t pidfd_get_pid(int dfd_init_pid, int pidfd)
107 {
108 __do_free char *line = NULL;
109 __do_fclose FILE *f = NULL;
110 size_t len = 0;
111 char path[STRLITERALLEN("fdinfo/") + INTTYPE_TO_STRLEN(int) + 1 ] = "fdinfo/";
112 int ret;
113
114 if (dfd_init_pid < 0 || pidfd < 0)
115 return ret_errno(EBADF);
116
117 ret = strnprintf(path + STRLITERALLEN("fdinfo/"), INTTYPE_TO_STRLEN(int), "%d", pidfd);
118 if (ret < 0)
119 return ret_errno(EIO);
120
121 f = fdopen_at(dfd_init_pid, path, "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
122 if (!f)
123 return -errno;
124
125 while (getline(&line, &len, f) != -1) {
126 const char *prefix = "Pid:\t";
127 const size_t prefix_len = STRLITERALLEN("Pid:\t");
128 int pid = -ESRCH;
129 char *slider = line;
130
131 if (!strnequal(slider, prefix, prefix_len))
132 continue;
133
134 slider += prefix_len;
135 slider = lxc_trim_whitespace_in_place(slider);
136
137 ret = lxc_safe_int(slider, &pid);
138 if (ret)
139 return -ret;
140
141 return pid;
142 }
143
144 return ret_errno(ENOENT);
145 }
146
/* Send @pid over @fd; succeeds only if the complete pid_t was written. */
static inline bool sync_wake_pid(int fd, pid_t pid)
{
	if (lxc_write_nointr(fd, &pid, sizeof(pid_t)) != sizeof(pid_t))
		return false;

	return true;
}
151
/* Receive a pid_t from @fd into @pid; succeeds only on a complete read. */
static inline bool sync_wait_pid(int fd, pid_t *pid)
{
	if (lxc_read_nointr(fd, pid, sizeof(pid_t)) != sizeof(pid_t))
		return false;

	return true;
}
156
/* Pass the single file descriptor @fd_send to the peer connected on @fd. */
static inline bool sync_wake_fd(int fd, int fd_send)
{
	if (lxc_abstract_unix_send_fds(fd, &fd_send, 1, NULL, 0) > 0)
		return true;

	return false;
}
161
/* Receive a single file descriptor from the peer on @fd into @fd_recv. */
static inline bool sync_wait_fd(int fd, int *fd_recv)
{
	if (lxc_abstract_unix_recv_one_fd(fd, fd_recv, NULL, 0) > 0)
		return true;

	return false;
}
166
167 static inline bool attach_lsm(lxc_attach_options_t *options)
168 {
169 return (options->attach_flags & (LXC_ATTACH_LSM | LXC_ATTACH_LSM_LABEL));
170 }
171
172 static struct attach_context *alloc_attach_context(void)
173 {
174 struct attach_context *ctx;
175
176 ctx = zalloc(sizeof(struct attach_context));
177 if (!ctx)
178 return ret_set_errno(NULL, ENOMEM);
179
180 ctx->init_pid = -ESRCH;
181
182 ctx->dfd_self_pid = -EBADF;
183 ctx->dfd_init_pid = -EBADF;
184 ctx->init_pidfd = -EBADF;
185
186 ctx->setup_ns_uid = LXC_INVALID_UID;
187 ctx->setup_ns_gid = LXC_INVALID_GID;
188 ctx->target_ns_uid = LXC_INVALID_UID;
189 ctx->target_ns_gid = LXC_INVALID_GID;
190 ctx->target_host_uid = LXC_INVALID_UID;
191 ctx->target_host_gid = LXC_INVALID_GID;
192
193 ctx->core_sched_cookie = INVALID_SCHED_CORE_COOKIE;
194
195 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++)
196 ctx->ns_fd[i] = -EBADF;
197
198 return ctx;
199 }
200
201 static int get_personality(const char *name, const char *lxcpath,
202 personality_t *personality)
203 {
204 __do_free char *p = NULL;
205 int ret;
206 signed long per;
207
208 p = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);
209 if (!p) {
210 *personality = LXC_ARCH_UNCHANGED;
211 return 0;
212 }
213
214 ret = lxc_config_parse_arch(p, &per);
215 if (ret < 0)
216 return syserror("Failed to parse personality");
217
218 *personality = per;
219 return 0;
220 }
221
222 static int userns_setup_ids(struct attach_context *ctx,
223 lxc_attach_options_t *options)
224 {
225 __do_free char *line = NULL;
226 __do_fclose FILE *f_gidmap = NULL, *f_uidmap = NULL;
227 size_t len = 0;
228 uid_t init_ns_uid = LXC_INVALID_UID;
229 gid_t init_ns_gid = LXC_INVALID_GID;
230 uid_t nsuid, hostuid, range_uid;
231 gid_t nsgid, hostgid, range_gid;
232
233 if (!(options->namespaces & CLONE_NEWUSER))
234 return 0;
235
236 f_uidmap = fdopen_at(ctx->dfd_init_pid, "uid_map", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
237 if (!f_uidmap)
238 return syserror("Failed to open uid_map");
239
240 while (getline(&line, &len, f_uidmap) != -1) {
241 if (sscanf(line, "%u %u %u", &nsuid, &hostuid, &range_uid) != 3)
242 continue;
243
244 if (0 >= nsuid && 0 < nsuid + range_uid) {
245 ctx->setup_ns_uid = 0;
246 TRACE("Container has mapping for uid 0");
247 break;
248 }
249
250 if (ctx->target_host_uid >= hostuid && ctx->target_host_uid < hostuid + range_uid) {
251 init_ns_uid = (ctx->target_host_uid - hostuid) + nsuid;
252 TRACE("Container runs with uid %d", init_ns_uid);
253 }
254 }
255
256 f_gidmap = fdopen_at(ctx->dfd_init_pid, "gid_map", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
257 if (!f_gidmap)
258 return syserror("Failed to open gid_map");
259
260 while (getline(&line, &len, f_gidmap) != -1) {
261 if (sscanf(line, "%u %u %u", &nsgid, &hostgid, &range_gid) != 3)
262 continue;
263
264 if (0 >= nsgid && 0 < nsgid + range_gid) {
265 ctx->setup_ns_gid = 0;
266 TRACE("Container has mapping for gid 0");
267 break;
268 }
269
270 if (ctx->target_host_gid >= hostgid && ctx->target_host_gid < hostgid + range_gid) {
271 init_ns_gid = (ctx->target_host_gid - hostgid) + nsgid;
272 TRACE("Container runs with gid %d", init_ns_gid);
273 }
274 }
275
276 if (ctx->setup_ns_uid == LXC_INVALID_UID)
277 ctx->setup_ns_uid = init_ns_uid;
278
279 if (ctx->setup_ns_gid == LXC_INVALID_UID)
280 ctx->setup_ns_gid = init_ns_gid;
281
282 return 0;
283 }
284
285 static void userns_target_ids(struct attach_context *ctx, lxc_attach_options_t *options)
286 {
287 if (options->uid != LXC_INVALID_UID)
288 ctx->target_ns_uid = options->uid;
289 else if (options->namespaces & CLONE_NEWUSER)
290 ctx->target_ns_uid = ctx->setup_ns_uid;
291 else
292 ctx->target_ns_uid = 0;
293
294 if (ctx->target_ns_uid == LXC_INVALID_UID)
295 WARN("Invalid uid specified");
296
297 if (options->gid != LXC_INVALID_GID)
298 ctx->target_ns_gid = options->gid;
299 else if (options->namespaces & CLONE_NEWUSER)
300 ctx->target_ns_gid = ctx->setup_ns_gid;
301 else
302 ctx->target_ns_gid = 0;
303
304 if (ctx->target_ns_gid == LXC_INVALID_GID)
305 WARN("Invalid gid specified");
306 }
307
308 static int parse_init_status(struct attach_context *ctx, lxc_attach_options_t *options)
309 {
310 __do_free char *line = NULL;
311 __do_fclose FILE *f = NULL;
312 size_t len = 0;
313 bool caps_found = false;
314 int ret;
315
316 f = fdopen_at(ctx->dfd_init_pid, "status", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
317 if (!f)
318 return syserror("Failed to open status file");
319
320 while (getline(&line, &len, f) != -1) {
321 signed long value = -1;
322
323 /*
324 * Format is: real, effective, saved set user, fs we only care
325 * about real uid.
326 */
327 ret = sscanf(line, "Uid: %ld", &value);
328 if (ret != EOF && ret == 1) {
329 ctx->target_host_uid = (uid_t)value;
330 TRACE("Container's init process runs with hostuid %d", ctx->target_host_uid);
331 goto next;
332 }
333
334 ret = sscanf(line, "Gid: %ld", &value);
335 if (ret != EOF && ret == 1) {
336 ctx->target_host_gid = (gid_t)value;
337 TRACE("Container's init process runs with hostgid %d", ctx->target_host_gid);
338 goto next;
339 }
340
341 ret = sscanf(line, "CapBnd: %llx", &ctx->capability_mask);
342 if (ret != EOF && ret == 1) {
343 caps_found = true;
344 goto next;
345 }
346
347 next:
348 if (ctx->target_host_uid != LXC_INVALID_UID &&
349 ctx->target_host_gid != LXC_INVALID_GID &&
350 caps_found)
351 break;
352
353 }
354
355 ret = userns_setup_ids(ctx, options);
356 if (ret)
357 return syserror_ret(ret, "Failed to get setup ids");
358 userns_target_ids(ctx, options);
359
360 return 0;
361 }
362
363 static bool pidfd_setns_supported(struct attach_context *ctx)
364 {
365 int ret;
366
367 /*
368 * The ability to attach to time namespaces came after the introduction
369 * of of using pidfds for attaching to namespaces. To avoid having to
370 * special-case both CLONE_NEWUSER and CLONE_NEWTIME handling, let's
371 * use CLONE_NEWTIME as gatekeeper.
372 */
373 if (ctx->init_pidfd >= 0)
374 ret = setns(ctx->init_pidfd, CLONE_NEWTIME);
375 else
376 ret = -EOPNOTSUPP;
377 TRACE("Attaching to namespaces via pidfds %s",
378 ret ? "unsupported" : "supported");
379 return ret == 0;
380 }
381
382 static int get_attach_context(struct attach_context *ctx,
383 struct lxc_container *container,
384 lxc_attach_options_t *options)
385 {
386 __do_free char *lsm_label = NULL;
387 int ret;
388 char path[LXC_PROC_PID_LEN];
389
390 ctx->container = container;
391 ctx->attach_flags = options->attach_flags;
392
393 ctx->dfd_self_pid = open_at(-EBADF, "/proc/self",
394 PROTECT_OPATH_FILE & ~O_NOFOLLOW,
395 (PROTECT_LOOKUP_ABSOLUTE_WITH_SYMLINKS & ~RESOLVE_NO_XDEV), 0);
396 if (ctx->dfd_self_pid < 0)
397 return syserror("Failed to open /proc/self");
398
399 ctx->init_pidfd = lxc_cmd_get_init_pidfd(container->name, container->config_path);
400 if (ctx->init_pidfd >= 0)
401 ctx->init_pid = pidfd_get_pid(ctx->dfd_self_pid, ctx->init_pidfd);
402 else
403 ctx->init_pid = lxc_cmd_get_init_pid(container->name, container->config_path);
404 if (ctx->init_pid < 0)
405 return syserror_ret(-1, "Failed to get init pid");
406
407 ret = lxc_cmd_get_clone_flags(container->name, container->config_path);
408 if (ret < 0)
409 SYSERROR("Failed to retrieve namespace flags");
410 ctx->ns_clone_flags = ret;
411
412 ret = core_scheduling_cookie_get(ctx->init_pid, &ctx->core_sched_cookie);
413 if (ret || !core_scheduling_cookie_valid(ctx->core_sched_cookie))
414 INFO("Container does not run in a separate core scheduling domain");
415 else
416 INFO("Container runs in separate core scheduling domain %llu",
417 (llu)ctx->core_sched_cookie);
418
419 ret = strnprintf(path, sizeof(path), "/proc/%d", ctx->init_pid);
420 if (ret < 0)
421 return ret_errno(EIO);
422
423 ctx->dfd_init_pid = open_at(-EBADF, path,
424 PROTECT_OPATH_DIRECTORY,
425 (PROTECT_LOOKUP_ABSOLUTE & ~RESOLVE_NO_XDEV), 0);
426 if (ctx->dfd_init_pid < 0)
427 return syserror("Failed to open /proc/%d", ctx->init_pid);
428
429 if (ctx->init_pidfd >= 0) {
430 ret = lxc_raw_pidfd_send_signal(ctx->init_pidfd, 0, NULL, 0);
431 if (ret)
432 return syserror("Container process exited or PID has been recycled");
433 else
434 TRACE("Container process still running and PID was not recycled");
435
436 if (!pidfd_setns_supported(ctx)) {
437 /* We can't risk leaking file descriptors during attach. */
438 if (close(ctx->init_pidfd))
439 return syserror("Failed to close pidfd");
440
441 ctx->init_pidfd = -EBADF;
442 TRACE("Attaching to namespaces via pidfds not supported");
443 }
444 }
445
446 /* Determine which namespaces the container was created with. */
447 if (options->namespaces == -1) {
448 options->namespaces = ctx->ns_clone_flags;
449 if (options->namespaces == -1)
450 return syserror_set(-EINVAL, "Failed to automatically determine the namespaces which the container uses");
451
452 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
453 if (ns_info[i].clone_flag & CLONE_NEWCGROUP)
454 if (!(options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) ||
455 !cgns_supported())
456 continue;
457
458 if (ns_info[i].clone_flag & options->namespaces)
459 continue;
460
461 ctx->ns_inherited |= ns_info[i].clone_flag;
462 }
463 }
464
465 ret = parse_init_status(ctx, options);
466 if (ret)
467 return syserror("Failed to open parse file");
468
469 ctx->lsm_ops = lsm_init_static();
470
471 if (attach_lsm(options)) {
472 if (ctx->attach_flags & LXC_ATTACH_LSM_LABEL)
473 lsm_label = options->lsm_label;
474 else
475 lsm_label = ctx->lsm_ops->process_label_get_at(ctx->lsm_ops, ctx->dfd_init_pid);
476 if (!lsm_label)
477 WARN("No security context received");
478 else
479 INFO("Retrieved security context %s", lsm_label);
480 }
481
482 ret = get_personality(container->name, container->config_path, &ctx->personality);
483 if (ret)
484 return syserror_ret(ret, "Failed to get personality of the container");
485
486 if (!ctx->container->lxc_conf) {
487 ctx->container->lxc_conf = lxc_conf_init();
488 if (!ctx->container->lxc_conf)
489 return syserror_set(-ENOMEM, "Failed to allocate new lxc config");
490 }
491
492 ctx->lsm_label = move_ptr(lsm_label);
493 return 0;
494 }
495
/*
 * Compare the object @ns_path resolves to below two directory descriptors.
 *
 * Returns 1 if both refer to the same device and inode (the processes share
 * the namespace), 0 if they differ, and -errno if either lookup fails.
 */
static int same_nsfd(int dfd_pid1, int dfd_pid2, const char *ns_path)
{
	struct stat first, second;

	if (fstatat(dfd_pid1, ns_path, &first, 0))
		return -errno;

	if (fstatat(dfd_pid2, ns_path, &second, 0))
		return -errno;

	/* Identity is the (device, inode) pair. */
	if (first.st_dev != second.st_dev)
		return 0;

	return first.st_ino == second.st_ino ? 1 : 0;
}
516
517 static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path)
518 {
519 __do_close int ns_fd2 = -EBADF;
520 int ret = -1;
521
522 ns_fd2 = open_at(dfd_pid2, ns_path, PROTECT_OPEN_WITH_TRAILING_SYMLINKS,
523 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
524 ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)), 0);
525 if (ns_fd2 < 0) {
526 if (errno == ENOENT)
527 return -ENOENT;
528 return syserror("Failed to open %d(%s)", dfd_pid2, ns_path);
529 }
530
531 ret = same_nsfd(dfd_pid1, dfd_pid2, ns_path);
532 switch (ret) {
533 case -ENOENT:
534 __fallthrough;
535 case 1:
536 return ret_errno(ENOENT);
537 case 0:
538 /* processes are in different namespaces */
539 return move_fd(ns_fd2);
540 }
541
542 return ret;
543 }
544
545 static int __prepare_namespaces_pidfd(struct attach_context *ctx)
546 {
547 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
548 int ret;
549
550 ret = same_nsfd(ctx->dfd_self_pid,
551 ctx->dfd_init_pid,
552 ns_info[i].proc_path);
553 switch (ret) {
554 case -ENOENT:
555 __fallthrough;
556 case 1:
557 ctx->ns_inherited &= ~ns_info[i].clone_flag;
558 TRACE("Shared %s namespace doesn't need attach", ns_info[i].proc_name);
559 continue;
560 case 0:
561 TRACE("Different %s namespace needs attach", ns_info[i].proc_name);
562 continue;
563 }
564
565 return syserror("Failed to determine whether %s namespace is shared",
566 ns_info[i].proc_name);
567 }
568
569 return 0;
570 }
571
572 static int __prepare_namespaces_nsfd(struct attach_context *ctx,
573 lxc_attach_options_t *options)
574 {
575 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
576 lxc_namespace_t j;
577
578 if (options->namespaces & ns_info[i].clone_flag)
579 ctx->ns_fd[i] = open_at(ctx->dfd_init_pid,
580 ns_info[i].proc_path,
581 PROTECT_OPEN_WITH_TRAILING_SYMLINKS,
582 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
583 ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)),
584 0);
585 else if (ctx->ns_inherited & ns_info[i].clone_flag)
586 ctx->ns_fd[i] = same_ns(ctx->dfd_self_pid,
587 ctx->dfd_init_pid,
588 ns_info[i].proc_path);
589 else
590 continue;
591
592 if (ctx->ns_fd[i] >= 0)
593 continue;
594
595 if (ctx->ns_fd[i] == -ENOENT) {
596 ctx->ns_inherited &= ~ns_info[i].clone_flag;
597 continue;
598 }
599
600 /* We failed to preserve the namespace. */
601 SYSERROR("Failed to preserve %s namespace of %d",
602 ns_info[i].proc_name, ctx->init_pid);
603
604 /* Close all already opened file descriptors before we return an
605 * error, so we don't leak them.
606 */
607 for (j = 0; j < i; j++)
608 close_prot_errno_disarm(ctx->ns_fd[j]);
609
610 return ret_errno(EINVAL);
611 }
612
613 return 0;
614 }
615
616 static int prepare_namespaces(struct attach_context *ctx,
617 lxc_attach_options_t *options)
618 {
619 if (ctx->init_pidfd < 0)
620 return __prepare_namespaces_nsfd(ctx, options);
621
622 return __prepare_namespaces_pidfd(ctx);
623 }
624
625 static inline void put_namespaces(struct attach_context *ctx)
626 {
627 if (ctx->init_pidfd < 0) {
628 for (int i = 0; i < LXC_NS_MAX; i++)
629 close_prot_errno_disarm(ctx->ns_fd[i]);
630 }
631 }
632
633 static int __attach_namespaces_pidfd(struct attach_context *ctx,
634 lxc_attach_options_t *options)
635 {
636 unsigned int ns_flags = options->namespaces | ctx->ns_inherited;
637 int ret;
638
639 /* The common case is to attach to all namespaces. */
640 ret = setns(ctx->init_pidfd, ns_flags);
641 if (ret)
642 return syserror("Failed to attach to namespaces via pidfd");
643
644 /* We can't risk leaking file descriptors into the container. */
645 if (close(ctx->init_pidfd))
646 return syserror("Failed to close pidfd");
647 ctx->init_pidfd = -EBADF;
648
649 return log_trace(0, "Attached to container namespaces via pidfd");
650 }
651
652 static int __attach_namespaces_nsfd(struct attach_context *ctx,
653 lxc_attach_options_t *options)
654 {
655 int fret = 0;
656
657 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
658 int ret;
659
660 if (ctx->ns_fd[i] < 0)
661 continue;
662
663 ret = setns(ctx->ns_fd[i], ns_info[i].clone_flag);
664 if (ret)
665 return syserror("Failed to attach to %s namespace of %d",
666 ns_info[i].proc_name, ctx->init_pid);
667
668 if (close(ctx->ns_fd[i])) {
669 fret = -errno;
670 SYSERROR("Failed to close file descriptor for %s namespace",
671 ns_info[i].proc_name);
672 }
673 ctx->ns_fd[i] = -EBADF;
674 }
675
676 return fret;
677 }
678
679 static int attach_namespaces(struct attach_context *ctx,
680 lxc_attach_options_t *options)
681 {
682 if (lxc_log_trace()) {
683 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
684 if (ns_info[i].clone_flag & options->namespaces) {
685 TRACE("Attaching to %s namespace", ns_info[i].proc_name);
686 continue;
687 }
688 if (ns_info[i].clone_flag & ctx->ns_inherited) {
689 TRACE("Sharing %s namespace", ns_info[i].proc_name);
690 continue;
691 }
692 TRACE("Inheriting %s namespace", ns_info[i].proc_name);
693 }
694 }
695
696 if (ctx->init_pidfd < 0)
697 return __attach_namespaces_nsfd(ctx, options);
698
699 return __attach_namespaces_pidfd(ctx, options);
700 }
701
702 static void put_attach_context(struct attach_context *ctx)
703 {
704 if (ctx) {
705 if (!(ctx->attach_flags & LXC_ATTACH_LSM_LABEL))
706 free_disarm(ctx->lsm_label);
707 close_prot_errno_disarm(ctx->dfd_init_pid);
708
709 if (ctx->container) {
710 lxc_container_put(ctx->container);
711 ctx->container = NULL;
712 }
713
714 put_namespaces(ctx);
715 free(ctx);
716 }
717 }
718
719 /*
720 * Place anything in here that needs to be get rid of before we move into the
721 * container's context and fail hard if we can't.
722 */
723 static bool attach_context_security_barrier(struct attach_context *ctx)
724 {
725 if (ctx) {
726 if (close(ctx->dfd_self_pid))
727 return false;
728 ctx->dfd_self_pid = -EBADF;
729
730 if (close(ctx->dfd_init_pid))
731 return false;
732 ctx->dfd_init_pid = -EBADF;
733 }
734
735 return true;
736 }
737
/*
 * Give the calling process a private mount namespace with freshly mounted
 * /proc and, if it was mounted before, /sys.
 *
 * Returns 0 on success and a negative value on failure. Failing to turn a
 * shared root mount tree into a dependent one is only logged.
 */
int lxc_attach_remount_sys_proc(void)
{
	int ret;

	if (unshare(CLONE_NEWNS) < 0)
		return syserror("Failed to unshare mount namespace");

	if (detect_shared_rootfs() && mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL))
		SYSERROR("Failed to recursively turn root mount tree into dependent mount. Continuing...");

	/* Assume /proc is always mounted, so remount it. */
	if (umount2("/proc", MNT_DETACH) < 0)
		return syserror("Failed to unmount /proc");

	if (mount("none", "/proc", "proc", 0, NULL) < 0)
		return syserror("Failed to remount /proc");

	/*
	 * /sys may not be a mount point at all, in which case unmounting it
	 * yields EINVAL. That is not an error; there is simply nothing to
	 * remount.
	 */
	ret = umount2("/sys", MNT_DETACH);
	if (ret < 0) {
		if (errno != EINVAL)
			return syserror("Failed to unmount /sys");
	} else if (ret == 0) {
		/* It was mounted before, so mount a fresh instance. */
		if (mount("none", "/sys", "sysfs", 0, NULL))
			return syserror("Failed to remount /sys");
	}

	return 0;
}
772
773 static int drop_capabilities(struct attach_context *ctx)
774 {
775 int ret;
776 __u32 last_cap;
777
778 ret = lxc_caps_last_cap(&last_cap);
779 if (ret)
780 return syserror_ret(ret, "%d - Failed to drop capabilities", ret);
781
782 for (__u32 cap = 0; cap <= last_cap; cap++) {
783 if (ctx->capability_mask & (1LL << cap))
784 continue;
785
786 if (prctl(PR_CAPBSET_DROP, prctl_arg(cap), prctl_arg(0),
787 prctl_arg(0), prctl_arg(0)))
788 return syserror("Failed to drop capability %d", cap);
789
790 TRACE("Dropped capability %d", cap);
791 }
792
793 return 0;
794 }
795
796 static int lxc_attach_set_environment(struct attach_context *ctx,
797 enum lxc_attach_env_policy_t policy,
798 char **extra_env, char **extra_keep)
799 {
800 int ret;
801
802 if (policy == LXC_ATTACH_CLEAR_ENV) {
803 int path_kept = 0;
804 char **extra_keep_store = NULL;
805
806 if (extra_keep) {
807 size_t count, i;
808
809 for (count = 0; extra_keep[count]; count++)
810 ;
811
812 extra_keep_store = zalloc(count * sizeof(char *));
813 if (!extra_keep_store)
814 return -1;
815
816 for (i = 0; i < count; i++) {
817 char *v = getenv(extra_keep[i]);
818 if (v) {
819 extra_keep_store[i] = strdup(v);
820 if (!extra_keep_store[i]) {
821 while (i > 0)
822 free(extra_keep_store[--i]);
823
824 free(extra_keep_store);
825 return -1;
826 }
827
828 if (strequal(extra_keep[i], "PATH"))
829 path_kept = 1;
830 }
831 }
832 }
833
834 if (clearenv()) {
835 if (extra_keep_store) {
836 char **p;
837
838 for (p = extra_keep_store; *p; p++)
839 free(*p);
840
841 free(extra_keep_store);
842 }
843
844 return syserror("Failed to clear environment");
845 }
846
847 if (extra_keep_store) {
848 size_t i;
849
850 for (i = 0; extra_keep[i]; i++) {
851 if (extra_keep_store[i]) {
852 ret = setenv(extra_keep[i], extra_keep_store[i], 1);
853 if (ret < 0)
854 SYSWARN("Failed to set environment variable");
855 }
856
857 free(extra_keep_store[i]);
858 }
859
860 free(extra_keep_store);
861 }
862
863 /* Always set a default path; shells and execlp tend to be fine
864 * without it, but there is a disturbing number of C programs
865 * out there that just assume that getenv("PATH") is never NULL
866 * and then die a painful segfault death.
867 */
868 if (!path_kept) {
869 ret = setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
870 if (ret < 0)
871 SYSWARN("Failed to set environment variable");
872 }
873 }
874
875 ret = putenv("container=lxc");
876 if (ret < 0)
877 return log_warn(-1, "Failed to set environment variable");
878
879 /* Set container environment variables.*/
880 if (ctx->container->lxc_conf) {
881 ret = lxc_set_environment(ctx->container->lxc_conf);
882 if (ret < 0)
883 return -1;
884 }
885
886 /* Set extra environment variables. */
887 if (extra_env) {
888 for (; *extra_env; extra_env++) {
889 char *p;
890
891 /* We just assume the user knows what they are doing, so
892 * we don't do any checks.
893 */
894 p = strdup(*extra_env);
895 if (!p)
896 return -1;
897
898 ret = putenv(p);
899 if (ret < 0)
900 SYSWARN("Failed to set environment variable");
901 }
902 }
903
904 return 0;
905 }
906
907 static char *lxc_attach_getpwshell(uid_t uid)
908 {
909 __do_free char *line = NULL, *result = NULL;
910 __do_fclose FILE *pipe_f = NULL;
911 int fd, ret;
912 pid_t pid;
913 int pipes[2];
914 bool found = false;
915 size_t line_bufsz = 0;
916
917 /* We need to fork off a process that runs the getent program, and we
918 * need to capture its output, so we use a pipe for that purpose.
919 */
920 ret = pipe2(pipes, O_CLOEXEC);
921 if (ret < 0)
922 return NULL;
923
924 pid = fork();
925 if (pid < 0) {
926 close(pipes[0]);
927 close(pipes[1]);
928 return NULL;
929 }
930
931 if (!pid) {
932 char uid_buf[32];
933 char *arguments[] = {
934 "getent",
935 "passwd",
936 uid_buf,
937 NULL
938 };
939
940 close(pipes[0]);
941
942 /* We want to capture stdout. */
943 ret = dup2(pipes[1], STDOUT_FILENO);
944 close(pipes[1]);
945 if (ret < 0)
946 _exit(EXIT_FAILURE);
947
948 /* Get rid of stdin/stderr, so we try to associate it with
949 * /dev/null.
950 */
951 fd = open_devnull();
952 if (fd < 0) {
953 close(STDIN_FILENO);
954 close(STDERR_FILENO);
955 } else {
956 (void)dup3(fd, STDIN_FILENO, O_CLOEXEC);
957 (void)dup3(fd, STDERR_FILENO, O_CLOEXEC);
958 close(fd);
959 }
960
961 /* Finish argument list. */
962 ret = strnprintf(uid_buf, sizeof(uid_buf), "%ld", (long)uid);
963 if (ret <= 0)
964 _exit(EXIT_FAILURE);
965
966 /* Try to run getent program. */
967 (void)execvp("getent", arguments);
968 _exit(EXIT_FAILURE);
969 }
970
971 close(pipes[1]);
972
973 pipe_f = fdopen(pipes[0], "re");
974 if (!pipe_f) {
975 close(pipes[0]);
976 goto reap_child;
977 }
978 /* Transfer ownership of pipes[0] to pipe_f. */
979 move_fd(pipes[0]);
980
981 while (getline(&line, &line_bufsz, pipe_f) != -1) {
982 int i;
983 long value;
984 char *token;
985 char *endptr = NULL, *saveptr = NULL;
986
987 /* If we already found something, just continue to read
988 * until the pipe doesn't deliver any more data, but
989 * don't modify the existing data structure.
990 */
991 if (found)
992 continue;
993
994 if (!line)
995 continue;
996
997 /* Trim line on the right hand side. */
998 for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
999 line[i - 1] = '\0';
1000
1001 /* Split into tokens: first: user name. */
1002 token = strtok_r(line, ":", &saveptr);
1003 if (!token)
1004 continue;
1005
1006 /* next: placeholder password field */
1007 token = strtok_r(NULL, ":", &saveptr);
1008 if (!token)
1009 continue;
1010
1011 /* next: user id */
1012 token = strtok_r(NULL, ":", &saveptr);
1013 value = token ? strtol(token, &endptr, 10) : 0;
1014 if (!token || !endptr || *endptr || value == LONG_MIN ||
1015 value == LONG_MAX)
1016 continue;
1017
1018 /* placeholder conherence check: user id matches */
1019 if ((uid_t)value != uid)
1020 continue;
1021
1022 /* skip fields: gid, gecos, dir, go to next field 'shell' */
1023 for (i = 0; i < 4; i++) {
1024 token = strtok_r(NULL, ":", &saveptr);
1025 if (!token)
1026 continue;
1027 }
1028
1029 if (!token)
1030 continue;
1031
1032 free_disarm(result);
1033 result = strdup(token);
1034
1035 /* Sanity check that there are no fields after that. */
1036 token = strtok_r(NULL, ":", &saveptr);
1037 if (token)
1038 continue;
1039
1040 found = true;
1041 }
1042
1043 reap_child:
1044 ret = wait_for_pid(pid);
1045 if (ret < 0)
1046 return NULL;
1047
1048 if (!found)
1049 return NULL;
1050
1051 return move_ptr(result);
1052 }
1053
1054 static bool fetch_seccomp(struct lxc_container *c, lxc_attach_options_t *options)
1055 {
1056 __do_free char *path = NULL;
1057 int ret;
1058 bool bret;
1059
1060 if (!attach_lsm(options)) {
1061 free_disarm(c->lxc_conf->seccomp.seccomp);
1062 return true;
1063 }
1064
1065 /* Remove current setting. */
1066 if (!c->set_config_item(c, "lxc.seccomp.profile", "") &&
1067 !c->set_config_item(c, "lxc.seccomp", ""))
1068 return false;
1069
1070 /* Fetch the current profile path over the cmd interface. */
1071 path = c->get_running_config_item(c, "lxc.seccomp.profile");
1072 if (!path) {
1073 INFO("Failed to retrieve lxc.seccomp.profile");
1074
1075 path = c->get_running_config_item(c, "lxc.seccomp");
1076 if (!path)
1077 return log_info(true, "Failed to retrieve lxc.seccomp");
1078 }
1079
1080 /* Copy the value into the new lxc_conf. */
1081 bret = c->set_config_item(c, "lxc.seccomp.profile", path);
1082 if (!bret)
1083 return false;
1084
1085 /* Attempt to parse the resulting config. */
1086 ret = lxc_read_seccomp_config(c->lxc_conf);
1087 if (ret < 0)
1088 return log_error(false, "Failed to retrieve seccomp policy");
1089
1090 return log_info(true, "Retrieved seccomp policy");
1091 }
1092
1093 static bool no_new_privs(struct lxc_container *c, lxc_attach_options_t *options)
1094 {
1095 __do_free char *val = NULL;
1096
1097 /* Remove current setting. */
1098 if (!c->set_config_item(c, "lxc.no_new_privs", ""))
1099 return log_info(false, "Failed to unset lxc.no_new_privs");
1100
1101 /* Retrieve currently active setting. */
1102 val = c->get_running_config_item(c, "lxc.no_new_privs");
1103 if (!val)
1104 return log_info(false, "Failed to retrieve lxc.no_new_privs");
1105
1106 /* Set currently active setting. */
1107 return c->set_config_item(c, "lxc.no_new_privs", val);
1108 }
1109
1110 struct attach_payload {
1111 int ipc_socket;
1112 int terminal_pts_fd;
1113 lxc_attach_options_t *options;
1114 struct attach_context *ctx;
1115 lxc_attach_exec_t exec_function;
1116 void *exec_payload;
1117 };
1118
1119 static void put_attach_payload(struct attach_payload *p)
1120 {
1121 if (p) {
1122 close_prot_errno_disarm(p->ipc_socket);
1123 close_prot_errno_disarm(p->terminal_pts_fd);
1124 put_attach_context(p->ctx);
1125 p->ctx = NULL;
1126 }
1127 }
1128
1129 __noreturn static void do_attach(struct attach_payload *ap)
1130 {
1131 lxc_attach_exec_t attach_function = move_ptr(ap->exec_function);
1132 void *attach_function_args = move_ptr(ap->exec_payload);
1133 int fd_lsm, ret;
1134 lxc_attach_options_t* options = ap->options;
1135 struct attach_context *ctx = ap->ctx;
1136 struct lxc_conf *conf = ctx->container->lxc_conf;
1137
1138 /*
1139 * We currently artificially restrict core scheduling to be a pid
1140 * namespace concept since this makes the code easier. We can revisit
1141 * this no problem and make this work with shared pid namespaces as
1142 * well. This check here makes sure that the container was created with
1143 * a separate pid namespace (ctx->ns_clone_flags) and whether we are
1144 * actually attaching to this pid namespace (options->namespaces).
1145 */
1146 if (core_scheduling_cookie_valid(ctx->core_sched_cookie) &&
1147 (ctx->ns_clone_flags & CLONE_NEWPID) &&
1148 (options->namespaces & CLONE_NEWPID)) {
1149 __u64 core_sched_cookie;
1150
1151 ret = core_scheduling_cookie_share_with(1);
1152 if (ret < 0) {
1153 SYSERROR("Failed to join core scheduling domain of %d",
1154 ctx->init_pid);
1155 goto on_error;
1156 }
1157
1158 ret = core_scheduling_cookie_get(getpid(), &core_sched_cookie);
1159 if (ret || !core_scheduling_cookie_valid(core_sched_cookie) ||
1160 (ctx->core_sched_cookie != core_sched_cookie)) {
1161 SYSERROR("Invalid core scheduling domain cookie %llu != %llu",
1162 (llu)core_sched_cookie,
1163 (llu)ctx->core_sched_cookie);
1164 goto on_error;
1165 }
1166
1167 INFO("Joined core scheduling domain of %d with cookie %lld",
1168 ctx->init_pid, (llu)core_sched_cookie);
1169 }
1170
1171 /* A description of the purpose of this functionality is provided in the
1172 * lxc-attach(1) manual page. We have to remount here and not in the
1173 * parent process, otherwise /proc may not properly reflect the new pid
1174 * namespace.
1175 */
1176 if (!(options->namespaces & CLONE_NEWNS) &&
1177 (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
1178 ret = lxc_attach_remount_sys_proc();
1179 if (ret < 0)
1180 goto on_error;
1181
1182 TRACE("Remounted \"/proc\" and \"/sys\"");
1183 }
1184
1185 /* Now perform additional attachments. */
1186 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
1187 long new_personality;
1188
1189 if (options->personality == LXC_ATTACH_DETECT_PERSONALITY)
1190 new_personality = ctx->personality;
1191 else
1192 new_personality = options->personality;
1193
1194 if (new_personality != LXC_ARCH_UNCHANGED) {
1195 ret = lxc_personality(new_personality);
1196 if (ret < 0)
1197 goto on_error;
1198
1199 TRACE("Set new personality");
1200 }
1201 }
1202
1203 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
1204 ret = drop_capabilities(ctx);
1205 if (ret < 0)
1206 goto on_error;
1207
1208 TRACE("Dropped capabilities");
1209 }
1210
1211 /* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
1212 * if you want this to be a no-op).
1213 */
1214 ret = lxc_attach_set_environment(ctx,
1215 options->env_policy,
1216 options->extra_env_vars,
1217 options->extra_keep_env);
1218 if (ret < 0)
1219 goto on_error;
1220
1221 TRACE("Set up environment");
1222
1223 /*
1224 * This remark only affects fully unprivileged containers:
1225 * Receive fd for LSM security module before we set{g,u}id(). The reason
1226 * is that on set{g,u}id() the kernel will a) make us undumpable and b)
1227 * we will change our effective uid. This means our effective uid will
1228 * be different from the effective uid of the process that created us
1229 * which means that this processs no longer has capabilities in our
1230 * namespace including CAP_SYS_PTRACE. This means we will not be able to
1231 * read and /proc/<pid> files for the process anymore when /proc is
1232 * mounted with hidepid={1,2}. So let's get the lsm label fd before the
1233 * set{g,u}id().
1234 */
1235 if (attach_lsm(options) && ctx->lsm_label) {
1236 if (!sync_wait_fd(ap->ipc_socket, &fd_lsm)) {
1237 SYSERROR("Failed to receive lsm label fd");
1238 goto on_error;
1239 }
1240
1241 TRACE("Received LSM label file descriptor %d from parent", fd_lsm);
1242 }
1243
1244 if (options->stdin_fd > 0 && isatty(options->stdin_fd)) {
1245 ret = lxc_make_controlling_terminal(options->stdin_fd);
1246 if (ret < 0)
1247 goto on_error;
1248 }
1249
1250 if ((options->attach_flags & LXC_ATTACH_SETGROUPS) &&
1251 options->groups.size > 0) {
1252 if (!lxc_setgroups(options->groups.list, options->groups.size))
1253 goto on_error;
1254 } else {
1255 if (!lxc_drop_groups() && errno != EPERM)
1256 goto on_error;
1257 }
1258
1259 if (options->namespaces & CLONE_NEWUSER)
1260 if (!lxc_switch_uid_gid(ctx->setup_ns_uid, ctx->setup_ns_gid))
1261 goto on_error;
1262
1263 if (attach_lsm(options) && ctx->lsm_label) {
1264 bool on_exec;
1265
1266 /* Change into our new LSM profile. */
1267 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
1268 ret = ctx->lsm_ops->process_label_set_at(ctx->lsm_ops, fd_lsm, ctx->lsm_label, on_exec);
1269 close_prot_errno_disarm(fd_lsm);
1270 if (ret < 0)
1271 goto on_error;
1272
1273 TRACE("Set %s LSM label to \"%s\"", ctx->lsm_ops->name, ctx->lsm_label);
1274 }
1275
1276 if (conf->no_new_privs || (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
1277 ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
1278 prctl_arg(0), prctl_arg(0));
1279 if (ret < 0)
1280 goto on_error;
1281
1282 TRACE("Set PR_SET_NO_NEW_PRIVS");
1283 }
1284
1285 /* The following is done after the communication socket is shut down.
1286 * That way, all errors that might (though unlikely) occur up until this
1287 * point will have their messages printed to the original stderr (if
1288 * logging is so configured) and not the fd the user supplied, if any.
1289 */
1290
1291 /* Fd handling for stdin, stdout and stderr; ignore errors here, user
1292 * may want to make sure the fds are closed, for example.
1293 */
1294 if (options->stdin_fd >= 0 && options->stdin_fd != STDIN_FILENO)
1295 if (dup2(options->stdin_fd, STDIN_FILENO) < 0)
1296 SYSDEBUG("Failed to replace stdin with %d", options->stdin_fd);
1297
1298 if (options->stdout_fd >= 0 && options->stdout_fd != STDOUT_FILENO)
1299 if (dup2(options->stdout_fd, STDOUT_FILENO) < 0)
1300 SYSDEBUG("Failed to replace stdout with %d", options->stdout_fd);
1301
1302 if (options->stderr_fd >= 0 && options->stderr_fd != STDERR_FILENO)
1303 if (dup2(options->stderr_fd, STDERR_FILENO) < 0)
1304 SYSDEBUG("Failed to replace stderr with %d", options->stderr_fd);
1305
1306 /* close the old fds */
1307 if (options->stdin_fd > STDERR_FILENO)
1308 close(options->stdin_fd);
1309
1310 if (options->stdout_fd > STDERR_FILENO)
1311 close(options->stdout_fd);
1312
1313 if (options->stderr_fd > STDERR_FILENO)
1314 close(options->stderr_fd);
1315
1316 /*
1317 * Try to remove FD_CLOEXEC flag from stdin/stdout/stderr, but also
1318 * here, ignore errors.
1319 */
1320 for (int fd = STDIN_FILENO; fd <= STDERR_FILENO; fd++) {
1321 ret = fd_cloexec(fd, false);
1322 if (ret < 0) {
1323 SYSERROR("Failed to clear FD_CLOEXEC from file descriptor %d", fd);
1324 goto on_error;
1325 }
1326 }
1327
1328 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1329 ret = lxc_terminal_prepare_login(ap->terminal_pts_fd);
1330 if (ret < 0) {
1331 SYSERROR("Failed to prepare terminal file descriptor %d", ap->terminal_pts_fd);
1332 goto on_error;
1333 }
1334
1335 TRACE("Prepared terminal file descriptor %d", ap->terminal_pts_fd);
1336 }
1337
1338 /* Avoid unnecessary syscalls. */
1339 if (ctx->setup_ns_uid == ctx->target_ns_uid)
1340 ctx->target_ns_uid = LXC_INVALID_UID;
1341
1342 if (ctx->setup_ns_gid == ctx->target_ns_gid)
1343 ctx->target_ns_gid = LXC_INVALID_GID;
1344
1345 /*
1346 * Make sure that the processes STDIO is correctly owned by the user
1347 * that we are switching to.
1348 */
1349 ret = fix_stdio_permissions(ctx->target_ns_uid);
1350 if (ret)
1351 INFO("Failed to adjust stdio permissions");
1352
1353 if (conf->seccomp.seccomp) {
1354 ret = lxc_seccomp_load(conf);
1355 if (ret < 0)
1356 goto on_error;
1357
1358 TRACE("Loaded seccomp profile");
1359
1360 ret = lxc_seccomp_send_notifier_fd(&conf->seccomp, ap->ipc_socket);
1361 if (ret < 0)
1362 goto on_error;
1363 lxc_seccomp_close_notifier_fd(&conf->seccomp);
1364 }
1365
1366 if (!lxc_switch_uid_gid(ctx->target_ns_uid, ctx->target_ns_gid))
1367 goto on_error;
1368
1369 put_attach_payload(ap);
1370
1371 /* We're done, so we can now do whatever the user intended us to do. */
1372 _exit(attach_function(attach_function_args));
1373
1374 on_error:
1375 ERROR("Failed to attach to container");
1376 put_attach_payload(ap);
1377 _exit(EXIT_FAILURE);
1378 }
1379
/*
 * Initialize @terminal and create a new terminal for the attach session.
 *
 * Returns 0 on success, otherwise the value produced by syserror() after
 * lxc_terminal_create() failed.
 */
static int lxc_attach_terminal(const char *name, const char *lxcpath, struct lxc_conf *conf,
			       struct lxc_terminal *terminal)
{
	lxc_terminal_init(terminal);

	if (lxc_terminal_create(name, lxcpath, conf, terminal) < 0)
		return syserror("Failed to create terminal");

	return 0;
}
1393
/*
 * Open the mainloop @descr and register the handlers of @terminal with it.
 *
 * If registering the handlers fails the mainloop is closed again, so the
 * caller only has to close @descr after a successful return.
 *
 * Returns 0 on success, otherwise the value produced by syserror().
 */
static int lxc_attach_terminal_mainloop_init(struct lxc_terminal *terminal,
					     struct lxc_async_descr *descr)
{
	if (lxc_mainloop_open(descr) < 0)
		return syserror("Failed to create mainloop");

	if (lxc_terminal_mainloop_add(descr, terminal) >= 0)
		return 0;

	/* Undo lxc_mainloop_open() before reporting the failure. */
	lxc_mainloop_close(descr);
	return syserror("Failed to add handlers to mainloop");
}
1411
1412 static inline void lxc_attach_terminal_close_ptx(struct lxc_terminal *terminal)
1413 {
1414 close_prot_errno_disarm(terminal->ptx);
1415 }
1416
1417 static inline void lxc_attach_terminal_close_pts(struct lxc_terminal *terminal)
1418 {
1419 close_prot_errno_disarm(terminal->pty);
1420 }
1421
1422 static inline void lxc_attach_terminal_close_peer(struct lxc_terminal *terminal)
1423 {
1424 close_prot_errno_disarm(terminal->peer);
1425 }
1426
1427 static inline void lxc_attach_terminal_close_log(struct lxc_terminal *terminal)
1428 {
1429 close_prot_errno_disarm(terminal->log_fd);
1430 }
1431
1432 int lxc_attach(struct lxc_container *container, lxc_attach_exec_t exec_function,
1433 void *exec_payload, lxc_attach_options_t *options,
1434 pid_t *attached_process)
1435 {
1436 int ret_parent = -1;
1437 struct lxc_async_descr descr = {};
1438 int ret;
1439 char *name, *lxcpath;
1440 int ipc_sockets[2];
1441 pid_t attached_pid, pid, to_cleanup_pid;
1442 struct attach_context *ctx;
1443 struct lxc_terminal terminal;
1444 struct lxc_conf *conf;
1445
1446 if (!container)
1447 return ret_errno(EINVAL);
1448
1449 if (!lxc_container_get(container))
1450 return ret_errno(EINVAL);
1451
1452 name = container->name;
1453 lxcpath = container->config_path;
1454
1455 if (!options) {
1456 options = &attach_static_default_options;
1457 options->lsm_label = NULL;
1458 }
1459
1460 ctx = alloc_attach_context();
1461 if (!ctx) {
1462 lxc_container_put(container);
1463 return syserror_set(-ENOMEM, "Failed to allocate attach context");
1464 }
1465
1466 ret = get_attach_context(ctx, container, options);
1467 if (ret) {
1468 put_attach_context(ctx);
1469 return syserror("Failed to get attach context");
1470 }
1471
1472 conf = ctx->container->lxc_conf;
1473
1474 if (!fetch_seccomp(ctx->container, options))
1475 WARN("Failed to get seccomp policy");
1476
1477 if (!no_new_privs(ctx->container, options))
1478 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set");
1479
1480 ret = prepare_namespaces(ctx, options);
1481 if (ret) {
1482 put_attach_context(ctx);
1483 return syserror("Failed to get namespace file descriptors");
1484 }
1485
1486 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1487 ret = lxc_attach_terminal(name, lxcpath, conf, &terminal);
1488 if (ret < 0) {
1489 put_attach_context(ctx);
1490 return syserror("Failed to setup new terminal");
1491 }
1492
1493 terminal.log_fd = options->log_fd;
1494 } else {
1495 lxc_terminal_init(&terminal);
1496 }
1497
1498 /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
1499 * to make sure we don't irritate other threads that want to fork+exec
1500 * away
1501 *
1502 * IMPORTANT: if the initial process is multithreaded and another call
1503 * just fork()s away without exec'ing directly after, the socket fd will
1504 * exist in the forked process from the other thread and any close() in
1505 * our own child process will not really cause the socket to close
1506 * properly, potentially causing the parent to get stuck.
1507 *
1508 * For this reason, while IPC is still active, we have to use shutdown()
1509 * if the child exits prematurely in order to signal that the socket is
1510 * closed and cannot assume that the child exiting will automatically do
1511 * that.
1512 *
1513 * IPC mechanism: (X is receiver)
1514 * initial process transient process attached process
1515 * X <--- send pid of
1516 * attached proc,
1517 * then exit
1518 * send 0 ------------------------------------> X
1519 * [do initialization]
1520 * X <------------------------------------ send 1
1521 * [add to cgroup, ...]
1522 * send 2 ------------------------------------> X
1523 * [set LXC_ATTACH_NO_NEW_PRIVS]
1524 * X <------------------------------------ send 3
1525 * [open LSM label fd]
1526 * send 4 ------------------------------------> X
1527 * [set LSM label]
1528 * close socket close socket
1529 * run program
1530 */
1531 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
1532 if (ret < 0) {
1533 put_attach_context(ctx);
1534 return syserror("Could not set up required IPC mechanism for attaching");
1535 }
1536
1537 /* Create transient process, two reasons:
1538 * 1. We can't setns() in the child itself, since we want to make
1539 * sure we are properly attached to the pidns.
1540 * 2. Also, the initial thread has to put the attached process
1541 * into the cgroup, which we can only do if we didn't already
1542 * setns() (otherwise, user namespaces will hate us).
1543 */
1544 pid = fork();
1545 if (pid < 0) {
1546 put_attach_context(ctx);
1547 return syserror("Failed to create first subprocess");
1548 }
1549
1550 if (pid == 0) {
1551 char *cwd, *new_cwd;
1552
1553 /* close unneeded file descriptors */
1554 close_prot_errno_disarm(ipc_sockets[0]);
1555
1556 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1557 lxc_attach_terminal_close_ptx(&terminal);
1558 lxc_attach_terminal_close_peer(&terminal);
1559 lxc_attach_terminal_close_log(&terminal);
1560 }
1561
1562 /* Wait for the parent to have setup cgroups. */
1563 if (!sync_wait(ipc_sockets[1], ATTACH_SYNC_CGROUP)) {
1564 shutdown(ipc_sockets[1], SHUT_RDWR);
1565 put_attach_context(ctx);
1566 _exit(EXIT_FAILURE);
1567 }
1568
1569 if (!attach_context_security_barrier(ctx)) {
1570 shutdown(ipc_sockets[1], SHUT_RDWR);
1571 put_attach_context(ctx);
1572 _exit(EXIT_FAILURE);
1573 }
1574
1575 cwd = getcwd(NULL, 0);
1576
1577 /*
1578 * Attach now, create another subprocess later, since pid
1579 * namespaces only really affect the children of the current
1580 * process.
1581 *
1582 * Note that this is a crucial barrier. We're no moving into
1583 * the container's context so we need to make sure to not leak
1584 * anything sensitive. That especially means things such as
1585 * open file descriptors!
1586 */
1587 ret = attach_namespaces(ctx, options);
1588 if (ret < 0) {
1589 ERROR("Failed to enter namespaces");
1590 shutdown(ipc_sockets[1], SHUT_RDWR);
1591 put_attach_context(ctx);
1592 _exit(EXIT_FAILURE);
1593 }
1594
1595 /* Attach succeeded, try to cwd. */
1596 if (options->initial_cwd)
1597 new_cwd = options->initial_cwd;
1598 else
1599 new_cwd = cwd;
1600 if (new_cwd) {
1601 ret = chdir(new_cwd);
1602 if (ret < 0)
1603 WARN("Could not change directory to \"%s\"", new_cwd);
1604 }
1605 free_disarm(cwd);
1606
1607 /* Create attached process. */
1608 pid = lxc_raw_clone(CLONE_PARENT, NULL);
1609 if (pid < 0) {
1610 SYSERROR("Failed to clone attached process");
1611 shutdown(ipc_sockets[1], SHUT_RDWR);
1612 put_attach_context(ctx);
1613 _exit(EXIT_FAILURE);
1614 }
1615
1616 if (pid == 0) {
1617 struct attach_payload ap = {
1618 .ipc_socket = ipc_sockets[1],
1619 .options = options,
1620 .ctx = ctx,
1621 .terminal_pts_fd = terminal.pty,
1622 .exec_function = exec_function,
1623 .exec_payload = exec_payload,
1624 };
1625
1626 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1627 ret = lxc_terminal_signal_sigmask_safe_blocked(&terminal);
1628 if (ret < 0) {
1629 SYSERROR("Failed to reset signal mask");
1630 _exit(EXIT_FAILURE);
1631 }
1632 }
1633
1634 /* Does not return. */
1635 do_attach(&ap);
1636 }
1637 TRACE("Attached process %d started initializing", pid);
1638
1639 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1640 lxc_attach_terminal_close_pts(&terminal);
1641
1642 /* Tell grandparent the pid of the pid of the newly created child. */
1643 if (!sync_wake_pid(ipc_sockets[1], pid)) {
1644 /* If this really happens here, this is very unfortunate, since
1645 * the parent will not know the pid of the attached process and
1646 * will not be able to wait for it (and we won't either due to
1647 * CLONE_PARENT) so the parent won't be able to reap it and the
1648 * attached process will remain a zombie.
1649 */
1650 shutdown(ipc_sockets[1], SHUT_RDWR);
1651 put_attach_context(ctx);
1652 _exit(EXIT_FAILURE);
1653 }
1654
1655 /* The rest is in the hands of the initial and the attached process. */
1656 put_attach_context(ctx);
1657 _exit(EXIT_SUCCESS);
1658 }
1659 TRACE("Transient process %d started initializing", pid);
1660
1661 to_cleanup_pid = pid;
1662
1663 /* close unneeded file descriptors */
1664 close_prot_errno_disarm(ipc_sockets[1]);
1665 put_namespaces(ctx);
1666 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1667 lxc_attach_terminal_close_pts(&terminal);
1668
1669 /* Attach to cgroup, if requested. */
1670 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
1671 /*
1672 * If this is the unified hierarchy cgroup_attach() is
1673 * enough.
1674 */
1675 ret = cgroup_attach(conf, name, lxcpath, pid);
1676 if (ret) {
1677 call_cleaner(cgroup_exit) struct cgroup_ops *cgroup_ops = NULL;
1678 if (!ERRNO_IS_NOT_SUPPORTED(ret)) {
1679 SYSERROR("Failed to attach cgroup");
1680 goto on_error;
1681 }
1682
1683 cgroup_ops = cgroup_init(conf);
1684 if (!cgroup_ops)
1685 goto on_error;
1686
1687 if (!cgroup_ops->attach(cgroup_ops, conf, name, lxcpath, pid))
1688 goto on_error;
1689 }
1690
1691 TRACE("Moved transient process %d into container cgroup", pid);
1692 }
1693
1694 /*
1695 * Close sensitive file descriptors we don't need anymore. Even if
1696 * we're the parent.
1697 */
1698 if (!attach_context_security_barrier(ctx))
1699 goto on_error;
1700
1701 /* Setup /proc limits */
1702 ret = setup_proc_filesystem(conf, pid);
1703 if (ret < 0)
1704 goto on_error;
1705
1706 /* Setup resource limits */
1707 ret = setup_resource_limits(conf, pid);
1708 if (ret < 0)
1709 goto on_error;
1710
1711 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1712 ret = lxc_attach_terminal_mainloop_init(&terminal, &descr);
1713 if (ret < 0)
1714 goto on_error;
1715
1716 TRACE("Initialized terminal mainloop");
1717 }
1718
1719 /* Let the child process know to go ahead. */
1720 if (!sync_wake(ipc_sockets[0], ATTACH_SYNC_CGROUP))
1721 goto close_mainloop;
1722
1723 TRACE("Told transient process to start initializing");
1724
1725 /* Get pid of attached process from transient process. */
1726 if (!sync_wait_pid(ipc_sockets[0], &attached_pid))
1727 goto close_mainloop;
1728
1729 TRACE("Received pid %d of attached process in parent pid namespace", attached_pid);
1730
1731 /* Ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
1732 if (options->stdin_fd == STDIN_FILENO) {
1733 signal(SIGINT, SIG_IGN);
1734 signal(SIGQUIT, SIG_IGN);
1735 }
1736
1737 /* Reap transient process. */
1738 ret = wait_for_pid(pid);
1739 if (ret < 0)
1740 goto close_mainloop;
1741
1742 TRACE("Transient process %d exited", pid);
1743
1744 /* We will always have to reap the attached process now. */
1745 to_cleanup_pid = attached_pid;
1746
1747 /* Open LSM fd and send it to child. */
1748 if (attach_lsm(options) && ctx->lsm_label) {
1749 __do_close int fd_lsm = -EBADF;
1750 bool on_exec;
1751
1752 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
1753 fd_lsm = ctx->lsm_ops->process_label_fd_get(ctx->lsm_ops, attached_pid, on_exec);
1754 if (fd_lsm < 0)
1755 goto close_mainloop;
1756
1757 TRACE("Opened LSM label file descriptor %d", fd_lsm);
1758
1759 /* Send child fd of the LSM security module to write to. */
1760 if (!sync_wake_fd(ipc_sockets[0], fd_lsm)) {
1761 SYSERROR("Failed to send lsm label fd");
1762 goto close_mainloop;
1763 }
1764
1765 TRACE("Sent LSM label file descriptor %d to child", fd_lsm);
1766 }
1767
1768 if (conf->seccomp.seccomp) {
1769 ret = lxc_seccomp_recv_notifier_fd(&conf->seccomp, ipc_sockets[0]);
1770 if (ret < 0)
1771 goto close_mainloop;
1772
1773 ret = lxc_seccomp_add_notifier(name, lxcpath, &conf->seccomp);
1774 if (ret < 0)
1775 goto close_mainloop;
1776 }
1777
1778 /* We're done, the child process should now execute whatever it
1779 * is that the user requested. The parent can now track it with
1780 * waitpid() or similar.
1781 */
1782
1783 *attached_process = attached_pid;
1784
1785 /* Now shut down communication with child, we're done. */
1786 shutdown(ipc_sockets[0], SHUT_RDWR);
1787 close_prot_errno_disarm(ipc_sockets[0]);
1788
1789 ret_parent = 0;
1790 to_cleanup_pid = -1;
1791
1792 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1793 ret = lxc_mainloop(&descr, -1);
1794 if (ret < 0) {
1795 ret_parent = -1;
1796 to_cleanup_pid = attached_pid;
1797 }
1798 }
1799
1800 close_mainloop:
1801 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1802 lxc_mainloop_close(&descr);
1803
1804 on_error:
1805 if (ipc_sockets[0] >= 0) {
1806 shutdown(ipc_sockets[0], SHUT_RDWR);
1807 close_prot_errno_disarm(ipc_sockets[0]);
1808 }
1809
1810 if (to_cleanup_pid > 0)
1811 (void)wait_for_pid(to_cleanup_pid);
1812
1813 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1814 lxc_terminal_delete(&terminal);
1815 lxc_terminal_conf_free(&terminal);
1816 }
1817
1818 put_attach_context(ctx);
1819 return ret_parent;
1820 }
1821
1822 int lxc_attach_run_command(void *payload)
1823 {
1824 int ret = -1;
1825 lxc_attach_command_t *cmd = payload;
1826
1827 ret = execvp(cmd->program, cmd->argv);
1828 if (ret < 0) {
1829 switch (errno) {
1830 case ENOEXEC:
1831 ret = 126;
1832 break;
1833 case ENOENT:
1834 ret = 127;
1835 break;
1836 }
1837 }
1838
1839 return syserror_ret(ret, "Failed to exec \"%s\"", cmd->program);
1840 }
1841
1842 int lxc_attach_run_shell(void* payload)
1843 {
1844 __do_free char *buf = NULL;
1845 uid_t uid;
1846 struct passwd pwent;
1847 struct passwd *pwentp = NULL;
1848 char *user_shell;
1849 ssize_t bufsize;
1850 int ret;
1851
1852 /* Ignore payload parameter. */
1853 (void)payload;
1854
1855 uid = getuid();
1856
1857 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
1858 if (bufsize < 0)
1859 bufsize = 1024;
1860
1861 buf = malloc(bufsize);
1862 if (buf) {
1863 ret = getpwuid_r(uid, &pwent, buf, bufsize, &pwentp);
1864 if (!pwentp) {
1865 if (ret == 0)
1866 WARN("Could not find matched password record");
1867
1868 WARN("Failed to get password record - %u", uid);
1869 }
1870 }
1871
1872 /* This probably happens because of incompatible nss implementations in
1873 * host and container (remember, this code is still using the host's
1874 * glibc but our mount namespace is in the container) we may try to get
1875 * the information by spawning a [getent passwd uid] process and parsing
1876 * the result.
1877 */
1878 if (!pwentp)
1879 user_shell = lxc_attach_getpwshell(uid);
1880 else
1881 user_shell = pwent.pw_shell;
1882
1883 if (user_shell)
1884 execlp(user_shell, user_shell, (char *)NULL);
1885
1886 /* Executed if either no passwd entry or execvp fails, we will fall back
1887 * on /bin/sh as a default shell.
1888 */
1889 execlp("/bin/sh", "/bin/sh", (char *)NULL);
1890
1891 SYSERROR("Failed to execute shell");
1892 if (!pwentp)
1893 free(user_shell);
1894
1895 return -1;
1896 }