]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/attach.c
conf: port environment to new list type
[mirror_lxc.git] / src / lxc / attach.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <errno.h>
7 #include <fcntl.h>
8 #include <grp.h>
9 #include <linux/unistd.h>
10 #include <pwd.h>
11 #include <pthread.h>
12 #include <signal.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <sys/mount.h>
17 #include <sys/param.h>
18 #include <sys/prctl.h>
19 #include <sys/socket.h>
20 #include <sys/syscall.h>
21 #include <sys/wait.h>
22 #include <termios.h>
23 #include <unistd.h>
24
25 #include <lxc/lxccontainer.h>
26
27 #include "af_unix.h"
28 #include "attach.h"
29 #include "caps.h"
30 #include "cgroups/cgroup.h"
31 #include "cgroups/cgroup_utils.h"
32 #include "commands.h"
33 #include "conf.h"
34 #include "config.h"
35 #include "confile.h"
36 #include "log.h"
37 #include "lsm/lsm.h"
38 #include "lxclock.h"
39 #include "lxcseccomp.h"
40 #include "macro.h"
41 #include "mainloop.h"
42 #include "memory_utils.h"
43 #include "mount_utils.h"
44 #include "namespace.h"
45 #include "process_utils.h"
46 #include "sync.h"
47 #include "syscall_wrappers.h"
48 #include "terminal.h"
49 #include "utils.h"
50
/*
 * NOTE(review): presumably declares the "attach" log category that the
 * TRACE/INFO/WARN/ERROR macros in this file log under - confirm against log.h.
 */
51 lxc_log_define(attach, lxc);
52
53 /* Default attach options, used when the caller does not supply any; initialized from LXC_ATTACH_OPTIONS_DEFAULT. */
54 static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
55
56 /*
57 * The context used to attach to the container.
58 * @attach_flags : the attach flags specified in lxc_attach_options_t
59 * @init_pid : the PID of the container's init process
60 * @dfd_init_pid : file descriptor to /proc/@init_pid
61 * __Must be closed in attach_context_security_barrier()__!
62 * @dfd_self_pid : file descriptor to /proc/self
63 * __Must be closed in attach_context_security_barrier()__!
64 * @setup_ns_uid : if CLONE_NEWUSER is specified will contain the uid used
65 * during attach setup.
66 * @setup_ns_gid : if CLONE_NEWUSER is specified will contain the gid used
67 * during attach setup.
68 * @target_ns_uid : if CLONE_NEWUSER is specified the uid that the final
69 * program will be run with.
70 * @target_ns_gid : if CLONE_NEWUSER is specified the gid that the final
71 * program will be run with.
72 * @target_host_uid : if CLONE_NEWUSER is specified the uid that the final
73 * program will be run with on the host.
74 * @target_host_gid : if CLONE_NEWUSER is specified the gid that the final
75 * program will be run with on the host.
76 * @lsm_label : LSM label to be used for the attaching process
77 * @container : the container we're attaching o
78 * @personality : the personality to use for the final program
79 * @capability : the capability mask of the @init_pid
80 * @ns_inherited : flags of namespaces that the final program will inherit
81 * from @init_pid
82 * @ns_fd : file descriptors to @init_pid's namespaces
83 */
84 struct attach_context {
85 unsigned int attach_flags;
86 int init_pid;
87 int init_pidfd;
88 int dfd_init_pid;
89 int dfd_self_pid;
90 uid_t setup_ns_uid;
91 gid_t setup_ns_gid;
92 uid_t target_ns_uid;
93 gid_t target_ns_gid;
94 uid_t target_host_uid;
95 uid_t target_host_gid;
96 char *lsm_label;
97 struct lxc_container *container;
98 personality_t personality;
99 unsigned long long capability_mask;
100 int ns_inherited;
101 int ns_fd[LXC_NS_MAX];
102 struct lsm_ops *lsm_ops;
103 };
104
105 static pid_t pidfd_get_pid(int dfd_init_pid, int pidfd)
106 {
107 __do_free char *line = NULL;
108 __do_fclose FILE *f = NULL;
109 size_t len = 0;
110 char path[STRLITERALLEN("fdinfo/") + INTTYPE_TO_STRLEN(int) + 1 ] = "fdinfo/";
111 int ret;
112
113 if (dfd_init_pid < 0 || pidfd < 0)
114 return ret_errno(EBADF);
115
116 ret = strnprintf(path + STRLITERALLEN("fdinfo/"), INTTYPE_TO_STRLEN(int), "%d", pidfd);
117 if (ret < 0)
118 return ret_errno(EIO);
119
120 f = fdopen_at(dfd_init_pid, path, "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
121 if (!f)
122 return -errno;
123
124 while (getline(&line, &len, f) != -1) {
125 const char *prefix = "Pid:\t";
126 const size_t prefix_len = STRLITERALLEN("Pid:\t");
127 int pid = -ESRCH;
128 char *slider = line;
129
130 if (!strnequal(slider, prefix, prefix_len))
131 continue;
132
133 slider += prefix_len;
134 slider = lxc_trim_whitespace_in_place(slider);
135
136 ret = lxc_safe_int(slider, &pid);
137 if (ret)
138 return -ret;
139
140 return pid;
141 }
142
143 return ret_errno(ENOENT);
144 }
145
/* Write @pid to @fd; succeeds only if the complete pid_t was written. */
static inline bool sync_wake_pid(int fd, pid_t pid)
{
	if (lxc_write_nointr(fd, &pid, sizeof(pid_t)) != sizeof(pid_t))
		return false;

	return true;
}
150
/* Read a pid_t from @fd into @pid; succeeds only if a complete pid_t was read. */
static inline bool sync_wait_pid(int fd, pid_t *pid)
{
	if (lxc_read_nointr(fd, pid, sizeof(pid_t)) != sizeof(pid_t))
		return false;

	return true;
}
155
/* Pass the single file descriptor @fd_send over the unix socket @fd. */
static inline bool sync_wake_fd(int fd, int fd_send)
{
	if (lxc_abstract_unix_send_fds(fd, &fd_send, 1, NULL, 0) > 0)
		return true;

	return false;
}
160
/* Receive one file descriptor from the unix socket @fd into @fd_recv. */
static inline bool sync_wait_fd(int fd, int *fd_recv)
{
	if (lxc_abstract_unix_recv_one_fd(fd, fd_recv, NULL, 0) > 0)
		return true;

	return false;
}
165
166 static bool attach_lsm(lxc_attach_options_t *options)
167 {
168 return (options->attach_flags & (LXC_ATTACH_LSM | LXC_ATTACH_LSM_LABEL));
169 }
170
171 static struct attach_context *alloc_attach_context(void)
172 {
173 struct attach_context *ctx;
174
175 ctx = zalloc(sizeof(struct attach_context));
176 if (!ctx)
177 return ret_set_errno(NULL, ENOMEM);
178
179 ctx->init_pid = -ESRCH;
180
181 ctx->dfd_self_pid = -EBADF;
182 ctx->dfd_init_pid = -EBADF;
183 ctx->init_pidfd = -EBADF;
184
185 ctx->setup_ns_uid = LXC_INVALID_UID;
186 ctx->setup_ns_gid = LXC_INVALID_GID;
187 ctx->target_ns_uid = LXC_INVALID_UID;
188 ctx->target_ns_gid = LXC_INVALID_GID;
189 ctx->target_host_uid = LXC_INVALID_UID;
190 ctx->target_host_gid = LXC_INVALID_GID;
191
192 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++)
193 ctx->ns_fd[i] = -EBADF;
194
195 return ctx;
196 }
197
198 static int get_personality(const char *name, const char *lxcpath,
199 personality_t *personality)
200 {
201 __do_free char *p = NULL;
202 int ret;
203 signed long per;
204
205 p = lxc_cmd_get_config_item(name, "lxc.arch", lxcpath);
206 if (!p) {
207 *personality = LXC_ARCH_UNCHANGED;
208 return 0;
209 }
210
211 ret = lxc_config_parse_arch(p, &per);
212 if (ret < 0)
213 return syserror("Failed to parse personality");
214
215 *personality = per;
216 return 0;
217 }
218
219 static int userns_setup_ids(struct attach_context *ctx,
220 lxc_attach_options_t *options)
221 {
222 __do_free char *line = NULL;
223 __do_fclose FILE *f_gidmap = NULL, *f_uidmap = NULL;
224 size_t len = 0;
225 uid_t init_ns_uid = LXC_INVALID_UID;
226 gid_t init_ns_gid = LXC_INVALID_GID;
227 uid_t nsuid, hostuid, range_uid;
228 gid_t nsgid, hostgid, range_gid;
229
230 if (!(options->namespaces & CLONE_NEWUSER))
231 return 0;
232
233 f_uidmap = fdopen_at(ctx->dfd_init_pid, "uid_map", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
234 if (!f_uidmap)
235 return log_error_errno(-errno, errno, "Failed to open uid_map");
236
237 while (getline(&line, &len, f_uidmap) != -1) {
238 if (sscanf(line, "%u %u %u", &nsuid, &hostuid, &range_uid) != 3)
239 continue;
240
241 if (0 >= nsuid && 0 < nsuid + range_uid) {
242 ctx->setup_ns_uid = 0;
243 TRACE("Container has mapping for uid 0");
244 break;
245 }
246
247 if (ctx->target_host_uid >= hostuid && ctx->target_host_uid < hostuid + range_uid) {
248 init_ns_uid = (ctx->target_host_uid - hostuid) + nsuid;
249 TRACE("Container runs with uid %d", init_ns_uid);
250 }
251 }
252
253 f_gidmap = fdopen_at(ctx->dfd_init_pid, "gid_map", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
254 if (!f_gidmap)
255 return log_error_errno(-errno, errno, "Failed to open gid_map");
256
257 while (getline(&line, &len, f_gidmap) != -1) {
258 if (sscanf(line, "%u %u %u", &nsgid, &hostgid, &range_gid) != 3)
259 continue;
260
261 if (0 >= nsgid && 0 < nsgid + range_gid) {
262 ctx->setup_ns_gid = 0;
263 TRACE("Container has mapping for gid 0");
264 break;
265 }
266
267 if (ctx->target_host_gid >= hostgid && ctx->target_host_gid < hostgid + range_gid) {
268 init_ns_gid = (ctx->target_host_gid - hostgid) + nsgid;
269 TRACE("Container runs with gid %d", init_ns_gid);
270 }
271 }
272
273 if (ctx->setup_ns_uid == LXC_INVALID_UID)
274 ctx->setup_ns_uid = init_ns_uid;
275
276 if (ctx->setup_ns_gid == LXC_INVALID_UID)
277 ctx->setup_ns_gid = init_ns_gid;
278
279 return 0;
280 }
281
282 static void userns_target_ids(struct attach_context *ctx, lxc_attach_options_t *options)
283 {
284 if (options->uid != LXC_INVALID_UID)
285 ctx->target_ns_uid = options->uid;
286 else if (options->namespaces & CLONE_NEWUSER)
287 ctx->target_ns_uid = ctx->setup_ns_uid;
288 else
289 ctx->target_ns_uid = 0;
290
291 if (ctx->target_ns_uid == LXC_INVALID_UID)
292 WARN("Invalid uid specified");
293
294 if (options->gid != LXC_INVALID_GID)
295 ctx->target_ns_gid = options->gid;
296 else if (options->namespaces & CLONE_NEWUSER)
297 ctx->target_ns_gid = ctx->setup_ns_gid;
298 else
299 ctx->target_ns_gid = 0;
300
301 if (ctx->target_ns_gid == LXC_INVALID_GID)
302 WARN("Invalid gid specified");
303 }
304
305 static int parse_init_status(struct attach_context *ctx, lxc_attach_options_t *options)
306 {
307 __do_free char *line = NULL;
308 __do_fclose FILE *f = NULL;
309 size_t len = 0;
310 bool caps_found = false;
311 int ret;
312
313 f = fdopen_at(ctx->dfd_init_pid, "status", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
314 if (!f)
315 return log_error_errno(-errno, errno, "Failed to open status file");
316
317 while (getline(&line, &len, f) != -1) {
318 signed long value = -1;
319
320 /*
321 * Format is: real, effective, saved set user, fs we only care
322 * about real uid.
323 */
324 ret = sscanf(line, "Uid: %ld", &value);
325 if (ret != EOF && ret == 1) {
326 ctx->target_host_uid = (uid_t)value;
327 TRACE("Container's init process runs with hostuid %d", ctx->target_host_uid);
328 goto next;
329 }
330
331 ret = sscanf(line, "Gid: %ld", &value);
332 if (ret != EOF && ret == 1) {
333 ctx->target_host_gid = (gid_t)value;
334 TRACE("Container's init process runs with hostgid %d", ctx->target_host_gid);
335 goto next;
336 }
337
338 ret = sscanf(line, "CapBnd: %llx", &ctx->capability_mask);
339 if (ret != EOF && ret == 1) {
340 caps_found = true;
341 goto next;
342 }
343
344 next:
345 if (ctx->target_host_uid != LXC_INVALID_UID &&
346 ctx->target_host_gid != LXC_INVALID_GID &&
347 caps_found)
348 break;
349
350 }
351
352 ret = userns_setup_ids(ctx, options);
353 if (ret)
354 return log_error_errno(ret, errno, "Failed to get setup ids");
355 userns_target_ids(ctx, options);
356
357 return 0;
358 }
359
360 static bool pidfd_setns_supported(struct attach_context *ctx)
361 {
362 int ret;
363
364 /*
365 * The ability to attach to time namespaces came after the introduction
366 * of of using pidfds for attaching to namespaces. To avoid having to
367 * special-case both CLONE_NEWUSER and CLONE_NEWTIME handling, let's
368 * use CLONE_NEWTIME as gatekeeper.
369 */
370 if (ctx->init_pidfd >= 0)
371 ret = setns(ctx->init_pidfd, CLONE_NEWTIME);
372 else
373 ret = -EOPNOTSUPP;
374 TRACE("Attaching to namespaces via pidfds %s",
375 ret ? "unsupported" : "supported");
376 return ret == 0;
377 }
378
379 static int get_attach_context(struct attach_context *ctx,
380 struct lxc_container *container,
381 lxc_attach_options_t *options)
382 {
383 __do_free char *lsm_label = NULL;
384 int ret;
385 char path[LXC_PROC_PID_LEN];
386
387 ctx->container = container;
388 ctx->attach_flags = options->attach_flags;
389
390 ctx->dfd_self_pid = open_at(-EBADF, "/proc/self",
391 PROTECT_OPATH_FILE & ~O_NOFOLLOW,
392 (PROTECT_LOOKUP_ABSOLUTE_WITH_SYMLINKS & ~RESOLVE_NO_XDEV), 0);
393 if (ctx->dfd_self_pid < 0)
394 return log_error_errno(-errno, errno, "Failed to open /proc/self");
395
396 ctx->init_pidfd = lxc_cmd_get_init_pidfd(container->name, container->config_path);
397 if (ctx->init_pidfd >= 0)
398 ctx->init_pid = pidfd_get_pid(ctx->dfd_self_pid, ctx->init_pidfd);
399 else
400 ctx->init_pid = lxc_cmd_get_init_pid(container->name, container->config_path);
401 if (ctx->init_pid < 0)
402 return log_error(-1, "Failed to get init pid");
403
404 ret = strnprintf(path, sizeof(path), "/proc/%d", ctx->init_pid);
405 if (ret < 0)
406 return ret_errno(EIO);
407
408 ctx->dfd_init_pid = open_at(-EBADF, path,
409 PROTECT_OPATH_DIRECTORY,
410 (PROTECT_LOOKUP_ABSOLUTE & ~RESOLVE_NO_XDEV), 0);
411 if (ctx->dfd_init_pid < 0)
412 return log_error_errno(-errno, errno, "Failed to open /proc/%d", ctx->init_pid);
413
414 if (ctx->init_pidfd >= 0) {
415 ret = lxc_raw_pidfd_send_signal(ctx->init_pidfd, 0, NULL, 0);
416 if (ret)
417 return log_error_errno(-errno, errno, "Container process exited or PID has been recycled");
418 else
419 TRACE("Container process still running and PID was not recycled");
420
421 if (!pidfd_setns_supported(ctx)) {
422 /* We can't risk leaking file descriptors during attach. */
423 if (close(ctx->init_pidfd))
424 return log_error_errno(-errno, errno, "Failed to close pidfd");
425
426 ctx->init_pidfd = -EBADF;
427 TRACE("Attaching to namespaces via pidfds not supported");
428 }
429 }
430
431 /* Determine which namespaces the container was created with. */
432 if (options->namespaces == -1) {
433 options->namespaces = lxc_cmd_get_clone_flags(container->name, container->config_path);
434 if (options->namespaces == -1)
435 return log_error_errno(-EINVAL, EINVAL, "Failed to automatically determine the namespaces which the container uses");
436
437 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
438 if (ns_info[i].clone_flag & CLONE_NEWCGROUP)
439 if (!(options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) ||
440 !cgns_supported())
441 continue;
442
443 if (ns_info[i].clone_flag & options->namespaces)
444 continue;
445
446 ctx->ns_inherited |= ns_info[i].clone_flag;
447 }
448 }
449
450 ret = parse_init_status(ctx, options);
451 if (ret)
452 return log_error_errno(-errno, errno, "Failed to open parse file");
453
454 ctx->lsm_ops = lsm_init_static();
455
456 if (attach_lsm(options)) {
457 if (ctx->attach_flags & LXC_ATTACH_LSM_LABEL)
458 lsm_label = options->lsm_label;
459 else
460 lsm_label = ctx->lsm_ops->process_label_get_at(ctx->lsm_ops, ctx->dfd_init_pid);
461 if (!lsm_label)
462 WARN("No security context received");
463 else
464 INFO("Retrieved security context %s", lsm_label);
465 }
466
467 ret = get_personality(container->name, container->config_path, &ctx->personality);
468 if (ret)
469 return log_error_errno(ret, errno, "Failed to get personality of the container");
470
471 if (!ctx->container->lxc_conf) {
472 ctx->container->lxc_conf = lxc_conf_init();
473 if (!ctx->container->lxc_conf)
474 return log_error_errno(-ENOMEM, ENOMEM, "Failed to allocate new lxc config");
475 }
476
477 ctx->lsm_label = move_ptr(lsm_label);
478 return 0;
479 }
480
/*
 * Check whether @ns_path resolves to the same file when looked up relative to
 * @dfd_pid1 and relative to @dfd_pid2.
 *
 * Returns 1 if device and inode are identical (same namespace), 0 if they
 * differ, and -errno if either lookup fails.
 */
static int same_nsfd(int dfd_pid1, int dfd_pid2, const char *ns_path)
{
	struct stat st_first, st_second;

	if (fstatat(dfd_pid1, ns_path, &st_first, 0) != 0)
		return -errno;

	if (fstatat(dfd_pid2, ns_path, &st_second, 0) != 0)
		return -errno;

	if (st_first.st_dev != st_second.st_dev)
		return 0;

	if (st_first.st_ino != st_second.st_ino)
		return 0;

	/* Same device and same inode: the processes share the namespace. */
	return 1;
}
501
502 static int same_ns(int dfd_pid1, int dfd_pid2, const char *ns_path)
503 {
504 __do_close int ns_fd2 = -EBADF;
505 int ret = -1;
506
507 ns_fd2 = open_at(dfd_pid2, ns_path, PROTECT_OPEN_WITH_TRAILING_SYMLINKS,
508 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
509 ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)), 0);
510 if (ns_fd2 < 0) {
511 if (errno == ENOENT)
512 return -ENOENT;
513 return syserror("Failed to open %d(%s)", dfd_pid2, ns_path);
514 }
515
516 ret = same_nsfd(dfd_pid1, dfd_pid2, ns_path);
517 switch (ret) {
518 case -ENOENT:
519 __fallthrough;
520 case 1:
521 return ret_errno(ENOENT);
522 case 0:
523 /* processes are in different namespaces */
524 return move_fd(ns_fd2);
525 }
526
527 return ret;
528 }
529
530 static int __prepare_namespaces_pidfd(struct attach_context *ctx)
531 {
532 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
533 int ret;
534
535 ret = same_nsfd(ctx->dfd_self_pid,
536 ctx->dfd_init_pid,
537 ns_info[i].proc_path);
538 switch (ret) {
539 case -ENOENT:
540 __fallthrough;
541 case 1:
542 ctx->ns_inherited &= ~ns_info[i].clone_flag;
543 TRACE("Shared %s namespace doesn't need attach", ns_info[i].proc_name);
544 continue;
545 case 0:
546 TRACE("Different %s namespace needs attach", ns_info[i].proc_name);
547 continue;
548 }
549
550 return syserror("Failed to determine whether %s namespace is shared",
551 ns_info[i].proc_name);
552 }
553
554 return 0;
555 }
556
557 static int __prepare_namespaces_nsfd(struct attach_context *ctx,
558 lxc_attach_options_t *options)
559 {
560 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
561 lxc_namespace_t j;
562
563 if (options->namespaces & ns_info[i].clone_flag)
564 ctx->ns_fd[i] = open_at(ctx->dfd_init_pid,
565 ns_info[i].proc_path,
566 PROTECT_OPEN_WITH_TRAILING_SYMLINKS,
567 (PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS &
568 ~(RESOLVE_NO_XDEV | RESOLVE_BENEATH)),
569 0);
570 else if (ctx->ns_inherited & ns_info[i].clone_flag)
571 ctx->ns_fd[i] = same_ns(ctx->dfd_self_pid,
572 ctx->dfd_init_pid,
573 ns_info[i].proc_path);
574 else
575 continue;
576
577 if (ctx->ns_fd[i] >= 0)
578 continue;
579
580 if (ctx->ns_fd[i] == -ENOENT) {
581 ctx->ns_inherited &= ~ns_info[i].clone_flag;
582 continue;
583 }
584
585 /* We failed to preserve the namespace. */
586 SYSERROR("Failed to preserve %s namespace of %d",
587 ns_info[i].proc_name, ctx->init_pid);
588
589 /* Close all already opened file descriptors before we return an
590 * error, so we don't leak them.
591 */
592 for (j = 0; j < i; j++)
593 close_prot_errno_disarm(ctx->ns_fd[j]);
594
595 return -1;
596 }
597
598 return 0;
599 }
600
601 static int prepare_namespaces(struct attach_context *ctx,
602 lxc_attach_options_t *options)
603 {
604 if (ctx->init_pidfd < 0)
605 return __prepare_namespaces_nsfd(ctx, options);
606
607 return __prepare_namespaces_pidfd(ctx);
608 }
609
610 static inline void put_namespaces(struct attach_context *ctx)
611 {
612 if (ctx->init_pidfd < 0) {
613 for (int i = 0; i < LXC_NS_MAX; i++)
614 close_prot_errno_disarm(ctx->ns_fd[i]);
615 }
616 }
617
618 static int __attach_namespaces_pidfd(struct attach_context *ctx,
619 lxc_attach_options_t *options)
620 {
621 unsigned int ns_flags = options->namespaces | ctx->ns_inherited;
622 int ret;
623
624 /* The common case is to attach to all namespaces. */
625 ret = setns(ctx->init_pidfd, ns_flags);
626 if (ret)
627 return log_error_errno(-errno, errno,
628 "Failed to attach to namespaces via pidfd");
629
630 /* We can't risk leaking file descriptors into the container. */
631 if (close(ctx->init_pidfd))
632 return log_error_errno(-errno, errno, "Failed to close pidfd");
633 ctx->init_pidfd = -EBADF;
634
635 return log_trace(0, "Attached to container namespaces via pidfd");
636 }
637
638 static int __attach_namespaces_nsfd(struct attach_context *ctx,
639 lxc_attach_options_t *options)
640 {
641 int fret = 0;
642
643 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
644 int ret;
645
646 if (ctx->ns_fd[i] < 0)
647 continue;
648
649 ret = setns(ctx->ns_fd[i], ns_info[i].clone_flag);
650 if (ret)
651 return log_error_errno(-errno, errno,
652 "Failed to attach to %s namespace of %d",
653 ns_info[i].proc_name,
654 ctx->init_pid);
655
656 if (close(ctx->ns_fd[i])) {
657 fret = -errno;
658 SYSERROR("Failed to close file descriptor for %s namespace",
659 ns_info[i].proc_name);
660 }
661 ctx->ns_fd[i] = -EBADF;
662 }
663
664 return fret;
665 }
666
667 static int attach_namespaces(struct attach_context *ctx,
668 lxc_attach_options_t *options)
669 {
670 if (lxc_log_trace()) {
671 for (lxc_namespace_t i = 0; i < LXC_NS_MAX; i++) {
672 if (ns_info[i].clone_flag & options->namespaces) {
673 TRACE("Attaching to %s namespace", ns_info[i].proc_name);
674 continue;
675 }
676 if (ns_info[i].clone_flag & ctx->ns_inherited) {
677 TRACE("Sharing %s namespace", ns_info[i].proc_name);
678 continue;
679 }
680 TRACE("Inheriting %s namespace", ns_info[i].proc_name);
681 }
682 }
683
684 if (ctx->init_pidfd < 0)
685 return __attach_namespaces_nsfd(ctx, options);
686
687 return __attach_namespaces_pidfd(ctx, options);
688 }
689
690 static void put_attach_context(struct attach_context *ctx)
691 {
692 if (ctx) {
693 if (!(ctx->attach_flags & LXC_ATTACH_LSM_LABEL))
694 free_disarm(ctx->lsm_label);
695 close_prot_errno_disarm(ctx->dfd_init_pid);
696
697 if (ctx->container) {
698 lxc_container_put(ctx->container);
699 ctx->container = NULL;
700 }
701
702 put_namespaces(ctx);
703 free(ctx);
704 }
705 }
706
707 /*
708 * Place anything in here that needs to be get rid of before we move into the
709 * container's context and fail hard if we can't.
710 */
711 static bool attach_context_security_barrier(struct attach_context *ctx)
712 {
713 if (ctx) {
714 if (close(ctx->dfd_self_pid))
715 return false;
716 ctx->dfd_self_pid = -EBADF;
717
718 if (close(ctx->dfd_init_pid))
719 return false;
720 ctx->dfd_init_pid = -EBADF;
721 }
722
723 return true;
724 }
725
726 int lxc_attach_remount_sys_proc(void)
727 {
728 int ret;
729
730 ret = unshare(CLONE_NEWNS);
731 if (ret < 0)
732 return log_error_errno(-1, errno, "Failed to unshare mount namespace");
733
734 if (detect_shared_rootfs() && mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL))
735 SYSERROR("Failed to recursively turn root mount tree into dependent mount. Continuing...");
736
737 /* Assume /proc is always mounted, so remount it. */
738 ret = umount2("/proc", MNT_DETACH);
739 if (ret < 0)
740 return log_error_errno(-1, errno, "Failed to unmount /proc");
741
742 ret = mount("none", "/proc", "proc", 0, NULL);
743 if (ret < 0)
744 return log_error_errno(-1, errno, "Failed to remount /proc");
745
746 /*
747 * Try to umount /sys. If it's not a mount point, we'll get EINVAL, then
748 * we ignore it because it may not have been mounted in the first place.
749 */
750 ret = umount2("/sys", MNT_DETACH);
751 if (ret < 0 && errno != EINVAL)
752 return log_error_errno(-1, errno, "Failed to unmount /sys");
753
754 /* Remount it. */
755 if (ret == 0 && mount("none", "/sys", "sysfs", 0, NULL))
756 return log_error_errno(-1, errno, "Failed to remount /sys");
757
758 return 0;
759 }
760
761 static int drop_capabilities(struct attach_context *ctx)
762 {
763 int last_cap;
764
765 last_cap = lxc_caps_last_cap();
766 for (int cap = 0; cap <= last_cap; cap++) {
767 if (ctx->capability_mask & (1LL << cap))
768 continue;
769
770 if (prctl(PR_CAPBSET_DROP, prctl_arg(cap), prctl_arg(0),
771 prctl_arg(0), prctl_arg(0)))
772 return log_error_errno(-1, errno, "Failed to drop capability %d", cap);
773
774 TRACE("Dropped capability %d", cap);
775 }
776
777 return 0;
778 }
779
780 static int lxc_attach_set_environment(struct attach_context *ctx,
781 enum lxc_attach_env_policy_t policy,
782 char **extra_env, char **extra_keep)
783 {
784 int ret;
785
786 if (policy == LXC_ATTACH_CLEAR_ENV) {
787 int path_kept = 0;
788 char **extra_keep_store = NULL;
789
790 if (extra_keep) {
791 size_t count, i;
792
793 for (count = 0; extra_keep[count]; count++)
794 ;
795
796 extra_keep_store = zalloc(count * sizeof(char *));
797 if (!extra_keep_store)
798 return -1;
799
800 for (i = 0; i < count; i++) {
801 char *v = getenv(extra_keep[i]);
802 if (v) {
803 extra_keep_store[i] = strdup(v);
804 if (!extra_keep_store[i]) {
805 while (i > 0)
806 free(extra_keep_store[--i]);
807
808 free(extra_keep_store);
809 return -1;
810 }
811
812 if (strequal(extra_keep[i], "PATH"))
813 path_kept = 1;
814 }
815 }
816 }
817
818 if (clearenv()) {
819 if (extra_keep_store) {
820 char **p;
821
822 for (p = extra_keep_store; *p; p++)
823 free(*p);
824
825 free(extra_keep_store);
826 }
827
828 return log_error(-1, "Failed to clear environment");
829 }
830
831 if (extra_keep_store) {
832 size_t i;
833
834 for (i = 0; extra_keep[i]; i++) {
835 if (extra_keep_store[i]) {
836 ret = setenv(extra_keep[i], extra_keep_store[i], 1);
837 if (ret < 0)
838 SYSWARN("Failed to set environment variable");
839 }
840
841 free(extra_keep_store[i]);
842 }
843
844 free(extra_keep_store);
845 }
846
847 /* Always set a default path; shells and execlp tend to be fine
848 * without it, but there is a disturbing number of C programs
849 * out there that just assume that getenv("PATH") is never NULL
850 * and then die a painful segfault death.
851 */
852 if (!path_kept) {
853 ret = setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
854 if (ret < 0)
855 SYSWARN("Failed to set environment variable");
856 }
857 }
858
859 ret = putenv("container=lxc");
860 if (ret < 0)
861 return log_warn(-1, "Failed to set environment variable");
862
863 /* Set container environment variables.*/
864 if (ctx->container->lxc_conf) {
865 ret = lxc_set_environment(ctx->container->lxc_conf);
866 if (ret < 0)
867 return -1;
868 }
869
870 /* Set extra environment variables. */
871 if (extra_env) {
872 for (; *extra_env; extra_env++) {
873 char *p;
874
875 /* We just assume the user knows what they are doing, so
876 * we don't do any checks.
877 */
878 p = strdup(*extra_env);
879 if (!p)
880 return -1;
881
882 ret = putenv(p);
883 if (ret < 0)
884 SYSWARN("Failed to set environment variable");
885 }
886 }
887
888 return 0;
889 }
890
891 static char *lxc_attach_getpwshell(uid_t uid)
892 {
893 __do_free char *line = NULL, *result = NULL;
894 __do_fclose FILE *pipe_f = NULL;
895 int fd, ret;
896 pid_t pid;
897 int pipes[2];
898 bool found = false;
899 size_t line_bufsz = 0;
900
901 /* We need to fork off a process that runs the getent program, and we
902 * need to capture its output, so we use a pipe for that purpose.
903 */
904 ret = pipe2(pipes, O_CLOEXEC);
905 if (ret < 0)
906 return NULL;
907
908 pid = fork();
909 if (pid < 0) {
910 close(pipes[0]);
911 close(pipes[1]);
912 return NULL;
913 }
914
915 if (!pid) {
916 char uid_buf[32];
917 char *arguments[] = {
918 "getent",
919 "passwd",
920 uid_buf,
921 NULL
922 };
923
924 close(pipes[0]);
925
926 /* We want to capture stdout. */
927 ret = dup2(pipes[1], STDOUT_FILENO);
928 close(pipes[1]);
929 if (ret < 0)
930 _exit(EXIT_FAILURE);
931
932 /* Get rid of stdin/stderr, so we try to associate it with
933 * /dev/null.
934 */
935 fd = open_devnull();
936 if (fd < 0) {
937 close(STDIN_FILENO);
938 close(STDERR_FILENO);
939 } else {
940 (void)dup3(fd, STDIN_FILENO, O_CLOEXEC);
941 (void)dup3(fd, STDERR_FILENO, O_CLOEXEC);
942 close(fd);
943 }
944
945 /* Finish argument list. */
946 ret = strnprintf(uid_buf, sizeof(uid_buf), "%ld", (long)uid);
947 if (ret <= 0)
948 _exit(EXIT_FAILURE);
949
950 /* Try to run getent program. */
951 (void)execvp("getent", arguments);
952 _exit(EXIT_FAILURE);
953 }
954
955 close(pipes[1]);
956
957 pipe_f = fdopen(pipes[0], "re");
958 if (!pipe_f) {
959 close(pipes[0]);
960 goto reap_child;
961 }
962 /* Transfer ownership of pipes[0] to pipe_f. */
963 move_fd(pipes[0]);
964
965 while (getline(&line, &line_bufsz, pipe_f) != -1) {
966 int i;
967 long value;
968 char *token;
969 char *endptr = NULL, *saveptr = NULL;
970
971 /* If we already found something, just continue to read
972 * until the pipe doesn't deliver any more data, but
973 * don't modify the existing data structure.
974 */
975 if (found)
976 continue;
977
978 if (!line)
979 continue;
980
981 /* Trim line on the right hand side. */
982 for (i = strlen(line); i > 0 && (line[i - 1] == '\n' || line[i - 1] == '\r'); --i)
983 line[i - 1] = '\0';
984
985 /* Split into tokens: first: user name. */
986 token = strtok_r(line, ":", &saveptr);
987 if (!token)
988 continue;
989
990 /* next: placeholder password field */
991 token = strtok_r(NULL, ":", &saveptr);
992 if (!token)
993 continue;
994
995 /* next: user id */
996 token = strtok_r(NULL, ":", &saveptr);
997 value = token ? strtol(token, &endptr, 10) : 0;
998 if (!token || !endptr || *endptr || value == LONG_MIN ||
999 value == LONG_MAX)
1000 continue;
1001
1002 /* placeholder conherence check: user id matches */
1003 if ((uid_t)value != uid)
1004 continue;
1005
1006 /* skip fields: gid, gecos, dir, go to next field 'shell' */
1007 for (i = 0; i < 4; i++) {
1008 token = strtok_r(NULL, ":", &saveptr);
1009 if (!token)
1010 continue;
1011 }
1012
1013 if (!token)
1014 continue;
1015
1016 free_disarm(result);
1017 result = strdup(token);
1018
1019 /* Sanity check that there are no fields after that. */
1020 token = strtok_r(NULL, ":", &saveptr);
1021 if (token)
1022 continue;
1023
1024 found = true;
1025 }
1026
1027 reap_child:
1028 ret = wait_for_pid(pid);
1029 if (ret < 0)
1030 return NULL;
1031
1032 if (!found)
1033 return NULL;
1034
1035 return move_ptr(result);
1036 }
1037
1038 static bool fetch_seccomp(struct lxc_container *c, lxc_attach_options_t *options)
1039 {
1040 __do_free char *path = NULL;
1041 int ret;
1042 bool bret;
1043
1044 if (!attach_lsm(options)) {
1045 free_disarm(c->lxc_conf->seccomp.seccomp);
1046 return true;
1047 }
1048
1049 /* Remove current setting. */
1050 if (!c->set_config_item(c, "lxc.seccomp.profile", "") &&
1051 !c->set_config_item(c, "lxc.seccomp", ""))
1052 return false;
1053
1054 /* Fetch the current profile path over the cmd interface. */
1055 path = c->get_running_config_item(c, "lxc.seccomp.profile");
1056 if (!path) {
1057 INFO("Failed to retrieve lxc.seccomp.profile");
1058
1059 path = c->get_running_config_item(c, "lxc.seccomp");
1060 if (!path)
1061 return log_info(true, "Failed to retrieve lxc.seccomp");
1062 }
1063
1064 /* Copy the value into the new lxc_conf. */
1065 bret = c->set_config_item(c, "lxc.seccomp.profile", path);
1066 if (!bret)
1067 return false;
1068
1069 /* Attempt to parse the resulting config. */
1070 ret = lxc_read_seccomp_config(c->lxc_conf);
1071 if (ret < 0)
1072 return log_error(false, "Failed to retrieve seccomp policy");
1073
1074 return log_info(true, "Retrieved seccomp policy");
1075 }
1076
1077 static bool no_new_privs(struct lxc_container *c, lxc_attach_options_t *options)
1078 {
1079 __do_free char *val = NULL;
1080
1081 /* Remove current setting. */
1082 if (!c->set_config_item(c, "lxc.no_new_privs", ""))
1083 return log_info(false, "Failed to unset lxc.no_new_privs");
1084
1085 /* Retrieve currently active setting. */
1086 val = c->get_running_config_item(c, "lxc.no_new_privs");
1087 if (!val)
1088 return log_info(false, "Failed to retrieve lxc.no_new_privs");
1089
1090 /* Set currently active setting. */
1091 return c->set_config_item(c, "lxc.no_new_privs", val);
1092 }
1093
/*
 * State handed to do_attach(), released with put_attach_payload().
 */
1094 struct attach_payload {
/* Socket on which do_attach() receives the LSM label fd; closed by put_attach_payload(). */
1095 int ipc_socket;
/* Closed by put_attach_payload(). NOTE(review): presumably the pts side of the attach terminal - confirm. */
1096 int terminal_pts_fd;
/* Attach options that do_attach() reads its flags and settings from. */
1097 lxc_attach_options_t *options;
/* Attach context; released through put_attach_context() by put_attach_payload(). */
1098 struct attach_context *ctx;
/* Taken over by do_attach() as the attach function and its argument. */
1099 lxc_attach_exec_t exec_function;
1100 void *exec_payload;
1101 };
1102
1103 static void put_attach_payload(struct attach_payload *p)
1104 {
1105 if (p) {
1106 close_prot_errno_disarm(p->ipc_socket);
1107 close_prot_errno_disarm(p->terminal_pts_fd);
1108 put_attach_context(p->ctx);
1109 p->ctx = NULL;
1110 }
1111 }
1112
1113 __noreturn static void do_attach(struct attach_payload *ap)
1114 {
1115 lxc_attach_exec_t attach_function = move_ptr(ap->exec_function);
1116 void *attach_function_args = move_ptr(ap->exec_payload);
1117 int fd_lsm, ret;
1118 lxc_attach_options_t* options = ap->options;
1119 struct attach_context *ctx = ap->ctx;
1120 struct lxc_conf *conf = ctx->container->lxc_conf;
1121
1122 /* A description of the purpose of this functionality is provided in the
1123 * lxc-attach(1) manual page. We have to remount here and not in the
1124 * parent process, otherwise /proc may not properly reflect the new pid
1125 * namespace.
1126 */
1127 if (!(options->namespaces & CLONE_NEWNS) &&
1128 (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
1129 ret = lxc_attach_remount_sys_proc();
1130 if (ret < 0)
1131 goto on_error;
1132
1133 TRACE("Remounted \"/proc\" and \"/sys\"");
1134 }
1135
1136 /* Now perform additional attachments. */
1137 if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
1138 long new_personality;
1139
1140 if (options->personality == LXC_ATTACH_DETECT_PERSONALITY)
1141 new_personality = ctx->personality;
1142 else
1143 new_personality = options->personality;
1144
1145 if (new_personality != LXC_ARCH_UNCHANGED) {
1146 ret = lxc_personality(new_personality);
1147 if (ret < 0)
1148 goto on_error;
1149
1150 TRACE("Set new personality");
1151 }
1152 }
1153
1154 if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
1155 ret = drop_capabilities(ctx);
1156 if (ret < 0)
1157 goto on_error;
1158
1159 TRACE("Dropped capabilities");
1160 }
1161
1162 /* Always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL)
1163 * if you want this to be a no-op).
1164 */
1165 ret = lxc_attach_set_environment(ctx,
1166 options->env_policy,
1167 options->extra_env_vars,
1168 options->extra_keep_env);
1169 if (ret < 0)
1170 goto on_error;
1171
1172 TRACE("Set up environment");
1173
1174 /*
1175 * This remark only affects fully unprivileged containers:
1176 * Receive fd for LSM security module before we set{g,u}id(). The reason
1177 * is that on set{g,u}id() the kernel will a) make us undumpable and b)
1178 * we will change our effective uid. This means our effective uid will
1179 * be different from the effective uid of the process that created us
1180 * which means that this processs no longer has capabilities in our
1181 * namespace including CAP_SYS_PTRACE. This means we will not be able to
1182 * read and /proc/<pid> files for the process anymore when /proc is
1183 * mounted with hidepid={1,2}. So let's get the lsm label fd before the
1184 * set{g,u}id().
1185 */
1186 if (attach_lsm(options) && ctx->lsm_label) {
1187 if (!sync_wait_fd(ap->ipc_socket, &fd_lsm)) {
1188 SYSERROR("Failed to receive lsm label fd");
1189 goto on_error;
1190 }
1191
1192 TRACE("Received LSM label file descriptor %d from parent", fd_lsm);
1193 }
1194
1195 if (options->stdin_fd > 0 && isatty(options->stdin_fd)) {
1196 ret = lxc_make_controlling_terminal(options->stdin_fd);
1197 if (ret < 0)
1198 goto on_error;
1199 }
1200
1201 if ((options->attach_flags & LXC_ATTACH_SETGROUPS) &&
1202 options->groups.size > 0) {
1203 if (!lxc_setgroups(options->groups.list, options->groups.size))
1204 goto on_error;
1205 } else {
1206 if (!lxc_drop_groups() && errno != EPERM)
1207 goto on_error;
1208 }
1209
1210 if (options->namespaces & CLONE_NEWUSER)
1211 if (!lxc_switch_uid_gid(ctx->setup_ns_uid, ctx->setup_ns_gid))
1212 goto on_error;
1213
1214 if (attach_lsm(options) && ctx->lsm_label) {
1215 bool on_exec;
1216
1217 /* Change into our new LSM profile. */
1218 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
1219 ret = ctx->lsm_ops->process_label_set_at(ctx->lsm_ops, fd_lsm, ctx->lsm_label, on_exec);
1220 close_prot_errno_disarm(fd_lsm);
1221 if (ret < 0)
1222 goto on_error;
1223
1224 TRACE("Set %s LSM label to \"%s\"", ctx->lsm_ops->name, ctx->lsm_label);
1225 }
1226
1227 if (conf->no_new_privs || (options->attach_flags & LXC_ATTACH_NO_NEW_PRIVS)) {
1228 ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
1229 prctl_arg(0), prctl_arg(0));
1230 if (ret < 0)
1231 goto on_error;
1232
1233 TRACE("Set PR_SET_NO_NEW_PRIVS");
1234 }
1235
1236 /* The following is done after the communication socket is shut down.
1237 * That way, all errors that might (though unlikely) occur up until this
1238 * point will have their messages printed to the original stderr (if
1239 * logging is so configured) and not the fd the user supplied, if any.
1240 */
1241
1242 /* Fd handling for stdin, stdout and stderr; ignore errors here, user
1243 * may want to make sure the fds are closed, for example.
1244 */
1245 if (options->stdin_fd >= 0 && options->stdin_fd != STDIN_FILENO)
1246 if (dup2(options->stdin_fd, STDIN_FILENO) < 0)
1247 SYSDEBUG("Failed to replace stdin with %d", options->stdin_fd);
1248
1249 if (options->stdout_fd >= 0 && options->stdout_fd != STDOUT_FILENO)
1250 if (dup2(options->stdout_fd, STDOUT_FILENO) < 0)
1251 SYSDEBUG("Failed to replace stdout with %d", options->stdout_fd);
1252
1253 if (options->stderr_fd >= 0 && options->stderr_fd != STDERR_FILENO)
1254 if (dup2(options->stderr_fd, STDERR_FILENO) < 0)
1255 SYSDEBUG("Failed to replace stderr with %d", options->stderr_fd);
1256
1257 /* close the old fds */
1258 if (options->stdin_fd > STDERR_FILENO)
1259 close(options->stdin_fd);
1260
1261 if (options->stdout_fd > STDERR_FILENO)
1262 close(options->stdout_fd);
1263
1264 if (options->stderr_fd > STDERR_FILENO)
1265 close(options->stderr_fd);
1266
1267 /*
1268 * Try to remove FD_CLOEXEC flag from stdin/stdout/stderr, but also
1269 * here, ignore errors.
1270 */
1271 for (int fd = STDIN_FILENO; fd <= STDERR_FILENO; fd++) {
1272 ret = fd_cloexec(fd, false);
1273 if (ret < 0) {
1274 SYSERROR("Failed to clear FD_CLOEXEC from file descriptor %d", fd);
1275 goto on_error;
1276 }
1277 }
1278
1279 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1280 ret = lxc_terminal_prepare_login(ap->terminal_pts_fd);
1281 if (ret < 0) {
1282 SYSERROR("Failed to prepare terminal file descriptor %d", ap->terminal_pts_fd);
1283 goto on_error;
1284 }
1285
1286 TRACE("Prepared terminal file descriptor %d", ap->terminal_pts_fd);
1287 }
1288
1289 /* Avoid unnecessary syscalls. */
1290 if (ctx->setup_ns_uid == ctx->target_ns_uid)
1291 ctx->target_ns_uid = LXC_INVALID_UID;
1292
1293 if (ctx->setup_ns_gid == ctx->target_ns_gid)
1294 ctx->target_ns_gid = LXC_INVALID_GID;
1295
1296 /*
1297 * Make sure that the processes STDIO is correctly owned by the user
1298 * that we are switching to.
1299 */
1300 ret = fix_stdio_permissions(ctx->target_ns_uid);
1301 if (ret)
1302 INFO("Failed to adjust stdio permissions");
1303
1304 if (conf->seccomp.seccomp) {
1305 ret = lxc_seccomp_load(conf);
1306 if (ret < 0)
1307 goto on_error;
1308
1309 TRACE("Loaded seccomp profile");
1310
1311 ret = lxc_seccomp_send_notifier_fd(&conf->seccomp, ap->ipc_socket);
1312 if (ret < 0)
1313 goto on_error;
1314 lxc_seccomp_close_notifier_fd(&conf->seccomp);
1315 }
1316
1317 if (!lxc_switch_uid_gid(ctx->target_ns_uid, ctx->target_ns_gid))
1318 goto on_error;
1319
1320 put_attach_payload(ap);
1321
1322 /* We're done, so we can now do whatever the user intended us to do. */
1323 _exit(attach_function(attach_function_args));
1324
1325 on_error:
1326 ERROR("Failed to attach to container");
1327 put_attach_payload(ap);
1328 _exit(EXIT_FAILURE);
1329 }
1330
/*
 * Initialise @terminal and create the terminal used for an attach session.
 *
 * Returns 0 on success and -1 (after logging) when terminal creation fails.
 */
static int lxc_attach_terminal(const char *name, const char *lxcpath, struct lxc_conf *conf,
			       struct lxc_terminal *terminal)
{
	/* Always start from a freshly initialised terminal structure. */
	lxc_terminal_init(terminal);

	if (lxc_terminal_create(name, lxcpath, conf, terminal) < 0)
		return log_error(-1, "Failed to create terminal");

	return 0;
}
1344
/*
 * Open the mainloop @descr and register the handlers of @terminal with it.
 *
 * Returns 0 on success, -1 on failure. If registering the handlers fails the
 * mainloop is closed again before returning.
 */
static int lxc_attach_terminal_mainloop_init(struct lxc_terminal *terminal,
					     struct lxc_async_descr *descr)
{
	if (lxc_mainloop_open(descr) < 0)
		return log_error(-1, "Failed to create mainloop");

	if (lxc_terminal_mainloop_add(descr, terminal) >= 0)
		return 0;

	/* Undo the open above so the caller is not left with a half set-up loop. */
	lxc_mainloop_close(descr);
	return log_error(-1, "Failed to add handlers to mainloop");
}
1362
1363 static inline void lxc_attach_terminal_close_ptx(struct lxc_terminal *terminal)
1364 {
1365 close_prot_errno_disarm(terminal->ptx);
1366 }
1367
1368 static inline void lxc_attach_terminal_close_pts(struct lxc_terminal *terminal)
1369 {
1370 close_prot_errno_disarm(terminal->pty);
1371 }
1372
1373 static inline void lxc_attach_terminal_close_peer(struct lxc_terminal *terminal)
1374 {
1375 close_prot_errno_disarm(terminal->peer);
1376 }
1377
1378 static inline void lxc_attach_terminal_close_log(struct lxc_terminal *terminal)
1379 {
1380 close_prot_errno_disarm(terminal->log_fd);
1381 }
1382
1383 int lxc_attach(struct lxc_container *container, lxc_attach_exec_t exec_function,
1384 void *exec_payload, lxc_attach_options_t *options,
1385 pid_t *attached_process)
1386 {
1387 int ret_parent = -1;
1388 struct lxc_async_descr descr = {};
1389 int ret;
1390 char *name, *lxcpath;
1391 int ipc_sockets[2];
1392 pid_t attached_pid, pid, to_cleanup_pid;
1393 struct attach_context *ctx;
1394 struct lxc_terminal terminal;
1395 struct lxc_conf *conf;
1396
1397 if (!container)
1398 return ret_set_errno(-1, EINVAL);
1399
1400 if (!lxc_container_get(container))
1401 return ret_set_errno(-1, EINVAL);
1402
1403 name = container->name;
1404 lxcpath = container->config_path;
1405
1406 if (!options) {
1407 options = &attach_static_default_options;
1408 options->lsm_label = NULL;
1409 }
1410
1411 ctx = alloc_attach_context();
1412 if (!ctx) {
1413 lxc_container_put(container);
1414 return log_error_errno(-ENOMEM, ENOMEM, "Failed to allocate attach context");
1415 }
1416
1417 ret = get_attach_context(ctx, container, options);
1418 if (ret) {
1419 put_attach_context(ctx);
1420 return log_error(-1, "Failed to get attach context");
1421 }
1422
1423 conf = ctx->container->lxc_conf;
1424
1425 if (!fetch_seccomp(ctx->container, options))
1426 WARN("Failed to get seccomp policy");
1427
1428 if (!no_new_privs(ctx->container, options))
1429 WARN("Could not determine whether PR_SET_NO_NEW_PRIVS is set");
1430
1431 ret = prepare_namespaces(ctx, options);
1432 if (ret) {
1433 put_attach_context(ctx);
1434 return log_error(-1, "Failed to get namespace file descriptors");
1435 }
1436
1437 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1438 ret = lxc_attach_terminal(name, lxcpath, conf, &terminal);
1439 if (ret < 0) {
1440 put_attach_context(ctx);
1441 return log_error(-1, "Failed to setup new terminal");
1442 }
1443
1444 terminal.log_fd = options->log_fd;
1445 } else {
1446 lxc_terminal_init(&terminal);
1447 }
1448
1449 /* Create a socket pair for IPC communication; set SOCK_CLOEXEC in order
1450 * to make sure we don't irritate other threads that want to fork+exec
1451 * away
1452 *
1453 * IMPORTANT: if the initial process is multithreaded and another call
1454 * just fork()s away without exec'ing directly after, the socket fd will
1455 * exist in the forked process from the other thread and any close() in
1456 * our own child process will not really cause the socket to close
1457 * properly, potentially causing the parent to get stuck.
1458 *
1459 * For this reason, while IPC is still active, we have to use shutdown()
1460 * if the child exits prematurely in order to signal that the socket is
1461 * closed and cannot assume that the child exiting will automatically do
1462 * that.
1463 *
1464 * IPC mechanism: (X is receiver)
1465 * initial process transient process attached process
1466 * X <--- send pid of
1467 * attached proc,
1468 * then exit
1469 * send 0 ------------------------------------> X
1470 * [do initialization]
1471 * X <------------------------------------ send 1
1472 * [add to cgroup, ...]
1473 * send 2 ------------------------------------> X
1474 * [set LXC_ATTACH_NO_NEW_PRIVS]
1475 * X <------------------------------------ send 3
1476 * [open LSM label fd]
1477 * send 4 ------------------------------------> X
1478 * [set LSM label]
1479 * close socket close socket
1480 * run program
1481 */
1482 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
1483 if (ret < 0) {
1484 put_attach_context(ctx);
1485 return log_error_errno(-1, errno, "Could not set up required IPC mechanism for attaching");
1486 }
1487
1488 /* Create transient process, two reasons:
1489 * 1. We can't setns() in the child itself, since we want to make
1490 * sure we are properly attached to the pidns.
1491 * 2. Also, the initial thread has to put the attached process
1492 * into the cgroup, which we can only do if we didn't already
1493 * setns() (otherwise, user namespaces will hate us).
1494 */
1495 pid = fork();
1496 if (pid < 0) {
1497 put_attach_context(ctx);
1498 return log_error_errno(-1, errno, "Failed to create first subprocess");
1499 }
1500
1501 if (pid == 0) {
1502 char *cwd, *new_cwd;
1503
1504 /* close unneeded file descriptors */
1505 close_prot_errno_disarm(ipc_sockets[0]);
1506
1507 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1508 lxc_attach_terminal_close_ptx(&terminal);
1509 lxc_attach_terminal_close_peer(&terminal);
1510 lxc_attach_terminal_close_log(&terminal);
1511 }
1512
1513 /* Wait for the parent to have setup cgroups. */
1514 if (!sync_wait(ipc_sockets[1], ATTACH_SYNC_CGROUP)) {
1515 shutdown(ipc_sockets[1], SHUT_RDWR);
1516 put_attach_context(ctx);
1517 _exit(EXIT_FAILURE);
1518 }
1519
1520 if (!attach_context_security_barrier(ctx)) {
1521 shutdown(ipc_sockets[1], SHUT_RDWR);
1522 put_attach_context(ctx);
1523 _exit(EXIT_FAILURE);
1524 }
1525
1526 cwd = getcwd(NULL, 0);
1527
1528 /*
1529 * Attach now, create another subprocess later, since pid
1530 * namespaces only really affect the children of the current
1531 * process.
1532 *
1533 * Note that this is a crucial barrier. We're no moving into
1534 * the container's context so we need to make sure to not leak
1535 * anything sensitive. That especially means things such as
1536 * open file descriptors!
1537 */
1538 ret = attach_namespaces(ctx, options);
1539 if (ret < 0) {
1540 ERROR("Failed to enter namespaces");
1541 shutdown(ipc_sockets[1], SHUT_RDWR);
1542 put_attach_context(ctx);
1543 _exit(EXIT_FAILURE);
1544 }
1545
1546 /* Attach succeeded, try to cwd. */
1547 if (options->initial_cwd)
1548 new_cwd = options->initial_cwd;
1549 else
1550 new_cwd = cwd;
1551 if (new_cwd) {
1552 ret = chdir(new_cwd);
1553 if (ret < 0)
1554 WARN("Could not change directory to \"%s\"", new_cwd);
1555 }
1556 free_disarm(cwd);
1557
1558 /* Create attached process. */
1559 pid = lxc_raw_clone(CLONE_PARENT, NULL);
1560 if (pid < 0) {
1561 SYSERROR("Failed to clone attached process");
1562 shutdown(ipc_sockets[1], SHUT_RDWR);
1563 put_attach_context(ctx);
1564 _exit(EXIT_FAILURE);
1565 }
1566
1567 if (pid == 0) {
1568 struct attach_payload ap = {
1569 .ipc_socket = ipc_sockets[1],
1570 .options = options,
1571 .ctx = ctx,
1572 .terminal_pts_fd = terminal.pty,
1573 .exec_function = exec_function,
1574 .exec_payload = exec_payload,
1575 };
1576
1577 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1578 ret = lxc_terminal_signal_sigmask_safe_blocked(&terminal);
1579 if (ret < 0) {
1580 SYSERROR("Failed to reset signal mask");
1581 _exit(EXIT_FAILURE);
1582 }
1583 }
1584
1585 /* Does not return. */
1586 do_attach(&ap);
1587 }
1588 TRACE("Attached process %d started initializing", pid);
1589
1590 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1591 lxc_attach_terminal_close_pts(&terminal);
1592
1593 /* Tell grandparent the pid of the pid of the newly created child. */
1594 if (!sync_wake_pid(ipc_sockets[1], pid)) {
1595 /* If this really happens here, this is very unfortunate, since
1596 * the parent will not know the pid of the attached process and
1597 * will not be able to wait for it (and we won't either due to
1598 * CLONE_PARENT) so the parent won't be able to reap it and the
1599 * attached process will remain a zombie.
1600 */
1601 shutdown(ipc_sockets[1], SHUT_RDWR);
1602 put_attach_context(ctx);
1603 _exit(EXIT_FAILURE);
1604 }
1605
1606 /* The rest is in the hands of the initial and the attached process. */
1607 put_attach_context(ctx);
1608 _exit(EXIT_SUCCESS);
1609 }
1610 TRACE("Transient process %d started initializing", pid);
1611
1612 to_cleanup_pid = pid;
1613
1614 /* close unneeded file descriptors */
1615 close_prot_errno_disarm(ipc_sockets[1]);
1616 put_namespaces(ctx);
1617 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1618 lxc_attach_terminal_close_pts(&terminal);
1619
1620 /* Attach to cgroup, if requested. */
1621 if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
1622 /*
1623 * If this is the unified hierarchy cgroup_attach() is
1624 * enough.
1625 */
1626 ret = cgroup_attach(conf, name, lxcpath, pid);
1627 if (ret) {
1628 call_cleaner(cgroup_exit) struct cgroup_ops *cgroup_ops = NULL;
1629 if (!ERRNO_IS_NOT_SUPPORTED(ret)) {
1630 SYSERROR("Failed to attach cgroup");
1631 goto on_error;
1632 }
1633
1634 cgroup_ops = cgroup_init(conf);
1635 if (!cgroup_ops)
1636 goto on_error;
1637
1638 if (!cgroup_ops->attach(cgroup_ops, conf, name, lxcpath, pid))
1639 goto on_error;
1640 }
1641
1642 TRACE("Moved transient process %d into container cgroup", pid);
1643 }
1644
1645 /*
1646 * Close sensitive file descriptors we don't need anymore. Even if
1647 * we're the parent.
1648 */
1649 if (!attach_context_security_barrier(ctx))
1650 goto on_error;
1651
1652 /* Setup /proc limits */
1653 ret = setup_proc_filesystem(conf, pid);
1654 if (ret < 0)
1655 goto on_error;
1656
1657 /* Setup resource limits */
1658 ret = setup_resource_limits(conf, pid);
1659 if (ret < 0)
1660 goto on_error;
1661
1662 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1663 ret = lxc_attach_terminal_mainloop_init(&terminal, &descr);
1664 if (ret < 0)
1665 goto on_error;
1666
1667 TRACE("Initialized terminal mainloop");
1668 }
1669
1670 /* Let the child process know to go ahead. */
1671 if (!sync_wake(ipc_sockets[0], ATTACH_SYNC_CGROUP))
1672 goto close_mainloop;
1673
1674 TRACE("Told transient process to start initializing");
1675
1676 /* Get pid of attached process from transient process. */
1677 if (!sync_wait_pid(ipc_sockets[0], &attached_pid))
1678 goto close_mainloop;
1679
1680 TRACE("Received pid %d of attached process in parent pid namespace", attached_pid);
1681
1682 /* Ignore SIGKILL (CTRL-C) and SIGQUIT (CTRL-\) - issue #313. */
1683 if (options->stdin_fd == STDIN_FILENO) {
1684 signal(SIGINT, SIG_IGN);
1685 signal(SIGQUIT, SIG_IGN);
1686 }
1687
1688 /* Reap transient process. */
1689 ret = wait_for_pid(pid);
1690 if (ret < 0)
1691 goto close_mainloop;
1692
1693 TRACE("Transient process %d exited", pid);
1694
1695 /* We will always have to reap the attached process now. */
1696 to_cleanup_pid = attached_pid;
1697
1698 /* Open LSM fd and send it to child. */
1699 if (attach_lsm(options) && ctx->lsm_label) {
1700 __do_close int fd_lsm = -EBADF;
1701 bool on_exec;
1702
1703 on_exec = options->attach_flags & LXC_ATTACH_LSM_EXEC ? true : false;
1704 fd_lsm = ctx->lsm_ops->process_label_fd_get(ctx->lsm_ops, attached_pid, on_exec);
1705 if (fd_lsm < 0)
1706 goto close_mainloop;
1707
1708 TRACE("Opened LSM label file descriptor %d", fd_lsm);
1709
1710 /* Send child fd of the LSM security module to write to. */
1711 if (!sync_wake_fd(ipc_sockets[0], fd_lsm)) {
1712 SYSERROR("Failed to send lsm label fd");
1713 goto close_mainloop;
1714 }
1715
1716 TRACE("Sent LSM label file descriptor %d to child", fd_lsm);
1717 }
1718
1719 if (conf->seccomp.seccomp) {
1720 ret = lxc_seccomp_recv_notifier_fd(&conf->seccomp, ipc_sockets[0]);
1721 if (ret < 0)
1722 goto close_mainloop;
1723
1724 ret = lxc_seccomp_add_notifier(name, lxcpath, &conf->seccomp);
1725 if (ret < 0)
1726 goto close_mainloop;
1727 }
1728
1729 /* We're done, the child process should now execute whatever it
1730 * is that the user requested. The parent can now track it with
1731 * waitpid() or similar.
1732 */
1733
1734 *attached_process = attached_pid;
1735
1736 /* Now shut down communication with child, we're done. */
1737 shutdown(ipc_sockets[0], SHUT_RDWR);
1738 close_prot_errno_disarm(ipc_sockets[0]);
1739
1740 ret_parent = 0;
1741 to_cleanup_pid = -1;
1742
1743 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1744 ret = lxc_mainloop(&descr, -1);
1745 if (ret < 0) {
1746 ret_parent = -1;
1747 to_cleanup_pid = attached_pid;
1748 }
1749 }
1750
1751 close_mainloop:
1752 if (options->attach_flags & LXC_ATTACH_TERMINAL)
1753 lxc_mainloop_close(&descr);
1754
1755 on_error:
1756 if (ipc_sockets[0] >= 0) {
1757 shutdown(ipc_sockets[0], SHUT_RDWR);
1758 close_prot_errno_disarm(ipc_sockets[0]);
1759 }
1760
1761 if (to_cleanup_pid > 0)
1762 (void)wait_for_pid(to_cleanup_pid);
1763
1764 if (options->attach_flags & LXC_ATTACH_TERMINAL) {
1765 lxc_terminal_delete(&terminal);
1766 lxc_terminal_conf_free(&terminal);
1767 }
1768
1769 put_attach_context(ctx);
1770 return ret_parent;
1771 }
1772
1773 int lxc_attach_run_command(void *payload)
1774 {
1775 int ret = -1;
1776 lxc_attach_command_t *cmd = payload;
1777
1778 ret = execvp(cmd->program, cmd->argv);
1779 if (ret < 0) {
1780 switch (errno) {
1781 case ENOEXEC:
1782 ret = 126;
1783 break;
1784 case ENOENT:
1785 ret = 127;
1786 break;
1787 }
1788 }
1789
1790 return log_error_errno(ret, errno, "Failed to exec \"%s\"", cmd->program);
1791 }
1792
1793 int lxc_attach_run_shell(void* payload)
1794 {
1795 __do_free char *buf = NULL;
1796 uid_t uid;
1797 struct passwd pwent;
1798 struct passwd *pwentp = NULL;
1799 char *user_shell;
1800 size_t bufsize;
1801 int ret;
1802
1803 /* Ignore payload parameter. */
1804 (void)payload;
1805
1806 uid = getuid();
1807
1808 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
1809 if (bufsize == -1)
1810 bufsize = 1024;
1811
1812 buf = malloc(bufsize);
1813 if (buf) {
1814 ret = getpwuid_r(uid, &pwent, buf, bufsize, &pwentp);
1815 if (!pwentp) {
1816 if (ret == 0)
1817 WARN("Could not find matched password record");
1818
1819 WARN("Failed to get password record - %u", uid);
1820 }
1821 }
1822
1823 /* This probably happens because of incompatible nss implementations in
1824 * host and container (remember, this code is still using the host's
1825 * glibc but our mount namespace is in the container) we may try to get
1826 * the information by spawning a [getent passwd uid] process and parsing
1827 * the result.
1828 */
1829 if (!pwentp)
1830 user_shell = lxc_attach_getpwshell(uid);
1831 else
1832 user_shell = pwent.pw_shell;
1833
1834 if (user_shell)
1835 execlp(user_shell, user_shell, (char *)NULL);
1836
1837 /* Executed if either no passwd entry or execvp fails, we will fall back
1838 * on /bin/sh as a default shell.
1839 */
1840 execlp("/bin/sh", "/bin/sh", (char *)NULL);
1841
1842 SYSERROR("Failed to execute shell");
1843 if (!pwentp)
1844 free(user_shell);
1845
1846 return -1;
1847 }