]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
conf: use saner mode for console
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
1d52bdf7 2
d38dd64a
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
9d257a2a 6#include <arpa/inet.h>
8f3e280e
CB
7#include <dirent.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <grp.h>
11#include <inttypes.h>
12#include <libgen.h>
9d257a2a
CB
13#include <linux/loop.h>
14#include <net/if.h>
15#include <netinet/in.h>
8f3e280e
CB
16#include <pwd.h>
17#include <stdarg.h>
0ad19a3f 18#include <stdio.h>
0ad19a3f 19#include <stdlib.h>
0ad19a3f 20#include <string.h>
8f3e280e
CB
21#include <sys/mman.h>
22#include <sys/mount.h>
23#include <sys/param.h>
24#include <sys/prctl.h>
6a49f05e 25#include <sys/sendfile.h>
8f3e280e 26#include <sys/socket.h>
9d257a2a 27#include <sys/stat.h>
2d76d1d7 28#include <sys/syscall.h>
9d257a2a 29#include <sys/sysmacros.h>
97e9cfa0 30#include <sys/types.h>
8f3e280e
CB
31#include <sys/utsname.h>
32#include <sys/wait.h>
9d257a2a
CB
33#include <time.h>
34#include <unistd.h>
1d52bdf7 35
d38dd64a
CB
36#include "af_unix.h"
37#include "caps.h"
5f126977 38#include "cgroups/cgroup.h"
d38dd64a
CB
39#include "conf.h"
40#include "config.h"
41#include "confile.h"
42#include "confile_utils.h"
43#include "error.h"
44#include "log.h"
45#include "lsm/lsm.h"
46#include "lxclock.h"
47#include "lxcseccomp.h"
48#include "macro.h"
2f443e88 49#include "memory_utils.h"
7f88a1a2 50#include "mount_utils.h"
d38dd64a
CB
51#include "namespace.h"
52#include "network.h"
53#include "parse.h"
f40988c7 54#include "process_utils.h"
d38dd64a
CB
55#include "ringbuf.h"
56#include "start.h"
5f126977 57#include "storage/storage.h"
d38dd64a 58#include "storage/overlay.h"
6b3d24d7 59#include "syscall_wrappers.h"
d38dd64a
CB
60#include "terminal.h"
61#include "utils.h"
20502652 62#include "uuid.h"
d38dd64a 63
af6824fc 64#ifdef MAJOR_IN_MKDEV
9d257a2a 65#include <sys/mkdev.h>
af6824fc 66#endif
af6824fc 67
614305f3 68#ifdef HAVE_STATVFS
2938f7c8 69#include <sys/statvfs.h>
614305f3 70#endif
e827ff7e 71
35eb5cdc 72#if HAVE_OPENPTY
b0a33c1e 73#include <pty.h>
e827ff7e
SG
74#else
75#include <../include/openpty.h>
76#endif
0ad19a3f 77
9d257a2a
CB
78#if HAVE_LIBCAP
79#include <sys/capability.h>
80#endif
81
82#if HAVE_SYS_PERSONALITY_H
83#include <sys/personality.h>
84#endif
85
f1e05b90
DJ
86#ifndef HAVE_STRLCAT
87#include "include/strlcat.h"
88#endif
89
9d257a2a
CB
90#if IS_BIONIC
91#include <../include/lxcmntent.h>
92#else
93#include <mntent.h>
94#endif
95
96#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
97#include <../include/prlimit.h>
98#endif
99
ac2cecc4 100lxc_log_define(conf, lxc);
e5bda9ee 101
0fd73091
CB
/*
 * Configuration of the container that the API call in progress is working
 * on. The error calls make use of it. The variable is thread-local when
 * HAVE_TLS is defined and a plain global otherwise.
 */
#ifdef HAVE_TLS
thread_local
#endif
struct lxc_conf *current_config;
8912711c 110
0fd73091
CB
111char *lxchook_names[NUM_LXC_HOOKS] = {
112 "pre-start",
113 "pre-mount",
114 "mount",
115 "autodev",
116 "start",
117 "stop",
118 "post-stop",
119 "clone",
120 "destroy",
121 "start-host"
122};
72d0e1cb 123
998ac676
RT
/* One mount option keyword and the mount flag bits associated with it. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* presumably non-zero when the keyword removes @flag (e.g. "rw" vs MS_RDONLY) - confirm against the parser */
	int flag;	/* MS_* bit(s) the keyword refers to */
};
129
81810dd1
DL
/* A capability keyword paired with its CAP_* value. */
struct caps_opt {
	char *name;	/* keyword, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
134
c6d09e15
WB
/* A resource limit keyword paired with its RLIMIT_* value. */
struct limit_opt {
	char *name;	/* keyword, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
139
998ac676 140static struct mount_opt mount_opt[] = {
470b359b
CB
141 { "async", 1, MS_SYNCHRONOUS },
142 { "atime", 1, MS_NOATIME },
143 { "bind", 0, MS_BIND },
88d413d5 144 { "defaults", 0, 0 },
88d413d5 145 { "dev", 1, MS_NODEV },
470b359b 146 { "diratime", 1, MS_NODIRATIME },
88d413d5 147 { "dirsync", 0, MS_DIRSYNC },
470b359b 148 { "exec", 1, MS_NOEXEC },
8912711c 149 { "lazytime", 0, MS_LAZYTIME },
88d413d5 150 { "mand", 0, MS_MANDLOCK },
88d413d5 151 { "noatime", 0, MS_NOATIME },
470b359b 152 { "nodev", 0, MS_NODEV },
88d413d5 153 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
154 { "noexec", 0, MS_NOEXEC },
155 { "nomand", 1, MS_MANDLOCK },
156 { "norelatime", 1, MS_RELATIME },
157 { "nostrictatime", 1, MS_STRICTATIME },
158 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
159 { "rbind", 0, MS_BIND|MS_REC },
160 { "relatime", 0, MS_RELATIME },
470b359b
CB
161 { "remount", 0, MS_REMOUNT },
162 { "ro", 0, MS_RDONLY },
163 { "rw", 1, MS_RDONLY },
88d413d5 164 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
165 { "suid", 1, MS_NOSUID },
166 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 167 { NULL, 0, 0 },
998ac676
RT
168};
169
d840039e 170static struct mount_opt propagation_opt[] = {
0fd73091
CB
171 { "private", 0, MS_PRIVATE },
172 { "shared", 0, MS_SHARED },
173 { "slave", 0, MS_SLAVE },
174 { "unbindable", 0, MS_UNBINDABLE },
175 { "rprivate", 0, MS_PRIVATE|MS_REC },
176 { "rshared", 0, MS_SHARED|MS_REC },
177 { "rslave", 0, MS_SLAVE|MS_REC },
178 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
179 { NULL, 0, 0 },
d840039e
YT
180};
181
81810dd1 182static struct caps_opt caps_opt[] = {
8560cd36 183#if HAVE_LIBCAP
7b4cd468
CB
184 { "chown", CAP_CHOWN },
185 { "dac_override", CAP_DAC_OVERRIDE },
186 { "dac_read_search", CAP_DAC_READ_SEARCH },
187 { "fowner", CAP_FOWNER },
188 { "fsetid", CAP_FSETID },
189 { "kill", CAP_KILL },
190 { "setgid", CAP_SETGID },
191 { "setuid", CAP_SETUID },
192 { "setpcap", CAP_SETPCAP },
193 { "linux_immutable", CAP_LINUX_IMMUTABLE },
194 { "net_bind_service", CAP_NET_BIND_SERVICE },
195 { "net_broadcast", CAP_NET_BROADCAST },
196 { "net_admin", CAP_NET_ADMIN },
197 { "net_raw", CAP_NET_RAW },
198 { "ipc_lock", CAP_IPC_LOCK },
199 { "ipc_owner", CAP_IPC_OWNER },
200 { "sys_module", CAP_SYS_MODULE },
201 { "sys_rawio", CAP_SYS_RAWIO },
202 { "sys_chroot", CAP_SYS_CHROOT },
203 { "sys_ptrace", CAP_SYS_PTRACE },
204 { "sys_pacct", CAP_SYS_PACCT },
205 { "sys_admin", CAP_SYS_ADMIN },
206 { "sys_boot", CAP_SYS_BOOT },
207 { "sys_nice", CAP_SYS_NICE },
208 { "sys_resource", CAP_SYS_RESOURCE },
209 { "sys_time", CAP_SYS_TIME },
210 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
211 { "mknod", CAP_MKNOD },
212 { "lease", CAP_LEASE },
213 { "audit_write", CAP_AUDIT_WRITE },
214 { "audit_control", CAP_AUDIT_CONTROL },
215 { "setfcap", CAP_SETFCAP },
216 { "mac_override", CAP_MAC_OVERRIDE },
217 { "mac_admin", CAP_MAC_ADMIN },
218 { "syslog", CAP_SYSLOG },
219 { "wake_alarm", CAP_WAKE_ALARM },
220 { "block_suspend", CAP_BLOCK_SUSPEND },
221 { "audit_read", CAP_AUDIT_READ },
222 { "perfmon", CAP_PERFMON },
223 { "bpf", CAP_BPF },
224 { "checkpoint_restore", CAP_CHECKPOINT_RESTORE },
2b54359b 225#endif
8560cd36 226};
81810dd1 227
c6d09e15
WB
228static struct limit_opt limit_opt[] = {
229#ifdef RLIMIT_AS
230 { "as", RLIMIT_AS },
231#endif
232#ifdef RLIMIT_CORE
233 { "core", RLIMIT_CORE },
234#endif
235#ifdef RLIMIT_CPU
236 { "cpu", RLIMIT_CPU },
237#endif
238#ifdef RLIMIT_DATA
239 { "data", RLIMIT_DATA },
240#endif
241#ifdef RLIMIT_FSIZE
242 { "fsize", RLIMIT_FSIZE },
243#endif
244#ifdef RLIMIT_LOCKS
245 { "locks", RLIMIT_LOCKS },
246#endif
247#ifdef RLIMIT_MEMLOCK
248 { "memlock", RLIMIT_MEMLOCK },
249#endif
250#ifdef RLIMIT_MSGQUEUE
251 { "msgqueue", RLIMIT_MSGQUEUE },
252#endif
253#ifdef RLIMIT_NICE
254 { "nice", RLIMIT_NICE },
255#endif
256#ifdef RLIMIT_NOFILE
257 { "nofile", RLIMIT_NOFILE },
258#endif
259#ifdef RLIMIT_NPROC
260 { "nproc", RLIMIT_NPROC },
261#endif
262#ifdef RLIMIT_RSS
263 { "rss", RLIMIT_RSS },
264#endif
265#ifdef RLIMIT_RTPRIO
266 { "rtprio", RLIMIT_RTPRIO },
267#endif
268#ifdef RLIMIT_RTTIME
269 { "rttime", RLIMIT_RTTIME },
270#endif
271#ifdef RLIMIT_SIGPENDING
272 { "sigpending", RLIMIT_SIGPENDING },
273#endif
274#ifdef RLIMIT_STACK
275 { "stack", RLIMIT_STACK },
276#endif
277};
278
91c3830e
SH
279static int run_buffer(char *buffer)
280{
cc6a0e78 281 __do_free char *output = NULL;
55022530 282 __do_lxc_pclose struct lxc_popen_FILE *f = NULL;
ebf3a6af 283 int fd, ret;
91c3830e 284
ebec9176 285 f = lxc_popen(buffer);
55022530
CB
286 if (!f)
287 return log_error_errno(-1, errno, "Failed to popen() %s", buffer);
91c3830e
SH
288
289 output = malloc(LXC_LOG_BUFFER_SIZE);
55022530
CB
290 if (!output)
291 return log_error_errno(-1, ENOMEM, "Failed to allocate memory for %s", buffer);
91c3830e 292
ebf3a6af 293 fd = fileno(f->f);
55022530
CB
294 if (fd < 0)
295 return log_error_errno(-1, errno, "Failed to retrieve underlying file descriptor");
ebf3a6af
CB
296
297 for (int i = 0; i < 10; i++) {
298 ssize_t bytes_read;
299
300 bytes_read = lxc_read_nointr(fd, output, LXC_LOG_BUFFER_SIZE - 1);
301 if (bytes_read > 0) {
302 output[bytes_read] = '\0';
303 DEBUG("Script %s produced output: %s", buffer, output);
304 continue;
305 }
306
307 break;
308 }
91c3830e 309
55022530
CB
310 ret = lxc_pclose(move_ptr(f));
311 if (ret == -1)
312 return log_error_errno(-1, errno, "Script exited with error");
313 else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0)
314 return log_error(-1, "Script exited with status %d", WEXITSTATUS(ret));
315 else if (WIFSIGNALED(ret))
316 return log_error(-1, "Script terminated by signal %d", WTERMSIG(ret));
91c3830e
SH
317
318 return 0;
319}
320
14a7b0f9
CB
321int run_script_argv(const char *name, unsigned int hook_version,
322 const char *section, const char *script,
586b1ce7 323 const char *hookname, char **argv)
148e91f5 324{
e1a94937 325 __do_free char *buffer = NULL;
3f60c2f7 326 int buf_pos, i, ret;
d08e5708 327 size_t size = 0;
148e91f5 328
3f60c2f7 329 if (hook_version == 0)
55022530
CB
330 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
331 script, name, section);
3f60c2f7
CB
332 else
333 INFO("Executing script \"%s\" for container \"%s\"", script, name);
148e91f5 334
586b1ce7
CB
335 for (i = 0; argv && argv[i]; i++)
336 size += strlen(argv[i]) + 1;
148e91f5 337
6333c915
CB
338 size += STRLITERALLEN("exec");
339 size++;
148e91f5 340 size += strlen(script);
3f60c2f7
CB
341 size++;
342
148e91f5 343 if (size > INT_MAX)
3f60c2f7 344 return -EFBIG;
148e91f5 345
3f60c2f7 346 if (hook_version == 0) {
d08e5708
CB
347 size += strlen(hookname);
348 size++;
349
350 size += strlen(name);
351 size++;
352
353 size += strlen(section);
354 size++;
355
356 if (size > INT_MAX)
357 return -EFBIG;
327cce76 358 }
3f60c2f7 359
6f8d00d2
CB
360 buffer = malloc(size);
361 if (!buffer)
362 return -ENOMEM;
363
327cce76 364 if (hook_version == 0)
9bcde680 365 buf_pos = strnprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
327cce76 366 else
9bcde680
CB
367 buf_pos = strnprintf(buffer, size, "exec %s", script);
368 if (buf_pos < 0)
55022530 369 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 370
327cce76 371 if (hook_version == 1) {
3f60c2f7
CB
372 ret = setenv("LXC_HOOK_TYPE", hookname, 1);
373 if (ret < 0) {
55022530 374 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7 375 }
90f20466 376 TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7
CB
377
378 ret = setenv("LXC_HOOK_SECTION", section, 1);
55022530
CB
379 if (ret < 0)
380 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_SECTION=%s", section);
3f60c2f7 381 TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);
14a7b0f9 382
71528742 383 if (strequal(section, "net")) {
14a7b0f9
CB
384 char *parent;
385
586b1ce7 386 if (!argv || !argv[0])
e1a94937 387 return -1;
14a7b0f9 388
586b1ce7 389 ret = setenv("LXC_NET_TYPE", argv[0], 1);
55022530
CB
390 if (ret < 0)
391 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_TYPE=%s", argv[0]);
586b1ce7 392 TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);
14a7b0f9 393
586b1ce7 394 parent = argv[1] ? argv[1] : "";
14a7b0f9 395
71528742 396 if (strequal(argv[0], "macvlan")) {
14a7b0f9 397 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
398 if (ret < 0)
399 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 400 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
71528742 401 } else if (strequal(argv[0], "phys")) {
14a7b0f9 402 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
403 if (ret < 0)
404 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 405 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
71528742 406 } else if (strequal(argv[0], "veth")) {
586b1ce7 407 char *peer = argv[2] ? argv[2] : "";
14a7b0f9
CB
408
409 ret = setenv("LXC_NET_PEER", peer, 1);
55022530
CB
410 if (ret < 0)
411 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PEER=%s", peer);
14a7b0f9
CB
412 TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
413
414 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
415 if (ret < 0)
416 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9
CB
417 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
418 }
419 }
148e91f5
SH
420 }
421
586b1ce7 422 for (i = 0; argv && argv[i]; i++) {
3f60c2f7
CB
423 size_t len = size - buf_pos;
424
9bcde680
CB
425 ret = strnprintf(buffer + buf_pos, len, " %s", argv[i]);
426 if (ret < 0)
55022530 427 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 428 buf_pos += ret;
148e91f5
SH
429 }
430
e1a94937 431 return run_buffer(buffer);
148e91f5
SH
432}
433
811ef482 434int run_script(const char *name, const char *section, const char *script, ...)
e3b4c4c4 435{
2f443e88 436 __do_free char *buffer = NULL;
abbfd20b 437 int ret;
2f443e88 438 char *p;
abbfd20b 439 va_list ap;
0fd73091 440 size_t size = 0;
751d9dcd 441
0fd73091 442 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
751d9dcd 443 script, name, section);
e3b4c4c4 444
abbfd20b
DL
445 va_start(ap, script);
446 while ((p = va_arg(ap, char *)))
95642a10 447 size += strlen(p) + 1;
abbfd20b
DL
448 va_end(ap);
449
6333c915 450 size += STRLITERALLEN("exec");
abbfd20b
DL
451 size += strlen(script);
452 size += strlen(name);
453 size += strlen(section);
6d1a5f93 454 size += 4;
abbfd20b 455
95642a10
MS
456 if (size > INT_MAX)
457 return -1;
458
2f443e88 459 buffer = must_realloc(NULL, size);
9bcde680
CB
460 ret = strnprintf(buffer, size, "exec %s %s %s", script, name, section);
461 if (ret < 0)
9ba8130c 462 return -1;
751d9dcd 463
abbfd20b 464 va_start(ap, script);
9ba8130c 465 while ((p = va_arg(ap, char *))) {
062b72c6 466 int len = size - ret;
9ba8130c 467 int rc;
9bcde680
CB
468 rc = strnprintf(buffer + ret, len, " %s", p);
469 if (rc < 0) {
7b5a2435 470 va_end(ap);
9ba8130c 471 return -1;
7b5a2435 472 }
9ba8130c
SH
473 ret += rc;
474 }
abbfd20b 475 va_end(ap);
751d9dcd 476
91c3830e 477 return run_buffer(buffer);
e3b4c4c4
ST
478}
479
79ff643d 480/* lxc_rootfs_prepare
63fc76c3 481 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
482 * the duration of the container run, to prevent the container from marking
483 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
484 * no name pollution is happens.
485 * don't unlink on NFS to avoid random named stale handles.
0c547523 486 */
79ff643d 487int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns)
0c547523 488{
79ff643d
CB
489 __do_close int dfd_path = -EBADF, fd_pin = -EBADF;
490 int ret;
491 struct stat st;
492 struct statfs stfs;
0c547523 493
79ff643d
CB
494 if (rootfs->path) {
495 if (rootfs->bdev_type &&
71528742
CB
496 (strequal(rootfs->bdev_type, "overlay") ||
497 strequal(rootfs->bdev_type, "overlayfs")))
79ff643d 498 return log_trace_errno(0, EINVAL, "Not pinning on stacking filesystem");
e99ee0de 499
79ff643d
CB
500 dfd_path = open_at(-EBADF, rootfs->path, PROTECT_OPATH_FILE, 0, 0);
501 } else {
502 dfd_path = open_at(-EBADF, "/", PROTECT_OPATH_FILE, PROTECT_LOOKUP_ABSOLUTE, 0);
503 }
504 if (dfd_path < 0)
505 return log_error_errno(-errno, errno, "Failed to open \"%s\"", rootfs->path);
506
507 if (!rootfs->path)
508 return log_trace(0, "Not pinning because container does not have a rootfs");
0c547523 509
79ff643d
CB
510 if (userns)
511 return log_trace(0, "Not pinning because container runs in user namespace");
512
513 ret = fstat(dfd_path, &st);
957c4704 514 if (ret < 0)
79ff643d 515 return log_trace_errno(-errno, errno, "Failed to retrieve file status");
0c547523 516
79ff643d
CB
517 if (!S_ISDIR(st.st_mode))
518 return log_trace_errno(0, ENOTDIR, "Not pinning because file descriptor is not a directory");
0c547523 519
79ff643d
CB
520 fd_pin = open_at(dfd_path, ".lxc_keep",
521 PROTECT_OPEN | O_CREAT,
522 PROTECT_LOOKUP_BENEATH,
523 S_IWUSR | S_IRUSR);
524 if (fd_pin < 0)
525 return log_error_errno(-errno, errno, "Failed to pin rootfs");
0c547523 526
79ff643d 527 TRACE("Pinned rootfs %d(.lxc_keep)", fd_pin);
0fd73091 528
79ff643d
CB
529 ret = fstatfs(fd_pin, &stfs);
530 if (ret < 0) {
531 SYSWARN("Failed to retrieve filesystem status");
532 goto out;
533 }
63fc76c3 534
79ff643d
CB
535 if (stfs.f_type == NFS_SUPER_MAGIC) {
536 DEBUG("Not unlinking pinned file on NFS");
537 goto out;
538 }
63fc76c3 539
79ff643d
CB
540 if (unlinkat(dfd_path, ".lxc_keep", 0))
541 SYSTRACE("Failed to unlink rootfs pinning file %d(.lxc_keep)", dfd_path);
542 else
543 TRACE("Unlinked pinned file %d(.lxc_keep)", dfd_path);
0fd73091 544
79ff643d
CB
545out:
546 rootfs->fd_path_pin = move_fd(fd_pin);
547 return 0;
0c547523
SH
548}
549
6b741397
CB
550static int add_shmount_to_list(struct lxc_conf *conf)
551{
6b5a54cd 552 char new_mount[PATH_MAX];
0d190408 553 /* Offset for the leading '/' since the path_cont
6b741397
CB
554 * is absolute inside the container.
555 */
556 int offset = 1, ret = -1;
0d190408 557
9bcde680 558 ret = strnprintf(new_mount, sizeof(new_mount),
6b741397
CB
559 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
560 conf->shmount.path_cont + offset);
9bcde680 561 if (ret < 0)
0d190408
LT
562 return -1;
563
6b741397 564 return add_elem_to_mount_list(new_mount, conf);
0d190408
LT
565}
566
6d25a524 567static int lxc_mount_auto_mounts(struct lxc_handler *handler, int flags)
368bbc02 568{
7b371c1e 569 int i, ret;
b06b8511
CS
570 static struct {
571 int match_mask;
572 int match_flag;
573 const char *source;
574 const char *destination;
575 const char *fstype;
576 unsigned long flags;
577 const char *options;
e8b9c9ec 578 bool requires_cap_net_admin;
b06b8511 579 } default_mounts[] = {
0fd73091
CB
580 /* Read-only bind-mounting... In older kernels, doing that
581 * required to do one MS_BIND mount and then
582 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
583 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
584 * onwards. However, this apparently does not work on kernel
585 * 3.8. Unfortunately, on that very same kernel, doing the same
586 * trick as above doesn't seem to work either, there one needs
587 * to ALSO specify MS_BIND for the remount, otherwise the
588 * entire fs is remounted read-only or the mount fails because
589 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
590 * kernels as low as 2.6.32...
368bbc02 591 */
5d1bf4c4 592 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
592fd47a 593 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
5d1bf4c4
CB
594 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL, true },
595 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL, false },
596 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
597 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL, true },
598 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL, false },
599 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
600 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
601 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL, false },
602 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL, false },
603 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
604 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL, false },
605 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
606 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL, false },
607 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
608 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
609 { 0, 0, NULL, NULL, NULL, 0, NULL, false }
b06b8511 610 };
6d25a524 611 struct lxc_conf *conf = handler->conf;
e25af1bc
CB
612 struct lxc_rootfs *rootfs = &conf->rootfs;
613 bool has_cap_net_admin;
368bbc02 614
f4bea7cc 615 if (flags & LXC_AUTO_PROC_MASK) {
ea57e424 616 ret = mkdirat(rootfs->dfd_mnt, "proc" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
f4bea7cc
CB
617 if (ret < 0 && errno != EEXIST)
618 return log_error_errno(-errno, errno,
ea57e424 619 "Failed to create proc mountpoint under %d", rootfs->dfd_mnt);
f4bea7cc
CB
620 }
621
622 if (flags & LXC_AUTO_SYS_MASK) {
ea57e424 623 ret = mkdirat(rootfs->dfd_mnt, "sys" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
f4bea7cc
CB
624 if (ret < 0 && errno != EEXIST)
625 return log_error_errno(-errno, errno,
ea57e424 626 "Failed to create sysfs mountpoint under %d", rootfs->dfd_mnt);
f4bea7cc
CB
627 }
628
e25af1bc 629 has_cap_net_admin = lxc_wants_cap(CAP_NET_ADMIN, conf);
d84b26bc 630 for (i = 0; default_mounts[i].match_mask; i++) {
8db92302 631 __do_free char *destination = NULL, *source = NULL;
0fd73091
CB
632 int saved_errno;
633 unsigned long mflags;
0fd73091
CB
634 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
635 continue;
636
637 if (default_mounts[i].source) {
cc4fd506 638 /* will act like strdup if %r is not present */
e25af1bc 639 source = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].source);
0fd73091 640 if (!source)
cc4fd506 641 return -1;
0fd73091 642 }
f24a52d5 643
55022530
CB
644 if (!default_mounts[i].destination)
645 return log_error(-1, "BUG: auto mounts destination %d was NULL", i);
0fd73091 646
e8b9c9ec 647 if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
648 TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
649 continue;
650 }
651
0fd73091 652 /* will act like strdup if %r is not present */
e25af1bc 653 destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
55022530 654 if (!destination)
0fd73091 655 return -1;
0fd73091
CB
656
657 mflags = add_required_remount_flags(source, destination,
658 default_mounts[i].flags);
7b371c1e
CB
659 ret = safe_mount(source, destination, default_mounts[i].fstype,
660 mflags, default_mounts[i].options,
661 rootfs->path ? rootfs->mount : NULL);
0fd73091 662 saved_errno = errno;
7b371c1e 663 if (ret < 0 && errno == ENOENT) {
55022530 664 INFO("Mount source or target for \"%s\" on \"%s\" does not exist. Skipping", source, destination);
7b371c1e
CB
665 ret = 0;
666 } else if (ret < 0) {
0fd73091
CB
667 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
668 }
669
7b371c1e 670 if (ret < 0) {
0fd73091
CB
671 errno = saved_errno;
672 return -1;
368bbc02 673 }
368bbc02
CS
674 }
675
b06b8511 676 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
677 int cg_flags;
678
3f69fb12 679 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
680 /* If the type of cgroup mount was not specified, it depends on
681 * the container's capabilities as to what makes sense: if we
682 * have CAP_SYS_ADMIN, the read-only part can be remounted
683 * read-write anyway, so we may as well default to read-write;
684 * then the admin will not be given a false sense of security.
685 * (And if they really want mixed r/o r/w, then they can
686 * explicitly specify :mixed.) OTOH, if the container lacks
687 * CAP_SYS_ADMIN, do only default to :mixed, because then the
688 * container can't remount it read-write.
689 */
9394b6dc 690 if ((cg_flags == LXC_AUTO_CGROUP_NOSPEC) || (cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC)) {
0769b82a 691 int has_sys_admin = 0;
b0ee5983
CB
692
693 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 694 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 695 else
0769b82a 696 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
697
698 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 699 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 700 else
0769b82a 701 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 702 }
0fd73091 703
3f69fb12 704 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
705 cg_flags |= LXC_AUTO_CGROUP_FORCE;
706
ab8cd5d9 707 if (!handler->cgroup_ops->mount(handler->cgroup_ops, handler, cg_flags))
55022530 708 return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
368bbc02
CS
709 }
710
0d190408 711 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
7b371c1e 712 ret = add_shmount_to_list(conf);
55022530
CB
713 if (ret < 0)
714 return log_error(-1, "Failed to add shmount entry to container config");
0d190408
LT
715 }
716
368bbc02 717 return 0;
368bbc02
CS
718}
719
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to configure. Returns 0 on success
 * and -1 when sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0)
		return log_error_errno(-1, errno, "Failed to set the hostname to \"%s\"",
				       hostname);

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
736
69aa6655
DE
/* A symlink called @name that points at @oldpath. */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* Standard stream links that point into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
748
ed8704d0 749static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 750{
79019997
CB
751 for (int i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
752 int ret;
753 struct stat s;
69aa6655 754 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091 755
79019997
CB
756 /*
757 * Stat the path first. If we don't get an error accept it as
0fd73091 758 * is and don't try to create it
09227be2 759 */
a5a08920 760 ret = fstatat(rootfs->dfd_dev, d->name, &s, 0);
0fd73091 761 if (ret == 0)
09227be2 762 continue;
09227be2 763
a5a08920 764 ret = symlinkat(d->oldpath, rootfs->dfd_dev, d->name);
79019997
CB
765 if (ret) {
766 switch (errno) {
767 case EROFS:
768 WARN("Failed to create \"%s\" on read-only filesystem", d->name);
769 __fallthrough;
770 case EEXIST:
771 break;
772 default:
773 return log_error_errno(-errno, errno, "Failed to create \"%s\"", d->name);
774 }
69aa6655
DE
775 }
776 }
0fd73091 777
69aa6655
DE
778 return 0;
779}
780
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * The first call (*pp == NULL) allocates "container_ttys=<name>"; every
 * later call grows the string and appends " <name>".
 *
 * Returns true on success. On allocation failure false is returned and *pp
 * is left unchanged.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t list_len;
	char *list;

	if (!*pp) {
		list = malloc(prefix_len + name_len + 1);
		if (!list)
			return false;

		memcpy(list, prefix, prefix_len);
		/* Copies the terminating NUL byte of @name as well. */
		memcpy(list + prefix_len, name, name_len + 1);

		*pp = list;
		return true;
	}

	list_len = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	list = realloc(*pp, list_len + name_len + 2);
	if (!list)
		return false;

	list[list_len] = ' ';
	memcpy(list + list_len + 1, name, name_len + 1);

	*pp = list;
	return true;
}
807
2187efd3 808static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 809{
7369e6bf
CB
810 int ret;
811 struct lxc_rootfs *rootfs = &conf->rootfs;
0e4be3cf 812 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 813 char *ttydir = ttys->dir;
b0a33c1e 814
e8bd4e43 815 if (!conf->rootfs.path)
bc9bd0e3
DL
816 return 0;
817
7369e6bf
CB
818 for (int i = 0; i < ttys->max; i++) {
819 __do_close int fd_to = -EBADF;
0e4be3cf 820 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 821
7c6ef2a2 822 if (ttydir) {
7369e6bf 823 char *tty_name, *tty_path;
9e1045e3 824
9bcde680 825 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf),
7369e6bf 826 "/dev/%s/tty%d", ttydir, i + 1);
9bcde680 827 if (ret < 0)
7369e6bf
CB
828 return ret_errno(-EIO);
829
830 tty_path = &rootfs->buf[STRLITERALLEN("/dev/")];
831 tty_name = tty_path + strlen(ttydir) + 1;
832
833 /* create bind-mount target */
834 fd_to = open_at(rootfs->dfd_dev, tty_path,
835 PROTECT_OPEN_W | O_CREAT,
836 PROTECT_LOOKUP_BENEATH, 0);
837 if (fd_to < 0)
838 return log_error_errno(-errno, errno,
839 "Failed to create tty mount target %d(%s)",
840 rootfs->dfd_dev, tty_path);
841
842 ret = unlinkat(rootfs->dfd_dev, tty_name, 0);
843 if (ret < 0 && errno != ENOENT)
844 return log_error_errno(-errno, errno,
845 "Failed to unlink %d(%s)",
846 rootfs->dfd_dev, tty_name);
847
de7f9f33 848 if (can_use_mount_api()) {
7369e6bf
CB
849 ret = fd_bind_mount(tty->pty, "",
850 PROTECT_OPATH_FILE,
851 PROTECT_LOOKUP_BENEATH_XDEV,
852 fd_to, "",
853 PROTECT_OPATH_FILE,
854 PROTECT_LOOKUP_BENEATH_XDEV, 0,
855 false);
856 } else {
857 ret = mount(tty->name, rootfs->buf, "none", MS_BIND, 0);
7c6ef2a2 858 }
7369e6bf
CB
859 if (ret < 0)
860 return log_error_errno(-errno, errno,
861 "Failed to bind mount \"%s\" onto \"%s\"",
862 tty->name, rootfs->buf);
863 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
9e1045e3 864
7369e6bf 865 ret = symlinkat(tty_path, rootfs->dfd_dev, tty_name);
55022530 866 if (ret < 0)
7369e6bf
CB
867 return log_error_errno(-errno, errno,
868 "Failed to create symlink \"%d(%s)\" -> \"%d(%s)\"",
869 rootfs->dfd_dev, tty_name,
870 rootfs->dfd_dev, tty_path);
7c6ef2a2 871 } else {
9bcde680
CB
872 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "tty%d", i + 1);
873 if (ret < 0)
7369e6bf
CB
874 return ret_errno(-EIO);
875
876 /* If we populated /dev, then we need to create /dev/tty<idx>. */
877 fd_to = open_at(rootfs->dfd_dev, rootfs->buf,
878 PROTECT_OPEN_W | O_CREAT,
879 PROTECT_LOOKUP_BENEATH, 0);
880 if (fd_to < 0)
881 return log_error_errno(-errno, errno,
882 "Failed to create tty mount target %d(%s)",
883 rootfs->dfd_dev, rootfs->buf);
884
de7f9f33 885 if (can_use_mount_api()) {
7369e6bf
CB
886 ret = fd_bind_mount(tty->pty, "",
887 PROTECT_OPATH_FILE,
888 PROTECT_LOOKUP_BENEATH_XDEV,
889 fd_to, "",
890 PROTECT_OPATH_FILE,
891 PROTECT_LOOKUP_BENEATH, 0,
892 false);
893 } else {
9bcde680
CB
894 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/dev/tty%d", i + 1);
895 if (ret < 0)
7369e6bf
CB
896 return ret_errno(-EIO);
897
898 ret = mount(tty->name, rootfs->buf, "none", MS_BIND, 0);
7c6ef2a2 899 }
7369e6bf
CB
900 if (ret < 0)
901 return log_error_errno(-errno, errno,
902 "Failed to bind mount \"%s\" onto \"%s\"",
903 tty->name, rootfs->buf);
904 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
393903d1 905 }
9e1045e3 906
55022530
CB
907 if (!append_ttyname(&conf->ttys.tty_names, tty->name))
908 return log_error(-1, "Error setting up container_ttys string");
b0a33c1e 909 }
910
885766f5 911 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 912 return 0;
913}
914
9d0e129b
CB
915define_cleanup_function(struct lxc_tty_info *, lxc_delete_tty);
916
59eac805 917static int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 918{
9d0e129b 919 call_cleaner(lxc_delete_tty) struct lxc_tty_info *ttys = &conf->ttys;
fca23691 920 int ret;
2187efd3
CB
921
922 /* no tty in the configuration */
885766f5 923 if (ttys->max == 0)
2187efd3
CB
924 return 0;
925
9d0e129b
CB
926 ttys->tty = zalloc(sizeof(struct lxc_terminal_info) * ttys->max);
927 if (!ttys->tty)
2187efd3 928 return -ENOMEM;
2187efd3 929
7369e6bf 930 for (size_t i = 0; i < conf->ttys.max; i++) {
9d0e129b 931 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 932
36a94ce8 933 tty->ptx = -EBADF;
41808e20
CB
934 tty->pty = -EBADF;
935 ret = openpty(&tty->ptx, &tty->pty, NULL, NULL, NULL);
77a39805 936 if (ret < 0) {
7369e6bf 937 conf->ttys.max = i;
55022530 938 return log_error_errno(-ENOTTY, ENOTTY, "Failed to create tty %zu", i);
2187efd3
CB
939 }
940
41808e20 941 ret = ttyname_r(tty->pty, tty->name, sizeof(tty->name));
77a39805 942 if (ret < 0) {
7369e6bf 943 conf->ttys.max = i;
41808e20 944 return log_error_errno(-ENOTTY, ENOTTY, "Failed to retrieve name of tty %zu pty", i);
77a39805
CB
945 }
946
7369e6bf 947 DEBUG("Created tty with ptx fd %d and pty fd %d", tty->ptx, tty->pty);
2187efd3
CB
948
949 /* Prevent leaking the file descriptors to the container */
36a94ce8 950 ret = fd_cloexec(tty->ptx, true);
2187efd3 951 if (ret < 0)
36a94ce8
CB
952 SYSWARN("Failed to set FD_CLOEXEC flag on ptx fd %d of tty device \"%s\"",
953 tty->ptx, tty->name);
2187efd3 954
41808e20 955 ret = fd_cloexec(tty->pty, true);
2187efd3 956 if (ret < 0)
41808e20
CB
957 SYSWARN("Failed to set FD_CLOEXEC flag on pty fd %d of tty device \"%s\"",
958 tty->pty, tty->name);
2187efd3 959
7581d645 960 tty->busy = -1;
2187efd3
CB
961 }
962
885766f5 963 INFO("Finished creating %zu tty devices", ttys->max);
9d0e129b 964 move_ptr(ttys);
2187efd3
CB
965 return 0;
966}
967
0e4be3cf 968void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3 969{
386e6768
CB
970 if (!ttys->tty)
971 return;
972
55022530 973 for (int i = 0; i < ttys->max; i++) {
0e4be3cf 974 struct lxc_terminal_info *tty = &ttys->tty[i];
36a94ce8 975 close_prot_errno_disarm(tty->ptx);
41808e20 976 close_prot_errno_disarm(tty->pty);
2187efd3
CB
977 }
978
55022530 979 free_disarm(ttys->tty);
2187efd3
CB
980}
981
982static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
983{
984 int i;
0fd73091 985 int ret = -1;
2187efd3 986 struct lxc_conf *conf = handler->conf;
0e4be3cf 987 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 988 int sock = handler->data_sock[0];
2187efd3 989
885766f5 990 if (ttys->max == 0)
2187efd3
CB
991 return 0;
992
885766f5 993 for (i = 0; i < ttys->max; i++) {
2187efd3 994 int ttyfds[2];
0e4be3cf 995 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 996
36a94ce8 997 ttyfds[0] = tty->ptx;
41808e20 998 ttyfds[1] = tty->pty;
2187efd3
CB
999
1000 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1001 if (ret < 0)
1002 break;
1003
41808e20
CB
1004 TRACE("Sent tty \"%s\" with ptx fd %d and pty fd %d to parent",
1005 tty->name, tty->ptx, tty->pty);
2187efd3
CB
1006 }
1007
1008 if (ret < 0)
6d1400b5 1009 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1010 else
885766f5 1011 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1012
1013 return ret;
1014}
1015
1016static int lxc_create_ttys(struct lxc_handler *handler)
1017{
1018 int ret = -1;
1019 struct lxc_conf *conf = handler->conf;
1020
663014ee 1021 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1022 if (ret < 0) {
1023 ERROR("Failed to allocate ttys");
1024 goto on_error;
1025 }
1026
1027 ret = lxc_send_ttys_to_parent(handler);
1028 if (ret < 0) {
1029 ERROR("Failed to send ttys to parent");
1030 goto on_error;
1031 }
1032
1033 if (!conf->is_execute) {
1034 ret = lxc_setup_ttys(conf);
1035 if (ret < 0) {
1036 ERROR("Failed to setup ttys");
1037 goto on_error;
1038 }
1039 }
1040
885766f5
CB
1041 if (conf->ttys.tty_names) {
1042 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1043 if (ret < 0)
885766f5 1044 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1045 }
1046
1047 ret = 0;
1048
1049on_error:
0e4be3cf 1050 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1051
1052 return ret;
1053}
1054
7133b912
CB
1055/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1056 * error, log it but don't fail yet.
91c3830e 1057 */
7133b912 1058static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
63012bdd 1059 int autodevtmpfssize, const char *lxcpath)
91c3830e 1060{
bfbfeedf 1061 __do_close int fd_fs = -EBADF;
ee8eeba8 1062 const char *path = rootfs->path ? rootfs->mount : NULL;
bfbfeedf 1063 size_t tmpfs_size = (autodevtmpfssize != 0) ? autodevtmpfssize : 500000;
91c3830e 1064 int ret;
87e0e273 1065 mode_t cur_mask;
63012bdd 1066 char mount_options[128];
91c3830e 1067
7133b912 1068 INFO("Preparing \"/dev\"");
bc6928ff 1069
87e0e273 1070 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
ea57e424 1071 ret = mkdirat(rootfs->dfd_mnt, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
87e0e273
CB
1072 if (ret < 0 && errno != EEXIST) {
1073 SYSERROR("Failed to create \"/dev\" directory");
1074 ret = -errno;
1075 goto reset_umask;
bc6928ff 1076 }
87da4ec3 1077
de7f9f33 1078 if (can_use_mount_api()) {
635e7bac
CB
1079 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1080 if (fd_fs < 0)
1081 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for tmpfs");
ee8eeba8 1082
bfbfeedf
CB
1083 sprintf(mount_options, "%zu", tmpfs_size);
1084
1085 ret = fs_set_property(fd_fs, "mode", "0755");
1086 if (ret < 0)
1087 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1088
1089 ret = fs_set_property(fd_fs, "size", mount_options);
1090 if (ret < 0)
1091 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1092
1093 ret = fs_attach(fd_fs, rootfs->dfd_mnt, "dev", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
635e7bac
CB
1094 } else {
1095 __do_free char *fallback_path = NULL;
1096
1097 sprintf(mount_options, "size=%zu,mode=755", tmpfs_size);
1098 DEBUG("Using mount options: %s", mount_options);
1099
1100 if (path) {
1101 fallback_path = must_make_path(path, "/dev", NULL);
1102 ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path);
1103 } else {
1104 ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL);
1105 }
87e0e273 1106 }
bfbfeedf
CB
1107 if (ret < 0) {
1108 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1109 goto reset_umask;
1110 }
1111
87da4ec3 1112
7133b912 1113 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1114 * If not, then create it and exit if that fails...
1115 */
ea57e424 1116 ret = mkdirat(rootfs->dfd_mnt, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
87e0e273 1117 if (ret < 0 && errno != EEXIST) {
bfbfeedf 1118 SYSERROR("Failed to create directory \"dev/pts\"");
87e0e273
CB
1119 ret = -errno;
1120 goto reset_umask;
91c3830e
SH
1121 }
1122
87e0e273
CB
1123 ret = 0;
1124
1125reset_umask:
1126 (void)umask(cur_mask);
1127
7133b912 1128 INFO("Prepared \"/dev\"");
87e0e273 1129 return ret;
91c3830e
SH
1130}
1131
/* Description of a single device node to provide below the container's /dev. */
struct lxc_device_node {
	/* Name of the node inside /dev. */
	const char *name;
	/* File type and permission bits of the node. */
	const mode_t mode;
	/* Device major number. */
	const int maj;
	/* Device minor number. */
	const int min;
};

/* Character devices that get populated into the container's /dev. */
static const struct lxc_device_node lxc_devices[] = {
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
};
1147
5067e4dd
CB
1148
/*
 * Strategy used to provide device nodes in /dev. The numeric order matters:
 * users compare against LXC_DEVNODE_MKNOD with relational operators.
 */
enum {
	/* Bind-mount the device from the host. */
	LXC_DEVNODE_BIND	= 0,
	/* Try to create device nodes with mknod(). */
	LXC_DEVNODE_MKNOD	= 1,
	/* Device nodes can be created but are only partially useable. */
	LXC_DEVNODE_PARTIAL	= 2,
	/* Device nodes can be created and are fully useable. */
	LXC_DEVNODE_OPEN	= 3,
};
1155
887ae844 1156static int lxc_fill_autodev(struct lxc_rootfs *rootfs)
c6883f38 1157{
5e73416f 1158 int i, ret;
3a32201c 1159 mode_t cmask;
5067e4dd 1160 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1161
a5a08920 1162 if (rootfs->dfd_dev < 0)
81498328 1163 return log_info(0, "No /dev directory found, skipping setup");
d43d5191 1164
3999be0a
CB
1165 INFO("Populating \"/dev\"");
1166
3a32201c 1167 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f 1168 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
5e73416f 1169 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1170
5067e4dd 1171 if (use_mknod >= LXC_DEVNODE_MKNOD) {
a5a08920 1172 ret = mknodat(rootfs->dfd_dev, device->name, device->mode, makedev(device->maj, device->min));
5e73416f 1173 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
d43d5191 1174 DEBUG("Created device node \"%s\"", device->name);
5067e4dd 1175 } else if (ret < 0) {
55022530 1176 if (errno != EPERM)
d43d5191 1177 return log_error_errno(-1, errno, "Failed to create device node \"%s\"", device->name);
0bbf8572 1178
5067e4dd 1179 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1180 }
3999be0a 1181
5067e4dd
CB
1182 /* Device nodes are fully useable. */
1183 if (use_mknod == LXC_DEVNODE_OPEN)
1184 continue;
1185
1186 if (use_mknod == LXC_DEVNODE_MKNOD) {
d43d5191 1187 __do_close int fd = -EBADF;
5067e4dd
CB
1188 /* See
1189 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1190 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1191 */
a5a08920 1192 fd = open_at(rootfs->dfd_dev, device->name, PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
d43d5191 1193 if (fd >= 0) {
5067e4dd
CB
1194 /* Device nodes are fully useable. */
1195 use_mknod = LXC_DEVNODE_OPEN;
1196 continue;
1197 }
1198
d43d5191 1199 SYSTRACE("Failed to open \"%s\" device", device->name);
5067e4dd
CB
1200 /* Device nodes are only partially useable. */
1201 use_mknod = LXC_DEVNODE_PARTIAL;
1202 }
5e73416f
CB
1203 }
1204
5067e4dd
CB
1205 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1206 /* If we are dealing with partially functional device
1207 * nodes the prio mknod() call will have created the
1208 * device node so we can use it as a bind-mount target.
1209 */
a5a08920 1210 ret = mknodat(rootfs->dfd_dev, device->name, S_IFREG | 0000, 0);
55022530 1211 if (ret < 0 && errno != EEXIST)
d43d5191 1212 return log_error_errno(-1, errno, "Failed to create file \"%s\"", device->name);
5e73416f
CB
1213 }
1214
1215 /* Fallback to bind-mounting the device from the host. */
9bcde680
CB
1216 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "dev/%s", device->name);
1217 if (ret < 0)
b41ff502 1218 return ret_errno(EIO);
5e73416f 1219
de7f9f33 1220 if (can_use_mount_api()) {
887ae844 1221 ret = fd_bind_mount(rootfs->dfd_host, rootfs->buf,
635e7bac
CB
1222 PROTECT_OPATH_FILE,
1223 PROTECT_LOOKUP_BENEATH_XDEV,
1224 rootfs->dfd_dev, device->name,
1225 PROTECT_OPATH_FILE,
1226 PROTECT_LOOKUP_BENEATH, 0, false);
1227 } else {
927ea337
CB
1228 char path[PATH_MAX];
1229
9bcde680
CB
1230 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/dev/%s", device->name);
1231 if (ret < 0)
927ea337
CB
1232 return ret_errno(EIO);
1233
9bcde680
CB
1234 ret = strnprintf(path, sizeof(path), "%s/dev/%s", get_rootfs_mnt(rootfs), device->name);
1235 if (ret < 0)
927ea337
CB
1236 return log_error(-1, "Failed to create device path for %s", device->name);
1237
887ae844 1238 ret = safe_mount(rootfs->buf, path, 0, MS_BIND, NULL, get_rootfs_mnt(rootfs));
927ea337 1239 if (ret < 0)
887ae844 1240 return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" to \"%s\"", rootfs->buf, path);
927ea337 1241
887ae844 1242 DEBUG("Bind mounted host device node \"%s\" to \"%s\"", rootfs->buf, path);
927ea337 1243 continue;
d43d5191 1244 }
887ae844 1245 DEBUG("Bind mounted host device %d(%s) to %d(%s)", rootfs->dfd_host, rootfs->buf, rootfs->dfd_dev, device->name);
c6883f38 1246 }
5e73416f 1247 (void)umask(cmask);
c6883f38 1248
3999be0a 1249 INFO("Populated \"/dev\"");
c6883f38
SH
1250 return 0;
1251}
1252
8ce1abc2 1253static int lxc_mount_rootfs(struct lxc_conf *conf)
0ad19a3f 1254{
9aa76a17 1255 int ret;
10bc1861 1256 struct lxc_storage *bdev;
31f8b2fd 1257 struct lxc_rootfs *rootfs = &conf->rootfs;
cc28d0b0 1258
a0f379bf 1259 if (!rootfs->path) {
0fd73091 1260 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
55022530 1261 if (ret < 0)
9e61fb1f 1262 return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
0fd73091 1263
ea57e424
CB
1264 rootfs->dfd_mnt = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
1265 if (rootfs->dfd_mnt < 0)
31f8b2fd
CB
1266 return -errno;
1267
c69bd12f 1268 return 0;
a0f379bf 1269 }
0ad19a3f 1270
0fd73091 1271 ret = access(rootfs->mount, F_OK);
55022530
CB
1272 if (ret != 0)
1273 return log_error_errno(-1, errno, "Failed to access to \"%s\". Check it is present",
1274 rootfs->mount);
b1789442 1275
8a388ed4 1276 bdev = storage_init(conf);
55022530
CB
1277 if (!bdev)
1278 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1279 rootfs->path, rootfs->mount,
1280 rootfs->options ? rootfs->options : "(null)");
9aa76a17
CB
1281
1282 ret = bdev->ops->mount(bdev);
10bc1861 1283 storage_put(bdev);
55022530
CB
1284 if (ret < 0)
1285 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1286 rootfs->path, rootfs->mount,
1287 rootfs->options ? rootfs->options : "(null)");
0ad19a3f 1288
0fd73091 1289 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1290 rootfs->path, rootfs->mount,
1291 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1292
ea57e424
CB
1293 rootfs->dfd_mnt = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
1294 if (rootfs->dfd_mnt < 0)
31f8b2fd
CB
1295 return -errno;
1296
ac778708
DL
1297 return 0;
1298}
1299
59eac805 1300static int lxc_chroot(const struct lxc_rootfs *rootfs)
91e93c71 1301{
b8d88764 1302 __do_free char *nroot = NULL;
0fd73091 1303 int i, ret;
8ce1abc2 1304 char *root = rootfs->mount;
91e93c71 1305
74e7b662 1306 nroot = realpath(root, NULL);
55022530
CB
1307 if (!nroot)
1308 return log_error_errno(-1, errno, "Failed to resolve \"%s\"", root);
91e93c71 1309
0fd73091 1310 ret = chdir("/");
b8d88764 1311 if (ret < 0)
0fd73091 1312 return -1;
91e93c71 1313
0fd73091
CB
1314 /* We could use here MS_MOVE, but in userns this mount is locked and
1315 * can't be moved.
91e93c71 1316 */
8ce1abc2 1317 ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
55022530
CB
1318 if (ret < 0)
1319 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);
91e93c71 1320
0fd73091 1321 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
55022530
CB
1322 if (ret < 0)
1323 return log_error_errno(-1, errno, "Failed to remount \"/\"");
91e93c71 1324
aa899945 1325 /* The following code cleans up inherited mounts which are not required
0fd73091 1326 * for CT.
91e93c71
AV
1327 *
1328 * The mountinfo file shows not all mounts, if a few points have been
1329 * unmounted between read operations from the mountinfo. So we need to
1330 * read mountinfo a few times.
1331 *
7ded5fa7 1332 * This loop can be skipped if a container uses userns, because all
91e93c71
AV
1333 * inherited mounts are locked and we should live with all this trash.
1334 */
0fd73091 1335 for (;;) {
4fdd1f72 1336 __do_fclose FILE *f = NULL;
f3d38164
CB
1337 __do_free char *line = NULL;
1338 char *slider1, *slider2;
91e93c71 1339 int progress = 0;
f3d38164 1340 size_t len = 0;
91e93c71 1341
4110345b 1342 f = fopen("./proc/self/mountinfo", "re");
55022530
CB
1343 if (!f)
1344 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
0fd73091 1345
f3d38164
CB
1346 while (getline(&line, &len, f) > 0) {
1347 for (slider1 = line, i = 0; slider1 && i < 4; i++)
1348 slider1 = strchr(slider1 + 1, ' ');
0fd73091 1349
f3d38164 1350 if (!slider1)
91e93c71 1351 continue;
0fd73091 1352
f3d38164
CB
1353 slider2 = strchr(slider1 + 1, ' ');
1354 if (!slider2)
91e93c71
AV
1355 continue;
1356
f3d38164
CB
1357 *slider2 = '\0';
1358 *slider1 = '.';
91e93c71 1359
71528742 1360 if (strequal(slider1 + 1, "/"))
91e93c71 1361 continue;
0fd73091 1362
71528742 1363 if (strequal(slider1 + 1, "/proc"))
91e93c71
AV
1364 continue;
1365
f3d38164 1366 ret = umount2(slider1, MNT_DETACH);
0fd73091 1367 if (ret == 0)
91e93c71
AV
1368 progress++;
1369 }
0fd73091 1370
91e93c71
AV
1371 if (!progress)
1372 break;
1373 }
1374
7ded5fa7 1375 /* This also can be skipped if a container uses userns. */
0fd73091 1376 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1377
1378 /* It is weird, but chdir("..") moves us in a new root */
0fd73091 1379 ret = chdir("..");
55022530
CB
1380 if (ret < 0)
1381 return log_error_errno(-1, errno, "Failed to chdir(\"..\")");
91e93c71 1382
0fd73091 1383 ret = chroot(".");
55022530
CB
1384 if (ret < 0)
1385 return log_error_errno(-1, errno, "Failed to chroot(\".\")");
91e93c71
AV
1386
1387 return 0;
1388}
1389
8ce1abc2
CB
1390/* (The following explanation is copied verbatim from the kernel.)
1391 *
1392 * pivot_root Semantics:
1393 * Moves the root file system of the current process to the directory put_old,
1394 * makes new_root as the new root file system of the current process, and sets
1395 * root/cwd of all processes which had them on the current root to new_root.
1396 *
1397 * Restrictions:
1398 * The new_root and put_old must be directories, and must not be on the
1399 * same file system as the current process root. The put_old must be
1400 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1401 * pointed to by put_old must yield the same directory as new_root. No other
1402 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1403 *
1404 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1405 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1406 * in this situation.
1407 *
1408 * Notes:
1409 * - we don't move root/cwd if they are not at the root (reason: if something
1410 * cared enough to change them, it's probably wrong to force them elsewhere)
1411 * - it's okay to pick a root that isn't the root of a file system, e.g.
1412 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1413 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1414 * first.
1415 */
7f50ec8b 1416static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1417{
7f50ec8b 1418 __do_close int fd_oldroot = -EBADF;
b0d7aac4 1419 int ret;
0fd73091 1420
7f50ec8b
CB
1421 fd_oldroot = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
1422 if (fd_oldroot < 0)
55022530 1423 return log_error_errno(-1, errno, "Failed to open old root directory");
ac778708 1424
8ce1abc2 1425 /* change into new root fs */
ea57e424 1426 ret = fchdir(rootfs->dfd_mnt);
55022530 1427 if (ret < 0)
7f50ec8b 1428 return log_error_errno(-errno, errno, "Failed to change into new root directory \"%s\"", rootfs->mount);
39c7b795 1429
8ce1abc2
CB
1430 /* pivot_root into our new root fs */
1431 ret = pivot_root(".", ".");
55022530 1432 if (ret < 0)
7f50ec8b 1433 return log_error_errno(-errno, errno, "Failed to pivot into new root directory \"%s\"", rootfs->mount);
39c7b795 1434
8ce1abc2
CB
1435 /* At this point the old-root is mounted on top of our new-root. To
1436 * unmounted it we must not be chdir'd into it, so escape back to
1437 * old-root.
1438 */
7f50ec8b 1439 ret = fchdir(fd_oldroot);
55022530 1440 if (ret < 0)
7f50ec8b 1441 return log_error_errno(-errno, errno, "Failed to enter old root directory");
c69bd12f 1442
7f50ec8b
CB
1443 /*
1444 * Make fd_oldroot a depedent mount to make sure our umounts don't
1445 * propagate to the host.
8ce1abc2
CB
1446 */
1447 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
55022530 1448 if (ret < 0)
7f50ec8b 1449 return log_error_errno(-errno, errno, "Failed to recursively turn old root mount tree into dependent mount");
8ce1abc2
CB
1450
1451 ret = umount2(".", MNT_DETACH);
55022530 1452 if (ret < 0)
7f50ec8b 1453 return log_error_errno(-errno, errno, "Failed to detach old root directory");
8ce1abc2 1454
ea57e424 1455 ret = fchdir(rootfs->dfd_mnt);
55022530 1456 if (ret < 0)
7f50ec8b 1457 return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
8ce1abc2 1458
7f50ec8b 1459 TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
b0d7aac4 1460 return 0;
0ad19a3f 1461}
1462
8ce1abc2
CB
1463static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1464{
55022530
CB
1465 if (!rootfs->path)
1466 return log_debug(0, "Container does not have a rootfs");
8ce1abc2
CB
1467
1468 if (detect_ramfs_rootfs())
1469 return lxc_chroot(rootfs);
1470
7f50ec8b 1471 return lxc_pivot_root(rootfs);
0ad19a3f 1472}
1473
7581a82f 1474static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
8ce1abc2
CB
1475 unsigned id,
1476 enum idtype idtype)
f4900711
CB
1477{
1478 struct lxc_list *it;
1479 struct id_map *map;
1480 struct id_map *retmap = NULL;
1481
dcf0ffdf
CB
1482 /* Shortcut for container's root mappings. */
1483 if (id == 0) {
1484 if (idtype == ID_TYPE_UID)
1485 return conf->root_nsuid_map;
1486
1487 if (idtype == ID_TYPE_GID)
1488 return conf->root_nsgid_map;
1489 }
1490
f4900711
CB
1491 lxc_list_for_each(it, &conf->id_map) {
1492 map = it->elem;
1493 if (map->idtype != idtype)
1494 continue;
1495
1496 if (id >= map->nsid && id < map->nsid + map->range) {
1497 retmap = map;
1498 break;
1499 }
1500 }
1501
1502 return retmap;
1503}
1504
68f3899e
CB
1505int lxc_setup_devpts_parent(struct lxc_handler *handler)
1506{
1507 int ret;
1508
1509 if (handler->conf->pty_max <= 0)
1510 return 0;
1511
1512 ret = lxc_abstract_unix_recv_fds(handler->data_sock[1], &handler->conf->devpts_fd, 1,
1513 &handler->conf->devpts_fd, sizeof(handler->conf->devpts_fd));
1514 if (ret < 0)
1515 return log_error_errno(-1, errno, "Failed to receive devpts fd from child");
1516
1517 TRACE("Received devpts file descriptor %d from child", handler->conf->devpts_fd);
1518 return 0;
1519}
1520
1521static int lxc_setup_devpts_child(struct lxc_handler *handler)
3c26f34e 1522{
f797f05e 1523 __do_close int devpts_fd = -EBADF;
70761e5e 1524 int ret;
ce155c60 1525 char **opts;
9d28c4f9 1526 char devpts_mntopts[256];
ce155c60
CB
1527 char *mntopt_sets[5];
1528 char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
f797f05e 1529 struct lxc_conf *conf = handler->conf;
a26822c5 1530 struct lxc_rootfs *rootfs = &conf->rootfs;
f797f05e 1531 int sock = handler->data_sock[0];
77890c6d 1532
55022530
CB
1533 if (conf->pty_max <= 0)
1534 return log_debug(0, "No new devpts instance will be mounted since no pts devices are requested");
3c26f34e 1535
9bcde680 1536 ret = strnprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
e528c735 1537 default_devpts_mntopts, conf->pty_max);
9bcde680 1538 if (ret < 0)
9d28c4f9
CB
1539 return -1;
1540
29a7b484 1541 (void)umount2("/dev/pts", MNT_DETACH);
7e40254a 1542
70761e5e 1543 /* Create mountpoint for devpts instance. */
a5a08920 1544 ret = mkdirat(rootfs->dfd_dev, "pts", 0755);
55022530
CB
1545 if (ret < 0 && errno != EEXIST)
1546 return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
3c26f34e 1547
ce155c60
CB
1548 /* gid=5 && max= */
1549 mntopt_sets[0] = devpts_mntopts;
dfbd4730 1550
ce155c60 1551 /* !gid=5 && max= */
6333c915 1552 mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1553
1554 /* gid=5 && !max= */
1555 mntopt_sets[2] = default_devpts_mntopts;
1556
1557 /* !gid=5 && !max= */
6333c915 1558 mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1559
1560 /* end */
1561 mntopt_sets[4] = NULL;
1562
1563 for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
1564 /* mount new devpts instance */
1565 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
1566 if (ret == 0)
1567 break;
1568 }
1569
55022530
CB
1570 if (ret < 0)
1571 return log_error_errno(-1, errno, "Failed to mount new devpts instance");
ce155c60 1572 DEBUG("Mount new devpts instance with options \"%s\"", *opts);
70761e5e 1573
a5a08920 1574 devpts_fd = open_at(rootfs->dfd_dev, "pts", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
f797f05e 1575 if (devpts_fd < 0) {
fbfe5c82 1576 devpts_fd = -EBADF;
f797f05e 1577 TRACE("Failed to create detached devpts mount");
185b9ee9
CB
1578 ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int));
1579 } else {
1580 ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
f797f05e 1581 }
185b9ee9
CB
1582 if (ret < 0)
1583 return log_error_errno(-1, errno, "Failed to send devpts fd to parent");
1584
1585 TRACE("Sent devpts file descriptor %d to parent", devpts_fd);
f797f05e 1586
d5cb35d6 1587 /* Remove any pre-existing /dev/ptmx file. */
a5a08920 1588 ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
b29e05d6 1589 if (ret < 0) {
55022530
CB
1590 if (errno != ENOENT)
1591 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\" file");
b29e05d6 1592 } else {
0fd73091 1593 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1594 }
1595
d5cb35d6 1596 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
a5a08920 1597 ret = mknodat(rootfs->dfd_dev, "ptmx", S_IFREG | 0000, 0);
55022530
CB
1598 if (ret < 0 && errno != EEXIST)
1599 return log_error_errno(-1, errno, "Failed to create dummy \"/dev/ptmx\" file as bind mount target");
0fd73091 1600 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1601
d5cb35d6 1602 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1603 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
55022530
CB
1604 if (!ret)
1605 return log_debug(0, "Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1606 else
d5cb35d6 1607 /* Fallthrough and try to create a symlink. */
0fd73091 1608 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1609
1610 /* Remove the dummy /dev/ptmx file we created above. */
a5a08920 1611 ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
55022530
CB
1612 if (ret < 0)
1613 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1614
1615 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
a5a08920 1616 ret = symlinkat("/dev/pts/ptmx", rootfs->dfd_dev, "/dev/ptmx");
55022530
CB
1617 if (ret < 0)
1618 return log_error_errno(-1, errno, "Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1619
185b9ee9 1620 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1621 return 0;
1622}
1623
cccc74b5
DL
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build has no
 * <sys/personality.h> support the function is a no-op.
 *
 * Returns 0 on success and -1 if the personality cannot be set.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0)
			return log_error_errno(-1, errno, "Failed to set personality to \"0x%x\"", persona);

		INFO("Set personality to \"0x%x\"", persona);
	}
#endif

	return 0;
}
1641
efbfe93f
CB
1642static inline bool wants_console(const struct lxc_terminal *terminal)
1643{
71528742 1644 return !terminal->path || !strequal(terminal->path, "none");
efbfe93f
CB
1645}
1646
37c74fd1
CB
1647static int lxc_bind_mount_console(const struct lxc_terminal *console,
1648 int dfd_to, const char *path_to)
1649{
1650 __do_close int fd_pty = -EBADF;
1651
1652 if (is_empty_string(console->name))
1653 return ret_errno(EINVAL);
1654
1655 /*
1656 * When the pty fd stashed in console->pty has been retrieved via the
1657 * TIOCGPTPEER ioctl() to avoid dangerous path-based lookups when
1658 * allocating new pty devices we can't reopen it through openat2() or
1659 * created a detached mount through open_tree() from it. This means we
1660 * would need to mount using the path stased in console->name which is
1661 * unsafe. We could be mounting a device that isn't identical to the
1662 * one we've already safely opened and stashed in console->pty.
1663 * So, what we do is we open an O_PATH file descriptor for
1664 * console->name and verify that the opened fd and the fd we stashed in
1665 * console->pty refer to the same device. If they do we can go on and
1666 * created a detached mount based on the newly opened O_PATH file
1667 * descriptor and then safely mount.
1668 */
1669 fd_pty = open_at(-EBADF, console->name, PROTECT_OPATH_FILE,
1670 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
1671 if (fd_pty < 0)
1672 return log_error_errno(-errno, errno, "Failed to open \"%s\"", console->name);
1673
1674 if (!same_file_lax(console->pty, fd_pty))
1675 return log_error_errno(-EINVAL, EINVAL, "Console file descriptor changed");
1676
1677 /*
1678 * Note, there are intentionally no open or lookup restrictions since
1679 * we're operating directly on the fd.
1680 */
1681 return fd_bind_mount(fd_pty, "", 0, 0,
1682 dfd_to, path_to, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH,
1683 0, false);
1684}
1685
58b38111 1686static int lxc_setup_dev_console(struct lxc_rootfs *rootfs,
37c74fd1 1687 const struct lxc_terminal *console)
6e590161 1688{
882671aa 1689 int ret;
86530b0a 1690 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1691
cf68ffd9
CB
1692 /*
1693 * When we are asked to setup a console we remove any previous
8b1b1210
CB
1694 * /dev/console bind-mounts.
1695 */
a5a08920 1696 if (exists_file_at(rootfs->dfd_dev, "console")) {
9bcde680
CB
1697 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/dev/console", rootfs_path);
1698 if (ret < 0)
953db219
CB
1699 return -1;
1700
58b38111 1701 ret = lxc_unstack_mountpoint(rootfs->buf, false);
55022530 1702 if (ret < 0)
58b38111 1703 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", rootfs->buf);
55022530 1704 else
58b38111 1705 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, rootfs->buf);
8b1b1210
CB
1706 }
1707
cf68ffd9
CB
1708 /*
1709 * For unprivileged containers autodev or automounts will already have
8b1b1210
CB
1710 * taken care of creating /dev/console.
1711 */
a5a08920 1712 ret = mknodat(rootfs->dfd_dev, "console", S_IFREG | 0000, 0);
55022530
CB
1713 if (ret < 0 && errno != EEXIST)
1714 return log_error_errno(-errno, errno, "Failed to create console");
52e35957 1715
1dd71c90 1716 ret = fchmod(console->pty, 0620);
55022530
CB
1717 if (ret < 0)
1718 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
13954cce 1719
de7f9f33 1720 if (can_use_mount_api()) {
37c74fd1
CB
1721 ret = lxc_bind_mount_console(console, rootfs->dfd_dev, "console");
1722 } else {
1723 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/dev/console", rootfs_path);
1724 if (ret < 0)
1725 return ret;
953db219 1726
37c74fd1 1727 ret = safe_mount(console->name, rootfs->buf, "none", MS_BIND, NULL, rootfs_path);
953db219 1728 }
37c74fd1
CB
1729 if (ret < 0)
1730 return log_error_errno(ret, errno, "Failed to mount %d(%s) on \"%s\"", console->pty, console->name, rootfs->buf);
6e590161 1731
37c74fd1 1732 DEBUG("Mounted pty device %d(%s) onto \"%s\"", console->pty, console->name, rootfs->buf);
7c6ef2a2
SH
1733 return 0;
1734}
1735
37c74fd1 1736static int lxc_setup_ttydir_console(struct lxc_rootfs *rootfs,
dcad02f8 1737 const struct lxc_terminal *console,
37c74fd1 1738 char *ttydir)
7c6ef2a2 1739{
3b7e332f 1740 int ret;
6b5a54cd 1741 char path[PATH_MAX], lxcpath[PATH_MAX];
86530b0a 1742 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2
SH
1743
1744 /* create rootfs/dev/<ttydir> directory */
9bcde680
CB
1745 ret = strnprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
1746 if (ret < 0)
7c6ef2a2 1747 return -1;
3d7d929a 1748
7c6ef2a2 1749 ret = mkdir(path, 0755);
55022530
CB
1750 if (ret && errno != EEXIST)
1751 return log_error_errno(-errno, errno, "Failed to create \"%s\"", path);
4742cd9a 1752 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1753
9bcde680
CB
1754 ret = strnprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
1755 if (ret < 0)
3d7d929a
CB
1756 return -1;
1757
3b7e332f 1758 ret = mknod(lxcpath, S_IFREG | 0000, 0);
55022530
CB
1759 if (ret < 0 && errno != EEXIST)
1760 return log_error_errno(-errno, errno, "Failed to create \"%s\"", lxcpath);
7c6ef2a2 1761
9bcde680
CB
1762 ret = strnprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1763 if (ret < 0)
7c6ef2a2 1764 return -1;
2a12fefd 1765
3dc035f1 1766 if (file_exists(path)) {
a7ba3c7f 1767 ret = lxc_unstack_mountpoint(path, false);
55022530
CB
1768 if (ret < 0)
1769 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1770 else
86530b0a 1771 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
3dc035f1 1772 }
2a12fefd 1773
3b7e332f 1774 ret = mknod(path, S_IFREG | 0000, 0);
55022530
CB
1775 if (ret < 0 && errno != EEXIST)
1776 return log_error_errno(-errno, errno, "Failed to create console");
7c6ef2a2 1777
1dd71c90 1778 ret = fchmod(console->pty, 0620);
55022530
CB
1779 if (ret < 0)
1780 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
2a12fefd 1781
3dc035f1 1782 /* bind mount console->name to '/dev/<ttydir>/console' */
de7f9f33 1783 if (can_use_mount_api()) {
37c74fd1
CB
1784 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/console", ttydir);
1785 if (ret < 0)
1786 return ret;
efbfe93f 1787
37c74fd1
CB
1788 ret = lxc_bind_mount_console(console, rootfs->dfd_dev, rootfs->buf);
1789 } else {
1790 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
efbfe93f 1791 }
55022530 1792 if (ret < 0)
37c74fd1 1793 return log_error_errno(ret, errno, "Failed to mount %d(%s) on \"%s\"", console->pty, console->name, lxcpath);
86530b0a 1794 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1
L
1795
1796 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
de7f9f33 1797 if (can_use_mount_api()) {
37c74fd1
CB
1798 ret = fd_bind_mount(rootfs->dfd_dev, rootfs->buf,
1799 PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_XDEV,
1800 rootfs->dfd_dev, "console",
1801 PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH,
1802 0, false);
1803 } else {
1804 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1805 }
55022530
CB
1806 if (ret < 0)
1807 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
37c74fd1 1808 DEBUG("Mounted \"%s\" onto \"%s\"", lxcpath, path);
3dc035f1 1809
86530b0a 1810 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1811 return 0;
1812}
1813
58b38111 1814static int lxc_setup_console(struct lxc_rootfs *rootfs,
37c74fd1 1815 struct lxc_terminal *console, char *ttydir)
7c6ef2a2 1816{
37c74fd1 1817 int ret;
3d7d929a 1818
37c74fd1
CB
1819 if (!wants_console(console))
1820 return log_trace(0, "Skipping console setup");
7c6ef2a2 1821
37c74fd1
CB
1822 if (ttydir)
1823 ret = lxc_setup_ttydir_console(rootfs, console, ttydir);
1824 else
1825 ret = lxc_setup_dev_console(rootfs, console);
1826 close_prot_errno_disarm(console->pty);
1827 return ret;
7c6ef2a2
SH
1828}
1829
a08bfbe3 1830static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676 1831{
a08bfbe3 1832 ssize_t ret;
998ac676 1833
85c2de39
MB
1834 /* If '=' is contained in opt, the option must go into data. */
1835 if (!strchr(opt, '=')) {
a08bfbe3
CB
1836 /*
1837 * If opt is found in mount_opt, set or clear flags.
1838 * Otherwise append it to data.
1839 */
85c2de39 1840 size_t opt_len = strlen(opt);
a08bfbe3 1841 for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) {
85c2de39 1842 size_t mo_name_len = strlen(mo->name);
a08bfbe3 1843
eed95eb0 1844 if (opt_len == mo_name_len && strnequal(opt, mo->name, mo_name_len)) {
85c2de39
MB
1845 if (mo->clear)
1846 *flags &= ~mo->flag;
1847 else
1848 *flags |= mo->flag;
a08bfbe3 1849 return 0;
85c2de39 1850 }
998ac676
RT
1851 }
1852 }
1853
a08bfbe3
CB
1854 if (strlen(*data)) {
1855 ret = strlcat(*data, ",", size);
1856 if (ret < 0)
1857 return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
1858 }
1859
1860 ret = strlcat(*data, opt, size);
1861 if (ret < 0)
1862 return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
efed99a4 1863
a08bfbe3 1864 return 0;
998ac676
RT
1865}
1866
0fd73091 1867int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
998ac676 1868{
a08bfbe3
CB
1869 __do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
1870 char *mntopt_cur = NULL;
efed99a4 1871 size_t size;
998ac676 1872
a08bfbe3
CB
1873 if (*mntdata || *mntflags)
1874 return ret_errno(EINVAL);
911324ef
DL
1875
1876 if (!mntopts)
998ac676
RT
1877 return 0;
1878
a08bfbe3
CB
1879 mntopts_dup = strdup(mntopts);
1880 if (!mntopts_dup)
1881 return ret_errno(ENOMEM);
998ac676 1882
a08bfbe3
CB
1883 size = strlen(mntopts_dup) + 1;
1884 mntopts_new = zalloc(size);
1885 if (!mntopts_new)
1886 return ret_errno(ENOMEM);
998ac676 1887
a08bfbe3
CB
1888 lxc_iterate_parts(mntopt_cur, mntopts_dup, ",")
1889 if (parse_mntopt(mntopt_cur, mntflags, &mntopts_new, size) < 0)
1890 return ret_errno(EINVAL);
998ac676 1891
a08bfbe3
CB
1892 if (*mntopts_new)
1893 *mntdata = move_ptr(mntopts_new);
998ac676
RT
1894
1895 return 0;
1896}
1897
d840039e
YT
1898static void parse_propagationopt(char *opt, unsigned long *flags)
1899{
1900 struct mount_opt *mo;
1901
1902 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1903 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
eed95eb0 1904 if (!strnequal(opt, mo->name, strlen(mo->name)))
0fd73091
CB
1905 continue;
1906
1907 if (mo->clear)
1908 *flags &= ~mo->flag;
1909 else
1910 *flags |= mo->flag;
1911
1912 return;
d840039e
YT
1913 }
1914}
1915
8ce1abc2 1916int parse_propagationopts(const char *mntopts, unsigned long *pflags)
d840039e 1917{
dfd2e059
CB
1918 __do_free char *s = NULL;
1919 char *p;
d840039e
YT
1920
1921 if (!mntopts)
1922 return 0;
1923
1924 s = strdup(mntopts);
55022530
CB
1925 if (!s)
1926 return log_error_errno(-ENOMEM, errno, "Failed to allocate memory");
d840039e 1927
0fd73091 1928 *pflags = 0L;
8db9d26f 1929 lxc_iterate_parts(p, s, ",")
d840039e 1930 parse_propagationopt(p, pflags);
0fd73091 1931
d840039e
YT
1932 return 0;
1933}
1934
6fd5e769
SH
/*
 * Terminate @word in place at its first space or tab. A string without
 * either character is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1941
/*
 * Skip @nfields fields in @src.
 *
 * For each field the non-blank characters are skipped, followed by exactly
 * one separating space or tab. Scanning stops early at the end of the string.
 * Returns a pointer into @src at the position reached.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		cur += strcspn(cur, " \t");

		if (*cur == '\0')
			break;

		cur++;
	}

	return cur;
}
1960
911324ef
DL
1961static int mount_entry(const char *fsname, const char *target,
1962 const char *fstype, unsigned long mountflags,
d840039e
YT
1963 unsigned long pflags, const char *data, bool optional,
1964 bool dev, bool relative, const char *rootfs)
911324ef 1965{
0ac4b28a 1966 int ret;
6b5a54cd 1967 char srcbuf[PATH_MAX];
181437fd 1968 const char *srcpath = fsname;
614305f3 1969#ifdef HAVE_STATVFS
2938f7c8 1970 struct statvfs sb;
614305f3 1971#endif
2938f7c8 1972
181437fd 1973 if (relative) {
9bcde680
CB
1974 ret = strnprintf(srcbuf, sizeof(srcbuf), "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
1975 if (ret < 0)
55022530 1976 return log_error_errno(-1, errno, "source path is too long");
181437fd
YT
1977 srcpath = srcbuf;
1978 }
1979
1980 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
0ac4b28a
CB
1981 rootfs);
1982 if (ret < 0) {
55022530
CB
1983 if (optional)
1984 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
1985 srcpath ? srcpath : "(null)", target);
0ac4b28a 1986
55022530
CB
1987 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
1988 srcpath ? srcpath : "(null)", target);
911324ef
DL
1989 }
1990
1991 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
0ac4b28a 1992
55022530
CB
1993 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
1994 srcpath ? srcpath : "(none)", target ? target : "(none)");
0ac4b28a 1995
614305f3 1996#ifdef HAVE_STATVFS
181437fd 1997 if (srcpath && statvfs(srcpath, &sb) == 0) {
94bef7e4
TA
1998 unsigned long required_flags = 0;
1999
2938f7c8
SH
2000 if (sb.f_flag & MS_NOSUID)
2001 required_flags |= MS_NOSUID;
0ac4b28a 2002
ae7a770e 2003 if (sb.f_flag & MS_NODEV && !dev)
2938f7c8 2004 required_flags |= MS_NODEV;
0ac4b28a 2005
2938f7c8
SH
2006 if (sb.f_flag & MS_RDONLY)
2007 required_flags |= MS_RDONLY;
0ac4b28a 2008
2938f7c8
SH
2009 if (sb.f_flag & MS_NOEXEC)
2010 required_flags |= MS_NOEXEC;
0ac4b28a 2011
55022530
CB
2012 DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
2013 srcpath, sb.f_flag, required_flags);
0ac4b28a
CB
2014
2015 /* If this was a bind mount request, and required_flags
2938f7c8 2016 * does not have any flags which are not already in
0ac4b28a 2017 * mountflags, then skip the remount.
2938f7c8 2018 */
94bef7e4
TA
2019 if (!(mountflags & MS_REMOUNT) &&
2020 (!(required_flags & ~mountflags) && !(mountflags & MS_RDONLY))) {
15f3e22b
CB
2021 DEBUG("Mountflags already were %lu, skipping remount", mountflags);
2022 goto skipremount;
2938f7c8 2023 }
0ac4b28a 2024
2938f7c8 2025 mountflags |= required_flags;
6fd5e769 2026 }
614305f3 2027#endif
911324ef 2028
181437fd 2029 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
0ac4b28a 2030 if (ret < 0) {
55022530
CB
2031 if (optional)
2032 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
2033 srcpath ? srcpath : "(null)",
2034 target);
2035
2036 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
2037 srcpath ? srcpath : "(null)",
2038 target);
911324ef
DL
2039 }
2040 }
2041
a3ed9b81 2042#ifdef HAVE_STATVFS
2043skipremount:
2044#endif
d840039e
YT
2045 if (pflags) {
2046 ret = mount(NULL, target, NULL, pflags, NULL);
2047 if (ret < 0) {
55022530
CB
2048 if (optional)
2049 return log_info_errno(0, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
2050 else
2051 return log_error_errno(-1, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
d840039e
YT
2052 }
2053 DEBUG("Changed mount propagation for \"%s\"", target);
2054 }
2055
0103eb53 2056 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
181437fd 2057 srcpath ? srcpath : "(null)", target, fstype);
911324ef
DL
2058
2059 return 0;
2060}
2061
/*
 * Remove the lxc-specific options "create=dir", "create=file", "optional"
 * and "relative" from mntent->mnt_opts in place.
 *
 * Only whole comma-delimited options are removed, so an option that merely
 * contains one of these strings is left alone. Every occurrence is removed,
 * and dropping the last option also drops the comma in front of it, so no
 * trailing separator is left behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const list[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
		NULL
	};
	char *opts = mntent->mnt_opts;

	for (size_t i = 0; list[i]; i++) {
		size_t len = strlen(list[i]);
		char *p = opts;

		while ((p = strstr(p, list[i]))) {
			bool at_start = (p == opts) || (p[-1] == ',');
			bool at_end = (p[len] == '\0') || (p[len] == ',');

			if (!at_start || !at_end) {
				/* Substring of another option; keep looking. */
				p++;
				continue;
			}

			if (p[len] == ',') {
				/* Close the gap, then rescan from the same spot. */
				memmove(p, p + len + 1, strlen(p + len + 1) + 1);
				continue;
			}

			/* Last option: chop it, including a preceding comma. */
			if (p != opts)
				p[-1] = '\0';
			else
				*p = '\0';
			break;
		}
	}
}
2091
4d5b72a1 2092static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2093 const char *path,
2094 const struct lxc_rootfs *rootfs,
0fd73091 2095 const char *lxc_name, const char *lxc_path)
0ad19a3f 2096{
7a76eeaa 2097 __do_free char *p1 = NULL;
3b7e332f 2098 int ret;
7a76eeaa 2099 char *p2;
911324ef 2100
eed95eb0 2101 if (strnequal(mntent->mnt_type, "overlay", 7)) {
749f98d9 2102 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2103 if (ret < 0)
2104 return -1;
2105 }
6e46cc0d 2106
34cfffb3 2107 if (hasmntopt(mntent, "create=dir")) {
749f98d9 2108 ret = mkdir_p(path, 0755);
55022530
CB
2109 if (ret < 0 && errno != EEXIST)
2110 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
34cfffb3
SG
2111 }
2112
0fd73091
CB
2113 if (!hasmntopt(mntent, "create=file"))
2114 return 0;
749f98d9 2115
0fd73091
CB
2116 ret = access(path, F_OK);
2117 if (ret == 0)
2118 return 0;
749f98d9 2119
0fd73091
CB
2120 p1 = strdup(path);
2121 if (!p1)
2122 return -1;
749f98d9 2123
0fd73091 2124 p2 = dirname(p1);
749f98d9 2125
0fd73091 2126 ret = mkdir_p(p2, 0755);
55022530
CB
2127 if (ret < 0 && errno != EEXIST)
2128 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
749f98d9 2129
3b7e332f
CB
2130 ret = mknod(path, S_IFREG | 0000, 0);
2131 if (ret < 0 && errno != EEXIST)
2132 return -errno;
0fd73091 2133
749f98d9 2134 return 0;
4d5b72a1
NC
2135}
2136
ec50007f
CB
2137/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2138 * without a rootfs. */
db4aba38 2139static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2140 const char *path,
2141 const struct lxc_rootfs *rootfs,
2142 const char *lxc_name,
2143 const char *lxc_path)
4d5b72a1 2144{
fd214f37 2145 __do_free char *mntdata = NULL;
a08bfbe3
CB
2146 unsigned long mntflags = 0, pflags = 0;
2147 char *rootfs_path = NULL;
d8b712bc 2148 int ret;
181437fd 2149 bool dev, optional, relative;
d8b712bc
CB
2150
2151 optional = hasmntopt(mntent, "optional") != NULL;
2152 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2153 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2154
ec50007f
CB
2155 if (rootfs && rootfs->path)
2156 rootfs_path = rootfs->mount;
2157
d8b712bc
CB
2158 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2159 lxc_path);
2160 if (ret < 0) {
2161 if (optional)
2162 return 0;
608e3567 2163
d8b712bc
CB
2164 return -1;
2165 }
4e4ca161
SH
2166 cull_mntent_opt(mntent);
2167
d840039e
YT
2168 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2169 if (ret < 0)
2170 return -1;
2171
d8b712bc
CB
2172 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2173 if (ret < 0)
a08bfbe3 2174 return ret;
a17b1e65 2175
6e46cc0d 2176 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2177 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2178
911324ef
DL
2179 return ret;
2180}
2181
8183f09e
CB
2182static inline int mount_entry_on_systemfs(struct lxc_rootfs *rootfs,
2183 struct mntent *mntent)
db4aba38 2184{
1433c9f9
CB
2185 int ret;
2186
2187 /* For containers created without a rootfs all mounts are treated as
07667a6a
CB
2188 * absolute paths starting at / on the host.
2189 */
1433c9f9 2190 if (mntent->mnt_dir[0] != '/')
9bcde680 2191 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/%s", mntent->mnt_dir);
1433c9f9 2192 else
9bcde680
CB
2193 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s", mntent->mnt_dir);
2194 if (ret < 0)
1433c9f9 2195 return -1;
1433c9f9 2196
8183f09e 2197 return mount_entry_on_generic(mntent, rootfs->buf, NULL, NULL, NULL);
db4aba38
NC
2198}
2199
4e4ca161 2200static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
9c0fd29a 2201 struct lxc_rootfs *rootfs,
0a2dddd4
CB
2202 const char *lxc_name,
2203 const char *lxc_path)
911324ef 2204{
bdd2b34c 2205 int offset;
013bd428 2206 char *aux;
67e571de 2207 const char *lxcpath;
bdd2b34c 2208 int ret = 0;
0ad19a3f 2209
593e8478 2210 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2211 if (!lxcpath)
2a59a681 2212 return -1;
2a59a681 2213
bdd2b34c
CB
2214 /* If rootfs->path is a blockdev path, allow container fstab to use
2215 * <lxcpath>/<name>/rootfs" as the target prefix.
2216 */
9bcde680
CB
2217 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/%s/rootfs", lxcpath, lxc_name);
2218 if (ret < 0)
80a881b2
SH
2219 goto skipvarlib;
2220
9c0fd29a 2221 aux = strstr(mntent->mnt_dir, rootfs->buf);
80a881b2 2222 if (aux) {
9c0fd29a 2223 offset = strlen(rootfs->buf);
80a881b2
SH
2224 goto skipabs;
2225 }
2226
2227skipvarlib:
013bd428 2228 aux = strstr(mntent->mnt_dir, rootfs->path);
55022530
CB
2229 if (!aux)
2230 return log_warn(ret, "Ignoring mount point \"%s\"", mntent->mnt_dir);
80a881b2
SH
2231 offset = strlen(rootfs->path);
2232
2233skipabs:
9bcde680
CB
2234 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/%s", rootfs->mount, aux + offset);
2235 if (ret < 0)
a17b1e65 2236 return -1;
a17b1e65 2237
9c0fd29a 2238 return mount_entry_on_generic(mntent, rootfs->buf, rootfs, lxc_name, lxc_path);
911324ef 2239}
d330fe7b 2240
4e4ca161 2241static int mount_entry_on_relative_rootfs(struct mntent *mntent,
4806d3b9 2242 struct lxc_rootfs *rootfs,
0a2dddd4
CB
2243 const char *lxc_name,
2244 const char *lxc_path)
911324ef 2245{
911324ef 2246 int ret;
d330fe7b 2247
34cfffb3 2248 /* relative to root mount point */
9bcde680
CB
2249 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/%s", rootfs->mount, mntent->mnt_dir);
2250 if (ret < 0)
9ba8130c 2251 return -1;
911324ef 2252
4806d3b9 2253 return mount_entry_on_generic(mntent, rootfs->buf, rootfs, lxc_name, lxc_path);
911324ef
DL
2254}
2255
8183f09e 2256static int mount_file_entries(struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2257 const char *lxc_name, const char *lxc_path)
911324ef 2258{
9d03d857 2259 char buf[PATH_MAX];
0fd73091 2260 struct mntent mntent;
e76b8764 2261
aaf901be 2262 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
9d03d857
CB
2263 int ret;
2264
1ae3c19f 2265 if (!rootfs->path)
8183f09e 2266 ret = mount_entry_on_systemfs(rootfs, &mntent);
1ae3c19f
CB
2267 else if (mntent.mnt_dir[0] != '/')
2268 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2269 lxc_name, lxc_path);
2270 else
2271 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
9d03d857 2272 lxc_name, lxc_path);
1ae3c19f
CB
2273 if (ret < 0)
2274 return -1;
0ad19a3f 2275 }
cd54d859 2276
55022530
CB
2277 if (!feof(file) || ferror(file))
2278 return log_error(-1, "Failed to parse mount entries");
9d03d857
CB
2279
2280 return 0;
e7938e9e
MN
2281}
2282
55022530
CB
/*
 * Cleanup handler for mount table streams: calls endmntent() on *f unless it
 * is NULL. Attached to a variable through the __do_endmntent attribute macro
 * below.
 */
static inline void __auto_endmntent__(FILE **f)
{
	FILE *stream = *f;

	if (!stream)
		return;

	endmntent(stream);
}

#define __do_endmntent __attribute__((__cleanup__(__auto_endmntent__)))
2290
48e5dcc8 2291static int setup_mount_fstab(struct lxc_rootfs *rootfs, const char *fstab,
8183f09e 2292 const char *lxc_name, const char *lxc_path)
e7938e9e 2293{
55022530 2294 __do_endmntent FILE *f = NULL;
e7938e9e
MN
2295 int ret;
2296
2297 if (!fstab)
2298 return 0;
2299
55022530
CB
2300 f = setmntent(fstab, "re");
2301 if (!f)
2302 return log_error_errno(-1, errno, "Failed to open \"%s\"", fstab);
e7938e9e 2303
a7c6e830 2304 ret = mount_file_entries(rootfs, f, lxc_name, lxc_path);
42dff448
CB
2305 if (ret < 0)
2306 ERROR("Failed to set up mount entries");
e7938e9e 2307
0ad19a3f 2308 return ret;
2309}
2310
1800f924
WB
/*
 * Nested containers can only mount /proc and /sys themselves if they can see
 * "pure" proc and sysfs mount points with nothing (such as lxcfs) mounted on
 * top of them.
 * To make that possible, proc and sysfs are provided in /dev/.lxc/{proc,sys}
 * and an apparmor rule denies access to them. This is mostly a convenience:
 * the container's root user can mount both file systems anyway and therefore
 * has access to them, but a non-root user in the container should not gain
 * access as a side effect unless that is explicitly allowed.
 */
static const char nesting_helpers[] =
	"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
	"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
1800f924
WB
2324
2325FILE *make_anonymous_mount_file(struct lxc_list *mount,
2326 bool include_nesting_helpers)
e7938e9e 2327{
f62cf1d4 2328 __do_close int fd = -EBADF;
4110345b 2329 FILE *f;
5ef5c9a3 2330 int ret;
e7938e9e 2331 char *mount_entry;
5ef5c9a3 2332 struct lxc_list *iterator;
5ef5c9a3 2333
0fd73091 2334 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2335 if (fd < 0) {
a324e7eb
CB
2336 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2337
5ef5c9a3
CB
2338 if (errno != ENOSYS)
2339 return NULL;
a324e7eb
CB
2340
2341 fd = lxc_make_tmpfile(template, true);
55022530
CB
2342 if (fd < 0)
2343 return log_error_errno(NULL, errno, "Could not create temporary mount file");
0fd73091 2344
6bd04140 2345 TRACE("Created temporary mount file");
5ef5c9a3 2346 }
e7938e9e 2347
0fd73091
CB
2348 lxc_list_for_each (iterator, mount) {
2349 size_t len;
2350
e7938e9e 2351 mount_entry = iterator->elem;
0fd73091 2352 len = strlen(mount_entry);
5ef5c9a3 2353
489f39be 2354 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091 2355 if (ret != len)
79bcf5ee 2356 return NULL;
0fd73091 2357
489f39be 2358 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091 2359 if (ret != 1)
79bcf5ee 2360 return NULL;
e7938e9e
MN
2361 }
2362
1800f924
WB
2363 if (include_nesting_helpers) {
2364 ret = lxc_write_nointr(fd, nesting_helpers,
6333c915
CB
2365 STRARRAYLEN(nesting_helpers));
2366 if (ret != STRARRAYLEN(nesting_helpers))
79bcf5ee 2367 return NULL;
1800f924
WB
2368 }
2369
0fd73091
CB
2370 ret = lseek(fd, 0, SEEK_SET);
2371 if (ret < 0)
79bcf5ee 2372 return NULL;
0fd73091 2373
4110345b
CB
2374 f = fdopen(fd, "re+");
2375 if (f)
2376 move_fd(fd); /* Transfer ownership of fd. */
2377 return f;
9fc7f8c0
TA
2378}
2379
06749971 2380static int setup_mount_entries(const struct lxc_conf *conf,
48e5dcc8
CB
2381 struct lxc_rootfs *rootfs, struct lxc_list *mount,
2382 const char *lxc_name, const char *lxc_path)
9fc7f8c0 2383{
c85ced65 2384 __do_fclose FILE *f = NULL;
9fc7f8c0 2385
1800f924 2386 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
19b5d755 2387 if (!f)
9fc7f8c0 2388 return -1;
e7938e9e 2389
a7c6e830 2390 return mount_file_entries(rootfs, f, lxc_name, lxc_path);
e7938e9e
MN
2391}
2392
bab88e68
CS
2393static int parse_cap(const char *cap)
2394{
84760c11 2395 size_t i;
2396 int capid = -1;
0fd73091
CB
2397 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2398 char *ptr = NULL;
bab88e68 2399
71528742 2400 if (strequal(cap, "none"))
7035407c
DE
2401 return -2;
2402
8560cd36 2403 for (i = 0; i < end; i++) {
71528742 2404 if (!strequal(cap, caps_opt[i].name))
bab88e68
CS
2405 continue;
2406
2407 capid = caps_opt[i].value;
2408 break;
2409 }
2410
2411 if (capid < 0) {
0fd73091
CB
2412 /* Try to see if it's numeric, so the user may specify
2413 * capabilities that the running kernel knows about but we
2414 * don't
2415 */
bab88e68
CS
2416 errno = 0;
2417 capid = strtol(cap, &ptr, 10);
2418 if (!ptr || *ptr != '\0' || errno != 0)
2419 /* not a valid number */
2420 capid = -1;
2421 else if (capid > lxc_caps_last_cap())
2422 /* we have a number but it's not a valid
2423 * capability */
2424 capid = -1;
2425 }
2426
2427 return capid;
2428}
2429
0769b82a
CS
2430int in_caplist(int cap, struct lxc_list *caps)
2431{
0769b82a 2432 int capid;
0fd73091 2433 struct lxc_list *iterator;
0769b82a 2434
0fd73091 2435 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2436 capid = parse_cap(iterator->elem);
2437 if (capid == cap)
2438 return 1;
2439 }
2440
2441 return 0;
2442}
2443
81810dd1
DL
2444static int setup_caps(struct lxc_list *caps)
2445{
bab88e68 2446 int capid;
0fd73091
CB
2447 char *drop_entry;
2448 struct lxc_list *iterator;
81810dd1 2449
0fd73091
CB
2450 lxc_list_for_each (iterator, caps) {
2451 int ret;
81810dd1
DL
2452
2453 drop_entry = iterator->elem;
2454
bab88e68 2455 capid = parse_cap(drop_entry);
55022530
CB
2456 if (capid < 0)
2457 return log_error(-1, "unknown capability %s", drop_entry);
81810dd1 2458
b81689a1
CB
2459 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2460 prctl_arg(0), prctl_arg(0));
55022530
CB
2461 if (ret < 0)
2462 return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
0fd73091 2463 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2464 }
2465
0fd73091 2466 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2467 return 0;
2468}
2469
2470static int dropcaps_except(struct lxc_list *caps)
2471{
2f443e88 2472 __do_free int *caplist = NULL;
0fd73091 2473 int i, capid, numcaps;
1fb86a7c 2474 char *keep_entry;
0fd73091 2475 struct lxc_list *iterator;
1fb86a7c 2476
0fd73091 2477 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2478 if (numcaps <= 0 || numcaps > 200)
2479 return -1;
0fd73091 2480 TRACE("Found %d capabilities", numcaps);
2caf9a97 2481
1a0e70ac 2482 /* caplist[i] is 1 if we keep capability i */
2f443e88 2483 caplist = must_realloc(NULL, numcaps * sizeof(int));
1fb86a7c
SH
2484 memset(caplist, 0, numcaps * sizeof(int));
2485
0fd73091 2486 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2487 keep_entry = iterator->elem;
2488
bab88e68 2489 capid = parse_cap(keep_entry);
7035407c
DE
2490 if (capid == -2)
2491 continue;
2492
55022530
CB
2493 if (capid < 0)
2494 return log_error(-1, "Unknown capability %s", keep_entry);
1fb86a7c 2495
0fd73091 2496 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2497 caplist[capid] = 1;
2498 }
0fd73091
CB
2499
2500 for (i = 0; i < numcaps; i++) {
2501 int ret;
2502
1fb86a7c
SH
2503 if (caplist[i])
2504 continue;
0fd73091 2505
b81689a1
CB
2506 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2507 prctl_arg(0), prctl_arg(0));
55022530
CB
2508 if (ret < 0)
2509 return log_error_errno(-1, errno, "Failed to remove capability %d", i);
1fb86a7c
SH
2510 }
2511
0fd73091 2512 DEBUG("Capabilities have been setup");
81810dd1
DL
2513 return 0;
2514}
2515
0fd73091
CB
2516static int parse_resource(const char *res)
2517{
2518 int ret;
c6d09e15
WB
2519 size_t i;
2520 int resid = -1;
2521
0fd73091 2522 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
71528742 2523 if (strequal(res, limit_opt[i].name))
c6d09e15 2524 return limit_opt[i].value;
c6d09e15 2525
0fd73091 2526 /* Try to see if it's numeric, so the user may specify
c6d09e15 2527 * resources that the running kernel knows about but
0fd73091
CB
2528 * we don't.
2529 */
2530 ret = lxc_safe_int(res, &resid);
2531 if (ret < 0)
2532 return -1;
2533
2534 return resid;
c6d09e15
WB
2535}
2536
0fd73091
CB
2537int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2538{
2539 int resid;
c6d09e15
WB
2540 struct lxc_list *it;
2541 struct lxc_limit *lim;
c6d09e15 2542
0fd73091 2543 lxc_list_for_each (it, limits) {
c6d09e15
WB
2544 lim = it->elem;
2545
2546 resid = parse_resource(lim->resource);
55022530
CB
2547 if (resid < 0)
2548 return log_error(-1, "Unknown resource %s", lim->resource);
c6d09e15 2549
f48b5fd8 2550#if HAVE_PRLIMIT || HAVE_PRLIMIT64
55022530
CB
2551 if (prlimit(pid, resid, &lim->limit, NULL) != 0)
2552 return log_error_errno(-1, errno, "Failed to set limit %s", lim->resource);
2de12765
CB
2553
2554 TRACE("Setup \"%s\" limit", lim->resource);
f48b5fd8 2555#else
55022530 2556 return log_error(-1, "Cannot set limit \"%s\" as prlimit is missing", lim->resource);
f48b5fd8 2557#endif
c6d09e15 2558 }
0fd73091 2559
c6d09e15
WB
2560 return 0;
2561}
2562
7edd0540
L
2563int setup_sysctl_parameters(struct lxc_list *sysctls)
2564{
e6f76452 2565 __do_free char *tmp = NULL;
7edd0540
L
2566 struct lxc_list *it;
2567 struct lxc_sysctl *elem;
0fd73091 2568 int ret = 0;
6b5a54cd 2569 char filename[PATH_MAX] = {0};
7edd0540 2570
0fd73091 2571 lxc_list_for_each (it, sysctls) {
7edd0540
L
2572 elem = it->elem;
2573 tmp = lxc_string_replace(".", "/", elem->key);
55022530
CB
2574 if (!tmp)
2575 return log_error(-1, "Failed to replace key %s", elem->key);
7edd0540 2576
9bcde680
CB
2577 ret = strnprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2578 if (ret < 0)
55022530 2579 return log_error(-1, "Error setting up sysctl parameters path");
7edd0540 2580
0fd73091 2581 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2582 strlen(elem->value), false, 0666);
55022530
CB
2583 if (ret < 0)
2584 return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
2585 elem->key, elem->value);
7edd0540 2586 }
0fd73091 2587
7edd0540
L
2588 return 0;
2589}
2590
61d7a733
YT
2591int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2592{
0c669152 2593 __do_free char *tmp = NULL;
61d7a733
YT
2594 struct lxc_list *it;
2595 struct lxc_proc *elem;
0fd73091 2596 int ret = 0;
6b5a54cd 2597 char filename[PATH_MAX] = {0};
61d7a733 2598
0fd73091 2599 lxc_list_for_each (it, procs) {
61d7a733
YT
2600 elem = it->elem;
2601 tmp = lxc_string_replace(".", "/", elem->filename);
55022530
CB
2602 if (!tmp)
2603 return log_error(-1, "Failed to replace key %s", elem->filename);
61d7a733 2604
9bcde680
CB
2605 ret = strnprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2606 if (ret < 0)
55022530 2607 return log_error(-1, "Error setting up proc filesystem path");
61d7a733 2608
0fd73091 2609 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2610 strlen(elem->value), false, 0666);
55022530
CB
2611 if (ret < 0)
2612 return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s", elem->filename, elem->value);
61d7a733 2613 }
0fd73091 2614
61d7a733
YT
2615 return 0;
2616}
2617
ae9242c8
SH
2618static char *default_rootfs_mount = LXCROOTFSMOUNT;
2619
7b379ab3 2620struct lxc_conf *lxc_conf_init(void)
089cd8b8 2621{
26ddeedd 2622 int i;
0fd73091 2623 struct lxc_conf *new;
7b379ab3 2624
13277ec4 2625 new = malloc(sizeof(*new));
0fd73091 2626 if (!new)
7b379ab3 2627 return NULL;
7b379ab3
MN
2628 memset(new, 0, sizeof(*new));
2629
4b73005c 2630 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2631 new->personality = -1;
124fa0a8 2632 new->autodev = 1;
3a784510 2633 new->console.buffer_size = 0;
596a818d
DE
2634 new->console.log_path = NULL;
2635 new->console.log_fd = -1;
861813e5 2636 new->console.log_size = 0;
28a4b0e5 2637 new->console.path = NULL;
63376d7d 2638 new->console.peer = -1;
fb87aa6a 2639 new->console.proxy.busy = -1;
36a94ce8 2640 new->console.proxy.ptx = -1;
41808e20 2641 new->console.proxy.pty = -1;
36a94ce8 2642 new->console.ptx = -1;
41808e20 2643 new->console.pty = -1;
63376d7d 2644 new->console.name[0] = '\0';
732375f5 2645 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2646 new->maincmd_fd = -1;
258f8051 2647 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2648 new->nbd_idx = -1;
54c30e29 2649 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2650 if (!new->rootfs.mount) {
53f3f048
SH
2651 free(new);
2652 return NULL;
2653 }
6e54330c 2654 new->rootfs.managed = true;
ea57e424 2655 new->rootfs.dfd_mnt = -EBADF;
a5a08920 2656 new->rootfs.dfd_dev = -EBADF;
ea11a215 2657 new->rootfs.dfd_host = -EBADF;
79ff643d 2658 new->rootfs.fd_path_pin = -EBADF;
858377e4 2659 new->logfd = -1;
7b379ab3 2660 lxc_list_init(&new->cgroup);
54860ed0 2661 lxc_list_init(&new->cgroup2);
4bfb655e 2662 lxc_list_init(&new->devices);
7b379ab3
MN
2663 lxc_list_init(&new->network);
2664 lxc_list_init(&new->mount_list);
81810dd1 2665 lxc_list_init(&new->caps);
1fb86a7c 2666 lxc_list_init(&new->keepcaps);
f6d3e3e4 2667 lxc_list_init(&new->id_map);
46ad64ab
CB
2668 new->root_nsuid_map = NULL;
2669 new->root_nsgid_map = NULL;
f979ac15 2670 lxc_list_init(&new->includes);
4184c3e1 2671 lxc_list_init(&new->aliens);
7c661726 2672 lxc_list_init(&new->environment);
c6d09e15 2673 lxc_list_init(&new->limits);
7edd0540 2674 lxc_list_init(&new->sysctls);
61d7a733 2675 lxc_list_init(&new->procs);
44ae0fb6 2676 new->hooks_version = 0;
28d9e29e 2677 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2678 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2679 lxc_list_init(&new->groups);
d39b10eb 2680 lxc_list_init(&new->state_clients);
fe4de9a6 2681 new->lsm_aa_profile = NULL;
1800f924 2682 lxc_list_init(&new->lsm_aa_raw);
fe4de9a6 2683 new->lsm_se_context = NULL;
4fef78bc 2684 new->lsm_se_keyring_context = NULL;
8f818a84 2685 new->keyring_disable_session = false;
952b5031 2686 new->transient_procfs_mnt = false;
7a41e857
LT
2687 new->shmount.path_host = NULL;
2688 new->shmount.path_cont = NULL;
7b379ab3 2689
72bb04e4
PT
2690 /* if running in a new user namespace, init and COMMAND
2691 * default to running as UID/GID 0 when using lxc-execute */
2692 new->init_uid = 0;
2693 new->init_gid = 0;
c71f64cb 2694 memset(&new->init_groups, 0, sizeof(lxc_groups_t));
43654d34 2695 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2696 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
70fd7fc9 2697 memset(&new->timens, 0, sizeof(struct timens_offsets));
c3e3c21a 2698 seccomp_conf_init(new);
72bb04e4 2699
7b379ab3 2700 return new;
089cd8b8
DL
2701}
2702
344c9d81 2703int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2704 size_t buf_size)
f6d3e3e4 2705{
f62cf1d4 2706 __do_close int fd = -EBADF;
76bcd422 2707 int ret;
6b5a54cd 2708 char path[PATH_MAX];
f6d3e3e4 2709
a19b974f 2710 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
f62cf1d4 2711 __do_close int setgroups_fd = -EBADF;
a19b974f 2712
9bcde680
CB
2713 ret = strnprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
2714 if (ret < 0)
a19b974f 2715 return -E2BIG;
a19b974f 2716
76bcd422 2717 setgroups_fd = open(path, O_WRONLY);
55022530
CB
2718 if (setgroups_fd < 0 && errno != ENOENT)
2719 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
a19b974f 2720
76bcd422
CB
2721 if (setgroups_fd >= 0) {
2722 ret = lxc_write_nointr(setgroups_fd, "deny\n",
2723 STRLITERALLEN("deny\n"));
55022530
CB
2724 if (ret != STRLITERALLEN("deny\n"))
2725 return log_error_errno(-1, errno, "Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
395b1a3e 2726 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2727 }
a19b974f
CB
2728 }
2729
9bcde680 2730 ret = strnprintf(path, sizeof(path), "/proc/%d/%cid_map", pid,
29053180 2731 idtype == ID_TYPE_UID ? 'u' : 'g');
9bcde680 2732 if (ret < 0)
f6d3e3e4 2733 return -E2BIG;
29053180 2734
55022530
CB
2735 fd = open(path, O_WRONLY | O_CLOEXEC);
2736 if (fd < 0)
2737 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
29053180 2738
29053180 2739 ret = lxc_write_nointr(fd, buf, buf_size);
55022530
CB
2740 if (ret != buf_size)
2741 return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
2742 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2743
2744 return 0;
f6d3e3e4
SH
2745}
2746
6e50e704
CB
2747/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2748 *
2749 * @return 1 if functional binary was found
2750 * @return 0 if binary exists but is lacking privilege
2751 * @return -ENOENT if binary does not exist
2752 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2753 */
df6a2945
CB
2754static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2755{
48411df2 2756 __do_free char *path = NULL;
df6a2945
CB
2757 int ret;
2758 struct stat st;
df6a2945 2759
3275932b 2760 errno = EINVAL;
6e50e704 2761 if (cap != CAP_SETUID && cap != CAP_SETGID)
3275932b 2762 return -1;
6e50e704 2763
3275932b 2764 errno = ENOENT;
df6a2945
CB
2765 path = on_path(binary, NULL);
2766 if (!path)
3275932b 2767 return -1;
df6a2945
CB
2768
2769 ret = stat(path, &st);
3275932b
CB
2770 if (ret < 0)
2771 return -1;
df6a2945
CB
2772
2773 /* Check if the binary is setuid. */
55022530
CB
2774 if (st.st_mode & S_ISUID)
2775 return log_debug(1, "The binary \"%s\" does have the setuid bit set", path);
df6a2945 2776
0fd73091 2777#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2778 /* Check if it has the CAP_SETUID capability. */
2779 if ((cap & CAP_SETUID) &&
2780 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
55022530
CB
2781 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED))
2782 return log_debug(1, "The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
df6a2945
CB
2783
2784 /* Check if it has the CAP_SETGID capability. */
2785 if ((cap & CAP_SETGID) &&
2786 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
55022530
CB
2787 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED))
2788 return log_debug(1, "The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
0fd73091 2789#else
69924fff
CB
2790 /* If we cannot check for file capabilities we need to give the benefit
2791 * of the doubt. Otherwise we might fail even though all the necessary
2792 * file capabilities are set.
2793 */
55022530 2794 DEBUG("Cannot check for file capabilities as full capability support is missing. Manual intervention needed");
0fd73091 2795#endif
df6a2945 2796
3275932b 2797 return 1;
df6a2945
CB
2798}
2799
/* Execute the command line passed in @args through "/bin/sh -c". */
static int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	/* Only reached when execl() failed. */
	return -1;
}
2805
f6d3e3e4
SH
2806int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2807{
0fd73091 2808 int fill, left;
986ef930 2809 char u_or_g;
4bc3b759 2810 char *pos;
6b5a54cd 2811 char cmd_output[PATH_MAX];
0fd73091
CB
2812 struct id_map *map;
2813 struct lxc_list *iterator;
2814 enum idtype type;
0fd73091 2815 int ret = 0, gidmap = 0, uidmap = 0;
c6ba8981
CB
2816 char mapbuf[STRLITERALLEN("new@idmap") + STRLITERALLEN(" ") +
2817 INTTYPE_TO_STRLEN(pid_t) + STRLITERALLEN(" ") +
2818 LXC_IDMAPLEN] = {0};
0fd73091 2819 bool had_entry = false, use_shadow = false;
c724025c
JC
2820 int hostuid, hostgid;
2821
2822 hostuid = geteuid();
2823 hostgid = getegid();
df6a2945
CB
2824
2825 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2826 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2827 * will protected it by preventing another user from being handed the
2828 * range by shadow.
2829 */
df6a2945 2830 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2831 if (uidmap == -ENOENT)
2832 WARN("newuidmap binary is missing");
2833 else if (!uidmap)
2834 WARN("newuidmap is lacking necessary privileges");
2835
df6a2945 2836 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2837 if (gidmap == -ENOENT)
2838 WARN("newgidmap binary is missing");
2839 else if (!gidmap)
2840 WARN("newgidmap is lacking necessary privileges");
2841
df6a2945 2842 if (uidmap > 0 && gidmap > 0) {
0fd73091 2843 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2844 use_shadow = true;
df6a2945 2845 } else {
99d43365
CB
2846 /* In case unprivileged users run application containers via
2847 * execute() or a start*() there are valid cases where they may
2848 * only want to map their own {g,u}id. Let's not block them from
2849 * doing so by requiring geteuid() == 0.
2850 */
2851 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2852 "write directly with euid %d", hostuid);
2853 }
2854
2855 /* Check if we really need to use newuidmap and newgidmap.
2856 * If the user is only remapping his own {g,u}id, we don't need it.
2857 */
2858 if (use_shadow && lxc_list_len(idmap) == 2) {
2859 use_shadow = false;
2860 lxc_list_for_each(iterator, idmap) {
2861 map = iterator->elem;
2862 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2863 map->nsid == hostuid && map->hostid == hostuid)
2864 continue;
2865 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2866 map->nsid == hostgid && map->hostid == hostgid)
2867 continue;
2868 use_shadow = true;
2869 break;
2870 }
0e6e3a41 2871 }
251d0d2a 2872
986ef930
CB
2873 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2874 type++, u_or_g = 'g') {
2875 pos = mapbuf;
2876
0e6e3a41 2877 if (use_shadow)
986ef930 2878 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2879
cf3ef16d 2880 lxc_list_for_each(iterator, idmap) {
251d0d2a 2881 map = iterator->elem;
cf3ef16d
SH
2882 if (map->idtype != type)
2883 continue;
2884
4bc3b759
CB
2885 had_entry = true;
2886
986ef930 2887 left = LXC_IDMAPLEN - (pos - mapbuf);
9bcde680 2888 fill = strnprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2889 use_shadow ? " " : "", map->nsid,
2890 map->hostid, map->range,
0e6e3a41 2891 use_shadow ? "" : "\n");
55022530
CB
2892 /*
2893 * The kernel only takes <= 4k for writes to
2894 * /proc/<pid>/{g,u}id_map
2895 */
9bcde680 2896 if (fill <= 0)
55022530 2897 return log_error_errno(-1, errno, "Too many %cid mappings defined", u_or_g);
4bc3b759 2898
cf3ef16d 2899 pos += fill;
251d0d2a 2900 }
cf3ef16d 2901 if (!had_entry)
4f7521b4 2902 continue;
cf3ef16d 2903
d85813cd 2904 /* Try to catch the output of new{g,u}idmap to make debugging
986ef930
CB
2905 * easier.
2906 */
2907 if (use_shadow) {
2908 ret = run_command(cmd_output, sizeof(cmd_output),
2909 lxc_map_ids_exec_wrapper,
2910 (void *)mapbuf);
55022530
CB
2911 if (ret < 0)
2912 return log_error(-1, "new%cidmap failed to write mapping \"%s\": %s", u_or_g, cmd_output, mapbuf);
54fbbeb5 2913 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2914 } else {
986ef930 2915 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
55022530
CB
2916 if (ret < 0)
2917 return log_error(-1, "Failed to write mapping: %s", mapbuf);
54fbbeb5 2918 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2919 }
986ef930
CB
2920
2921 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2922 }
251d0d2a 2923
986ef930 2924 return 0;
f6d3e3e4
SH
2925}
2926
234998b4
CB
2927/*
2928 * Return the host uid/gid to which the container root is mapped in val.
0b3a6504 2929 * Return true if id was found, false otherwise.
cf3ef16d 2930 */
234998b4 2931static id_t get_mapped_rootid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2932{
4160c3a0 2933 unsigned nsid;
0fd73091
CB
2934 struct id_map *map;
2935 struct lxc_list *it;
4160c3a0
CB
2936
2937 if (idtype == ID_TYPE_UID)
2938 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2939 else
2940 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 2941
0fd73091 2942 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2943 map = it->elem;
7b50c609 2944 if (map->idtype != idtype)
cf3ef16d 2945 continue;
4160c3a0 2946 if (map->nsid != nsid)
cf3ef16d 2947 continue;
234998b4 2948 return map->hostid;
cf3ef16d 2949 }
4160c3a0 2950
234998b4
CB
2951 if (idtype == ID_TYPE_UID)
2952 return LXC_INVALID_UID;
2953
2954 return LXC_INVALID_GID;
cf3ef16d
SH
2955}
2956
facdf925 2957int mapped_hostid(unsigned id, const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2958{
cf3ef16d 2959 struct id_map *map;
0fd73091
CB
2960 struct lxc_list *it;
2961
2962 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2963 map = it->elem;
2133f58c 2964 if (map->idtype != idtype)
cf3ef16d 2965 continue;
0fd73091 2966
cf3ef16d 2967 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2968 return (id - map->hostid) + map->nsid;
cf3ef16d 2969 }
0fd73091 2970
57d116ab 2971 return -1;
cf3ef16d
SH
2972}
2973
7581a82f 2974int find_unmapped_nsid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2975{
cf3ef16d 2976 struct id_map *map;
0fd73091 2977 struct lxc_list *it;
2133f58c 2978 unsigned int freeid = 0;
0fd73091 2979
cf3ef16d 2980again:
0fd73091 2981 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2982 map = it->elem;
2133f58c 2983 if (map->idtype != idtype)
cf3ef16d 2984 continue;
0fd73091 2985
cf3ef16d
SH
2986 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2987 freeid = map->nsid + map->range;
2988 goto again;
2989 }
2990 }
0fd73091 2991
cf3ef16d
SH
2992 return freeid;
2993}
2994
e1b9d6af
CB
2995/*
2996 * Mount a proc under @rootfs if proc self points to a pid other than
2997 * my own. This is needed to have a known-good proc mount for setting
2998 * up LSMs both at container startup and attach.
2999 *
e1b9d6af
CB
3000 * NOTE: not to be called from inside the container namespace!
3001 */
952b5031 3002static int lxc_transient_proc(struct lxc_rootfs *rootfs)
e1b9d6af 3003{
952b5031
CB
3004 __do_close int fd_proc = -EBADF;
3005 int link_to_pid, link_len, pid_self, ret;
3006 char link[INTTYPE_TO_STRLEN(pid_t) + 1];
e1b9d6af 3007
ea57e424 3008 link_len = readlinkat(rootfs->dfd_mnt, "proc/self", link, sizeof(link));
952b5031 3009 if (link_len < 0) {
ea57e424 3010 ret = mkdirat(rootfs->dfd_mnt, "proc", 0000);
952b5031 3011 if (ret < 0 && errno != EEXIST)
ea57e424 3012 return log_error_errno(-errno, errno, "Failed to create %d(proc)", rootfs->dfd_mnt);
e1b9d6af 3013
952b5031
CB
3014 goto domount;
3015 } else if (link_len >= sizeof(link)) {
3016 return log_error_errno(-EIO, EIO, "Truncated link target");
e1b9d6af 3017 }
952b5031 3018 link[link_len] = '\0';
e1b9d6af 3019
952b5031
CB
3020 pid_self = lxc_raw_getpid();
3021 INFO("Caller's PID is %d; /proc/self points to %s", pid_self, link);
e1b9d6af 3022
952b5031
CB
3023 ret = lxc_safe_int(link, &link_to_pid);
3024 if (ret)
3025 return log_error_errno(-ret, ret, "Failed to parse %s", link);
e1b9d6af 3026
952b5031
CB
3027 /* Correct procfs is already mounted. */
3028 if (link_to_pid == pid_self)
3029 return log_trace(0, "Correct procfs instance mounted");
e1b9d6af 3030
ea57e424 3031 fd_proc = open_at(rootfs->dfd_mnt, "proc", PROTECT_OPATH_DIRECTORY,
952b5031
CB
3032 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3033 if (fd_proc < 0)
3034 return log_error_errno(-errno, errno, "Failed to open transient procfs mountpoint");
e1b9d6af 3035
9bcde680
CB
3036 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/proc/self/fd/%d", fd_proc);
3037 if (ret < 0)
952b5031 3038 return ret_errno(EIO);
e1b9d6af 3039
952b5031 3040 ret = umount2(rootfs->buf, MNT_DETACH);
e1b9d6af 3041 if (ret < 0)
952b5031 3042 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", rootfs->buf);
e1b9d6af
CB
3043
3044domount:
3045 /* rootfs is NULL */
952b5031
CB
3046 if (!rootfs->path) {
3047 ret = mount("proc", rootfs->buf, "proc", 0, NULL);
3048 } else {
ea57e424 3049 ret = safe_mount_beneath_at(rootfs->dfd_mnt, "none", "proc", "proc", 0, NULL);
952b5031 3050 if (ret < 0) {
9bcde680
CB
3051 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/proc", rootfs->path ? rootfs->mount : "");
3052 if (ret < 0)
952b5031
CB
3053 return ret_errno(EIO);
3054
3055 ret = safe_mount("proc", rootfs->buf, "proc", 0, NULL, rootfs->mount);
3056 }
3057 }
e1b9d6af 3058 if (ret < 0)
952b5031 3059 return log_error_errno(-1, errno, "Failed to mount temporary procfs");
e1b9d6af 3060
952b5031 3061 INFO("Created transient procfs mount");
e1b9d6af
CB
3062 return 1;
3063}
3064
943144d9 3065/* NOTE: Must not be called from inside the container namespace! */
59eac805 3066static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3067{
3068 int mounted;
3069
952b5031 3070 mounted = lxc_transient_proc(&conf->rootfs);
5112cd70 3071 if (mounted == -1) {
01958b1f 3072 /* continue only if there is no rootfs */
943144d9 3073 if (conf->rootfs.path)
952b5031 3074 return log_error_errno(-EPERM, EPERM, "Failed to create transient procfs mount");
5112cd70 3075 } else if (mounted == 1) {
952b5031 3076 conf->transient_procfs_mnt = true;
5112cd70 3077 }
943144d9 3078
5112cd70
SH
3079 return 0;
3080}
3081
3082void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3083{
952b5031
CB
3084 if (lxc_conf->transient_procfs_mnt) {
3085 (void)umount2("/proc", MNT_DETACH);
3086 lxc_conf->transient_procfs_mnt = false;
3087 }
5112cd70
SH
3088}
3089
9e61fb1f
CB
3090/* Walk /proc/mounts and change any shared entries to dependent mounts. */
3091void turn_into_dependent_mounts(void)
e995d7a2 3092{
7969675f 3093 __do_free char *line = NULL;
003be47b 3094 __do_fclose FILE *f = NULL;
f62cf1d4 3095 __do_close int memfd = -EBADF, mntinfo_fd = -EBADF;
e995d7a2 3096 size_t len = 0;
a39fc34b
CB
3097 ssize_t copied;
3098 int ret;
e995d7a2 3099
6a49f05e 3100 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
3101 if (mntinfo_fd < 0) {
3102 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 3103 return;
fea3b91d 3104 }
6a49f05e
CB
3105
3106 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3107 if (memfd < 0) {
3108 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3109
3110 if (errno != ENOSYS) {
fea3b91d 3111 SYSERROR("Failed to create temporary in-memory file");
6a49f05e
CB
3112 return;
3113 }
3114
3115 memfd = lxc_make_tmpfile(template, true);
fea3b91d 3116 if (memfd < 0) {
fea3b91d
DJ
3117 WARN("Failed to create temporary file");
3118 return;
3119 }
6a49f05e
CB
3120 }
3121
a39fc34b 3122 copied = fd_to_fd(mntinfo_fd, memfd);
6a49f05e 3123 if (copied < 0) {
fea3b91d 3124 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
3125 return;
3126 }
6a49f05e 3127
6a49f05e
CB
3128 ret = lseek(memfd, 0, SEEK_SET);
3129 if (ret < 0) {
fea3b91d 3130 SYSERROR("Failed to reset file descriptor offset");
6a49f05e
CB
3131 return;
3132 }
3133
4110345b 3134 f = fdopen(memfd, "re");
e995d7a2 3135 if (!f) {
003be47b 3136 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark all shared. Continuing");
e995d7a2
SH
3137 return;
3138 }
3139
003be47b
CB
3140 /*
3141 * After a successful fdopen() memfd will be closed when calling
3142 * fclose(f). Calling close(memfd) afterwards is undefined.
3143 */
3144 move_fd(memfd);
3145
e995d7a2 3146 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3147 char *opts, *target;
3148
e995d7a2
SH
3149 target = get_field(line, 4);
3150 if (!target)
3151 continue;
0fd73091 3152
e995d7a2
SH
3153 opts = get_field(target, 2);
3154 if (!opts)
3155 continue;
0fd73091 3156
e995d7a2
SH
3157 null_endofword(opts);
3158 if (!strstr(opts, "shared"))
3159 continue;
0fd73091 3160
e995d7a2 3161 null_endofword(target);
0fd73091
CB
3162 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3163 if (ret < 0) {
9e61fb1f 3164 SYSERROR("Failed to recursively turn old root mount tree into dependent mount. Continuing...");
6a49f05e 3165 continue;
e995d7a2 3166 }
9e61fb1f 3167 TRACE("Recursively turned old root mount tree into dependent mount");
e995d7a2 3168 }
9e61fb1f 3169 TRACE("Turned all mount table entries into dependent mount");
e995d7a2
SH
3170}
3171
794248d0 3172static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3173{
3174 int ret;
794248d0
CB
3175 char *p;
3176 char path[PATH_MAX], destpath[PATH_MAX];
3177 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3178
3179 /* If init exists in the container, don't bind mount a static one */
3180 p = choose_init(conf->rootfs.mount);
3181 if (p) {
22f835ba 3182 __do_free char *old = p;
41089848
TA
3183
3184 p = strdup(old + strlen(conf->rootfs.mount));
41089848
TA
3185 if (!p)
3186 return -ENOMEM;
3187
3188 INFO("Found existing init at \"%s\"", p);
3189 goto out;
9d9c111c 3190 }
2322903b 3191
9bcde680
CB
3192 ret = strnprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
3193 if (ret < 0)
8353b4c9 3194 return -1;
2322903b 3195
55022530
CB
3196 if (!file_exists(path))
3197 return log_error_errno(-1, errno, "The file \"%s\" does not exist on host", path);
2322903b 3198
9bcde680
CB
3199 ret = strnprintf(destpath, sizeof(path), "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
3200 if (ret < 0)
8353b4c9 3201 return -1;
2322903b
SH
3202
3203 if (!file_exists(destpath)) {
794248d0 3204 ret = mknod(destpath, S_IFREG | 0000, 0);
55022530
CB
3205 if (ret < 0 && errno != EEXIST)
3206 return log_error_errno(-1, errno, "Failed to create dummy \"%s\" file as bind mount target", destpath);
2322903b
SH
3207 }
3208
592fd47a 3209 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
55022530
CB
3210 if (ret < 0)
3211 return log_error_errno(-1, errno, "Failed to bind mount lxc.init.static into container");
8353b4c9 3212
794248d0
CB
3213 p = strdup(destpath + strlen(conf->rootfs.mount));
3214 if (!p)
3215 return -ENOMEM;
794248d0 3216
8353b4c9 3217 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3218out:
4b5b3a2a 3219 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3220 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3221 return 0;
2322903b
SH
3222}
3223
0fd73091
CB
3224/* This does the work of remounting / if it is shared, calling the container
3225 * pre-mount hooks, and mounting the rootfs.
35120d9c 3226 */
8ce1abc2
CB
3227int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3228 const char *lxcpath)
0ad19a3f 3229{
0fd73091
CB
3230 int ret;
3231
ea11a215
CB
3232 conf->rootfs.dfd_host = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
3233 if (conf->rootfs.dfd_host < 0)
a370f16b
CB
3234 return log_error_errno(-errno, errno, "Failed to open \"/\"");
3235
35120d9c 3236 if (conf->rootfs_setup) {
35120d9c 3237 const char *path = conf->rootfs.mount;
0fd73091
CB
3238
3239 /* The rootfs was set up in another namespace. bind-mount it to
3240 * give us a mount in our own ns so we can pivot_root to it
3241 */
3242 ret = mount(path, path, "rootfs", MS_BIND, NULL);
55022530
CB
3243 if (ret < 0)
3244 return log_error(-1, "Failed to bind mount container / onto itself");
0fd73091 3245
ea57e424
CB
3246 conf->rootfs.dfd_mnt = openat(-EBADF, path, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOCTTY);
3247 if (conf->rootfs.dfd_mnt < 0)
26ea5533
CB
3248 return log_error_errno(-errno, errno, "Failed to open file descriptor for container rootfs");
3249
55022530 3250 return log_trace(0, "Bind mounted container / onto itself");
35120d9c 3251 }
d4ef7c50 3252
9e61fb1f 3253 turn_into_dependent_mounts();
e995d7a2 3254
0fd73091 3255 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
55022530
CB
3256 if (ret < 0)
3257 return log_error(-1, "Failed to run pre-mount hooks");
35120d9c 3258
8ce1abc2 3259 ret = lxc_mount_rootfs(conf);
55022530
CB
3260 if (ret < 0)
3261 return log_error(-1, "Failed to setup rootfs for");
35120d9c
SH
3262
3263 conf->rootfs_setup = true;
3264 return 0;
3265}
3266
1c1c7051
SH
3267static bool verify_start_hooks(struct lxc_conf *conf)
3268{
6b5a54cd 3269 char path[PATH_MAX];
0fd73091
CB
3270 struct lxc_list *it;
3271
3272 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3273 int ret;
0fd73091 3274 char *hookname = it->elem;
1c1c7051 3275
9bcde680 3276 ret = strnprintf(path, sizeof(path), "%s%s",
0fd73091
CB
3277 conf->rootfs.path ? conf->rootfs.mount : "",
3278 hookname);
9bcde680 3279 if (ret < 0)
1c1c7051 3280 return false;
0fd73091 3281
75193660 3282 ret = access(path, X_OK);
55022530
CB
3283 if (ret < 0)
3284 return log_error_errno(false, errno, "Start hook \"%s\" not found in container", hookname);
0fd73091 3285
6a0c909a 3286 return true;
1c1c7051
SH
3287 }
3288
3289 return true;
3290}
3291
4b5b3a2a
TA
3292static bool execveat_supported(void)
3293{
f40988c7 3294 execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
4b5b3a2a
TA
3295 if (errno == ENOSYS)
3296 return false;
3297
3298 return true;
4b5b3a2a
TA
3299}
3300
20502652
CB
3301static int lxc_setup_boot_id(void)
3302{
3303 int ret;
3304 const char *boot_id_path = "/proc/sys/kernel/random/boot_id";
3305 const char *mock_boot_id_path = "/dev/.lxc-boot-id";
3306 lxc_id128_t n;
3307
3308 if (access(boot_id_path, F_OK))
3309 return 0;
3310
3311 memset(&n, 0, sizeof(n));
3312 if (lxc_id128_randomize(&n)) {
3313 SYSERROR("Failed to generate random data for uuid");
3314 return -1;
3315 }
3316
3317 ret = lxc_id128_write(mock_boot_id_path, n);
3318 if (ret < 0) {
3319 SYSERROR("Failed to write uuid to %s", mock_boot_id_path);
3320 return -1;
3321 }
3322
3323 ret = chmod(mock_boot_id_path, 0444);
3324 if (ret < 0) {
3325 SYSERROR("Failed to chown %s", mock_boot_id_path);
3326 (void)unlink(mock_boot_id_path);
3327 return -1;
3328 }
3329
3330 ret = mount(mock_boot_id_path, boot_id_path, NULL, MS_BIND, NULL);
3331 if (ret < 0) {
3332 SYSERROR("Failed to mount %s to %s", mock_boot_id_path,
3333 boot_id_path);
3334 (void)unlink(mock_boot_id_path);
3335 return -1;
3336 }
3337
3338 ret = mount(NULL, boot_id_path, NULL,
3339 (MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOSUID | MS_NOEXEC |
3340 MS_NODEV),
3341 NULL);
3342 if (ret < 0) {
3343 SYSERROR("Failed to remount %s read-only", boot_id_path);
3344 (void)unlink(mock_boot_id_path);
3345 return -1;
3346 }
3347
3348 return 0;
3349}
3350
af04d847 3351static int lxc_setup_keyring(struct lsm_ops *lsm_ops, const struct lxc_conf *conf)
d701d729
CB
3352{
3353 key_serial_t keyring;
3354 int ret = 0;
3355
3356 if (conf->lsm_se_keyring_context)
af04d847 3357 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_keyring_context);
d701d729 3358 else if (conf->lsm_se_context)
af04d847 3359 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_context);
d701d729
CB
3360 if (ret < 0)
3361 return log_error_errno(-1, errno, "Failed to set keyring context");
3362
3363 /*
3364 * Try to allocate a new session keyring for the container to prevent
3365 * information leaks.
3366 */
3367 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
3368 prctl_arg(0), prctl_arg(0), prctl_arg(0));
3369 if (keyring < 0) {
3370 switch (errno) {
3371 case ENOSYS:
3372 DEBUG("The keyctl() syscall is not supported or blocked");
3373 break;
3374 case EACCES:
3375 __fallthrough;
3376 case EPERM:
3377 DEBUG("Failed to access kernel keyring. Continuing...");
3378 break;
3379 default:
3380 SYSERROR("Failed to create kernel keyring");
3381 break;
3382 }
3383 }
3384
3385 return ret;
3386}
3387
3b988b33 3388int lxc_setup(struct lxc_handler *handler)
35120d9c 3389{
2187efd3 3390 int ret;
0fd73091 3391 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3392 struct lxc_conf *lxc_conf = handler->conf;
35120d9c 3393
8ce1abc2 3394 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
55022530
CB
3395 if (ret < 0)
3396 return log_error(-1, "Failed to setup rootfs");
35120d9c 3397
b87ee312 3398 if (handler->nsfd[LXC_NS_UTS] == -EBADF) {
8353b4c9 3399 ret = setup_utsname(lxc_conf->utsname);
55022530
CB
3400 if (ret < 0)
3401 return log_error(-1, "Failed to setup the utsname %s", name);
0ad19a3f 3402 }
3403
8f818a84 3404 if (!lxc_conf->keyring_disable_session) {
d701d729 3405 ret = lxc_setup_keyring(handler->lsm_ops, lxc_conf);
8f818a84 3406 if (ret < 0)
d701d729 3407 return log_error(-1, "Failed to setup container keyring");
8f818a84 3408 }
b25291da 3409
e389f2af
CB
3410 if (handler->ns_clone_flags & CLONE_NEWNET) {
3411 ret = lxc_setup_network_in_child_namespaces(lxc_conf,
3412 &lxc_conf->network);
55022530
CB
3413 if (ret < 0)
3414 return log_error(-1, "Failed to setup network");
0ad19a3f 3415
e389f2af 3416 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
55022530
CB
3417 if (ret < 0)
3418 return log_error(-1, "Failed to send network device names and ifindices to parent");
790255cf
CB
3419 }
3420
bc6928ff 3421 if (lxc_conf->autodev > 0) {
63012bdd 3422 ret = mount_autodev(name, &lxc_conf->rootfs, lxc_conf->autodevtmpfssize, lxcpath);
55022530
CB
3423 if (ret < 0)
3424 return log_error(-1, "Failed to mount \"/dev\"");
c6883f38
SH
3425 }
3426
ea57e424 3427 lxc_conf->rootfs.dfd_dev = open_at(lxc_conf->rootfs.dfd_mnt, "dev",
fdb57ab4
CB
3428 PROTECT_OPATH_DIRECTORY,
3429 PROTECT_LOOKUP_BENEATH_XDEV, 0);
a5a08920 3430 if (lxc_conf->rootfs.dfd_dev < 0 && errno != ENOENT)
953db219
CB
3431 return log_error_errno(-errno, errno, "Failed to open \"/dev\"");
3432
8353b4c9
CB
3433 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3434 * need to wait until other stuff has finished.
368bbc02 3435 */
6d25a524 3436 ret = lxc_mount_auto_mounts(handler, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK);
55022530
CB
3437 if (ret < 0)
3438 return log_error(-1, "Failed to setup first automatic mounts");
368bbc02 3439
48e5dcc8 3440 ret = setup_mount_fstab(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
55022530
CB
3441 if (ret < 0)
3442 return log_error(-1, "Failed to setup mounts");
576f946d 3443
c631115d
FA
3444 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3445 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3446 &lxc_conf->mount_list, name, lxcpath);
55022530
CB
3447 if (ret < 0)
3448 return log_error(-1, "Failed to setup mount entries");
c631115d
FA
3449 }
3450
8353b4c9 3451 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3452 if (execveat_supported()) {
3453 int fd;
f4bea7cc 3454 char path[STRLITERALLEN(SBINDIR) + STRLITERALLEN("/init.lxc.static") + 1];
4b5b3a2a 3455
9bcde680
CB
3456 ret = strnprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
3457 if (ret < 0)
55022530 3458 return log_error(-1, "Path to init.lxc.static too long");
4b5b3a2a 3459
f4bea7cc 3460 fd = open(path, O_NOCTTY | O_NOFOLLOW | O_CLOEXEC | O_PATH);
55022530
CB
3461 if (fd < 0)
3462 return log_error_errno(-1, errno, "Unable to open lxc.init.static");
4b5b3a2a
TA
3463
3464 ((struct execute_args *)handler->data)->init_fd = fd;
3465 ((struct execute_args *)handler->data)->init_path = NULL;
3466 } else {
3467 ret = lxc_execute_bind_init(handler);
55022530
CB
3468 if (ret < 0)
3469 return log_error(-1, "Failed to bind-mount the lxc init system");
8353b4c9
CB
3470 }
3471 }
2322903b 3472
8353b4c9
CB
3473 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3474 * mounted. It is guaranteed to be mounted now either through
3475 * automatically or via fstab entries.
368bbc02 3476 */
6d25a524 3477 ret = lxc_mount_auto_mounts(handler, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK);
55022530
CB
3478 if (ret < 0)
3479 return log_error(-1, "Failed to setup remaining automatic mounts");
368bbc02 3480
8353b4c9 3481 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
55022530
CB
3482 if (ret < 0)
3483 return log_error(-1, "Failed to run mount hooks");
773fb9ca 3484
bc6928ff 3485 if (lxc_conf->autodev > 0) {
8353b4c9 3486 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
55022530
CB
3487 if (ret < 0)
3488 return log_error(-1, "Failed to run autodev hooks");
06749971 3489
8353b4c9 3490 ret = lxc_fill_autodev(&lxc_conf->rootfs);
55022530
CB
3491 if (ret < 0)
3492 return log_error(-1, "Failed to populate \"/dev\"");
91c3830e 3493 }
368bbc02 3494
75193660 3495 /* Make sure any start hooks are in the container */
55022530
CB
3496 if (!verify_start_hooks(lxc_conf))
3497 return log_error(-1, "Failed to verify start hooks");
75193660 3498
cf68ffd9
CB
3499 ret = lxc_create_tmp_proc_mount(lxc_conf);
3500 if (ret < 0)
3501 return log_error(-1, "Failed to \"/proc\" LSMs");
3502
ed8704d0 3503 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
37c74fd1 3504 lxc_conf->ttys.dir);
55022530
CB
3505 if (ret < 0)
3506 return log_error(-1, "Failed to setup console");
6e590161 3507
ed8704d0 3508 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
55022530
CB
3509 if (ret < 0)
3510 return log_error(-1, "Failed to setup \"/dev\" symlinks");
69aa6655 3511
8ce1abc2 3512 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
55022530
CB
3513 if (ret < 0)
3514 return log_error(-1, "Failed to pivot root into rootfs");
ed502555 3515
20502652
CB
3516 /* Setting the boot-id is best-effort for now. */
3517 if (lxc_conf->autodev > 0)
3518 (void)lxc_setup_boot_id();
3519
68f3899e 3520 ret = lxc_setup_devpts_child(handler);
55022530
CB
3521 if (ret < 0)
3522 return log_error(-1, "Failed to setup new devpts instance");
3c26f34e 3523
2187efd3
CB
3524 ret = lxc_create_ttys(handler);
3525 if (ret < 0)
e8bd4e43 3526 return -1;
e8bd4e43 3527
8353b4c9 3528 ret = setup_personality(lxc_conf->personality);
55022530
CB
3529 if (ret < 0)
3530 return log_error(-1, "Failed to set personality");
cccc74b5 3531
8353b4c9
CB
3532 /* Set sysctl value to a path under /proc/sys as determined from the
3533 * key. For e.g. net.ipv4.ip_forward translated to
3534 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3535 */
3536 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3537 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
55022530
CB
3538 if (ret < 0)
3539 return log_error(-1, "Failed to setup sysctl parameters");
7edd0540
L
3540 }
3541
97a8f74f 3542 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
55022530
CB
3543 if (!lxc_list_empty(&lxc_conf->caps))
3544 return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
8353b4c9 3545
55022530
CB
3546 if (dropcaps_except(&lxc_conf->keepcaps))
3547 return log_error(-1, "Failed to keep capabilities");
97a8f74f 3548 } else if (setup_caps(&lxc_conf->caps)) {
55022530 3549 return log_error(-1, "Failed to drop capabilities");
81810dd1
DL
3550 }
3551
79ff643d 3552 put_lxc_rootfs(&handler->conf->rootfs, true);
8353b4c9 3553 NOTICE("The container \"%s\" is set up", name);
cd54d859 3554
0ad19a3f 3555 return 0;
3556}
26ddeedd 3557
3f60c2f7 3558int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3559 char *argv[])
26ddeedd 3560{
26ddeedd 3561 struct lxc_list *it;
3ea957c6
RK
3562 int which;
3563
3564 for (which = 0; which < NUM_LXC_HOOKS; which ++) {
71528742 3565 if (strequal(hookname, lxchook_names[which]))
3ea957c6
RK
3566 break;
3567 }
3568
3569 if (which >= NUM_LXC_HOOKS)
26ddeedd 3570 return -1;
3f60c2f7 3571
0fd73091 3572 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3573 int ret;
3f60c2f7
CB
3574 char *hook = it->elem;
3575
3576 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3577 hookname, argv);
3f60c2f7
CB
3578 if (ret < 0)
3579 return -1;
26ddeedd 3580 }
3f60c2f7 3581
26ddeedd
SH
3582 return 0;
3583}
72d0e1cb 3584
72d0e1cb
SG
3585int lxc_clear_config_caps(struct lxc_conf *c)
3586{
1a0e70ac 3587 struct lxc_list *it, *next;
72d0e1cb 3588
0fd73091 3589 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3590 lxc_list_del(it);
3591 free(it->elem);
3592 free(it);
3593 }
0fd73091 3594
72d0e1cb
SG
3595 return 0;
3596}
3597
c7e345ae
CB
3598static int lxc_free_idmap(struct lxc_list *id_map)
3599{
27c27d73
SH
3600 struct lxc_list *it, *next;
3601
46bc6f2a 3602 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3603 lxc_list_del(it);
3604 free(it->elem);
3605 free(it);
3606 }
c7e345ae 3607
27c27d73
SH
3608 return 0;
3609}
7e621263
CB
3610
3611static int __lxc_free_idmap(struct lxc_list *id_map)
3612{
3613 lxc_free_idmap(id_map);
3614 free(id_map);
3615 return 0;
3616}
3617define_cleanup_function(struct lxc_list *, __lxc_free_idmap);
27c27d73 3618
4355ab5f
SH
3619int lxc_clear_idmaps(struct lxc_conf *c)
3620{
3621 return lxc_free_idmap(&c->id_map);
3622}
3623
1fb86a7c
SH
3624int lxc_clear_config_keepcaps(struct lxc_conf *c)
3625{
0fd73091 3626 struct lxc_list *it, *next;
1fb86a7c 3627
0fd73091 3628 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3629 lxc_list_del(it);
3630 free(it->elem);
3631 free(it);
3632 }
0fd73091 3633
1fb86a7c
SH
3634 return 0;
3635}
3636
a3ed9b81 3637int lxc_clear_namespace(struct lxc_conf *c)
3638{
3639 int i;
3640 for (i = 0; i < LXC_NS_MAX; i++) {
3641 free(c->ns_share[i]);
3642 c->ns_share[i] = NULL;
3643 }
3644 return 0;
3645}
3646
54860ed0 3647int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3648{
54860ed0 3649 char *global_token, *namespaced_token;
ab1a6cac 3650 size_t namespaced_token_len;
54860ed0 3651 struct lxc_list *it, *next, *list;
ab1a6cac 3652 const char *k = key;
54860ed0 3653 bool all = false;
72d0e1cb 3654
54860ed0
CB
3655 if (version == CGROUP2_SUPER_MAGIC) {
3656 global_token = "lxc.cgroup2";
3657 namespaced_token = "lxc.cgroup2.";
6333c915 3658 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
54860ed0
CB
3659 list = &c->cgroup2;
3660 } else if (version == CGROUP_SUPER_MAGIC) {
3661 global_token = "lxc.cgroup";
3662 namespaced_token = "lxc.cgroup.";
6333c915 3663 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
54860ed0
CB
3664 list = &c->cgroup;
3665 } else {
ab1a6cac 3666 return -EINVAL;
54860ed0
CB
3667 }
3668
71528742 3669 if (strequal(key, global_token))
72d0e1cb 3670 all = true;
eed95eb0 3671 else if (strnequal(key, namespaced_token, namespaced_token_len))
ab1a6cac 3672 k += namespaced_token_len;
a6390f01 3673 else
ab1a6cac 3674 return -EINVAL;
72d0e1cb 3675
0fd73091 3676 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3677 struct lxc_cgroup *cg = it->elem;
54860ed0 3678
71528742 3679 if (!all && !strequal(cg->subsystem, k))
72d0e1cb 3680 continue;
54860ed0 3681
72d0e1cb
SG
3682 lxc_list_del(it);
3683 free(cg->subsystem);
3684 free(cg->value);
3685 free(cg);
3686 free(it);
3687 }
e409b214 3688
72d0e1cb
SG
3689 return 0;
3690}
3691
4bfb655e
CB
3692static void lxc_clear_devices(struct lxc_conf *conf)
3693{
3694 struct lxc_list *list = &conf->devices;
3695 struct lxc_list *it, *next;
3696
3697 lxc_list_for_each_safe(it, list, next) {
3698 lxc_list_del(it);
3699 free(it);
3700 }
3701}
3702
c6d09e15
WB
3703int lxc_clear_limits(struct lxc_conf *c, const char *key)
3704{
3705 struct lxc_list *it, *next;
c6d09e15 3706 const char *k = NULL;
0fd73091 3707 bool all = false;
c6d09e15 3708
71528742 3709 if (strequal(key, "lxc.limit") || strequal(key, "lxc.prlimit"))
c6d09e15 3710 all = true;
eed95eb0 3711 else if (strnequal(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")))
6333c915 3712 k = key + STRLITERALLEN("lxc.limit.");
eed95eb0 3713 else if (strnequal(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")))
6333c915 3714 k = key + STRLITERALLEN("lxc.prlimit.");
c6d09e15
WB
3715 else
3716 return -1;
3717
0fd73091 3718 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3719 struct lxc_limit *lim = it->elem;
0fd73091 3720
71528742 3721 if (!all && !strequal(lim->resource, k))
c6d09e15 3722 continue;
0fd73091 3723
c6d09e15
WB
3724 lxc_list_del(it);
3725 free(lim->resource);
3726 free(lim);
3727 free(it);
3728 }
b668653c 3729
c6d09e15
WB
3730 return 0;
3731}
3732
7edd0540
L
3733int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3734{
3735 struct lxc_list *it, *next;
7edd0540 3736 const char *k = NULL;
0fd73091 3737 bool all = false;
7edd0540 3738
71528742 3739 if (strequal(key, "lxc.sysctl"))
7edd0540 3740 all = true;
eed95eb0 3741 else if (strnequal(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")))
6333c915 3742 k = key + STRLITERALLEN("lxc.sysctl.");
7edd0540
L
3743 else
3744 return -1;
3745
0fd73091 3746 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3747 struct lxc_sysctl *elem = it->elem;
0fd73091 3748
71528742 3749 if (!all && !strequal(elem->key, k))
7edd0540 3750 continue;
0fd73091 3751
7edd0540
L
3752 lxc_list_del(it);
3753 free(elem->key);
3754 free(elem->value);
3755 free(elem);
3756 free(it);
3757 }
0fd73091 3758
7edd0540
L
3759 return 0;
3760}
3761
61d7a733
YT
3762int lxc_clear_procs(struct lxc_conf *c, const char *key)
3763{
0fd73091 3764 struct lxc_list *it, *next;
61d7a733 3765 const char *k = NULL;
0fd73091 3766 bool all = false;
61d7a733 3767
71528742 3768 if (strequal(key, "lxc.proc"))
61d7a733 3769 all = true;
eed95eb0 3770 else if (strnequal(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")))
6333c915 3771 k = key + STRLITERALLEN("lxc.proc.");
61d7a733
YT
3772 else
3773 return -1;
3774
0fd73091 3775 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3776 struct lxc_proc *proc = it->elem;
0fd73091 3777
71528742 3778 if (!all && !strequal(proc->filename, k))
61d7a733 3779 continue;
0fd73091 3780
61d7a733
YT
3781 lxc_list_del(it);
3782 free(proc->filename);
3783 free(proc->value);
3784 free(proc);
3785 free(it);
3786 }
3787
3788 return 0;
3789}
3790
ee1e7aa0
SG
3791int lxc_clear_groups(struct lxc_conf *c)
3792{
0fd73091 3793 struct lxc_list *it, *next;
ee1e7aa0 3794
0fd73091 3795 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3796 lxc_list_del(it);
3797 free(it->elem);
3798 free(it);
3799 }
0fd73091 3800
ee1e7aa0
SG
3801 return 0;
3802}
3803
ab799c0b
SG
3804int lxc_clear_environment(struct lxc_conf *c)
3805{
0fd73091 3806 struct lxc_list *it, *next;
ab799c0b 3807
0fd73091 3808 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3809 lxc_list_del(it);
3810 free(it->elem);
3811 free(it);
3812 }
0fd73091 3813
ab799c0b
SG
3814 return 0;
3815}
3816
72d0e1cb
SG
3817int lxc_clear_mount_entries(struct lxc_conf *c)
3818{
0fd73091 3819 struct lxc_list *it, *next;
72d0e1cb 3820
0fd73091 3821 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3822 lxc_list_del(it);
3823 free(it->elem);
3824 free(it);
3825 }
0fd73091 3826
72d0e1cb
SG
3827 return 0;
3828}
3829
b099e9e9
SH
3830int lxc_clear_automounts(struct lxc_conf *c)
3831{
3832 c->auto_mounts = 0;
3833 return 0;
3834}
3835
12a50cc6 3836int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3837{
72d0e1cb 3838 int i;
0fd73091
CB
3839 struct lxc_list *it, *next;
3840 const char *k = NULL;
3841 bool all = false, done = false;
72d0e1cb 3842
71528742 3843 if (strequal(key, "lxc.hook"))
17ed13a3 3844 all = true;
eed95eb0 3845 else if (strnequal(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")))
6333c915 3846 k = key + STRLITERALLEN("lxc.hook.");
a6390f01
WB
3847 else
3848 return -1;
17ed13a3 3849
0fd73091 3850 for (i = 0; i < NUM_LXC_HOOKS; i++) {
71528742 3851 if (all || strequal(k, lxchook_names[i])) {
0fd73091 3852 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3853 lxc_list_del(it);
3854 free(it->elem);
3855 free(it);
3856 }
0fd73091 3857
17ed13a3 3858 done = true;
72d0e1cb
SG
3859 }
3860 }
17ed13a3 3861
55022530
CB
3862 if (!done)
3863 return log_error(-1, "Invalid hook key: %s", key);
0fd73091 3864
72d0e1cb
SG
3865 return 0;
3866}
8eb5694b 3867
4184c3e1
SH
3868static inline void lxc_clear_aliens(struct lxc_conf *conf)
3869{
0fd73091 3870 struct lxc_list *it, *next;
4184c3e1 3871
0fd73091 3872 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
3873 lxc_list_del(it);
3874 free(it->elem);
3875 free(it);
3876 }
3877}
3878
c7b15d1e 3879void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 3880{
0fd73091 3881 struct lxc_list *it, *next;
f979ac15 3882
0fd73091 3883 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
3884 lxc_list_del(it);
3885 free(it->elem);
3886 free(it);
3887 }
3888}
3889
1800f924
WB
3890int lxc_clear_apparmor_raw(struct lxc_conf *c)
3891{
3892 struct lxc_list *it, *next;
3893
3894 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
3895 lxc_list_del(it);
3896 free(it->elem);
3897 free(it);
3898 }
3899
3900 return 0;
3901}
3902
8eb5694b
SH
3903void lxc_conf_free(struct lxc_conf *conf)
3904{
3905 if (!conf)
3906 return;
0fd73091 3907
858377e4
SH
3908 if (current_config == conf)
3909 current_config = NULL;
aed105d5 3910 lxc_terminal_conf_free(&conf->console);
f10fad2f 3911 free(conf->rootfs.mount);
b3b8c97f 3912 free(conf->rootfs.bdev_type);
f10fad2f
ME
3913 free(conf->rootfs.options);
3914 free(conf->rootfs.path);
9dd75981 3915 free(conf->rootfs.data);
79ff643d 3916 put_lxc_rootfs(&conf->rootfs, true);
f10fad2f 3917 free(conf->logfile);
858377e4
SH
3918 if (conf->logfd != -1)
3919 close(conf->logfd);
f10fad2f 3920 free(conf->utsname);
885766f5
CB
3921 free(conf->ttys.dir);
3922 free(conf->ttys.tty_names);
f10fad2f
ME
3923 free(conf->fstab);
3924 free(conf->rcfile);
5cda27c1 3925 free(conf->execute_cmd);
f10fad2f 3926 free(conf->init_cmd);
bf31b337 3927 free(conf->init_groups.list);
3c491553 3928 free(conf->init_cwd);
6b0d5538 3929 free(conf->unexpanded_config);
76d0127f 3930 free(conf->syslog);
c302b476 3931 lxc_free_networks(&conf->network);
f10fad2f 3932 free(conf->lsm_aa_profile);
1800f924 3933 free(conf->lsm_aa_profile_computed);
f10fad2f 3934 free(conf->lsm_se_context);
c3e3c21a 3935 lxc_seccomp_free(&conf->seccomp);
8eb5694b 3936 lxc_clear_config_caps(conf);
1fb86a7c 3937 lxc_clear_config_keepcaps(conf);
54860ed0
CB
3938 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
3939 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
4bfb655e 3940 lxc_clear_devices(conf);
17ed13a3 3941 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3942 lxc_clear_mount_entries(conf);
27c27d73 3943 lxc_clear_idmaps(conf);
ee1e7aa0 3944 lxc_clear_groups(conf);
f979ac15 3945 lxc_clear_includes(conf);
761d81ca 3946 lxc_clear_aliens(conf);
ab799c0b 3947 lxc_clear_environment(conf);
240d4b74 3948 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 3949 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 3950 lxc_clear_procs(conf, "lxc.proc");
1800f924 3951 lxc_clear_apparmor_raw(conf);
a3ed9b81 3952 lxc_clear_namespace(conf);
43654d34 3953 free(conf->cgroup_meta.dir);
a900cbaf 3954 free(conf->cgroup_meta.monitor_dir);
eb60b564 3955 free(conf->cgroup_meta.monitor_pivot_dir);
a900cbaf
WB
3956 free(conf->cgroup_meta.container_dir);
3957 free(conf->cgroup_meta.namespace_dir);
43654d34 3958 free(conf->cgroup_meta.controllers);
7a41e857
LT
3959 free(conf->shmount.path_host);
3960 free(conf->shmount.path_cont);
8eb5694b
SH
3961 free(conf);
3962}
4355ab5f
SH
3963
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback the child runs once the parent has signalled it. */
	int (*fn)(void *);
	/* Optional name of @fn, used only for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Pipe used for the handshake: child reads one byte from p[0]. */
	int p[2];
};
3970
3971static int run_userns_fn(void *data)
3972{
766c5b6d 3973 struct userns_fn_data *d = data;
adaffdd7 3974 int ret;
4355ab5f 3975 char c;
4355ab5f 3976
766c5b6d 3977 close_prot_errno_disarm(d->p[1]);
f8aa4bf3 3978
766c5b6d
CB
3979 /*
3980 * Wait for parent to finish establishing a new mapping in the user
f8aa4bf3
CB
3981 * namespace we are executing in.
3982 */
adaffdd7 3983 ret = lxc_read_nointr(d->p[0], &c, 1);
766c5b6d 3984 close_prot_errno_disarm(d->p[0]);
adaffdd7
CB
3985 if (ret != 1)
3986 return -1;
f8aa4bf3 3987
c9b7c33e 3988 if (d->fn_name)
adaffdd7 3989 TRACE("Calling function \"%s\"", d->fn_name);
0fd73091 3990
f8aa4bf3 3991 /* Call function to run. */
4355ab5f
SH
3992 return d->fn(d->arg);
3993}
3994
7581a82f 3995static struct id_map *mapped_nsid_add(const struct lxc_conf *conf, unsigned id,
db7cfe23
CB
3996 enum idtype idtype)
3997{
5173b710
CB
3998 const struct id_map *map;
3999 struct id_map *retmap;
db7cfe23
CB
4000
4001 map = find_mapped_nsid_entry(conf, id, idtype);
4002 if (!map)
4003 return NULL;
4004
4005 retmap = malloc(sizeof(*retmap));
4006 if (!retmap)
4007 return NULL;
4008
4009 memcpy(retmap, map, sizeof(*retmap));
4010 return retmap;
4011}
4012
7581a82f 4013static struct id_map *find_mapped_hostid_entry(const struct lxc_conf *conf,
c4333195 4014 unsigned id, enum idtype idtype)
f8aa4bf3 4015{
f8aa4bf3 4016 struct id_map *map;
0fd73091 4017 struct lxc_list *it;
f8aa4bf3
CB
4018 struct id_map *retmap = NULL;
4019
0fd73091 4020 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4021 map = it->elem;
4022 if (map->idtype != idtype)
4023 continue;
4024
4025 if (id >= map->hostid && id < map->hostid + map->range) {
4026 retmap = map;
4027 break;
4028 }
4029 }
4030
f8aa4bf3
CB
4031 return retmap;
4032}
4033
0fd73091 4034/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4035 * existing one or establish a new one.
4355ab5f 4036 */
7581a82f 4037static struct id_map *mapped_hostid_add(const struct lxc_conf *conf, uid_t id,
0fd73091 4038 enum idtype type)
4355ab5f 4039{
55022530 4040 __do_free struct id_map *entry = NULL;
28a2d9e7 4041 int hostid_mapped;
55022530 4042 struct id_map *tmp = NULL;
c4333195
CB
4043
4044 entry = malloc(sizeof(*entry));
4045 if (!entry)
4046 return NULL;
f8aa4bf3 4047
28a2d9e7 4048 /* Reuse existing mapping. */
c4333195 4049 tmp = find_mapped_hostid_entry(conf, id, type);
1758c195
CB
4050 if (tmp) {
4051 memcpy(entry, tmp, sizeof(*entry));
4052 } else {
4053 /* Find new mapping. */
4054 hostid_mapped = find_unmapped_nsid(conf, type);
4055 if (hostid_mapped < 0)
4056 return log_debug(NULL, "Failed to find free mapping for id %d", id);
4057
4058 entry->idtype = type;
4059 entry->nsid = hostid_mapped;
4060 entry->hostid = (unsigned long)id;
4061 entry->range = 1;
4062 }
4355ab5f 4063
55022530 4064 return move_ptr(entry);
4355ab5f
SH
4065}
4066
dbfcdf86
CB
4067static struct lxc_list *get_minimal_idmap(const struct lxc_conf *conf,
4068 uid_t *resuid, gid_t *resgid)
4355ab5f 4069{
00d6cfe2
CB
4070 __do_free struct id_map *container_root_uid = NULL,
4071 *container_root_gid = NULL,
4072 *host_uid_map = NULL, *host_gid_map = NULL;
4073 __do_free struct lxc_list *idmap = NULL;
f8aa4bf3 4074 uid_t euid, egid;
4160c3a0
CB
4075 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4076 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
00d6cfe2 4077 struct lxc_list *tmplist = NULL;
4355ab5f 4078
db7cfe23 4079 /* Find container root mappings. */
4160c3a0 4080 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
55022530
CB
4081 if (!container_root_uid)
4082 return log_debug(NULL, "Failed to find mapping for namespace uid %d", 0);
dcf0ffdf
CB
4083 euid = geteuid();
4084 if (euid >= container_root_uid->hostid &&
4085 euid < (container_root_uid->hostid + container_root_uid->range))
2c996219 4086 host_uid_map = move_ptr(container_root_uid);
f8aa4bf3 4087
4160c3a0 4088 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
55022530
CB
4089 if (!container_root_gid)
4090 return log_debug(NULL, "Failed to find mapping for namespace gid %d", 0);
dcf0ffdf
CB
4091 egid = getegid();
4092 if (egid >= container_root_gid->hostid &&
4093 egid < (container_root_gid->hostid + container_root_gid->range))
2c996219 4094 host_gid_map = move_ptr(container_root_gid);
f8aa4bf3
CB
4095
4096 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4097 if (!host_uid_map)
c4333195 4098 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
55022530
CB
4099 if (!host_uid_map)
4100 return log_debug(NULL, "Failed to find mapping for uid %d", euid);
f8aa4bf3 4101
dcf0ffdf
CB
4102 if (!host_gid_map)
4103 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
55022530
CB
4104 if (!host_gid_map)
4105 return log_debug(NULL, "Failed to find mapping for gid %d", egid);
28a2d9e7
CB
4106
4107 /* Allocate new {g,u}id map list. */
4108 idmap = malloc(sizeof(*idmap));
4109 if (!idmap)
00d6cfe2 4110 return NULL;
28a2d9e7
CB
4111 lxc_list_init(idmap);
4112
f8aa4bf3
CB
4113 /* Add container root to the map. */
4114 tmplist = malloc(sizeof(*tmplist));
4115 if (!tmplist)
00d6cfe2 4116 return NULL;
47649d5b
CB
4117 /* idmap will now keep track of that memory. */
4118 lxc_list_add_elem(tmplist, move_ptr(host_uid_map));
f8aa4bf3 4119 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4120
2c996219 4121 if (container_root_uid) {
28a2d9e7
CB
4122 /* Add container root to the map. */
4123 tmplist = malloc(sizeof(*tmplist));
4124 if (!tmplist)
00d6cfe2 4125 return NULL;
47649d5b
CB
4126 /* idmap will now keep track of that memory. */
4127 lxc_list_add_elem(tmplist, move_ptr(container_root_uid));
28a2d9e7 4128 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4129 }
f8aa4bf3
CB
4130
4131 tmplist = malloc(sizeof(*tmplist));
4132 if (!tmplist)
00d6cfe2 4133 return NULL;
47649d5b
CB
4134 /* idmap will now keep track of that memory. */
4135 lxc_list_add_elem(tmplist, move_ptr(host_gid_map));
f8aa4bf3 4136 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4137
2c996219 4138 if (container_root_gid) {
28a2d9e7
CB
4139 tmplist = malloc(sizeof(*tmplist));
4140 if (!tmplist)
00d6cfe2 4141 return NULL;
47649d5b
CB
4142 /* idmap will now keep track of that memory. */
4143 lxc_list_add_elem(tmplist, move_ptr(container_root_gid));
28a2d9e7 4144 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4145 }
f8aa4bf3 4146
dbfcdf86
CB
4147 TRACE("Allocated minimal idmapping for ns uid %d and ns gid %d", nsuid, nsgid);
4148
4149 if (resuid)
4150 *resuid = nsuid;
4151 if (resgid)
4152 *resgid = nsgid;
00d6cfe2 4153 return move_ptr(idmap);
dcf0ffdf
CB
4154}
4155
766c5b6d
CB
4156/*
4157 * Run a function in a new user namespace.
dcf0ffdf
CB
4158 * The caller's euid/egid will be mapped if it is not already.
4159 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4160 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4161 * This means we require only to establish a mapping from:
4162 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4163 * - the container root -> some sub{g,u}id
915e3dbd 4164 * The former we add, if the user did not specify a mapping. The latter we
6f3fd27f 4165 * retrieve from the container's configured {g,u}id mappings as it must have been
dcf0ffdf
CB
4166 * there to start the container in the first place.
4167 */
7581a82f 4168int userns_exec_1(const struct lxc_conf *conf, int (*fn)(void *), void *data,
dcf0ffdf
CB
4169 const char *fn_name)
4170{
7e621263 4171 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
0fd73091
CB
4172 int ret = -1, status = -1;
4173 char c = '1';
46bc6f2a
CB
4174 struct userns_fn_data d = {
4175 .arg = data,
4176 .fn = fn,
4177 .fn_name = fn_name,
4178 };
766c5b6d
CB
4179 pid_t pid;
4180 int pipe_fds[2];
dcf0ffdf 4181
2b2655a8
CB
4182 if (!conf)
4183 return -EINVAL;
4184
dbfcdf86 4185 idmap = get_minimal_idmap(conf, NULL, NULL);
dcf0ffdf 4186 if (!idmap)
766c5b6d 4187 return ret_errno(ENOENT);
dcf0ffdf 4188
766c5b6d
CB
4189 ret = pipe2(pipe_fds, O_CLOEXEC);
4190 if (ret < 0)
4191 return -errno;
4192
766c5b6d
CB
4193 d.p[0] = pipe_fds[0];
4194 d.p[1] = pipe_fds[1];
dcf0ffdf
CB
4195
4196 /* Clone child in new user namespace. */
a59440be 4197 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER, NULL);
dcf0ffdf 4198 if (pid < 0) {
0fd73091 4199 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4200 goto on_error;
4201 }
4202
766c5b6d 4203 close_prot_errno_disarm(pipe_fds[0]);
dcf0ffdf 4204
62fef886 4205 if (lxc_log_trace()) {
dcf0ffdf 4206 struct id_map *map;
0fd73091 4207 struct lxc_list *it;
dcf0ffdf 4208
766c5b6d 4209 lxc_list_for_each(it, idmap) {
f8aa4bf3 4210 map = it->elem;
766c5b6d
CB
4211 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4212 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
f8aa4bf3 4213 }
4355ab5f
SH
4214 }
4215
f8aa4bf3 4216 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4217 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4218 if (ret < 0) {
0fd73091 4219 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4220 goto on_error;
4355ab5f
SH
4221 }
4222
f8aa4bf3 4223 /* Tell child to proceed. */
766c5b6d 4224 if (lxc_write_nointr(pipe_fds[1], &c, 1) != 1) {
dcf0ffdf 4225 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4226 goto on_error;
4355ab5f
SH
4227 }
4228
686dd5d1 4229on_error:
766c5b6d
CB
4230 close_prot_errno_disarm(pipe_fds[0]);
4231 close_prot_errno_disarm(pipe_fds[1]);
f8aa4bf3 4232
ee1b16bc
TA
4233 /* Wait for child to finish. */
4234 if (pid > 0)
4235 status = wait_for_pid(pid);
4236
686dd5d1
CB
4237 if (status < 0)
4238 ret = -1;
4239
f8aa4bf3 4240 return ret;
4355ab5f 4241}
97e9cfa0 4242
d1783ef4
CB
4243int userns_exec_minimal(const struct lxc_conf *conf,
4244 int (*fn_parent)(void *), void *fn_parent_data,
4245 int (*fn_child)(void *), void *fn_child_data)
edf88289 4246{
7e621263 4247 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
dbfcdf86
CB
4248 uid_t resuid = LXC_INVALID_UID;
4249 gid_t resgid = LXC_INVALID_GID;
edf88289 4250 char c = '1';
dbfcdf86 4251 ssize_t ret;
edf88289
CB
4252 pid_t pid;
4253 int sock_fds[2];
4254
d1783ef4 4255 if (!conf || !fn_child)
dbfcdf86 4256 return ret_errno(EINVAL);
edf88289 4257
dbfcdf86 4258 idmap = get_minimal_idmap(conf, &resuid, &resgid);
edf88289
CB
4259 if (!idmap)
4260 return ret_errno(ENOENT);
4261
4262 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4263 if (ret < 0)
4264 return -errno;
4265
4266 pid = fork();
4267 if (pid < 0) {
dbfcdf86 4268 SYSERROR("Failed to create new process");
edf88289
CB
4269 goto on_error;
4270 }
4271
4272 if (pid == 0) {
4273 close_prot_errno_disarm(sock_fds[1]);
4274
4275 ret = unshare(CLONE_NEWUSER);
dbfcdf86
CB
4276 if (ret < 0) {
4277 SYSERROR("Failed to unshare new user namespace");
edf88289 4278 _exit(EXIT_FAILURE);
dbfcdf86 4279 }
edf88289 4280
dbfcdf86
CB
4281 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4282 if (ret != 1)
edf88289
CB
4283 _exit(EXIT_FAILURE);
4284
4285 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4286 if (ret != 1)
4287 _exit(EXIT_FAILURE);
4288
4289 close_prot_errno_disarm(sock_fds[0]);
4290
8917c382 4291 if (!lxc_drop_groups() && errno != EPERM)
edf88289
CB
4292 _exit(EXIT_FAILURE);
4293
dbfcdf86
CB
4294 ret = setresgid(resgid, resgid, resgid);
4295 if (ret < 0) {
4296 SYSERROR("Failed to setresgid(%d, %d, %d)",
4297 resgid, resgid, resgid);
edf88289 4298 _exit(EXIT_FAILURE);
dbfcdf86
CB
4299 }
4300
4301 ret = setresuid(resuid, resuid, resuid);
4302 if (ret < 0) {
4303 SYSERROR("Failed to setresuid(%d, %d, %d)",
4304 resuid, resuid, resuid);
4305 _exit(EXIT_FAILURE);
4306 }
edf88289 4307
d1783ef4 4308 ret = fn_child(fn_child_data);
dbfcdf86
CB
4309 if (ret) {
4310 SYSERROR("Running function in new user namespace failed");
edf88289 4311 _exit(EXIT_FAILURE);
dbfcdf86 4312 }
edf88289
CB
4313
4314 _exit(EXIT_SUCCESS);
4315 }
4316
4317 close_prot_errno_disarm(sock_fds[0]);
4318
62fef886 4319 if (lxc_log_trace()) {
edf88289
CB
4320 struct id_map *map;
4321 struct lxc_list *it;
4322
4323 lxc_list_for_each(it, idmap) {
4324 map = it->elem;
4325 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4326 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4327 }
4328 }
4329
4330 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4331 if (ret != 1) {
4332 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4333 goto on_error;
4334 }
4335
4336 /* Set up {g,u}id mapping for user namespace of child process. */
4337 ret = lxc_map_ids(idmap, pid);
4338 if (ret < 0) {
4339 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4340 goto on_error;
4341 }
4342
4343 /* Tell child to proceed. */
4344 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4345 if (ret != 1) {
4346 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4347 goto on_error;
4348 }
4349
d1783ef4
CB
4350 if (fn_parent && fn_parent(fn_parent_data)) {
4351 SYSERROR("Running parent function failed");
4352 _exit(EXIT_FAILURE);
4353 }
4354
edf88289
CB
4355on_error:
4356 close_prot_errno_disarm(sock_fds[0]);
4357 close_prot_errno_disarm(sock_fds[1]);
4358
4359 /* Wait for child to finish. */
dbfcdf86
CB
4360 if (pid < 0)
4361 return -1;
edf88289 4362
dbfcdf86 4363 return wait_for_pid(pid);
edf88289
CB
4364}
4365
415a8851
CB
4366int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4367 const char *fn_name)
4368{
4369 pid_t pid;
4370 uid_t euid, egid;
415a8851
CB
4371 int p[2];
4372 struct id_map *map;
4373 struct lxc_list *cur;
0fd73091 4374 struct userns_fn_data d;
415a8851 4375 int ret = -1;
0fd73091 4376 char c = '1';
415a8851
CB
4377 struct lxc_list *idmap = NULL, *tmplist = NULL;
4378 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4379 *host_uid_map = NULL, *host_gid_map = NULL;
4380
2b2655a8
CB
4381 if (!conf)
4382 return -EINVAL;
4383
979f9e34 4384 ret = pipe2(p, O_CLOEXEC);
415a8851
CB
4385 if (ret < 0) {
4386 SYSERROR("opening pipe");
4387 return -1;
4388 }
4389 d.fn = fn;
4390 d.fn_name = fn_name;
4391 d.arg = data;
4392 d.p[0] = p[0];
4393 d.p[1] = p[1];
4394
4395 /* Clone child in new user namespace. */
33258b95 4396 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER, NULL);
415a8851 4397 if (pid < 0) {
0fd73091 4398 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4399 goto on_error;
4400 }
4401
4402 close(p[0]);
4403 p[0] = -1;
4404
4405 euid = geteuid();
4406 egid = getegid();
4407
4408 /* Allocate new {g,u}id map list. */
4409 idmap = malloc(sizeof(*idmap));
4410 if (!idmap)
4411 goto on_error;
4412 lxc_list_init(idmap);
4413
4414 /* Find container root. */
0fd73091 4415 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4416 struct id_map *tmpmap;
4417
4418 tmplist = malloc(sizeof(*tmplist));
4419 if (!tmplist)
4420 goto on_error;
4421
4422 tmpmap = malloc(sizeof(*tmpmap));
4423 if (!tmpmap) {
4424 free(tmplist);
4425 goto on_error;
4426 }
4427
4428 memset(tmpmap, 0, sizeof(*tmpmap));
4429 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4430 tmplist->elem = tmpmap;
4431
4432 lxc_list_add_tail(idmap, tmplist);
4433
4434 map = cur->elem;
4435
4436 if (map->idtype == ID_TYPE_UID)
4437 if (euid >= map->hostid && euid < map->hostid + map->range)
4438 host_uid_map = map;
4439
4440 if (map->idtype == ID_TYPE_GID)
4441 if (egid >= map->hostid && egid < map->hostid + map->range)
4442 host_gid_map = map;
4443
4444 if (map->nsid != 0)
4445 continue;
4446
4447 if (map->idtype == ID_TYPE_UID)
4448 if (container_root_uid == NULL)
4449 container_root_uid = map;
4450
4451 if (map->idtype == ID_TYPE_GID)
4452 if (container_root_gid == NULL)
4453 container_root_gid = map;
4454 }
4455
4456 if (!container_root_uid || !container_root_gid) {
4457 ERROR("No mapping for container root found");
4458 goto on_error;
4459 }
4460
4461 /* Check whether the {g,u}id of the user has a mapping. */
4462 if (!host_uid_map)
c4333195 4463 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4464 else
4465 host_uid_map = container_root_uid;
4466
4467 if (!host_gid_map)
c4333195 4468 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4469 else
4470 host_gid_map = container_root_gid;
4471
4472 if (!host_uid_map) {
4473 DEBUG("Failed to find mapping for uid %d", euid);
4474 goto on_error;
4475 }
4476
4477 if (!host_gid_map) {
4478 DEBUG("Failed to find mapping for gid %d", egid);
4479 goto on_error;
4480 }
4481
4482 if (host_uid_map && (host_uid_map != container_root_uid)) {
4483 /* Add container root to the map. */
4484 tmplist = malloc(sizeof(*tmplist));
4485 if (!tmplist)
4486 goto on_error;
4487 lxc_list_add_elem(tmplist, host_uid_map);
4488 lxc_list_add_tail(idmap, tmplist);
4489 }
4490 /* idmap will now keep track of that memory. */
4491 host_uid_map = NULL;
4492
4493 if (host_gid_map && (host_gid_map != container_root_gid)) {
4494 tmplist = malloc(sizeof(*tmplist));
4495 if (!tmplist)
4496 goto on_error;
4497 lxc_list_add_elem(tmplist, host_gid_map);
4498 lxc_list_add_tail(idmap, tmplist);
4499 }
4500 /* idmap will now keep track of that memory. */
4501 host_gid_map = NULL;
4502
62fef886 4503 if (lxc_log_trace()) {
0fd73091 4504 lxc_list_for_each (cur, idmap) {
415a8851
CB
4505 map = cur->elem;
4506 TRACE("establishing %cid mapping for \"%d\" in new "
4507 "user namespace: nsuid %lu - hostid %lu - range "
4508 "%lu",
4509 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4510 map->nsid, map->hostid, map->range);
4511 }
4512 }
4513
4514 /* Set up {g,u}id mapping for user namespace of child process. */
4515 ret = lxc_map_ids(idmap, pid);
4516 if (ret < 0) {
0fd73091 4517 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4518 goto on_error;
4519 }
4520
4521 /* Tell child to proceed. */
489f39be 4522 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4523 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4524 goto on_error;
4525 }
4526
686dd5d1 4527on_error:
ee1b16bc
TA
4528 if (p[0] != -1)
4529 close(p[0]);
4530 close(p[1]);
4531
415a8851 4532 /* Wait for child to finish. */
686dd5d1
CB
4533 if (pid > 0)
4534 ret = wait_for_pid(pid);
415a8851 4535
7e621263
CB
4536 if (idmap)
4537 __lxc_free_idmap(idmap);
80758b4b 4538
415a8851
CB
4539 if (host_uid_map && (host_uid_map != container_root_uid))
4540 free(host_uid_map);
4541 if (host_gid_map && (host_gid_map != container_root_gid))
4542 free(host_gid_map);
4543
415a8851
CB
4544 return ret;
4545}
4546
234998b4
CB
4547static int add_idmap_entry(struct lxc_list *idmap, enum idtype idtype,
4548 unsigned long nsid, unsigned long hostid,
4549 unsigned long range)
4550{
4551 __do_free struct id_map *new_idmap = NULL;
4552 __do_free struct lxc_list *new_list = NULL;
4553
4554 new_idmap = zalloc(sizeof(*new_idmap));
4555 if (!new_idmap)
4556 return ret_errno(ENOMEM);
4557
4558 new_idmap->idtype = idtype;
4559 new_idmap->hostid = hostid;
4560 new_idmap->nsid = nsid;
4561 new_idmap->range = range;
4562
4563 new_list = zalloc(sizeof(*new_list));
4564 if (!new_list)
4565 return ret_errno(ENOMEM);
4566
4567 new_list->elem = move_ptr(new_idmap);
4568 lxc_list_add_tail(idmap, move_ptr(new_list));
4569
4570 INFO("Adding id map: type %c nsid %lu hostid %lu range %lu",
4571 idtype == ID_TYPE_UID ? 'u' : 'g', nsid, hostid, range);
4572 return 0;
4573}
4574
4575int userns_exec_mapped_root(const char *path, int path_fd,
4576 const struct lxc_conf *conf)
4577{
7e621263 4578 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
234998b4
CB
4579 __do_close int fd = -EBADF;
4580 int target_fd = -EBADF;
4581 char c = '1';
4582 ssize_t ret;
4583 pid_t pid;
4584 int sock_fds[2];
4585 uid_t container_host_uid, hostuid;
4586 gid_t container_host_gid, hostgid;
4587 struct stat st;
4588
4589 if (!conf || (!path && path_fd < 0))
4590 return ret_errno(EINVAL);
4591
4592 if (!path)
4593 path = "(null)";
4594
4595 container_host_uid = get_mapped_rootid(conf, ID_TYPE_UID);
4596 if (!uid_valid(container_host_uid))
4597 return log_error(-1, "No uid mapping for container root");
4598
4599 container_host_gid = get_mapped_rootid(conf, ID_TYPE_GID);
4600 if (!gid_valid(container_host_gid))
4601 return log_error(-1, "No gid mapping for container root");
4602
cf68ffd9 4603 if (path_fd < 0) {
a72c68f7 4604 fd = open(path, O_CLOEXEC | O_NOCTTY);
234998b4
CB
4605 if (fd < 0)
4606 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
4607 target_fd = fd;
4608 } else {
4609 target_fd = path_fd;
4610 }
4611
4612 hostuid = geteuid();
4613 /* We are root so chown directly. */
4614 if (hostuid == 0) {
4615 ret = fchown(target_fd, container_host_uid, container_host_gid);
4616 if (ret)
4617 return log_error_errno(-errno, errno,
4618 "Failed to fchown(%d(%s), %d, %d)",
4619 target_fd, path, container_host_uid,
4620 container_host_gid);
4621 return log_trace(0, "Chowned %d(%s) to uid %d and %d", target_fd, path,
4622 container_host_uid, container_host_gid);
4623 }
4624
4625 /* The container's root host id matches */
4626 if (container_host_uid == hostuid)
4627 return log_info(0, "Container root id is mapped to our uid");
4628
4629 /* Get the current ids of our target. */
4630 ret = fstat(target_fd, &st);
4631 if (ret)
4632 return log_error_errno(-errno, errno, "Failed to stat \"%s\"", path);
4633
4634 hostgid = getegid();
4635 if (st.st_uid == hostuid && mapped_hostid(st.st_gid, conf, ID_TYPE_GID) < 0) {
4636 ret = fchown(target_fd, -1, hostgid);
4637 if (ret)
4638 return log_error_errno(-errno, errno,
4639 "Failed to fchown(%d(%s), -1, %d)",
4640 target_fd, path, hostgid);
2e8013f9 4641 TRACE("Chowned %d(%s) to -1:%d", target_fd, path, hostgid);
234998b4
CB
4642 }
4643
4644 idmap = malloc(sizeof(*idmap));
4645 if (!idmap)
4646 return -ENOMEM;
4647 lxc_list_init(idmap);
4648
4649 /* "u:0:rootuid:1" */
4650 ret = add_idmap_entry(idmap, ID_TYPE_UID, 0, container_host_uid, 1);
4651 if (ret < 0)
4652 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4653
4654 /* "u:hostuid:hostuid:1" */
4655 ret = add_idmap_entry(idmap, ID_TYPE_UID, hostuid, hostuid, 1);
4656 if (ret < 0)
4657 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4658
4659 /* "g:0:rootgid:1" */
4660 ret = add_idmap_entry(idmap, ID_TYPE_GID, 0, container_host_gid, 1);
4661 if (ret < 0)
4662 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4663
4664 /* "g:hostgid:hostgid:1" */
4665 ret = add_idmap_entry(idmap, ID_TYPE_GID, hostgid, hostgid, 1);
4666 if (ret < 0)
4667 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4668
4669 if (hostgid != st.st_gid) {
4670 /* "g:pathgid:rootgid+pathgid:1" */
4671 ret = add_idmap_entry(idmap, ID_TYPE_GID, st.st_gid,
4672 container_host_gid + (gid_t)st.st_gid, 1);
4673 if (ret < 0)
4674 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4675 }
4676
4677 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4678 if (ret < 0)
4679 return -errno;
4680
4681 pid = fork();
4682 if (pid < 0) {
4683 SYSERROR("Failed to create new process");
4684 goto on_error;
4685 }
4686
4687 if (pid == 0) {
4688 close_prot_errno_disarm(sock_fds[1]);
4689
4690 ret = unshare(CLONE_NEWUSER);
4691 if (ret < 0) {
4692 SYSERROR("Failed to unshare new user namespace");
4693 _exit(EXIT_FAILURE);
4694 }
4695
4696 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4697 if (ret != 1)
4698 _exit(EXIT_FAILURE);
4699
4700 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4701 if (ret != 1)
4702 _exit(EXIT_FAILURE);
4703
4704 close_prot_errno_disarm(sock_fds[0]);
4705
4706 if (!lxc_switch_uid_gid(0, 0))
4707 _exit(EXIT_FAILURE);
4708
8917c382 4709 if (!lxc_drop_groups())
234998b4
CB
4710 _exit(EXIT_FAILURE);
4711
8053a085 4712 ret = fchown(target_fd, 0, st.st_gid);
234998b4 4713 if (ret) {
8ea93a0f 4714 SYSERROR("Failed to chown %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4715 _exit(EXIT_FAILURE);
4716 }
4717
2e8013f9 4718 TRACE("Chowned %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4719 _exit(EXIT_SUCCESS);
4720 }
4721
4722 close_prot_errno_disarm(sock_fds[0]);
4723
62fef886 4724 if (lxc_log_trace()) {
234998b4
CB
4725 struct id_map *map;
4726 struct lxc_list *it;
4727
4728 lxc_list_for_each(it, idmap) {
4729 map = it->elem;
4730 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4731 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4732 }
4733 }
4734
4735 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4736 if (ret != 1) {
4737 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4738 goto on_error;
4739 }
4740
4741 /* Set up {g,u}id mapping for user namespace of child process. */
4742 ret = lxc_map_ids(idmap, pid);
4743 if (ret < 0) {
4744 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4745 goto on_error;
4746 }
4747
4748 /* Tell child to proceed. */
4749 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4750 if (ret != 1) {
4751 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4752 goto on_error;
4753 }
4754
4755on_error:
4756 close_prot_errno_disarm(sock_fds[0]);
4757 close_prot_errno_disarm(sock_fds[1]);
4758
4759 /* Wait for child to finish. */
4760 if (pid < 0)
4761 return -1;
4762
4763 return wait_for_pid(pid);
4764}
4765
a96a8e8c 4766/* not thread-safe, do not use from api without first forking */
0fd73091 4767static char *getuname(void)
97e9cfa0 4768{
4f410b2a 4769 __do_free char *buf = NULL;
cb7aa5e8
DJ
4770 struct passwd pwent;
4771 struct passwd *pwentp = NULL;
cb7aa5e8
DJ
4772 size_t bufsize;
4773 int ret;
97e9cfa0 4774
cb7aa5e8
DJ
4775 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
4776 if (bufsize == -1)
4777 bufsize = 1024;
4778
4779 buf = malloc(bufsize);
4780 if (!buf)
97e9cfa0
SH
4781 return NULL;
4782
cb7aa5e8
DJ
4783 ret = getpwuid_r(geteuid(), &pwent, buf, bufsize, &pwentp);
4784 if (!pwentp) {
4785 if (ret == 0)
4786 WARN("Could not find matched password record.");
4787
55022530 4788 return log_error(NULL, "Failed to get password record - %u", geteuid());
cb7aa5e8
DJ
4789 }
4790
4f410b2a 4791 return strdup(pwent.pw_name);
97e9cfa0
SH
4792}
4793
a96a8e8c 4794/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4795static char *getgname(void)
4796{
4f410b2a 4797 __do_free char *buf = NULL;
3de9fb4c
DJ
4798 struct group grent;
4799 struct group *grentp = NULL;
3de9fb4c
DJ
4800 size_t bufsize;
4801 int ret;
4802
4803 bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
4804 if (bufsize == -1)
4805 bufsize = 1024;
4806
4807 buf = malloc(bufsize);
4808 if (!buf)
4809 return NULL;
4810
4811 ret = getgrgid_r(getegid(), &grent, buf, bufsize, &grentp);
4812 if (!grentp) {
4813 if (ret == 0)
4814 WARN("Could not find matched group record");
97e9cfa0 4815
55022530 4816 return log_error(NULL, "Failed to get group record - %u", getegid());
3de9fb4c
DJ
4817 }
4818
4f410b2a 4819 return strdup(grent.gr_name);
97e9cfa0
SH
4820}
4821
a96a8e8c 4822/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4823void suggest_default_idmap(void)
4824{
3a6e3bf5 4825 __do_free char *gname = NULL, *line = NULL, *uname = NULL;
4aae564f 4826 __do_fclose FILE *subuid_f = NULL, *subgid_f = NULL;
97e9cfa0 4827 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0
SH
4828 size_t len = 0;
4829
0fd73091
CB
4830 uname = getuname();
4831 if (!uname)
97e9cfa0
SH
4832 return;
4833
0fd73091 4834 gname = getgname();
3a6e3bf5 4835 if (!gname)
97e9cfa0 4836 return;
97e9cfa0 4837
4110345b 4838 subuid_f = fopen(subuidfile, "re");
4aae564f 4839 if (!subuid_f) {
97e9cfa0 4840 ERROR("Your system is not configured with subuids");
97e9cfa0
SH
4841 return;
4842 }
0fd73091 4843
4aae564f 4844 while (getline(&line, &len, subuid_f) != -1) {
0fd73091 4845 char *p, *p2;
b7930180 4846 size_t no_newline = 0;
0fd73091
CB
4847
4848 p = strchr(line, ':');
97e9cfa0
SH
4849 if (*line == '#')
4850 continue;
4851 if (!p)
4852 continue;
4853 *p = '\0';
4854 p++;
0fd73091 4855
71528742 4856 if (!strequal(line, uname))
97e9cfa0 4857 continue;
0fd73091 4858
97e9cfa0
SH
4859 p2 = strchr(p, ':');
4860 if (!p2)
4861 continue;
4862 *p2 = '\0';
4863 p2++;
4864 if (!*p2)
4865 continue;
b7930180
CB
4866 no_newline = strcspn(p2, "\n");
4867 p2[no_newline] = '\0';
4868
b7b2fde4 4869 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4870 WARN("Could not parse UID");
b7b2fde4 4871 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4872 WARN("Could not parse UID range");
97e9cfa0 4873 }
97e9cfa0 4874
4110345b 4875 subgid_f = fopen(subgidfile, "re");
4aae564f 4876 if (!subgid_f) {
97e9cfa0 4877 ERROR("Your system is not configured with subgids");
97e9cfa0
SH
4878 return;
4879 }
0fd73091 4880
4aae564f 4881 while (getline(&line, &len, subgid_f) != -1) {
0fd73091 4882 char *p, *p2;
b7930180 4883 size_t no_newline = 0;
0fd73091
CB
4884
4885 p = strchr(line, ':');
97e9cfa0
SH
4886 if (*line == '#')
4887 continue;
4888 if (!p)
4889 continue;
4890 *p = '\0';
4891 p++;
0fd73091 4892
71528742 4893 if (!strequal(line, uname))
97e9cfa0 4894 continue;
0fd73091 4895
97e9cfa0
SH
4896 p2 = strchr(p, ':');
4897 if (!p2)
4898 continue;
4899 *p2 = '\0';
4900 p2++;
4901 if (!*p2)
4902 continue;
b7930180
CB
4903 no_newline = strcspn(p2, "\n");
4904 p2[no_newline] = '\0';
4905
b7b2fde4 4906 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4907 WARN("Could not parse GID");
b7b2fde4 4908 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4909 WARN("Could not parse GID range");
97e9cfa0 4910 }
97e9cfa0 4911
97e9cfa0
SH
4912 if (!urange || !grange) {
4913 ERROR("You do not have subuids or subgids allocated");
4914 ERROR("Unprivileged containers require subuids and subgids");
4915 return;
4916 }
4917
4918 ERROR("You must either run as root, or define uid mappings");
4919 ERROR("To pass uid mappings to lxc-create, you could create");
4920 ERROR("~/.config/lxc/default.conf:");
4921 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4922 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4923 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0 4924}
aaf26830 4925
a7307747
SH
4926static void free_cgroup_settings(struct lxc_list *result)
4927{
4928 struct lxc_list *iterator, *next;
4929
0fd73091 4930 lxc_list_for_each_safe (iterator, result, next) {
a7307747 4931 lxc_list_del(iterator);
55022530 4932 free_disarm(iterator);
a7307747 4933 }
55022530 4934 free_disarm(result);
a7307747
SH
4935}
4936
0fd73091 4937/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4938 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4939 */
0fd73091 4940struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4941{
4942 struct lxc_list *result;
aaf26830 4943 struct lxc_cgroup *cg = NULL;
0fd73091 4944 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4945
4946 result = malloc(sizeof(*result));
0fd73091 4947 if (!result)
fac7c663 4948 return NULL;
aaf26830
KT
4949 lxc_list_init(result);
4950
0fd73091
CB
4951 /* Iterate over the cgroup settings and copy them to the output list. */
4952 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4953 item = malloc(sizeof(*item));
fac7c663 4954 if (!item) {
a7307747 4955 free_cgroup_settings(result);
fac7c663
KT
4956 return NULL;
4957 }
0fd73091 4958
aaf26830
KT
4959 item->elem = it->elem;
4960 cg = it->elem;
71528742 4961 if (strequal(cg->subsystem, "memory.memsw.limit_in_bytes")) {
aaf26830
KT
4962 /* Store the memsw_limit location */
4963 memsw_limit = item;
71528742 4964 } else if (strequal(cg->subsystem, "memory.limit_in_bytes") &&
0fd73091
CB
4965 memsw_limit != NULL) {
4966 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4967 * before lxc.cgroup.memory.limit_in_bytes, swap these
4968 * two items */
aaf26830
KT
4969 item->elem = memsw_limit->elem;
4970 memsw_limit->elem = it->elem;
4971 }
4972 lxc_list_add_tail(result, item);
4973 }
4974
4975 return result;
a7307747 4976}