]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
cgroups: convert to strequal()
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
1d52bdf7 2
d38dd64a
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
9d257a2a 6#include <arpa/inet.h>
8f3e280e
CB
7#include <dirent.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <grp.h>
11#include <inttypes.h>
12#include <libgen.h>
9d257a2a
CB
13#include <linux/loop.h>
14#include <net/if.h>
15#include <netinet/in.h>
8f3e280e
CB
16#include <pwd.h>
17#include <stdarg.h>
0ad19a3f 18#include <stdio.h>
0ad19a3f 19#include <stdlib.h>
0ad19a3f 20#include <string.h>
8f3e280e
CB
21#include <sys/mman.h>
22#include <sys/mount.h>
23#include <sys/param.h>
24#include <sys/prctl.h>
6a49f05e 25#include <sys/sendfile.h>
8f3e280e 26#include <sys/socket.h>
9d257a2a 27#include <sys/stat.h>
2d76d1d7 28#include <sys/syscall.h>
9d257a2a 29#include <sys/sysmacros.h>
97e9cfa0 30#include <sys/types.h>
8f3e280e
CB
31#include <sys/utsname.h>
32#include <sys/wait.h>
9d257a2a
CB
33#include <time.h>
34#include <unistd.h>
1d52bdf7 35
d38dd64a
CB
36#include "af_unix.h"
37#include "caps.h"
5f126977 38#include "cgroups/cgroup.h"
d38dd64a
CB
39#include "conf.h"
40#include "config.h"
41#include "confile.h"
42#include "confile_utils.h"
43#include "error.h"
44#include "log.h"
45#include "lsm/lsm.h"
46#include "lxclock.h"
47#include "lxcseccomp.h"
48#include "macro.h"
2f443e88 49#include "memory_utils.h"
7f88a1a2 50#include "mount_utils.h"
d38dd64a
CB
51#include "namespace.h"
52#include "network.h"
53#include "parse.h"
f40988c7 54#include "process_utils.h"
d38dd64a
CB
55#include "ringbuf.h"
56#include "start.h"
5f126977 57#include "storage/storage.h"
d38dd64a 58#include "storage/overlay.h"
6b3d24d7 59#include "syscall_wrappers.h"
d38dd64a
CB
60#include "terminal.h"
61#include "utils.h"
20502652 62#include "uuid.h"
d38dd64a 63
af6824fc 64#ifdef MAJOR_IN_MKDEV
9d257a2a 65#include <sys/mkdev.h>
af6824fc 66#endif
af6824fc 67
614305f3 68#ifdef HAVE_STATVFS
2938f7c8 69#include <sys/statvfs.h>
614305f3 70#endif
e827ff7e 71
35eb5cdc 72#if HAVE_OPENPTY
b0a33c1e 73#include <pty.h>
e827ff7e
SG
74#else
75#include <../include/openpty.h>
76#endif
0ad19a3f 77
9d257a2a
CB
78#if HAVE_LIBCAP
79#include <sys/capability.h>
80#endif
81
82#if HAVE_SYS_PERSONALITY_H
83#include <sys/personality.h>
84#endif
85
f1e05b90
DJ
86#ifndef HAVE_STRLCAT
87#include "include/strlcat.h"
88#endif
89
9d257a2a
CB
90#if IS_BIONIC
91#include <../include/lxcmntent.h>
92#else
93#include <mntent.h>
94#endif
95
96#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
97#include <../include/prlimit.h>
98#endif
99
/* Declare this file's log category: "conf", with "lxc" given as its parent. */
lxc_log_define(conf, lxc);
e5bda9ee 101
0fd73091
CB
/*
 * Configuration of the container that the API call in progress is operating
 * on. The error-reporting calls consult it. One instance per thread when the
 * build has thread-local storage (HAVE_TLS), otherwise a single global.
 */
#ifdef HAVE_TLS
thread_local struct lxc_conf *current_config;
#else
struct lxc_conf *current_config;
#endif
8912711c 110
0fd73091
CB
111char *lxchook_names[NUM_LXC_HOOKS] = {
112 "pre-start",
113 "pre-mount",
114 "mount",
115 "autodev",
116 "start",
117 "stop",
118 "post-stop",
119 "clone",
120 "destroy",
121 "start-host"
122};
72d0e1cb 123
998ac676
RT
/* One textual mount option and the mount(2) flag bits it maps to. */
struct mount_opt {
	/* Option keyword, e.g. "ro" or "nosuid". */
	char *name;
	/* Non-zero for keywords such as "rw"/"suid" — presumably meaning the
	 * flag is removed rather than added; confirm against the parser. */
	int clear;
	/* MS_* flag bits associated with the keyword. */
	int flag;
};
129
81810dd1
DL
/* Maps a capability name (without the "CAP_" prefix) to its numeric value. */
struct caps_opt {
	char *name;
	int value;
};
134
c6d09e15
WB
/* Maps a resource-limit name (e.g. "nofile") to its RLIMIT_* constant. */
struct limit_opt {
	char *name;
	int value;
};
139
998ac676 140static struct mount_opt mount_opt[] = {
470b359b
CB
141 { "async", 1, MS_SYNCHRONOUS },
142 { "atime", 1, MS_NOATIME },
143 { "bind", 0, MS_BIND },
88d413d5 144 { "defaults", 0, 0 },
88d413d5 145 { "dev", 1, MS_NODEV },
470b359b 146 { "diratime", 1, MS_NODIRATIME },
88d413d5 147 { "dirsync", 0, MS_DIRSYNC },
470b359b 148 { "exec", 1, MS_NOEXEC },
8912711c 149 { "lazytime", 0, MS_LAZYTIME },
88d413d5 150 { "mand", 0, MS_MANDLOCK },
88d413d5 151 { "noatime", 0, MS_NOATIME },
470b359b 152 { "nodev", 0, MS_NODEV },
88d413d5 153 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
154 { "noexec", 0, MS_NOEXEC },
155 { "nomand", 1, MS_MANDLOCK },
156 { "norelatime", 1, MS_RELATIME },
157 { "nostrictatime", 1, MS_STRICTATIME },
158 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
159 { "rbind", 0, MS_BIND|MS_REC },
160 { "relatime", 0, MS_RELATIME },
470b359b
CB
161 { "remount", 0, MS_REMOUNT },
162 { "ro", 0, MS_RDONLY },
163 { "rw", 1, MS_RDONLY },
88d413d5 164 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
165 { "suid", 1, MS_NOSUID },
166 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 167 { NULL, 0, 0 },
998ac676
RT
168};
169
d840039e 170static struct mount_opt propagation_opt[] = {
0fd73091
CB
171 { "private", 0, MS_PRIVATE },
172 { "shared", 0, MS_SHARED },
173 { "slave", 0, MS_SLAVE },
174 { "unbindable", 0, MS_UNBINDABLE },
175 { "rprivate", 0, MS_PRIVATE|MS_REC },
176 { "rshared", 0, MS_SHARED|MS_REC },
177 { "rslave", 0, MS_SLAVE|MS_REC },
178 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
179 { NULL, 0, 0 },
d840039e
YT
180};
181
81810dd1 182static struct caps_opt caps_opt[] = {
8560cd36 183#if HAVE_LIBCAP
7b4cd468
CB
184 { "chown", CAP_CHOWN },
185 { "dac_override", CAP_DAC_OVERRIDE },
186 { "dac_read_search", CAP_DAC_READ_SEARCH },
187 { "fowner", CAP_FOWNER },
188 { "fsetid", CAP_FSETID },
189 { "kill", CAP_KILL },
190 { "setgid", CAP_SETGID },
191 { "setuid", CAP_SETUID },
192 { "setpcap", CAP_SETPCAP },
193 { "linux_immutable", CAP_LINUX_IMMUTABLE },
194 { "net_bind_service", CAP_NET_BIND_SERVICE },
195 { "net_broadcast", CAP_NET_BROADCAST },
196 { "net_admin", CAP_NET_ADMIN },
197 { "net_raw", CAP_NET_RAW },
198 { "ipc_lock", CAP_IPC_LOCK },
199 { "ipc_owner", CAP_IPC_OWNER },
200 { "sys_module", CAP_SYS_MODULE },
201 { "sys_rawio", CAP_SYS_RAWIO },
202 { "sys_chroot", CAP_SYS_CHROOT },
203 { "sys_ptrace", CAP_SYS_PTRACE },
204 { "sys_pacct", CAP_SYS_PACCT },
205 { "sys_admin", CAP_SYS_ADMIN },
206 { "sys_boot", CAP_SYS_BOOT },
207 { "sys_nice", CAP_SYS_NICE },
208 { "sys_resource", CAP_SYS_RESOURCE },
209 { "sys_time", CAP_SYS_TIME },
210 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
211 { "mknod", CAP_MKNOD },
212 { "lease", CAP_LEASE },
213 { "audit_write", CAP_AUDIT_WRITE },
214 { "audit_control", CAP_AUDIT_CONTROL },
215 { "setfcap", CAP_SETFCAP },
216 { "mac_override", CAP_MAC_OVERRIDE },
217 { "mac_admin", CAP_MAC_ADMIN },
218 { "syslog", CAP_SYSLOG },
219 { "wake_alarm", CAP_WAKE_ALARM },
220 { "block_suspend", CAP_BLOCK_SUSPEND },
221 { "audit_read", CAP_AUDIT_READ },
222 { "perfmon", CAP_PERFMON },
223 { "bpf", CAP_BPF },
224 { "checkpoint_restore", CAP_CHECKPOINT_RESTORE },
2b54359b 225#endif
8560cd36 226};
81810dd1 227
c6d09e15
WB
228static struct limit_opt limit_opt[] = {
229#ifdef RLIMIT_AS
230 { "as", RLIMIT_AS },
231#endif
232#ifdef RLIMIT_CORE
233 { "core", RLIMIT_CORE },
234#endif
235#ifdef RLIMIT_CPU
236 { "cpu", RLIMIT_CPU },
237#endif
238#ifdef RLIMIT_DATA
239 { "data", RLIMIT_DATA },
240#endif
241#ifdef RLIMIT_FSIZE
242 { "fsize", RLIMIT_FSIZE },
243#endif
244#ifdef RLIMIT_LOCKS
245 { "locks", RLIMIT_LOCKS },
246#endif
247#ifdef RLIMIT_MEMLOCK
248 { "memlock", RLIMIT_MEMLOCK },
249#endif
250#ifdef RLIMIT_MSGQUEUE
251 { "msgqueue", RLIMIT_MSGQUEUE },
252#endif
253#ifdef RLIMIT_NICE
254 { "nice", RLIMIT_NICE },
255#endif
256#ifdef RLIMIT_NOFILE
257 { "nofile", RLIMIT_NOFILE },
258#endif
259#ifdef RLIMIT_NPROC
260 { "nproc", RLIMIT_NPROC },
261#endif
262#ifdef RLIMIT_RSS
263 { "rss", RLIMIT_RSS },
264#endif
265#ifdef RLIMIT_RTPRIO
266 { "rtprio", RLIMIT_RTPRIO },
267#endif
268#ifdef RLIMIT_RTTIME
269 { "rttime", RLIMIT_RTTIME },
270#endif
271#ifdef RLIMIT_SIGPENDING
272 { "sigpending", RLIMIT_SIGPENDING },
273#endif
274#ifdef RLIMIT_STACK
275 { "stack", RLIMIT_STACK },
276#endif
277};
278
91c3830e
SH
279static int run_buffer(char *buffer)
280{
cc6a0e78 281 __do_free char *output = NULL;
55022530 282 __do_lxc_pclose struct lxc_popen_FILE *f = NULL;
ebf3a6af 283 int fd, ret;
91c3830e 284
ebec9176 285 f = lxc_popen(buffer);
55022530
CB
286 if (!f)
287 return log_error_errno(-1, errno, "Failed to popen() %s", buffer);
91c3830e
SH
288
289 output = malloc(LXC_LOG_BUFFER_SIZE);
55022530
CB
290 if (!output)
291 return log_error_errno(-1, ENOMEM, "Failed to allocate memory for %s", buffer);
91c3830e 292
ebf3a6af 293 fd = fileno(f->f);
55022530
CB
294 if (fd < 0)
295 return log_error_errno(-1, errno, "Failed to retrieve underlying file descriptor");
ebf3a6af
CB
296
297 for (int i = 0; i < 10; i++) {
298 ssize_t bytes_read;
299
300 bytes_read = lxc_read_nointr(fd, output, LXC_LOG_BUFFER_SIZE - 1);
301 if (bytes_read > 0) {
302 output[bytes_read] = '\0';
303 DEBUG("Script %s produced output: %s", buffer, output);
304 continue;
305 }
306
307 break;
308 }
91c3830e 309
55022530
CB
310 ret = lxc_pclose(move_ptr(f));
311 if (ret == -1)
312 return log_error_errno(-1, errno, "Script exited with error");
313 else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0)
314 return log_error(-1, "Script exited with status %d", WEXITSTATUS(ret));
315 else if (WIFSIGNALED(ret))
316 return log_error(-1, "Script terminated by signal %d", WTERMSIG(ret));
91c3830e
SH
317
318 return 0;
319}
320
14a7b0f9
CB
321int run_script_argv(const char *name, unsigned int hook_version,
322 const char *section, const char *script,
586b1ce7 323 const char *hookname, char **argv)
148e91f5 324{
e1a94937 325 __do_free char *buffer = NULL;
3f60c2f7 326 int buf_pos, i, ret;
d08e5708 327 size_t size = 0;
148e91f5 328
3f60c2f7 329 if (hook_version == 0)
55022530
CB
330 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
331 script, name, section);
3f60c2f7
CB
332 else
333 INFO("Executing script \"%s\" for container \"%s\"", script, name);
148e91f5 334
586b1ce7
CB
335 for (i = 0; argv && argv[i]; i++)
336 size += strlen(argv[i]) + 1;
148e91f5 337
6333c915
CB
338 size += STRLITERALLEN("exec");
339 size++;
148e91f5 340 size += strlen(script);
3f60c2f7
CB
341 size++;
342
148e91f5 343 if (size > INT_MAX)
3f60c2f7 344 return -EFBIG;
148e91f5 345
3f60c2f7 346 if (hook_version == 0) {
d08e5708
CB
347 size += strlen(hookname);
348 size++;
349
350 size += strlen(name);
351 size++;
352
353 size += strlen(section);
354 size++;
355
356 if (size > INT_MAX)
357 return -EFBIG;
327cce76 358 }
3f60c2f7 359
6f8d00d2
CB
360 buffer = malloc(size);
361 if (!buffer)
362 return -ENOMEM;
363
327cce76 364 if (hook_version == 0)
9bcde680 365 buf_pos = strnprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
327cce76 366 else
9bcde680
CB
367 buf_pos = strnprintf(buffer, size, "exec %s", script);
368 if (buf_pos < 0)
55022530 369 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 370
327cce76 371 if (hook_version == 1) {
3f60c2f7
CB
372 ret = setenv("LXC_HOOK_TYPE", hookname, 1);
373 if (ret < 0) {
55022530 374 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7 375 }
90f20466 376 TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7
CB
377
378 ret = setenv("LXC_HOOK_SECTION", section, 1);
55022530
CB
379 if (ret < 0)
380 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_SECTION=%s", section);
3f60c2f7 381 TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);
14a7b0f9
CB
382
383 if (strcmp(section, "net") == 0) {
384 char *parent;
385
586b1ce7 386 if (!argv || !argv[0])
e1a94937 387 return -1;
14a7b0f9 388
586b1ce7 389 ret = setenv("LXC_NET_TYPE", argv[0], 1);
55022530
CB
390 if (ret < 0)
391 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_TYPE=%s", argv[0]);
586b1ce7 392 TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);
14a7b0f9 393
586b1ce7 394 parent = argv[1] ? argv[1] : "";
14a7b0f9 395
a8144263 396 if (strcmp(argv[0], "macvlan") == 0) {
14a7b0f9 397 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
398 if (ret < 0)
399 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 400 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 401 } else if (strcmp(argv[0], "phys") == 0) {
14a7b0f9 402 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
403 if (ret < 0)
404 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 405 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 406 } else if (strcmp(argv[0], "veth") == 0) {
586b1ce7 407 char *peer = argv[2] ? argv[2] : "";
14a7b0f9
CB
408
409 ret = setenv("LXC_NET_PEER", peer, 1);
55022530
CB
410 if (ret < 0)
411 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PEER=%s", peer);
14a7b0f9
CB
412 TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
413
414 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
415 if (ret < 0)
416 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9
CB
417 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
418 }
419 }
148e91f5
SH
420 }
421
586b1ce7 422 for (i = 0; argv && argv[i]; i++) {
3f60c2f7
CB
423 size_t len = size - buf_pos;
424
9bcde680
CB
425 ret = strnprintf(buffer + buf_pos, len, " %s", argv[i]);
426 if (ret < 0)
55022530 427 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 428 buf_pos += ret;
148e91f5
SH
429 }
430
e1a94937 431 return run_buffer(buffer);
148e91f5
SH
432}
433
811ef482 434int run_script(const char *name, const char *section, const char *script, ...)
e3b4c4c4 435{
2f443e88 436 __do_free char *buffer = NULL;
abbfd20b 437 int ret;
2f443e88 438 char *p;
abbfd20b 439 va_list ap;
0fd73091 440 size_t size = 0;
751d9dcd 441
0fd73091 442 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
751d9dcd 443 script, name, section);
e3b4c4c4 444
abbfd20b
DL
445 va_start(ap, script);
446 while ((p = va_arg(ap, char *)))
95642a10 447 size += strlen(p) + 1;
abbfd20b
DL
448 va_end(ap);
449
6333c915 450 size += STRLITERALLEN("exec");
abbfd20b
DL
451 size += strlen(script);
452 size += strlen(name);
453 size += strlen(section);
6d1a5f93 454 size += 4;
abbfd20b 455
95642a10
MS
456 if (size > INT_MAX)
457 return -1;
458
2f443e88 459 buffer = must_realloc(NULL, size);
9bcde680
CB
460 ret = strnprintf(buffer, size, "exec %s %s %s", script, name, section);
461 if (ret < 0)
9ba8130c 462 return -1;
751d9dcd 463
abbfd20b 464 va_start(ap, script);
9ba8130c 465 while ((p = va_arg(ap, char *))) {
062b72c6 466 int len = size - ret;
9ba8130c 467 int rc;
9bcde680
CB
468 rc = strnprintf(buffer + ret, len, " %s", p);
469 if (rc < 0) {
7b5a2435 470 va_end(ap);
9ba8130c 471 return -1;
7b5a2435 472 }
9ba8130c
SH
473 ret += rc;
474 }
abbfd20b 475 va_end(ap);
751d9dcd 476
91c3830e 477 return run_buffer(buffer);
e3b4c4c4
ST
478}
479
79ff643d 480/* lxc_rootfs_prepare
63fc76c3 481 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
482 * the duration of the container run, to prevent the container from marking
483 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
484 * no name pollution is happens.
485 * don't unlink on NFS to avoid random named stale handles.
0c547523 486 */
79ff643d 487int lxc_rootfs_prepare(struct lxc_rootfs *rootfs, bool userns)
0c547523 488{
79ff643d
CB
489 __do_close int dfd_path = -EBADF, fd_pin = -EBADF;
490 int ret;
491 struct stat st;
492 struct statfs stfs;
0c547523 493
79ff643d
CB
494 if (rootfs->path) {
495 if (rootfs->bdev_type &&
496 (!strcmp(rootfs->bdev_type, "overlay") ||
497 !strcmp(rootfs->bdev_type, "overlayfs")))
498 return log_trace_errno(0, EINVAL, "Not pinning on stacking filesystem");
e99ee0de 499
79ff643d
CB
500 dfd_path = open_at(-EBADF, rootfs->path, PROTECT_OPATH_FILE, 0, 0);
501 } else {
502 dfd_path = open_at(-EBADF, "/", PROTECT_OPATH_FILE, PROTECT_LOOKUP_ABSOLUTE, 0);
503 }
504 if (dfd_path < 0)
505 return log_error_errno(-errno, errno, "Failed to open \"%s\"", rootfs->path);
506
507 if (!rootfs->path)
508 return log_trace(0, "Not pinning because container does not have a rootfs");
0c547523 509
79ff643d
CB
510 if (userns)
511 return log_trace(0, "Not pinning because container runs in user namespace");
512
513 ret = fstat(dfd_path, &st);
957c4704 514 if (ret < 0)
79ff643d 515 return log_trace_errno(-errno, errno, "Failed to retrieve file status");
0c547523 516
79ff643d
CB
517 if (!S_ISDIR(st.st_mode))
518 return log_trace_errno(0, ENOTDIR, "Not pinning because file descriptor is not a directory");
0c547523 519
79ff643d
CB
520 fd_pin = open_at(dfd_path, ".lxc_keep",
521 PROTECT_OPEN | O_CREAT,
522 PROTECT_LOOKUP_BENEATH,
523 S_IWUSR | S_IRUSR);
524 if (fd_pin < 0)
525 return log_error_errno(-errno, errno, "Failed to pin rootfs");
0c547523 526
79ff643d 527 TRACE("Pinned rootfs %d(.lxc_keep)", fd_pin);
0fd73091 528
79ff643d
CB
529 ret = fstatfs(fd_pin, &stfs);
530 if (ret < 0) {
531 SYSWARN("Failed to retrieve filesystem status");
532 goto out;
533 }
63fc76c3 534
79ff643d
CB
535 if (stfs.f_type == NFS_SUPER_MAGIC) {
536 DEBUG("Not unlinking pinned file on NFS");
537 goto out;
538 }
63fc76c3 539
79ff643d
CB
540 if (unlinkat(dfd_path, ".lxc_keep", 0))
541 SYSTRACE("Failed to unlink rootfs pinning file %d(.lxc_keep)", dfd_path);
542 else
543 TRACE("Unlinked pinned file %d(.lxc_keep)", dfd_path);
0fd73091 544
79ff643d
CB
545out:
546 rootfs->fd_path_pin = move_fd(fd_pin);
547 return 0;
0c547523
SH
548}
549
6b741397
CB
550static int add_shmount_to_list(struct lxc_conf *conf)
551{
6b5a54cd 552 char new_mount[PATH_MAX];
0d190408 553 /* Offset for the leading '/' since the path_cont
6b741397
CB
554 * is absolute inside the container.
555 */
556 int offset = 1, ret = -1;
0d190408 557
9bcde680 558 ret = strnprintf(new_mount, sizeof(new_mount),
6b741397
CB
559 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
560 conf->shmount.path_cont + offset);
9bcde680 561 if (ret < 0)
0d190408
LT
562 return -1;
563
6b741397 564 return add_elem_to_mount_list(new_mount, conf);
0d190408
LT
565}
566
4fb3cba5 567static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 568{
7b371c1e 569 int i, ret;
b06b8511
CS
570 static struct {
571 int match_mask;
572 int match_flag;
573 const char *source;
574 const char *destination;
575 const char *fstype;
576 unsigned long flags;
577 const char *options;
e8b9c9ec 578 bool requires_cap_net_admin;
b06b8511 579 } default_mounts[] = {
0fd73091
CB
580 /* Read-only bind-mounting... In older kernels, doing that
581 * required to do one MS_BIND mount and then
582 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
583 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
584 * onwards. However, this apparently does not work on kernel
585 * 3.8. Unfortunately, on that very same kernel, doing the same
586 * trick as above doesn't seem to work either, there one needs
587 * to ALSO specify MS_BIND for the remount, otherwise the
588 * entire fs is remounted read-only or the mount fails because
589 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
590 * kernels as low as 2.6.32...
368bbc02 591 */
5d1bf4c4 592 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
592fd47a 593 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
5d1bf4c4
CB
594 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL, true },
595 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL, false },
596 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
597 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL, true },
598 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL, false },
599 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
600 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
601 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL, false },
602 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL, false },
603 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
604 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL, false },
605 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
606 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL, false },
607 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
608 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
609 { 0, 0, NULL, NULL, NULL, 0, NULL, false }
b06b8511 610 };
e25af1bc
CB
611 struct lxc_rootfs *rootfs = &conf->rootfs;
612 bool has_cap_net_admin;
368bbc02 613
f4bea7cc 614 if (flags & LXC_AUTO_PROC_MASK) {
ea57e424 615 ret = mkdirat(rootfs->dfd_mnt, "proc" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
f4bea7cc
CB
616 if (ret < 0 && errno != EEXIST)
617 return log_error_errno(-errno, errno,
ea57e424 618 "Failed to create proc mountpoint under %d", rootfs->dfd_mnt);
f4bea7cc
CB
619 }
620
621 if (flags & LXC_AUTO_SYS_MASK) {
ea57e424 622 ret = mkdirat(rootfs->dfd_mnt, "sys" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
f4bea7cc
CB
623 if (ret < 0 && errno != EEXIST)
624 return log_error_errno(-errno, errno,
ea57e424 625 "Failed to create sysfs mountpoint under %d", rootfs->dfd_mnt);
f4bea7cc
CB
626 }
627
e25af1bc 628 has_cap_net_admin = lxc_wants_cap(CAP_NET_ADMIN, conf);
d84b26bc 629 for (i = 0; default_mounts[i].match_mask; i++) {
8db92302 630 __do_free char *destination = NULL, *source = NULL;
0fd73091
CB
631 int saved_errno;
632 unsigned long mflags;
0fd73091
CB
633 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
634 continue;
635
636 if (default_mounts[i].source) {
cc4fd506 637 /* will act like strdup if %r is not present */
e25af1bc 638 source = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].source);
0fd73091 639 if (!source)
cc4fd506 640 return -1;
0fd73091 641 }
f24a52d5 642
55022530
CB
643 if (!default_mounts[i].destination)
644 return log_error(-1, "BUG: auto mounts destination %d was NULL", i);
0fd73091 645
e8b9c9ec 646 if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
647 TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
648 continue;
649 }
650
0fd73091 651 /* will act like strdup if %r is not present */
e25af1bc 652 destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
55022530 653 if (!destination)
0fd73091 654 return -1;
0fd73091
CB
655
656 mflags = add_required_remount_flags(source, destination,
657 default_mounts[i].flags);
7b371c1e
CB
658 ret = safe_mount(source, destination, default_mounts[i].fstype,
659 mflags, default_mounts[i].options,
660 rootfs->path ? rootfs->mount : NULL);
0fd73091 661 saved_errno = errno;
7b371c1e 662 if (ret < 0 && errno == ENOENT) {
55022530 663 INFO("Mount source or target for \"%s\" on \"%s\" does not exist. Skipping", source, destination);
7b371c1e
CB
664 ret = 0;
665 } else if (ret < 0) {
0fd73091
CB
666 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
667 }
668
7b371c1e 669 if (ret < 0) {
0fd73091
CB
670 errno = saved_errno;
671 return -1;
368bbc02 672 }
368bbc02
CS
673 }
674
b06b8511 675 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
676 int cg_flags;
677
3f69fb12 678 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
679 /* If the type of cgroup mount was not specified, it depends on
680 * the container's capabilities as to what makes sense: if we
681 * have CAP_SYS_ADMIN, the read-only part can be remounted
682 * read-write anyway, so we may as well default to read-write;
683 * then the admin will not be given a false sense of security.
684 * (And if they really want mixed r/o r/w, then they can
685 * explicitly specify :mixed.) OTOH, if the container lacks
686 * CAP_SYS_ADMIN, do only default to :mixed, because then the
687 * container can't remount it read-write.
688 */
0769b82a
CS
689 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
690 int has_sys_admin = 0;
b0ee5983
CB
691
692 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 693 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 694 else
0769b82a 695 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
696
697 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 698 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 699 else
0769b82a 700 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 701 }
0fd73091 702
3f69fb12 703 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
704 cg_flags |= LXC_AUTO_CGROUP_FORCE;
705
315f8a4e 706 if (!handler->cgroup_ops->mount(handler->cgroup_ops, conf, cg_flags))
55022530 707 return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
368bbc02
CS
708 }
709
0d190408 710 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
7b371c1e 711 ret = add_shmount_to_list(conf);
55022530
CB
712 if (ret < 0)
713 return log_error(-1, "Failed to add shmount entry to container config");
0d190408
LT
714 }
715
368bbc02 716 return 0;
368bbc02
CS
717}
718
/*
 * Set the hostname to utsname->nodename. A NULL @utsname means there is
 * nothing to configure. Returns 0 on success (or nothing to do), -1 if
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0)
		return log_error_errno(-1, errno, "Failed to set the hostname to \"%s\"",
				       utsname->nodename);

	INFO("Set hostname to \"%s\"", utsname->nodename);

	return 0;
}
735
69aa6655
DE
/* A symlink to create in the container's /dev: @name points at @oldpath. */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* Standard-stream convenience links, all targeting /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
747
ed8704d0 748static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 749{
79019997
CB
750 for (int i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
751 int ret;
752 struct stat s;
69aa6655 753 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091 754
79019997
CB
755 /*
756 * Stat the path first. If we don't get an error accept it as
0fd73091 757 * is and don't try to create it
09227be2 758 */
a5a08920 759 ret = fstatat(rootfs->dfd_dev, d->name, &s, 0);
0fd73091 760 if (ret == 0)
09227be2 761 continue;
09227be2 762
a5a08920 763 ret = symlinkat(d->oldpath, rootfs->dfd_dev, d->name);
79019997
CB
764 if (ret) {
765 switch (errno) {
766 case EROFS:
767 WARN("Failed to create \"%s\" on read-only filesystem", d->name);
768 __fallthrough;
769 case EEXIST:
770 break;
771 default:
772 return log_error_errno(-errno, errno, "Failed to create \"%s\"", d->name);
773 }
69aa6655
DE
774 }
775 }
0fd73091 776
69aa6655
DE
777 return 0;
778}
779
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * When *pp is NULL a new string "container_ttys=<name>" is allocated;
 * otherwise " <name>" is appended to the existing string, which is grown
 * with realloc(). On allocation failure false is returned and *pp is left
 * untouched.
 */
static bool append_ttyname(char **pp, char *name)
{
	char *list;
	size_t cur_len;

	if (!*pp) {
		/* sizeof() of the literal already accounts for the NUL byte. */
		list = malloc(sizeof("container_ttys=") + strlen(name));
		if (!list)
			return false;

		sprintf(list, "container_ttys=%s", name);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);

	/* Room for the old text, a separating space, the name and the NUL. */
	list = realloc(*pp, cur_len + strlen(name) + 2);
	if (!list)
		return false;

	*pp = list;
	sprintf(list + cur_len, " %s", name);

	return true;
}
806
2187efd3 807static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 808{
7369e6bf
CB
809 int ret;
810 struct lxc_rootfs *rootfs = &conf->rootfs;
0e4be3cf 811 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 812 char *ttydir = ttys->dir;
b0a33c1e 813
e8bd4e43 814 if (!conf->rootfs.path)
bc9bd0e3
DL
815 return 0;
816
7369e6bf
CB
817 for (int i = 0; i < ttys->max; i++) {
818 __do_close int fd_to = -EBADF;
0e4be3cf 819 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 820
7c6ef2a2 821 if (ttydir) {
7369e6bf 822 char *tty_name, *tty_path;
9e1045e3 823
9bcde680 824 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf),
7369e6bf 825 "/dev/%s/tty%d", ttydir, i + 1);
9bcde680 826 if (ret < 0)
7369e6bf
CB
827 return ret_errno(-EIO);
828
829 tty_path = &rootfs->buf[STRLITERALLEN("/dev/")];
830 tty_name = tty_path + strlen(ttydir) + 1;
831
832 /* create bind-mount target */
833 fd_to = open_at(rootfs->dfd_dev, tty_path,
834 PROTECT_OPEN_W | O_CREAT,
835 PROTECT_LOOKUP_BENEATH, 0);
836 if (fd_to < 0)
837 return log_error_errno(-errno, errno,
838 "Failed to create tty mount target %d(%s)",
839 rootfs->dfd_dev, tty_path);
840
841 ret = unlinkat(rootfs->dfd_dev, tty_name, 0);
842 if (ret < 0 && errno != ENOENT)
843 return log_error_errno(-errno, errno,
844 "Failed to unlink %d(%s)",
845 rootfs->dfd_dev, tty_name);
846
de7f9f33 847 if (can_use_mount_api()) {
7369e6bf
CB
848 ret = fd_bind_mount(tty->pty, "",
849 PROTECT_OPATH_FILE,
850 PROTECT_LOOKUP_BENEATH_XDEV,
851 fd_to, "",
852 PROTECT_OPATH_FILE,
853 PROTECT_LOOKUP_BENEATH_XDEV, 0,
854 false);
855 } else {
856 ret = mount(tty->name, rootfs->buf, "none", MS_BIND, 0);
7c6ef2a2 857 }
7369e6bf
CB
858 if (ret < 0)
859 return log_error_errno(-errno, errno,
860 "Failed to bind mount \"%s\" onto \"%s\"",
861 tty->name, rootfs->buf);
862 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
9e1045e3 863
7369e6bf 864 ret = symlinkat(tty_path, rootfs->dfd_dev, tty_name);
55022530 865 if (ret < 0)
7369e6bf
CB
866 return log_error_errno(-errno, errno,
867 "Failed to create symlink \"%d(%s)\" -> \"%d(%s)\"",
868 rootfs->dfd_dev, tty_name,
869 rootfs->dfd_dev, tty_path);
7c6ef2a2 870 } else {
9bcde680
CB
871 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "tty%d", i + 1);
872 if (ret < 0)
7369e6bf
CB
873 return ret_errno(-EIO);
874
875 /* If we populated /dev, then we need to create /dev/tty<idx>. */
876 fd_to = open_at(rootfs->dfd_dev, rootfs->buf,
877 PROTECT_OPEN_W | O_CREAT,
878 PROTECT_LOOKUP_BENEATH, 0);
879 if (fd_to < 0)
880 return log_error_errno(-errno, errno,
881 "Failed to create tty mount target %d(%s)",
882 rootfs->dfd_dev, rootfs->buf);
883
de7f9f33 884 if (can_use_mount_api()) {
7369e6bf
CB
885 ret = fd_bind_mount(tty->pty, "",
886 PROTECT_OPATH_FILE,
887 PROTECT_LOOKUP_BENEATH_XDEV,
888 fd_to, "",
889 PROTECT_OPATH_FILE,
890 PROTECT_LOOKUP_BENEATH, 0,
891 false);
892 } else {
9bcde680
CB
893 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/dev/tty%d", i + 1);
894 if (ret < 0)
7369e6bf
CB
895 return ret_errno(-EIO);
896
897 ret = mount(tty->name, rootfs->buf, "none", MS_BIND, 0);
7c6ef2a2 898 }
7369e6bf
CB
899 if (ret < 0)
900 return log_error_errno(-errno, errno,
901 "Failed to bind mount \"%s\" onto \"%s\"",
902 tty->name, rootfs->buf);
903 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, rootfs->buf);
393903d1 904 }
9e1045e3 905
55022530
CB
906 if (!append_ttyname(&conf->ttys.tty_names, tty->name))
907 return log_error(-1, "Error setting up container_ttys string");
b0a33c1e 908 }
909
885766f5 910 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 911 return 0;
912}
913
9d0e129b
CB
/* NOTE(review): presumably generates the cleanup helper that
 * call_cleaner(lxc_delete_tty) in lxc_allocate_ttys() refers to - confirm
 * against the macro definition.
 */
define_cleanup_function(struct lxc_tty_info *, lxc_delete_tty);
915
59eac805 916static int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 917{
9d0e129b 918 call_cleaner(lxc_delete_tty) struct lxc_tty_info *ttys = &conf->ttys;
fca23691 919 int ret;
2187efd3
CB
920
921 /* no tty in the configuration */
885766f5 922 if (ttys->max == 0)
2187efd3
CB
923 return 0;
924
9d0e129b
CB
925 ttys->tty = zalloc(sizeof(struct lxc_terminal_info) * ttys->max);
926 if (!ttys->tty)
2187efd3 927 return -ENOMEM;
2187efd3 928
7369e6bf 929 for (size_t i = 0; i < conf->ttys.max; i++) {
9d0e129b 930 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 931
36a94ce8 932 tty->ptx = -EBADF;
41808e20
CB
933 tty->pty = -EBADF;
934 ret = openpty(&tty->ptx, &tty->pty, NULL, NULL, NULL);
77a39805 935 if (ret < 0) {
7369e6bf 936 conf->ttys.max = i;
55022530 937 return log_error_errno(-ENOTTY, ENOTTY, "Failed to create tty %zu", i);
2187efd3
CB
938 }
939
41808e20 940 ret = ttyname_r(tty->pty, tty->name, sizeof(tty->name));
77a39805 941 if (ret < 0) {
7369e6bf 942 conf->ttys.max = i;
41808e20 943 return log_error_errno(-ENOTTY, ENOTTY, "Failed to retrieve name of tty %zu pty", i);
77a39805
CB
944 }
945
7369e6bf 946 DEBUG("Created tty with ptx fd %d and pty fd %d", tty->ptx, tty->pty);
2187efd3
CB
947
948 /* Prevent leaking the file descriptors to the container */
36a94ce8 949 ret = fd_cloexec(tty->ptx, true);
2187efd3 950 if (ret < 0)
36a94ce8
CB
951 SYSWARN("Failed to set FD_CLOEXEC flag on ptx fd %d of tty device \"%s\"",
952 tty->ptx, tty->name);
2187efd3 953
41808e20 954 ret = fd_cloexec(tty->pty, true);
2187efd3 955 if (ret < 0)
41808e20
CB
956 SYSWARN("Failed to set FD_CLOEXEC flag on pty fd %d of tty device \"%s\"",
957 tty->pty, tty->name);
2187efd3 958
7581d645 959 tty->busy = -1;
2187efd3
CB
960 }
961
885766f5 962 INFO("Finished creating %zu tty devices", ttys->max);
9d0e129b 963 move_ptr(ttys);
2187efd3
CB
964 return 0;
965}
966
0e4be3cf 967void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3 968{
386e6768
CB
969 if (!ttys->tty)
970 return;
971
55022530 972 for (int i = 0; i < ttys->max; i++) {
0e4be3cf 973 struct lxc_terminal_info *tty = &ttys->tty[i];
36a94ce8 974 close_prot_errno_disarm(tty->ptx);
41808e20 975 close_prot_errno_disarm(tty->pty);
2187efd3
CB
976 }
977
55022530 978 free_disarm(ttys->tty);
2187efd3
CB
979}
980
981static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
982{
983 int i;
0fd73091 984 int ret = -1;
2187efd3 985 struct lxc_conf *conf = handler->conf;
0e4be3cf 986 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 987 int sock = handler->data_sock[0];
2187efd3 988
885766f5 989 if (ttys->max == 0)
2187efd3
CB
990 return 0;
991
885766f5 992 for (i = 0; i < ttys->max; i++) {
2187efd3 993 int ttyfds[2];
0e4be3cf 994 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 995
36a94ce8 996 ttyfds[0] = tty->ptx;
41808e20 997 ttyfds[1] = tty->pty;
2187efd3
CB
998
999 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1000 if (ret < 0)
1001 break;
1002
41808e20
CB
1003 TRACE("Sent tty \"%s\" with ptx fd %d and pty fd %d to parent",
1004 tty->name, tty->ptx, tty->pty);
2187efd3
CB
1005 }
1006
1007 if (ret < 0)
6d1400b5 1008 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1009 else
885766f5 1010 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1011
1012 return ret;
1013}
1014
1015static int lxc_create_ttys(struct lxc_handler *handler)
1016{
1017 int ret = -1;
1018 struct lxc_conf *conf = handler->conf;
1019
663014ee 1020 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1021 if (ret < 0) {
1022 ERROR("Failed to allocate ttys");
1023 goto on_error;
1024 }
1025
1026 ret = lxc_send_ttys_to_parent(handler);
1027 if (ret < 0) {
1028 ERROR("Failed to send ttys to parent");
1029 goto on_error;
1030 }
1031
1032 if (!conf->is_execute) {
1033 ret = lxc_setup_ttys(conf);
1034 if (ret < 0) {
1035 ERROR("Failed to setup ttys");
1036 goto on_error;
1037 }
1038 }
1039
885766f5
CB
1040 if (conf->ttys.tty_names) {
1041 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1042 if (ret < 0)
885766f5 1043 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1044 }
1045
1046 ret = 0;
1047
1048on_error:
0e4be3cf 1049 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1050
1051 return ret;
1052}
1053
7133b912
CB
1054/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1055 * error, log it but don't fail yet.
91c3830e 1056 */
7133b912 1057static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
63012bdd 1058 int autodevtmpfssize, const char *lxcpath)
91c3830e 1059{
bfbfeedf 1060 __do_close int fd_fs = -EBADF;
ee8eeba8 1061 const char *path = rootfs->path ? rootfs->mount : NULL;
bfbfeedf 1062 size_t tmpfs_size = (autodevtmpfssize != 0) ? autodevtmpfssize : 500000;
91c3830e 1063 int ret;
87e0e273 1064 mode_t cur_mask;
63012bdd 1065 char mount_options[128];
91c3830e 1066
7133b912 1067 INFO("Preparing \"/dev\"");
bc6928ff 1068
87e0e273 1069 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
ea57e424 1070 ret = mkdirat(rootfs->dfd_mnt, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
87e0e273
CB
1071 if (ret < 0 && errno != EEXIST) {
1072 SYSERROR("Failed to create \"/dev\" directory");
1073 ret = -errno;
1074 goto reset_umask;
bc6928ff 1075 }
87da4ec3 1076
de7f9f33 1077 if (can_use_mount_api()) {
635e7bac
CB
1078 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1079 if (fd_fs < 0)
1080 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for tmpfs");
ee8eeba8 1081
bfbfeedf
CB
1082 sprintf(mount_options, "%zu", tmpfs_size);
1083
1084 ret = fs_set_property(fd_fs, "mode", "0755");
1085 if (ret < 0)
1086 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1087
1088 ret = fs_set_property(fd_fs, "size", mount_options);
1089 if (ret < 0)
1090 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1091
1092 ret = fs_attach(fd_fs, rootfs->dfd_mnt, "dev", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
635e7bac
CB
1093 } else {
1094 __do_free char *fallback_path = NULL;
1095
1096 sprintf(mount_options, "size=%zu,mode=755", tmpfs_size);
1097 DEBUG("Using mount options: %s", mount_options);
1098
1099 if (path) {
1100 fallback_path = must_make_path(path, "/dev", NULL);
1101 ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path);
1102 } else {
1103 ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL);
1104 }
87e0e273 1105 }
bfbfeedf
CB
1106 if (ret < 0) {
1107 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1108 goto reset_umask;
1109 }
1110
87da4ec3 1111
7133b912 1112 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1113 * If not, then create it and exit if that fails...
1114 */
ea57e424 1115 ret = mkdirat(rootfs->dfd_mnt, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
87e0e273 1116 if (ret < 0 && errno != EEXIST) {
bfbfeedf 1117 SYSERROR("Failed to create directory \"dev/pts\"");
87e0e273
CB
1118 ret = -errno;
1119 goto reset_umask;
91c3830e
SH
1120 }
1121
87e0e273
CB
1122 ret = 0;
1123
1124reset_umask:
1125 (void)umask(cur_mask);
1126
7133b912 1127 INFO("Prepared \"/dev\"");
87e0e273 1128 return ret;
91c3830e
SH
1129}
1130
/* Description of one device node that is created below the container's /dev. */
struct lxc_device_node {
	/* File name of the node, relative to the /dev directory. */
	const char *name;
	/* File type and permission bits handed to mknodat(). */
	const mode_t mode;
	/* Device major number. */
	const int maj;
	/* Device minor number. */
	const int min;
};
1137
5e73416f 1138static const struct lxc_device_node lxc_devices[] = {
06749971 1139 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
5e73416f 1140 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
06749971
CB
1141 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1142 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
5e73416f
CB
1143 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1144 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
c6883f38
SH
1145};
1146
5067e4dd
CB
1147
/*
 * Strategy lxc_fill_autodev() uses to provide a device node. The numeric
 * order matters: every value >= LXC_DEVNODE_MKNOD means that mknodat() is
 * still attempted.
 */
enum {
	/* mknodat() is not permitted: bind-mount the host's device node. */
	LXC_DEVNODE_BIND    = 0,
	/* Initial state: create the node and probe whether it can be opened. */
	LXC_DEVNODE_MKNOD   = 1,
	/* Nodes can be created but not opened: bind-mount on top of them. */
	LXC_DEVNODE_PARTIAL = 2,
	/* Nodes can be created and opened: nothing else is needed. */
	LXC_DEVNODE_OPEN    = 3,
};
1154
887ae844 1155static int lxc_fill_autodev(struct lxc_rootfs *rootfs)
c6883f38 1156{
5e73416f 1157 int i, ret;
3a32201c 1158 mode_t cmask;
5067e4dd 1159 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1160
a5a08920 1161 if (rootfs->dfd_dev < 0)
81498328 1162 return log_info(0, "No /dev directory found, skipping setup");
d43d5191 1163
3999be0a
CB
1164 INFO("Populating \"/dev\"");
1165
3a32201c 1166 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f 1167 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
5e73416f 1168 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1169
5067e4dd 1170 if (use_mknod >= LXC_DEVNODE_MKNOD) {
a5a08920 1171 ret = mknodat(rootfs->dfd_dev, device->name, device->mode, makedev(device->maj, device->min));
5e73416f 1172 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
d43d5191 1173 DEBUG("Created device node \"%s\"", device->name);
5067e4dd 1174 } else if (ret < 0) {
55022530 1175 if (errno != EPERM)
d43d5191 1176 return log_error_errno(-1, errno, "Failed to create device node \"%s\"", device->name);
0bbf8572 1177
5067e4dd 1178 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1179 }
3999be0a 1180
5067e4dd
CB
1181 /* Device nodes are fully useable. */
1182 if (use_mknod == LXC_DEVNODE_OPEN)
1183 continue;
1184
1185 if (use_mknod == LXC_DEVNODE_MKNOD) {
d43d5191 1186 __do_close int fd = -EBADF;
5067e4dd
CB
1187 /* See
1188 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1189 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1190 */
a5a08920 1191 fd = open_at(rootfs->dfd_dev, device->name, PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
d43d5191 1192 if (fd >= 0) {
5067e4dd
CB
1193 /* Device nodes are fully useable. */
1194 use_mknod = LXC_DEVNODE_OPEN;
1195 continue;
1196 }
1197
d43d5191 1198 SYSTRACE("Failed to open \"%s\" device", device->name);
5067e4dd
CB
1199 /* Device nodes are only partially useable. */
1200 use_mknod = LXC_DEVNODE_PARTIAL;
1201 }
5e73416f
CB
1202 }
1203
5067e4dd
CB
1204 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1205 /* If we are dealing with partially functional device
1206 * nodes the prio mknod() call will have created the
1207 * device node so we can use it as a bind-mount target.
1208 */
a5a08920 1209 ret = mknodat(rootfs->dfd_dev, device->name, S_IFREG | 0000, 0);
55022530 1210 if (ret < 0 && errno != EEXIST)
d43d5191 1211 return log_error_errno(-1, errno, "Failed to create file \"%s\"", device->name);
5e73416f
CB
1212 }
1213
1214 /* Fallback to bind-mounting the device from the host. */
9bcde680
CB
1215 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "dev/%s", device->name);
1216 if (ret < 0)
b41ff502 1217 return ret_errno(EIO);
5e73416f 1218
de7f9f33 1219 if (can_use_mount_api()) {
887ae844 1220 ret = fd_bind_mount(rootfs->dfd_host, rootfs->buf,
635e7bac
CB
1221 PROTECT_OPATH_FILE,
1222 PROTECT_LOOKUP_BENEATH_XDEV,
1223 rootfs->dfd_dev, device->name,
1224 PROTECT_OPATH_FILE,
1225 PROTECT_LOOKUP_BENEATH, 0, false);
1226 } else {
927ea337
CB
1227 char path[PATH_MAX];
1228
9bcde680
CB
1229 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/dev/%s", device->name);
1230 if (ret < 0)
927ea337
CB
1231 return ret_errno(EIO);
1232
9bcde680
CB
1233 ret = strnprintf(path, sizeof(path), "%s/dev/%s", get_rootfs_mnt(rootfs), device->name);
1234 if (ret < 0)
927ea337
CB
1235 return log_error(-1, "Failed to create device path for %s", device->name);
1236
887ae844 1237 ret = safe_mount(rootfs->buf, path, 0, MS_BIND, NULL, get_rootfs_mnt(rootfs));
927ea337 1238 if (ret < 0)
887ae844 1239 return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" to \"%s\"", rootfs->buf, path);
927ea337 1240
887ae844 1241 DEBUG("Bind mounted host device node \"%s\" to \"%s\"", rootfs->buf, path);
927ea337 1242 continue;
d43d5191 1243 }
887ae844 1244 DEBUG("Bind mounted host device %d(%s) to %d(%s)", rootfs->dfd_host, rootfs->buf, rootfs->dfd_dev, device->name);
c6883f38 1245 }
5e73416f 1246 (void)umask(cmask);
c6883f38 1247
3999be0a 1248 INFO("Populated \"/dev\"");
c6883f38
SH
1249 return 0;
1250}
1251
8ce1abc2 1252static int lxc_mount_rootfs(struct lxc_conf *conf)
0ad19a3f 1253{
9aa76a17 1254 int ret;
10bc1861 1255 struct lxc_storage *bdev;
31f8b2fd 1256 struct lxc_rootfs *rootfs = &conf->rootfs;
cc28d0b0 1257
a0f379bf 1258 if (!rootfs->path) {
0fd73091 1259 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
55022530 1260 if (ret < 0)
9e61fb1f 1261 return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
0fd73091 1262
ea57e424
CB
1263 rootfs->dfd_mnt = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
1264 if (rootfs->dfd_mnt < 0)
31f8b2fd
CB
1265 return -errno;
1266
c69bd12f 1267 return 0;
a0f379bf 1268 }
0ad19a3f 1269
0fd73091 1270 ret = access(rootfs->mount, F_OK);
55022530
CB
1271 if (ret != 0)
1272 return log_error_errno(-1, errno, "Failed to access to \"%s\". Check it is present",
1273 rootfs->mount);
b1789442 1274
8a388ed4 1275 bdev = storage_init(conf);
55022530
CB
1276 if (!bdev)
1277 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1278 rootfs->path, rootfs->mount,
1279 rootfs->options ? rootfs->options : "(null)");
9aa76a17
CB
1280
1281 ret = bdev->ops->mount(bdev);
10bc1861 1282 storage_put(bdev);
55022530
CB
1283 if (ret < 0)
1284 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1285 rootfs->path, rootfs->mount,
1286 rootfs->options ? rootfs->options : "(null)");
0ad19a3f 1287
0fd73091 1288 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1289 rootfs->path, rootfs->mount,
1290 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1291
ea57e424
CB
1292 rootfs->dfd_mnt = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
1293 if (rootfs->dfd_mnt < 0)
31f8b2fd
CB
1294 return -errno;
1295
ac778708
DL
1296 return 0;
1297}
1298
59eac805 1299static int lxc_chroot(const struct lxc_rootfs *rootfs)
91e93c71 1300{
b8d88764 1301 __do_free char *nroot = NULL;
0fd73091 1302 int i, ret;
8ce1abc2 1303 char *root = rootfs->mount;
91e93c71 1304
74e7b662 1305 nroot = realpath(root, NULL);
55022530
CB
1306 if (!nroot)
1307 return log_error_errno(-1, errno, "Failed to resolve \"%s\"", root);
91e93c71 1308
0fd73091 1309 ret = chdir("/");
b8d88764 1310 if (ret < 0)
0fd73091 1311 return -1;
91e93c71 1312
0fd73091
CB
1313 /* We could use here MS_MOVE, but in userns this mount is locked and
1314 * can't be moved.
91e93c71 1315 */
8ce1abc2 1316 ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
55022530
CB
1317 if (ret < 0)
1318 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);
91e93c71 1319
0fd73091 1320 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
55022530
CB
1321 if (ret < 0)
1322 return log_error_errno(-1, errno, "Failed to remount \"/\"");
91e93c71 1323
aa899945 1324 /* The following code cleans up inherited mounts which are not required
0fd73091 1325 * for CT.
91e93c71
AV
1326 *
1327 * The mountinfo file shows not all mounts, if a few points have been
1328 * unmounted between read operations from the mountinfo. So we need to
1329 * read mountinfo a few times.
1330 *
7ded5fa7 1331 * This loop can be skipped if a container uses userns, because all
91e93c71
AV
1332 * inherited mounts are locked and we should live with all this trash.
1333 */
0fd73091 1334 for (;;) {
4fdd1f72 1335 __do_fclose FILE *f = NULL;
f3d38164
CB
1336 __do_free char *line = NULL;
1337 char *slider1, *slider2;
91e93c71 1338 int progress = 0;
f3d38164 1339 size_t len = 0;
91e93c71 1340
4110345b 1341 f = fopen("./proc/self/mountinfo", "re");
55022530
CB
1342 if (!f)
1343 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
0fd73091 1344
f3d38164
CB
1345 while (getline(&line, &len, f) > 0) {
1346 for (slider1 = line, i = 0; slider1 && i < 4; i++)
1347 slider1 = strchr(slider1 + 1, ' ');
0fd73091 1348
f3d38164 1349 if (!slider1)
91e93c71 1350 continue;
0fd73091 1351
f3d38164
CB
1352 slider2 = strchr(slider1 + 1, ' ');
1353 if (!slider2)
91e93c71
AV
1354 continue;
1355
f3d38164
CB
1356 *slider2 = '\0';
1357 *slider1 = '.';
91e93c71 1358
f3d38164 1359 if (strcmp(slider1 + 1, "/") == 0)
91e93c71 1360 continue;
0fd73091 1361
f3d38164 1362 if (strcmp(slider1 + 1, "/proc") == 0)
91e93c71
AV
1363 continue;
1364
f3d38164 1365 ret = umount2(slider1, MNT_DETACH);
0fd73091 1366 if (ret == 0)
91e93c71
AV
1367 progress++;
1368 }
0fd73091 1369
91e93c71
AV
1370 if (!progress)
1371 break;
1372 }
1373
7ded5fa7 1374 /* This also can be skipped if a container uses userns. */
0fd73091 1375 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1376
1377 /* It is weird, but chdir("..") moves us in a new root */
0fd73091 1378 ret = chdir("..");
55022530
CB
1379 if (ret < 0)
1380 return log_error_errno(-1, errno, "Failed to chdir(\"..\")");
91e93c71 1381
0fd73091 1382 ret = chroot(".");
55022530
CB
1383 if (ret < 0)
1384 return log_error_errno(-1, errno, "Failed to chroot(\".\")");
91e93c71
AV
1385
1386 return 0;
1387}
1388
8ce1abc2
CB
1389/* (The following explanation is copied verbatim from the kernel.)
1390 *
1391 * pivot_root Semantics:
1392 * Moves the root file system of the current process to the directory put_old,
1393 * makes new_root as the new root file system of the current process, and sets
1394 * root/cwd of all processes which had them on the current root to new_root.
1395 *
1396 * Restrictions:
1397 * The new_root and put_old must be directories, and must not be on the
1398 * same file system as the current process root. The put_old must be
1399 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1400 * pointed to by put_old must yield the same directory as new_root. No other
1401 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1402 *
1403 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1404 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1405 * in this situation.
1406 *
1407 * Notes:
1408 * - we don't move root/cwd if they are not at the root (reason: if something
1409 * cared enough to change them, it's probably wrong to force them elsewhere)
1410 * - it's okay to pick a root that isn't the root of a file system, e.g.
1411 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1412 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1413 * first.
1414 */
7f50ec8b 1415static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1416{
7f50ec8b 1417 __do_close int fd_oldroot = -EBADF;
b0d7aac4 1418 int ret;
0fd73091 1419
7f50ec8b
CB
1420 fd_oldroot = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
1421 if (fd_oldroot < 0)
55022530 1422 return log_error_errno(-1, errno, "Failed to open old root directory");
ac778708 1423
8ce1abc2 1424 /* change into new root fs */
ea57e424 1425 ret = fchdir(rootfs->dfd_mnt);
55022530 1426 if (ret < 0)
7f50ec8b 1427 return log_error_errno(-errno, errno, "Failed to change into new root directory \"%s\"", rootfs->mount);
39c7b795 1428
8ce1abc2
CB
1429 /* pivot_root into our new root fs */
1430 ret = pivot_root(".", ".");
55022530 1431 if (ret < 0)
7f50ec8b 1432 return log_error_errno(-errno, errno, "Failed to pivot into new root directory \"%s\"", rootfs->mount);
39c7b795 1433
8ce1abc2
CB
1434 /* At this point the old-root is mounted on top of our new-root. To
1435 * unmounted it we must not be chdir'd into it, so escape back to
1436 * old-root.
1437 */
7f50ec8b 1438 ret = fchdir(fd_oldroot);
55022530 1439 if (ret < 0)
7f50ec8b 1440 return log_error_errno(-errno, errno, "Failed to enter old root directory");
c69bd12f 1441
7f50ec8b
CB
1442 /*
1443 * Make fd_oldroot a depedent mount to make sure our umounts don't
1444 * propagate to the host.
8ce1abc2
CB
1445 */
1446 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
55022530 1447 if (ret < 0)
7f50ec8b 1448 return log_error_errno(-errno, errno, "Failed to recursively turn old root mount tree into dependent mount");
8ce1abc2
CB
1449
1450 ret = umount2(".", MNT_DETACH);
55022530 1451 if (ret < 0)
7f50ec8b 1452 return log_error_errno(-errno, errno, "Failed to detach old root directory");
8ce1abc2 1453
ea57e424 1454 ret = fchdir(rootfs->dfd_mnt);
55022530 1455 if (ret < 0)
7f50ec8b 1456 return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
8ce1abc2 1457
7f50ec8b 1458 TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
b0d7aac4 1459 return 0;
0ad19a3f 1460}
1461
8ce1abc2
CB
1462static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1463{
55022530
CB
1464 if (!rootfs->path)
1465 return log_debug(0, "Container does not have a rootfs");
8ce1abc2
CB
1466
1467 if (detect_ramfs_rootfs())
1468 return lxc_chroot(rootfs);
1469
7f50ec8b 1470 return lxc_pivot_root(rootfs);
0ad19a3f 1471}
1472
7581a82f 1473static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
8ce1abc2
CB
1474 unsigned id,
1475 enum idtype idtype)
f4900711
CB
1476{
1477 struct lxc_list *it;
1478 struct id_map *map;
1479 struct id_map *retmap = NULL;
1480
dcf0ffdf
CB
1481 /* Shortcut for container's root mappings. */
1482 if (id == 0) {
1483 if (idtype == ID_TYPE_UID)
1484 return conf->root_nsuid_map;
1485
1486 if (idtype == ID_TYPE_GID)
1487 return conf->root_nsgid_map;
1488 }
1489
f4900711
CB
1490 lxc_list_for_each(it, &conf->id_map) {
1491 map = it->elem;
1492 if (map->idtype != idtype)
1493 continue;
1494
1495 if (id >= map->nsid && id < map->nsid + map->range) {
1496 retmap = map;
1497 break;
1498 }
1499 }
1500
1501 return retmap;
1502}
1503
68f3899e
CB
1504int lxc_setup_devpts_parent(struct lxc_handler *handler)
1505{
1506 int ret;
1507
1508 if (handler->conf->pty_max <= 0)
1509 return 0;
1510
1511 ret = lxc_abstract_unix_recv_fds(handler->data_sock[1], &handler->conf->devpts_fd, 1,
1512 &handler->conf->devpts_fd, sizeof(handler->conf->devpts_fd));
1513 if (ret < 0)
1514 return log_error_errno(-1, errno, "Failed to receive devpts fd from child");
1515
1516 TRACE("Received devpts file descriptor %d from child", handler->conf->devpts_fd);
1517 return 0;
1518}
1519
1520static int lxc_setup_devpts_child(struct lxc_handler *handler)
3c26f34e 1521{
f797f05e 1522 __do_close int devpts_fd = -EBADF;
70761e5e 1523 int ret;
ce155c60 1524 char **opts;
9d28c4f9 1525 char devpts_mntopts[256];
ce155c60
CB
1526 char *mntopt_sets[5];
1527 char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
f797f05e 1528 struct lxc_conf *conf = handler->conf;
a26822c5 1529 struct lxc_rootfs *rootfs = &conf->rootfs;
f797f05e 1530 int sock = handler->data_sock[0];
77890c6d 1531
55022530
CB
1532 if (conf->pty_max <= 0)
1533 return log_debug(0, "No new devpts instance will be mounted since no pts devices are requested");
3c26f34e 1534
9bcde680 1535 ret = strnprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
e528c735 1536 default_devpts_mntopts, conf->pty_max);
9bcde680 1537 if (ret < 0)
9d28c4f9
CB
1538 return -1;
1539
29a7b484 1540 (void)umount2("/dev/pts", MNT_DETACH);
7e40254a 1541
70761e5e 1542 /* Create mountpoint for devpts instance. */
a5a08920 1543 ret = mkdirat(rootfs->dfd_dev, "pts", 0755);
55022530
CB
1544 if (ret < 0 && errno != EEXIST)
1545 return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
3c26f34e 1546
ce155c60
CB
1547 /* gid=5 && max= */
1548 mntopt_sets[0] = devpts_mntopts;
dfbd4730 1549
ce155c60 1550 /* !gid=5 && max= */
6333c915 1551 mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1552
1553 /* gid=5 && !max= */
1554 mntopt_sets[2] = default_devpts_mntopts;
1555
1556 /* !gid=5 && !max= */
6333c915 1557 mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1558
1559 /* end */
1560 mntopt_sets[4] = NULL;
1561
1562 for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
1563 /* mount new devpts instance */
1564 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
1565 if (ret == 0)
1566 break;
1567 }
1568
55022530
CB
1569 if (ret < 0)
1570 return log_error_errno(-1, errno, "Failed to mount new devpts instance");
ce155c60 1571 DEBUG("Mount new devpts instance with options \"%s\"", *opts);
70761e5e 1572
a5a08920 1573 devpts_fd = open_at(rootfs->dfd_dev, "pts", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
f797f05e 1574 if (devpts_fd < 0) {
fbfe5c82 1575 devpts_fd = -EBADF;
f797f05e 1576 TRACE("Failed to create detached devpts mount");
185b9ee9
CB
1577 ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int));
1578 } else {
1579 ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
f797f05e 1580 }
185b9ee9
CB
1581 if (ret < 0)
1582 return log_error_errno(-1, errno, "Failed to send devpts fd to parent");
1583
1584 TRACE("Sent devpts file descriptor %d to parent", devpts_fd);
f797f05e 1585
d5cb35d6 1586 /* Remove any pre-existing /dev/ptmx file. */
a5a08920 1587 ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
b29e05d6 1588 if (ret < 0) {
55022530
CB
1589 if (errno != ENOENT)
1590 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\" file");
b29e05d6 1591 } else {
0fd73091 1592 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1593 }
1594
d5cb35d6 1595 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
a5a08920 1596 ret = mknodat(rootfs->dfd_dev, "ptmx", S_IFREG | 0000, 0);
55022530
CB
1597 if (ret < 0 && errno != EEXIST)
1598 return log_error_errno(-1, errno, "Failed to create dummy \"/dev/ptmx\" file as bind mount target");
0fd73091 1599 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1600
d5cb35d6 1601 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1602 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
55022530
CB
1603 if (!ret)
1604 return log_debug(0, "Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1605 else
d5cb35d6 1606 /* Fallthrough and try to create a symlink. */
0fd73091 1607 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1608
1609 /* Remove the dummy /dev/ptmx file we created above. */
a5a08920 1610 ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
55022530
CB
1611 if (ret < 0)
1612 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1613
1614 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
a5a08920 1615 ret = symlinkat("/dev/pts/ptmx", rootfs->dfd_dev, "/dev/ptmx");
55022530
CB
1616 if (ret < 0)
1617 return log_error_errno(-1, errno, "Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1618
185b9ee9 1619 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1620 return 0;
1621}
1622
cccc74b5
DL
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 leaves the personality untouched. Without
 * HAVE_SYS_PERSONALITY_H the function does nothing. Returns 0 on success and
 * -1 when personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0)
			return log_error_errno(-1, errno, "Failed to set personality to \"0x%x\"", persona);

		INFO("Set personality to \"0x%x\"", persona);
	}
#endif

	return 0;
}
1640
efbfe93f
CB
1641static inline bool wants_console(const struct lxc_terminal *terminal)
1642{
1643 return !terminal->path || strcmp(terminal->path, "none");
1644}
1645
37c74fd1
CB
1646static int lxc_bind_mount_console(const struct lxc_terminal *console,
1647 int dfd_to, const char *path_to)
1648{
1649 __do_close int fd_pty = -EBADF;
1650
1651 if (is_empty_string(console->name))
1652 return ret_errno(EINVAL);
1653
1654 /*
1655 * When the pty fd stashed in console->pty has been retrieved via the
1656 * TIOCGPTPEER ioctl() to avoid dangerous path-based lookups when
1657 * allocating new pty devices we can't reopen it through openat2() or
1658 * created a detached mount through open_tree() from it. This means we
1659 * would need to mount using the path stased in console->name which is
1660 * unsafe. We could be mounting a device that isn't identical to the
1661 * one we've already safely opened and stashed in console->pty.
1662 * So, what we do is we open an O_PATH file descriptor for
1663 * console->name and verify that the opened fd and the fd we stashed in
1664 * console->pty refer to the same device. If they do we can go on and
1665 * created a detached mount based on the newly opened O_PATH file
1666 * descriptor and then safely mount.
1667 */
1668 fd_pty = open_at(-EBADF, console->name, PROTECT_OPATH_FILE,
1669 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
1670 if (fd_pty < 0)
1671 return log_error_errno(-errno, errno, "Failed to open \"%s\"", console->name);
1672
1673 if (!same_file_lax(console->pty, fd_pty))
1674 return log_error_errno(-EINVAL, EINVAL, "Console file descriptor changed");
1675
1676 /*
1677 * Note, there are intentionally no open or lookup restrictions since
1678 * we're operating directly on the fd.
1679 */
1680 return fd_bind_mount(fd_pty, "", 0, 0,
1681 dfd_to, path_to, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH,
1682 0, false);
1683}
1684
58b38111 1685static int lxc_setup_dev_console(struct lxc_rootfs *rootfs,
37c74fd1 1686 const struct lxc_terminal *console)
6e590161 1687{
882671aa 1688 int ret;
86530b0a 1689 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1690
cf68ffd9
CB
1691 /*
1692 * When we are asked to setup a console we remove any previous
8b1b1210
CB
1693 * /dev/console bind-mounts.
1694 */
a5a08920 1695 if (exists_file_at(rootfs->dfd_dev, "console")) {
9bcde680
CB
1696 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/dev/console", rootfs_path);
1697 if (ret < 0)
953db219
CB
1698 return -1;
1699
58b38111 1700 ret = lxc_unstack_mountpoint(rootfs->buf, false);
55022530 1701 if (ret < 0)
58b38111 1702 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", rootfs->buf);
55022530 1703 else
58b38111 1704 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, rootfs->buf);
8b1b1210
CB
1705 }
1706
cf68ffd9
CB
1707 /*
1708 * For unprivileged containers autodev or automounts will already have
8b1b1210
CB
1709 * taken care of creating /dev/console.
1710 */
a5a08920 1711 ret = mknodat(rootfs->dfd_dev, "console", S_IFREG | 0000, 0);
55022530
CB
1712 if (ret < 0 && errno != EEXIST)
1713 return log_error_errno(-errno, errno, "Failed to create console");
52e35957 1714
41808e20 1715 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
55022530
CB
1716 if (ret < 0)
1717 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
13954cce 1718
de7f9f33 1719 if (can_use_mount_api()) {
37c74fd1
CB
1720 ret = lxc_bind_mount_console(console, rootfs->dfd_dev, "console");
1721 } else {
1722 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/dev/console", rootfs_path);
1723 if (ret < 0)
1724 return ret;
953db219 1725
37c74fd1 1726 ret = safe_mount(console->name, rootfs->buf, "none", MS_BIND, NULL, rootfs_path);
953db219 1727 }
37c74fd1
CB
1728 if (ret < 0)
1729 return log_error_errno(ret, errno, "Failed to mount %d(%s) on \"%s\"", console->pty, console->name, rootfs->buf);
6e590161 1730
37c74fd1 1731 DEBUG("Mounted pty device %d(%s) onto \"%s\"", console->pty, console->name, rootfs->buf);
7c6ef2a2
SH
1732 return 0;
1733}
1734
37c74fd1 1735static int lxc_setup_ttydir_console(struct lxc_rootfs *rootfs,
dcad02f8 1736 const struct lxc_terminal *console,
37c74fd1 1737 char *ttydir)
7c6ef2a2 1738{
3b7e332f 1739 int ret;
6b5a54cd 1740 char path[PATH_MAX], lxcpath[PATH_MAX];
86530b0a 1741 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2
SH
1742
1743 /* create rootfs/dev/<ttydir> directory */
9bcde680
CB
1744 ret = strnprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
1745 if (ret < 0)
7c6ef2a2 1746 return -1;
3d7d929a 1747
7c6ef2a2 1748 ret = mkdir(path, 0755);
55022530
CB
1749 if (ret && errno != EEXIST)
1750 return log_error_errno(-errno, errno, "Failed to create \"%s\"", path);
4742cd9a 1751 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1752
9bcde680
CB
1753 ret = strnprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
1754 if (ret < 0)
3d7d929a
CB
1755 return -1;
1756
3b7e332f 1757 ret = mknod(lxcpath, S_IFREG | 0000, 0);
55022530
CB
1758 if (ret < 0 && errno != EEXIST)
1759 return log_error_errno(-errno, errno, "Failed to create \"%s\"", lxcpath);
7c6ef2a2 1760
9bcde680
CB
1761 ret = strnprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1762 if (ret < 0)
7c6ef2a2 1763 return -1;
2a12fefd 1764
3dc035f1 1765 if (file_exists(path)) {
a7ba3c7f 1766 ret = lxc_unstack_mountpoint(path, false);
55022530
CB
1767 if (ret < 0)
1768 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1769 else
86530b0a 1770 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
3dc035f1 1771 }
2a12fefd 1772
3b7e332f 1773 ret = mknod(path, S_IFREG | 0000, 0);
55022530
CB
1774 if (ret < 0 && errno != EEXIST)
1775 return log_error_errno(-errno, errno, "Failed to create console");
7c6ef2a2 1776
41808e20 1777 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
55022530
CB
1778 if (ret < 0)
1779 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
2a12fefd 1780
3dc035f1 1781 /* bind mount console->name to '/dev/<ttydir>/console' */
de7f9f33 1782 if (can_use_mount_api()) {
37c74fd1
CB
1783 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/console", ttydir);
1784 if (ret < 0)
1785 return ret;
efbfe93f 1786
37c74fd1
CB
1787 ret = lxc_bind_mount_console(console, rootfs->dfd_dev, rootfs->buf);
1788 } else {
1789 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
efbfe93f 1790 }
55022530 1791 if (ret < 0)
37c74fd1 1792 return log_error_errno(ret, errno, "Failed to mount %d(%s) on \"%s\"", console->pty, console->name, lxcpath);
86530b0a 1793 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1
L
1794
1795 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
de7f9f33 1796 if (can_use_mount_api()) {
37c74fd1
CB
1797 ret = fd_bind_mount(rootfs->dfd_dev, rootfs->buf,
1798 PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_XDEV,
1799 rootfs->dfd_dev, "console",
1800 PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH,
1801 0, false);
1802 } else {
1803 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1804 }
55022530
CB
1805 if (ret < 0)
1806 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
37c74fd1 1807 DEBUG("Mounted \"%s\" onto \"%s\"", lxcpath, path);
3dc035f1 1808
86530b0a 1809 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1810 return 0;
1811}
1812
58b38111 1813static int lxc_setup_console(struct lxc_rootfs *rootfs,
37c74fd1 1814 struct lxc_terminal *console, char *ttydir)
7c6ef2a2 1815{
37c74fd1 1816 int ret;
3d7d929a 1817
37c74fd1
CB
1818 if (!wants_console(console))
1819 return log_trace(0, "Skipping console setup");
7c6ef2a2 1820
37c74fd1
CB
1821 if (ttydir)
1822 ret = lxc_setup_ttydir_console(rootfs, console, ttydir);
1823 else
1824 ret = lxc_setup_dev_console(rootfs, console);
1825 close_prot_errno_disarm(console->pty);
1826 return ret;
7c6ef2a2
SH
1827}
1828
a08bfbe3 1829static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676 1830{
a08bfbe3 1831 ssize_t ret;
998ac676 1832
85c2de39
MB
1833 /* If '=' is contained in opt, the option must go into data. */
1834 if (!strchr(opt, '=')) {
a08bfbe3
CB
1835 /*
1836 * If opt is found in mount_opt, set or clear flags.
1837 * Otherwise append it to data.
1838 */
85c2de39 1839 size_t opt_len = strlen(opt);
a08bfbe3 1840 for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) {
85c2de39 1841 size_t mo_name_len = strlen(mo->name);
a08bfbe3 1842
85c2de39
MB
1843 if (opt_len == mo_name_len && strncmp(opt, mo->name, mo_name_len) == 0) {
1844 if (mo->clear)
1845 *flags &= ~mo->flag;
1846 else
1847 *flags |= mo->flag;
a08bfbe3 1848 return 0;
85c2de39 1849 }
998ac676
RT
1850 }
1851 }
1852
a08bfbe3
CB
1853 if (strlen(*data)) {
1854 ret = strlcat(*data, ",", size);
1855 if (ret < 0)
1856 return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
1857 }
1858
1859 ret = strlcat(*data, opt, size);
1860 if (ret < 0)
1861 return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
efed99a4 1862
a08bfbe3 1863 return 0;
998ac676
RT
1864}
1865
0fd73091 1866int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
998ac676 1867{
a08bfbe3
CB
1868 __do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
1869 char *mntopt_cur = NULL;
efed99a4 1870 size_t size;
998ac676 1871
a08bfbe3
CB
1872 if (*mntdata || *mntflags)
1873 return ret_errno(EINVAL);
911324ef
DL
1874
1875 if (!mntopts)
998ac676
RT
1876 return 0;
1877
a08bfbe3
CB
1878 mntopts_dup = strdup(mntopts);
1879 if (!mntopts_dup)
1880 return ret_errno(ENOMEM);
998ac676 1881
a08bfbe3
CB
1882 size = strlen(mntopts_dup) + 1;
1883 mntopts_new = zalloc(size);
1884 if (!mntopts_new)
1885 return ret_errno(ENOMEM);
998ac676 1886
a08bfbe3
CB
1887 lxc_iterate_parts(mntopt_cur, mntopts_dup, ",")
1888 if (parse_mntopt(mntopt_cur, mntflags, &mntopts_new, size) < 0)
1889 return ret_errno(EINVAL);
998ac676 1890
a08bfbe3
CB
1891 if (*mntopts_new)
1892 *mntdata = move_ptr(mntopts_new);
998ac676
RT
1893
1894 return 0;
1895}
1896
d840039e
YT
1897static void parse_propagationopt(char *opt, unsigned long *flags)
1898{
1899 struct mount_opt *mo;
1900
1901 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1902 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1903 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1904 continue;
1905
1906 if (mo->clear)
1907 *flags &= ~mo->flag;
1908 else
1909 *flags |= mo->flag;
1910
1911 return;
d840039e
YT
1912 }
1913}
1914
8ce1abc2 1915int parse_propagationopts(const char *mntopts, unsigned long *pflags)
d840039e 1916{
dfd2e059
CB
1917 __do_free char *s = NULL;
1918 char *p;
d840039e
YT
1919
1920 if (!mntopts)
1921 return 0;
1922
1923 s = strdup(mntopts);
55022530
CB
1924 if (!s)
1925 return log_error_errno(-ENOMEM, errno, "Failed to allocate memory");
d840039e 1926
0fd73091 1927 *pflags = 0L;
8db9d26f 1928 lxc_iterate_parts(p, s, ",")
d840039e 1929 parse_propagationopt(p, pflags);
0fd73091 1930
d840039e
YT
1931 return 0;
1932}
1933
6fd5e769
SH
/* Terminate @word at its first space or tab, if it contains one. */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1940
/*
 * Skip @nfields fields in @src and return a pointer to what follows.
 *
 * A field ends at a single space or tab; consecutive separators are not
 * collapsed. If @src runs out early, a pointer to its terminating NUL byte is
 * returned.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;

	for (int skipped = 0; skipped < nfields; skipped++) {
		/* Advance to the separator that ends the current field. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;

		/* Step over exactly one separator. */
		cur++;
	}

	return cur;
}
1959
911324ef
DL
1960static int mount_entry(const char *fsname, const char *target,
1961 const char *fstype, unsigned long mountflags,
d840039e
YT
1962 unsigned long pflags, const char *data, bool optional,
1963 bool dev, bool relative, const char *rootfs)
911324ef 1964{
0ac4b28a 1965 int ret;
6b5a54cd 1966 char srcbuf[PATH_MAX];
181437fd 1967 const char *srcpath = fsname;
614305f3 1968#ifdef HAVE_STATVFS
2938f7c8 1969 struct statvfs sb;
614305f3 1970#endif
2938f7c8 1971
181437fd 1972 if (relative) {
9bcde680
CB
1973 ret = strnprintf(srcbuf, sizeof(srcbuf), "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
1974 if (ret < 0)
55022530 1975 return log_error_errno(-1, errno, "source path is too long");
181437fd
YT
1976 srcpath = srcbuf;
1977 }
1978
1979 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
0ac4b28a
CB
1980 rootfs);
1981 if (ret < 0) {
55022530
CB
1982 if (optional)
1983 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
1984 srcpath ? srcpath : "(null)", target);
0ac4b28a 1985
55022530
CB
1986 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
1987 srcpath ? srcpath : "(null)", target);
911324ef
DL
1988 }
1989
1990 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
0ac4b28a 1991
55022530
CB
1992 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
1993 srcpath ? srcpath : "(none)", target ? target : "(none)");
0ac4b28a 1994
614305f3 1995#ifdef HAVE_STATVFS
181437fd 1996 if (srcpath && statvfs(srcpath, &sb) == 0) {
94bef7e4
TA
1997 unsigned long required_flags = 0;
1998
2938f7c8
SH
1999 if (sb.f_flag & MS_NOSUID)
2000 required_flags |= MS_NOSUID;
0ac4b28a 2001
ae7a770e 2002 if (sb.f_flag & MS_NODEV && !dev)
2938f7c8 2003 required_flags |= MS_NODEV;
0ac4b28a 2004
2938f7c8
SH
2005 if (sb.f_flag & MS_RDONLY)
2006 required_flags |= MS_RDONLY;
0ac4b28a 2007
2938f7c8
SH
2008 if (sb.f_flag & MS_NOEXEC)
2009 required_flags |= MS_NOEXEC;
0ac4b28a 2010
55022530
CB
2011 DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
2012 srcpath, sb.f_flag, required_flags);
0ac4b28a
CB
2013
2014 /* If this was a bind mount request, and required_flags
2938f7c8 2015 * does not have any flags which are not already in
0ac4b28a 2016 * mountflags, then skip the remount.
2938f7c8 2017 */
94bef7e4
TA
2018 if (!(mountflags & MS_REMOUNT) &&
2019 (!(required_flags & ~mountflags) && !(mountflags & MS_RDONLY))) {
15f3e22b
CB
2020 DEBUG("Mountflags already were %lu, skipping remount", mountflags);
2021 goto skipremount;
2938f7c8 2022 }
0ac4b28a 2023
2938f7c8 2024 mountflags |= required_flags;
6fd5e769 2025 }
614305f3 2026#endif
911324ef 2027
181437fd 2028 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
0ac4b28a 2029 if (ret < 0) {
55022530
CB
2030 if (optional)
2031 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
2032 srcpath ? srcpath : "(null)",
2033 target);
2034
2035 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
2036 srcpath ? srcpath : "(null)",
2037 target);
911324ef
DL
2038 }
2039 }
2040
a3ed9b81 2041#ifdef HAVE_STATVFS
2042skipremount:
2043#endif
d840039e
YT
2044 if (pflags) {
2045 ret = mount(NULL, target, NULL, pflags, NULL);
2046 if (ret < 0) {
55022530
CB
2047 if (optional)
2048 return log_info_errno(0, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
2049 else
2050 return log_error_errno(-1, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
d840039e
YT
2051 }
2052 DEBUG("Changed mount propagation for \"%s\"", target);
2053 }
2054
0103eb53 2055 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
181437fd 2056 srcpath ? srcpath : "(null)", target, fstype);
911324ef
DL
2057
2058 return 0;
2059}
2060
/*
 * Strip the LXC-specific options "create=dir", "create=file", "optional" and
 * "relative" from mntent->mnt_opts in place.
 *
 * For each of them the first occurrence is cut out up to and including the
 * following comma; if no comma follows, the string is truncated at the start
 * of the option.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const lxc_only_opts[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
		NULL
	};

	for (const char *const *opt = lxc_only_opts; *opt; opt++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, *opt);
		if (!start)
			continue;

		comma = strchr(start, ',');
		if (comma)
			/* Shift the remaining options, including the NUL byte. */
			memmove(start, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*start = '\0';
	}
}
2090
4d5b72a1 2091static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2092 const char *path,
2093 const struct lxc_rootfs *rootfs,
0fd73091 2094 const char *lxc_name, const char *lxc_path)
0ad19a3f 2095{
7a76eeaa 2096 __do_free char *p1 = NULL;
3b7e332f 2097 int ret;
7a76eeaa 2098 char *p2;
911324ef 2099
12e6ab5d 2100 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
749f98d9 2101 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2102 if (ret < 0)
2103 return -1;
2104 }
6e46cc0d 2105
34cfffb3 2106 if (hasmntopt(mntent, "create=dir")) {
749f98d9 2107 ret = mkdir_p(path, 0755);
55022530
CB
2108 if (ret < 0 && errno != EEXIST)
2109 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
34cfffb3
SG
2110 }
2111
0fd73091
CB
2112 if (!hasmntopt(mntent, "create=file"))
2113 return 0;
749f98d9 2114
0fd73091
CB
2115 ret = access(path, F_OK);
2116 if (ret == 0)
2117 return 0;
749f98d9 2118
0fd73091
CB
2119 p1 = strdup(path);
2120 if (!p1)
2121 return -1;
749f98d9 2122
0fd73091 2123 p2 = dirname(p1);
749f98d9 2124
0fd73091 2125 ret = mkdir_p(p2, 0755);
55022530
CB
2126 if (ret < 0 && errno != EEXIST)
2127 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
749f98d9 2128
3b7e332f
CB
2129 ret = mknod(path, S_IFREG | 0000, 0);
2130 if (ret < 0 && errno != EEXIST)
2131 return -errno;
0fd73091 2132
749f98d9 2133 return 0;
4d5b72a1
NC
2134}
2135
ec50007f
CB
2136/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2137 * without a rootfs. */
db4aba38 2138static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2139 const char *path,
2140 const struct lxc_rootfs *rootfs,
2141 const char *lxc_name,
2142 const char *lxc_path)
4d5b72a1 2143{
fd214f37 2144 __do_free char *mntdata = NULL;
a08bfbe3
CB
2145 unsigned long mntflags = 0, pflags = 0;
2146 char *rootfs_path = NULL;
d8b712bc 2147 int ret;
181437fd 2148 bool dev, optional, relative;
d8b712bc
CB
2149
2150 optional = hasmntopt(mntent, "optional") != NULL;
2151 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2152 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2153
ec50007f
CB
2154 if (rootfs && rootfs->path)
2155 rootfs_path = rootfs->mount;
2156
d8b712bc
CB
2157 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2158 lxc_path);
2159 if (ret < 0) {
2160 if (optional)
2161 return 0;
608e3567 2162
d8b712bc
CB
2163 return -1;
2164 }
4e4ca161
SH
2165 cull_mntent_opt(mntent);
2166
d840039e
YT
2167 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2168 if (ret < 0)
2169 return -1;
2170
d8b712bc
CB
2171 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2172 if (ret < 0)
a08bfbe3 2173 return ret;
a17b1e65 2174
6e46cc0d 2175 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2176 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2177
911324ef
DL
2178 return ret;
2179}
2180
8183f09e
CB
2181static inline int mount_entry_on_systemfs(struct lxc_rootfs *rootfs,
2182 struct mntent *mntent)
db4aba38 2183{
1433c9f9
CB
2184 int ret;
2185
2186 /* For containers created without a rootfs all mounts are treated as
07667a6a
CB
2187 * absolute paths starting at / on the host.
2188 */
1433c9f9 2189 if (mntent->mnt_dir[0] != '/')
9bcde680 2190 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/%s", mntent->mnt_dir);
1433c9f9 2191 else
9bcde680
CB
2192 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s", mntent->mnt_dir);
2193 if (ret < 0)
1433c9f9 2194 return -1;
1433c9f9 2195
8183f09e 2196 return mount_entry_on_generic(mntent, rootfs->buf, NULL, NULL, NULL);
db4aba38
NC
2197}
2198
4e4ca161 2199static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
9c0fd29a 2200 struct lxc_rootfs *rootfs,
0a2dddd4
CB
2201 const char *lxc_name,
2202 const char *lxc_path)
911324ef 2203{
bdd2b34c 2204 int offset;
013bd428 2205 char *aux;
67e571de 2206 const char *lxcpath;
bdd2b34c 2207 int ret = 0;
0ad19a3f 2208
593e8478 2209 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2210 if (!lxcpath)
2a59a681 2211 return -1;
2a59a681 2212
bdd2b34c
CB
2213 /* If rootfs->path is a blockdev path, allow container fstab to use
2214 * <lxcpath>/<name>/rootfs" as the target prefix.
2215 */
9bcde680
CB
2216 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/%s/rootfs", lxcpath, lxc_name);
2217 if (ret < 0)
80a881b2
SH
2218 goto skipvarlib;
2219
9c0fd29a 2220 aux = strstr(mntent->mnt_dir, rootfs->buf);
80a881b2 2221 if (aux) {
9c0fd29a 2222 offset = strlen(rootfs->buf);
80a881b2
SH
2223 goto skipabs;
2224 }
2225
2226skipvarlib:
013bd428 2227 aux = strstr(mntent->mnt_dir, rootfs->path);
55022530
CB
2228 if (!aux)
2229 return log_warn(ret, "Ignoring mount point \"%s\"", mntent->mnt_dir);
80a881b2
SH
2230 offset = strlen(rootfs->path);
2231
2232skipabs:
9bcde680
CB
2233 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/%s", rootfs->mount, aux + offset);
2234 if (ret < 0)
a17b1e65 2235 return -1;
a17b1e65 2236
9c0fd29a 2237 return mount_entry_on_generic(mntent, rootfs->buf, rootfs, lxc_name, lxc_path);
911324ef 2238}
d330fe7b 2239
4e4ca161 2240static int mount_entry_on_relative_rootfs(struct mntent *mntent,
4806d3b9 2241 struct lxc_rootfs *rootfs,
0a2dddd4
CB
2242 const char *lxc_name,
2243 const char *lxc_path)
911324ef 2244{
911324ef 2245 int ret;
d330fe7b 2246
34cfffb3 2247 /* relative to root mount point */
9bcde680
CB
2248 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/%s", rootfs->mount, mntent->mnt_dir);
2249 if (ret < 0)
9ba8130c 2250 return -1;
911324ef 2251
4806d3b9 2252 return mount_entry_on_generic(mntent, rootfs->buf, rootfs, lxc_name, lxc_path);
911324ef
DL
2253}
2254
8183f09e 2255static int mount_file_entries(struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2256 const char *lxc_name, const char *lxc_path)
911324ef 2257{
9d03d857 2258 char buf[PATH_MAX];
0fd73091 2259 struct mntent mntent;
e76b8764 2260
aaf901be 2261 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
9d03d857
CB
2262 int ret;
2263
1ae3c19f 2264 if (!rootfs->path)
8183f09e 2265 ret = mount_entry_on_systemfs(rootfs, &mntent);
1ae3c19f
CB
2266 else if (mntent.mnt_dir[0] != '/')
2267 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2268 lxc_name, lxc_path);
2269 else
2270 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
9d03d857 2271 lxc_name, lxc_path);
1ae3c19f
CB
2272 if (ret < 0)
2273 return -1;
0ad19a3f 2274 }
cd54d859 2275
55022530
CB
2276 if (!feof(file) || ferror(file))
2277 return log_error(-1, "Failed to parse mount entries");
9d03d857
CB
2278
2279 return 0;
e7938e9e
MN
2280}
2281
55022530
CB
/* Cleanup handler: close the mntent stream *@f unless it is NULL. */
static inline void __auto_endmntent__(FILE **f)
{
	if (!*f)
		return;

	endmntent(*f);
}

/* Variable attribute that runs __auto_endmntent__ when the variable goes out of scope. */
#define __do_endmntent __attribute__((__cleanup__(__auto_endmntent__)))
2289
48e5dcc8 2290static int setup_mount_fstab(struct lxc_rootfs *rootfs, const char *fstab,
8183f09e 2291 const char *lxc_name, const char *lxc_path)
e7938e9e 2292{
55022530 2293 __do_endmntent FILE *f = NULL;
e7938e9e
MN
2294 int ret;
2295
2296 if (!fstab)
2297 return 0;
2298
55022530
CB
2299 f = setmntent(fstab, "re");
2300 if (!f)
2301 return log_error_errno(-1, errno, "Failed to open \"%s\"", fstab);
e7938e9e 2302
a7c6e830 2303 ret = mount_file_entries(rootfs, f, lxc_name, lxc_path);
42dff448
CB
2304 if (ret < 0)
2305 ERROR("Failed to set up mount entries");
e7938e9e 2306
0ad19a3f 2307 return ret;
2308}
2309
1800f924
WB
2310/*
2311 * In order for nested containers to be able to mount /proc and /sys they need
2312 * to see a "pure" proc and sysfs mount points with nothing mounted on top
2313 * (like lxcfs).
2314 * For this we provide proc and sysfs in /dev/.lxc/{proc,sys} while using an
2315 * apparmor rule to deny access to them. This is mostly for convenience: The
2316 * container's root user can mount them anyway and thus has access to the two
2317 * file systems. But a non-root user in the container should not be allowed to
2318 * access them as a side effect without explicitly allowing it.
2319 */
2320static const char nesting_helpers[] =
dc691e34
CB
2321"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
2322"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
1800f924
WB
2323
2324FILE *make_anonymous_mount_file(struct lxc_list *mount,
2325 bool include_nesting_helpers)
e7938e9e 2326{
f62cf1d4 2327 __do_close int fd = -EBADF;
4110345b 2328 FILE *f;
5ef5c9a3 2329 int ret;
e7938e9e 2330 char *mount_entry;
5ef5c9a3 2331 struct lxc_list *iterator;
5ef5c9a3 2332
0fd73091 2333 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2334 if (fd < 0) {
a324e7eb
CB
2335 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2336
5ef5c9a3
CB
2337 if (errno != ENOSYS)
2338 return NULL;
a324e7eb
CB
2339
2340 fd = lxc_make_tmpfile(template, true);
55022530
CB
2341 if (fd < 0)
2342 return log_error_errno(NULL, errno, "Could not create temporary mount file");
0fd73091 2343
6bd04140 2344 TRACE("Created temporary mount file");
5ef5c9a3 2345 }
e7938e9e 2346
0fd73091
CB
2347 lxc_list_for_each (iterator, mount) {
2348 size_t len;
2349
e7938e9e 2350 mount_entry = iterator->elem;
0fd73091 2351 len = strlen(mount_entry);
5ef5c9a3 2352
489f39be 2353 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091 2354 if (ret != len)
79bcf5ee 2355 return NULL;
0fd73091 2356
489f39be 2357 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091 2358 if (ret != 1)
79bcf5ee 2359 return NULL;
e7938e9e
MN
2360 }
2361
1800f924
WB
2362 if (include_nesting_helpers) {
2363 ret = lxc_write_nointr(fd, nesting_helpers,
6333c915
CB
2364 STRARRAYLEN(nesting_helpers));
2365 if (ret != STRARRAYLEN(nesting_helpers))
79bcf5ee 2366 return NULL;
1800f924
WB
2367 }
2368
0fd73091
CB
2369 ret = lseek(fd, 0, SEEK_SET);
2370 if (ret < 0)
79bcf5ee 2371 return NULL;
0fd73091 2372
4110345b
CB
2373 f = fdopen(fd, "re+");
2374 if (f)
2375 move_fd(fd); /* Transfer ownership of fd. */
2376 return f;
9fc7f8c0
TA
2377}
2378
06749971 2379static int setup_mount_entries(const struct lxc_conf *conf,
48e5dcc8
CB
2380 struct lxc_rootfs *rootfs, struct lxc_list *mount,
2381 const char *lxc_name, const char *lxc_path)
9fc7f8c0 2382{
c85ced65 2383 __do_fclose FILE *f = NULL;
9fc7f8c0 2384
1800f924 2385 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
19b5d755 2386 if (!f)
9fc7f8c0 2387 return -1;
e7938e9e 2388
a7c6e830 2389 return mount_file_entries(rootfs, f, lxc_name, lxc_path);
e7938e9e
MN
2390}
2391
bab88e68
CS
2392static int parse_cap(const char *cap)
2393{
84760c11 2394 size_t i;
2395 int capid = -1;
0fd73091
CB
2396 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2397 char *ptr = NULL;
bab88e68 2398
0fd73091 2399 if (strcmp(cap, "none") == 0)
7035407c
DE
2400 return -2;
2401
8560cd36 2402 for (i = 0; i < end; i++) {
bab88e68
CS
2403 if (strcmp(cap, caps_opt[i].name))
2404 continue;
2405
2406 capid = caps_opt[i].value;
2407 break;
2408 }
2409
2410 if (capid < 0) {
0fd73091
CB
2411 /* Try to see if it's numeric, so the user may specify
2412 * capabilities that the running kernel knows about but we
2413 * don't
2414 */
bab88e68
CS
2415 errno = 0;
2416 capid = strtol(cap, &ptr, 10);
2417 if (!ptr || *ptr != '\0' || errno != 0)
2418 /* not a valid number */
2419 capid = -1;
2420 else if (capid > lxc_caps_last_cap())
2421 /* we have a number but it's not a valid
2422 * capability */
2423 capid = -1;
2424 }
2425
2426 return capid;
2427}
2428
0769b82a
CS
2429int in_caplist(int cap, struct lxc_list *caps)
2430{
0769b82a 2431 int capid;
0fd73091 2432 struct lxc_list *iterator;
0769b82a 2433
0fd73091 2434 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2435 capid = parse_cap(iterator->elem);
2436 if (capid == cap)
2437 return 1;
2438 }
2439
2440 return 0;
2441}
2442
81810dd1
DL
2443static int setup_caps(struct lxc_list *caps)
2444{
bab88e68 2445 int capid;
0fd73091
CB
2446 char *drop_entry;
2447 struct lxc_list *iterator;
81810dd1 2448
0fd73091
CB
2449 lxc_list_for_each (iterator, caps) {
2450 int ret;
81810dd1
DL
2451
2452 drop_entry = iterator->elem;
2453
bab88e68 2454 capid = parse_cap(drop_entry);
55022530
CB
2455 if (capid < 0)
2456 return log_error(-1, "unknown capability %s", drop_entry);
81810dd1 2457
b81689a1
CB
2458 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2459 prctl_arg(0), prctl_arg(0));
55022530
CB
2460 if (ret < 0)
2461 return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
0fd73091 2462 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2463 }
2464
0fd73091 2465 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2466 return 0;
2467}
2468
2469static int dropcaps_except(struct lxc_list *caps)
2470{
2f443e88 2471 __do_free int *caplist = NULL;
0fd73091 2472 int i, capid, numcaps;
1fb86a7c 2473 char *keep_entry;
0fd73091 2474 struct lxc_list *iterator;
1fb86a7c 2475
0fd73091 2476 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2477 if (numcaps <= 0 || numcaps > 200)
2478 return -1;
0fd73091 2479 TRACE("Found %d capabilities", numcaps);
2caf9a97 2480
1a0e70ac 2481 /* caplist[i] is 1 if we keep capability i */
2f443e88 2482 caplist = must_realloc(NULL, numcaps * sizeof(int));
1fb86a7c
SH
2483 memset(caplist, 0, numcaps * sizeof(int));
2484
0fd73091 2485 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2486 keep_entry = iterator->elem;
2487
bab88e68 2488 capid = parse_cap(keep_entry);
7035407c
DE
2489 if (capid == -2)
2490 continue;
2491
55022530
CB
2492 if (capid < 0)
2493 return log_error(-1, "Unknown capability %s", keep_entry);
1fb86a7c 2494
0fd73091 2495 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2496 caplist[capid] = 1;
2497 }
0fd73091
CB
2498
2499 for (i = 0; i < numcaps; i++) {
2500 int ret;
2501
1fb86a7c
SH
2502 if (caplist[i])
2503 continue;
0fd73091 2504
b81689a1
CB
2505 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2506 prctl_arg(0), prctl_arg(0));
55022530
CB
2507 if (ret < 0)
2508 return log_error_errno(-1, errno, "Failed to remove capability %d", i);
1fb86a7c
SH
2509 }
2510
0fd73091 2511 DEBUG("Capabilities have been setup");
81810dd1
DL
2512 return 0;
2513}
2514
0fd73091
CB
2515static int parse_resource(const char *res)
2516{
2517 int ret;
c6d09e15
WB
2518 size_t i;
2519 int resid = -1;
2520
0fd73091 2521 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2522 if (strcmp(res, limit_opt[i].name) == 0)
2523 return limit_opt[i].value;
c6d09e15 2524
0fd73091 2525 /* Try to see if it's numeric, so the user may specify
c6d09e15 2526 * resources that the running kernel knows about but
0fd73091
CB
2527 * we don't.
2528 */
2529 ret = lxc_safe_int(res, &resid);
2530 if (ret < 0)
2531 return -1;
2532
2533 return resid;
c6d09e15
WB
2534}
2535
0fd73091
CB
2536int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2537{
2538 int resid;
c6d09e15
WB
2539 struct lxc_list *it;
2540 struct lxc_limit *lim;
c6d09e15 2541
0fd73091 2542 lxc_list_for_each (it, limits) {
c6d09e15
WB
2543 lim = it->elem;
2544
2545 resid = parse_resource(lim->resource);
55022530
CB
2546 if (resid < 0)
2547 return log_error(-1, "Unknown resource %s", lim->resource);
c6d09e15 2548
f48b5fd8 2549#if HAVE_PRLIMIT || HAVE_PRLIMIT64
55022530
CB
2550 if (prlimit(pid, resid, &lim->limit, NULL) != 0)
2551 return log_error_errno(-1, errno, "Failed to set limit %s", lim->resource);
2de12765
CB
2552
2553 TRACE("Setup \"%s\" limit", lim->resource);
f48b5fd8 2554#else
55022530 2555 return log_error(-1, "Cannot set limit \"%s\" as prlimit is missing", lim->resource);
f48b5fd8 2556#endif
c6d09e15 2557 }
0fd73091 2558
c6d09e15
WB
2559 return 0;
2560}
2561
7edd0540
L
2562int setup_sysctl_parameters(struct lxc_list *sysctls)
2563{
e6f76452 2564 __do_free char *tmp = NULL;
7edd0540
L
2565 struct lxc_list *it;
2566 struct lxc_sysctl *elem;
0fd73091 2567 int ret = 0;
6b5a54cd 2568 char filename[PATH_MAX] = {0};
7edd0540 2569
0fd73091 2570 lxc_list_for_each (it, sysctls) {
7edd0540
L
2571 elem = it->elem;
2572 tmp = lxc_string_replace(".", "/", elem->key);
55022530
CB
2573 if (!tmp)
2574 return log_error(-1, "Failed to replace key %s", elem->key);
7edd0540 2575
9bcde680
CB
2576 ret = strnprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2577 if (ret < 0)
55022530 2578 return log_error(-1, "Error setting up sysctl parameters path");
7edd0540 2579
0fd73091 2580 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2581 strlen(elem->value), false, 0666);
55022530
CB
2582 if (ret < 0)
2583 return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
2584 elem->key, elem->value);
7edd0540 2585 }
0fd73091 2586
7edd0540
L
2587 return 0;
2588}
2589
61d7a733
YT
2590int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2591{
0c669152 2592 __do_free char *tmp = NULL;
61d7a733
YT
2593 struct lxc_list *it;
2594 struct lxc_proc *elem;
0fd73091 2595 int ret = 0;
6b5a54cd 2596 char filename[PATH_MAX] = {0};
61d7a733 2597
0fd73091 2598 lxc_list_for_each (it, procs) {
61d7a733
YT
2599 elem = it->elem;
2600 tmp = lxc_string_replace(".", "/", elem->filename);
55022530
CB
2601 if (!tmp)
2602 return log_error(-1, "Failed to replace key %s", elem->filename);
61d7a733 2603
9bcde680
CB
2604 ret = strnprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2605 if (ret < 0)
55022530 2606 return log_error(-1, "Error setting up proc filesystem path");
61d7a733 2607
0fd73091 2608 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2609 strlen(elem->value), false, 0666);
55022530
CB
2610 if (ret < 0)
2611 return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s", elem->filename, elem->value);
61d7a733 2612 }
0fd73091 2613
61d7a733
YT
2614 return 0;
2615}
2616
ae9242c8
SH
2617static char *default_rootfs_mount = LXCROOTFSMOUNT;
2618
7b379ab3 2619struct lxc_conf *lxc_conf_init(void)
089cd8b8 2620{
26ddeedd 2621 int i;
0fd73091 2622 struct lxc_conf *new;
7b379ab3 2623
13277ec4 2624 new = malloc(sizeof(*new));
0fd73091 2625 if (!new)
7b379ab3 2626 return NULL;
7b379ab3
MN
2627 memset(new, 0, sizeof(*new));
2628
4b73005c 2629 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2630 new->personality = -1;
124fa0a8 2631 new->autodev = 1;
3a784510 2632 new->console.buffer_size = 0;
596a818d
DE
2633 new->console.log_path = NULL;
2634 new->console.log_fd = -1;
861813e5 2635 new->console.log_size = 0;
28a4b0e5 2636 new->console.path = NULL;
63376d7d 2637 new->console.peer = -1;
fb87aa6a 2638 new->console.proxy.busy = -1;
36a94ce8 2639 new->console.proxy.ptx = -1;
41808e20 2640 new->console.proxy.pty = -1;
36a94ce8 2641 new->console.ptx = -1;
41808e20 2642 new->console.pty = -1;
63376d7d 2643 new->console.name[0] = '\0';
732375f5 2644 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2645 new->maincmd_fd = -1;
258f8051 2646 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2647 new->nbd_idx = -1;
54c30e29 2648 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2649 if (!new->rootfs.mount) {
53f3f048
SH
2650 free(new);
2651 return NULL;
2652 }
6e54330c 2653 new->rootfs.managed = true;
ea57e424 2654 new->rootfs.dfd_mnt = -EBADF;
a5a08920 2655 new->rootfs.dfd_dev = -EBADF;
ea11a215 2656 new->rootfs.dfd_host = -EBADF;
79ff643d 2657 new->rootfs.fd_path_pin = -EBADF;
858377e4 2658 new->logfd = -1;
7b379ab3 2659 lxc_list_init(&new->cgroup);
54860ed0 2660 lxc_list_init(&new->cgroup2);
4bfb655e 2661 lxc_list_init(&new->devices);
7b379ab3
MN
2662 lxc_list_init(&new->network);
2663 lxc_list_init(&new->mount_list);
81810dd1 2664 lxc_list_init(&new->caps);
1fb86a7c 2665 lxc_list_init(&new->keepcaps);
f6d3e3e4 2666 lxc_list_init(&new->id_map);
46ad64ab
CB
2667 new->root_nsuid_map = NULL;
2668 new->root_nsgid_map = NULL;
f979ac15 2669 lxc_list_init(&new->includes);
4184c3e1 2670 lxc_list_init(&new->aliens);
7c661726 2671 lxc_list_init(&new->environment);
c6d09e15 2672 lxc_list_init(&new->limits);
7edd0540 2673 lxc_list_init(&new->sysctls);
61d7a733 2674 lxc_list_init(&new->procs);
44ae0fb6 2675 new->hooks_version = 0;
28d9e29e 2676 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2677 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2678 lxc_list_init(&new->groups);
d39b10eb 2679 lxc_list_init(&new->state_clients);
fe4de9a6 2680 new->lsm_aa_profile = NULL;
1800f924 2681 lxc_list_init(&new->lsm_aa_raw);
fe4de9a6 2682 new->lsm_se_context = NULL;
4fef78bc 2683 new->lsm_se_keyring_context = NULL;
8f818a84 2684 new->keyring_disable_session = false;
952b5031 2685 new->transient_procfs_mnt = false;
7a41e857
LT
2686 new->shmount.path_host = NULL;
2687 new->shmount.path_cont = NULL;
7b379ab3 2688
72bb04e4
PT
2689 /* if running in a new user namespace, init and COMMAND
2690 * default to running as UID/GID 0 when using lxc-execute */
2691 new->init_uid = 0;
2692 new->init_gid = 0;
c71f64cb 2693 memset(&new->init_groups, 0, sizeof(lxc_groups_t));
43654d34 2694 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2695 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
70fd7fc9 2696 memset(&new->timens, 0, sizeof(struct timens_offsets));
c3e3c21a 2697 seccomp_conf_init(new);
72bb04e4 2698
7b379ab3 2699 return new;
089cd8b8
DL
2700}
2701
344c9d81 2702int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2703 size_t buf_size)
f6d3e3e4 2704{
f62cf1d4 2705 __do_close int fd = -EBADF;
76bcd422 2706 int ret;
6b5a54cd 2707 char path[PATH_MAX];
f6d3e3e4 2708
a19b974f 2709 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
f62cf1d4 2710 __do_close int setgroups_fd = -EBADF;
a19b974f 2711
9bcde680
CB
2712 ret = strnprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
2713 if (ret < 0)
a19b974f 2714 return -E2BIG;
a19b974f 2715
76bcd422 2716 setgroups_fd = open(path, O_WRONLY);
55022530
CB
2717 if (setgroups_fd < 0 && errno != ENOENT)
2718 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
a19b974f 2719
76bcd422
CB
2720 if (setgroups_fd >= 0) {
2721 ret = lxc_write_nointr(setgroups_fd, "deny\n",
2722 STRLITERALLEN("deny\n"));
55022530
CB
2723 if (ret != STRLITERALLEN("deny\n"))
2724 return log_error_errno(-1, errno, "Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
395b1a3e 2725 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2726 }
a19b974f
CB
2727 }
2728
9bcde680 2729 ret = strnprintf(path, sizeof(path), "/proc/%d/%cid_map", pid,
29053180 2730 idtype == ID_TYPE_UID ? 'u' : 'g');
9bcde680 2731 if (ret < 0)
f6d3e3e4 2732 return -E2BIG;
29053180 2733
55022530
CB
2734 fd = open(path, O_WRONLY | O_CLOEXEC);
2735 if (fd < 0)
2736 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
29053180 2737
29053180 2738 ret = lxc_write_nointr(fd, buf, buf_size);
55022530
CB
2739 if (ret != buf_size)
2740 return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
2741 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2742
2743 return 0;
f6d3e3e4
SH
2744}
2745
6e50e704
CB
2746/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2747 *
2748 * @return 1 if functional binary was found
2749 * @return 0 if binary exists but is lacking privilege
2750 * @return -ENOENT if binary does not exist
2751 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2752 */
df6a2945
CB
2753static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2754{
48411df2 2755 __do_free char *path = NULL;
df6a2945
CB
2756 int ret;
2757 struct stat st;
df6a2945 2758
3275932b 2759 errno = EINVAL;
6e50e704 2760 if (cap != CAP_SETUID && cap != CAP_SETGID)
3275932b 2761 return -1;
6e50e704 2762
3275932b 2763 errno = ENOENT;
df6a2945
CB
2764 path = on_path(binary, NULL);
2765 if (!path)
3275932b 2766 return -1;
df6a2945
CB
2767
2768 ret = stat(path, &st);
3275932b
CB
2769 if (ret < 0)
2770 return -1;
df6a2945
CB
2771
2772 /* Check if the binary is setuid. */
55022530
CB
2773 if (st.st_mode & S_ISUID)
2774 return log_debug(1, "The binary \"%s\" does have the setuid bit set", path);
df6a2945 2775
0fd73091 2776#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2777 /* Check if it has the CAP_SETUID capability. */
2778 if ((cap & CAP_SETUID) &&
2779 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
55022530
CB
2780 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED))
2781 return log_debug(1, "The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
df6a2945
CB
2782
2783 /* Check if it has the CAP_SETGID capability. */
2784 if ((cap & CAP_SETGID) &&
2785 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
55022530
CB
2786 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED))
2787 return log_debug(1, "The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
0fd73091 2788#else
69924fff
CB
2789 /* If we cannot check for file capabilities we need to give the benefit
2790 * of the doubt. Otherwise we might fail even though all the necessary
2791 * file capabilities are set.
2792 */
55022530 2793 DEBUG("Cannot check for file capabilities as full capability support is missing. Manual intervention needed");
0fd73091 2794#endif
df6a2945 2795
3275932b 2796 return 1;
df6a2945
CB
2797}
2798
/*
 * Callback handed to run_command(): execute the command line in @args via
 * "/bin/sh -c". Returns -1 if execl() fails; otherwise it does not return.
 */
static int lxc_map_ids_exec_wrapper(void *args)
{
	char *command = args;

	execl("/bin/sh", "sh", "-c", command, (char *)NULL);

	/* Only reached when execl() failed. */
	return -1;
}
2804
f6d3e3e4
SH
2805int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2806{
0fd73091 2807 int fill, left;
986ef930 2808 char u_or_g;
4bc3b759 2809 char *pos;
6b5a54cd 2810 char cmd_output[PATH_MAX];
0fd73091
CB
2811 struct id_map *map;
2812 struct lxc_list *iterator;
2813 enum idtype type;
0fd73091 2814 int ret = 0, gidmap = 0, uidmap = 0;
c6ba8981
CB
2815 char mapbuf[STRLITERALLEN("new@idmap") + STRLITERALLEN(" ") +
2816 INTTYPE_TO_STRLEN(pid_t) + STRLITERALLEN(" ") +
2817 LXC_IDMAPLEN] = {0};
0fd73091 2818 bool had_entry = false, use_shadow = false;
c724025c
JC
2819 int hostuid, hostgid;
2820
2821 hostuid = geteuid();
2822 hostgid = getegid();
df6a2945
CB
2823
2824 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2825 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2826 * will protected it by preventing another user from being handed the
2827 * range by shadow.
2828 */
df6a2945 2829 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2830 if (uidmap == -ENOENT)
2831 WARN("newuidmap binary is missing");
2832 else if (!uidmap)
2833 WARN("newuidmap is lacking necessary privileges");
2834
df6a2945 2835 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2836 if (gidmap == -ENOENT)
2837 WARN("newgidmap binary is missing");
2838 else if (!gidmap)
2839 WARN("newgidmap is lacking necessary privileges");
2840
df6a2945 2841 if (uidmap > 0 && gidmap > 0) {
0fd73091 2842 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2843 use_shadow = true;
df6a2945 2844 } else {
99d43365
CB
2845 /* In case unprivileged users run application containers via
2846 * execute() or a start*() there are valid cases where they may
2847 * only want to map their own {g,u}id. Let's not block them from
2848 * doing so by requiring geteuid() == 0.
2849 */
2850 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2851 "write directly with euid %d", hostuid);
2852 }
2853
2854 /* Check if we really need to use newuidmap and newgidmap.
2855 * If the user is only remapping his own {g,u}id, we don't need it.
2856 */
2857 if (use_shadow && lxc_list_len(idmap) == 2) {
2858 use_shadow = false;
2859 lxc_list_for_each(iterator, idmap) {
2860 map = iterator->elem;
2861 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2862 map->nsid == hostuid && map->hostid == hostuid)
2863 continue;
2864 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2865 map->nsid == hostgid && map->hostid == hostgid)
2866 continue;
2867 use_shadow = true;
2868 break;
2869 }
0e6e3a41 2870 }
251d0d2a 2871
986ef930
CB
2872 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2873 type++, u_or_g = 'g') {
2874 pos = mapbuf;
2875
0e6e3a41 2876 if (use_shadow)
986ef930 2877 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2878
cf3ef16d 2879 lxc_list_for_each(iterator, idmap) {
251d0d2a 2880 map = iterator->elem;
cf3ef16d
SH
2881 if (map->idtype != type)
2882 continue;
2883
4bc3b759
CB
2884 had_entry = true;
2885
986ef930 2886 left = LXC_IDMAPLEN - (pos - mapbuf);
9bcde680 2887 fill = strnprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2888 use_shadow ? " " : "", map->nsid,
2889 map->hostid, map->range,
0e6e3a41 2890 use_shadow ? "" : "\n");
55022530
CB
2891 /*
2892 * The kernel only takes <= 4k for writes to
2893 * /proc/<pid>/{g,u}id_map
2894 */
9bcde680 2895 if (fill <= 0)
55022530 2896 return log_error_errno(-1, errno, "Too many %cid mappings defined", u_or_g);
4bc3b759 2897
cf3ef16d 2898 pos += fill;
251d0d2a 2899 }
cf3ef16d 2900 if (!had_entry)
4f7521b4 2901 continue;
cf3ef16d 2902
d85813cd 2903 /* Try to catch the output of new{g,u}idmap to make debugging
986ef930
CB
2904 * easier.
2905 */
2906 if (use_shadow) {
2907 ret = run_command(cmd_output, sizeof(cmd_output),
2908 lxc_map_ids_exec_wrapper,
2909 (void *)mapbuf);
55022530
CB
2910 if (ret < 0)
2911 return log_error(-1, "new%cidmap failed to write mapping \"%s\": %s", u_or_g, cmd_output, mapbuf);
54fbbeb5 2912 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2913 } else {
986ef930 2914 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
55022530
CB
2915 if (ret < 0)
2916 return log_error(-1, "Failed to write mapping: %s", mapbuf);
54fbbeb5 2917 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2918 }
986ef930
CB
2919
2920 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2921 }
251d0d2a 2922
986ef930 2923 return 0;
f6d3e3e4
SH
2924}
2925
234998b4
CB
2926/*
2927 * Return the host uid/gid to which the container root is mapped in val.
0b3a6504 2928 * Return true if id was found, false otherwise.
cf3ef16d 2929 */
234998b4 2930static id_t get_mapped_rootid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2931{
4160c3a0 2932 unsigned nsid;
0fd73091
CB
2933 struct id_map *map;
2934 struct lxc_list *it;
4160c3a0
CB
2935
2936 if (idtype == ID_TYPE_UID)
2937 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2938 else
2939 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 2940
0fd73091 2941 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2942 map = it->elem;
7b50c609 2943 if (map->idtype != idtype)
cf3ef16d 2944 continue;
4160c3a0 2945 if (map->nsid != nsid)
cf3ef16d 2946 continue;
234998b4 2947 return map->hostid;
cf3ef16d 2948 }
4160c3a0 2949
234998b4
CB
2950 if (idtype == ID_TYPE_UID)
2951 return LXC_INVALID_UID;
2952
2953 return LXC_INVALID_GID;
cf3ef16d
SH
2954}
2955
facdf925 2956int mapped_hostid(unsigned id, const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2957{
cf3ef16d 2958 struct id_map *map;
0fd73091
CB
2959 struct lxc_list *it;
2960
2961 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2962 map = it->elem;
2133f58c 2963 if (map->idtype != idtype)
cf3ef16d 2964 continue;
0fd73091 2965
cf3ef16d 2966 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2967 return (id - map->hostid) + map->nsid;
cf3ef16d 2968 }
0fd73091 2969
57d116ab 2970 return -1;
cf3ef16d
SH
2971}
2972
7581a82f 2973int find_unmapped_nsid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2974{
cf3ef16d 2975 struct id_map *map;
0fd73091 2976 struct lxc_list *it;
2133f58c 2977 unsigned int freeid = 0;
0fd73091 2978
cf3ef16d 2979again:
0fd73091 2980 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2981 map = it->elem;
2133f58c 2982 if (map->idtype != idtype)
cf3ef16d 2983 continue;
0fd73091 2984
cf3ef16d
SH
2985 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2986 freeid = map->nsid + map->range;
2987 goto again;
2988 }
2989 }
0fd73091 2990
cf3ef16d
SH
2991 return freeid;
2992}
2993
e1b9d6af
CB
2994/*
2995 * Mount a proc under @rootfs if proc self points to a pid other than
2996 * my own. This is needed to have a known-good proc mount for setting
2997 * up LSMs both at container startup and attach.
2998 *
e1b9d6af
CB
2999 * NOTE: not to be called from inside the container namespace!
3000 */
952b5031 3001static int lxc_transient_proc(struct lxc_rootfs *rootfs)
e1b9d6af 3002{
952b5031
CB
3003 __do_close int fd_proc = -EBADF;
3004 int link_to_pid, link_len, pid_self, ret;
3005 char link[INTTYPE_TO_STRLEN(pid_t) + 1];
e1b9d6af 3006
ea57e424 3007 link_len = readlinkat(rootfs->dfd_mnt, "proc/self", link, sizeof(link));
952b5031 3008 if (link_len < 0) {
ea57e424 3009 ret = mkdirat(rootfs->dfd_mnt, "proc", 0000);
952b5031 3010 if (ret < 0 && errno != EEXIST)
ea57e424 3011 return log_error_errno(-errno, errno, "Failed to create %d(proc)", rootfs->dfd_mnt);
e1b9d6af 3012
952b5031
CB
3013 goto domount;
3014 } else if (link_len >= sizeof(link)) {
3015 return log_error_errno(-EIO, EIO, "Truncated link target");
e1b9d6af 3016 }
952b5031 3017 link[link_len] = '\0';
e1b9d6af 3018
952b5031
CB
3019 pid_self = lxc_raw_getpid();
3020 INFO("Caller's PID is %d; /proc/self points to %s", pid_self, link);
e1b9d6af 3021
952b5031
CB
3022 ret = lxc_safe_int(link, &link_to_pid);
3023 if (ret)
3024 return log_error_errno(-ret, ret, "Failed to parse %s", link);
e1b9d6af 3025
952b5031
CB
3026 /* Correct procfs is already mounted. */
3027 if (link_to_pid == pid_self)
3028 return log_trace(0, "Correct procfs instance mounted");
e1b9d6af 3029
ea57e424 3030 fd_proc = open_at(rootfs->dfd_mnt, "proc", PROTECT_OPATH_DIRECTORY,
952b5031
CB
3031 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3032 if (fd_proc < 0)
3033 return log_error_errno(-errno, errno, "Failed to open transient procfs mountpoint");
e1b9d6af 3034
9bcde680
CB
3035 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "/proc/self/fd/%d", fd_proc);
3036 if (ret < 0)
952b5031 3037 return ret_errno(EIO);
e1b9d6af 3038
952b5031 3039 ret = umount2(rootfs->buf, MNT_DETACH);
e1b9d6af 3040 if (ret < 0)
952b5031 3041 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", rootfs->buf);
e1b9d6af
CB
3042
3043domount:
3044 /* rootfs is NULL */
952b5031
CB
3045 if (!rootfs->path) {
3046 ret = mount("proc", rootfs->buf, "proc", 0, NULL);
3047 } else {
ea57e424 3048 ret = safe_mount_beneath_at(rootfs->dfd_mnt, "none", "proc", "proc", 0, NULL);
952b5031 3049 if (ret < 0) {
9bcde680
CB
3050 ret = strnprintf(rootfs->buf, sizeof(rootfs->buf), "%s/proc", rootfs->path ? rootfs->mount : "");
3051 if (ret < 0)
952b5031
CB
3052 return ret_errno(EIO);
3053
3054 ret = safe_mount("proc", rootfs->buf, "proc", 0, NULL, rootfs->mount);
3055 }
3056 }
e1b9d6af 3057 if (ret < 0)
952b5031 3058 return log_error_errno(-1, errno, "Failed to mount temporary procfs");
e1b9d6af 3059
952b5031 3060 INFO("Created transient procfs mount");
e1b9d6af
CB
3061 return 1;
3062}
3063
943144d9 3064/* NOTE: Must not be called from inside the container namespace! */
59eac805 3065static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3066{
3067 int mounted;
3068
952b5031 3069 mounted = lxc_transient_proc(&conf->rootfs);
5112cd70 3070 if (mounted == -1) {
01958b1f 3071 /* continue only if there is no rootfs */
943144d9 3072 if (conf->rootfs.path)
952b5031 3073 return log_error_errno(-EPERM, EPERM, "Failed to create transient procfs mount");
5112cd70 3074 } else if (mounted == 1) {
952b5031 3075 conf->transient_procfs_mnt = true;
5112cd70 3076 }
943144d9 3077
5112cd70
SH
3078 return 0;
3079}
3080
3081void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3082{
952b5031
CB
3083 if (lxc_conf->transient_procfs_mnt) {
3084 (void)umount2("/proc", MNT_DETACH);
3085 lxc_conf->transient_procfs_mnt = false;
3086 }
5112cd70
SH
3087}
3088
9e61fb1f
CB
3089/* Walk /proc/mounts and change any shared entries to dependent mounts. */
3090void turn_into_dependent_mounts(void)
e995d7a2 3091{
7969675f 3092 __do_free char *line = NULL;
003be47b 3093 __do_fclose FILE *f = NULL;
f62cf1d4 3094 __do_close int memfd = -EBADF, mntinfo_fd = -EBADF;
e995d7a2 3095 size_t len = 0;
a39fc34b
CB
3096 ssize_t copied;
3097 int ret;
e995d7a2 3098
6a49f05e 3099 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
3100 if (mntinfo_fd < 0) {
3101 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 3102 return;
fea3b91d 3103 }
6a49f05e
CB
3104
3105 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3106 if (memfd < 0) {
3107 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3108
3109 if (errno != ENOSYS) {
fea3b91d 3110 SYSERROR("Failed to create temporary in-memory file");
6a49f05e
CB
3111 return;
3112 }
3113
3114 memfd = lxc_make_tmpfile(template, true);
fea3b91d 3115 if (memfd < 0) {
fea3b91d
DJ
3116 WARN("Failed to create temporary file");
3117 return;
3118 }
6a49f05e
CB
3119 }
3120
a39fc34b 3121 copied = fd_to_fd(mntinfo_fd, memfd);
6a49f05e 3122 if (copied < 0) {
fea3b91d 3123 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
3124 return;
3125 }
6a49f05e 3126
6a49f05e
CB
3127 ret = lseek(memfd, 0, SEEK_SET);
3128 if (ret < 0) {
fea3b91d 3129 SYSERROR("Failed to reset file descriptor offset");
6a49f05e
CB
3130 return;
3131 }
3132
4110345b 3133 f = fdopen(memfd, "re");
e995d7a2 3134 if (!f) {
003be47b 3135 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark all shared. Continuing");
e995d7a2
SH
3136 return;
3137 }
3138
003be47b
CB
3139 /*
3140 * After a successful fdopen() memfd will be closed when calling
3141 * fclose(f). Calling close(memfd) afterwards is undefined.
3142 */
3143 move_fd(memfd);
3144
e995d7a2 3145 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3146 char *opts, *target;
3147
e995d7a2
SH
3148 target = get_field(line, 4);
3149 if (!target)
3150 continue;
0fd73091 3151
e995d7a2
SH
3152 opts = get_field(target, 2);
3153 if (!opts)
3154 continue;
0fd73091 3155
e995d7a2
SH
3156 null_endofword(opts);
3157 if (!strstr(opts, "shared"))
3158 continue;
0fd73091 3159
e995d7a2 3160 null_endofword(target);
0fd73091
CB
3161 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3162 if (ret < 0) {
9e61fb1f 3163 SYSERROR("Failed to recursively turn old root mount tree into dependent mount. Continuing...");
6a49f05e 3164 continue;
e995d7a2 3165 }
9e61fb1f 3166 TRACE("Recursively turned old root mount tree into dependent mount");
e995d7a2 3167 }
9e61fb1f 3168 TRACE("Turned all mount table entries into dependent mount");
e995d7a2
SH
3169}
3170
794248d0 3171static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3172{
3173 int ret;
794248d0
CB
3174 char *p;
3175 char path[PATH_MAX], destpath[PATH_MAX];
3176 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3177
3178 /* If init exists in the container, don't bind mount a static one */
3179 p = choose_init(conf->rootfs.mount);
3180 if (p) {
22f835ba 3181 __do_free char *old = p;
41089848
TA
3182
3183 p = strdup(old + strlen(conf->rootfs.mount));
41089848
TA
3184 if (!p)
3185 return -ENOMEM;
3186
3187 INFO("Found existing init at \"%s\"", p);
3188 goto out;
9d9c111c 3189 }
2322903b 3190
9bcde680
CB
3191 ret = strnprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
3192 if (ret < 0)
8353b4c9 3193 return -1;
2322903b 3194
55022530
CB
3195 if (!file_exists(path))
3196 return log_error_errno(-1, errno, "The file \"%s\" does not exist on host", path);
2322903b 3197
9bcde680
CB
3198 ret = strnprintf(destpath, sizeof(path), "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
3199 if (ret < 0)
8353b4c9 3200 return -1;
2322903b
SH
3201
3202 if (!file_exists(destpath)) {
794248d0 3203 ret = mknod(destpath, S_IFREG | 0000, 0);
55022530
CB
3204 if (ret < 0 && errno != EEXIST)
3205 return log_error_errno(-1, errno, "Failed to create dummy \"%s\" file as bind mount target", destpath);
2322903b
SH
3206 }
3207
592fd47a 3208 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
55022530
CB
3209 if (ret < 0)
3210 return log_error_errno(-1, errno, "Failed to bind mount lxc.init.static into container");
8353b4c9 3211
794248d0
CB
3212 p = strdup(destpath + strlen(conf->rootfs.mount));
3213 if (!p)
3214 return -ENOMEM;
794248d0 3215
8353b4c9 3216 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3217out:
4b5b3a2a 3218 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3219 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3220 return 0;
2322903b
SH
3221}
3222
0fd73091
CB
3223/* This does the work of remounting / if it is shared, calling the container
3224 * pre-mount hooks, and mounting the rootfs.
35120d9c 3225 */
8ce1abc2
CB
3226int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3227 const char *lxcpath)
0ad19a3f 3228{
0fd73091
CB
3229 int ret;
3230
ea11a215
CB
3231 conf->rootfs.dfd_host = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
3232 if (conf->rootfs.dfd_host < 0)
a370f16b
CB
3233 return log_error_errno(-errno, errno, "Failed to open \"/\"");
3234
35120d9c 3235 if (conf->rootfs_setup) {
35120d9c 3236 const char *path = conf->rootfs.mount;
0fd73091
CB
3237
3238 /* The rootfs was set up in another namespace. bind-mount it to
3239 * give us a mount in our own ns so we can pivot_root to it
3240 */
3241 ret = mount(path, path, "rootfs", MS_BIND, NULL);
55022530
CB
3242 if (ret < 0)
3243 return log_error(-1, "Failed to bind mount container / onto itself");
0fd73091 3244
ea57e424
CB
3245 conf->rootfs.dfd_mnt = openat(-EBADF, path, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOCTTY);
3246 if (conf->rootfs.dfd_mnt < 0)
26ea5533
CB
3247 return log_error_errno(-errno, errno, "Failed to open file descriptor for container rootfs");
3248
55022530 3249 return log_trace(0, "Bind mounted container / onto itself");
35120d9c 3250 }
d4ef7c50 3251
9e61fb1f 3252 turn_into_dependent_mounts();
e995d7a2 3253
0fd73091 3254 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
55022530
CB
3255 if (ret < 0)
3256 return log_error(-1, "Failed to run pre-mount hooks");
35120d9c 3257
8ce1abc2 3258 ret = lxc_mount_rootfs(conf);
55022530
CB
3259 if (ret < 0)
3260 return log_error(-1, "Failed to setup rootfs for");
35120d9c
SH
3261
3262 conf->rootfs_setup = true;
3263 return 0;
3264}
3265
1c1c7051
SH
3266static bool verify_start_hooks(struct lxc_conf *conf)
3267{
6b5a54cd 3268 char path[PATH_MAX];
0fd73091
CB
3269 struct lxc_list *it;
3270
3271 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3272 int ret;
0fd73091 3273 char *hookname = it->elem;
1c1c7051 3274
9bcde680 3275 ret = strnprintf(path, sizeof(path), "%s%s",
0fd73091
CB
3276 conf->rootfs.path ? conf->rootfs.mount : "",
3277 hookname);
9bcde680 3278 if (ret < 0)
1c1c7051 3279 return false;
0fd73091 3280
75193660 3281 ret = access(path, X_OK);
55022530
CB
3282 if (ret < 0)
3283 return log_error_errno(false, errno, "Start hook \"%s\" not found in container", hookname);
0fd73091 3284
6a0c909a 3285 return true;
1c1c7051
SH
3286 }
3287
3288 return true;
3289}
3290
4b5b3a2a
TA
3291static bool execveat_supported(void)
3292{
f40988c7 3293 execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
4b5b3a2a
TA
3294 if (errno == ENOSYS)
3295 return false;
3296
3297 return true;
4b5b3a2a
TA
3298}
3299
20502652
CB
3300static int lxc_setup_boot_id(void)
3301{
3302 int ret;
3303 const char *boot_id_path = "/proc/sys/kernel/random/boot_id";
3304 const char *mock_boot_id_path = "/dev/.lxc-boot-id";
3305 lxc_id128_t n;
3306
3307 if (access(boot_id_path, F_OK))
3308 return 0;
3309
3310 memset(&n, 0, sizeof(n));
3311 if (lxc_id128_randomize(&n)) {
3312 SYSERROR("Failed to generate random data for uuid");
3313 return -1;
3314 }
3315
3316 ret = lxc_id128_write(mock_boot_id_path, n);
3317 if (ret < 0) {
3318 SYSERROR("Failed to write uuid to %s", mock_boot_id_path);
3319 return -1;
3320 }
3321
3322 ret = chmod(mock_boot_id_path, 0444);
3323 if (ret < 0) {
3324 SYSERROR("Failed to chown %s", mock_boot_id_path);
3325 (void)unlink(mock_boot_id_path);
3326 return -1;
3327 }
3328
3329 ret = mount(mock_boot_id_path, boot_id_path, NULL, MS_BIND, NULL);
3330 if (ret < 0) {
3331 SYSERROR("Failed to mount %s to %s", mock_boot_id_path,
3332 boot_id_path);
3333 (void)unlink(mock_boot_id_path);
3334 return -1;
3335 }
3336
3337 ret = mount(NULL, boot_id_path, NULL,
3338 (MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOSUID | MS_NOEXEC |
3339 MS_NODEV),
3340 NULL);
3341 if (ret < 0) {
3342 SYSERROR("Failed to remount %s read-only", boot_id_path);
3343 (void)unlink(mock_boot_id_path);
3344 return -1;
3345 }
3346
3347 return 0;
3348}
3349
af04d847 3350static int lxc_setup_keyring(struct lsm_ops *lsm_ops, const struct lxc_conf *conf)
d701d729
CB
3351{
3352 key_serial_t keyring;
3353 int ret = 0;
3354
3355 if (conf->lsm_se_keyring_context)
af04d847 3356 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_keyring_context);
d701d729 3357 else if (conf->lsm_se_context)
af04d847 3358 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_context);
d701d729
CB
3359 if (ret < 0)
3360 return log_error_errno(-1, errno, "Failed to set keyring context");
3361
3362 /*
3363 * Try to allocate a new session keyring for the container to prevent
3364 * information leaks.
3365 */
3366 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
3367 prctl_arg(0), prctl_arg(0), prctl_arg(0));
3368 if (keyring < 0) {
3369 switch (errno) {
3370 case ENOSYS:
3371 DEBUG("The keyctl() syscall is not supported or blocked");
3372 break;
3373 case EACCES:
3374 __fallthrough;
3375 case EPERM:
3376 DEBUG("Failed to access kernel keyring. Continuing...");
3377 break;
3378 default:
3379 SYSERROR("Failed to create kernel keyring");
3380 break;
3381 }
3382 }
3383
3384 return ret;
3385}
3386
3b988b33 3387int lxc_setup(struct lxc_handler *handler)
35120d9c 3388{
2187efd3 3389 int ret;
0fd73091 3390 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3391 struct lxc_conf *lxc_conf = handler->conf;
35120d9c 3392
8ce1abc2 3393 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
55022530
CB
3394 if (ret < 0)
3395 return log_error(-1, "Failed to setup rootfs");
35120d9c 3396
b87ee312 3397 if (handler->nsfd[LXC_NS_UTS] == -EBADF) {
8353b4c9 3398 ret = setup_utsname(lxc_conf->utsname);
55022530
CB
3399 if (ret < 0)
3400 return log_error(-1, "Failed to setup the utsname %s", name);
0ad19a3f 3401 }
3402
8f818a84 3403 if (!lxc_conf->keyring_disable_session) {
d701d729 3404 ret = lxc_setup_keyring(handler->lsm_ops, lxc_conf);
8f818a84 3405 if (ret < 0)
d701d729 3406 return log_error(-1, "Failed to setup container keyring");
8f818a84 3407 }
b25291da 3408
e389f2af
CB
3409 if (handler->ns_clone_flags & CLONE_NEWNET) {
3410 ret = lxc_setup_network_in_child_namespaces(lxc_conf,
3411 &lxc_conf->network);
55022530
CB
3412 if (ret < 0)
3413 return log_error(-1, "Failed to setup network");
0ad19a3f 3414
e389f2af 3415 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
55022530
CB
3416 if (ret < 0)
3417 return log_error(-1, "Failed to send network device names and ifindices to parent");
790255cf
CB
3418 }
3419
bc6928ff 3420 if (lxc_conf->autodev > 0) {
63012bdd 3421 ret = mount_autodev(name, &lxc_conf->rootfs, lxc_conf->autodevtmpfssize, lxcpath);
55022530
CB
3422 if (ret < 0)
3423 return log_error(-1, "Failed to mount \"/dev\"");
c6883f38
SH
3424 }
3425
ea57e424 3426 lxc_conf->rootfs.dfd_dev = open_at(lxc_conf->rootfs.dfd_mnt, "dev",
fdb57ab4
CB
3427 PROTECT_OPATH_DIRECTORY,
3428 PROTECT_LOOKUP_BENEATH_XDEV, 0);
a5a08920 3429 if (lxc_conf->rootfs.dfd_dev < 0 && errno != ENOENT)
953db219
CB
3430 return log_error_errno(-errno, errno, "Failed to open \"/dev\"");
3431
8353b4c9
CB
3432 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3433 * need to wait until other stuff has finished.
368bbc02 3434 */
8353b4c9 3435 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
55022530
CB
3436 if (ret < 0)
3437 return log_error(-1, "Failed to setup first automatic mounts");
368bbc02 3438
48e5dcc8 3439 ret = setup_mount_fstab(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
55022530
CB
3440 if (ret < 0)
3441 return log_error(-1, "Failed to setup mounts");
576f946d 3442
c631115d
FA
3443 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3444 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3445 &lxc_conf->mount_list, name, lxcpath);
55022530
CB
3446 if (ret < 0)
3447 return log_error(-1, "Failed to setup mount entries");
c631115d
FA
3448 }
3449
8353b4c9 3450 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3451 if (execveat_supported()) {
3452 int fd;
f4bea7cc 3453 char path[STRLITERALLEN(SBINDIR) + STRLITERALLEN("/init.lxc.static") + 1];
4b5b3a2a 3454
9bcde680
CB
3455 ret = strnprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
3456 if (ret < 0)
55022530 3457 return log_error(-1, "Path to init.lxc.static too long");
4b5b3a2a 3458
f4bea7cc 3459 fd = open(path, O_NOCTTY | O_NOFOLLOW | O_CLOEXEC | O_PATH);
55022530
CB
3460 if (fd < 0)
3461 return log_error_errno(-1, errno, "Unable to open lxc.init.static");
4b5b3a2a
TA
3462
3463 ((struct execute_args *)handler->data)->init_fd = fd;
3464 ((struct execute_args *)handler->data)->init_path = NULL;
3465 } else {
3466 ret = lxc_execute_bind_init(handler);
55022530
CB
3467 if (ret < 0)
3468 return log_error(-1, "Failed to bind-mount the lxc init system");
8353b4c9
CB
3469 }
3470 }
2322903b 3471
8353b4c9
CB
3472 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3473 * mounted. It is guaranteed to be mounted now either through
3474 * automatically or via fstab entries.
368bbc02 3475 */
8353b4c9 3476 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
55022530
CB
3477 if (ret < 0)
3478 return log_error(-1, "Failed to setup remaining automatic mounts");
368bbc02 3479
8353b4c9 3480 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
55022530
CB
3481 if (ret < 0)
3482 return log_error(-1, "Failed to run mount hooks");
773fb9ca 3483
bc6928ff 3484 if (lxc_conf->autodev > 0) {
8353b4c9 3485 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
55022530
CB
3486 if (ret < 0)
3487 return log_error(-1, "Failed to run autodev hooks");
06749971 3488
8353b4c9 3489 ret = lxc_fill_autodev(&lxc_conf->rootfs);
55022530
CB
3490 if (ret < 0)
3491 return log_error(-1, "Failed to populate \"/dev\"");
91c3830e 3492 }
368bbc02 3493
75193660 3494 /* Make sure any start hooks are in the container */
55022530
CB
3495 if (!verify_start_hooks(lxc_conf))
3496 return log_error(-1, "Failed to verify start hooks");
75193660 3497
cf68ffd9
CB
3498 ret = lxc_create_tmp_proc_mount(lxc_conf);
3499 if (ret < 0)
3500 return log_error(-1, "Failed to \"/proc\" LSMs");
3501
ed8704d0 3502 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
37c74fd1 3503 lxc_conf->ttys.dir);
55022530
CB
3504 if (ret < 0)
3505 return log_error(-1, "Failed to setup console");
6e590161 3506
ed8704d0 3507 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
55022530
CB
3508 if (ret < 0)
3509 return log_error(-1, "Failed to setup \"/dev\" symlinks");
69aa6655 3510
8ce1abc2 3511 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
55022530
CB
3512 if (ret < 0)
3513 return log_error(-1, "Failed to pivot root into rootfs");
ed502555 3514
20502652
CB
3515 /* Setting the boot-id is best-effort for now. */
3516 if (lxc_conf->autodev > 0)
3517 (void)lxc_setup_boot_id();
3518
68f3899e 3519 ret = lxc_setup_devpts_child(handler);
55022530
CB
3520 if (ret < 0)
3521 return log_error(-1, "Failed to setup new devpts instance");
3c26f34e 3522
2187efd3
CB
3523 ret = lxc_create_ttys(handler);
3524 if (ret < 0)
e8bd4e43 3525 return -1;
e8bd4e43 3526
8353b4c9 3527 ret = setup_personality(lxc_conf->personality);
55022530
CB
3528 if (ret < 0)
3529 return log_error(-1, "Failed to set personality");
cccc74b5 3530
8353b4c9
CB
3531 /* Set sysctl value to a path under /proc/sys as determined from the
3532 * key. For e.g. net.ipv4.ip_forward translated to
3533 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3534 */
3535 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3536 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
55022530
CB
3537 if (ret < 0)
3538 return log_error(-1, "Failed to setup sysctl parameters");
7edd0540
L
3539 }
3540
97a8f74f 3541 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
55022530
CB
3542 if (!lxc_list_empty(&lxc_conf->caps))
3543 return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
8353b4c9 3544
55022530
CB
3545 if (dropcaps_except(&lxc_conf->keepcaps))
3546 return log_error(-1, "Failed to keep capabilities");
97a8f74f 3547 } else if (setup_caps(&lxc_conf->caps)) {
55022530 3548 return log_error(-1, "Failed to drop capabilities");
81810dd1
DL
3549 }
3550
79ff643d 3551 put_lxc_rootfs(&handler->conf->rootfs, true);
8353b4c9 3552 NOTICE("The container \"%s\" is set up", name);
cd54d859 3553
0ad19a3f 3554 return 0;
3555}
26ddeedd 3556
3f60c2f7 3557int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3558 char *argv[])
26ddeedd 3559{
26ddeedd 3560 struct lxc_list *it;
3ea957c6
RK
3561 int which;
3562
3563 for (which = 0; which < NUM_LXC_HOOKS; which ++) {
3564 if (strcmp(hookname, lxchook_names[which]) == 0)
3565 break;
3566 }
3567
3568 if (which >= NUM_LXC_HOOKS)
26ddeedd 3569 return -1;
3f60c2f7 3570
0fd73091 3571 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3572 int ret;
3f60c2f7
CB
3573 char *hook = it->elem;
3574
3575 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3576 hookname, argv);
3f60c2f7
CB
3577 if (ret < 0)
3578 return -1;
26ddeedd 3579 }
3f60c2f7 3580
26ddeedd
SH
3581 return 0;
3582}
72d0e1cb 3583
72d0e1cb
SG
3584int lxc_clear_config_caps(struct lxc_conf *c)
3585{
1a0e70ac 3586 struct lxc_list *it, *next;
72d0e1cb 3587
0fd73091 3588 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3589 lxc_list_del(it);
3590 free(it->elem);
3591 free(it);
3592 }
0fd73091 3593
72d0e1cb
SG
3594 return 0;
3595}
3596
c7e345ae
CB
3597static int lxc_free_idmap(struct lxc_list *id_map)
3598{
27c27d73
SH
3599 struct lxc_list *it, *next;
3600
46bc6f2a 3601 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3602 lxc_list_del(it);
3603 free(it->elem);
3604 free(it);
3605 }
c7e345ae 3606
27c27d73
SH
3607 return 0;
3608}
7e621263
CB
3609
3610static int __lxc_free_idmap(struct lxc_list *id_map)
3611{
3612 lxc_free_idmap(id_map);
3613 free(id_map);
3614 return 0;
3615}
3616define_cleanup_function(struct lxc_list *, __lxc_free_idmap);
27c27d73 3617
4355ab5f
SH
3618int lxc_clear_idmaps(struct lxc_conf *c)
3619{
3620 return lxc_free_idmap(&c->id_map);
3621}
3622
1fb86a7c
SH
3623int lxc_clear_config_keepcaps(struct lxc_conf *c)
3624{
0fd73091 3625 struct lxc_list *it, *next;
1fb86a7c 3626
0fd73091 3627 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3628 lxc_list_del(it);
3629 free(it->elem);
3630 free(it);
3631 }
0fd73091 3632
1fb86a7c
SH
3633 return 0;
3634}
3635
a3ed9b81 3636int lxc_clear_namespace(struct lxc_conf *c)
3637{
3638 int i;
3639 for (i = 0; i < LXC_NS_MAX; i++) {
3640 free(c->ns_share[i]);
3641 c->ns_share[i] = NULL;
3642 }
3643 return 0;
3644}
3645
54860ed0 3646int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3647{
54860ed0 3648 char *global_token, *namespaced_token;
ab1a6cac 3649 size_t namespaced_token_len;
54860ed0 3650 struct lxc_list *it, *next, *list;
ab1a6cac 3651 const char *k = key;
54860ed0 3652 bool all = false;
72d0e1cb 3653
54860ed0
CB
3654 if (version == CGROUP2_SUPER_MAGIC) {
3655 global_token = "lxc.cgroup2";
3656 namespaced_token = "lxc.cgroup2.";
6333c915 3657 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
54860ed0
CB
3658 list = &c->cgroup2;
3659 } else if (version == CGROUP_SUPER_MAGIC) {
3660 global_token = "lxc.cgroup";
3661 namespaced_token = "lxc.cgroup.";
6333c915 3662 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
54860ed0
CB
3663 list = &c->cgroup;
3664 } else {
ab1a6cac 3665 return -EINVAL;
54860ed0
CB
3666 }
3667
3668 if (strcmp(key, global_token) == 0)
72d0e1cb 3669 all = true;
6333c915 3670 else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
ab1a6cac 3671 k += namespaced_token_len;
a6390f01 3672 else
ab1a6cac 3673 return -EINVAL;
72d0e1cb 3674
0fd73091 3675 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3676 struct lxc_cgroup *cg = it->elem;
54860ed0 3677
72d0e1cb
SG
3678 if (!all && strcmp(cg->subsystem, k) != 0)
3679 continue;
54860ed0 3680
72d0e1cb
SG
3681 lxc_list_del(it);
3682 free(cg->subsystem);
3683 free(cg->value);
3684 free(cg);
3685 free(it);
3686 }
e409b214 3687
72d0e1cb
SG
3688 return 0;
3689}
3690
4bfb655e
CB
3691static void lxc_clear_devices(struct lxc_conf *conf)
3692{
3693 struct lxc_list *list = &conf->devices;
3694 struct lxc_list *it, *next;
3695
3696 lxc_list_for_each_safe(it, list, next) {
3697 lxc_list_del(it);
3698 free(it);
3699 }
3700}
3701
c6d09e15
WB
3702int lxc_clear_limits(struct lxc_conf *c, const char *key)
3703{
3704 struct lxc_list *it, *next;
c6d09e15 3705 const char *k = NULL;
0fd73091 3706 bool all = false;
c6d09e15 3707
b668653c 3708 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3709 all = true;
6333c915
CB
3710 else if (strncmp(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")) == 0)
3711 k = key + STRLITERALLEN("lxc.limit.");
3712 else if (strncmp(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")) == 0)
3713 k = key + STRLITERALLEN("lxc.prlimit.");
c6d09e15
WB
3714 else
3715 return -1;
3716
0fd73091 3717 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3718 struct lxc_limit *lim = it->elem;
0fd73091 3719
c6d09e15
WB
3720 if (!all && strcmp(lim->resource, k) != 0)
3721 continue;
0fd73091 3722
c6d09e15
WB
3723 lxc_list_del(it);
3724 free(lim->resource);
3725 free(lim);
3726 free(it);
3727 }
b668653c 3728
c6d09e15
WB
3729 return 0;
3730}
3731
7edd0540
L
3732int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3733{
3734 struct lxc_list *it, *next;
7edd0540 3735 const char *k = NULL;
0fd73091 3736 bool all = false;
7edd0540
L
3737
3738 if (strcmp(key, "lxc.sysctl") == 0)
3739 all = true;
6333c915
CB
3740 else if (strncmp(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")) == 0)
3741 k = key + STRLITERALLEN("lxc.sysctl.");
7edd0540
L
3742 else
3743 return -1;
3744
0fd73091 3745 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3746 struct lxc_sysctl *elem = it->elem;
0fd73091 3747
7edd0540
L
3748 if (!all && strcmp(elem->key, k) != 0)
3749 continue;
0fd73091 3750
7edd0540
L
3751 lxc_list_del(it);
3752 free(elem->key);
3753 free(elem->value);
3754 free(elem);
3755 free(it);
3756 }
0fd73091 3757
7edd0540
L
3758 return 0;
3759}
3760
61d7a733
YT
3761int lxc_clear_procs(struct lxc_conf *c, const char *key)
3762{
0fd73091 3763 struct lxc_list *it, *next;
61d7a733 3764 const char *k = NULL;
0fd73091 3765 bool all = false;
61d7a733
YT
3766
3767 if (strcmp(key, "lxc.proc") == 0)
3768 all = true;
6333c915
CB
3769 else if (strncmp(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")) == 0)
3770 k = key + STRLITERALLEN("lxc.proc.");
61d7a733
YT
3771 else
3772 return -1;
3773
0fd73091 3774 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3775 struct lxc_proc *proc = it->elem;
0fd73091 3776
61d7a733
YT
3777 if (!all && strcmp(proc->filename, k) != 0)
3778 continue;
0fd73091 3779
61d7a733
YT
3780 lxc_list_del(it);
3781 free(proc->filename);
3782 free(proc->value);
3783 free(proc);
3784 free(it);
3785 }
3786
3787 return 0;
3788}
3789
ee1e7aa0
SG
3790int lxc_clear_groups(struct lxc_conf *c)
3791{
0fd73091 3792 struct lxc_list *it, *next;
ee1e7aa0 3793
0fd73091 3794 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3795 lxc_list_del(it);
3796 free(it->elem);
3797 free(it);
3798 }
0fd73091 3799
ee1e7aa0
SG
3800 return 0;
3801}
3802
ab799c0b
SG
3803int lxc_clear_environment(struct lxc_conf *c)
3804{
0fd73091 3805 struct lxc_list *it, *next;
ab799c0b 3806
0fd73091 3807 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3808 lxc_list_del(it);
3809 free(it->elem);
3810 free(it);
3811 }
0fd73091 3812
ab799c0b
SG
3813 return 0;
3814}
3815
72d0e1cb
SG
3816int lxc_clear_mount_entries(struct lxc_conf *c)
3817{
0fd73091 3818 struct lxc_list *it, *next;
72d0e1cb 3819
0fd73091 3820 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3821 lxc_list_del(it);
3822 free(it->elem);
3823 free(it);
3824 }
0fd73091 3825
72d0e1cb
SG
3826 return 0;
3827}
3828
b099e9e9
SH
3829int lxc_clear_automounts(struct lxc_conf *c)
3830{
3831 c->auto_mounts = 0;
3832 return 0;
3833}
3834
12a50cc6 3835int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3836{
72d0e1cb 3837 int i;
0fd73091
CB
3838 struct lxc_list *it, *next;
3839 const char *k = NULL;
3840 bool all = false, done = false;
72d0e1cb 3841
17ed13a3
SH
3842 if (strcmp(key, "lxc.hook") == 0)
3843 all = true;
6333c915
CB
3844 else if (strncmp(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")) == 0)
3845 k = key + STRLITERALLEN("lxc.hook.");
a6390f01
WB
3846 else
3847 return -1;
17ed13a3 3848
0fd73091 3849 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 3850 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 3851 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3852 lxc_list_del(it);
3853 free(it->elem);
3854 free(it);
3855 }
0fd73091 3856
17ed13a3 3857 done = true;
72d0e1cb
SG
3858 }
3859 }
17ed13a3 3860
55022530
CB
3861 if (!done)
3862 return log_error(-1, "Invalid hook key: %s", key);
0fd73091 3863
72d0e1cb
SG
3864 return 0;
3865}
8eb5694b 3866
4184c3e1
SH
3867static inline void lxc_clear_aliens(struct lxc_conf *conf)
3868{
0fd73091 3869 struct lxc_list *it, *next;
4184c3e1 3870
0fd73091 3871 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
3872 lxc_list_del(it);
3873 free(it->elem);
3874 free(it);
3875 }
3876}
3877
c7b15d1e 3878void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 3879{
0fd73091 3880 struct lxc_list *it, *next;
f979ac15 3881
0fd73091 3882 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
3883 lxc_list_del(it);
3884 free(it->elem);
3885 free(it);
3886 }
3887}
3888
1800f924
WB
3889int lxc_clear_apparmor_raw(struct lxc_conf *c)
3890{
3891 struct lxc_list *it, *next;
3892
3893 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
3894 lxc_list_del(it);
3895 free(it->elem);
3896 free(it);
3897 }
3898
3899 return 0;
3900}
3901
8eb5694b
SH
/*
 * Release a configuration object together with the resources it holds.
 *
 * A NULL @conf is a no-op. @conf itself is freed at the end, so the pointer
 * must not be used afterwards.
 */
void lxc_conf_free(struct lxc_conf *conf)
{
	if (!conf)
		return;

	/* Do not leave the global pointing at memory that is about to be freed. */
	if (current_config == conf)
		current_config = NULL;
	lxc_terminal_conf_free(&conf->console);
	/* Rootfs description. */
	free(conf->rootfs.mount);
	free(conf->rootfs.bdev_type);
	free(conf->rootfs.options);
	free(conf->rootfs.path);
	free(conf->rootfs.data);
	put_lxc_rootfs(&conf->rootfs, true);
	free(conf->logfile);
	/* -1 marks "no log file descriptor open". */
	if (conf->logfd != -1)
		close(conf->logfd);
	free(conf->utsname);
	free(conf->ttys.dir);
	free(conf->ttys.tty_names);
	free(conf->fstab);
	free(conf->rcfile);
	free(conf->execute_cmd);
	free(conf->init_cmd);
	free(conf->init_groups.list);
	free(conf->init_cwd);
	free(conf->unexpanded_config);
	free(conf->syslog);
	lxc_free_networks(&conf->network);
	free(conf->lsm_aa_profile);
	free(conf->lsm_aa_profile_computed);
	free(conf->lsm_se_context);
	lxc_seccomp_free(&conf->seccomp);
	/* List-backed settings; the bare namespace keys request "clear all". */
	lxc_clear_config_caps(conf);
	lxc_clear_config_keepcaps(conf);
	lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
	lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
	lxc_clear_devices(conf);
	lxc_clear_hooks(conf, "lxc.hook");
	lxc_clear_mount_entries(conf);
	lxc_clear_idmaps(conf);
	lxc_clear_groups(conf);
	lxc_clear_includes(conf);
	lxc_clear_aliens(conf);
	lxc_clear_environment(conf);
	lxc_clear_limits(conf, "lxc.prlimit");
	lxc_clear_sysctls(conf, "lxc.sysctl");
	lxc_clear_procs(conf, "lxc.proc");
	lxc_clear_apparmor_raw(conf);
	lxc_clear_namespace(conf);
	/* Cgroup layout strings. */
	free(conf->cgroup_meta.dir);
	free(conf->cgroup_meta.monitor_dir);
	free(conf->cgroup_meta.monitor_pivot_dir);
	free(conf->cgroup_meta.container_dir);
	free(conf->cgroup_meta.namespace_dir);
	free(conf->cgroup_meta.controllers);
	free(conf->shmount.path_host);
	free(conf->shmount.path_cont);
	free(conf);
}
4355ab5f
SH
3962
/* Arguments handed to run_userns_fn() in the child of a user namespace clone. */
struct userns_fn_data {
	/* Callback executed in the child once the parent signals readiness. */
	int (*fn)(void *);
	/* Optional name of @fn, used for trace logging only; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Sync pipe: the child reads one byte from p[0] and closes p[1]. */
	int p[2];
};
3969
3970static int run_userns_fn(void *data)
3971{
766c5b6d 3972 struct userns_fn_data *d = data;
adaffdd7 3973 int ret;
4355ab5f 3974 char c;
4355ab5f 3975
766c5b6d 3976 close_prot_errno_disarm(d->p[1]);
f8aa4bf3 3977
766c5b6d
CB
3978 /*
3979 * Wait for parent to finish establishing a new mapping in the user
f8aa4bf3
CB
3980 * namespace we are executing in.
3981 */
adaffdd7 3982 ret = lxc_read_nointr(d->p[0], &c, 1);
766c5b6d 3983 close_prot_errno_disarm(d->p[0]);
adaffdd7
CB
3984 if (ret != 1)
3985 return -1;
f8aa4bf3 3986
c9b7c33e 3987 if (d->fn_name)
adaffdd7 3988 TRACE("Calling function \"%s\"", d->fn_name);
0fd73091 3989
f8aa4bf3 3990 /* Call function to run. */
4355ab5f
SH
3991 return d->fn(d->arg);
3992}
3993
7581a82f 3994static struct id_map *mapped_nsid_add(const struct lxc_conf *conf, unsigned id,
db7cfe23
CB
3995 enum idtype idtype)
3996{
5173b710
CB
3997 const struct id_map *map;
3998 struct id_map *retmap;
db7cfe23
CB
3999
4000 map = find_mapped_nsid_entry(conf, id, idtype);
4001 if (!map)
4002 return NULL;
4003
4004 retmap = malloc(sizeof(*retmap));
4005 if (!retmap)
4006 return NULL;
4007
4008 memcpy(retmap, map, sizeof(*retmap));
4009 return retmap;
4010}
4011
7581a82f 4012static struct id_map *find_mapped_hostid_entry(const struct lxc_conf *conf,
c4333195 4013 unsigned id, enum idtype idtype)
f8aa4bf3 4014{
f8aa4bf3 4015 struct id_map *map;
0fd73091 4016 struct lxc_list *it;
f8aa4bf3
CB
4017 struct id_map *retmap = NULL;
4018
0fd73091 4019 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4020 map = it->elem;
4021 if (map->idtype != idtype)
4022 continue;
4023
4024 if (id >= map->hostid && id < map->hostid + map->range) {
4025 retmap = map;
4026 break;
4027 }
4028 }
4029
f8aa4bf3
CB
4030 return retmap;
4031}
4032
0fd73091 4033/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4034 * existing one or establish a new one.
4355ab5f 4035 */
7581a82f 4036static struct id_map *mapped_hostid_add(const struct lxc_conf *conf, uid_t id,
0fd73091 4037 enum idtype type)
4355ab5f 4038{
55022530 4039 __do_free struct id_map *entry = NULL;
28a2d9e7 4040 int hostid_mapped;
55022530 4041 struct id_map *tmp = NULL;
c4333195
CB
4042
4043 entry = malloc(sizeof(*entry));
4044 if (!entry)
4045 return NULL;
f8aa4bf3 4046
28a2d9e7 4047 /* Reuse existing mapping. */
c4333195 4048 tmp = find_mapped_hostid_entry(conf, id, type);
1758c195
CB
4049 if (tmp) {
4050 memcpy(entry, tmp, sizeof(*entry));
4051 } else {
4052 /* Find new mapping. */
4053 hostid_mapped = find_unmapped_nsid(conf, type);
4054 if (hostid_mapped < 0)
4055 return log_debug(NULL, "Failed to find free mapping for id %d", id);
4056
4057 entry->idtype = type;
4058 entry->nsid = hostid_mapped;
4059 entry->hostid = (unsigned long)id;
4060 entry->range = 1;
4061 }
4355ab5f 4062
55022530 4063 return move_ptr(entry);
4355ab5f
SH
4064}
4065
dbfcdf86
CB
4066static struct lxc_list *get_minimal_idmap(const struct lxc_conf *conf,
4067 uid_t *resuid, gid_t *resgid)
4355ab5f 4068{
00d6cfe2
CB
4069 __do_free struct id_map *container_root_uid = NULL,
4070 *container_root_gid = NULL,
4071 *host_uid_map = NULL, *host_gid_map = NULL;
4072 __do_free struct lxc_list *idmap = NULL;
f8aa4bf3 4073 uid_t euid, egid;
4160c3a0
CB
4074 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4075 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
00d6cfe2 4076 struct lxc_list *tmplist = NULL;
4355ab5f 4077
db7cfe23 4078 /* Find container root mappings. */
4160c3a0 4079 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
55022530
CB
4080 if (!container_root_uid)
4081 return log_debug(NULL, "Failed to find mapping for namespace uid %d", 0);
dcf0ffdf
CB
4082 euid = geteuid();
4083 if (euid >= container_root_uid->hostid &&
4084 euid < (container_root_uid->hostid + container_root_uid->range))
2c996219 4085 host_uid_map = move_ptr(container_root_uid);
f8aa4bf3 4086
4160c3a0 4087 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
55022530
CB
4088 if (!container_root_gid)
4089 return log_debug(NULL, "Failed to find mapping for namespace gid %d", 0);
dcf0ffdf
CB
4090 egid = getegid();
4091 if (egid >= container_root_gid->hostid &&
4092 egid < (container_root_gid->hostid + container_root_gid->range))
2c996219 4093 host_gid_map = move_ptr(container_root_gid);
f8aa4bf3
CB
4094
4095 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4096 if (!host_uid_map)
c4333195 4097 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
55022530
CB
4098 if (!host_uid_map)
4099 return log_debug(NULL, "Failed to find mapping for uid %d", euid);
f8aa4bf3 4100
dcf0ffdf
CB
4101 if (!host_gid_map)
4102 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
55022530
CB
4103 if (!host_gid_map)
4104 return log_debug(NULL, "Failed to find mapping for gid %d", egid);
28a2d9e7
CB
4105
4106 /* Allocate new {g,u}id map list. */
4107 idmap = malloc(sizeof(*idmap));
4108 if (!idmap)
00d6cfe2 4109 return NULL;
28a2d9e7
CB
4110 lxc_list_init(idmap);
4111
f8aa4bf3
CB
4112 /* Add container root to the map. */
4113 tmplist = malloc(sizeof(*tmplist));
4114 if (!tmplist)
00d6cfe2 4115 return NULL;
47649d5b
CB
4116 /* idmap will now keep track of that memory. */
4117 lxc_list_add_elem(tmplist, move_ptr(host_uid_map));
f8aa4bf3 4118 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4119
2c996219 4120 if (container_root_uid) {
28a2d9e7
CB
4121 /* Add container root to the map. */
4122 tmplist = malloc(sizeof(*tmplist));
4123 if (!tmplist)
00d6cfe2 4124 return NULL;
47649d5b
CB
4125 /* idmap will now keep track of that memory. */
4126 lxc_list_add_elem(tmplist, move_ptr(container_root_uid));
28a2d9e7 4127 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4128 }
f8aa4bf3
CB
4129
4130 tmplist = malloc(sizeof(*tmplist));
4131 if (!tmplist)
00d6cfe2 4132 return NULL;
47649d5b
CB
4133 /* idmap will now keep track of that memory. */
4134 lxc_list_add_elem(tmplist, move_ptr(host_gid_map));
f8aa4bf3 4135 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4136
2c996219 4137 if (container_root_gid) {
28a2d9e7
CB
4138 tmplist = malloc(sizeof(*tmplist));
4139 if (!tmplist)
00d6cfe2 4140 return NULL;
47649d5b
CB
4141 /* idmap will now keep track of that memory. */
4142 lxc_list_add_elem(tmplist, move_ptr(container_root_gid));
28a2d9e7 4143 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4144 }
f8aa4bf3 4145
dbfcdf86
CB
4146 TRACE("Allocated minimal idmapping for ns uid %d and ns gid %d", nsuid, nsgid);
4147
4148 if (resuid)
4149 *resuid = nsuid;
4150 if (resgid)
4151 *resgid = nsgid;
00d6cfe2 4152 return move_ptr(idmap);
dcf0ffdf
CB
4153}
4154
766c5b6d
CB
4155/*
4156 * Run a function in a new user namespace.
dcf0ffdf
CB
4157 * The caller's euid/egid will be mapped if it is not already.
4158 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4159 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4160 * This means we require only to establish a mapping from:
4161 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4162 * - the container root -> some sub{g,u}id
915e3dbd 4163 * The former we add, if the user did not specify a mapping. The latter we
6f3fd27f 4164 * retrieve from the container's configured {g,u}id mappings as it must have been
dcf0ffdf
CB
4165 * there to start the container in the first place.
4166 */
7581a82f 4167int userns_exec_1(const struct lxc_conf *conf, int (*fn)(void *), void *data,
dcf0ffdf
CB
4168 const char *fn_name)
4169{
7e621263 4170 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
0fd73091
CB
4171 int ret = -1, status = -1;
4172 char c = '1';
46bc6f2a
CB
4173 struct userns_fn_data d = {
4174 .arg = data,
4175 .fn = fn,
4176 .fn_name = fn_name,
4177 };
766c5b6d
CB
4178 pid_t pid;
4179 int pipe_fds[2];
dcf0ffdf 4180
2b2655a8
CB
4181 if (!conf)
4182 return -EINVAL;
4183
dbfcdf86 4184 idmap = get_minimal_idmap(conf, NULL, NULL);
dcf0ffdf 4185 if (!idmap)
766c5b6d 4186 return ret_errno(ENOENT);
dcf0ffdf 4187
766c5b6d
CB
4188 ret = pipe2(pipe_fds, O_CLOEXEC);
4189 if (ret < 0)
4190 return -errno;
4191
766c5b6d
CB
4192 d.p[0] = pipe_fds[0];
4193 d.p[1] = pipe_fds[1];
dcf0ffdf
CB
4194
4195 /* Clone child in new user namespace. */
a59440be 4196 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER, NULL);
dcf0ffdf 4197 if (pid < 0) {
0fd73091 4198 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4199 goto on_error;
4200 }
4201
766c5b6d 4202 close_prot_errno_disarm(pipe_fds[0]);
dcf0ffdf 4203
62fef886 4204 if (lxc_log_trace()) {
dcf0ffdf 4205 struct id_map *map;
0fd73091 4206 struct lxc_list *it;
dcf0ffdf 4207
766c5b6d 4208 lxc_list_for_each(it, idmap) {
f8aa4bf3 4209 map = it->elem;
766c5b6d
CB
4210 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4211 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
f8aa4bf3 4212 }
4355ab5f
SH
4213 }
4214
f8aa4bf3 4215 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4216 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4217 if (ret < 0) {
0fd73091 4218 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4219 goto on_error;
4355ab5f
SH
4220 }
4221
f8aa4bf3 4222 /* Tell child to proceed. */
766c5b6d 4223 if (lxc_write_nointr(pipe_fds[1], &c, 1) != 1) {
dcf0ffdf 4224 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4225 goto on_error;
4355ab5f
SH
4226 }
4227
686dd5d1 4228on_error:
766c5b6d
CB
4229 close_prot_errno_disarm(pipe_fds[0]);
4230 close_prot_errno_disarm(pipe_fds[1]);
f8aa4bf3 4231
ee1b16bc
TA
4232 /* Wait for child to finish. */
4233 if (pid > 0)
4234 status = wait_for_pid(pid);
4235
686dd5d1
CB
4236 if (status < 0)
4237 ret = -1;
4238
f8aa4bf3 4239 return ret;
4355ab5f 4240}
97e9cfa0 4241
d1783ef4
CB
4242int userns_exec_minimal(const struct lxc_conf *conf,
4243 int (*fn_parent)(void *), void *fn_parent_data,
4244 int (*fn_child)(void *), void *fn_child_data)
edf88289 4245{
7e621263 4246 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
dbfcdf86
CB
4247 uid_t resuid = LXC_INVALID_UID;
4248 gid_t resgid = LXC_INVALID_GID;
edf88289 4249 char c = '1';
dbfcdf86 4250 ssize_t ret;
edf88289
CB
4251 pid_t pid;
4252 int sock_fds[2];
4253
d1783ef4 4254 if (!conf || !fn_child)
dbfcdf86 4255 return ret_errno(EINVAL);
edf88289 4256
dbfcdf86 4257 idmap = get_minimal_idmap(conf, &resuid, &resgid);
edf88289
CB
4258 if (!idmap)
4259 return ret_errno(ENOENT);
4260
4261 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4262 if (ret < 0)
4263 return -errno;
4264
4265 pid = fork();
4266 if (pid < 0) {
dbfcdf86 4267 SYSERROR("Failed to create new process");
edf88289
CB
4268 goto on_error;
4269 }
4270
4271 if (pid == 0) {
4272 close_prot_errno_disarm(sock_fds[1]);
4273
4274 ret = unshare(CLONE_NEWUSER);
dbfcdf86
CB
4275 if (ret < 0) {
4276 SYSERROR("Failed to unshare new user namespace");
edf88289 4277 _exit(EXIT_FAILURE);
dbfcdf86 4278 }
edf88289 4279
dbfcdf86
CB
4280 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4281 if (ret != 1)
edf88289
CB
4282 _exit(EXIT_FAILURE);
4283
4284 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4285 if (ret != 1)
4286 _exit(EXIT_FAILURE);
4287
4288 close_prot_errno_disarm(sock_fds[0]);
4289
8917c382 4290 if (!lxc_drop_groups() && errno != EPERM)
edf88289
CB
4291 _exit(EXIT_FAILURE);
4292
dbfcdf86
CB
4293 ret = setresgid(resgid, resgid, resgid);
4294 if (ret < 0) {
4295 SYSERROR("Failed to setresgid(%d, %d, %d)",
4296 resgid, resgid, resgid);
edf88289 4297 _exit(EXIT_FAILURE);
dbfcdf86
CB
4298 }
4299
4300 ret = setresuid(resuid, resuid, resuid);
4301 if (ret < 0) {
4302 SYSERROR("Failed to setresuid(%d, %d, %d)",
4303 resuid, resuid, resuid);
4304 _exit(EXIT_FAILURE);
4305 }
edf88289 4306
d1783ef4 4307 ret = fn_child(fn_child_data);
dbfcdf86
CB
4308 if (ret) {
4309 SYSERROR("Running function in new user namespace failed");
edf88289 4310 _exit(EXIT_FAILURE);
dbfcdf86 4311 }
edf88289
CB
4312
4313 _exit(EXIT_SUCCESS);
4314 }
4315
4316 close_prot_errno_disarm(sock_fds[0]);
4317
62fef886 4318 if (lxc_log_trace()) {
edf88289
CB
4319 struct id_map *map;
4320 struct lxc_list *it;
4321
4322 lxc_list_for_each(it, idmap) {
4323 map = it->elem;
4324 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4325 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4326 }
4327 }
4328
4329 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4330 if (ret != 1) {
4331 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4332 goto on_error;
4333 }
4334
4335 /* Set up {g,u}id mapping for user namespace of child process. */
4336 ret = lxc_map_ids(idmap, pid);
4337 if (ret < 0) {
4338 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4339 goto on_error;
4340 }
4341
4342 /* Tell child to proceed. */
4343 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4344 if (ret != 1) {
4345 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4346 goto on_error;
4347 }
4348
d1783ef4
CB
4349 if (fn_parent && fn_parent(fn_parent_data)) {
4350 SYSERROR("Running parent function failed");
4351 _exit(EXIT_FAILURE);
4352 }
4353
edf88289
CB
4354on_error:
4355 close_prot_errno_disarm(sock_fds[0]);
4356 close_prot_errno_disarm(sock_fds[1]);
4357
4358 /* Wait for child to finish. */
dbfcdf86
CB
4359 if (pid < 0)
4360 return -1;
edf88289 4361
dbfcdf86 4362 return wait_for_pid(pid);
edf88289
CB
4363}
4364
415a8851
CB
4365int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4366 const char *fn_name)
4367{
4368 pid_t pid;
4369 uid_t euid, egid;
415a8851
CB
4370 int p[2];
4371 struct id_map *map;
4372 struct lxc_list *cur;
0fd73091 4373 struct userns_fn_data d;
415a8851 4374 int ret = -1;
0fd73091 4375 char c = '1';
415a8851
CB
4376 struct lxc_list *idmap = NULL, *tmplist = NULL;
4377 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4378 *host_uid_map = NULL, *host_gid_map = NULL;
4379
2b2655a8
CB
4380 if (!conf)
4381 return -EINVAL;
4382
979f9e34 4383 ret = pipe2(p, O_CLOEXEC);
415a8851
CB
4384 if (ret < 0) {
4385 SYSERROR("opening pipe");
4386 return -1;
4387 }
4388 d.fn = fn;
4389 d.fn_name = fn_name;
4390 d.arg = data;
4391 d.p[0] = p[0];
4392 d.p[1] = p[1];
4393
4394 /* Clone child in new user namespace. */
33258b95 4395 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER, NULL);
415a8851 4396 if (pid < 0) {
0fd73091 4397 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4398 goto on_error;
4399 }
4400
4401 close(p[0]);
4402 p[0] = -1;
4403
4404 euid = geteuid();
4405 egid = getegid();
4406
4407 /* Allocate new {g,u}id map list. */
4408 idmap = malloc(sizeof(*idmap));
4409 if (!idmap)
4410 goto on_error;
4411 lxc_list_init(idmap);
4412
4413 /* Find container root. */
0fd73091 4414 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4415 struct id_map *tmpmap;
4416
4417 tmplist = malloc(sizeof(*tmplist));
4418 if (!tmplist)
4419 goto on_error;
4420
4421 tmpmap = malloc(sizeof(*tmpmap));
4422 if (!tmpmap) {
4423 free(tmplist);
4424 goto on_error;
4425 }
4426
4427 memset(tmpmap, 0, sizeof(*tmpmap));
4428 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4429 tmplist->elem = tmpmap;
4430
4431 lxc_list_add_tail(idmap, tmplist);
4432
4433 map = cur->elem;
4434
4435 if (map->idtype == ID_TYPE_UID)
4436 if (euid >= map->hostid && euid < map->hostid + map->range)
4437 host_uid_map = map;
4438
4439 if (map->idtype == ID_TYPE_GID)
4440 if (egid >= map->hostid && egid < map->hostid + map->range)
4441 host_gid_map = map;
4442
4443 if (map->nsid != 0)
4444 continue;
4445
4446 if (map->idtype == ID_TYPE_UID)
4447 if (container_root_uid == NULL)
4448 container_root_uid = map;
4449
4450 if (map->idtype == ID_TYPE_GID)
4451 if (container_root_gid == NULL)
4452 container_root_gid = map;
4453 }
4454
4455 if (!container_root_uid || !container_root_gid) {
4456 ERROR("No mapping for container root found");
4457 goto on_error;
4458 }
4459
4460 /* Check whether the {g,u}id of the user has a mapping. */
4461 if (!host_uid_map)
c4333195 4462 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4463 else
4464 host_uid_map = container_root_uid;
4465
4466 if (!host_gid_map)
c4333195 4467 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4468 else
4469 host_gid_map = container_root_gid;
4470
4471 if (!host_uid_map) {
4472 DEBUG("Failed to find mapping for uid %d", euid);
4473 goto on_error;
4474 }
4475
4476 if (!host_gid_map) {
4477 DEBUG("Failed to find mapping for gid %d", egid);
4478 goto on_error;
4479 }
4480
4481 if (host_uid_map && (host_uid_map != container_root_uid)) {
4482 /* Add container root to the map. */
4483 tmplist = malloc(sizeof(*tmplist));
4484 if (!tmplist)
4485 goto on_error;
4486 lxc_list_add_elem(tmplist, host_uid_map);
4487 lxc_list_add_tail(idmap, tmplist);
4488 }
4489 /* idmap will now keep track of that memory. */
4490 host_uid_map = NULL;
4491
4492 if (host_gid_map && (host_gid_map != container_root_gid)) {
4493 tmplist = malloc(sizeof(*tmplist));
4494 if (!tmplist)
4495 goto on_error;
4496 lxc_list_add_elem(tmplist, host_gid_map);
4497 lxc_list_add_tail(idmap, tmplist);
4498 }
4499 /* idmap will now keep track of that memory. */
4500 host_gid_map = NULL;
4501
62fef886 4502 if (lxc_log_trace()) {
0fd73091 4503 lxc_list_for_each (cur, idmap) {
415a8851
CB
4504 map = cur->elem;
4505 TRACE("establishing %cid mapping for \"%d\" in new "
4506 "user namespace: nsuid %lu - hostid %lu - range "
4507 "%lu",
4508 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4509 map->nsid, map->hostid, map->range);
4510 }
4511 }
4512
4513 /* Set up {g,u}id mapping for user namespace of child process. */
4514 ret = lxc_map_ids(idmap, pid);
4515 if (ret < 0) {
0fd73091 4516 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4517 goto on_error;
4518 }
4519
4520 /* Tell child to proceed. */
489f39be 4521 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4522 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4523 goto on_error;
4524 }
4525
686dd5d1 4526on_error:
ee1b16bc
TA
4527 if (p[0] != -1)
4528 close(p[0]);
4529 close(p[1]);
4530
415a8851 4531 /* Wait for child to finish. */
686dd5d1
CB
4532 if (pid > 0)
4533 ret = wait_for_pid(pid);
415a8851 4534
7e621263
CB
4535 if (idmap)
4536 __lxc_free_idmap(idmap);
80758b4b 4537
415a8851
CB
4538 if (host_uid_map && (host_uid_map != container_root_uid))
4539 free(host_uid_map);
4540 if (host_gid_map && (host_gid_map != container_root_gid))
4541 free(host_gid_map);
4542
415a8851
CB
4543 return ret;
4544}
4545
234998b4
CB
4546static int add_idmap_entry(struct lxc_list *idmap, enum idtype idtype,
4547 unsigned long nsid, unsigned long hostid,
4548 unsigned long range)
4549{
4550 __do_free struct id_map *new_idmap = NULL;
4551 __do_free struct lxc_list *new_list = NULL;
4552
4553 new_idmap = zalloc(sizeof(*new_idmap));
4554 if (!new_idmap)
4555 return ret_errno(ENOMEM);
4556
4557 new_idmap->idtype = idtype;
4558 new_idmap->hostid = hostid;
4559 new_idmap->nsid = nsid;
4560 new_idmap->range = range;
4561
4562 new_list = zalloc(sizeof(*new_list));
4563 if (!new_list)
4564 return ret_errno(ENOMEM);
4565
4566 new_list->elem = move_ptr(new_idmap);
4567 lxc_list_add_tail(idmap, move_ptr(new_list));
4568
4569 INFO("Adding id map: type %c nsid %lu hostid %lu range %lu",
4570 idtype == ID_TYPE_UID ? 'u' : 'g', nsid, hostid, range);
4571 return 0;
4572}
4573
4574int userns_exec_mapped_root(const char *path, int path_fd,
4575 const struct lxc_conf *conf)
4576{
7e621263 4577 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
234998b4
CB
4578 __do_close int fd = -EBADF;
4579 int target_fd = -EBADF;
4580 char c = '1';
4581 ssize_t ret;
4582 pid_t pid;
4583 int sock_fds[2];
4584 uid_t container_host_uid, hostuid;
4585 gid_t container_host_gid, hostgid;
4586 struct stat st;
4587
4588 if (!conf || (!path && path_fd < 0))
4589 return ret_errno(EINVAL);
4590
4591 if (!path)
4592 path = "(null)";
4593
4594 container_host_uid = get_mapped_rootid(conf, ID_TYPE_UID);
4595 if (!uid_valid(container_host_uid))
4596 return log_error(-1, "No uid mapping for container root");
4597
4598 container_host_gid = get_mapped_rootid(conf, ID_TYPE_GID);
4599 if (!gid_valid(container_host_gid))
4600 return log_error(-1, "No gid mapping for container root");
4601
cf68ffd9 4602 if (path_fd < 0) {
a72c68f7 4603 fd = open(path, O_CLOEXEC | O_NOCTTY);
234998b4
CB
4604 if (fd < 0)
4605 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
4606 target_fd = fd;
4607 } else {
4608 target_fd = path_fd;
4609 }
4610
4611 hostuid = geteuid();
4612 /* We are root so chown directly. */
4613 if (hostuid == 0) {
4614 ret = fchown(target_fd, container_host_uid, container_host_gid);
4615 if (ret)
4616 return log_error_errno(-errno, errno,
4617 "Failed to fchown(%d(%s), %d, %d)",
4618 target_fd, path, container_host_uid,
4619 container_host_gid);
4620 return log_trace(0, "Chowned %d(%s) to uid %d and %d", target_fd, path,
4621 container_host_uid, container_host_gid);
4622 }
4623
4624 /* The container's root host id matches */
4625 if (container_host_uid == hostuid)
4626 return log_info(0, "Container root id is mapped to our uid");
4627
4628 /* Get the current ids of our target. */
4629 ret = fstat(target_fd, &st);
4630 if (ret)
4631 return log_error_errno(-errno, errno, "Failed to stat \"%s\"", path);
4632
4633 hostgid = getegid();
4634 if (st.st_uid == hostuid && mapped_hostid(st.st_gid, conf, ID_TYPE_GID) < 0) {
4635 ret = fchown(target_fd, -1, hostgid);
4636 if (ret)
4637 return log_error_errno(-errno, errno,
4638 "Failed to fchown(%d(%s), -1, %d)",
4639 target_fd, path, hostgid);
2e8013f9 4640 TRACE("Chowned %d(%s) to -1:%d", target_fd, path, hostgid);
234998b4
CB
4641 }
4642
4643 idmap = malloc(sizeof(*idmap));
4644 if (!idmap)
4645 return -ENOMEM;
4646 lxc_list_init(idmap);
4647
4648 /* "u:0:rootuid:1" */
4649 ret = add_idmap_entry(idmap, ID_TYPE_UID, 0, container_host_uid, 1);
4650 if (ret < 0)
4651 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4652
4653 /* "u:hostuid:hostuid:1" */
4654 ret = add_idmap_entry(idmap, ID_TYPE_UID, hostuid, hostuid, 1);
4655 if (ret < 0)
4656 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4657
4658 /* "g:0:rootgid:1" */
4659 ret = add_idmap_entry(idmap, ID_TYPE_GID, 0, container_host_gid, 1);
4660 if (ret < 0)
4661 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4662
4663 /* "g:hostgid:hostgid:1" */
4664 ret = add_idmap_entry(idmap, ID_TYPE_GID, hostgid, hostgid, 1);
4665 if (ret < 0)
4666 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4667
4668 if (hostgid != st.st_gid) {
4669 /* "g:pathgid:rootgid+pathgid:1" */
4670 ret = add_idmap_entry(idmap, ID_TYPE_GID, st.st_gid,
4671 container_host_gid + (gid_t)st.st_gid, 1);
4672 if (ret < 0)
4673 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4674 }
4675
4676 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4677 if (ret < 0)
4678 return -errno;
4679
4680 pid = fork();
4681 if (pid < 0) {
4682 SYSERROR("Failed to create new process");
4683 goto on_error;
4684 }
4685
4686 if (pid == 0) {
4687 close_prot_errno_disarm(sock_fds[1]);
4688
4689 ret = unshare(CLONE_NEWUSER);
4690 if (ret < 0) {
4691 SYSERROR("Failed to unshare new user namespace");
4692 _exit(EXIT_FAILURE);
4693 }
4694
4695 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4696 if (ret != 1)
4697 _exit(EXIT_FAILURE);
4698
4699 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4700 if (ret != 1)
4701 _exit(EXIT_FAILURE);
4702
4703 close_prot_errno_disarm(sock_fds[0]);
4704
4705 if (!lxc_switch_uid_gid(0, 0))
4706 _exit(EXIT_FAILURE);
4707
8917c382 4708 if (!lxc_drop_groups())
234998b4
CB
4709 _exit(EXIT_FAILURE);
4710
8053a085 4711 ret = fchown(target_fd, 0, st.st_gid);
234998b4 4712 if (ret) {
8ea93a0f 4713 SYSERROR("Failed to chown %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4714 _exit(EXIT_FAILURE);
4715 }
4716
2e8013f9 4717 TRACE("Chowned %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4718 _exit(EXIT_SUCCESS);
4719 }
4720
4721 close_prot_errno_disarm(sock_fds[0]);
4722
62fef886 4723 if (lxc_log_trace()) {
234998b4
CB
4724 struct id_map *map;
4725 struct lxc_list *it;
4726
4727 lxc_list_for_each(it, idmap) {
4728 map = it->elem;
4729 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4730 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4731 }
4732 }
4733
4734 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4735 if (ret != 1) {
4736 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4737 goto on_error;
4738 }
4739
4740 /* Set up {g,u}id mapping for user namespace of child process. */
4741 ret = lxc_map_ids(idmap, pid);
4742 if (ret < 0) {
4743 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4744 goto on_error;
4745 }
4746
4747 /* Tell child to proceed. */
4748 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4749 if (ret != 1) {
4750 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4751 goto on_error;
4752 }
4753
4754on_error:
4755 close_prot_errno_disarm(sock_fds[0]);
4756 close_prot_errno_disarm(sock_fds[1]);
4757
4758 /* Wait for child to finish. */
4759 if (pid < 0)
4760 return -1;
4761
4762 return wait_for_pid(pid);
4763}
4764
a96a8e8c 4765/* not thread-safe, do not use from api without first forking */
0fd73091 4766static char *getuname(void)
97e9cfa0 4767{
4f410b2a 4768 __do_free char *buf = NULL;
cb7aa5e8
DJ
4769 struct passwd pwent;
4770 struct passwd *pwentp = NULL;
cb7aa5e8
DJ
4771 size_t bufsize;
4772 int ret;
97e9cfa0 4773
cb7aa5e8
DJ
4774 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
4775 if (bufsize == -1)
4776 bufsize = 1024;
4777
4778 buf = malloc(bufsize);
4779 if (!buf)
97e9cfa0
SH
4780 return NULL;
4781
cb7aa5e8
DJ
4782 ret = getpwuid_r(geteuid(), &pwent, buf, bufsize, &pwentp);
4783 if (!pwentp) {
4784 if (ret == 0)
4785 WARN("Could not find matched password record.");
4786
55022530 4787 return log_error(NULL, "Failed to get password record - %u", geteuid());
cb7aa5e8
DJ
4788 }
4789
4f410b2a 4790 return strdup(pwent.pw_name);
97e9cfa0
SH
4791}
4792
a96a8e8c 4793/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4794static char *getgname(void)
4795{
4f410b2a 4796 __do_free char *buf = NULL;
3de9fb4c
DJ
4797 struct group grent;
4798 struct group *grentp = NULL;
3de9fb4c
DJ
4799 size_t bufsize;
4800 int ret;
4801
4802 bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
4803 if (bufsize == -1)
4804 bufsize = 1024;
4805
4806 buf = malloc(bufsize);
4807 if (!buf)
4808 return NULL;
4809
4810 ret = getgrgid_r(getegid(), &grent, buf, bufsize, &grentp);
4811 if (!grentp) {
4812 if (ret == 0)
4813 WARN("Could not find matched group record");
97e9cfa0 4814
55022530 4815 return log_error(NULL, "Failed to get group record - %u", getegid());
3de9fb4c
DJ
4816 }
4817
4f410b2a 4818 return strdup(grent.gr_name);
97e9cfa0
SH
4819}
4820
a96a8e8c 4821/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4822void suggest_default_idmap(void)
4823{
3a6e3bf5 4824 __do_free char *gname = NULL, *line = NULL, *uname = NULL;
4aae564f 4825 __do_fclose FILE *subuid_f = NULL, *subgid_f = NULL;
97e9cfa0 4826 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0
SH
4827 size_t len = 0;
4828
0fd73091
CB
4829 uname = getuname();
4830 if (!uname)
97e9cfa0
SH
4831 return;
4832
0fd73091 4833 gname = getgname();
3a6e3bf5 4834 if (!gname)
97e9cfa0 4835 return;
97e9cfa0 4836
4110345b 4837 subuid_f = fopen(subuidfile, "re");
4aae564f 4838 if (!subuid_f) {
97e9cfa0 4839 ERROR("Your system is not configured with subuids");
97e9cfa0
SH
4840 return;
4841 }
0fd73091 4842
4aae564f 4843 while (getline(&line, &len, subuid_f) != -1) {
0fd73091 4844 char *p, *p2;
b7930180 4845 size_t no_newline = 0;
0fd73091
CB
4846
4847 p = strchr(line, ':');
97e9cfa0
SH
4848 if (*line == '#')
4849 continue;
4850 if (!p)
4851 continue;
4852 *p = '\0';
4853 p++;
0fd73091 4854
97e9cfa0
SH
4855 if (strcmp(line, uname))
4856 continue;
0fd73091 4857
97e9cfa0
SH
4858 p2 = strchr(p, ':');
4859 if (!p2)
4860 continue;
4861 *p2 = '\0';
4862 p2++;
4863 if (!*p2)
4864 continue;
b7930180
CB
4865 no_newline = strcspn(p2, "\n");
4866 p2[no_newline] = '\0';
4867
b7b2fde4 4868 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4869 WARN("Could not parse UID");
b7b2fde4 4870 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4871 WARN("Could not parse UID range");
97e9cfa0 4872 }
97e9cfa0 4873
4110345b 4874 subgid_f = fopen(subgidfile, "re");
4aae564f 4875 if (!subgid_f) {
97e9cfa0 4876 ERROR("Your system is not configured with subgids");
97e9cfa0
SH
4877 return;
4878 }
0fd73091 4879
4aae564f 4880 while (getline(&line, &len, subgid_f) != -1) {
0fd73091 4881 char *p, *p2;
b7930180 4882 size_t no_newline = 0;
0fd73091
CB
4883
4884 p = strchr(line, ':');
97e9cfa0
SH
4885 if (*line == '#')
4886 continue;
4887 if (!p)
4888 continue;
4889 *p = '\0';
4890 p++;
0fd73091 4891
97e9cfa0
SH
4892 if (strcmp(line, uname))
4893 continue;
0fd73091 4894
97e9cfa0
SH
4895 p2 = strchr(p, ':');
4896 if (!p2)
4897 continue;
4898 *p2 = '\0';
4899 p2++;
4900 if (!*p2)
4901 continue;
b7930180
CB
4902 no_newline = strcspn(p2, "\n");
4903 p2[no_newline] = '\0';
4904
b7b2fde4 4905 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4906 WARN("Could not parse GID");
b7b2fde4 4907 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4908 WARN("Could not parse GID range");
97e9cfa0 4909 }
97e9cfa0 4910
97e9cfa0
SH
4911 if (!urange || !grange) {
4912 ERROR("You do not have subuids or subgids allocated");
4913 ERROR("Unprivileged containers require subuids and subgids");
4914 return;
4915 }
4916
4917 ERROR("You must either run as root, or define uid mappings");
4918 ERROR("To pass uid mappings to lxc-create, you could create");
4919 ERROR("~/.config/lxc/default.conf:");
4920 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4921 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4922 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0 4923}
aaf26830 4924
a7307747
SH
4925static void free_cgroup_settings(struct lxc_list *result)
4926{
4927 struct lxc_list *iterator, *next;
4928
0fd73091 4929 lxc_list_for_each_safe (iterator, result, next) {
a7307747 4930 lxc_list_del(iterator);
55022530 4931 free_disarm(iterator);
a7307747 4932 }
55022530 4933 free_disarm(result);
a7307747
SH
4934}
4935
0fd73091 4936/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4937 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4938 */
0fd73091 4939struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4940{
4941 struct lxc_list *result;
aaf26830 4942 struct lxc_cgroup *cg = NULL;
0fd73091 4943 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4944
4945 result = malloc(sizeof(*result));
0fd73091 4946 if (!result)
fac7c663 4947 return NULL;
aaf26830
KT
4948 lxc_list_init(result);
4949
0fd73091
CB
4950 /* Iterate over the cgroup settings and copy them to the output list. */
4951 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4952 item = malloc(sizeof(*item));
fac7c663 4953 if (!item) {
a7307747 4954 free_cgroup_settings(result);
fac7c663
KT
4955 return NULL;
4956 }
0fd73091 4957
aaf26830
KT
4958 item->elem = it->elem;
4959 cg = it->elem;
4960 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4961 /* Store the memsw_limit location */
4962 memsw_limit = item;
0fd73091
CB
4963 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4964 memsw_limit != NULL) {
4965 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4966 * before lxc.cgroup.memory.limit_in_bytes, swap these
4967 * two items */
aaf26830
KT
4968 item->elem = memsw_limit->elem;
4969 memsw_limit->elem = it->elem;
4970 }
4971 lxc_list_add_tail(result, item);
4972 }
4973
4974 return result;
a7307747 4975}