]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
syscall_wrappers: fix PROTECT_OPEN_W macro
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
1d52bdf7 2
d38dd64a
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
9d257a2a 6#include <arpa/inet.h>
8f3e280e
CB
7#include <dirent.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <grp.h>
11#include <inttypes.h>
12#include <libgen.h>
9d257a2a
CB
13#include <linux/loop.h>
14#include <net/if.h>
15#include <netinet/in.h>
8f3e280e
CB
16#include <pwd.h>
17#include <stdarg.h>
0ad19a3f 18#include <stdio.h>
0ad19a3f 19#include <stdlib.h>
0ad19a3f 20#include <string.h>
8f3e280e
CB
21#include <sys/mman.h>
22#include <sys/mount.h>
23#include <sys/param.h>
24#include <sys/prctl.h>
6a49f05e 25#include <sys/sendfile.h>
8f3e280e 26#include <sys/socket.h>
9d257a2a 27#include <sys/stat.h>
2d76d1d7 28#include <sys/syscall.h>
9d257a2a 29#include <sys/sysmacros.h>
97e9cfa0 30#include <sys/types.h>
8f3e280e
CB
31#include <sys/utsname.h>
32#include <sys/wait.h>
9d257a2a
CB
33#include <time.h>
34#include <unistd.h>
1d52bdf7 35
d38dd64a
CB
36#include "af_unix.h"
37#include "caps.h"
5f126977 38#include "cgroups/cgroup.h"
d38dd64a
CB
39#include "conf.h"
40#include "config.h"
41#include "confile.h"
42#include "confile_utils.h"
43#include "error.h"
44#include "log.h"
45#include "lsm/lsm.h"
46#include "lxclock.h"
47#include "lxcseccomp.h"
48#include "macro.h"
2f443e88 49#include "memory_utils.h"
7f88a1a2 50#include "mount_utils.h"
d38dd64a
CB
51#include "namespace.h"
52#include "network.h"
53#include "parse.h"
f40988c7 54#include "process_utils.h"
d38dd64a
CB
55#include "ringbuf.h"
56#include "start.h"
5f126977 57#include "storage/storage.h"
d38dd64a 58#include "storage/overlay.h"
6b3d24d7 59#include "syscall_wrappers.h"
d38dd64a
CB
60#include "terminal.h"
61#include "utils.h"
20502652 62#include "uuid.h"
d38dd64a 63
af6824fc 64#ifdef MAJOR_IN_MKDEV
9d257a2a 65#include <sys/mkdev.h>
af6824fc 66#endif
af6824fc 67
614305f3 68#ifdef HAVE_STATVFS
2938f7c8 69#include <sys/statvfs.h>
614305f3 70#endif
e827ff7e 71
35eb5cdc 72#if HAVE_OPENPTY
b0a33c1e 73#include <pty.h>
e827ff7e
SG
74#else
75#include <../include/openpty.h>
76#endif
0ad19a3f 77
9d257a2a
CB
78#if HAVE_LIBCAP
79#include <sys/capability.h>
80#endif
81
82#if HAVE_SYS_PERSONALITY_H
83#include <sys/personality.h>
84#endif
85
f1e05b90
DJ
86#ifndef HAVE_STRLCAT
87#include "include/strlcat.h"
88#endif
89
9d257a2a
CB
90#if IS_BIONIC
91#include <../include/lxcmntent.h>
92#else
93#include <mntent.h>
94#endif
95
96#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
97#include <../include/prlimit.h>
98#endif
99
ac2cecc4 100lxc_log_define(conf, lxc);
e5bda9ee 101
0fd73091
CB
102/* The lxc_conf of the container currently being worked on in an API call.
103 * This is used in the error calls.
104 */
105#ifdef HAVE_TLS
d7f19646 106thread_local struct lxc_conf *current_config;
0fd73091
CB
107#else
108struct lxc_conf *current_config;
109#endif
8912711c 110
0fd73091
CB
111char *lxchook_names[NUM_LXC_HOOKS] = {
112 "pre-start",
113 "pre-mount",
114 "mount",
115 "autodev",
116 "start",
117 "stop",
118 "post-stop",
119 "clone",
120 "destroy",
121 "start-host"
122};
72d0e1cb 123
998ac676
RT
/*
 * One textual mount option.
 * NOTE(review): judging by the tables built from this type, "clear" marks
 * options that remove "flag" rather than set it - confirm against the parser.
 */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* non-zero for the negated form of an option */
	int flag;	/* MS_* bit(s) the option refers to */
};

/* Maps a capability keyword to its CAP_* number. */
struct caps_opt {
	char *name;
	int value;
};

/* Maps a resource limit keyword to its RLIMIT_* number. */
struct limit_opt {
	char *name;
	int value;
};
139
998ac676 140static struct mount_opt mount_opt[] = {
470b359b
CB
141 { "async", 1, MS_SYNCHRONOUS },
142 { "atime", 1, MS_NOATIME },
143 { "bind", 0, MS_BIND },
88d413d5 144 { "defaults", 0, 0 },
88d413d5 145 { "dev", 1, MS_NODEV },
470b359b 146 { "diratime", 1, MS_NODIRATIME },
88d413d5 147 { "dirsync", 0, MS_DIRSYNC },
470b359b 148 { "exec", 1, MS_NOEXEC },
8912711c 149 { "lazytime", 0, MS_LAZYTIME },
88d413d5 150 { "mand", 0, MS_MANDLOCK },
88d413d5 151 { "noatime", 0, MS_NOATIME },
470b359b 152 { "nodev", 0, MS_NODEV },
88d413d5 153 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
154 { "noexec", 0, MS_NOEXEC },
155 { "nomand", 1, MS_MANDLOCK },
156 { "norelatime", 1, MS_RELATIME },
157 { "nostrictatime", 1, MS_STRICTATIME },
158 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
159 { "rbind", 0, MS_BIND|MS_REC },
160 { "relatime", 0, MS_RELATIME },
470b359b
CB
161 { "remount", 0, MS_REMOUNT },
162 { "ro", 0, MS_RDONLY },
163 { "rw", 1, MS_RDONLY },
88d413d5 164 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
165 { "suid", 1, MS_NOSUID },
166 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 167 { NULL, 0, 0 },
998ac676
RT
168};
169
d840039e 170static struct mount_opt propagation_opt[] = {
0fd73091
CB
171 { "private", 0, MS_PRIVATE },
172 { "shared", 0, MS_SHARED },
173 { "slave", 0, MS_SLAVE },
174 { "unbindable", 0, MS_UNBINDABLE },
175 { "rprivate", 0, MS_PRIVATE|MS_REC },
176 { "rshared", 0, MS_SHARED|MS_REC },
177 { "rslave", 0, MS_SLAVE|MS_REC },
178 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
179 { NULL, 0, 0 },
d840039e
YT
180};
181
81810dd1 182static struct caps_opt caps_opt[] = {
8560cd36 183#if HAVE_LIBCAP
7b4cd468
CB
184 { "chown", CAP_CHOWN },
185 { "dac_override", CAP_DAC_OVERRIDE },
186 { "dac_read_search", CAP_DAC_READ_SEARCH },
187 { "fowner", CAP_FOWNER },
188 { "fsetid", CAP_FSETID },
189 { "kill", CAP_KILL },
190 { "setgid", CAP_SETGID },
191 { "setuid", CAP_SETUID },
192 { "setpcap", CAP_SETPCAP },
193 { "linux_immutable", CAP_LINUX_IMMUTABLE },
194 { "net_bind_service", CAP_NET_BIND_SERVICE },
195 { "net_broadcast", CAP_NET_BROADCAST },
196 { "net_admin", CAP_NET_ADMIN },
197 { "net_raw", CAP_NET_RAW },
198 { "ipc_lock", CAP_IPC_LOCK },
199 { "ipc_owner", CAP_IPC_OWNER },
200 { "sys_module", CAP_SYS_MODULE },
201 { "sys_rawio", CAP_SYS_RAWIO },
202 { "sys_chroot", CAP_SYS_CHROOT },
203 { "sys_ptrace", CAP_SYS_PTRACE },
204 { "sys_pacct", CAP_SYS_PACCT },
205 { "sys_admin", CAP_SYS_ADMIN },
206 { "sys_boot", CAP_SYS_BOOT },
207 { "sys_nice", CAP_SYS_NICE },
208 { "sys_resource", CAP_SYS_RESOURCE },
209 { "sys_time", CAP_SYS_TIME },
210 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
211 { "mknod", CAP_MKNOD },
212 { "lease", CAP_LEASE },
213 { "audit_write", CAP_AUDIT_WRITE },
214 { "audit_control", CAP_AUDIT_CONTROL },
215 { "setfcap", CAP_SETFCAP },
216 { "mac_override", CAP_MAC_OVERRIDE },
217 { "mac_admin", CAP_MAC_ADMIN },
218 { "syslog", CAP_SYSLOG },
219 { "wake_alarm", CAP_WAKE_ALARM },
220 { "block_suspend", CAP_BLOCK_SUSPEND },
221 { "audit_read", CAP_AUDIT_READ },
222 { "perfmon", CAP_PERFMON },
223 { "bpf", CAP_BPF },
224 { "checkpoint_restore", CAP_CHECKPOINT_RESTORE },
2b54359b 225#endif
8560cd36 226};
81810dd1 227
c6d09e15
WB
228static struct limit_opt limit_opt[] = {
229#ifdef RLIMIT_AS
230 { "as", RLIMIT_AS },
231#endif
232#ifdef RLIMIT_CORE
233 { "core", RLIMIT_CORE },
234#endif
235#ifdef RLIMIT_CPU
236 { "cpu", RLIMIT_CPU },
237#endif
238#ifdef RLIMIT_DATA
239 { "data", RLIMIT_DATA },
240#endif
241#ifdef RLIMIT_FSIZE
242 { "fsize", RLIMIT_FSIZE },
243#endif
244#ifdef RLIMIT_LOCKS
245 { "locks", RLIMIT_LOCKS },
246#endif
247#ifdef RLIMIT_MEMLOCK
248 { "memlock", RLIMIT_MEMLOCK },
249#endif
250#ifdef RLIMIT_MSGQUEUE
251 { "msgqueue", RLIMIT_MSGQUEUE },
252#endif
253#ifdef RLIMIT_NICE
254 { "nice", RLIMIT_NICE },
255#endif
256#ifdef RLIMIT_NOFILE
257 { "nofile", RLIMIT_NOFILE },
258#endif
259#ifdef RLIMIT_NPROC
260 { "nproc", RLIMIT_NPROC },
261#endif
262#ifdef RLIMIT_RSS
263 { "rss", RLIMIT_RSS },
264#endif
265#ifdef RLIMIT_RTPRIO
266 { "rtprio", RLIMIT_RTPRIO },
267#endif
268#ifdef RLIMIT_RTTIME
269 { "rttime", RLIMIT_RTTIME },
270#endif
271#ifdef RLIMIT_SIGPENDING
272 { "sigpending", RLIMIT_SIGPENDING },
273#endif
274#ifdef RLIMIT_STACK
275 { "stack", RLIMIT_STACK },
276#endif
277};
278
91c3830e
SH
279static int run_buffer(char *buffer)
280{
cc6a0e78 281 __do_free char *output = NULL;
55022530 282 __do_lxc_pclose struct lxc_popen_FILE *f = NULL;
ebf3a6af 283 int fd, ret;
91c3830e 284
ebec9176 285 f = lxc_popen(buffer);
55022530
CB
286 if (!f)
287 return log_error_errno(-1, errno, "Failed to popen() %s", buffer);
91c3830e
SH
288
289 output = malloc(LXC_LOG_BUFFER_SIZE);
55022530
CB
290 if (!output)
291 return log_error_errno(-1, ENOMEM, "Failed to allocate memory for %s", buffer);
91c3830e 292
ebf3a6af 293 fd = fileno(f->f);
55022530
CB
294 if (fd < 0)
295 return log_error_errno(-1, errno, "Failed to retrieve underlying file descriptor");
ebf3a6af
CB
296
297 for (int i = 0; i < 10; i++) {
298 ssize_t bytes_read;
299
300 bytes_read = lxc_read_nointr(fd, output, LXC_LOG_BUFFER_SIZE - 1);
301 if (bytes_read > 0) {
302 output[bytes_read] = '\0';
303 DEBUG("Script %s produced output: %s", buffer, output);
304 continue;
305 }
306
307 break;
308 }
91c3830e 309
55022530
CB
310 ret = lxc_pclose(move_ptr(f));
311 if (ret == -1)
312 return log_error_errno(-1, errno, "Script exited with error");
313 else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0)
314 return log_error(-1, "Script exited with status %d", WEXITSTATUS(ret));
315 else if (WIFSIGNALED(ret))
316 return log_error(-1, "Script terminated by signal %d", WTERMSIG(ret));
91c3830e
SH
317
318 return 0;
319}
320
14a7b0f9
CB
321int run_script_argv(const char *name, unsigned int hook_version,
322 const char *section, const char *script,
586b1ce7 323 const char *hookname, char **argv)
148e91f5 324{
e1a94937 325 __do_free char *buffer = NULL;
3f60c2f7 326 int buf_pos, i, ret;
d08e5708 327 size_t size = 0;
148e91f5 328
3f60c2f7 329 if (hook_version == 0)
55022530
CB
330 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
331 script, name, section);
3f60c2f7
CB
332 else
333 INFO("Executing script \"%s\" for container \"%s\"", script, name);
148e91f5 334
586b1ce7
CB
335 for (i = 0; argv && argv[i]; i++)
336 size += strlen(argv[i]) + 1;
148e91f5 337
6333c915
CB
338 size += STRLITERALLEN("exec");
339 size++;
148e91f5 340 size += strlen(script);
3f60c2f7
CB
341 size++;
342
148e91f5 343 if (size > INT_MAX)
3f60c2f7 344 return -EFBIG;
148e91f5 345
3f60c2f7 346 if (hook_version == 0) {
d08e5708
CB
347 size += strlen(hookname);
348 size++;
349
350 size += strlen(name);
351 size++;
352
353 size += strlen(section);
354 size++;
355
356 if (size > INT_MAX)
357 return -EFBIG;
327cce76 358 }
3f60c2f7 359
6f8d00d2
CB
360 buffer = malloc(size);
361 if (!buffer)
362 return -ENOMEM;
363
327cce76 364 if (hook_version == 0)
3f60c2f7 365 buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
327cce76 366 else
3f60c2f7 367 buf_pos = snprintf(buffer, size, "exec %s", script);
55022530
CB
368 if (buf_pos < 0 || (size_t)buf_pos >= size)
369 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 370
327cce76 371 if (hook_version == 1) {
3f60c2f7
CB
372 ret = setenv("LXC_HOOK_TYPE", hookname, 1);
373 if (ret < 0) {
55022530 374 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7 375 }
90f20466 376 TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7
CB
377
378 ret = setenv("LXC_HOOK_SECTION", section, 1);
55022530
CB
379 if (ret < 0)
380 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_SECTION=%s", section);
3f60c2f7 381 TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);
14a7b0f9
CB
382
383 if (strcmp(section, "net") == 0) {
384 char *parent;
385
586b1ce7 386 if (!argv || !argv[0])
e1a94937 387 return -1;
14a7b0f9 388
586b1ce7 389 ret = setenv("LXC_NET_TYPE", argv[0], 1);
55022530
CB
390 if (ret < 0)
391 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_TYPE=%s", argv[0]);
586b1ce7 392 TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);
14a7b0f9 393
586b1ce7 394 parent = argv[1] ? argv[1] : "";
14a7b0f9 395
a8144263 396 if (strcmp(argv[0], "macvlan") == 0) {
14a7b0f9 397 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
398 if (ret < 0)
399 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 400 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 401 } else if (strcmp(argv[0], "phys") == 0) {
14a7b0f9 402 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
403 if (ret < 0)
404 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 405 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 406 } else if (strcmp(argv[0], "veth") == 0) {
586b1ce7 407 char *peer = argv[2] ? argv[2] : "";
14a7b0f9
CB
408
409 ret = setenv("LXC_NET_PEER", peer, 1);
55022530
CB
410 if (ret < 0)
411 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PEER=%s", peer);
14a7b0f9
CB
412 TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
413
414 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
415 if (ret < 0)
416 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9
CB
417 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
418 }
419 }
148e91f5
SH
420 }
421
586b1ce7 422 for (i = 0; argv && argv[i]; i++) {
3f60c2f7
CB
423 size_t len = size - buf_pos;
424
586b1ce7 425 ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
55022530
CB
426 if (ret < 0 || (size_t)ret >= len)
427 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 428 buf_pos += ret;
148e91f5
SH
429 }
430
e1a94937 431 return run_buffer(buffer);
148e91f5
SH
432}
433
811ef482 434int run_script(const char *name, const char *section, const char *script, ...)
e3b4c4c4 435{
2f443e88 436 __do_free char *buffer = NULL;
abbfd20b 437 int ret;
2f443e88 438 char *p;
abbfd20b 439 va_list ap;
0fd73091 440 size_t size = 0;
751d9dcd 441
0fd73091 442 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
751d9dcd 443 script, name, section);
e3b4c4c4 444
abbfd20b
DL
445 va_start(ap, script);
446 while ((p = va_arg(ap, char *)))
95642a10 447 size += strlen(p) + 1;
abbfd20b
DL
448 va_end(ap);
449
6333c915 450 size += STRLITERALLEN("exec");
abbfd20b
DL
451 size += strlen(script);
452 size += strlen(name);
453 size += strlen(section);
6d1a5f93 454 size += 4;
abbfd20b 455
95642a10
MS
456 if (size > INT_MAX)
457 return -1;
458
2f443e88 459 buffer = must_realloc(NULL, size);
6d1a5f93 460 ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
0fd73091 461 if (ret < 0 || ret >= size)
9ba8130c 462 return -1;
751d9dcd 463
abbfd20b 464 va_start(ap, script);
9ba8130c 465 while ((p = va_arg(ap, char *))) {
062b72c6 466 int len = size - ret;
9ba8130c
SH
467 int rc;
468 rc = snprintf(buffer + ret, len, " %s", p);
7b5a2435
DJ
469 if (rc < 0 || rc >= len) {
470 va_end(ap);
9ba8130c 471 return -1;
7b5a2435 472 }
9ba8130c
SH
473 ret += rc;
474 }
abbfd20b 475 va_end(ap);
751d9dcd 476
91c3830e 477 return run_buffer(buffer);
e3b4c4c4
ST
478}
479
0fd73091 480/* pin_rootfs
63fc76c3 481 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
482 * the duration of the container run, to prevent the container from marking
483 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
484 * no name pollution is happens.
485 * don't unlink on NFS to avoid random named stale handles.
0c547523
SH
486 * return -1 on error.
487 * return -2 if nothing needed to be pinned.
488 * return an open fd (>=0) if we pinned it.
489 */
490int pin_rootfs(const char *rootfs)
491{
957c4704 492 __do_free char *absrootfs = NULL;
0fd73091 493 int fd, ret;
6b5a54cd 494 char absrootfspin[PATH_MAX];
0c547523 495 struct stat s;
63fc76c3 496 struct statfs sfs;
0c547523 497
e99ee0de 498 if (rootfs == NULL || strlen(rootfs) == 0)
0d03360a 499 return -2;
e99ee0de 500
74e7b662 501 absrootfs = realpath(rootfs, NULL);
502 if (!absrootfs)
9be53773 503 return -2;
0c547523 504
0fd73091 505 ret = stat(absrootfs, &s);
957c4704 506 if (ret < 0)
0c547523 507 return -1;
0c547523 508
957c4704 509 if (!S_ISDIR(s.st_mode))
0c547523
SH
510 return -2;
511
55022530
CB
512 ret = snprintf(absrootfspin, sizeof(absrootfspin), "%s/.lxc-keep", absrootfs);
513 if (ret < 0 || (size_t)ret >= sizeof(absrootfspin))
0c547523 514 return -1;
0c547523 515
55022530 516 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR | O_CLOEXEC);
b7ed4bf0
CS
517 if (fd < 0)
518 return fd;
0fd73091 519
205fc010
CB
520 ret = fstatfs (fd, &sfs);
521 if (ret < 0)
522 return fd;
63fc76c3 523
55022530
CB
524 if (sfs.f_type == NFS_SUPER_MAGIC)
525 return log_debug(fd, "Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
63fc76c3 526
b7ed4bf0 527 (void)unlink(absrootfspin);
0fd73091 528
0c547523
SH
529 return fd;
530}
531
0fd73091
CB
/*
 * Compute the mount flags to use so that restrictions already present on an
 * existing mount are honored.
 *
 * The mount at @s (or at @d when @s is NULL) is inspected with statvfs().
 * Its access-time related flags are always carried over; nosuid, nodev,
 * read-only and noexec are carried over only when @flags contains MS_REMOUNT.
 *
 * Returns @flags with the carried-over bits added. @flags is returned
 * unchanged when both paths are NULL, when statvfs() fails, or when built
 * without HAVE_STATVFS.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long remount_only[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	static const unsigned long always[] = {
		MS_NOATIME, MS_NODIRATIME, MS_LAZYTIME, MS_RELATIME, MS_STRICTATIME,
	};
	const char *path = s ? s : d;
	unsigned long carried = 0;
	struct statvfs vfs;
	size_t k;

	if (!path)
		return flags;

	if (statvfs(path, &vfs) < 0)
		return flags;

	if (flags & MS_REMOUNT) {
		for (k = 0; k < sizeof(remount_only) / sizeof(remount_only[0]); k++) {
			if (vfs.f_flag & remount_only[k])
				carried |= remount_only[k];
		}
	}

	for (k = 0; k < sizeof(always) / sizeof(always[0]); k++) {
		if (vfs.f_flag & always[k])
			carried |= always[k];
	}

	return flags | carried;
#else
	return flags;
#endif
}
580
6b741397
CB
581static int add_shmount_to_list(struct lxc_conf *conf)
582{
6b5a54cd 583 char new_mount[PATH_MAX];
0d190408 584 /* Offset for the leading '/' since the path_cont
6b741397
CB
585 * is absolute inside the container.
586 */
587 int offset = 1, ret = -1;
0d190408 588
6b741397
CB
589 ret = snprintf(new_mount, sizeof(new_mount),
590 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
591 conf->shmount.path_cont + offset);
60534030 592 if (ret < 0 || (size_t)ret >= sizeof(new_mount))
0d190408
LT
593 return -1;
594
6b741397 595 return add_elem_to_mount_list(new_mount, conf);
0d190408
LT
596}
597
4fb3cba5 598static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 599{
7b371c1e 600 int i, ret;
b06b8511
CS
601 static struct {
602 int match_mask;
603 int match_flag;
604 const char *source;
605 const char *destination;
606 const char *fstype;
607 unsigned long flags;
608 const char *options;
e8b9c9ec 609 bool requires_cap_net_admin;
b06b8511 610 } default_mounts[] = {
0fd73091
CB
611 /* Read-only bind-mounting... In older kernels, doing that
612 * required to do one MS_BIND mount and then
613 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
614 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
615 * onwards. However, this apparently does not work on kernel
616 * 3.8. Unfortunately, on that very same kernel, doing the same
617 * trick as above doesn't seem to work either, there one needs
618 * to ALSO specify MS_BIND for the remount, otherwise the
619 * entire fs is remounted read-only or the mount fails because
620 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
621 * kernels as low as 2.6.32...
368bbc02 622 */
5d1bf4c4 623 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
592fd47a 624 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
5d1bf4c4
CB
625 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL, true },
626 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL, false },
627 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
628 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL, true },
629 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL, false },
630 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
631 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
632 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL, false },
633 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL, false },
634 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
635 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL, false },
636 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
637 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL, false },
638 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
639 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
640 { 0, 0, NULL, NULL, NULL, 0, NULL, false }
b06b8511 641 };
e25af1bc
CB
642 struct lxc_rootfs *rootfs = &conf->rootfs;
643 bool has_cap_net_admin;
368bbc02 644
f4bea7cc
CB
645 if (flags & LXC_AUTO_PROC_MASK) {
646 ret = mkdirat(rootfs->mntpt_fd, "proc" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
647 if (ret < 0 && errno != EEXIST)
648 return log_error_errno(-errno, errno,
649 "Failed to create proc mountpoint under %d", rootfs->mntpt_fd);
650 }
651
652 if (flags & LXC_AUTO_SYS_MASK) {
653 ret = mkdirat(rootfs->mntpt_fd, "sys" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
654 if (ret < 0 && errno != EEXIST)
655 return log_error_errno(-errno, errno,
656 "Failed to create sysfs mountpoint under %d", rootfs->mntpt_fd);
657 }
658
e25af1bc 659 has_cap_net_admin = lxc_wants_cap(CAP_NET_ADMIN, conf);
d84b26bc 660 for (i = 0; default_mounts[i].match_mask; i++) {
8db92302 661 __do_free char *destination = NULL, *source = NULL;
0fd73091
CB
662 int saved_errno;
663 unsigned long mflags;
0fd73091
CB
664 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
665 continue;
666
667 if (default_mounts[i].source) {
cc4fd506 668 /* will act like strdup if %r is not present */
e25af1bc 669 source = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].source);
0fd73091 670 if (!source)
cc4fd506 671 return -1;
0fd73091 672 }
f24a52d5 673
55022530
CB
674 if (!default_mounts[i].destination)
675 return log_error(-1, "BUG: auto mounts destination %d was NULL", i);
0fd73091 676
e8b9c9ec 677 if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
678 TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
679 continue;
680 }
681
0fd73091 682 /* will act like strdup if %r is not present */
e25af1bc 683 destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
55022530 684 if (!destination)
0fd73091 685 return -1;
0fd73091
CB
686
687 mflags = add_required_remount_flags(source, destination,
688 default_mounts[i].flags);
7b371c1e
CB
689 ret = safe_mount(source, destination, default_mounts[i].fstype,
690 mflags, default_mounts[i].options,
691 rootfs->path ? rootfs->mount : NULL);
0fd73091 692 saved_errno = errno;
7b371c1e 693 if (ret < 0 && errno == ENOENT) {
55022530 694 INFO("Mount source or target for \"%s\" on \"%s\" does not exist. Skipping", source, destination);
7b371c1e
CB
695 ret = 0;
696 } else if (ret < 0) {
0fd73091
CB
697 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
698 }
699
7b371c1e 700 if (ret < 0) {
0fd73091
CB
701 errno = saved_errno;
702 return -1;
368bbc02 703 }
368bbc02
CS
704 }
705
b06b8511 706 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
707 int cg_flags;
708
3f69fb12 709 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
710 /* If the type of cgroup mount was not specified, it depends on
711 * the container's capabilities as to what makes sense: if we
712 * have CAP_SYS_ADMIN, the read-only part can be remounted
713 * read-write anyway, so we may as well default to read-write;
714 * then the admin will not be given a false sense of security.
715 * (And if they really want mixed r/o r/w, then they can
716 * explicitly specify :mixed.) OTOH, if the container lacks
717 * CAP_SYS_ADMIN, do only default to :mixed, because then the
718 * container can't remount it read-write.
719 */
0769b82a
CS
720 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
721 int has_sys_admin = 0;
b0ee5983
CB
722
723 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 724 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 725 else
0769b82a 726 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
727
728 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 729 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 730 else
0769b82a 731 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 732 }
0fd73091 733
3f69fb12 734 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
735 cg_flags |= LXC_AUTO_CGROUP_FORCE;
736
315f8a4e 737 if (!handler->cgroup_ops->mount(handler->cgroup_ops, conf, cg_flags))
55022530 738 return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
368bbc02
CS
739 }
740
0d190408 741 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
7b371c1e 742 ret = add_shmount_to_list(conf);
55022530
CB
743 if (ret < 0)
744 return log_error(-1, "Failed to add shmount entry to container config");
0d190408
LT
745 }
746
368bbc02 747 return 0;
368bbc02
CS
748}
749
/*
 * Set the hostname to utsname->nodename.
 * A NULL @utsname means there is nothing to configure.
 * Returns 0 on success and -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	if (utsname) {
		const char *hostname = utsname->nodename;

		if (sethostname(hostname, strlen(hostname)) < 0)
			return log_error_errno(-1, errno, "Failed to set the hostname to \"%s\"",
					       hostname);

		INFO("Set hostname to \"%s\"", hostname);
	}

	return 0;
}
766
69aa6655
DE
/* A symlink to create in the container's /dev: @name points at @oldpath. */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* The standard /proc/self/fd based links. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
778
ed8704d0 779static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 780{
79019997
CB
781 for (int i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
782 int ret;
783 struct stat s;
69aa6655 784 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091 785
79019997
CB
786 /*
787 * Stat the path first. If we don't get an error accept it as
0fd73091 788 * is and don't try to create it
09227be2 789 */
79019997 790 ret = fstatat(rootfs->dev_mntpt_fd, d->name, &s, 0);
0fd73091 791 if (ret == 0)
09227be2 792 continue;
09227be2 793
79019997
CB
794 ret = symlinkat(d->oldpath, rootfs->dev_mntpt_fd, d->name);
795 if (ret) {
796 switch (errno) {
797 case EROFS:
798 WARN("Failed to create \"%s\" on read-only filesystem", d->name);
799 __fallthrough;
800 case EEXIST:
801 break;
802 default:
803 return log_error_errno(-errno, errno, "Failed to create \"%s\"", d->name);
804 }
69aa6655
DE
805 }
806 }
0fd73091 807
69aa6655
DE
808 return 0;
809}
810
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * If *pp is NULL a new string "container_ttys=<name>" is allocated;
 * otherwise " <name>" is appended to the existing string, which is grown
 * with realloc(). Returns false on allocation failure, in which case *pp is
 * left as it was.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (!*pp) {
		char *fresh = malloc(name_len + prefix_len + 1);

		if (!fresh)
			return false;

		memcpy(fresh, prefix, prefix_len);
		/* Copies the terminating NUL byte of name as well. */
		memcpy(fresh + prefix_len, name, name_len + 1);
		*pp = fresh;
		return true;
	}

	cur_len = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	grown = realloc(*pp, cur_len + name_len + 2);
	if (!grown)
		return false;

	*pp = grown;
	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);

	return true;
}
837
2187efd3 838static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 839{
9e1045e3 840 int i, ret;
0e4be3cf 841 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 842 char *ttydir = ttys->dir;
6b5a54cd 843 char path[PATH_MAX], lxcpath[PATH_MAX];
b0a33c1e 844
e8bd4e43 845 if (!conf->rootfs.path)
bc9bd0e3
DL
846 return 0;
847
885766f5 848 for (i = 0; i < ttys->max; i++) {
0e4be3cf 849 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 850
e8bd4e43 851 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 852 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 853 return -1;
9e1045e3 854
7c6ef2a2
SH
855 if (ttydir) {
856 /* create dev/lxc/tty%d" */
9e1045e3
CB
857 ret = snprintf(lxcpath, sizeof(lxcpath),
858 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 859 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 860 return -1;
9e1045e3 861
adc1c715 862 ret = mknod(lxcpath, S_IFREG | 0000, 0);
9e1045e3 863 if (ret < 0 && errno != EEXIST) {
73363c61 864 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
865 return -1;
866 }
9e1045e3 867
7c6ef2a2 868 ret = unlink(path);
9e1045e3 869 if (ret < 0 && errno != ENOENT) {
73363c61 870 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
871 return -1;
872 }
b0a33c1e 873
2520facd 874 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
9e1045e3 875 if (ret < 0) {
55022530 876 SYSWARN("Failed to bind mount \"%s\" onto \"%s\"", tty->name, lxcpath);
7c6ef2a2
SH
877 continue;
878 }
55022530 879 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, lxcpath);
13954cce 880
9e1045e3
CB
881 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
882 ttydir, i + 1);
73363c61 883 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 884 return -1;
9e1045e3 885
7c6ef2a2 886 ret = symlink(lxcpath, path);
55022530
CB
887 if (ret < 0)
888 return log_error_errno(-1, errno, "Failed to create symlink \"%s\" -> \"%s\"", path, lxcpath);
7c6ef2a2 889 } else {
9e1045e3
CB
890 /* If we populated /dev, then we need to create
891 * /dev/ttyN
892 */
d3ccc04e
CB
893 ret = mknod(path, S_IFREG | 0000, 0);
894 if (ret < 0) /* this isn't fatal, continue */
6d1400b5 895 SYSERROR("Failed to create \"%s\"", path);
9e1045e3 896
2520facd 897 ret = mount(tty->name, path, "none", MS_BIND, 0);
9e1045e3 898 if (ret < 0) {
2520facd 899 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
7c6ef2a2
SH
900 continue;
901 }
9e1045e3 902
d3ccc04e 903 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
393903d1 904 }
9e1045e3 905
55022530
CB
906 if (!append_ttyname(&conf->ttys.tty_names, tty->name))
907 return log_error(-1, "Error setting up container_ttys string");
b0a33c1e 908 }
909
885766f5 910 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 911 return 0;
912}
913
586a3fe8
CB
914define_cleanup_function(struct lxc_tty_info *, lxc_delete_tty);
915
59eac805 916static int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 917{
586a3fe8 918 struct lxc_terminal_info *tty_new = NULL;
fca23691 919 int ret;
586a3fe8 920 call_cleaner(lxc_delete_tty) struct lxc_tty_info *ttys = &conf->ttys;
2187efd3
CB
921
922 /* no tty in the configuration */
885766f5 923 if (ttys->max == 0)
2187efd3
CB
924 return 0;
925
55022530
CB
926 tty_new = malloc(sizeof(struct lxc_terminal_info) * ttys->max);
927 if (!tty_new)
2187efd3 928 return -ENOMEM;
55022530 929 ttys->tty = tty_new;
2187efd3 930
55022530 931 for (size_t i = 0; i < ttys->max; i++) {
0e4be3cf 932 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 933
36a94ce8 934 tty->ptx = -EBADF;
41808e20
CB
935 tty->pty = -EBADF;
936 ret = openpty(&tty->ptx, &tty->pty, NULL, NULL, NULL);
77a39805 937 if (ret < 0) {
885766f5 938 ttys->max = i;
55022530 939 return log_error_errno(-ENOTTY, ENOTTY, "Failed to create tty %zu", i);
2187efd3
CB
940 }
941
41808e20 942 ret = ttyname_r(tty->pty, tty->name, sizeof(tty->name));
77a39805 943 if (ret < 0) {
77a39805 944 ttys->max = i;
41808e20 945 return log_error_errno(-ENOTTY, ENOTTY, "Failed to retrieve name of tty %zu pty", i);
77a39805
CB
946 }
947
41808e20
CB
948 DEBUG("Created tty \"%s\" with ptx fd %d and pty fd %d",
949 tty->name, tty->ptx, tty->pty);
2187efd3
CB
950
951 /* Prevent leaking the file descriptors to the container */
36a94ce8 952 ret = fd_cloexec(tty->ptx, true);
2187efd3 953 if (ret < 0)
36a94ce8
CB
954 SYSWARN("Failed to set FD_CLOEXEC flag on ptx fd %d of tty device \"%s\"",
955 tty->ptx, tty->name);
2187efd3 956
41808e20 957 ret = fd_cloexec(tty->pty, true);
2187efd3 958 if (ret < 0)
41808e20
CB
959 SYSWARN("Failed to set FD_CLOEXEC flag on pty fd %d of tty device \"%s\"",
960 tty->pty, tty->name);
2187efd3 961
7581d645 962 tty->busy = -1;
2187efd3
CB
963 }
964
885766f5 965 INFO("Finished creating %zu tty devices", ttys->max);
586a3fe8 966 move_ptr(ttys);
2187efd3
CB
967 return 0;
968}
969
0e4be3cf 970void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3 971{
386e6768
CB
972 if (!ttys->tty)
973 return;
974
55022530 975 for (int i = 0; i < ttys->max; i++) {
0e4be3cf 976 struct lxc_terminal_info *tty = &ttys->tty[i];
36a94ce8 977 close_prot_errno_disarm(tty->ptx);
41808e20 978 close_prot_errno_disarm(tty->pty);
2187efd3
CB
979 }
980
55022530 981 free_disarm(ttys->tty);
2187efd3
CB
982}
983
984static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
985{
986 int i;
0fd73091 987 int ret = -1;
2187efd3 988 struct lxc_conf *conf = handler->conf;
0e4be3cf 989 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 990 int sock = handler->data_sock[0];
2187efd3 991
885766f5 992 if (ttys->max == 0)
2187efd3
CB
993 return 0;
994
885766f5 995 for (i = 0; i < ttys->max; i++) {
2187efd3 996 int ttyfds[2];
0e4be3cf 997 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 998
36a94ce8 999 ttyfds[0] = tty->ptx;
41808e20 1000 ttyfds[1] = tty->pty;
2187efd3
CB
1001
1002 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1003 if (ret < 0)
1004 break;
1005
41808e20
CB
1006 TRACE("Sent tty \"%s\" with ptx fd %d and pty fd %d to parent",
1007 tty->name, tty->ptx, tty->pty);
2187efd3
CB
1008 }
1009
1010 if (ret < 0)
6d1400b5 1011 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1012 else
885766f5 1013 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1014
1015 return ret;
1016}
1017
1018static int lxc_create_ttys(struct lxc_handler *handler)
1019{
1020 int ret = -1;
1021 struct lxc_conf *conf = handler->conf;
1022
663014ee 1023 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1024 if (ret < 0) {
1025 ERROR("Failed to allocate ttys");
1026 goto on_error;
1027 }
1028
1029 ret = lxc_send_ttys_to_parent(handler);
1030 if (ret < 0) {
1031 ERROR("Failed to send ttys to parent");
1032 goto on_error;
1033 }
1034
1035 if (!conf->is_execute) {
1036 ret = lxc_setup_ttys(conf);
1037 if (ret < 0) {
1038 ERROR("Failed to setup ttys");
1039 goto on_error;
1040 }
1041 }
1042
885766f5
CB
1043 if (conf->ttys.tty_names) {
1044 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1045 if (ret < 0)
885766f5 1046 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1047 }
1048
1049 ret = 0;
1050
1051on_error:
0e4be3cf 1052 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1053
1054 return ret;
1055}
1056
7133b912
CB
1057/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1058 * error, log it but don't fail yet.
91c3830e 1059 */
7133b912 1060static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
63012bdd 1061 int autodevtmpfssize, const char *lxcpath)
91c3830e 1062{
ee8eeba8 1063 const char *path = rootfs->path ? rootfs->mount : NULL;
91c3830e 1064 int ret;
87e0e273 1065 mode_t cur_mask;
63012bdd 1066 char mount_options[128];
91c3830e 1067
7133b912 1068 INFO("Preparing \"/dev\"");
bc6928ff 1069
63012bdd
CK
1070 sprintf(mount_options, "size=%d,mode=755", (autodevtmpfssize != 0) ? autodevtmpfssize : 500000);
1071 DEBUG("Using mount options: %s", mount_options);
bc6928ff 1072
87e0e273 1073 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
ae9215cf 1074 ret = mkdirat(rootfs->mntpt_fd, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
87e0e273
CB
1075 if (ret < 0 && errno != EEXIST) {
1076 SYSERROR("Failed to create \"/dev\" directory");
1077 ret = -errno;
1078 goto reset_umask;
bc6928ff 1079 }
87da4ec3 1080
ae9215cf 1081 ret = safe_mount_beneath_at(rootfs->mntpt_fd, "none", "dev", "tmpfs", 0, mount_options);
7133b912 1082 if (ret < 0) {
ee8eeba8 1083 __do_free char *fallback_path = NULL;
87da4ec3 1084
ee8eeba8
CB
1085 if (errno != ENOSYS) {
1086 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1087 goto reset_umask;
1088 }
1089
1090 if (path) {
1091 fallback_path = must_make_path(path, "/dev", NULL);
1092 ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path);
1093 } else {
1094 ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL);
1095 }
1096 if (ret < 0) {
1097 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1098 goto reset_umask;
1099 }
87e0e273 1100 }
ee8eeba8 1101 TRACE("Mounted tmpfs on \"%s\"", path);
87da4ec3 1102
7133b912 1103 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1104 * If not, then create it and exit if that fails...
1105 */
ae9215cf 1106 ret = mkdirat(rootfs->mntpt_fd, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
87e0e273
CB
1107 if (ret < 0 && errno != EEXIST) {
1108 SYSERROR("Failed to create directory \"%s\"", path);
1109 ret = -errno;
1110 goto reset_umask;
91c3830e
SH
1111 }
1112
87e0e273
CB
1113 ret = 0;
1114
1115reset_umask:
1116 (void)umask(cur_mask);
1117
7133b912 1118 INFO("Prepared \"/dev\"");
87e0e273 1119 return ret;
91c3830e
SH
1120}
1121
/* Description of one device node that is created in the container's /dev. */
struct lxc_device_node {
	/* file name of the node below /dev */
	const char *name;
	/* file type and permission bits handed to mknodat() */
	const mode_t mode;
	/* device major number */
	const int maj;
	/* device minor number */
	const int min;
};
1128
5e73416f 1129static const struct lxc_device_node lxc_devices[] = {
06749971 1130 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
5e73416f 1131 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
06749971
CB
1132 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1133 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
5e73416f
CB
1134 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1135 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
c6883f38
SH
1136};
1137
5067e4dd
CB
1138
/*
 * Strategy state used while populating /dev. The numeric order matters:
 * lxc_fill_autodev() keeps calling mknodat() for every state that compares
 * greater than or equal to LXC_DEVNODE_MKNOD.
 */
enum {
	/* mknodat() is not permitted: bind-mount the nodes from the host */
	LXC_DEVNODE_BIND    = 0,
	/* initial state: try to create device nodes with mknodat() */
	LXC_DEVNODE_MKNOD   = 1,
	/* nodes can be created but not opened: use them as bind-mount targets */
	LXC_DEVNODE_PARTIAL = 2,
	/* nodes can be created and opened: nothing else to do */
	LXC_DEVNODE_OPEN    = 3,
};
1145
27245ff7 1146static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38 1147{
5e73416f 1148 int i, ret;
3a32201c 1149 mode_t cmask;
5067e4dd 1150 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1151
81498328
CB
1152 if (rootfs->dev_mntpt_fd < 0)
1153 return log_info(0, "No /dev directory found, skipping setup");
d43d5191 1154
3999be0a
CB
1155 INFO("Populating \"/dev\"");
1156
3a32201c 1157 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f 1158 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
927ea337 1159 char device_path[PATH_MAX];
5e73416f 1160 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1161
5067e4dd 1162 if (use_mknod >= LXC_DEVNODE_MKNOD) {
81498328 1163 ret = mknodat(rootfs->dev_mntpt_fd, device->name, device->mode, makedev(device->maj, device->min));
5e73416f 1164 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
d43d5191 1165 DEBUG("Created device node \"%s\"", device->name);
5067e4dd 1166 } else if (ret < 0) {
55022530 1167 if (errno != EPERM)
d43d5191 1168 return log_error_errno(-1, errno, "Failed to create device node \"%s\"", device->name);
0bbf8572 1169
5067e4dd 1170 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1171 }
3999be0a 1172
5067e4dd
CB
1173 /* Device nodes are fully useable. */
1174 if (use_mknod == LXC_DEVNODE_OPEN)
1175 continue;
1176
1177 if (use_mknod == LXC_DEVNODE_MKNOD) {
d43d5191 1178 __do_close int fd = -EBADF;
5067e4dd
CB
1179 /* See
1180 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1181 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1182 */
81498328 1183 fd = open_at(rootfs->dev_mntpt_fd, device->name, PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
d43d5191 1184 if (fd >= 0) {
5067e4dd
CB
1185 /* Device nodes are fully useable. */
1186 use_mknod = LXC_DEVNODE_OPEN;
1187 continue;
1188 }
1189
d43d5191 1190 SYSTRACE("Failed to open \"%s\" device", device->name);
5067e4dd
CB
1191 /* Device nodes are only partially useable. */
1192 use_mknod = LXC_DEVNODE_PARTIAL;
1193 }
5e73416f
CB
1194 }
1195
5067e4dd
CB
1196 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1197 /* If we are dealing with partially functional device
1198 * nodes the prio mknod() call will have created the
1199 * device node so we can use it as a bind-mount target.
1200 */
81498328 1201 ret = mknodat(rootfs->dev_mntpt_fd, device->name, S_IFREG | 0000, 0);
55022530 1202 if (ret < 0 && errno != EEXIST)
d43d5191 1203 return log_error_errno(-1, errno, "Failed to create file \"%s\"", device->name);
5e73416f
CB
1204 }
1205
1206 /* Fallback to bind-mounting the device from the host. */
927ea337
CB
1207 ret = snprintf(device_path, sizeof(device_path), "dev/%s", device->name);
1208 if (ret < 0 || (size_t)ret >= sizeof(device_path))
b41ff502 1209 return ret_errno(EIO);
5e73416f 1210
927ea337
CB
1211 ret = mount_from_at(rootfs->dfd_root_host, device_path,
1212 PROTECT_OPATH_FILE,
1213 PROTECT_LOOKUP_BENEATH_XDEV,
1214 rootfs->dev_mntpt_fd, device->name,
1215 PROTECT_OPATH_FILE,
1216 PROTECT_LOOKUP_BENEATH,
1217 NULL /* fstype */,
1218 MS_BIND /* mount flags */,
1219 NULL);
d43d5191 1220 if (ret < 0) {
927ea337
CB
1221 char path[PATH_MAX];
1222
1223 if (errno != ENOSYS)
1224 return log_error_errno(-errno, errno,
1225 "Failed to mount %d(%s) to %d(%s)",
1226 rootfs->dfd_root_host,
1227 device_path,
1228 rootfs->dev_mntpt_fd,
1229 device->name);
1230
1231 ret = snprintf(device_path, sizeof(device_path), "/dev/%s", device->name);
1232 if (ret < 0 || (size_t)ret >= sizeof(device_path))
1233 return ret_errno(EIO);
1234
1235 ret = snprintf(path, sizeof(path), "%s/dev/%s", get_rootfs_mnt(rootfs), device->name);
1236 if (ret < 0 || ret >= sizeof(path))
1237 return log_error(-1, "Failed to create device path for %s", device->name);
1238
1239 ret = safe_mount(device_path, path, 0, MS_BIND, NULL, get_rootfs_mnt(rootfs));
1240 if (ret < 0)
1241 return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" to \"%s\"", device_path, path);
1242
1243 DEBUG("Bind mounted host device node \"%s\" to \"%s\"", device_path, path);
1244 continue;
d43d5191 1245 }
927ea337 1246 DEBUG("Bind mounted host device %d(%s) to %d(%s)", rootfs->dfd_root_host, device_path, rootfs->dev_mntpt_fd, device->name);
c6883f38 1247 }
5e73416f 1248 (void)umask(cmask);
c6883f38 1249
3999be0a 1250 INFO("Populated \"/dev\"");
c6883f38
SH
1251 return 0;
1252}
1253
8ce1abc2 1254static int lxc_mount_rootfs(struct lxc_conf *conf)
0ad19a3f 1255{
9aa76a17 1256 int ret;
10bc1861 1257 struct lxc_storage *bdev;
31f8b2fd 1258 struct lxc_rootfs *rootfs = &conf->rootfs;
cc28d0b0 1259
a0f379bf 1260 if (!rootfs->path) {
0fd73091 1261 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
55022530 1262 if (ret < 0)
9e61fb1f 1263 return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
0fd73091 1264
ccf53741 1265 rootfs->mntpt_fd = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
31f8b2fd
CB
1266 if (rootfs->mntpt_fd < 0)
1267 return -errno;
1268
c69bd12f 1269 return 0;
a0f379bf 1270 }
0ad19a3f 1271
0fd73091 1272 ret = access(rootfs->mount, F_OK);
55022530
CB
1273 if (ret != 0)
1274 return log_error_errno(-1, errno, "Failed to access to \"%s\". Check it is present",
1275 rootfs->mount);
b1789442 1276
8a388ed4 1277 bdev = storage_init(conf);
55022530
CB
1278 if (!bdev)
1279 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1280 rootfs->path, rootfs->mount,
1281 rootfs->options ? rootfs->options : "(null)");
9aa76a17
CB
1282
1283 ret = bdev->ops->mount(bdev);
10bc1861 1284 storage_put(bdev);
55022530
CB
1285 if (ret < 0)
1286 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1287 rootfs->path, rootfs->mount,
1288 rootfs->options ? rootfs->options : "(null)");
0ad19a3f 1289
0fd73091 1290 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1291 rootfs->path, rootfs->mount,
1292 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1293
99ca5632 1294 rootfs->mntpt_fd = open_at(-EBADF, rootfs->mount, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
31f8b2fd
CB
1295 if (rootfs->mntpt_fd < 0)
1296 return -errno;
1297
ac778708
DL
1298 return 0;
1299}
1300
59eac805 1301static int lxc_chroot(const struct lxc_rootfs *rootfs)
91e93c71 1302{
b8d88764 1303 __do_free char *nroot = NULL;
0fd73091 1304 int i, ret;
8ce1abc2 1305 char *root = rootfs->mount;
91e93c71 1306
74e7b662 1307 nroot = realpath(root, NULL);
55022530
CB
1308 if (!nroot)
1309 return log_error_errno(-1, errno, "Failed to resolve \"%s\"", root);
91e93c71 1310
0fd73091 1311 ret = chdir("/");
b8d88764 1312 if (ret < 0)
0fd73091 1313 return -1;
91e93c71 1314
0fd73091
CB
1315 /* We could use here MS_MOVE, but in userns this mount is locked and
1316 * can't be moved.
91e93c71 1317 */
8ce1abc2 1318 ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
55022530
CB
1319 if (ret < 0)
1320 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);
91e93c71 1321
0fd73091 1322 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
55022530
CB
1323 if (ret < 0)
1324 return log_error_errno(-1, errno, "Failed to remount \"/\"");
91e93c71 1325
aa899945 1326 /* The following code cleans up inherited mounts which are not required
0fd73091 1327 * for CT.
91e93c71
AV
1328 *
1329 * The mountinfo file shows not all mounts, if a few points have been
1330 * unmounted between read operations from the mountinfo. So we need to
1331 * read mountinfo a few times.
1332 *
7ded5fa7 1333 * This loop can be skipped if a container uses userns, because all
91e93c71
AV
1334 * inherited mounts are locked and we should live with all this trash.
1335 */
0fd73091 1336 for (;;) {
4fdd1f72 1337 __do_fclose FILE *f = NULL;
f3d38164
CB
1338 __do_free char *line = NULL;
1339 char *slider1, *slider2;
91e93c71 1340 int progress = 0;
f3d38164 1341 size_t len = 0;
91e93c71 1342
4110345b 1343 f = fopen("./proc/self/mountinfo", "re");
55022530
CB
1344 if (!f)
1345 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
0fd73091 1346
f3d38164
CB
1347 while (getline(&line, &len, f) > 0) {
1348 for (slider1 = line, i = 0; slider1 && i < 4; i++)
1349 slider1 = strchr(slider1 + 1, ' ');
0fd73091 1350
f3d38164 1351 if (!slider1)
91e93c71 1352 continue;
0fd73091 1353
f3d38164
CB
1354 slider2 = strchr(slider1 + 1, ' ');
1355 if (!slider2)
91e93c71
AV
1356 continue;
1357
f3d38164
CB
1358 *slider2 = '\0';
1359 *slider1 = '.';
91e93c71 1360
f3d38164 1361 if (strcmp(slider1 + 1, "/") == 0)
91e93c71 1362 continue;
0fd73091 1363
f3d38164 1364 if (strcmp(slider1 + 1, "/proc") == 0)
91e93c71
AV
1365 continue;
1366
f3d38164 1367 ret = umount2(slider1, MNT_DETACH);
0fd73091 1368 if (ret == 0)
91e93c71
AV
1369 progress++;
1370 }
0fd73091 1371
91e93c71
AV
1372 if (!progress)
1373 break;
1374 }
1375
7ded5fa7 1376 /* This also can be skipped if a container uses userns. */
0fd73091 1377 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1378
1379 /* It is weird, but chdir("..") moves us in a new root */
0fd73091 1380 ret = chdir("..");
55022530
CB
1381 if (ret < 0)
1382 return log_error_errno(-1, errno, "Failed to chdir(\"..\")");
91e93c71 1383
0fd73091 1384 ret = chroot(".");
55022530
CB
1385 if (ret < 0)
1386 return log_error_errno(-1, errno, "Failed to chroot(\".\")");
91e93c71
AV
1387
1388 return 0;
1389}
1390
8ce1abc2
CB
1391/* (The following explanation is copied verbatim from the kernel.)
1392 *
1393 * pivot_root Semantics:
1394 * Moves the root file system of the current process to the directory put_old,
1395 * makes new_root as the new root file system of the current process, and sets
1396 * root/cwd of all processes which had them on the current root to new_root.
1397 *
1398 * Restrictions:
1399 * The new_root and put_old must be directories, and must not be on the
1400 * same file system as the current process root. The put_old must be
1401 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1402 * pointed to by put_old must yield the same directory as new_root. No other
1403 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1404 *
1405 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1406 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1407 * in this situation.
1408 *
1409 * Notes:
1410 * - we don't move root/cwd if they are not at the root (reason: if something
1411 * cared enough to change them, it's probably wrong to force them elsewhere)
1412 * - it's okay to pick a root that isn't the root of a file system, e.g.
1413 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1414 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1415 * first.
1416 */
7f50ec8b 1417static int lxc_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1418{
7f50ec8b 1419 __do_close int fd_oldroot = -EBADF;
b0d7aac4 1420 int ret;
0fd73091 1421
7f50ec8b
CB
1422 fd_oldroot = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
1423 if (fd_oldroot < 0)
55022530 1424 return log_error_errno(-1, errno, "Failed to open old root directory");
ac778708 1425
8ce1abc2 1426 /* change into new root fs */
7f50ec8b 1427 ret = fchdir(rootfs->mntpt_fd);
55022530 1428 if (ret < 0)
7f50ec8b 1429 return log_error_errno(-errno, errno, "Failed to change into new root directory \"%s\"", rootfs->mount);
39c7b795 1430
8ce1abc2
CB
1431 /* pivot_root into our new root fs */
1432 ret = pivot_root(".", ".");
55022530 1433 if (ret < 0)
7f50ec8b 1434 return log_error_errno(-errno, errno, "Failed to pivot into new root directory \"%s\"", rootfs->mount);
39c7b795 1435
8ce1abc2
CB
1436 /* At this point the old-root is mounted on top of our new-root. To
1437 * unmounted it we must not be chdir'd into it, so escape back to
1438 * old-root.
1439 */
7f50ec8b 1440 ret = fchdir(fd_oldroot);
55022530 1441 if (ret < 0)
7f50ec8b 1442 return log_error_errno(-errno, errno, "Failed to enter old root directory");
c69bd12f 1443
7f50ec8b
CB
1444 /*
1445 * Make fd_oldroot a depedent mount to make sure our umounts don't
1446 * propagate to the host.
8ce1abc2
CB
1447 */
1448 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
55022530 1449 if (ret < 0)
7f50ec8b 1450 return log_error_errno(-errno, errno, "Failed to recursively turn old root mount tree into dependent mount");
8ce1abc2
CB
1451
1452 ret = umount2(".", MNT_DETACH);
55022530 1453 if (ret < 0)
7f50ec8b 1454 return log_error_errno(-errno, errno, "Failed to detach old root directory");
8ce1abc2 1455
7f50ec8b 1456 ret = fchdir(rootfs->mntpt_fd);
55022530 1457 if (ret < 0)
7f50ec8b 1458 return log_error_errno(-errno, errno, "Failed to re-enter new root directory \"%s\"", rootfs->mount);
8ce1abc2 1459
7f50ec8b 1460 TRACE("Changed into new rootfs \"%s\"", rootfs->mount);
b0d7aac4 1461 return 0;
0ad19a3f 1462}
1463
8ce1abc2
CB
1464static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1465{
55022530
CB
1466 if (!rootfs->path)
1467 return log_debug(0, "Container does not have a rootfs");
8ce1abc2
CB
1468
1469 if (detect_ramfs_rootfs())
1470 return lxc_chroot(rootfs);
1471
7f50ec8b 1472 return lxc_pivot_root(rootfs);
0ad19a3f 1473}
1474
7581a82f 1475static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
8ce1abc2
CB
1476 unsigned id,
1477 enum idtype idtype)
f4900711
CB
1478{
1479 struct lxc_list *it;
1480 struct id_map *map;
1481 struct id_map *retmap = NULL;
1482
dcf0ffdf
CB
1483 /* Shortcut for container's root mappings. */
1484 if (id == 0) {
1485 if (idtype == ID_TYPE_UID)
1486 return conf->root_nsuid_map;
1487
1488 if (idtype == ID_TYPE_GID)
1489 return conf->root_nsgid_map;
1490 }
1491
f4900711
CB
1492 lxc_list_for_each(it, &conf->id_map) {
1493 map = it->elem;
1494 if (map->idtype != idtype)
1495 continue;
1496
1497 if (id >= map->nsid && id < map->nsid + map->range) {
1498 retmap = map;
1499 break;
1500 }
1501 }
1502
1503 return retmap;
1504}
1505
68f3899e
CB
1506int lxc_setup_devpts_parent(struct lxc_handler *handler)
1507{
1508 int ret;
1509
1510 if (handler->conf->pty_max <= 0)
1511 return 0;
1512
1513 ret = lxc_abstract_unix_recv_fds(handler->data_sock[1], &handler->conf->devpts_fd, 1,
1514 &handler->conf->devpts_fd, sizeof(handler->conf->devpts_fd));
1515 if (ret < 0)
1516 return log_error_errno(-1, errno, "Failed to receive devpts fd from child");
1517
1518 TRACE("Received devpts file descriptor %d from child", handler->conf->devpts_fd);
1519 return 0;
1520}
1521
1522static int lxc_setup_devpts_child(struct lxc_handler *handler)
3c26f34e 1523{
f797f05e 1524 __do_close int devpts_fd = -EBADF;
70761e5e 1525 int ret;
ce155c60 1526 char **opts;
9d28c4f9 1527 char devpts_mntopts[256];
ce155c60
CB
1528 char *mntopt_sets[5];
1529 char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
f797f05e 1530 struct lxc_conf *conf = handler->conf;
a26822c5 1531 struct lxc_rootfs *rootfs = &conf->rootfs;
f797f05e 1532 int sock = handler->data_sock[0];
77890c6d 1533
55022530
CB
1534 if (conf->pty_max <= 0)
1535 return log_debug(0, "No new devpts instance will be mounted since no pts devices are requested");
3c26f34e 1536
e528c735
CB
1537 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1538 default_devpts_mntopts, conf->pty_max);
9d28c4f9
CB
1539 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1540 return -1;
1541
29a7b484 1542 (void)umount2("/dev/pts", MNT_DETACH);
7e40254a 1543
70761e5e 1544 /* Create mountpoint for devpts instance. */
a26822c5 1545 ret = mkdirat(rootfs->dev_mntpt_fd, "pts", 0755);
55022530
CB
1546 if (ret < 0 && errno != EEXIST)
1547 return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
3c26f34e 1548
ce155c60
CB
1549 /* gid=5 && max= */
1550 mntopt_sets[0] = devpts_mntopts;
dfbd4730 1551
ce155c60 1552 /* !gid=5 && max= */
6333c915 1553 mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1554
1555 /* gid=5 && !max= */
1556 mntopt_sets[2] = default_devpts_mntopts;
1557
1558 /* !gid=5 && !max= */
6333c915 1559 mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1560
1561 /* end */
1562 mntopt_sets[4] = NULL;
1563
1564 for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
1565 /* mount new devpts instance */
1566 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
1567 if (ret == 0)
1568 break;
1569 }
1570
55022530
CB
1571 if (ret < 0)
1572 return log_error_errno(-1, errno, "Failed to mount new devpts instance");
ce155c60 1573 DEBUG("Mount new devpts instance with options \"%s\"", *opts);
70761e5e 1574
a26822c5 1575 devpts_fd = open_at(rootfs->dev_mntpt_fd, "pts", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
f797f05e 1576 if (devpts_fd < 0) {
fbfe5c82 1577 devpts_fd = -EBADF;
f797f05e 1578 TRACE("Failed to create detached devpts mount");
185b9ee9
CB
1579 ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int));
1580 } else {
1581 ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
f797f05e 1582 }
185b9ee9
CB
1583 if (ret < 0)
1584 return log_error_errno(-1, errno, "Failed to send devpts fd to parent");
1585
1586 TRACE("Sent devpts file descriptor %d to parent", devpts_fd);
f797f05e 1587
d5cb35d6 1588 /* Remove any pre-existing /dev/ptmx file. */
a26822c5 1589 ret = unlinkat(rootfs->dev_mntpt_fd, "ptmx", 0);
b29e05d6 1590 if (ret < 0) {
55022530
CB
1591 if (errno != ENOENT)
1592 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\" file");
b29e05d6 1593 } else {
0fd73091 1594 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1595 }
1596
d5cb35d6 1597 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
a26822c5 1598 ret = mknodat(rootfs->dev_mntpt_fd, "ptmx", S_IFREG | 0000, 0);
55022530
CB
1599 if (ret < 0 && errno != EEXIST)
1600 return log_error_errno(-1, errno, "Failed to create dummy \"/dev/ptmx\" file as bind mount target");
0fd73091 1601 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1602
d5cb35d6 1603 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1604 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
55022530
CB
1605 if (!ret)
1606 return log_debug(0, "Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1607 else
d5cb35d6 1608 /* Fallthrough and try to create a symlink. */
0fd73091 1609 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1610
1611 /* Remove the dummy /dev/ptmx file we created above. */
a26822c5 1612 ret = unlinkat(rootfs->dev_mntpt_fd, "ptmx", 0);
55022530
CB
1613 if (ret < 0)
1614 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1615
1616 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
a26822c5 1617 ret = symlinkat("/dev/pts/ptmx", rootfs->dev_mntpt_fd, "/dev/ptmx");
55022530
CB
1618 if (ret < 0)
1619 return log_error_errno(-1, errno, "Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1620
185b9ee9 1621 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1622 return 0;
1623}
1624
cccc74b5
DL
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". Without
 * HAVE_SYS_PERSONALITY_H the function does nothing.
 *
 * Returns 0 on success and -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0)
			return log_error_errno(-1, errno, "Failed to set personality to \"0x%x\"", persona);

		INFO("Set personality to \"0x%x\"", persona);
	}
#endif

	return 0;
}
1642
efbfe93f
CB
1643static inline bool wants_console(const struct lxc_terminal *terminal)
1644{
1645 return !terminal->path || strcmp(terminal->path, "none");
1646}
1647
3d7d929a 1648static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
cf68ffd9 1649 const struct lxc_terminal *console,
41808e20 1650 int pty_mnt_fd)
6e590161 1651{
882671aa 1652 int ret;
6b5a54cd 1653 char path[PATH_MAX];
86530b0a 1654 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1655
efbfe93f 1656 if (!wants_console(console))
8b1b1210
CB
1657 return 0;
1658
cf68ffd9
CB
1659 /*
1660 * When we are asked to setup a console we remove any previous
8b1b1210
CB
1661 * /dev/console bind-mounts.
1662 */
953db219
CB
1663 if (exists_file_at(rootfs->dev_mntpt_fd, "console")) {
1664 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1665 if (ret < 0 || (size_t)ret >= sizeof(path))
1666 return -1;
1667
a7ba3c7f 1668 ret = lxc_unstack_mountpoint(path, false);
55022530
CB
1669 if (ret < 0)
1670 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1671 else
86530b0a 1672 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
8b1b1210
CB
1673 }
1674
cf68ffd9
CB
1675 /*
1676 * For unprivileged containers autodev or automounts will already have
8b1b1210
CB
1677 * taken care of creating /dev/console.
1678 */
953db219 1679 ret = mknodat(rootfs->dev_mntpt_fd, "console", S_IFREG | 0000, 0);
55022530
CB
1680 if (ret < 0 && errno != EEXIST)
1681 return log_error_errno(-errno, errno, "Failed to create console");
52e35957 1682
41808e20 1683 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
55022530
CB
1684 if (ret < 0)
1685 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
13954cce 1686
41808e20 1687 if (pty_mnt_fd >= 0) {
953db219 1688 ret = move_mount(pty_mnt_fd, "", rootfs->dev_mntpt_fd, "console", MOVE_MOUNT_F_EMPTY_PATH);
efbfe93f 1689 if (!ret) {
86087bd6
CB
1690 DEBUG("Moved mount \"%s\" onto %d(console)", console->name, rootfs->dev_mntpt_fd);
1691 return 0;
efbfe93f
CB
1692 }
1693
1694 if (ret && errno != ENOSYS)
1695 return log_error_errno(-1, errno,
86087bd6
CB
1696 "Failed to mount %d(%s) on %d(console)",
1697 pty_mnt_fd, console->name, rootfs->dev_mntpt_fd);
efbfe93f
CB
1698 }
1699
953db219
CB
1700 ret = safe_mount_beneath_at(rootfs->dev_mntpt_fd, console->name, "console", NULL, MS_BIND, NULL);
1701 if (ret < 0) {
1702 if (errno == ENOSYS) {
1703 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1704 if (ret < 0 || (size_t)ret >= sizeof(path))
1705 return -1;
1706
1707 ret = safe_mount(console->name, path, "none", MS_BIND, NULL, rootfs_path);
1708 if (ret < 0)
1709 return log_error_errno(-1, errno, "Failed to mount %d(%s) on \"%s\"", pty_mnt_fd, console->name, path);
1710 }
1711 }
6e590161 1712
41808e20 1713 DEBUG("Mounted pty device %d(%s) onto \"%s\"", pty_mnt_fd, console->name, path);
7c6ef2a2
SH
1714 return 0;
1715}
1716
3d7d929a 1717static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
dcad02f8 1718 const struct lxc_terminal *console,
41808e20 1719 char *ttydir, int pty_mnt_fd)
7c6ef2a2 1720{
3b7e332f 1721 int ret;
6b5a54cd 1722 char path[PATH_MAX], lxcpath[PATH_MAX];
86530b0a 1723 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2 1724
efbfe93f 1725 if (!wants_console(console))
3dc035f1
L
1726 return 0;
1727
7c6ef2a2 1728 /* create rootfs/dev/<ttydir> directory */
86530b0a 1729 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
3d7d929a 1730 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1731 return -1;
3d7d929a 1732
7c6ef2a2 1733 ret = mkdir(path, 0755);
55022530
CB
1734 if (ret && errno != EEXIST)
1735 return log_error_errno(-errno, errno, "Failed to create \"%s\"", path);
4742cd9a 1736 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1737
86530b0a 1738 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
3d7d929a
CB
1739 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1740 return -1;
1741
3b7e332f 1742 ret = mknod(lxcpath, S_IFREG | 0000, 0);
55022530
CB
1743 if (ret < 0 && errno != EEXIST)
1744 return log_error_errno(-errno, errno, "Failed to create \"%s\"", lxcpath);
7c6ef2a2 1745
86530b0a 1746 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3dc035f1 1747 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1748 return -1;
2a12fefd 1749
3dc035f1 1750 if (file_exists(path)) {
a7ba3c7f 1751 ret = lxc_unstack_mountpoint(path, false);
55022530
CB
1752 if (ret < 0)
1753 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1754 else
86530b0a 1755 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
3dc035f1 1756 }
2a12fefd 1757
3b7e332f 1758 ret = mknod(path, S_IFREG | 0000, 0);
55022530
CB
1759 if (ret < 0 && errno != EEXIST)
1760 return log_error_errno(-errno, errno, "Failed to create console");
7c6ef2a2 1761
41808e20 1762 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
55022530
CB
1763 if (ret < 0)
1764 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
2a12fefd 1765
3dc035f1 1766 /* bind mount console->name to '/dev/<ttydir>/console' */
41808e20
CB
1767 if (pty_mnt_fd >= 0) {
1768 ret = move_mount(pty_mnt_fd, "", -EBADF, lxcpath, MOVE_MOUNT_F_EMPTY_PATH);
efbfe93f
CB
1769 if (!ret) {
1770 DEBUG("Moved mount \"%s\" onto \"%s\"", console->name, lxcpath);
1771 goto finish;
1772 }
1773
1774 if (ret && errno != ENOSYS)
1775 return log_error_errno(-1, errno,
1776 "Failed to mount %d(%s) on \"%s\"",
41808e20 1777 pty_mnt_fd, console->name, lxcpath);
efbfe93f
CB
1778 }
1779
1780 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
55022530 1781 if (ret < 0)
41808e20 1782 return log_error_errno(-1, errno, "Failed to mount %d(%s) on \"%s\"", pty_mnt_fd, console->name, lxcpath);
86530b0a 1783 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1784
efbfe93f 1785finish:
3dc035f1 1786 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
86530b0a 1787 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
55022530
CB
1788 if (ret < 0)
1789 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
86530b0a 1790 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1791
86530b0a 1792 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1793 return 0;
1794}
1795
/*
 * Pick the console setup routine: with a tty directory the console is set up
 * below it, without one it is set up directly on /dev/console. The return
 * value of the chosen routine is passed through unchanged.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir,
			     int pty_mnt_fd)
{
	if (ttydir)
		return lxc_setup_ttydir_console(rootfs, console, ttydir, pty_mnt_fd);

	return lxc_setup_dev_console(rootfs, console, pty_mnt_fd);
}
1806
a08bfbe3 1807static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676 1808{
a08bfbe3 1809 ssize_t ret;
998ac676 1810
85c2de39
MB
1811 /* If '=' is contained in opt, the option must go into data. */
1812 if (!strchr(opt, '=')) {
a08bfbe3
CB
1813 /*
1814 * If opt is found in mount_opt, set or clear flags.
1815 * Otherwise append it to data.
1816 */
85c2de39 1817 size_t opt_len = strlen(opt);
a08bfbe3 1818 for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) {
85c2de39 1819 size_t mo_name_len = strlen(mo->name);
a08bfbe3 1820
85c2de39
MB
1821 if (opt_len == mo_name_len && strncmp(opt, mo->name, mo_name_len) == 0) {
1822 if (mo->clear)
1823 *flags &= ~mo->flag;
1824 else
1825 *flags |= mo->flag;
a08bfbe3 1826 return 0;
85c2de39 1827 }
998ac676
RT
1828 }
1829 }
1830
a08bfbe3
CB
1831 if (strlen(*data)) {
1832 ret = strlcat(*data, ",", size);
1833 if (ret < 0)
1834 return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
1835 }
1836
1837 ret = strlcat(*data, opt, size);
1838 if (ret < 0)
1839 return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
efed99a4 1840
a08bfbe3 1841 return 0;
998ac676
RT
1842}
1843
0fd73091 1844int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
998ac676 1845{
a08bfbe3
CB
1846 __do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
1847 char *mntopt_cur = NULL;
efed99a4 1848 size_t size;
998ac676 1849
a08bfbe3
CB
1850 if (*mntdata || *mntflags)
1851 return ret_errno(EINVAL);
911324ef
DL
1852
1853 if (!mntopts)
998ac676
RT
1854 return 0;
1855
a08bfbe3
CB
1856 mntopts_dup = strdup(mntopts);
1857 if (!mntopts_dup)
1858 return ret_errno(ENOMEM);
998ac676 1859
a08bfbe3
CB
1860 size = strlen(mntopts_dup) + 1;
1861 mntopts_new = zalloc(size);
1862 if (!mntopts_new)
1863 return ret_errno(ENOMEM);
998ac676 1864
a08bfbe3
CB
1865 lxc_iterate_parts(mntopt_cur, mntopts_dup, ",")
1866 if (parse_mntopt(mntopt_cur, mntflags, &mntopts_new, size) < 0)
1867 return ret_errno(EINVAL);
998ac676 1868
a08bfbe3
CB
1869 if (*mntopts_new)
1870 *mntdata = move_ptr(mntopts_new);
998ac676
RT
1871
1872 return 0;
1873}
1874
d840039e
YT
1875static void parse_propagationopt(char *opt, unsigned long *flags)
1876{
1877 struct mount_opt *mo;
1878
1879 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1880 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1881 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1882 continue;
1883
1884 if (mo->clear)
1885 *flags &= ~mo->flag;
1886 else
1887 *flags |= mo->flag;
1888
1889 return;
d840039e
YT
1890 }
1891}
1892
8ce1abc2 1893int parse_propagationopts(const char *mntopts, unsigned long *pflags)
d840039e 1894{
dfd2e059
CB
1895 __do_free char *s = NULL;
1896 char *p;
d840039e
YT
1897
1898 if (!mntopts)
1899 return 0;
1900
1901 s = strdup(mntopts);
55022530
CB
1902 if (!s)
1903 return log_error_errno(-ENOMEM, errno, "Failed to allocate memory");
d840039e 1904
0fd73091 1905 *pflags = 0L;
8db9d26f 1906 lxc_iterate_parts(p, s, ",")
d840039e 1907 parse_propagationopt(p, pflags);
0fd73091 1908
d840039e
YT
1909 return 0;
1910}
1911
6fd5e769
SH
/* Terminate @word in place at its first space or tab (if there is one). */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1918
/*
 * Advance past @nfields fields in @src.
 *
 * A field ends at the first space or tab and exactly one separator character
 * is consumed per field. Scanning stops early at the end of the string.
 * Returns a pointer into @src.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		/* jump to the separator that ends the current field */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;

		cur++;
		skipped++;
	}

	return cur;
}
1937
911324ef
DL
1938static int mount_entry(const char *fsname, const char *target,
1939 const char *fstype, unsigned long mountflags,
d840039e
YT
1940 unsigned long pflags, const char *data, bool optional,
1941 bool dev, bool relative, const char *rootfs)
911324ef 1942{
0ac4b28a 1943 int ret;
6b5a54cd 1944 char srcbuf[PATH_MAX];
181437fd 1945 const char *srcpath = fsname;
614305f3 1946#ifdef HAVE_STATVFS
2938f7c8 1947 struct statvfs sb;
614305f3 1948#endif
2938f7c8 1949
181437fd 1950 if (relative) {
55022530
CB
1951 ret = snprintf(srcbuf, sizeof(srcbuf), "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
1952 if (ret < 0 || ret >= sizeof(srcbuf))
1953 return log_error_errno(-1, errno, "source path is too long");
181437fd
YT
1954 srcpath = srcbuf;
1955 }
1956
1957 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
0ac4b28a
CB
1958 rootfs);
1959 if (ret < 0) {
55022530
CB
1960 if (optional)
1961 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
1962 srcpath ? srcpath : "(null)", target);
0ac4b28a 1963
55022530
CB
1964 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
1965 srcpath ? srcpath : "(null)", target);
911324ef
DL
1966 }
1967
1968 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
0ac4b28a 1969
55022530
CB
1970 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
1971 srcpath ? srcpath : "(none)", target ? target : "(none)");
0ac4b28a 1972
614305f3 1973#ifdef HAVE_STATVFS
181437fd 1974 if (srcpath && statvfs(srcpath, &sb) == 0) {
94bef7e4
TA
1975 unsigned long required_flags = 0;
1976
2938f7c8
SH
1977 if (sb.f_flag & MS_NOSUID)
1978 required_flags |= MS_NOSUID;
0ac4b28a 1979
ae7a770e 1980 if (sb.f_flag & MS_NODEV && !dev)
2938f7c8 1981 required_flags |= MS_NODEV;
0ac4b28a 1982
2938f7c8
SH
1983 if (sb.f_flag & MS_RDONLY)
1984 required_flags |= MS_RDONLY;
0ac4b28a 1985
2938f7c8
SH
1986 if (sb.f_flag & MS_NOEXEC)
1987 required_flags |= MS_NOEXEC;
0ac4b28a 1988
55022530
CB
1989 DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
1990 srcpath, sb.f_flag, required_flags);
0ac4b28a
CB
1991
1992 /* If this was a bind mount request, and required_flags
2938f7c8 1993 * does not have any flags which are not already in
0ac4b28a 1994 * mountflags, then skip the remount.
2938f7c8 1995 */
94bef7e4
TA
1996 if (!(mountflags & MS_REMOUNT) &&
1997 (!(required_flags & ~mountflags) && !(mountflags & MS_RDONLY))) {
15f3e22b
CB
1998 DEBUG("Mountflags already were %lu, skipping remount", mountflags);
1999 goto skipremount;
2938f7c8 2000 }
0ac4b28a 2001
2938f7c8 2002 mountflags |= required_flags;
6fd5e769 2003 }
614305f3 2004#endif
911324ef 2005
181437fd 2006 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
0ac4b28a 2007 if (ret < 0) {
55022530
CB
2008 if (optional)
2009 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
2010 srcpath ? srcpath : "(null)",
2011 target);
2012
2013 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
2014 srcpath ? srcpath : "(null)",
2015 target);
911324ef
DL
2016 }
2017 }
2018
a3ed9b81 2019#ifdef HAVE_STATVFS
2020skipremount:
2021#endif
d840039e
YT
2022 if (pflags) {
2023 ret = mount(NULL, target, NULL, pflags, NULL);
2024 if (ret < 0) {
55022530
CB
2025 if (optional)
2026 return log_info_errno(0, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
2027 else
2028 return log_error_errno(-1, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
d840039e
YT
2029 }
2030 DEBUG("Changed mount propagation for \"%s\"", target);
2031 }
2032
0103eb53 2033 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
181437fd 2034 srcpath ? srcpath : "(null)", target, fstype);
911324ef
DL
2035
2036 return 0;
2037}
2038
/*
 * Remove "create=dir", "create=file", "optional" and "relative" from
 * mntent->mnt_opts.
 *
 * The option string is rewritten in place. Only complete comma separated
 * options are removed, so an option or option value that merely contains one
 * of these words (for example a path inside "lowerdir=...") is left alone.
 * Every occurrence is removed, empty options are dropped and the result never
 * ends in a separator.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
		NULL
	};
	char *rd = mntent->mnt_opts;
	char *wr = mntent->mnt_opts;

	while (*rd != '\0') {
		size_t len = strcspn(rd, ",");
		int keep = len > 0;

		for (int i = 0; keep && culled[i]; i++) {
			if (strlen(culled[i]) == len && strncmp(rd, culled[i], len) == 0)
				keep = 0;
		}

		if (keep) {
			/*
			 * wr never overtakes rd: once something has been
			 * written, at least one separator has been read past.
			 */
			if (wr != mntent->mnt_opts)
				*wr++ = ',';

			memmove(wr, rd, len);
			wr += len;
		}

		rd += len;
		if (*rd == ',')
			rd++;
	}

	*wr = '\0';
}
2068
4d5b72a1 2069static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2070 const char *path,
2071 const struct lxc_rootfs *rootfs,
0fd73091 2072 const char *lxc_name, const char *lxc_path)
0ad19a3f 2073{
7a76eeaa 2074 __do_free char *p1 = NULL;
3b7e332f 2075 int ret;
7a76eeaa 2076 char *p2;
911324ef 2077
12e6ab5d 2078 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
749f98d9 2079 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2080 if (ret < 0)
2081 return -1;
2082 }
6e46cc0d 2083
34cfffb3 2084 if (hasmntopt(mntent, "create=dir")) {
749f98d9 2085 ret = mkdir_p(path, 0755);
55022530
CB
2086 if (ret < 0 && errno != EEXIST)
2087 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
34cfffb3
SG
2088 }
2089
0fd73091
CB
2090 if (!hasmntopt(mntent, "create=file"))
2091 return 0;
749f98d9 2092
0fd73091
CB
2093 ret = access(path, F_OK);
2094 if (ret == 0)
2095 return 0;
749f98d9 2096
0fd73091
CB
2097 p1 = strdup(path);
2098 if (!p1)
2099 return -1;
749f98d9 2100
0fd73091 2101 p2 = dirname(p1);
749f98d9 2102
0fd73091 2103 ret = mkdir_p(p2, 0755);
55022530
CB
2104 if (ret < 0 && errno != EEXIST)
2105 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
749f98d9 2106
3b7e332f
CB
2107 ret = mknod(path, S_IFREG | 0000, 0);
2108 if (ret < 0 && errno != EEXIST)
2109 return -errno;
0fd73091 2110
749f98d9 2111 return 0;
4d5b72a1
NC
2112}
2113
ec50007f
CB
2114/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2115 * without a rootfs. */
db4aba38 2116static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2117 const char *path,
2118 const struct lxc_rootfs *rootfs,
2119 const char *lxc_name,
2120 const char *lxc_path)
4d5b72a1 2121{
fd214f37 2122 __do_free char *mntdata = NULL;
a08bfbe3
CB
2123 unsigned long mntflags = 0, pflags = 0;
2124 char *rootfs_path = NULL;
d8b712bc 2125 int ret;
181437fd 2126 bool dev, optional, relative;
d8b712bc
CB
2127
2128 optional = hasmntopt(mntent, "optional") != NULL;
2129 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2130 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2131
ec50007f
CB
2132 if (rootfs && rootfs->path)
2133 rootfs_path = rootfs->mount;
2134
d8b712bc
CB
2135 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2136 lxc_path);
2137 if (ret < 0) {
2138 if (optional)
2139 return 0;
608e3567 2140
d8b712bc
CB
2141 return -1;
2142 }
4e4ca161
SH
2143 cull_mntent_opt(mntent);
2144
d840039e
YT
2145 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2146 if (ret < 0)
2147 return -1;
2148
d8b712bc
CB
2149 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2150 if (ret < 0)
a08bfbe3 2151 return ret;
a17b1e65 2152
6e46cc0d 2153 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2154 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2155
911324ef
DL
2156 return ret;
2157}
2158
db4aba38
NC
2159static inline int mount_entry_on_systemfs(struct mntent *mntent)
2160{
1433c9f9 2161 int ret;
6b5a54cd 2162 char path[PATH_MAX];
1433c9f9
CB
2163
2164 /* For containers created without a rootfs all mounts are treated as
07667a6a
CB
2165 * absolute paths starting at / on the host.
2166 */
1433c9f9
CB
2167 if (mntent->mnt_dir[0] != '/')
2168 ret = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);
2169 else
2170 ret = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
07667a6a 2171 if (ret < 0 || ret >= sizeof(path))
1433c9f9 2172 return -1;
1433c9f9
CB
2173
2174 return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
db4aba38
NC
2175}
2176
4e4ca161 2177static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2178 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2179 const char *lxc_name,
2180 const char *lxc_path)
911324ef 2181{
bdd2b34c 2182 int offset;
013bd428 2183 char *aux;
67e571de 2184 const char *lxcpath;
6b5a54cd 2185 char path[PATH_MAX];
bdd2b34c 2186 int ret = 0;
0ad19a3f 2187
593e8478 2188 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2189 if (!lxcpath)
2a59a681 2190 return -1;
2a59a681 2191
bdd2b34c
CB
2192 /* If rootfs->path is a blockdev path, allow container fstab to use
2193 * <lxcpath>/<name>/rootfs" as the target prefix.
2194 */
6b5a54cd
CB
2195 ret = snprintf(path, PATH_MAX, "%s/%s/rootfs", lxcpath, lxc_name);
2196 if (ret < 0 || ret >= PATH_MAX)
80a881b2
SH
2197 goto skipvarlib;
2198
2199 aux = strstr(mntent->mnt_dir, path);
2200 if (aux) {
2201 offset = strlen(path);
2202 goto skipabs;
2203 }
2204
2205skipvarlib:
013bd428 2206 aux = strstr(mntent->mnt_dir, rootfs->path);
55022530
CB
2207 if (!aux)
2208 return log_warn(ret, "Ignoring mount point \"%s\"", mntent->mnt_dir);
80a881b2
SH
2209 offset = strlen(rootfs->path);
2210
2211skipabs:
6b5a54cd
CB
2212 ret = snprintf(path, PATH_MAX, "%s/%s", rootfs->mount, aux + offset);
2213 if (ret < 0 || ret >= PATH_MAX)
a17b1e65 2214 return -1;
a17b1e65 2215
0a2dddd4 2216 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2217}
d330fe7b 2218
4e4ca161 2219static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2220 const struct lxc_rootfs *rootfs,
2221 const char *lxc_name,
2222 const char *lxc_path)
911324ef 2223{
911324ef 2224 int ret;
6b5a54cd 2225 char path[PATH_MAX];
d330fe7b 2226
34cfffb3 2227 /* relative to root mount point */
6e46cc0d 2228 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
0fd73091 2229 if (ret < 0 || (size_t)ret >= sizeof(path))
9ba8130c 2230 return -1;
911324ef 2231
0a2dddd4 2232 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2233}
2234
a7c6e830 2235static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2236 const char *lxc_name, const char *lxc_path)
911324ef 2237{
9d03d857 2238 char buf[PATH_MAX];
0fd73091 2239 struct mntent mntent;
e76b8764 2240
aaf901be 2241 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
9d03d857
CB
2242 int ret;
2243
1ae3c19f
CB
2244 if (!rootfs->path)
2245 ret = mount_entry_on_systemfs(&mntent);
2246 else if (mntent.mnt_dir[0] != '/')
2247 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2248 lxc_name, lxc_path);
2249 else
2250 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
9d03d857 2251 lxc_name, lxc_path);
1ae3c19f
CB
2252 if (ret < 0)
2253 return -1;
0ad19a3f 2254 }
cd54d859 2255
55022530
CB
2256 if (!feof(file) || ferror(file))
2257 return log_error(-1, "Failed to parse mount entries");
9d03d857
CB
2258
2259 return 0;
e7938e9e
MN
2260}
2261
55022530
CB
/* Cleanup handler: close the mount table stream *@f unless it is NULL. */
static inline void __auto_endmntent__(FILE **f)
{
	if (*f == NULL)
		return;

	endmntent(*f);
}
2267
2268#define __do_endmntent __attribute__((__cleanup__(__auto_endmntent__)))
2269
06749971
CB
2270static int setup_mount(const struct lxc_conf *conf,
2271 const struct lxc_rootfs *rootfs, const char *fstab,
42dff448 2272 const char *lxc_name, const char *lxc_path)
e7938e9e 2273{
55022530 2274 __do_endmntent FILE *f = NULL;
e7938e9e
MN
2275 int ret;
2276
2277 if (!fstab)
2278 return 0;
2279
55022530
CB
2280 f = setmntent(fstab, "re");
2281 if (!f)
2282 return log_error_errno(-1, errno, "Failed to open \"%s\"", fstab);
e7938e9e 2283
a7c6e830 2284 ret = mount_file_entries(rootfs, f, lxc_name, lxc_path);
42dff448
CB
2285 if (ret < 0)
2286 ERROR("Failed to set up mount entries");
e7938e9e 2287
0ad19a3f 2288 return ret;
2289}
2290
1800f924
WB
/*
 * Nested containers can only mount /proc and /sys themselves if they can see
 * "pure" proc and sysfs mount points, i.e. ones with nothing (like lxcfs)
 * mounted on top.
 * These are provided in /dev/.lxc/{proc,sys}, with an apparmor rule denying
 * access to them. The rule is mostly a convenience: the container's root
 * user can mount both file systems anyway and therefore has access to them.
 * A non-root user in the container, however, should not gain access as a
 * side effect unless that was explicitly allowed.
 */
static const char nesting_helpers[] =
	"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
	"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
1800f924
WB
2304
2305FILE *make_anonymous_mount_file(struct lxc_list *mount,
2306 bool include_nesting_helpers)
e7938e9e 2307{
f62cf1d4 2308 __do_close int fd = -EBADF;
4110345b 2309 FILE *f;
5ef5c9a3 2310 int ret;
e7938e9e 2311 char *mount_entry;
5ef5c9a3 2312 struct lxc_list *iterator;
5ef5c9a3 2313
0fd73091 2314 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2315 if (fd < 0) {
a324e7eb
CB
2316 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2317
5ef5c9a3
CB
2318 if (errno != ENOSYS)
2319 return NULL;
a324e7eb
CB
2320
2321 fd = lxc_make_tmpfile(template, true);
55022530
CB
2322 if (fd < 0)
2323 return log_error_errno(NULL, errno, "Could not create temporary mount file");
0fd73091 2324
6bd04140 2325 TRACE("Created temporary mount file");
5ef5c9a3 2326 }
e7938e9e 2327
0fd73091
CB
2328 lxc_list_for_each (iterator, mount) {
2329 size_t len;
2330
e7938e9e 2331 mount_entry = iterator->elem;
0fd73091 2332 len = strlen(mount_entry);
5ef5c9a3 2333
489f39be 2334 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091 2335 if (ret != len)
79bcf5ee 2336 return NULL;
0fd73091 2337
489f39be 2338 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091 2339 if (ret != 1)
79bcf5ee 2340 return NULL;
e7938e9e
MN
2341 }
2342
1800f924
WB
2343 if (include_nesting_helpers) {
2344 ret = lxc_write_nointr(fd, nesting_helpers,
6333c915
CB
2345 STRARRAYLEN(nesting_helpers));
2346 if (ret != STRARRAYLEN(nesting_helpers))
79bcf5ee 2347 return NULL;
1800f924
WB
2348 }
2349
0fd73091
CB
2350 ret = lseek(fd, 0, SEEK_SET);
2351 if (ret < 0)
79bcf5ee 2352 return NULL;
0fd73091 2353
4110345b
CB
2354 f = fdopen(fd, "re+");
2355 if (f)
2356 move_fd(fd); /* Transfer ownership of fd. */
2357 return f;
9fc7f8c0
TA
2358}
2359
06749971
CB
2360static int setup_mount_entries(const struct lxc_conf *conf,
2361 const struct lxc_rootfs *rootfs,
5ef5c9a3
CB
2362 struct lxc_list *mount, const char *lxc_name,
2363 const char *lxc_path)
9fc7f8c0 2364{
c85ced65 2365 __do_fclose FILE *f = NULL;
9fc7f8c0 2366
1800f924 2367 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
19b5d755 2368 if (!f)
9fc7f8c0 2369 return -1;
e7938e9e 2370
a7c6e830 2371 return mount_file_entries(rootfs, f, lxc_name, lxc_path);
e7938e9e
MN
2372}
2373
bab88e68
CS
2374static int parse_cap(const char *cap)
2375{
84760c11 2376 size_t i;
2377 int capid = -1;
0fd73091
CB
2378 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2379 char *ptr = NULL;
bab88e68 2380
0fd73091 2381 if (strcmp(cap, "none") == 0)
7035407c
DE
2382 return -2;
2383
8560cd36 2384 for (i = 0; i < end; i++) {
bab88e68
CS
2385 if (strcmp(cap, caps_opt[i].name))
2386 continue;
2387
2388 capid = caps_opt[i].value;
2389 break;
2390 }
2391
2392 if (capid < 0) {
0fd73091
CB
2393 /* Try to see if it's numeric, so the user may specify
2394 * capabilities that the running kernel knows about but we
2395 * don't
2396 */
bab88e68
CS
2397 errno = 0;
2398 capid = strtol(cap, &ptr, 10);
2399 if (!ptr || *ptr != '\0' || errno != 0)
2400 /* not a valid number */
2401 capid = -1;
2402 else if (capid > lxc_caps_last_cap())
2403 /* we have a number but it's not a valid
2404 * capability */
2405 capid = -1;
2406 }
2407
2408 return capid;
2409}
2410
0769b82a
CS
2411int in_caplist(int cap, struct lxc_list *caps)
2412{
0769b82a 2413 int capid;
0fd73091 2414 struct lxc_list *iterator;
0769b82a 2415
0fd73091 2416 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2417 capid = parse_cap(iterator->elem);
2418 if (capid == cap)
2419 return 1;
2420 }
2421
2422 return 0;
2423}
2424
81810dd1
DL
2425static int setup_caps(struct lxc_list *caps)
2426{
bab88e68 2427 int capid;
0fd73091
CB
2428 char *drop_entry;
2429 struct lxc_list *iterator;
81810dd1 2430
0fd73091
CB
2431 lxc_list_for_each (iterator, caps) {
2432 int ret;
81810dd1
DL
2433
2434 drop_entry = iterator->elem;
2435
bab88e68 2436 capid = parse_cap(drop_entry);
55022530
CB
2437 if (capid < 0)
2438 return log_error(-1, "unknown capability %s", drop_entry);
81810dd1 2439
b81689a1
CB
2440 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2441 prctl_arg(0), prctl_arg(0));
55022530
CB
2442 if (ret < 0)
2443 return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
0fd73091 2444 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2445 }
2446
0fd73091 2447 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2448 return 0;
2449}
2450
2451static int dropcaps_except(struct lxc_list *caps)
2452{
2f443e88 2453 __do_free int *caplist = NULL;
0fd73091 2454 int i, capid, numcaps;
1fb86a7c 2455 char *keep_entry;
0fd73091 2456 struct lxc_list *iterator;
1fb86a7c 2457
0fd73091 2458 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2459 if (numcaps <= 0 || numcaps > 200)
2460 return -1;
0fd73091 2461 TRACE("Found %d capabilities", numcaps);
2caf9a97 2462
1a0e70ac 2463 /* caplist[i] is 1 if we keep capability i */
2f443e88 2464 caplist = must_realloc(NULL, numcaps * sizeof(int));
1fb86a7c
SH
2465 memset(caplist, 0, numcaps * sizeof(int));
2466
0fd73091 2467 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2468 keep_entry = iterator->elem;
2469
bab88e68 2470 capid = parse_cap(keep_entry);
7035407c
DE
2471 if (capid == -2)
2472 continue;
2473
55022530
CB
2474 if (capid < 0)
2475 return log_error(-1, "Unknown capability %s", keep_entry);
1fb86a7c 2476
0fd73091 2477 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2478 caplist[capid] = 1;
2479 }
0fd73091
CB
2480
2481 for (i = 0; i < numcaps; i++) {
2482 int ret;
2483
1fb86a7c
SH
2484 if (caplist[i])
2485 continue;
0fd73091 2486
b81689a1
CB
2487 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2488 prctl_arg(0), prctl_arg(0));
55022530
CB
2489 if (ret < 0)
2490 return log_error_errno(-1, errno, "Failed to remove capability %d", i);
1fb86a7c
SH
2491 }
2492
0fd73091 2493 DEBUG("Capabilities have been setup");
81810dd1
DL
2494 return 0;
2495}
2496
0fd73091
CB
2497static int parse_resource(const char *res)
2498{
2499 int ret;
c6d09e15
WB
2500 size_t i;
2501 int resid = -1;
2502
0fd73091 2503 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2504 if (strcmp(res, limit_opt[i].name) == 0)
2505 return limit_opt[i].value;
c6d09e15 2506
0fd73091 2507 /* Try to see if it's numeric, so the user may specify
c6d09e15 2508 * resources that the running kernel knows about but
0fd73091
CB
2509 * we don't.
2510 */
2511 ret = lxc_safe_int(res, &resid);
2512 if (ret < 0)
2513 return -1;
2514
2515 return resid;
c6d09e15
WB
2516}
2517
0fd73091
CB
2518int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2519{
2520 int resid;
c6d09e15
WB
2521 struct lxc_list *it;
2522 struct lxc_limit *lim;
c6d09e15 2523
0fd73091 2524 lxc_list_for_each (it, limits) {
c6d09e15
WB
2525 lim = it->elem;
2526
2527 resid = parse_resource(lim->resource);
55022530
CB
2528 if (resid < 0)
2529 return log_error(-1, "Unknown resource %s", lim->resource);
c6d09e15 2530
f48b5fd8 2531#if HAVE_PRLIMIT || HAVE_PRLIMIT64
55022530
CB
2532 if (prlimit(pid, resid, &lim->limit, NULL) != 0)
2533 return log_error_errno(-1, errno, "Failed to set limit %s", lim->resource);
2de12765
CB
2534
2535 TRACE("Setup \"%s\" limit", lim->resource);
f48b5fd8 2536#else
55022530 2537 return log_error(-1, "Cannot set limit \"%s\" as prlimit is missing", lim->resource);
f48b5fd8 2538#endif
c6d09e15 2539 }
0fd73091 2540
c6d09e15
WB
2541 return 0;
2542}
2543
7edd0540
L
2544int setup_sysctl_parameters(struct lxc_list *sysctls)
2545{
e6f76452 2546 __do_free char *tmp = NULL;
7edd0540
L
2547 struct lxc_list *it;
2548 struct lxc_sysctl *elem;
0fd73091 2549 int ret = 0;
6b5a54cd 2550 char filename[PATH_MAX] = {0};
7edd0540 2551
0fd73091 2552 lxc_list_for_each (it, sysctls) {
7edd0540
L
2553 elem = it->elem;
2554 tmp = lxc_string_replace(".", "/", elem->key);
55022530
CB
2555 if (!tmp)
2556 return log_error(-1, "Failed to replace key %s", elem->key);
7edd0540
L
2557
2558 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
55022530
CB
2559 if (ret < 0 || (size_t)ret >= sizeof(filename))
2560 return log_error(-1, "Error setting up sysctl parameters path");
7edd0540 2561
0fd73091 2562 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2563 strlen(elem->value), false, 0666);
55022530
CB
2564 if (ret < 0)
2565 return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
2566 elem->key, elem->value);
7edd0540 2567 }
0fd73091 2568
7edd0540
L
2569 return 0;
2570}
2571
61d7a733
YT
2572int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2573{
0c669152 2574 __do_free char *tmp = NULL;
61d7a733
YT
2575 struct lxc_list *it;
2576 struct lxc_proc *elem;
0fd73091 2577 int ret = 0;
6b5a54cd 2578 char filename[PATH_MAX] = {0};
61d7a733 2579
0fd73091 2580 lxc_list_for_each (it, procs) {
61d7a733
YT
2581 elem = it->elem;
2582 tmp = lxc_string_replace(".", "/", elem->filename);
55022530
CB
2583 if (!tmp)
2584 return log_error(-1, "Failed to replace key %s", elem->filename);
61d7a733
YT
2585
2586 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
55022530
CB
2587 if (ret < 0 || (size_t)ret >= sizeof(filename))
2588 return log_error(-1, "Error setting up proc filesystem path");
61d7a733 2589
0fd73091 2590 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2591 strlen(elem->value), false, 0666);
55022530
CB
2592 if (ret < 0)
2593 return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s", elem->filename, elem->value);
61d7a733 2594 }
0fd73091 2595
61d7a733
YT
2596 return 0;
2597}
2598
ae9242c8
SH
2599static char *default_rootfs_mount = LXCROOTFSMOUNT;
2600
7b379ab3 2601struct lxc_conf *lxc_conf_init(void)
089cd8b8 2602{
26ddeedd 2603 int i;
0fd73091 2604 struct lxc_conf *new;
7b379ab3 2605
13277ec4 2606 new = malloc(sizeof(*new));
0fd73091 2607 if (!new)
7b379ab3 2608 return NULL;
7b379ab3
MN
2609 memset(new, 0, sizeof(*new));
2610
4b73005c 2611 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2612 new->personality = -1;
124fa0a8 2613 new->autodev = 1;
3a784510 2614 new->console.buffer_size = 0;
596a818d
DE
2615 new->console.log_path = NULL;
2616 new->console.log_fd = -1;
861813e5 2617 new->console.log_size = 0;
28a4b0e5 2618 new->console.path = NULL;
63376d7d 2619 new->console.peer = -1;
fb87aa6a 2620 new->console.proxy.busy = -1;
36a94ce8 2621 new->console.proxy.ptx = -1;
41808e20 2622 new->console.proxy.pty = -1;
36a94ce8 2623 new->console.ptx = -1;
41808e20 2624 new->console.pty = -1;
63376d7d 2625 new->console.name[0] = '\0';
732375f5 2626 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2627 new->maincmd_fd = -1;
258f8051 2628 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2629 new->nbd_idx = -1;
54c30e29 2630 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2631 if (!new->rootfs.mount) {
53f3f048
SH
2632 free(new);
2633 return NULL;
2634 }
6e54330c 2635 new->rootfs.managed = true;
31f8b2fd 2636 new->rootfs.mntpt_fd = -EBADF;
953db219 2637 new->rootfs.dev_mntpt_fd = -EBADF;
a370f16b 2638 new->rootfs.dfd_root_host = -EBADF;
858377e4 2639 new->logfd = -1;
7b379ab3 2640 lxc_list_init(&new->cgroup);
54860ed0 2641 lxc_list_init(&new->cgroup2);
4bfb655e 2642 lxc_list_init(&new->devices);
7b379ab3
MN
2643 lxc_list_init(&new->network);
2644 lxc_list_init(&new->mount_list);
81810dd1 2645 lxc_list_init(&new->caps);
1fb86a7c 2646 lxc_list_init(&new->keepcaps);
f6d3e3e4 2647 lxc_list_init(&new->id_map);
46ad64ab
CB
2648 new->root_nsuid_map = NULL;
2649 new->root_nsgid_map = NULL;
f979ac15 2650 lxc_list_init(&new->includes);
4184c3e1 2651 lxc_list_init(&new->aliens);
7c661726 2652 lxc_list_init(&new->environment);
c6d09e15 2653 lxc_list_init(&new->limits);
7edd0540 2654 lxc_list_init(&new->sysctls);
61d7a733 2655 lxc_list_init(&new->procs);
44ae0fb6 2656 new->hooks_version = 0;
28d9e29e 2657 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2658 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2659 lxc_list_init(&new->groups);
d39b10eb 2660 lxc_list_init(&new->state_clients);
fe4de9a6 2661 new->lsm_aa_profile = NULL;
1800f924 2662 lxc_list_init(&new->lsm_aa_raw);
fe4de9a6 2663 new->lsm_se_context = NULL;
4fef78bc 2664 new->lsm_se_keyring_context = NULL;
8f818a84 2665 new->keyring_disable_session = false;
952b5031 2666 new->transient_procfs_mnt = false;
7a41e857
LT
2667 new->shmount.path_host = NULL;
2668 new->shmount.path_cont = NULL;
7b379ab3 2669
72bb04e4
PT
2670 /* if running in a new user namespace, init and COMMAND
2671 * default to running as UID/GID 0 when using lxc-execute */
2672 new->init_uid = 0;
2673 new->init_gid = 0;
43654d34 2674 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2675 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
70fd7fc9 2676 memset(&new->timens, 0, sizeof(struct timens_offsets));
c3e3c21a 2677 seccomp_conf_init(new);
72bb04e4 2678
7b379ab3 2679 return new;
089cd8b8
DL
2680}
2681
344c9d81 2682int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2683 size_t buf_size)
f6d3e3e4 2684{
f62cf1d4 2685 __do_close int fd = -EBADF;
76bcd422 2686 int ret;
6b5a54cd 2687 char path[PATH_MAX];
f6d3e3e4 2688
a19b974f 2689 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
f62cf1d4 2690 __do_close int setgroups_fd = -EBADF;
a19b974f 2691
6b5a54cd
CB
2692 ret = snprintf(path, PATH_MAX, "/proc/%d/setgroups", pid);
2693 if (ret < 0 || ret >= PATH_MAX)
a19b974f 2694 return -E2BIG;
a19b974f 2695
76bcd422 2696 setgroups_fd = open(path, O_WRONLY);
55022530
CB
2697 if (setgroups_fd < 0 && errno != ENOENT)
2698 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
a19b974f 2699
76bcd422
CB
2700 if (setgroups_fd >= 0) {
2701 ret = lxc_write_nointr(setgroups_fd, "deny\n",
2702 STRLITERALLEN("deny\n"));
55022530
CB
2703 if (ret != STRLITERALLEN("deny\n"))
2704 return log_error_errno(-1, errno, "Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
395b1a3e 2705 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2706 }
a19b974f
CB
2707 }
2708
6b5a54cd 2709 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid,
29053180 2710 idtype == ID_TYPE_UID ? 'u' : 'g');
6b5a54cd 2711 if (ret < 0 || ret >= PATH_MAX)
f6d3e3e4 2712 return -E2BIG;
29053180 2713
55022530
CB
2714 fd = open(path, O_WRONLY | O_CLOEXEC);
2715 if (fd < 0)
2716 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
29053180 2717
29053180 2718 ret = lxc_write_nointr(fd, buf, buf_size);
55022530
CB
2719 if (ret != buf_size)
2720 return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
2721 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2722
2723 return 0;
f6d3e3e4
SH
2724}
2725
6e50e704
CB
2726/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2727 *
2728 * @return 1 if functional binary was found
2729 * @return 0 if binary exists but is lacking privilege
2730 * @return -ENOENT if binary does not exist
2731 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2732 */
df6a2945
CB
2733static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2734{
48411df2 2735 __do_free char *path = NULL;
df6a2945
CB
2736 int ret;
2737 struct stat st;
df6a2945 2738
3275932b 2739 errno = EINVAL;
6e50e704 2740 if (cap != CAP_SETUID && cap != CAP_SETGID)
3275932b 2741 return -1;
6e50e704 2742
3275932b 2743 errno = ENOENT;
df6a2945
CB
2744 path = on_path(binary, NULL);
2745 if (!path)
3275932b 2746 return -1;
df6a2945
CB
2747
2748 ret = stat(path, &st);
3275932b
CB
2749 if (ret < 0)
2750 return -1;
df6a2945
CB
2751
2752 /* Check if the binary is setuid. */
55022530
CB
2753 if (st.st_mode & S_ISUID)
2754 return log_debug(1, "The binary \"%s\" does have the setuid bit set", path);
df6a2945 2755
0fd73091 2756#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2757 /* Check if it has the CAP_SETUID capability. */
2758 if ((cap & CAP_SETUID) &&
2759 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
55022530
CB
2760 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED))
2761 return log_debug(1, "The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
df6a2945
CB
2762
2763 /* Check if it has the CAP_SETGID capability. */
2764 if ((cap & CAP_SETGID) &&
2765 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
55022530
CB
2766 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED))
2767 return log_debug(1, "The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
0fd73091 2768#else
69924fff
CB
2769 /* If we cannot check for file capabilities we need to give the benefit
2770 * of the doubt. Otherwise we might fail even though all the necessary
2771 * file capabilities are set.
2772 */
55022530 2773 DEBUG("Cannot check for file capabilities as full capability support is missing. Manual intervention needed");
0fd73091 2774#endif
df6a2945 2775
3275932b 2776 return 1;
df6a2945
CB
2777}
2778
/* Callback that replaces the calling process with "/bin/sh -c <args>".
 *
 * @args Command line passed to the shell, as a NUL-terminated string.
 *
 * Only returns, with -1, when execl() fails.
 */
static int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	/* Reached only if execl() did not replace the process image. */
	return -1;
}
2784
f6d3e3e4
SH
2785int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2786{
0fd73091 2787 int fill, left;
986ef930 2788 char u_or_g;
4bc3b759 2789 char *pos;
6b5a54cd 2790 char cmd_output[PATH_MAX];
0fd73091
CB
2791 struct id_map *map;
2792 struct lxc_list *iterator;
2793 enum idtype type;
0fd73091 2794 int ret = 0, gidmap = 0, uidmap = 0;
c6ba8981
CB
2795 char mapbuf[STRLITERALLEN("new@idmap") + STRLITERALLEN(" ") +
2796 INTTYPE_TO_STRLEN(pid_t) + STRLITERALLEN(" ") +
2797 LXC_IDMAPLEN] = {0};
0fd73091 2798 bool had_entry = false, use_shadow = false;
c724025c
JC
2799 int hostuid, hostgid;
2800
2801 hostuid = geteuid();
2802 hostgid = getegid();
df6a2945
CB
2803
2804 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2805 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2806 * will protected it by preventing another user from being handed the
2807 * range by shadow.
2808 */
df6a2945 2809 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2810 if (uidmap == -ENOENT)
2811 WARN("newuidmap binary is missing");
2812 else if (!uidmap)
2813 WARN("newuidmap is lacking necessary privileges");
2814
df6a2945 2815 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2816 if (gidmap == -ENOENT)
2817 WARN("newgidmap binary is missing");
2818 else if (!gidmap)
2819 WARN("newgidmap is lacking necessary privileges");
2820
df6a2945 2821 if (uidmap > 0 && gidmap > 0) {
0fd73091 2822 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2823 use_shadow = true;
df6a2945 2824 } else {
99d43365
CB
2825 /* In case unprivileged users run application containers via
2826 * execute() or a start*() there are valid cases where they may
2827 * only want to map their own {g,u}id. Let's not block them from
2828 * doing so by requiring geteuid() == 0.
2829 */
2830 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2831 "write directly with euid %d", hostuid);
2832 }
2833
2834 /* Check if we really need to use newuidmap and newgidmap.
2835 * If the user is only remapping his own {g,u}id, we don't need it.
2836 */
2837 if (use_shadow && lxc_list_len(idmap) == 2) {
2838 use_shadow = false;
2839 lxc_list_for_each(iterator, idmap) {
2840 map = iterator->elem;
2841 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2842 map->nsid == hostuid && map->hostid == hostuid)
2843 continue;
2844 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2845 map->nsid == hostgid && map->hostid == hostgid)
2846 continue;
2847 use_shadow = true;
2848 break;
2849 }
0e6e3a41 2850 }
251d0d2a 2851
986ef930
CB
2852 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2853 type++, u_or_g = 'g') {
2854 pos = mapbuf;
2855
0e6e3a41 2856 if (use_shadow)
986ef930 2857 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2858
cf3ef16d 2859 lxc_list_for_each(iterator, idmap) {
251d0d2a 2860 map = iterator->elem;
cf3ef16d
SH
2861 if (map->idtype != type)
2862 continue;
2863
4bc3b759
CB
2864 had_entry = true;
2865
986ef930 2866 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2867 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2868 use_shadow ? " " : "", map->nsid,
2869 map->hostid, map->range,
0e6e3a41 2870 use_shadow ? "" : "\n");
55022530
CB
2871 /*
2872 * The kernel only takes <= 4k for writes to
2873 * /proc/<pid>/{g,u}id_map
2874 */
2875 if (fill <= 0 || fill >= left)
2876 return log_error_errno(-1, errno, "Too many %cid mappings defined", u_or_g);
4bc3b759 2877
cf3ef16d 2878 pos += fill;
251d0d2a 2879 }
cf3ef16d 2880 if (!had_entry)
4f7521b4 2881 continue;
cf3ef16d 2882
d85813cd 2883 /* Try to catch the output of new{g,u}idmap to make debugging
986ef930
CB
2884 * easier.
2885 */
2886 if (use_shadow) {
2887 ret = run_command(cmd_output, sizeof(cmd_output),
2888 lxc_map_ids_exec_wrapper,
2889 (void *)mapbuf);
55022530
CB
2890 if (ret < 0)
2891 return log_error(-1, "new%cidmap failed to write mapping \"%s\": %s", u_or_g, cmd_output, mapbuf);
54fbbeb5 2892 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2893 } else {
986ef930 2894 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
55022530
CB
2895 if (ret < 0)
2896 return log_error(-1, "Failed to write mapping: %s", mapbuf);
54fbbeb5 2897 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2898 }
986ef930
CB
2899
2900 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2901 }
251d0d2a 2902
986ef930 2903 return 0;
f6d3e3e4
SH
2904}
2905
234998b4
CB
2906/*
2907 * Return the host uid/gid to which the container root is mapped in val.
0b3a6504 2908 * Return true if id was found, false otherwise.
cf3ef16d 2909 */
234998b4 2910static id_t get_mapped_rootid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2911{
4160c3a0 2912 unsigned nsid;
0fd73091
CB
2913 struct id_map *map;
2914 struct lxc_list *it;
4160c3a0
CB
2915
2916 if (idtype == ID_TYPE_UID)
2917 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2918 else
2919 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 2920
0fd73091 2921 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2922 map = it->elem;
7b50c609 2923 if (map->idtype != idtype)
cf3ef16d 2924 continue;
4160c3a0 2925 if (map->nsid != nsid)
cf3ef16d 2926 continue;
234998b4 2927 return map->hostid;
cf3ef16d 2928 }
4160c3a0 2929
234998b4
CB
2930 if (idtype == ID_TYPE_UID)
2931 return LXC_INVALID_UID;
2932
2933 return LXC_INVALID_GID;
cf3ef16d
SH
2934}
2935
facdf925 2936int mapped_hostid(unsigned id, const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2937{
cf3ef16d 2938 struct id_map *map;
0fd73091
CB
2939 struct lxc_list *it;
2940
2941 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2942 map = it->elem;
2133f58c 2943 if (map->idtype != idtype)
cf3ef16d 2944 continue;
0fd73091 2945
cf3ef16d 2946 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2947 return (id - map->hostid) + map->nsid;
cf3ef16d 2948 }
0fd73091 2949
57d116ab 2950 return -1;
cf3ef16d
SH
2951}
2952
7581a82f 2953int find_unmapped_nsid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2954{
cf3ef16d 2955 struct id_map *map;
0fd73091 2956 struct lxc_list *it;
2133f58c 2957 unsigned int freeid = 0;
0fd73091 2958
cf3ef16d 2959again:
0fd73091 2960 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2961 map = it->elem;
2133f58c 2962 if (map->idtype != idtype)
cf3ef16d 2963 continue;
0fd73091 2964
cf3ef16d
SH
2965 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2966 freeid = map->nsid + map->range;
2967 goto again;
2968 }
2969 }
0fd73091 2970
cf3ef16d
SH
2971 return freeid;
2972}
2973
e1b9d6af
CB
2974/*
2975 * Mount a proc under @rootfs if proc self points to a pid other than
2976 * my own. This is needed to have a known-good proc mount for setting
2977 * up LSMs both at container startup and attach.
2978 *
e1b9d6af
CB
2979 * NOTE: not to be called from inside the container namespace!
2980 */
952b5031 2981static int lxc_transient_proc(struct lxc_rootfs *rootfs)
e1b9d6af 2982{
952b5031
CB
2983 __do_close int fd_proc = -EBADF;
2984 int link_to_pid, link_len, pid_self, ret;
2985 char link[INTTYPE_TO_STRLEN(pid_t) + 1];
e1b9d6af 2986
952b5031
CB
2987 link_len = readlinkat(rootfs->mntpt_fd, "proc/self", link, sizeof(link));
2988 if (link_len < 0) {
2989 ret = mkdirat(rootfs->mntpt_fd, "proc", 0000);
2990 if (ret < 0 && errno != EEXIST)
2991 return log_error_errno(-errno, errno, "Failed to create %d(proc)", rootfs->mntpt_fd);
e1b9d6af 2992
952b5031
CB
2993 goto domount;
2994 } else if (link_len >= sizeof(link)) {
2995 return log_error_errno(-EIO, EIO, "Truncated link target");
e1b9d6af 2996 }
952b5031 2997 link[link_len] = '\0';
e1b9d6af 2998
952b5031
CB
2999 pid_self = lxc_raw_getpid();
3000 INFO("Caller's PID is %d; /proc/self points to %s", pid_self, link);
e1b9d6af 3001
952b5031
CB
3002 ret = lxc_safe_int(link, &link_to_pid);
3003 if (ret)
3004 return log_error_errno(-ret, ret, "Failed to parse %s", link);
e1b9d6af 3005
952b5031
CB
3006 /* Correct procfs is already mounted. */
3007 if (link_to_pid == pid_self)
3008 return log_trace(0, "Correct procfs instance mounted");
e1b9d6af 3009
952b5031
CB
3010 fd_proc = open_at(rootfs->mntpt_fd, "proc", PROTECT_OPATH_DIRECTORY,
3011 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3012 if (fd_proc < 0)
3013 return log_error_errno(-errno, errno, "Failed to open transient procfs mountpoint");
e1b9d6af 3014
952b5031
CB
3015 ret = snprintf(rootfs->buf, sizeof(rootfs->buf), "/proc/self/fd/%d", fd_proc);
3016 if (ret < 0 || (size_t)ret >= sizeof(rootfs->buf))
3017 return ret_errno(EIO);
e1b9d6af 3018
952b5031 3019 ret = umount2(rootfs->buf, MNT_DETACH);
e1b9d6af 3020 if (ret < 0)
952b5031 3021 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", rootfs->buf);
e1b9d6af
CB
3022
3023domount:
3024 /* rootfs is NULL */
952b5031
CB
3025 if (!rootfs->path) {
3026 ret = mount("proc", rootfs->buf, "proc", 0, NULL);
3027 } else {
3028 ret = safe_mount_beneath_at(rootfs->mntpt_fd, "none", "proc", "proc", 0, NULL);
3029 if (ret < 0) {
3030 ret = snprintf(rootfs->buf, sizeof(rootfs->buf), "%s/proc", rootfs->path ? rootfs->mount : "");
3031 if (ret < 0 || (size_t)ret >= sizeof(rootfs->buf))
3032 return ret_errno(EIO);
3033
3034 ret = safe_mount("proc", rootfs->buf, "proc", 0, NULL, rootfs->mount);
3035 }
3036 }
e1b9d6af 3037 if (ret < 0)
952b5031 3038 return log_error_errno(-1, errno, "Failed to mount temporary procfs");
e1b9d6af 3039
952b5031 3040 INFO("Created transient procfs mount");
e1b9d6af
CB
3041 return 1;
3042}
3043
943144d9 3044/* NOTE: Must not be called from inside the container namespace! */
59eac805 3045static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3046{
3047 int mounted;
3048
952b5031 3049 mounted = lxc_transient_proc(&conf->rootfs);
5112cd70 3050 if (mounted == -1) {
01958b1f 3051 /* continue only if there is no rootfs */
943144d9 3052 if (conf->rootfs.path)
952b5031 3053 return log_error_errno(-EPERM, EPERM, "Failed to create transient procfs mount");
5112cd70 3054 } else if (mounted == 1) {
952b5031 3055 conf->transient_procfs_mnt = true;
5112cd70 3056 }
943144d9 3057
5112cd70
SH
3058 return 0;
3059}
3060
3061void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3062{
952b5031
CB
3063 if (lxc_conf->transient_procfs_mnt) {
3064 (void)umount2("/proc", MNT_DETACH);
3065 lxc_conf->transient_procfs_mnt = false;
3066 }
5112cd70
SH
3067}
3068
9e61fb1f
CB
3069/* Walk /proc/mounts and change any shared entries to dependent mounts. */
3070void turn_into_dependent_mounts(void)
e995d7a2 3071{
7969675f 3072 __do_free char *line = NULL;
003be47b 3073 __do_fclose FILE *f = NULL;
f62cf1d4 3074 __do_close int memfd = -EBADF, mntinfo_fd = -EBADF;
e995d7a2 3075 size_t len = 0;
a39fc34b
CB
3076 ssize_t copied;
3077 int ret;
e995d7a2 3078
6a49f05e 3079 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
3080 if (mntinfo_fd < 0) {
3081 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 3082 return;
fea3b91d 3083 }
6a49f05e
CB
3084
3085 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3086 if (memfd < 0) {
3087 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3088
3089 if (errno != ENOSYS) {
fea3b91d 3090 SYSERROR("Failed to create temporary in-memory file");
6a49f05e
CB
3091 return;
3092 }
3093
3094 memfd = lxc_make_tmpfile(template, true);
fea3b91d 3095 if (memfd < 0) {
fea3b91d
DJ
3096 WARN("Failed to create temporary file");
3097 return;
3098 }
6a49f05e
CB
3099 }
3100
a39fc34b 3101 copied = fd_to_fd(mntinfo_fd, memfd);
6a49f05e 3102 if (copied < 0) {
fea3b91d 3103 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
3104 return;
3105 }
6a49f05e 3106
6a49f05e
CB
3107 ret = lseek(memfd, 0, SEEK_SET);
3108 if (ret < 0) {
fea3b91d 3109 SYSERROR("Failed to reset file descriptor offset");
6a49f05e
CB
3110 return;
3111 }
3112
4110345b 3113 f = fdopen(memfd, "re");
e995d7a2 3114 if (!f) {
003be47b 3115 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark all shared. Continuing");
e995d7a2
SH
3116 return;
3117 }
3118
003be47b
CB
3119 /*
3120 * After a successful fdopen() memfd will be closed when calling
3121 * fclose(f). Calling close(memfd) afterwards is undefined.
3122 */
3123 move_fd(memfd);
3124
e995d7a2 3125 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3126 char *opts, *target;
3127
e995d7a2
SH
3128 target = get_field(line, 4);
3129 if (!target)
3130 continue;
0fd73091 3131
e995d7a2
SH
3132 opts = get_field(target, 2);
3133 if (!opts)
3134 continue;
0fd73091 3135
e995d7a2
SH
3136 null_endofword(opts);
3137 if (!strstr(opts, "shared"))
3138 continue;
0fd73091 3139
e995d7a2 3140 null_endofword(target);
0fd73091
CB
3141 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3142 if (ret < 0) {
9e61fb1f 3143 SYSERROR("Failed to recursively turn old root mount tree into dependent mount. Continuing...");
6a49f05e 3144 continue;
e995d7a2 3145 }
9e61fb1f 3146 TRACE("Recursively turned old root mount tree into dependent mount");
e995d7a2 3147 }
9e61fb1f 3148 TRACE("Turned all mount table entries into dependent mount");
e995d7a2
SH
3149}
3150
794248d0 3151static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3152{
3153 int ret;
794248d0
CB
3154 char *p;
3155 char path[PATH_MAX], destpath[PATH_MAX];
3156 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3157
3158 /* If init exists in the container, don't bind mount a static one */
3159 p = choose_init(conf->rootfs.mount);
3160 if (p) {
22f835ba 3161 __do_free char *old = p;
41089848
TA
3162
3163 p = strdup(old + strlen(conf->rootfs.mount));
41089848
TA
3164 if (!p)
3165 return -ENOMEM;
3166
3167 INFO("Found existing init at \"%s\"", p);
3168 goto out;
9d9c111c 3169 }
2322903b
SH
3170
3171 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
0fd73091 3172 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3173 return -1;
2322903b 3174
55022530
CB
3175 if (!file_exists(path))
3176 return log_error_errno(-1, errno, "The file \"%s\" does not exist on host", path);
2322903b 3177
794248d0 3178 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
0fd73091 3179 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3180 return -1;
2322903b
SH
3181
3182 if (!file_exists(destpath)) {
794248d0 3183 ret = mknod(destpath, S_IFREG | 0000, 0);
55022530
CB
3184 if (ret < 0 && errno != EEXIST)
3185 return log_error_errno(-1, errno, "Failed to create dummy \"%s\" file as bind mount target", destpath);
2322903b
SH
3186 }
3187
592fd47a 3188 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
55022530
CB
3189 if (ret < 0)
3190 return log_error_errno(-1, errno, "Failed to bind mount lxc.init.static into container");
8353b4c9 3191
794248d0
CB
3192 p = strdup(destpath + strlen(conf->rootfs.mount));
3193 if (!p)
3194 return -ENOMEM;
794248d0 3195
8353b4c9 3196 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3197out:
4b5b3a2a 3198 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3199 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3200 return 0;
2322903b
SH
3201}
3202
0fd73091
CB
3203/* This does the work of remounting / if it is shared, calling the container
3204 * pre-mount hooks, and mounting the rootfs.
35120d9c 3205 */
8ce1abc2
CB
3206int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3207 const char *lxcpath)
0ad19a3f 3208{
0fd73091
CB
3209 int ret;
3210
a370f16b
CB
3211 conf->rootfs.dfd_root_host = open_at(-EBADF, "/", PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE, 0);
3212 if (conf->rootfs.dfd_root_host < 0)
3213 return log_error_errno(-errno, errno, "Failed to open \"/\"");
3214
35120d9c 3215 if (conf->rootfs_setup) {
35120d9c 3216 const char *path = conf->rootfs.mount;
0fd73091
CB
3217
3218 /* The rootfs was set up in another namespace. bind-mount it to
3219 * give us a mount in our own ns so we can pivot_root to it
3220 */
3221 ret = mount(path, path, "rootfs", MS_BIND, NULL);
55022530
CB
3222 if (ret < 0)
3223 return log_error(-1, "Failed to bind mount container / onto itself");
0fd73091 3224
26ea5533
CB
3225 conf->rootfs.mntpt_fd = openat(-EBADF, path, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOCTTY);
3226 if (conf->rootfs.mntpt_fd < 0)
3227 return log_error_errno(-errno, errno, "Failed to open file descriptor for container rootfs");
3228
55022530 3229 return log_trace(0, "Bind mounted container / onto itself");
35120d9c 3230 }
d4ef7c50 3231
9e61fb1f 3232 turn_into_dependent_mounts();
e995d7a2 3233
0fd73091 3234 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
55022530
CB
3235 if (ret < 0)
3236 return log_error(-1, "Failed to run pre-mount hooks");
35120d9c 3237
8ce1abc2 3238 ret = lxc_mount_rootfs(conf);
55022530
CB
3239 if (ret < 0)
3240 return log_error(-1, "Failed to setup rootfs for");
35120d9c
SH
3241
3242 conf->rootfs_setup = true;
3243 return 0;
3244}
3245
1c1c7051
SH
3246static bool verify_start_hooks(struct lxc_conf *conf)
3247{
6b5a54cd 3248 char path[PATH_MAX];
0fd73091
CB
3249 struct lxc_list *it;
3250
3251 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3252 int ret;
0fd73091 3253 char *hookname = it->elem;
1c1c7051 3254
6b5a54cd 3255 ret = snprintf(path, PATH_MAX, "%s%s",
0fd73091
CB
3256 conf->rootfs.path ? conf->rootfs.mount : "",
3257 hookname);
6b5a54cd 3258 if (ret < 0 || ret >= PATH_MAX)
1c1c7051 3259 return false;
0fd73091 3260
75193660 3261 ret = access(path, X_OK);
55022530
CB
3262 if (ret < 0)
3263 return log_error_errno(false, errno, "Start hook \"%s\" not found in container", hookname);
0fd73091 3264
6a0c909a 3265 return true;
1c1c7051
SH
3266 }
3267
3268 return true;
3269}
3270
4b5b3a2a
TA
3271static bool execveat_supported(void)
3272{
f40988c7 3273 execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
4b5b3a2a
TA
3274 if (errno == ENOSYS)
3275 return false;
3276
3277 return true;
4b5b3a2a
TA
3278}
3279
20502652
CB
3280static int lxc_setup_boot_id(void)
3281{
3282 int ret;
3283 const char *boot_id_path = "/proc/sys/kernel/random/boot_id";
3284 const char *mock_boot_id_path = "/dev/.lxc-boot-id";
3285 lxc_id128_t n;
3286
3287 if (access(boot_id_path, F_OK))
3288 return 0;
3289
3290 memset(&n, 0, sizeof(n));
3291 if (lxc_id128_randomize(&n)) {
3292 SYSERROR("Failed to generate random data for uuid");
3293 return -1;
3294 }
3295
3296 ret = lxc_id128_write(mock_boot_id_path, n);
3297 if (ret < 0) {
3298 SYSERROR("Failed to write uuid to %s", mock_boot_id_path);
3299 return -1;
3300 }
3301
3302 ret = chmod(mock_boot_id_path, 0444);
3303 if (ret < 0) {
3304 SYSERROR("Failed to chown %s", mock_boot_id_path);
3305 (void)unlink(mock_boot_id_path);
3306 return -1;
3307 }
3308
3309 ret = mount(mock_boot_id_path, boot_id_path, NULL, MS_BIND, NULL);
3310 if (ret < 0) {
3311 SYSERROR("Failed to mount %s to %s", mock_boot_id_path,
3312 boot_id_path);
3313 (void)unlink(mock_boot_id_path);
3314 return -1;
3315 }
3316
3317 ret = mount(NULL, boot_id_path, NULL,
3318 (MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOSUID | MS_NOEXEC |
3319 MS_NODEV),
3320 NULL);
3321 if (ret < 0) {
3322 SYSERROR("Failed to remount %s read-only", boot_id_path);
3323 (void)unlink(mock_boot_id_path);
3324 return -1;
3325 }
3326
3327 return 0;
3328}
3329
af04d847 3330static int lxc_setup_keyring(struct lsm_ops *lsm_ops, const struct lxc_conf *conf)
d701d729
CB
3331{
3332 key_serial_t keyring;
3333 int ret = 0;
3334
3335 if (conf->lsm_se_keyring_context)
af04d847 3336 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_keyring_context);
d701d729 3337 else if (conf->lsm_se_context)
af04d847 3338 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_context);
d701d729
CB
3339 if (ret < 0)
3340 return log_error_errno(-1, errno, "Failed to set keyring context");
3341
3342 /*
3343 * Try to allocate a new session keyring for the container to prevent
3344 * information leaks.
3345 */
3346 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
3347 prctl_arg(0), prctl_arg(0), prctl_arg(0));
3348 if (keyring < 0) {
3349 switch (errno) {
3350 case ENOSYS:
3351 DEBUG("The keyctl() syscall is not supported or blocked");
3352 break;
3353 case EACCES:
3354 __fallthrough;
3355 case EPERM:
3356 DEBUG("Failed to access kernel keyring. Continuing...");
3357 break;
3358 default:
3359 SYSERROR("Failed to create kernel keyring");
3360 break;
3361 }
3362 }
3363
3364 return ret;
3365}
3366
3b988b33 3367int lxc_setup(struct lxc_handler *handler)
35120d9c 3368{
41808e20 3369 __do_close int pty_mnt_fd = -EBADF;
2187efd3 3370 int ret;
0fd73091 3371 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3372 struct lxc_conf *lxc_conf = handler->conf;
35120d9c 3373
8ce1abc2 3374 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
55022530
CB
3375 if (ret < 0)
3376 return log_error(-1, "Failed to setup rootfs");
35120d9c 3377
b87ee312 3378 if (handler->nsfd[LXC_NS_UTS] == -EBADF) {
8353b4c9 3379 ret = setup_utsname(lxc_conf->utsname);
55022530
CB
3380 if (ret < 0)
3381 return log_error(-1, "Failed to setup the utsname %s", name);
0ad19a3f 3382 }
3383
8f818a84 3384 if (!lxc_conf->keyring_disable_session) {
d701d729 3385 ret = lxc_setup_keyring(handler->lsm_ops, lxc_conf);
8f818a84 3386 if (ret < 0)
d701d729 3387 return log_error(-1, "Failed to setup container keyring");
8f818a84 3388 }
b25291da 3389
e389f2af
CB
3390 if (handler->ns_clone_flags & CLONE_NEWNET) {
3391 ret = lxc_setup_network_in_child_namespaces(lxc_conf,
3392 &lxc_conf->network);
55022530
CB
3393 if (ret < 0)
3394 return log_error(-1, "Failed to setup network");
0ad19a3f 3395
e389f2af 3396 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
55022530
CB
3397 if (ret < 0)
3398 return log_error(-1, "Failed to send network device names and ifindices to parent");
790255cf
CB
3399 }
3400
efbfe93f 3401 if (wants_console(&lxc_conf->console)) {
41808e20 3402 pty_mnt_fd = open_tree(-EBADF, lxc_conf->console.name,
efbfe93f 3403 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
41808e20 3404 if (pty_mnt_fd < 0)
efbfe93f
CB
3405 SYSTRACE("Failed to create detached mount for container's console \"%s\"",
3406 lxc_conf->console.name);
3407 else
3408 TRACE("Created detached mount for container's console \"%s\"",
3409 lxc_conf->console.name);
3410 }
cf68ffd9 3411
bc6928ff 3412 if (lxc_conf->autodev > 0) {
63012bdd 3413 ret = mount_autodev(name, &lxc_conf->rootfs, lxc_conf->autodevtmpfssize, lxcpath);
55022530
CB
3414 if (ret < 0)
3415 return log_error(-1, "Failed to mount \"/dev\"");
c6883f38
SH
3416 }
3417
ce011f53 3418 lxc_conf->rootfs.dev_mntpt_fd = open_at(lxc_conf->rootfs.mntpt_fd, "dev",
fdb57ab4
CB
3419 PROTECT_OPATH_DIRECTORY,
3420 PROTECT_LOOKUP_BENEATH_XDEV, 0);
953db219
CB
3421 if (lxc_conf->rootfs.dev_mntpt_fd < 0 && errno != ENOENT)
3422 return log_error_errno(-errno, errno, "Failed to open \"/dev\"");
3423
8353b4c9
CB
3424 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3425 * need to wait until other stuff has finished.
368bbc02 3426 */
8353b4c9 3427 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
55022530
CB
3428 if (ret < 0)
3429 return log_error(-1, "Failed to setup first automatic mounts");
368bbc02 3430
8353b4c9 3431 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
55022530
CB
3432 if (ret < 0)
3433 return log_error(-1, "Failed to setup mounts");
576f946d 3434
c631115d
FA
3435 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3436 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3437 &lxc_conf->mount_list, name, lxcpath);
55022530
CB
3438 if (ret < 0)
3439 return log_error(-1, "Failed to setup mount entries");
c631115d
FA
3440 }
3441
8353b4c9 3442 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3443 if (execveat_supported()) {
3444 int fd;
f4bea7cc 3445 char path[STRLITERALLEN(SBINDIR) + STRLITERALLEN("/init.lxc.static") + 1];
4b5b3a2a 3446
f4bea7cc 3447 ret = snprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
55022530
CB
3448 if (ret < 0 || ret >= PATH_MAX)
3449 return log_error(-1, "Path to init.lxc.static too long");
4b5b3a2a 3450
f4bea7cc 3451 fd = open(path, O_NOCTTY | O_NOFOLLOW | O_CLOEXEC | O_PATH);
55022530
CB
3452 if (fd < 0)
3453 return log_error_errno(-1, errno, "Unable to open lxc.init.static");
4b5b3a2a
TA
3454
3455 ((struct execute_args *)handler->data)->init_fd = fd;
3456 ((struct execute_args *)handler->data)->init_path = NULL;
3457 } else {
3458 ret = lxc_execute_bind_init(handler);
55022530
CB
3459 if (ret < 0)
3460 return log_error(-1, "Failed to bind-mount the lxc init system");
8353b4c9
CB
3461 }
3462 }
2322903b 3463
8353b4c9
CB
3464 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3465 * mounted. It is guaranteed to be mounted now either through
3466 * automatically or via fstab entries.
368bbc02 3467 */
8353b4c9 3468 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
55022530
CB
3469 if (ret < 0)
3470 return log_error(-1, "Failed to setup remaining automatic mounts");
368bbc02 3471
8353b4c9 3472 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
55022530
CB
3473 if (ret < 0)
3474 return log_error(-1, "Failed to run mount hooks");
773fb9ca 3475
bc6928ff 3476 if (lxc_conf->autodev > 0) {
8353b4c9 3477 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
55022530
CB
3478 if (ret < 0)
3479 return log_error(-1, "Failed to run autodev hooks");
06749971 3480
8353b4c9 3481 ret = lxc_fill_autodev(&lxc_conf->rootfs);
55022530
CB
3482 if (ret < 0)
3483 return log_error(-1, "Failed to populate \"/dev\"");
91c3830e 3484 }
368bbc02 3485
75193660 3486 /* Make sure any start hooks are in the container */
55022530
CB
3487 if (!verify_start_hooks(lxc_conf))
3488 return log_error(-1, "Failed to verify start hooks");
75193660 3489
cf68ffd9
CB
3490 ret = lxc_create_tmp_proc_mount(lxc_conf);
3491 if (ret < 0)
3492 return log_error(-1, "Failed to \"/proc\" LSMs");
3493
ed8704d0 3494 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
41808e20 3495 lxc_conf->ttys.dir, pty_mnt_fd);
55022530
CB
3496 if (ret < 0)
3497 return log_error(-1, "Failed to setup console");
6e590161 3498
ed8704d0 3499 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
55022530
CB
3500 if (ret < 0)
3501 return log_error(-1, "Failed to setup \"/dev\" symlinks");
69aa6655 3502
8ce1abc2 3503 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
55022530
CB
3504 if (ret < 0)
3505 return log_error(-1, "Failed to pivot root into rootfs");
ed502555 3506
20502652
CB
3507 /* Setting the boot-id is best-effort for now. */
3508 if (lxc_conf->autodev > 0)
3509 (void)lxc_setup_boot_id();
3510
68f3899e 3511 ret = lxc_setup_devpts_child(handler);
55022530
CB
3512 if (ret < 0)
3513 return log_error(-1, "Failed to setup new devpts instance");
3c26f34e 3514
2187efd3
CB
3515 ret = lxc_create_ttys(handler);
3516 if (ret < 0)
e8bd4e43 3517 return -1;
e8bd4e43 3518
8353b4c9 3519 ret = setup_personality(lxc_conf->personality);
55022530
CB
3520 if (ret < 0)
3521 return log_error(-1, "Failed to set personality");
cccc74b5 3522
8353b4c9
CB
3523 /* Set sysctl value to a path under /proc/sys as determined from the
3524 * key. For e.g. net.ipv4.ip_forward translated to
3525 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3526 */
3527 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3528 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
55022530
CB
3529 if (ret < 0)
3530 return log_error(-1, "Failed to setup sysctl parameters");
7edd0540
L
3531 }
3532
97a8f74f 3533 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
55022530
CB
3534 if (!lxc_list_empty(&lxc_conf->caps))
3535 return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
8353b4c9 3536
55022530
CB
3537 if (dropcaps_except(&lxc_conf->keepcaps))
3538 return log_error(-1, "Failed to keep capabilities");
97a8f74f 3539 } else if (setup_caps(&lxc_conf->caps)) {
55022530 3540 return log_error(-1, "Failed to drop capabilities");
81810dd1
DL
3541 }
3542
953db219
CB
3543 close_prot_errno_disarm(lxc_conf->rootfs.mntpt_fd)
3544 close_prot_errno_disarm(lxc_conf->rootfs.dev_mntpt_fd)
a370f16b 3545 close_prot_errno_disarm(lxc_conf->rootfs.dfd_root_host)
8353b4c9 3546 NOTICE("The container \"%s\" is set up", name);
cd54d859 3547
0ad19a3f 3548 return 0;
3549}
26ddeedd 3550
3f60c2f7 3551int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3552 char *argv[])
26ddeedd 3553{
26ddeedd 3554 struct lxc_list *it;
3ea957c6
RK
3555 int which;
3556
3557 for (which = 0; which < NUM_LXC_HOOKS; which ++) {
3558 if (strcmp(hookname, lxchook_names[which]) == 0)
3559 break;
3560 }
3561
3562 if (which >= NUM_LXC_HOOKS)
26ddeedd 3563 return -1;
3f60c2f7 3564
0fd73091 3565 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3566 int ret;
3f60c2f7
CB
3567 char *hook = it->elem;
3568
3569 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3570 hookname, argv);
3f60c2f7
CB
3571 if (ret < 0)
3572 return -1;
26ddeedd 3573 }
3f60c2f7 3574
26ddeedd
SH
3575 return 0;
3576}
72d0e1cb 3577
72d0e1cb
SG
3578int lxc_clear_config_caps(struct lxc_conf *c)
3579{
1a0e70ac 3580 struct lxc_list *it, *next;
72d0e1cb 3581
0fd73091 3582 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3583 lxc_list_del(it);
3584 free(it->elem);
3585 free(it);
3586 }
0fd73091 3587
72d0e1cb
SG
3588 return 0;
3589}
3590
c7e345ae
CB
3591static int lxc_free_idmap(struct lxc_list *id_map)
3592{
27c27d73
SH
3593 struct lxc_list *it, *next;
3594
46bc6f2a 3595 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3596 lxc_list_del(it);
3597 free(it->elem);
3598 free(it);
3599 }
c7e345ae 3600
27c27d73
SH
3601 return 0;
3602}
7e621263
CB
3603
3604static int __lxc_free_idmap(struct lxc_list *id_map)
3605{
3606 lxc_free_idmap(id_map);
3607 free(id_map);
3608 return 0;
3609}
3610define_cleanup_function(struct lxc_list *, __lxc_free_idmap);
27c27d73 3611
4355ab5f
SH
3612int lxc_clear_idmaps(struct lxc_conf *c)
3613{
3614 return lxc_free_idmap(&c->id_map);
3615}
3616
1fb86a7c
SH
3617int lxc_clear_config_keepcaps(struct lxc_conf *c)
3618{
0fd73091 3619 struct lxc_list *it, *next;
1fb86a7c 3620
0fd73091 3621 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3622 lxc_list_del(it);
3623 free(it->elem);
3624 free(it);
3625 }
0fd73091 3626
1fb86a7c
SH
3627 return 0;
3628}
3629
a3ed9b81 3630int lxc_clear_namespace(struct lxc_conf *c)
3631{
3632 int i;
3633 for (i = 0; i < LXC_NS_MAX; i++) {
3634 free(c->ns_share[i]);
3635 c->ns_share[i] = NULL;
3636 }
3637 return 0;
3638}
3639
54860ed0 3640int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3641{
54860ed0 3642 char *global_token, *namespaced_token;
ab1a6cac 3643 size_t namespaced_token_len;
54860ed0 3644 struct lxc_list *it, *next, *list;
ab1a6cac 3645 const char *k = key;
54860ed0 3646 bool all = false;
72d0e1cb 3647
54860ed0
CB
3648 if (version == CGROUP2_SUPER_MAGIC) {
3649 global_token = "lxc.cgroup2";
3650 namespaced_token = "lxc.cgroup2.";
6333c915 3651 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
54860ed0
CB
3652 list = &c->cgroup2;
3653 } else if (version == CGROUP_SUPER_MAGIC) {
3654 global_token = "lxc.cgroup";
3655 namespaced_token = "lxc.cgroup.";
6333c915 3656 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
54860ed0
CB
3657 list = &c->cgroup;
3658 } else {
ab1a6cac 3659 return -EINVAL;
54860ed0
CB
3660 }
3661
3662 if (strcmp(key, global_token) == 0)
72d0e1cb 3663 all = true;
6333c915 3664 else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
ab1a6cac 3665 k += namespaced_token_len;
a6390f01 3666 else
ab1a6cac 3667 return -EINVAL;
72d0e1cb 3668
0fd73091 3669 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3670 struct lxc_cgroup *cg = it->elem;
54860ed0 3671
72d0e1cb
SG
3672 if (!all && strcmp(cg->subsystem, k) != 0)
3673 continue;
54860ed0 3674
72d0e1cb
SG
3675 lxc_list_del(it);
3676 free(cg->subsystem);
3677 free(cg->value);
3678 free(cg);
3679 free(it);
3680 }
e409b214 3681
72d0e1cb
SG
3682 return 0;
3683}
3684
4bfb655e
CB
3685static void lxc_clear_devices(struct lxc_conf *conf)
3686{
3687 struct lxc_list *list = &conf->devices;
3688 struct lxc_list *it, *next;
3689
3690 lxc_list_for_each_safe(it, list, next) {
3691 lxc_list_del(it);
3692 free(it);
3693 }
3694}
3695
c6d09e15
WB
3696int lxc_clear_limits(struct lxc_conf *c, const char *key)
3697{
3698 struct lxc_list *it, *next;
c6d09e15 3699 const char *k = NULL;
0fd73091 3700 bool all = false;
c6d09e15 3701
b668653c 3702 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3703 all = true;
6333c915
CB
3704 else if (strncmp(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")) == 0)
3705 k = key + STRLITERALLEN("lxc.limit.");
3706 else if (strncmp(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")) == 0)
3707 k = key + STRLITERALLEN("lxc.prlimit.");
c6d09e15
WB
3708 else
3709 return -1;
3710
0fd73091 3711 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3712 struct lxc_limit *lim = it->elem;
0fd73091 3713
c6d09e15
WB
3714 if (!all && strcmp(lim->resource, k) != 0)
3715 continue;
0fd73091 3716
c6d09e15
WB
3717 lxc_list_del(it);
3718 free(lim->resource);
3719 free(lim);
3720 free(it);
3721 }
b668653c 3722
c6d09e15
WB
3723 return 0;
3724}
3725
7edd0540
L
3726int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3727{
3728 struct lxc_list *it, *next;
7edd0540 3729 const char *k = NULL;
0fd73091 3730 bool all = false;
7edd0540
L
3731
3732 if (strcmp(key, "lxc.sysctl") == 0)
3733 all = true;
6333c915
CB
3734 else if (strncmp(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")) == 0)
3735 k = key + STRLITERALLEN("lxc.sysctl.");
7edd0540
L
3736 else
3737 return -1;
3738
0fd73091 3739 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3740 struct lxc_sysctl *elem = it->elem;
0fd73091 3741
7edd0540
L
3742 if (!all && strcmp(elem->key, k) != 0)
3743 continue;
0fd73091 3744
7edd0540
L
3745 lxc_list_del(it);
3746 free(elem->key);
3747 free(elem->value);
3748 free(elem);
3749 free(it);
3750 }
0fd73091 3751
7edd0540
L
3752 return 0;
3753}
3754
61d7a733
YT
3755int lxc_clear_procs(struct lxc_conf *c, const char *key)
3756{
0fd73091 3757 struct lxc_list *it, *next;
61d7a733 3758 const char *k = NULL;
0fd73091 3759 bool all = false;
61d7a733
YT
3760
3761 if (strcmp(key, "lxc.proc") == 0)
3762 all = true;
6333c915
CB
3763 else if (strncmp(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")) == 0)
3764 k = key + STRLITERALLEN("lxc.proc.");
61d7a733
YT
3765 else
3766 return -1;
3767
0fd73091 3768 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3769 struct lxc_proc *proc = it->elem;
0fd73091 3770
61d7a733
YT
3771 if (!all && strcmp(proc->filename, k) != 0)
3772 continue;
0fd73091 3773
61d7a733
YT
3774 lxc_list_del(it);
3775 free(proc->filename);
3776 free(proc->value);
3777 free(proc);
3778 free(it);
3779 }
3780
3781 return 0;
3782}
3783
ee1e7aa0
SG
3784int lxc_clear_groups(struct lxc_conf *c)
3785{
0fd73091 3786 struct lxc_list *it, *next;
ee1e7aa0 3787
0fd73091 3788 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3789 lxc_list_del(it);
3790 free(it->elem);
3791 free(it);
3792 }
0fd73091 3793
ee1e7aa0
SG
3794 return 0;
3795}
3796
ab799c0b
SG
3797int lxc_clear_environment(struct lxc_conf *c)
3798{
0fd73091 3799 struct lxc_list *it, *next;
ab799c0b 3800
0fd73091 3801 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3802 lxc_list_del(it);
3803 free(it->elem);
3804 free(it);
3805 }
0fd73091 3806
ab799c0b
SG
3807 return 0;
3808}
3809
72d0e1cb
SG
3810int lxc_clear_mount_entries(struct lxc_conf *c)
3811{
0fd73091 3812 struct lxc_list *it, *next;
72d0e1cb 3813
0fd73091 3814 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3815 lxc_list_del(it);
3816 free(it->elem);
3817 free(it);
3818 }
0fd73091 3819
72d0e1cb
SG
3820 return 0;
3821}
3822
b099e9e9
SH
3823int lxc_clear_automounts(struct lxc_conf *c)
3824{
3825 c->auto_mounts = 0;
3826 return 0;
3827}
3828
12a50cc6 3829int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3830{
72d0e1cb 3831 int i;
0fd73091
CB
3832 struct lxc_list *it, *next;
3833 const char *k = NULL;
3834 bool all = false, done = false;
72d0e1cb 3835
17ed13a3
SH
3836 if (strcmp(key, "lxc.hook") == 0)
3837 all = true;
6333c915
CB
3838 else if (strncmp(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")) == 0)
3839 k = key + STRLITERALLEN("lxc.hook.");
a6390f01
WB
3840 else
3841 return -1;
17ed13a3 3842
0fd73091 3843 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 3844 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 3845 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3846 lxc_list_del(it);
3847 free(it->elem);
3848 free(it);
3849 }
0fd73091 3850
17ed13a3 3851 done = true;
72d0e1cb
SG
3852 }
3853 }
17ed13a3 3854
55022530
CB
3855 if (!done)
3856 return log_error(-1, "Invalid hook key: %s", key);
0fd73091 3857
72d0e1cb
SG
3858 return 0;
3859}
8eb5694b 3860
4184c3e1
SH
3861static inline void lxc_clear_aliens(struct lxc_conf *conf)
3862{
0fd73091 3863 struct lxc_list *it, *next;
4184c3e1 3864
0fd73091 3865 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
3866 lxc_list_del(it);
3867 free(it->elem);
3868 free(it);
3869 }
3870}
3871
c7b15d1e 3872void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 3873{
0fd73091 3874 struct lxc_list *it, *next;
f979ac15 3875
0fd73091 3876 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
3877 lxc_list_del(it);
3878 free(it->elem);
3879 free(it);
3880 }
3881}
3882
1800f924
WB
3883int lxc_clear_apparmor_raw(struct lxc_conf *c)
3884{
3885 struct lxc_list *it, *next;
3886
3887 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
3888 lxc_list_del(it);
3889 free(it->elem);
3890 free(it);
3891 }
3892
3893 return 0;
3894}
3895
8eb5694b
SH
3896void lxc_conf_free(struct lxc_conf *conf)
3897{
3898 if (!conf)
3899 return;
0fd73091 3900
858377e4
SH
3901 if (current_config == conf)
3902 current_config = NULL;
aed105d5 3903 lxc_terminal_conf_free(&conf->console);
f10fad2f 3904 free(conf->rootfs.mount);
b3b8c97f 3905 free(conf->rootfs.bdev_type);
f10fad2f
ME
3906 free(conf->rootfs.options);
3907 free(conf->rootfs.path);
9dd75981 3908 free(conf->rootfs.data);
31f8b2fd 3909 close_prot_errno_disarm(conf->rootfs.mntpt_fd);
953db219 3910 close_prot_errno_disarm(conf->rootfs.dev_mntpt_fd);
a370f16b 3911 close_prot_errno_disarm(conf->rootfs.dfd_root_host);
f10fad2f 3912 free(conf->logfile);
858377e4
SH
3913 if (conf->logfd != -1)
3914 close(conf->logfd);
f10fad2f 3915 free(conf->utsname);
885766f5
CB
3916 free(conf->ttys.dir);
3917 free(conf->ttys.tty_names);
f10fad2f
ME
3918 free(conf->fstab);
3919 free(conf->rcfile);
5cda27c1 3920 free(conf->execute_cmd);
f10fad2f 3921 free(conf->init_cmd);
3c491553 3922 free(conf->init_cwd);
6b0d5538 3923 free(conf->unexpanded_config);
76d0127f 3924 free(conf->syslog);
c302b476 3925 lxc_free_networks(&conf->network);
f10fad2f 3926 free(conf->lsm_aa_profile);
1800f924 3927 free(conf->lsm_aa_profile_computed);
f10fad2f 3928 free(conf->lsm_se_context);
c3e3c21a 3929 lxc_seccomp_free(&conf->seccomp);
8eb5694b 3930 lxc_clear_config_caps(conf);
1fb86a7c 3931 lxc_clear_config_keepcaps(conf);
54860ed0
CB
3932 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
3933 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
4bfb655e 3934 lxc_clear_devices(conf);
17ed13a3 3935 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3936 lxc_clear_mount_entries(conf);
27c27d73 3937 lxc_clear_idmaps(conf);
ee1e7aa0 3938 lxc_clear_groups(conf);
f979ac15 3939 lxc_clear_includes(conf);
761d81ca 3940 lxc_clear_aliens(conf);
ab799c0b 3941 lxc_clear_environment(conf);
240d4b74 3942 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 3943 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 3944 lxc_clear_procs(conf, "lxc.proc");
1800f924 3945 lxc_clear_apparmor_raw(conf);
a3ed9b81 3946 lxc_clear_namespace(conf);
43654d34 3947 free(conf->cgroup_meta.dir);
a900cbaf 3948 free(conf->cgroup_meta.monitor_dir);
eb60b564 3949 free(conf->cgroup_meta.monitor_pivot_dir);
a900cbaf
WB
3950 free(conf->cgroup_meta.container_dir);
3951 free(conf->cgroup_meta.namespace_dir);
43654d34 3952 free(conf->cgroup_meta.controllers);
7a41e857
LT
3953 free(conf->shmount.path_host);
3954 free(conf->shmount.path_cont);
8eb5694b
SH
3955 free(conf);
3956}
4355ab5f
SH
3957
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback the child runs once the parent has signalled it. */
	int (*fn)(void *);
	/* Optional human-readable name of @fn, used for trace logging only. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Synchronisation pipe: the child reads p[0], the parent writes p[1]. */
	int p[2];
};
3964
3965static int run_userns_fn(void *data)
3966{
766c5b6d 3967 struct userns_fn_data *d = data;
adaffdd7 3968 int ret;
4355ab5f 3969 char c;
4355ab5f 3970
766c5b6d 3971 close_prot_errno_disarm(d->p[1]);
f8aa4bf3 3972
766c5b6d
CB
3973 /*
3974 * Wait for parent to finish establishing a new mapping in the user
f8aa4bf3
CB
3975 * namespace we are executing in.
3976 */
adaffdd7 3977 ret = lxc_read_nointr(d->p[0], &c, 1);
766c5b6d 3978 close_prot_errno_disarm(d->p[0]);
adaffdd7
CB
3979 if (ret != 1)
3980 return -1;
f8aa4bf3 3981
c9b7c33e 3982 if (d->fn_name)
adaffdd7 3983 TRACE("Calling function \"%s\"", d->fn_name);
0fd73091 3984
f8aa4bf3 3985 /* Call function to run. */
4355ab5f
SH
3986 return d->fn(d->arg);
3987}
3988
7581a82f 3989static struct id_map *mapped_nsid_add(const struct lxc_conf *conf, unsigned id,
db7cfe23
CB
3990 enum idtype idtype)
3991{
5173b710
CB
3992 const struct id_map *map;
3993 struct id_map *retmap;
db7cfe23
CB
3994
3995 map = find_mapped_nsid_entry(conf, id, idtype);
3996 if (!map)
3997 return NULL;
3998
3999 retmap = malloc(sizeof(*retmap));
4000 if (!retmap)
4001 return NULL;
4002
4003 memcpy(retmap, map, sizeof(*retmap));
4004 return retmap;
4005}
4006
7581a82f 4007static struct id_map *find_mapped_hostid_entry(const struct lxc_conf *conf,
c4333195 4008 unsigned id, enum idtype idtype)
f8aa4bf3 4009{
f8aa4bf3 4010 struct id_map *map;
0fd73091 4011 struct lxc_list *it;
f8aa4bf3
CB
4012 struct id_map *retmap = NULL;
4013
0fd73091 4014 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4015 map = it->elem;
4016 if (map->idtype != idtype)
4017 continue;
4018
4019 if (id >= map->hostid && id < map->hostid + map->range) {
4020 retmap = map;
4021 break;
4022 }
4023 }
4024
f8aa4bf3
CB
4025 return retmap;
4026}
4027
0fd73091 4028/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4029 * existing one or establish a new one.
4355ab5f 4030 */
7581a82f 4031static struct id_map *mapped_hostid_add(const struct lxc_conf *conf, uid_t id,
0fd73091 4032 enum idtype type)
4355ab5f 4033{
55022530 4034 __do_free struct id_map *entry = NULL;
28a2d9e7 4035 int hostid_mapped;
55022530 4036 struct id_map *tmp = NULL;
c4333195
CB
4037
4038 entry = malloc(sizeof(*entry));
4039 if (!entry)
4040 return NULL;
f8aa4bf3 4041
28a2d9e7 4042 /* Reuse existing mapping. */
c4333195 4043 tmp = find_mapped_hostid_entry(conf, id, type);
1758c195
CB
4044 if (tmp) {
4045 memcpy(entry, tmp, sizeof(*entry));
4046 } else {
4047 /* Find new mapping. */
4048 hostid_mapped = find_unmapped_nsid(conf, type);
4049 if (hostid_mapped < 0)
4050 return log_debug(NULL, "Failed to find free mapping for id %d", id);
4051
4052 entry->idtype = type;
4053 entry->nsid = hostid_mapped;
4054 entry->hostid = (unsigned long)id;
4055 entry->range = 1;
4056 }
4355ab5f 4057
55022530 4058 return move_ptr(entry);
4355ab5f
SH
4059}
4060
dbfcdf86
CB
4061static struct lxc_list *get_minimal_idmap(const struct lxc_conf *conf,
4062 uid_t *resuid, gid_t *resgid)
4355ab5f 4063{
00d6cfe2
CB
4064 __do_free struct id_map *container_root_uid = NULL,
4065 *container_root_gid = NULL,
4066 *host_uid_map = NULL, *host_gid_map = NULL;
4067 __do_free struct lxc_list *idmap = NULL;
f8aa4bf3 4068 uid_t euid, egid;
4160c3a0
CB
4069 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4070 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
00d6cfe2 4071 struct lxc_list *tmplist = NULL;
4355ab5f 4072
db7cfe23 4073 /* Find container root mappings. */
4160c3a0 4074 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
55022530
CB
4075 if (!container_root_uid)
4076 return log_debug(NULL, "Failed to find mapping for namespace uid %d", 0);
dcf0ffdf
CB
4077 euid = geteuid();
4078 if (euid >= container_root_uid->hostid &&
4079 euid < (container_root_uid->hostid + container_root_uid->range))
2c996219 4080 host_uid_map = move_ptr(container_root_uid);
f8aa4bf3 4081
4160c3a0 4082 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
55022530
CB
4083 if (!container_root_gid)
4084 return log_debug(NULL, "Failed to find mapping for namespace gid %d", 0);
dcf0ffdf
CB
4085 egid = getegid();
4086 if (egid >= container_root_gid->hostid &&
4087 egid < (container_root_gid->hostid + container_root_gid->range))
2c996219 4088 host_gid_map = move_ptr(container_root_gid);
f8aa4bf3
CB
4089
4090 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4091 if (!host_uid_map)
c4333195 4092 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
55022530
CB
4093 if (!host_uid_map)
4094 return log_debug(NULL, "Failed to find mapping for uid %d", euid);
f8aa4bf3 4095
dcf0ffdf
CB
4096 if (!host_gid_map)
4097 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
55022530
CB
4098 if (!host_gid_map)
4099 return log_debug(NULL, "Failed to find mapping for gid %d", egid);
28a2d9e7
CB
4100
4101 /* Allocate new {g,u}id map list. */
4102 idmap = malloc(sizeof(*idmap));
4103 if (!idmap)
00d6cfe2 4104 return NULL;
28a2d9e7
CB
4105 lxc_list_init(idmap);
4106
f8aa4bf3
CB
4107 /* Add container root to the map. */
4108 tmplist = malloc(sizeof(*tmplist));
4109 if (!tmplist)
00d6cfe2 4110 return NULL;
47649d5b
CB
4111 /* idmap will now keep track of that memory. */
4112 lxc_list_add_elem(tmplist, move_ptr(host_uid_map));
f8aa4bf3 4113 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4114
2c996219 4115 if (container_root_uid) {
28a2d9e7
CB
4116 /* Add container root to the map. */
4117 tmplist = malloc(sizeof(*tmplist));
4118 if (!tmplist)
00d6cfe2 4119 return NULL;
47649d5b
CB
4120 /* idmap will now keep track of that memory. */
4121 lxc_list_add_elem(tmplist, move_ptr(container_root_uid));
28a2d9e7 4122 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4123 }
f8aa4bf3
CB
4124
4125 tmplist = malloc(sizeof(*tmplist));
4126 if (!tmplist)
00d6cfe2 4127 return NULL;
47649d5b
CB
4128 /* idmap will now keep track of that memory. */
4129 lxc_list_add_elem(tmplist, move_ptr(host_gid_map));
f8aa4bf3 4130 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4131
2c996219 4132 if (container_root_gid) {
28a2d9e7
CB
4133 tmplist = malloc(sizeof(*tmplist));
4134 if (!tmplist)
00d6cfe2 4135 return NULL;
47649d5b
CB
4136 /* idmap will now keep track of that memory. */
4137 lxc_list_add_elem(tmplist, move_ptr(container_root_gid));
28a2d9e7 4138 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4139 }
f8aa4bf3 4140
dbfcdf86
CB
4141 TRACE("Allocated minimal idmapping for ns uid %d and ns gid %d", nsuid, nsgid);
4142
4143 if (resuid)
4144 *resuid = nsuid;
4145 if (resgid)
4146 *resgid = nsgid;
00d6cfe2 4147 return move_ptr(idmap);
dcf0ffdf
CB
4148}
4149
766c5b6d
CB
4150/*
4151 * Run a function in a new user namespace.
dcf0ffdf
CB
4152 * The caller's euid/egid will be mapped if it is not already.
4153 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4154 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4155 * This means we require only to establish a mapping from:
4156 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4157 * - the container root -> some sub{g,u}id
915e3dbd 4158 * The former we add, if the user did not specify a mapping. The latter we
6f3fd27f 4159 * retrieve from the container's configured {g,u}id mappings as it must have been
dcf0ffdf
CB
4160 * there to start the container in the first place.
4161 */
7581a82f 4162int userns_exec_1(const struct lxc_conf *conf, int (*fn)(void *), void *data,
dcf0ffdf
CB
4163 const char *fn_name)
4164{
7e621263 4165 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
0fd73091
CB
4166 int ret = -1, status = -1;
4167 char c = '1';
46bc6f2a
CB
4168 struct userns_fn_data d = {
4169 .arg = data,
4170 .fn = fn,
4171 .fn_name = fn_name,
4172 };
766c5b6d
CB
4173 pid_t pid;
4174 int pipe_fds[2];
dcf0ffdf 4175
2b2655a8
CB
4176 if (!conf)
4177 return -EINVAL;
4178
dbfcdf86 4179 idmap = get_minimal_idmap(conf, NULL, NULL);
dcf0ffdf 4180 if (!idmap)
766c5b6d 4181 return ret_errno(ENOENT);
dcf0ffdf 4182
766c5b6d
CB
4183 ret = pipe2(pipe_fds, O_CLOEXEC);
4184 if (ret < 0)
4185 return -errno;
4186
766c5b6d
CB
4187 d.p[0] = pipe_fds[0];
4188 d.p[1] = pipe_fds[1];
dcf0ffdf
CB
4189
4190 /* Clone child in new user namespace. */
a59440be 4191 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER, NULL);
dcf0ffdf 4192 if (pid < 0) {
0fd73091 4193 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4194 goto on_error;
4195 }
4196
766c5b6d 4197 close_prot_errno_disarm(pipe_fds[0]);
dcf0ffdf 4198
62fef886 4199 if (lxc_log_trace()) {
dcf0ffdf 4200 struct id_map *map;
0fd73091 4201 struct lxc_list *it;
dcf0ffdf 4202
766c5b6d 4203 lxc_list_for_each(it, idmap) {
f8aa4bf3 4204 map = it->elem;
766c5b6d
CB
4205 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4206 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
f8aa4bf3 4207 }
4355ab5f
SH
4208 }
4209
f8aa4bf3 4210 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4211 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4212 if (ret < 0) {
0fd73091 4213 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4214 goto on_error;
4355ab5f
SH
4215 }
4216
f8aa4bf3 4217 /* Tell child to proceed. */
766c5b6d 4218 if (lxc_write_nointr(pipe_fds[1], &c, 1) != 1) {
dcf0ffdf 4219 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4220 goto on_error;
4355ab5f
SH
4221 }
4222
686dd5d1 4223on_error:
766c5b6d
CB
4224 close_prot_errno_disarm(pipe_fds[0]);
4225 close_prot_errno_disarm(pipe_fds[1]);
f8aa4bf3 4226
ee1b16bc
TA
4227 /* Wait for child to finish. */
4228 if (pid > 0)
4229 status = wait_for_pid(pid);
4230
686dd5d1
CB
4231 if (status < 0)
4232 ret = -1;
4233
f8aa4bf3 4234 return ret;
4355ab5f 4235}
97e9cfa0 4236
d1783ef4
CB
4237int userns_exec_minimal(const struct lxc_conf *conf,
4238 int (*fn_parent)(void *), void *fn_parent_data,
4239 int (*fn_child)(void *), void *fn_child_data)
edf88289 4240{
7e621263 4241 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
dbfcdf86
CB
4242 uid_t resuid = LXC_INVALID_UID;
4243 gid_t resgid = LXC_INVALID_GID;
edf88289 4244 char c = '1';
dbfcdf86 4245 ssize_t ret;
edf88289
CB
4246 pid_t pid;
4247 int sock_fds[2];
4248
d1783ef4 4249 if (!conf || !fn_child)
dbfcdf86 4250 return ret_errno(EINVAL);
edf88289 4251
dbfcdf86 4252 idmap = get_minimal_idmap(conf, &resuid, &resgid);
edf88289
CB
4253 if (!idmap)
4254 return ret_errno(ENOENT);
4255
4256 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4257 if (ret < 0)
4258 return -errno;
4259
4260 pid = fork();
4261 if (pid < 0) {
dbfcdf86 4262 SYSERROR("Failed to create new process");
edf88289
CB
4263 goto on_error;
4264 }
4265
4266 if (pid == 0) {
4267 close_prot_errno_disarm(sock_fds[1]);
4268
4269 ret = unshare(CLONE_NEWUSER);
dbfcdf86
CB
4270 if (ret < 0) {
4271 SYSERROR("Failed to unshare new user namespace");
edf88289 4272 _exit(EXIT_FAILURE);
dbfcdf86 4273 }
edf88289 4274
dbfcdf86
CB
4275 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4276 if (ret != 1)
edf88289
CB
4277 _exit(EXIT_FAILURE);
4278
4279 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4280 if (ret != 1)
4281 _exit(EXIT_FAILURE);
4282
4283 close_prot_errno_disarm(sock_fds[0]);
4284
4285 if (!lxc_setgroups(0, NULL) && errno != EPERM)
4286 _exit(EXIT_FAILURE);
4287
dbfcdf86
CB
4288 ret = setresgid(resgid, resgid, resgid);
4289 if (ret < 0) {
4290 SYSERROR("Failed to setresgid(%d, %d, %d)",
4291 resgid, resgid, resgid);
edf88289 4292 _exit(EXIT_FAILURE);
dbfcdf86
CB
4293 }
4294
4295 ret = setresuid(resuid, resuid, resuid);
4296 if (ret < 0) {
4297 SYSERROR("Failed to setresuid(%d, %d, %d)",
4298 resuid, resuid, resuid);
4299 _exit(EXIT_FAILURE);
4300 }
edf88289 4301
d1783ef4 4302 ret = fn_child(fn_child_data);
dbfcdf86
CB
4303 if (ret) {
4304 SYSERROR("Running function in new user namespace failed");
edf88289 4305 _exit(EXIT_FAILURE);
dbfcdf86 4306 }
edf88289
CB
4307
4308 _exit(EXIT_SUCCESS);
4309 }
4310
4311 close_prot_errno_disarm(sock_fds[0]);
4312
62fef886 4313 if (lxc_log_trace()) {
edf88289
CB
4314 struct id_map *map;
4315 struct lxc_list *it;
4316
4317 lxc_list_for_each(it, idmap) {
4318 map = it->elem;
4319 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4320 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4321 }
4322 }
4323
4324 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4325 if (ret != 1) {
4326 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4327 goto on_error;
4328 }
4329
4330 /* Set up {g,u}id mapping for user namespace of child process. */
4331 ret = lxc_map_ids(idmap, pid);
4332 if (ret < 0) {
4333 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4334 goto on_error;
4335 }
4336
4337 /* Tell child to proceed. */
4338 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4339 if (ret != 1) {
4340 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4341 goto on_error;
4342 }
4343
d1783ef4
CB
4344 if (fn_parent && fn_parent(fn_parent_data)) {
4345 SYSERROR("Running parent function failed");
4346 _exit(EXIT_FAILURE);
4347 }
4348
edf88289
CB
4349on_error:
4350 close_prot_errno_disarm(sock_fds[0]);
4351 close_prot_errno_disarm(sock_fds[1]);
4352
4353 /* Wait for child to finish. */
dbfcdf86
CB
4354 if (pid < 0)
4355 return -1;
edf88289 4356
dbfcdf86 4357 return wait_for_pid(pid);
edf88289
CB
4358}
4359
415a8851
CB
4360int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4361 const char *fn_name)
4362{
4363 pid_t pid;
4364 uid_t euid, egid;
415a8851
CB
4365 int p[2];
4366 struct id_map *map;
4367 struct lxc_list *cur;
0fd73091 4368 struct userns_fn_data d;
415a8851 4369 int ret = -1;
0fd73091 4370 char c = '1';
415a8851
CB
4371 struct lxc_list *idmap = NULL, *tmplist = NULL;
4372 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4373 *host_uid_map = NULL, *host_gid_map = NULL;
4374
2b2655a8
CB
4375 if (!conf)
4376 return -EINVAL;
4377
979f9e34 4378 ret = pipe2(p, O_CLOEXEC);
415a8851
CB
4379 if (ret < 0) {
4380 SYSERROR("opening pipe");
4381 return -1;
4382 }
4383 d.fn = fn;
4384 d.fn_name = fn_name;
4385 d.arg = data;
4386 d.p[0] = p[0];
4387 d.p[1] = p[1];
4388
4389 /* Clone child in new user namespace. */
33258b95 4390 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER, NULL);
415a8851 4391 if (pid < 0) {
0fd73091 4392 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4393 goto on_error;
4394 }
4395
4396 close(p[0]);
4397 p[0] = -1;
4398
4399 euid = geteuid();
4400 egid = getegid();
4401
4402 /* Allocate new {g,u}id map list. */
4403 idmap = malloc(sizeof(*idmap));
4404 if (!idmap)
4405 goto on_error;
4406 lxc_list_init(idmap);
4407
4408 /* Find container root. */
0fd73091 4409 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4410 struct id_map *tmpmap;
4411
4412 tmplist = malloc(sizeof(*tmplist));
4413 if (!tmplist)
4414 goto on_error;
4415
4416 tmpmap = malloc(sizeof(*tmpmap));
4417 if (!tmpmap) {
4418 free(tmplist);
4419 goto on_error;
4420 }
4421
4422 memset(tmpmap, 0, sizeof(*tmpmap));
4423 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4424 tmplist->elem = tmpmap;
4425
4426 lxc_list_add_tail(idmap, tmplist);
4427
4428 map = cur->elem;
4429
4430 if (map->idtype == ID_TYPE_UID)
4431 if (euid >= map->hostid && euid < map->hostid + map->range)
4432 host_uid_map = map;
4433
4434 if (map->idtype == ID_TYPE_GID)
4435 if (egid >= map->hostid && egid < map->hostid + map->range)
4436 host_gid_map = map;
4437
4438 if (map->nsid != 0)
4439 continue;
4440
4441 if (map->idtype == ID_TYPE_UID)
4442 if (container_root_uid == NULL)
4443 container_root_uid = map;
4444
4445 if (map->idtype == ID_TYPE_GID)
4446 if (container_root_gid == NULL)
4447 container_root_gid = map;
4448 }
4449
4450 if (!container_root_uid || !container_root_gid) {
4451 ERROR("No mapping for container root found");
4452 goto on_error;
4453 }
4454
4455 /* Check whether the {g,u}id of the user has a mapping. */
4456 if (!host_uid_map)
c4333195 4457 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4458 else
4459 host_uid_map = container_root_uid;
4460
4461 if (!host_gid_map)
c4333195 4462 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4463 else
4464 host_gid_map = container_root_gid;
4465
4466 if (!host_uid_map) {
4467 DEBUG("Failed to find mapping for uid %d", euid);
4468 goto on_error;
4469 }
4470
4471 if (!host_gid_map) {
4472 DEBUG("Failed to find mapping for gid %d", egid);
4473 goto on_error;
4474 }
4475
4476 if (host_uid_map && (host_uid_map != container_root_uid)) {
4477 /* Add container root to the map. */
4478 tmplist = malloc(sizeof(*tmplist));
4479 if (!tmplist)
4480 goto on_error;
4481 lxc_list_add_elem(tmplist, host_uid_map);
4482 lxc_list_add_tail(idmap, tmplist);
4483 }
4484 /* idmap will now keep track of that memory. */
4485 host_uid_map = NULL;
4486
4487 if (host_gid_map && (host_gid_map != container_root_gid)) {
4488 tmplist = malloc(sizeof(*tmplist));
4489 if (!tmplist)
4490 goto on_error;
4491 lxc_list_add_elem(tmplist, host_gid_map);
4492 lxc_list_add_tail(idmap, tmplist);
4493 }
4494 /* idmap will now keep track of that memory. */
4495 host_gid_map = NULL;
4496
62fef886 4497 if (lxc_log_trace()) {
0fd73091 4498 lxc_list_for_each (cur, idmap) {
415a8851
CB
4499 map = cur->elem;
4500 TRACE("establishing %cid mapping for \"%d\" in new "
4501 "user namespace: nsuid %lu - hostid %lu - range "
4502 "%lu",
4503 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4504 map->nsid, map->hostid, map->range);
4505 }
4506 }
4507
4508 /* Set up {g,u}id mapping for user namespace of child process. */
4509 ret = lxc_map_ids(idmap, pid);
4510 if (ret < 0) {
0fd73091 4511 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4512 goto on_error;
4513 }
4514
4515 /* Tell child to proceed. */
489f39be 4516 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4517 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4518 goto on_error;
4519 }
4520
686dd5d1 4521on_error:
ee1b16bc
TA
4522 if (p[0] != -1)
4523 close(p[0]);
4524 close(p[1]);
4525
415a8851 4526 /* Wait for child to finish. */
686dd5d1
CB
4527 if (pid > 0)
4528 ret = wait_for_pid(pid);
415a8851 4529
7e621263
CB
4530 if (idmap)
4531 __lxc_free_idmap(idmap);
80758b4b 4532
415a8851
CB
4533 if (host_uid_map && (host_uid_map != container_root_uid))
4534 free(host_uid_map);
4535 if (host_gid_map && (host_gid_map != container_root_gid))
4536 free(host_gid_map);
4537
415a8851
CB
4538 return ret;
4539}
4540
234998b4
CB
4541static int add_idmap_entry(struct lxc_list *idmap, enum idtype idtype,
4542 unsigned long nsid, unsigned long hostid,
4543 unsigned long range)
4544{
4545 __do_free struct id_map *new_idmap = NULL;
4546 __do_free struct lxc_list *new_list = NULL;
4547
4548 new_idmap = zalloc(sizeof(*new_idmap));
4549 if (!new_idmap)
4550 return ret_errno(ENOMEM);
4551
4552 new_idmap->idtype = idtype;
4553 new_idmap->hostid = hostid;
4554 new_idmap->nsid = nsid;
4555 new_idmap->range = range;
4556
4557 new_list = zalloc(sizeof(*new_list));
4558 if (!new_list)
4559 return ret_errno(ENOMEM);
4560
4561 new_list->elem = move_ptr(new_idmap);
4562 lxc_list_add_tail(idmap, move_ptr(new_list));
4563
4564 INFO("Adding id map: type %c nsid %lu hostid %lu range %lu",
4565 idtype == ID_TYPE_UID ? 'u' : 'g', nsid, hostid, range);
4566 return 0;
4567}
4568
4569int userns_exec_mapped_root(const char *path, int path_fd,
4570 const struct lxc_conf *conf)
4571{
7e621263 4572 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
234998b4
CB
4573 __do_close int fd = -EBADF;
4574 int target_fd = -EBADF;
4575 char c = '1';
4576 ssize_t ret;
4577 pid_t pid;
4578 int sock_fds[2];
4579 uid_t container_host_uid, hostuid;
4580 gid_t container_host_gid, hostgid;
4581 struct stat st;
4582
4583 if (!conf || (!path && path_fd < 0))
4584 return ret_errno(EINVAL);
4585
4586 if (!path)
4587 path = "(null)";
4588
4589 container_host_uid = get_mapped_rootid(conf, ID_TYPE_UID);
4590 if (!uid_valid(container_host_uid))
4591 return log_error(-1, "No uid mapping for container root");
4592
4593 container_host_gid = get_mapped_rootid(conf, ID_TYPE_GID);
4594 if (!gid_valid(container_host_gid))
4595 return log_error(-1, "No gid mapping for container root");
4596
cf68ffd9 4597 if (path_fd < 0) {
a72c68f7 4598 fd = open(path, O_CLOEXEC | O_NOCTTY);
234998b4
CB
4599 if (fd < 0)
4600 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
4601 target_fd = fd;
4602 } else {
4603 target_fd = path_fd;
4604 }
4605
4606 hostuid = geteuid();
4607 /* We are root so chown directly. */
4608 if (hostuid == 0) {
4609 ret = fchown(target_fd, container_host_uid, container_host_gid);
4610 if (ret)
4611 return log_error_errno(-errno, errno,
4612 "Failed to fchown(%d(%s), %d, %d)",
4613 target_fd, path, container_host_uid,
4614 container_host_gid);
4615 return log_trace(0, "Chowned %d(%s) to uid %d and %d", target_fd, path,
4616 container_host_uid, container_host_gid);
4617 }
4618
4619 /* The container's root host id matches */
4620 if (container_host_uid == hostuid)
4621 return log_info(0, "Container root id is mapped to our uid");
4622
4623 /* Get the current ids of our target. */
4624 ret = fstat(target_fd, &st);
4625 if (ret)
4626 return log_error_errno(-errno, errno, "Failed to stat \"%s\"", path);
4627
4628 hostgid = getegid();
4629 if (st.st_uid == hostuid && mapped_hostid(st.st_gid, conf, ID_TYPE_GID) < 0) {
4630 ret = fchown(target_fd, -1, hostgid);
4631 if (ret)
4632 return log_error_errno(-errno, errno,
4633 "Failed to fchown(%d(%s), -1, %d)",
4634 target_fd, path, hostgid);
2e8013f9 4635 TRACE("Chowned %d(%s) to -1:%d", target_fd, path, hostgid);
234998b4
CB
4636 }
4637
4638 idmap = malloc(sizeof(*idmap));
4639 if (!idmap)
4640 return -ENOMEM;
4641 lxc_list_init(idmap);
4642
4643 /* "u:0:rootuid:1" */
4644 ret = add_idmap_entry(idmap, ID_TYPE_UID, 0, container_host_uid, 1);
4645 if (ret < 0)
4646 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4647
4648 /* "u:hostuid:hostuid:1" */
4649 ret = add_idmap_entry(idmap, ID_TYPE_UID, hostuid, hostuid, 1);
4650 if (ret < 0)
4651 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4652
4653 /* "g:0:rootgid:1" */
4654 ret = add_idmap_entry(idmap, ID_TYPE_GID, 0, container_host_gid, 1);
4655 if (ret < 0)
4656 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4657
4658 /* "g:hostgid:hostgid:1" */
4659 ret = add_idmap_entry(idmap, ID_TYPE_GID, hostgid, hostgid, 1);
4660 if (ret < 0)
4661 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4662
4663 if (hostgid != st.st_gid) {
4664 /* "g:pathgid:rootgid+pathgid:1" */
4665 ret = add_idmap_entry(idmap, ID_TYPE_GID, st.st_gid,
4666 container_host_gid + (gid_t)st.st_gid, 1);
4667 if (ret < 0)
4668 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4669 }
4670
4671 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4672 if (ret < 0)
4673 return -errno;
4674
4675 pid = fork();
4676 if (pid < 0) {
4677 SYSERROR("Failed to create new process");
4678 goto on_error;
4679 }
4680
4681 if (pid == 0) {
4682 close_prot_errno_disarm(sock_fds[1]);
4683
4684 ret = unshare(CLONE_NEWUSER);
4685 if (ret < 0) {
4686 SYSERROR("Failed to unshare new user namespace");
4687 _exit(EXIT_FAILURE);
4688 }
4689
4690 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4691 if (ret != 1)
4692 _exit(EXIT_FAILURE);
4693
4694 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4695 if (ret != 1)
4696 _exit(EXIT_FAILURE);
4697
4698 close_prot_errno_disarm(sock_fds[0]);
4699
4700 if (!lxc_switch_uid_gid(0, 0))
4701 _exit(EXIT_FAILURE);
4702
4703 if (!lxc_setgroups(0, NULL))
4704 _exit(EXIT_FAILURE);
4705
8053a085 4706 ret = fchown(target_fd, 0, st.st_gid);
234998b4 4707 if (ret) {
8ea93a0f 4708 SYSERROR("Failed to chown %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4709 _exit(EXIT_FAILURE);
4710 }
4711
2e8013f9 4712 TRACE("Chowned %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4713 _exit(EXIT_SUCCESS);
4714 }
4715
4716 close_prot_errno_disarm(sock_fds[0]);
4717
62fef886 4718 if (lxc_log_trace()) {
234998b4
CB
4719 struct id_map *map;
4720 struct lxc_list *it;
4721
4722 lxc_list_for_each(it, idmap) {
4723 map = it->elem;
4724 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4725 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4726 }
4727 }
4728
4729 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4730 if (ret != 1) {
4731 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4732 goto on_error;
4733 }
4734
4735 /* Set up {g,u}id mapping for user namespace of child process. */
4736 ret = lxc_map_ids(idmap, pid);
4737 if (ret < 0) {
4738 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4739 goto on_error;
4740 }
4741
4742 /* Tell child to proceed. */
4743 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4744 if (ret != 1) {
4745 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4746 goto on_error;
4747 }
4748
4749on_error:
4750 close_prot_errno_disarm(sock_fds[0]);
4751 close_prot_errno_disarm(sock_fds[1]);
4752
4753 /* Wait for child to finish. */
4754 if (pid < 0)
4755 return -1;
4756
4757 return wait_for_pid(pid);
4758}
4759
a96a8e8c 4760/* not thread-safe, do not use from api without first forking */
0fd73091 4761static char *getuname(void)
97e9cfa0 4762{
4f410b2a 4763 __do_free char *buf = NULL;
cb7aa5e8
DJ
4764 struct passwd pwent;
4765 struct passwd *pwentp = NULL;
cb7aa5e8
DJ
4766 size_t bufsize;
4767 int ret;
97e9cfa0 4768
cb7aa5e8
DJ
4769 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
4770 if (bufsize == -1)
4771 bufsize = 1024;
4772
4773 buf = malloc(bufsize);
4774 if (!buf)
97e9cfa0
SH
4775 return NULL;
4776
cb7aa5e8
DJ
4777 ret = getpwuid_r(geteuid(), &pwent, buf, bufsize, &pwentp);
4778 if (!pwentp) {
4779 if (ret == 0)
4780 WARN("Could not find matched password record.");
4781
55022530 4782 return log_error(NULL, "Failed to get password record - %u", geteuid());
cb7aa5e8
DJ
4783 }
4784
4f410b2a 4785 return strdup(pwent.pw_name);
97e9cfa0
SH
4786}
4787
a96a8e8c 4788/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4789static char *getgname(void)
4790{
4f410b2a 4791 __do_free char *buf = NULL;
3de9fb4c
DJ
4792 struct group grent;
4793 struct group *grentp = NULL;
3de9fb4c
DJ
4794 size_t bufsize;
4795 int ret;
4796
4797 bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
4798 if (bufsize == -1)
4799 bufsize = 1024;
4800
4801 buf = malloc(bufsize);
4802 if (!buf)
4803 return NULL;
4804
4805 ret = getgrgid_r(getegid(), &grent, buf, bufsize, &grentp);
4806 if (!grentp) {
4807 if (ret == 0)
4808 WARN("Could not find matched group record");
97e9cfa0 4809
55022530 4810 return log_error(NULL, "Failed to get group record - %u", getegid());
3de9fb4c
DJ
4811 }
4812
4f410b2a 4813 return strdup(grent.gr_name);
97e9cfa0
SH
4814}
4815
a96a8e8c 4816/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4817void suggest_default_idmap(void)
4818{
3a6e3bf5 4819 __do_free char *gname = NULL, *line = NULL, *uname = NULL;
4aae564f 4820 __do_fclose FILE *subuid_f = NULL, *subgid_f = NULL;
97e9cfa0 4821 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0
SH
4822 size_t len = 0;
4823
0fd73091
CB
4824 uname = getuname();
4825 if (!uname)
97e9cfa0
SH
4826 return;
4827
0fd73091 4828 gname = getgname();
3a6e3bf5 4829 if (!gname)
97e9cfa0 4830 return;
97e9cfa0 4831
4110345b 4832 subuid_f = fopen(subuidfile, "re");
4aae564f 4833 if (!subuid_f) {
97e9cfa0 4834 ERROR("Your system is not configured with subuids");
97e9cfa0
SH
4835 return;
4836 }
0fd73091 4837
4aae564f 4838 while (getline(&line, &len, subuid_f) != -1) {
0fd73091 4839 char *p, *p2;
b7930180 4840 size_t no_newline = 0;
0fd73091
CB
4841
4842 p = strchr(line, ':');
97e9cfa0
SH
4843 if (*line == '#')
4844 continue;
4845 if (!p)
4846 continue;
4847 *p = '\0';
4848 p++;
0fd73091 4849
97e9cfa0
SH
4850 if (strcmp(line, uname))
4851 continue;
0fd73091 4852
97e9cfa0
SH
4853 p2 = strchr(p, ':');
4854 if (!p2)
4855 continue;
4856 *p2 = '\0';
4857 p2++;
4858 if (!*p2)
4859 continue;
b7930180
CB
4860 no_newline = strcspn(p2, "\n");
4861 p2[no_newline] = '\0';
4862
b7b2fde4 4863 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4864 WARN("Could not parse UID");
b7b2fde4 4865 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4866 WARN("Could not parse UID range");
97e9cfa0 4867 }
97e9cfa0 4868
4110345b 4869 subgid_f = fopen(subgidfile, "re");
4aae564f 4870 if (!subgid_f) {
97e9cfa0 4871 ERROR("Your system is not configured with subgids");
97e9cfa0
SH
4872 return;
4873 }
0fd73091 4874
4aae564f 4875 while (getline(&line, &len, subgid_f) != -1) {
0fd73091 4876 char *p, *p2;
b7930180 4877 size_t no_newline = 0;
0fd73091
CB
4878
4879 p = strchr(line, ':');
97e9cfa0
SH
4880 if (*line == '#')
4881 continue;
4882 if (!p)
4883 continue;
4884 *p = '\0';
4885 p++;
0fd73091 4886
97e9cfa0
SH
4887 if (strcmp(line, uname))
4888 continue;
0fd73091 4889
97e9cfa0
SH
4890 p2 = strchr(p, ':');
4891 if (!p2)
4892 continue;
4893 *p2 = '\0';
4894 p2++;
4895 if (!*p2)
4896 continue;
b7930180
CB
4897 no_newline = strcspn(p2, "\n");
4898 p2[no_newline] = '\0';
4899
b7b2fde4 4900 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4901 WARN("Could not parse GID");
b7b2fde4 4902 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4903 WARN("Could not parse GID range");
97e9cfa0 4904 }
97e9cfa0 4905
97e9cfa0
SH
4906 if (!urange || !grange) {
4907 ERROR("You do not have subuids or subgids allocated");
4908 ERROR("Unprivileged containers require subuids and subgids");
4909 return;
4910 }
4911
4912 ERROR("You must either run as root, or define uid mappings");
4913 ERROR("To pass uid mappings to lxc-create, you could create");
4914 ERROR("~/.config/lxc/default.conf:");
4915 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4916 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4917 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0 4918}
aaf26830 4919
a7307747
SH
4920static void free_cgroup_settings(struct lxc_list *result)
4921{
4922 struct lxc_list *iterator, *next;
4923
0fd73091 4924 lxc_list_for_each_safe (iterator, result, next) {
a7307747 4925 lxc_list_del(iterator);
55022530 4926 free_disarm(iterator);
a7307747 4927 }
55022530 4928 free_disarm(result);
a7307747
SH
4929}
4930
0fd73091 4931/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4932 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4933 */
0fd73091 4934struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4935{
4936 struct lxc_list *result;
aaf26830 4937 struct lxc_cgroup *cg = NULL;
0fd73091 4938 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4939
4940 result = malloc(sizeof(*result));
0fd73091 4941 if (!result)
fac7c663 4942 return NULL;
aaf26830
KT
4943 lxc_list_init(result);
4944
0fd73091
CB
4945 /* Iterate over the cgroup settings and copy them to the output list. */
4946 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4947 item = malloc(sizeof(*item));
fac7c663 4948 if (!item) {
a7307747 4949 free_cgroup_settings(result);
fac7c663
KT
4950 return NULL;
4951 }
0fd73091 4952
aaf26830
KT
4953 item->elem = it->elem;
4954 cg = it->elem;
4955 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4956 /* Store the memsw_limit location */
4957 memsw_limit = item;
0fd73091
CB
4958 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4959 memsw_limit != NULL) {
4960 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4961 * before lxc.cgroup.memory.limit_in_bytes, swap these
4962 * two items */
aaf26830
KT
4963 item->elem = memsw_limit->elem;
4964 memsw_limit->elem = it->elem;
4965 }
4966 lxc_list_add_tail(result, item);
4967 }
4968
4969 return result;
a7307747 4970}