]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
tree-wide: make files cloexec whenever possible
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
1d52bdf7 2
d38dd64a
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
9d257a2a 6#include <arpa/inet.h>
8f3e280e
CB
7#include <dirent.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <grp.h>
11#include <inttypes.h>
12#include <libgen.h>
9d257a2a
CB
13#include <linux/loop.h>
14#include <net/if.h>
15#include <netinet/in.h>
8f3e280e
CB
16#include <pwd.h>
17#include <stdarg.h>
0ad19a3f 18#include <stdio.h>
0ad19a3f 19#include <stdlib.h>
0ad19a3f 20#include <string.h>
8f3e280e
CB
21#include <sys/mman.h>
22#include <sys/mount.h>
23#include <sys/param.h>
24#include <sys/prctl.h>
6a49f05e 25#include <sys/sendfile.h>
8f3e280e 26#include <sys/socket.h>
9d257a2a 27#include <sys/stat.h>
2d76d1d7 28#include <sys/syscall.h>
9d257a2a 29#include <sys/sysmacros.h>
97e9cfa0 30#include <sys/types.h>
8f3e280e
CB
31#include <sys/utsname.h>
32#include <sys/wait.h>
9d257a2a
CB
33#include <time.h>
34#include <unistd.h>
1d52bdf7 35
d38dd64a
CB
36#include "af_unix.h"
37#include "caps.h"
38#include "cgroup.h"
bf651989 39#include "cgroup2_devices.h"
d38dd64a
CB
40#include "conf.h"
41#include "config.h"
42#include "confile.h"
43#include "confile_utils.h"
44#include "error.h"
45#include "log.h"
46#include "lsm/lsm.h"
47#include "lxclock.h"
48#include "lxcseccomp.h"
49#include "macro.h"
2f443e88 50#include "memory_utils.h"
d38dd64a
CB
51#include "namespace.h"
52#include "network.h"
53#include "parse.h"
13be2733 54#include "raw_syscalls.h"
d38dd64a
CB
55#include "ringbuf.h"
56#include "start.h"
57#include "storage.h"
58#include "storage/overlay.h"
6b3d24d7 59#include "syscall_wrappers.h"
d38dd64a
CB
60#include "terminal.h"
61#include "utils.h"
20502652 62#include "uuid.h"
d38dd64a 63
af6824fc 64#ifdef MAJOR_IN_MKDEV
9d257a2a 65#include <sys/mkdev.h>
af6824fc 66#endif
af6824fc 67
614305f3 68#ifdef HAVE_STATVFS
2938f7c8 69#include <sys/statvfs.h>
614305f3 70#endif
e827ff7e
SG
71
72#if HAVE_PTY_H
b0a33c1e 73#include <pty.h>
e827ff7e
SG
74#else
75#include <../include/openpty.h>
76#endif
0ad19a3f 77
9d257a2a
CB
78#if HAVE_LIBCAP
79#include <sys/capability.h>
80#endif
81
82#if HAVE_SYS_PERSONALITY_H
83#include <sys/personality.h>
84#endif
85
f1e05b90
DJ
86#ifndef HAVE_STRLCAT
87#include "include/strlcat.h"
88#endif
89
9d257a2a
CB
90#if IS_BIONIC
91#include <../include/lxcmntent.h>
92#else
93#include <mntent.h>
94#endif
95
96#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
97#include <../include/prlimit.h>
98#endif
99
lxc_log_define(conf, lxc);

/*
 * Configuration of the container that the API call in progress is working
 * on. The error calls make use of it. When HAVE_TLS is defined every thread
 * gets its own copy, otherwise a single process-wide pointer is used.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
thread_local struct lxc_conf *current_config;
#endif
8912711c 110
0fd73091
CB
111char *lxchook_names[NUM_LXC_HOOKS] = {
112 "pre-start",
113 "pre-mount",
114 "mount",
115 "autodev",
116 "start",
117 "stop",
118 "post-stop",
119 "clone",
120 "destroy",
121 "start-host"
122};
72d0e1cb 123
/*
 * One mount option keyword together with the mount flag it refers to.
 * In the tables built from this type, "clear" is 1 for keywords that name
 * the absence of the flag (e.g. "rw" paired with MS_RDONLY) and 0 for
 * keywords that name the flag itself.
 */
struct mount_opt {
	char *name;	/* option keyword */
	int clear;	/* 1: keyword negates the flag, 0: keyword sets it */
	int flag;	/* MS_* flag value */
};
129
/* Pairs a capability name with its numeric CAP_* value. */
struct caps_opt {
	char *name;	/* capability name without the CAP_ prefix */
	int value;	/* CAP_* constant */
};
134
/* Pairs a resource limit name with its numeric RLIMIT_* value. */
struct limit_opt {
	char *name;	/* limit name without the RLIMIT_ prefix */
	int value;	/* RLIMIT_* constant */
};
139
998ac676 140static struct mount_opt mount_opt[] = {
470b359b
CB
141 { "async", 1, MS_SYNCHRONOUS },
142 { "atime", 1, MS_NOATIME },
143 { "bind", 0, MS_BIND },
88d413d5 144 { "defaults", 0, 0 },
88d413d5 145 { "dev", 1, MS_NODEV },
470b359b 146 { "diratime", 1, MS_NODIRATIME },
88d413d5 147 { "dirsync", 0, MS_DIRSYNC },
470b359b 148 { "exec", 1, MS_NOEXEC },
8912711c 149 { "lazytime", 0, MS_LAZYTIME },
88d413d5 150 { "mand", 0, MS_MANDLOCK },
88d413d5 151 { "noatime", 0, MS_NOATIME },
470b359b 152 { "nodev", 0, MS_NODEV },
88d413d5 153 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
154 { "noexec", 0, MS_NOEXEC },
155 { "nomand", 1, MS_MANDLOCK },
156 { "norelatime", 1, MS_RELATIME },
157 { "nostrictatime", 1, MS_STRICTATIME },
158 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
159 { "rbind", 0, MS_BIND|MS_REC },
160 { "relatime", 0, MS_RELATIME },
470b359b
CB
161 { "remount", 0, MS_REMOUNT },
162 { "ro", 0, MS_RDONLY },
163 { "rw", 1, MS_RDONLY },
88d413d5 164 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
165 { "suid", 1, MS_NOSUID },
166 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 167 { NULL, 0, 0 },
998ac676
RT
168};
169
d840039e 170static struct mount_opt propagation_opt[] = {
0fd73091
CB
171 { "private", 0, MS_PRIVATE },
172 { "shared", 0, MS_SHARED },
173 { "slave", 0, MS_SLAVE },
174 { "unbindable", 0, MS_UNBINDABLE },
175 { "rprivate", 0, MS_PRIVATE|MS_REC },
176 { "rshared", 0, MS_SHARED|MS_REC },
177 { "rslave", 0, MS_SLAVE|MS_REC },
178 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
179 { NULL, 0, 0 },
d840039e
YT
180};
181
81810dd1 182static struct caps_opt caps_opt[] = {
8560cd36 183#if HAVE_LIBCAP
0fd73091
CB
184 { "chown", CAP_CHOWN },
185 { "dac_override", CAP_DAC_OVERRIDE },
186 { "dac_read_search", CAP_DAC_READ_SEARCH },
187 { "fowner", CAP_FOWNER },
188 { "fsetid", CAP_FSETID },
189 { "kill", CAP_KILL },
190 { "setgid", CAP_SETGID },
191 { "setuid", CAP_SETUID },
192 { "setpcap", CAP_SETPCAP },
193 { "linux_immutable", CAP_LINUX_IMMUTABLE },
194 { "net_bind_service", CAP_NET_BIND_SERVICE },
195 { "net_broadcast", CAP_NET_BROADCAST },
196 { "net_admin", CAP_NET_ADMIN },
197 { "net_raw", CAP_NET_RAW },
198 { "ipc_lock", CAP_IPC_LOCK },
199 { "ipc_owner", CAP_IPC_OWNER },
200 { "sys_module", CAP_SYS_MODULE },
201 { "sys_rawio", CAP_SYS_RAWIO },
202 { "sys_chroot", CAP_SYS_CHROOT },
203 { "sys_ptrace", CAP_SYS_PTRACE },
204 { "sys_pacct", CAP_SYS_PACCT },
205 { "sys_admin", CAP_SYS_ADMIN },
206 { "sys_boot", CAP_SYS_BOOT },
207 { "sys_nice", CAP_SYS_NICE },
208 { "sys_resource", CAP_SYS_RESOURCE },
209 { "sys_time", CAP_SYS_TIME },
210 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
211 { "mknod", CAP_MKNOD },
212 { "lease", CAP_LEASE },
57b837e2 213#ifdef CAP_AUDIT_READ
0fd73091 214 { "audit_read", CAP_AUDIT_READ },
57b837e2 215#endif
9527e566 216#ifdef CAP_AUDIT_WRITE
0fd73091 217 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
218#endif
219#ifdef CAP_AUDIT_CONTROL
0fd73091 220 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 221#endif
0fd73091
CB
222 { "setfcap", CAP_SETFCAP },
223 { "mac_override", CAP_MAC_OVERRIDE },
224 { "mac_admin", CAP_MAC_ADMIN },
5170c716 225#ifdef CAP_SYSLOG
0fd73091 226 { "syslog", CAP_SYSLOG },
5170c716
CS
227#endif
228#ifdef CAP_WAKE_ALARM
0fd73091 229 { "wake_alarm", CAP_WAKE_ALARM },
5170c716 230#endif
2b54359b 231#ifdef CAP_BLOCK_SUSPEND
0fd73091 232 { "block_suspend", CAP_BLOCK_SUSPEND },
2b54359b 233#endif
495d2046 234#endif
8560cd36 235};
81810dd1 236
c6d09e15
WB
237static struct limit_opt limit_opt[] = {
238#ifdef RLIMIT_AS
239 { "as", RLIMIT_AS },
240#endif
241#ifdef RLIMIT_CORE
242 { "core", RLIMIT_CORE },
243#endif
244#ifdef RLIMIT_CPU
245 { "cpu", RLIMIT_CPU },
246#endif
247#ifdef RLIMIT_DATA
248 { "data", RLIMIT_DATA },
249#endif
250#ifdef RLIMIT_FSIZE
251 { "fsize", RLIMIT_FSIZE },
252#endif
253#ifdef RLIMIT_LOCKS
254 { "locks", RLIMIT_LOCKS },
255#endif
256#ifdef RLIMIT_MEMLOCK
257 { "memlock", RLIMIT_MEMLOCK },
258#endif
259#ifdef RLIMIT_MSGQUEUE
260 { "msgqueue", RLIMIT_MSGQUEUE },
261#endif
262#ifdef RLIMIT_NICE
263 { "nice", RLIMIT_NICE },
264#endif
265#ifdef RLIMIT_NOFILE
266 { "nofile", RLIMIT_NOFILE },
267#endif
268#ifdef RLIMIT_NPROC
269 { "nproc", RLIMIT_NPROC },
270#endif
271#ifdef RLIMIT_RSS
272 { "rss", RLIMIT_RSS },
273#endif
274#ifdef RLIMIT_RTPRIO
275 { "rtprio", RLIMIT_RTPRIO },
276#endif
277#ifdef RLIMIT_RTTIME
278 { "rttime", RLIMIT_RTTIME },
279#endif
280#ifdef RLIMIT_SIGPENDING
281 { "sigpending", RLIMIT_SIGPENDING },
282#endif
283#ifdef RLIMIT_STACK
284 { "stack", RLIMIT_STACK },
285#endif
286};
287
91c3830e
SH
288static int run_buffer(char *buffer)
289{
cc6a0e78 290 __do_free char *output = NULL;
ebf3a6af 291 int fd, ret;
0fd73091 292 struct lxc_popen_FILE *f;
91c3830e 293
ebec9176 294 f = lxc_popen(buffer);
91c3830e 295 if (!f) {
3f60c2f7 296 SYSERROR("Failed to popen() %s", buffer);
91c3830e
SH
297 return -1;
298 }
299
300 output = malloc(LXC_LOG_BUFFER_SIZE);
301 if (!output) {
3f60c2f7 302 ERROR("Failed to allocate memory for %s", buffer);
ebec9176 303 lxc_pclose(f);
91c3830e
SH
304 return -1;
305 }
306
ebf3a6af
CB
307 fd = fileno(f->f);
308 if (fd < 0) {
309 SYSERROR("Failed to retrieve underlying file descriptor");
310 lxc_pclose(f);
311 return -1;
312 }
313
314 for (int i = 0; i < 10; i++) {
315 ssize_t bytes_read;
316
317 bytes_read = lxc_read_nointr(fd, output, LXC_LOG_BUFFER_SIZE - 1);
318 if (bytes_read > 0) {
319 output[bytes_read] = '\0';
320 DEBUG("Script %s produced output: %s", buffer, output);
321 continue;
322 }
323
324 break;
325 }
91c3830e 326
ebec9176 327 ret = lxc_pclose(f);
8e7da691 328 if (ret == -1) {
3f60c2f7 329 SYSERROR("Script exited with error");
91c3830e 330 return -1;
8e7da691 331 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
3f60c2f7 332 ERROR("Script exited with status %d", WEXITSTATUS(ret));
8e7da691
DE
333 return -1;
334 } else if (WIFSIGNALED(ret)) {
3f60c2f7 335 ERROR("Script terminated by signal %d", WTERMSIG(ret));
8e7da691 336 return -1;
91c3830e
SH
337 }
338
339 return 0;
340}
341
14a7b0f9
CB
342int run_script_argv(const char *name, unsigned int hook_version,
343 const char *section, const char *script,
586b1ce7 344 const char *hookname, char **argv)
148e91f5 345{
e1a94937 346 __do_free char *buffer = NULL;
3f60c2f7 347 int buf_pos, i, ret;
d08e5708 348 size_t size = 0;
148e91f5 349
3f60c2f7
CB
350 if (hook_version == 0)
351 INFO("Executing script \"%s\" for container \"%s\", config "
352 "section \"%s\"", script, name, section);
353 else
354 INFO("Executing script \"%s\" for container \"%s\"", script, name);
148e91f5 355
586b1ce7
CB
356 for (i = 0; argv && argv[i]; i++)
357 size += strlen(argv[i]) + 1;
148e91f5 358
6333c915
CB
359 size += STRLITERALLEN("exec");
360 size++;
148e91f5 361 size += strlen(script);
3f60c2f7
CB
362 size++;
363
148e91f5 364 if (size > INT_MAX)
3f60c2f7 365 return -EFBIG;
148e91f5 366
3f60c2f7 367 if (hook_version == 0) {
d08e5708
CB
368 size += strlen(hookname);
369 size++;
370
371 size += strlen(name);
372 size++;
373
374 size += strlen(section);
375 size++;
376
377 if (size > INT_MAX)
378 return -EFBIG;
327cce76 379 }
3f60c2f7 380
6f8d00d2
CB
381 buffer = malloc(size);
382 if (!buffer)
383 return -ENOMEM;
384
327cce76 385 if (hook_version == 0)
3f60c2f7 386 buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
327cce76 387 else
3f60c2f7 388 buf_pos = snprintf(buffer, size, "exec %s", script);
327cce76
CB
389 if (buf_pos < 0 || (size_t)buf_pos >= size) {
390 ERROR("Failed to create command line for script \"%s\"", script);
e1a94937 391 return -1;
327cce76 392 }
3f60c2f7 393
327cce76 394 if (hook_version == 1) {
3f60c2f7
CB
395 ret = setenv("LXC_HOOK_TYPE", hookname, 1);
396 if (ret < 0) {
397 SYSERROR("Failed to set environment variable: "
398 "LXC_HOOK_TYPE=%s", hookname);
e1a94937 399 return -1;
3f60c2f7 400 }
90f20466 401 TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7
CB
402
403 ret = setenv("LXC_HOOK_SECTION", section, 1);
404 if (ret < 0) {
405 SYSERROR("Failed to set environment variable: "
406 "LXC_HOOK_SECTION=%s", section);
e1a94937 407 return -1;
3f60c2f7
CB
408 }
409 TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);
14a7b0f9
CB
410
411 if (strcmp(section, "net") == 0) {
412 char *parent;
413
586b1ce7 414 if (!argv || !argv[0])
e1a94937 415 return -1;
14a7b0f9 416
586b1ce7 417 ret = setenv("LXC_NET_TYPE", argv[0], 1);
14a7b0f9
CB
418 if (ret < 0) {
419 SYSERROR("Failed to set environment variable: "
586b1ce7 420 "LXC_NET_TYPE=%s", argv[0]);
e1a94937 421 return -1;
14a7b0f9 422 }
586b1ce7 423 TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);
14a7b0f9 424
586b1ce7 425 parent = argv[1] ? argv[1] : "";
14a7b0f9 426
a8144263 427 if (strcmp(argv[0], "macvlan") == 0) {
14a7b0f9
CB
428 ret = setenv("LXC_NET_PARENT", parent, 1);
429 if (ret < 0) {
430 SYSERROR("Failed to set environment "
431 "variable: LXC_NET_PARENT=%s", parent);
e1a94937 432 return -1;
14a7b0f9
CB
433 }
434 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 435 } else if (strcmp(argv[0], "phys") == 0) {
14a7b0f9
CB
436 ret = setenv("LXC_NET_PARENT", parent, 1);
437 if (ret < 0) {
438 SYSERROR("Failed to set environment "
439 "variable: LXC_NET_PARENT=%s", parent);
e1a94937 440 return -1;
14a7b0f9
CB
441 }
442 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 443 } else if (strcmp(argv[0], "veth") == 0) {
586b1ce7 444 char *peer = argv[2] ? argv[2] : "";
14a7b0f9
CB
445
446 ret = setenv("LXC_NET_PEER", peer, 1);
447 if (ret < 0) {
448 SYSERROR("Failed to set environment "
449 "variable: LXC_NET_PEER=%s", peer);
e1a94937 450 return -1;
14a7b0f9
CB
451 }
452 TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
453
454 ret = setenv("LXC_NET_PARENT", parent, 1);
455 if (ret < 0) {
456 SYSERROR("Failed to set environment "
457 "variable: LXC_NET_PARENT=%s", parent);
e1a94937 458 return -1;
14a7b0f9
CB
459 }
460 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
461 }
462 }
148e91f5
SH
463 }
464
586b1ce7 465 for (i = 0; argv && argv[i]; i++) {
3f60c2f7
CB
466 size_t len = size - buf_pos;
467
586b1ce7 468 ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
3f60c2f7
CB
469 if (ret < 0 || (size_t)ret >= len) {
470 ERROR("Failed to create command line for script \"%s\"", script);
e1a94937 471 return -1;
148e91f5 472 }
3f60c2f7 473 buf_pos += ret;
148e91f5
SH
474 }
475
e1a94937 476 return run_buffer(buffer);
148e91f5
SH
477}
478
811ef482 479int run_script(const char *name, const char *section, const char *script, ...)
e3b4c4c4 480{
2f443e88 481 __do_free char *buffer = NULL;
abbfd20b 482 int ret;
2f443e88 483 char *p;
abbfd20b 484 va_list ap;
0fd73091 485 size_t size = 0;
751d9dcd 486
0fd73091 487 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
751d9dcd 488 script, name, section);
e3b4c4c4 489
abbfd20b
DL
490 va_start(ap, script);
491 while ((p = va_arg(ap, char *)))
95642a10 492 size += strlen(p) + 1;
abbfd20b
DL
493 va_end(ap);
494
6333c915 495 size += STRLITERALLEN("exec");
abbfd20b
DL
496 size += strlen(script);
497 size += strlen(name);
498 size += strlen(section);
6d1a5f93 499 size += 4;
abbfd20b 500
95642a10
MS
501 if (size > INT_MAX)
502 return -1;
503
2f443e88 504 buffer = must_realloc(NULL, size);
6d1a5f93 505 ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
0fd73091 506 if (ret < 0 || ret >= size)
9ba8130c 507 return -1;
751d9dcd 508
abbfd20b 509 va_start(ap, script);
9ba8130c 510 while ((p = va_arg(ap, char *))) {
062b72c6 511 int len = size - ret;
9ba8130c
SH
512 int rc;
513 rc = snprintf(buffer + ret, len, " %s", p);
7b5a2435
DJ
514 if (rc < 0 || rc >= len) {
515 va_end(ap);
9ba8130c 516 return -1;
7b5a2435 517 }
9ba8130c
SH
518 ret += rc;
519 }
abbfd20b 520 va_end(ap);
751d9dcd 521
91c3830e 522 return run_buffer(buffer);
e3b4c4c4
ST
523}
524
0fd73091 525/* pin_rootfs
63fc76c3 526 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
527 * the duration of the container run, to prevent the container from marking
528 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
529 * no name pollution is happens.
530 * don't unlink on NFS to avoid random named stale handles.
0c547523
SH
531 * return -1 on error.
532 * return -2 if nothing needed to be pinned.
533 * return an open fd (>=0) if we pinned it.
534 */
535int pin_rootfs(const char *rootfs)
536{
957c4704 537 __do_free char *absrootfs = NULL;
0fd73091 538 int fd, ret;
6b5a54cd 539 char absrootfspin[PATH_MAX];
0c547523 540 struct stat s;
63fc76c3 541 struct statfs sfs;
0c547523 542
e99ee0de 543 if (rootfs == NULL || strlen(rootfs) == 0)
0d03360a 544 return -2;
e99ee0de 545
74e7b662 546 absrootfs = realpath(rootfs, NULL);
547 if (!absrootfs)
9be53773 548 return -2;
0c547523 549
0fd73091 550 ret = stat(absrootfs, &s);
957c4704 551 if (ret < 0)
0c547523 552 return -1;
0c547523 553
957c4704 554 if (!S_ISDIR(s.st_mode))
0c547523
SH
555 return -2;
556
6b5a54cd 557 ret = snprintf(absrootfspin, PATH_MAX, "%s/.lxc-keep", absrootfs);
6b5a54cd 558 if (ret < 0 || ret >= PATH_MAX)
0c547523 559 return -1;
0c547523 560
0fd73091 561 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
b7ed4bf0
CS
562 if (fd < 0)
563 return fd;
0fd73091 564
205fc010
CB
565 ret = fstatfs (fd, &sfs);
566 if (ret < 0)
567 return fd;
63fc76c3
GJ
568
569 if (sfs.f_type == NFS_SUPER_MAGIC) {
205fc010 570 DEBUG("Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
63fc76c3
GJ
571 return fd;
572 }
573
b7ed4bf0 574 (void)unlink(absrootfspin);
0fd73091 575
0c547523
SH
576 return fd;
577}
578
/*
 * Extend @flags with the mount flags already in effect on the filesystem
 * containing @s (or @d when @s is NULL), as reported by statvfs().
 * The atime-related flags (noatime, nodiratime, lazytime, relatime,
 * strictatime) are always carried over; nosuid, nodev, ro and noexec are
 * carried over only when @flags requests a remount, so that a remount
 * honors them.
 * @flags is returned unchanged when no path is given, when statvfs() fails,
 * or when the build has no statvfs support.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	struct statvfs sb;
	const char *path = s ? s : d;
	unsigned long inherit = MS_NOATIME | MS_NODIRATIME | MS_LAZYTIME |
				MS_RELATIME | MS_STRICTATIME;

	if (!path)
		return flags;

	if (statvfs(path, &sb) < 0)
		return flags;

	if (flags & MS_REMOUNT)
		inherit |= MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;

	/* Only the bits that are actually set on the filesystem are added. */
	return flags | (sb.f_flag & inherit);
#else
	return flags;
#endif
}
627
6b741397
CB
628static int add_shmount_to_list(struct lxc_conf *conf)
629{
6b5a54cd 630 char new_mount[PATH_MAX];
0d190408 631 /* Offset for the leading '/' since the path_cont
6b741397
CB
632 * is absolute inside the container.
633 */
634 int offset = 1, ret = -1;
0d190408 635
6b741397
CB
636 ret = snprintf(new_mount, sizeof(new_mount),
637 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
638 conf->shmount.path_cont + offset);
60534030 639 if (ret < 0 || (size_t)ret >= sizeof(new_mount))
0d190408
LT
640 return -1;
641
6b741397 642 return add_elem_to_mount_list(new_mount, conf);
0d190408
LT
643}
644
4fb3cba5 645static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 646{
0fd73091 647 int i, r;
b06b8511
CS
648 static struct {
649 int match_mask;
650 int match_flag;
651 const char *source;
652 const char *destination;
653 const char *fstype;
654 unsigned long flags;
655 const char *options;
656 } default_mounts[] = {
0fd73091
CB
657 /* Read-only bind-mounting... In older kernels, doing that
658 * required to do one MS_BIND mount and then
659 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
660 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
661 * onwards. However, this apparently does not work on kernel
662 * 3.8. Unfortunately, on that very same kernel, doing the same
663 * trick as above doesn't seem to work either, there one needs
664 * to ALSO specify MS_BIND for the remount, otherwise the
665 * entire fs is remounted read-only or the mount fails because
666 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
667 * kernels as low as 2.6.32...
368bbc02 668 */
0fd73091 669 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a 670 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
0fd73091
CB
671 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
672 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
673 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
674 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
675 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
676 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
677 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
678 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
679 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
680 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
d1c203f4 681 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
0fd73091
CB
682 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
683 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
684 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
685 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
686 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 687 };
368bbc02 688
b06b8511 689 for (i = 0; default_mounts[i].match_mask; i++) {
8db92302 690 __do_free char *destination = NULL, *source = NULL;
0fd73091
CB
691 int saved_errno;
692 unsigned long mflags;
0fd73091
CB
693 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
694 continue;
695
696 if (default_mounts[i].source) {
cc4fd506 697 /* will act like strdup if %r is not present */
0fd73091
CB
698 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
699 if (!source)
cc4fd506 700 return -1;
0fd73091 701 }
f24a52d5 702
0fd73091
CB
703 if (!default_mounts[i].destination) {
704 ERROR("BUG: auto mounts destination %d was NULL", i);
0fd73091
CB
705 return -1;
706 }
707
708 /* will act like strdup if %r is not present */
709 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
710 if (!destination) {
0fd73091
CB
711 return -1;
712 }
713
714 mflags = add_required_remount_flags(source, destination,
715 default_mounts[i].flags);
716 r = safe_mount(source, destination, default_mounts[i].fstype,
717 mflags, default_mounts[i].options,
718 conf->rootfs.path ? conf->rootfs.mount : NULL);
719 saved_errno = errno;
720 if (r < 0 && errno == ENOENT) {
721 INFO("Mount source or target for \"%s\" on \"%s\" does "
722 "not exist. Skipping", source, destination);
723 r = 0;
724 } else if (r < 0) {
725 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
726 }
727
0fd73091
CB
728 if (r < 0) {
729 errno = saved_errno;
730 return -1;
368bbc02 731 }
368bbc02
CS
732 }
733
b06b8511 734 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
735 int cg_flags;
736
3f69fb12 737 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
738 /* If the type of cgroup mount was not specified, it depends on
739 * the container's capabilities as to what makes sense: if we
740 * have CAP_SYS_ADMIN, the read-only part can be remounted
741 * read-write anyway, so we may as well default to read-write;
742 * then the admin will not be given a false sense of security.
743 * (And if they really want mixed r/o r/w, then they can
744 * explicitly specify :mixed.) OTOH, if the container lacks
745 * CAP_SYS_ADMIN, do only default to :mixed, because then the
746 * container can't remount it read-write.
747 */
0769b82a
CS
748 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
749 int has_sys_admin = 0;
b0ee5983
CB
750
751 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 752 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 753 else
0769b82a 754 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
755
756 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 757 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 758 else
0769b82a 759 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 760 }
0fd73091 761
3f69fb12 762 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
763 cg_flags |= LXC_AUTO_CGROUP_FORCE;
764
2202afc9
CB
765 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
766 handler,
767 conf->rootfs.path ? conf->rootfs.mount : "",
768 cg_flags)) {
0fd73091 769 SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
b06b8511 770 return -1;
368bbc02
CS
771 }
772 }
773
0d190408
LT
774 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
775 int ret = add_shmount_to_list(conf);
776 if (ret < 0) {
777 ERROR("Failed to add shmount entry to container config");
6b741397 778 return -1;
0d190408
LT
779 }
780 }
781
368bbc02 782 return 0;
368bbc02
CS
783}
784
/*
 * Set the hostname to utsname->nodename via sethostname().
 * A NULL @utsname means there is nothing to do. Returns 0 on success (or
 * when there was nothing to do) and -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0) {
		SYSERROR("Failed to set the hostname to \"%s\"", utsname->nodename);
		return -1;
	}

	INFO("Set hostname to \"%s\"", utsname->nodename);

	return 0;
}
802
/* A symlink to create: "name" is the link's file name, "oldpath" its target. */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* The standard fd symlinks, each pointing into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
814
ed8704d0 815static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 816{
0fd73091 817 int i, ret;
6b5a54cd 818 char path[PATH_MAX];
09227be2 819 struct stat s;
69aa6655 820
69aa6655
DE
821 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
822 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091
CB
823
824 ret = snprintf(path, sizeof(path), "%s/dev/%s",
825 rootfs->path ? rootfs->mount : "", d->name);
6b5a54cd 826 if (ret < 0 || ret >= PATH_MAX)
69aa6655 827 return -1;
09227be2 828
0fd73091
CB
829 /* Stat the path first. If we don't get an error accept it as
830 * is and don't try to create it
09227be2 831 */
0fd73091
CB
832 ret = stat(path, &s);
833 if (ret == 0)
09227be2 834 continue;
09227be2 835
69aa6655
DE
836 ret = symlink(d->oldpath, path);
837 if (ret && errno != EEXIST) {
0fd73091
CB
838 if (errno == EROFS) {
839 WARN("Failed to create \"%s\". Read-only filesystem", path);
09227be2 840 } else {
0fd73091 841 SYSERROR("Failed to create \"%s\"", path);
09227be2
MW
842 return -1;
843 }
69aa6655
DE
844 }
845 }
0fd73091 846
69aa6655
DE
847 return 0;
848}
849
/*
 * Build a space-separated list of ptys to pass to systemd.
 * On the first call (*pp == NULL) the string "container_ttys=<name>" is
 * allocated; later calls grow it by " <name>". Returns false if memory
 * cannot be allocated, in which case *pp is left as it was.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (!*pp) {
		/* sizeof(prefix) already accounts for the terminating NUL. */
		char *fresh = malloc(name_len + sizeof(prefix));

		if (!fresh)
			return false;

		memcpy(fresh, prefix, sizeof(prefix) - 1);
		memcpy(fresh + sizeof(prefix) - 1, name, name_len + 1);
		*pp = fresh;
		return true;
	}

	/* Room for the separating space and the terminating NUL. */
	cur_len = strlen(*pp);
	grown = realloc(*pp, cur_len + name_len + 2);
	if (!grown)
		return false;

	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);
	*pp = grown;

	return true;
}
876
2187efd3 877static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 878{
9e1045e3 879 int i, ret;
0e4be3cf 880 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 881 char *ttydir = ttys->dir;
6b5a54cd 882 char path[PATH_MAX], lxcpath[PATH_MAX];
b0a33c1e 883
e8bd4e43 884 if (!conf->rootfs.path)
bc9bd0e3
DL
885 return 0;
886
885766f5 887 for (i = 0; i < ttys->max; i++) {
0e4be3cf 888 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 889
e8bd4e43 890 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 891 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 892 return -1;
9e1045e3 893
7c6ef2a2
SH
894 if (ttydir) {
895 /* create dev/lxc/tty%d" */
9e1045e3
CB
896 ret = snprintf(lxcpath, sizeof(lxcpath),
897 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 898 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 899 return -1;
9e1045e3 900
adc1c715 901 ret = mknod(lxcpath, S_IFREG | 0000, 0);
9e1045e3 902 if (ret < 0 && errno != EEXIST) {
73363c61 903 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
904 return -1;
905 }
9e1045e3 906
7c6ef2a2 907 ret = unlink(path);
9e1045e3 908 if (ret < 0 && errno != ENOENT) {
73363c61 909 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
910 return -1;
911 }
b0a33c1e 912
2520facd 913 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
9e1045e3 914 if (ret < 0) {
adc1c715
WB
915 SYSWARN("Failed to bind mount \"%s\" onto \"%s\"",
916 tty->name, lxcpath);
7c6ef2a2
SH
917 continue;
918 }
0fd73091 919 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name,
adc1c715 920 lxcpath);
13954cce 921
9e1045e3
CB
922 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
923 ttydir, i + 1);
73363c61 924 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 925 return -1;
9e1045e3 926
7c6ef2a2 927 ret = symlink(lxcpath, path);
9e1045e3 928 if (ret < 0) {
73363c61 929 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 930 path, lxcpath);
7c6ef2a2
SH
931 return -1;
932 }
933 } else {
9e1045e3
CB
934 /* If we populated /dev, then we need to create
935 * /dev/ttyN
936 */
d3ccc04e
CB
937 ret = mknod(path, S_IFREG | 0000, 0);
938 if (ret < 0) /* this isn't fatal, continue */
6d1400b5 939 SYSERROR("Failed to create \"%s\"", path);
9e1045e3 940
2520facd 941 ret = mount(tty->name, path, "none", MS_BIND, 0);
9e1045e3 942 if (ret < 0) {
2520facd 943 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
7c6ef2a2
SH
944 continue;
945 }
9e1045e3 946
d3ccc04e 947 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
393903d1 948 }
9e1045e3 949
885766f5 950 if (!append_ttyname(&conf->ttys.tty_names, tty->name)) {
393903d1
SH
951 ERROR("Error setting up container_ttys string");
952 return -1;
b0a33c1e 953 }
954 }
955
885766f5 956 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 957 return 0;
958}
959
663014ee 960int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 961{
fca23691 962 size_t i;
963 int ret;
0fd73091 964 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3
CB
965
966 /* no tty in the configuration */
885766f5 967 if (ttys->max == 0)
2187efd3
CB
968 return 0;
969
885766f5 970 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
0e4be3cf 971 if (!ttys->tty)
2187efd3 972 return -ENOMEM;
2187efd3 973
885766f5 974 for (i = 0; i < ttys->max; i++) {
0e4be3cf 975 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 976
386e6768
CB
977 tty->master = -EBADF;
978 tty->slave = -EBADF;
77a39805
CB
979 ret = openpty(&tty->master, &tty->slave, NULL, NULL, NULL);
980 if (ret < 0) {
fca23691 981 SYSERROR("Failed to create tty %zu", i);
885766f5 982 ttys->max = i;
0e4be3cf 983 lxc_delete_tty(ttys);
2187efd3
CB
984 return -ENOTTY;
985 }
986
77a39805
CB
987 ret = ttyname_r(tty->slave, tty->name, sizeof(tty->name));
988 if (ret < 0) {
fca23691 989 SYSERROR("Failed to retrieve name of tty %zu slave", i);
77a39805
CB
990 ttys->max = i;
991 lxc_delete_tty(ttys);
992 return -ENOTTY;
993 }
994
0fd73091 995 DEBUG("Created tty \"%s\" with master fd %d and slave fd %d",
2520facd 996 tty->name, tty->master, tty->slave);
2187efd3
CB
997
998 /* Prevent leaking the file descriptors to the container */
615f24ff 999 ret = fd_cloexec(tty->master, true);
2187efd3 1000 if (ret < 0)
a24c5678 1001 SYSWARN("Failed to set FD_CLOEXEC flag on master fd %d of "
1002 "tty device \"%s\"", tty->master, tty->name);
2187efd3 1003
615f24ff 1004 ret = fd_cloexec(tty->slave, true);
2187efd3 1005 if (ret < 0)
a24c5678 1006 SYSWARN("Failed to set FD_CLOEXEC flag on slave fd %d of "
1007 "tty device \"%s\"", tty->slave, tty->name);
2187efd3 1008
7581d645 1009 tty->busy = -1;
2187efd3
CB
1010 }
1011
885766f5 1012 INFO("Finished creating %zu tty devices", ttys->max);
2187efd3
CB
1013 return 0;
1014}
1015
0e4be3cf 1016void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3
CB
1017{
1018 int i;
1019
386e6768
CB
1020 if (!ttys->tty)
1021 return;
1022
885766f5 1023 for (i = 0; i < ttys->max; i++) {
0e4be3cf 1024 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1025
386e6768
CB
1026 if (tty->master >= 0) {
1027 close(tty->master);
1028 tty->master = -EBADF;
1029 }
1030
1031 if (tty->slave >= 0) {
1032 close(tty->slave);
1033 tty->slave = -EBADF;
1034 }
2187efd3
CB
1035 }
1036
0e4be3cf
CB
1037 free(ttys->tty);
1038 ttys->tty = NULL;
2187efd3
CB
1039}
1040
1041static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1042{
1043 int i;
0fd73091 1044 int ret = -1;
2187efd3 1045 struct lxc_conf *conf = handler->conf;
0e4be3cf 1046 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 1047 int sock = handler->data_sock[0];
2187efd3 1048
885766f5 1049 if (ttys->max == 0)
2187efd3
CB
1050 return 0;
1051
885766f5 1052 for (i = 0; i < ttys->max; i++) {
2187efd3 1053 int ttyfds[2];
0e4be3cf 1054 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1055
2520facd
CB
1056 ttyfds[0] = tty->master;
1057 ttyfds[1] = tty->slave;
2187efd3
CB
1058
1059 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1060 if (ret < 0)
1061 break;
1062
7d7333b3 1063 TRACE("Sent tty \"%s\" with master fd %d and slave fd %d to "
2520facd 1064 "parent", tty->name, tty->master, tty->slave);
2187efd3
CB
1065 }
1066
1067 if (ret < 0)
6d1400b5 1068 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1069 else
885766f5 1070 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1071
1072 return ret;
1073}
1074
1075static int lxc_create_ttys(struct lxc_handler *handler)
1076{
1077 int ret = -1;
1078 struct lxc_conf *conf = handler->conf;
1079
663014ee 1080 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1081 if (ret < 0) {
1082 ERROR("Failed to allocate ttys");
1083 goto on_error;
1084 }
1085
1086 ret = lxc_send_ttys_to_parent(handler);
1087 if (ret < 0) {
1088 ERROR("Failed to send ttys to parent");
1089 goto on_error;
1090 }
1091
1092 if (!conf->is_execute) {
1093 ret = lxc_setup_ttys(conf);
1094 if (ret < 0) {
1095 ERROR("Failed to setup ttys");
1096 goto on_error;
1097 }
1098 }
1099
885766f5
CB
1100 if (conf->ttys.tty_names) {
1101 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1102 if (ret < 0)
885766f5 1103 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1104 }
1105
1106 ret = 0;
1107
1108on_error:
0e4be3cf 1109 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1110
1111 return ret;
1112}
1113
7133b912
CB
1114/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1115 * error, log it but don't fail yet.
91c3830e 1116 */
7133b912 1117static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
63012bdd 1118 int autodevtmpfssize, const char *lxcpath)
91c3830e 1119{
2f443e88 1120 __do_free char *path = NULL;
91c3830e 1121 int ret;
87da4ec3 1122 size_t clen;
87e0e273 1123 mode_t cur_mask;
63012bdd 1124 char mount_options[128];
91c3830e 1125
7133b912 1126 INFO("Preparing \"/dev\"");
bc6928ff 1127
14221cbb 1128 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1129 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
2f443e88 1130 path = must_realloc(NULL, clen);
63012bdd
CK
1131 sprintf(mount_options, "size=%d,mode=755", (autodevtmpfssize != 0) ? autodevtmpfssize : 500000);
1132 DEBUG("Using mount options: %s", mount_options);
bc6928ff 1133
ec50007f 1134 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1135 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1136 return -1;
bc6928ff 1137
87e0e273
CB
1138 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1139 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1140 if (ret < 0 && errno != EEXIST) {
1141 SYSERROR("Failed to create \"/dev\" directory");
1142 ret = -errno;
1143 goto reset_umask;
bc6928ff 1144 }
87da4ec3 1145
63012bdd
CK
1146 ret = safe_mount("none", path, "tmpfs", 0, mount_options,
1147 rootfs->path ? rootfs->mount : NULL );
7133b912
CB
1148 if (ret < 0) {
1149 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
87e0e273 1150 goto reset_umask;
91c3830e 1151 }
87e0e273 1152 TRACE("Mounted tmpfs on \"%s\"", path);
87da4ec3 1153
ec50007f 1154 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87e0e273
CB
1155 if (ret < 0 || (size_t)ret >= clen) {
1156 ret = -1;
1157 goto reset_umask;
1158 }
87da4ec3 1159
7133b912 1160 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1161 * If not, then create it and exit if that fails...
1162 */
87e0e273
CB
1163 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1164 if (ret < 0 && errno != EEXIST) {
1165 SYSERROR("Failed to create directory \"%s\"", path);
1166 ret = -errno;
1167 goto reset_umask;
91c3830e
SH
1168 }
1169
87e0e273
CB
1170 ret = 0;
1171
1172reset_umask:
1173 (void)umask(cur_mask);
1174
7133b912 1175 INFO("Prepared \"/dev\"");
87e0e273 1176 return ret;
91c3830e
SH
1177}
1178
/* Description of one device node to create below the container's "/dev". */
struct lxc_device_node {
	/* file name relative to "/dev" */
	const char *name;
	/* file type and permission bits passed to mknod() */
	const mode_t mode;
	/* device major number */
	const int maj;
	/* device minor number */
	const int min;
};
1185
5e73416f 1186static const struct lxc_device_node lxc_devices[] = {
06749971 1187 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
5e73416f 1188 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
06749971
CB
1189 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1190 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
5e73416f
CB
1191 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1192 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
c6883f38
SH
1193};
1194
/* Strategy state used by lxc_fill_autodev() while populating "/dev". The
 * numeric order matters: that function compares the state with ">=".
 */
enum {
	/* mknod() is not permitted: bind-mount the host's device nodes */
	LXC_DEVNODE_BIND,
	/* try mknod() and probe whether the created node can be opened */
	LXC_DEVNODE_MKNOD,
	/* mknod() works but the created nodes cannot be opened */
	LXC_DEVNODE_PARTIAL,
	/* mknod() works and the created nodes can be opened */
	LXC_DEVNODE_OPEN,
};
1202
27245ff7 1203static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38 1204{
5e73416f 1205 int i, ret;
6b5a54cd 1206 char path[PATH_MAX];
3a32201c 1207 mode_t cmask;
5067e4dd 1208 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1209
6b5a54cd 1210 ret = snprintf(path, PATH_MAX, "%s/dev",
3999be0a 1211 rootfs->path ? rootfs->mount : "");
6b5a54cd 1212 if (ret < 0 || ret >= PATH_MAX)
c6883f38 1213 return -1;
91c3830e 1214
0bbf8572
CB
1215 /* ignore, just don't try to fill in */
1216 if (!dir_exists(path))
9cb4d183
SH
1217 return 0;
1218
3999be0a
CB
1219 INFO("Populating \"/dev\"");
1220
3a32201c 1221 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f 1222 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
6b5a54cd 1223 char hostpath[PATH_MAX];
5e73416f 1224 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1225
6b5a54cd 1226 ret = snprintf(path, PATH_MAX, "%s/dev/%s",
5e73416f 1227 rootfs->path ? rootfs->mount : "", device->name);
6b5a54cd 1228 if (ret < 0 || ret >= PATH_MAX)
c6883f38 1229 return -1;
0bbf8572 1230
5067e4dd 1231 if (use_mknod >= LXC_DEVNODE_MKNOD) {
5e73416f
CB
1232 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1233 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1234 DEBUG("Created device node \"%s\"", path);
5067e4dd
CB
1235 } else if (ret < 0) {
1236 if (errno != EPERM) {
1237 SYSERROR("Failed to create device node \"%s\"", path);
1238 return -1;
1239 }
0bbf8572 1240
5067e4dd 1241 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1242 }
3999be0a 1243
5067e4dd
CB
1244 /* Device nodes are fully useable. */
1245 if (use_mknod == LXC_DEVNODE_OPEN)
1246 continue;
1247
1248 if (use_mknod == LXC_DEVNODE_MKNOD) {
1249 /* See
1250 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1251 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1252 */
1253 ret = open(path, O_RDONLY | O_CLOEXEC);
1254 if (ret >= 0) {
ae2a3d81 1255 close_prot_errno_disarm(ret);
5067e4dd
CB
1256 /* Device nodes are fully useable. */
1257 use_mknod = LXC_DEVNODE_OPEN;
1258 continue;
1259 }
1260
1261 SYSTRACE("Failed to open \"%s\" device", path);
1262 /* Device nodes are only partially useable. */
1263 use_mknod = LXC_DEVNODE_PARTIAL;
1264 }
5e73416f
CB
1265 }
1266
5067e4dd
CB
1267 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1268 /* If we are dealing with partially functional device
1269 * nodes the prio mknod() call will have created the
1270 * device node so we can use it as a bind-mount target.
1271 */
1272 ret = mknod(path, S_IFREG | 0000, 0);
1273 if (ret < 0 && errno != EEXIST) {
1274 SYSERROR("Failed to create file \"%s\"", path);
1275 return -1;
1276 }
5e73416f
CB
1277 }
1278
1279 /* Fallback to bind-mounting the device from the host. */
6b5a54cd
CB
1280 ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
1281 if (ret < 0 || ret >= PATH_MAX)
5e73416f
CB
1282 return -1;
1283
1284 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1285 rootfs->path ? rootfs->mount : NULL);
1286 if (ret < 0) {
1287 SYSERROR("Failed to bind mount host device node \"%s\" "
1288 "onto \"%s\"", hostpath, path);
1289 return -1;
c6883f38 1290 }
5e73416f
CB
1291 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
1292 hostpath, path);
c6883f38 1293 }
5e73416f 1294 (void)umask(cmask);
c6883f38 1295
3999be0a 1296 INFO("Populated \"/dev\"");
c6883f38
SH
1297 return 0;
1298}
1299
8ce1abc2 1300static int lxc_mount_rootfs(struct lxc_conf *conf)
0ad19a3f 1301{
9aa76a17 1302 int ret;
10bc1861 1303 struct lxc_storage *bdev;
8ce1abc2 1304 const struct lxc_rootfs *rootfs = &conf->rootfs;
cc28d0b0 1305
a0f379bf 1306 if (!rootfs->path) {
0fd73091
CB
1307 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1308 if (ret < 0) {
8ce1abc2 1309 SYSERROR("Failed to remount \"/\" MS_REC | MS_SLAVE");
a0f379bf
DW
1310 return -1;
1311 }
0fd73091 1312
c69bd12f 1313 return 0;
a0f379bf 1314 }
0ad19a3f 1315
0fd73091
CB
1316 ret = access(rootfs->mount, F_OK);
1317 if (ret != 0) {
1318 SYSERROR("Failed to access to \"%s\". Check it is present",
12297168 1319 rootfs->mount);
b1789442
DL
1320 return -1;
1321 }
1322
8a388ed4 1323 bdev = storage_init(conf);
9aa76a17 1324 if (!bdev) {
0fd73091 1325 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1326 rootfs->path, rootfs->mount,
1327 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1328 return -1;
9be53773 1329 }
9aa76a17
CB
1330
1331 ret = bdev->ops->mount(bdev);
10bc1861 1332 storage_put(bdev);
9aa76a17 1333 if (ret < 0) {
0fd73091 1334 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1335 rootfs->path, rootfs->mount,
1336 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1337 return -1;
1338 }
0ad19a3f 1339
0fd73091 1340 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1341 rootfs->path, rootfs->mount,
1342 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1343
ac778708
DL
1344 return 0;
1345}
1346
8ce1abc2 1347int lxc_chroot(const struct lxc_rootfs *rootfs)
91e93c71 1348{
b8d88764 1349 __do_free char *nroot = NULL;
0fd73091 1350 int i, ret;
8ce1abc2 1351 char *root = rootfs->mount;
91e93c71 1352
74e7b662 1353 nroot = realpath(root, NULL);
1354 if (!nroot) {
8ce1abc2 1355 SYSERROR("Failed to resolve \"%s\"", root);
0fd73091 1356 return -1;
8ce1abc2 1357 }
91e93c71 1358
0fd73091 1359 ret = chdir("/");
b8d88764 1360 if (ret < 0)
0fd73091 1361 return -1;
91e93c71 1362
0fd73091
CB
1363 /* We could use here MS_MOVE, but in userns this mount is locked and
1364 * can't be moved.
91e93c71 1365 */
8ce1abc2 1366 ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
0fd73091 1367 if (ret < 0) {
8ce1abc2 1368 SYSERROR("Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);
0fd73091 1369 return -1;
91e93c71
AV
1370 }
1371
0fd73091
CB
1372 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
1373 if (ret < 0) {
8ce1abc2 1374 SYSERROR("Failed to remount \"/\"");
0fd73091 1375 return -1;
91e93c71
AV
1376 }
1377
aa899945 1378 /* The following code cleans up inherited mounts which are not required
0fd73091 1379 * for CT.
91e93c71
AV
1380 *
1381 * The mountinfo file shows not all mounts, if a few points have been
1382 * unmounted between read operations from the mountinfo. So we need to
1383 * read mountinfo a few times.
1384 *
7ded5fa7 1385 * This loop can be skipped if a container uses userns, because all
91e93c71
AV
1386 * inherited mounts are locked and we should live with all this trash.
1387 */
0fd73091 1388 for (;;) {
4fdd1f72 1389 __do_fclose FILE *f = NULL;
f3d38164
CB
1390 __do_free char *line = NULL;
1391 char *slider1, *slider2;
91e93c71 1392 int progress = 0;
f3d38164 1393 size_t len = 0;
91e93c71 1394
4110345b 1395 f = fopen("./proc/self/mountinfo", "re");
91e93c71 1396 if (!f) {
8ce1abc2 1397 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
91e93c71
AV
1398 return -1;
1399 }
0fd73091 1400
f3d38164
CB
1401 while (getline(&line, &len, f) > 0) {
1402 for (slider1 = line, i = 0; slider1 && i < 4; i++)
1403 slider1 = strchr(slider1 + 1, ' ');
0fd73091 1404
f3d38164 1405 if (!slider1)
91e93c71 1406 continue;
0fd73091 1407
f3d38164
CB
1408 slider2 = strchr(slider1 + 1, ' ');
1409 if (!slider2)
91e93c71
AV
1410 continue;
1411
f3d38164
CB
1412 *slider2 = '\0';
1413 *slider1 = '.';
91e93c71 1414
f3d38164 1415 if (strcmp(slider1 + 1, "/") == 0)
91e93c71 1416 continue;
0fd73091 1417
f3d38164 1418 if (strcmp(slider1 + 1, "/proc") == 0)
91e93c71
AV
1419 continue;
1420
f3d38164 1421 ret = umount2(slider1, MNT_DETACH);
0fd73091 1422 if (ret == 0)
91e93c71
AV
1423 progress++;
1424 }
0fd73091 1425
91e93c71
AV
1426 if (!progress)
1427 break;
1428 }
1429
7ded5fa7 1430 /* This also can be skipped if a container uses userns. */
0fd73091 1431 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1432
1433 /* It is weird, but chdir("..") moves us in a new root */
0fd73091
CB
1434 ret = chdir("..");
1435 if (ret < 0) {
8ce1abc2 1436 SYSERROR("Failed to chdir(\"..\")");
91e93c71
AV
1437 return -1;
1438 }
1439
0fd73091
CB
1440 ret = chroot(".");
1441 if (ret < 0) {
8ce1abc2 1442 SYSERROR("Failed to chroot(\".\")");
91e93c71
AV
1443 return -1;
1444 }
1445
1446 return 0;
1447}
1448
8ce1abc2
CB
1449/* (The following explanation is copied verbatim from the kernel.)
1450 *
1451 * pivot_root Semantics:
1452 * Moves the root file system of the current process to the directory put_old,
1453 * makes new_root as the new root file system of the current process, and sets
1454 * root/cwd of all processes which had them on the current root to new_root.
1455 *
1456 * Restrictions:
1457 * The new_root and put_old must be directories, and must not be on the
1458 * same file system as the current process root. The put_old must be
1459 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1460 * pointed to by put_old must yield the same directory as new_root. No other
1461 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1462 *
1463 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1464 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1465 * in this situation.
1466 *
1467 * Notes:
1468 * - we don't move root/cwd if they are not at the root (reason: if something
1469 * cared enough to change them, it's probably wrong to force them elsewhere)
1470 * - it's okay to pick a root that isn't the root of a file system, e.g.
1471 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1472 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1473 * first.
1474 */
1475static int lxc_pivot_root(const char *rootfs)
ac778708 1476{
b0d7aac4
CB
1477 __do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF;
1478 int ret;
0fd73091 1479
7806ebd7 1480 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
8ce1abc2
CB
1481 if (oldroot < 0) {
1482 SYSERROR("Failed to open old root directory");
1483 return -1;
39c7b795 1484 }
ac778708 1485
7806ebd7 1486 newroot = open(rootfs, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
8ce1abc2
CB
1487 if (newroot < 0) {
1488 SYSERROR("Failed to open new root directory");
b0d7aac4 1489 return -1;
8ce1abc2 1490 }
0fd73091 1491
8ce1abc2
CB
1492 /* change into new root fs */
1493 ret = fchdir(newroot);
1494 if (ret < 0) {
8ce1abc2 1495 SYSERROR("Failed to change to new rootfs \"%s\"", rootfs);
b0d7aac4 1496 return -1;
8ce1abc2 1497 }
39c7b795 1498
8ce1abc2
CB
1499 /* pivot_root into our new root fs */
1500 ret = pivot_root(".", ".");
1501 if (ret < 0) {
8ce1abc2 1502 SYSERROR("Failed to pivot_root()");
b0d7aac4 1503 return -1;
39c7b795
CB
1504 }
1505
8ce1abc2
CB
1506 /* At this point the old-root is mounted on top of our new-root. To
1507 * unmounted it we must not be chdir'd into it, so escape back to
1508 * old-root.
1509 */
1510 ret = fchdir(oldroot);
0fd73091 1511 if (ret < 0) {
8ce1abc2 1512 SYSERROR("Failed to enter old root directory");
b0d7aac4 1513 return -1;
c69bd12f
DL
1514 }
1515
8ce1abc2
CB
1516 /* Make oldroot rslave to make sure our umounts don't propagate to the
1517 * host.
1518 */
1519 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
1520 if (ret < 0) {
8ce1abc2 1521 SYSERROR("Failed to make oldroot rslave");
b0d7aac4 1522 return -1;
8ce1abc2
CB
1523 }
1524
1525 ret = umount2(".", MNT_DETACH);
1526 if (ret < 0) {
8ce1abc2 1527 SYSERROR("Failed to detach old root directory");
b0d7aac4 1528 return -1;
8ce1abc2
CB
1529 }
1530
1531 ret = fchdir(newroot);
1532 if (ret < 0) {
8ce1abc2 1533 SYSERROR("Failed to re-enter new root directory");
b0d7aac4 1534 return -1;
8ce1abc2
CB
1535 }
1536
8ce1abc2
CB
1537 TRACE("pivot_root(\"%s\") successful", rootfs);
1538
b0d7aac4 1539 return 0;
0ad19a3f 1540}
1541
8ce1abc2
CB
1542static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1543{
1544 if (!rootfs->path) {
1545 DEBUG("Container does not have a rootfs");
1546 return 0;
1547 }
1548
1549 if (detect_ramfs_rootfs())
1550 return lxc_chroot(rootfs);
1551
1552 return lxc_pivot_root(rootfs->mount);
0ad19a3f 1553}
1554
8ce1abc2
CB
1555static const struct id_map *find_mapped_nsid_entry(struct lxc_conf *conf,
1556 unsigned id,
1557 enum idtype idtype)
f4900711
CB
1558{
1559 struct lxc_list *it;
1560 struct id_map *map;
1561 struct id_map *retmap = NULL;
1562
dcf0ffdf
CB
1563 /* Shortcut for container's root mappings. */
1564 if (id == 0) {
1565 if (idtype == ID_TYPE_UID)
1566 return conf->root_nsuid_map;
1567
1568 if (idtype == ID_TYPE_GID)
1569 return conf->root_nsgid_map;
1570 }
1571
f4900711
CB
1572 lxc_list_for_each(it, &conf->id_map) {
1573 map = it->elem;
1574 if (map->idtype != idtype)
1575 continue;
1576
1577 if (id >= map->nsid && id < map->nsid + map->range) {
1578 retmap = map;
1579 break;
1580 }
1581 }
1582
1583 return retmap;
1584}
1585
1586static int lxc_setup_devpts(struct lxc_conf *conf)
3c26f34e 1587{
70761e5e 1588 int ret;
ce155c60 1589 char **opts;
9d28c4f9 1590 char devpts_mntopts[256];
ce155c60
CB
1591 char *mntopt_sets[5];
1592 char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
77890c6d 1593
e528c735 1594 if (conf->pty_max <= 0) {
0fd73091 1595 DEBUG("No new devpts instance will be mounted since no pts "
70761e5e 1596 "devices are requested");
d852c78c 1597 return 0;
3c26f34e 1598 }
1599
e528c735
CB
1600 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1601 default_devpts_mntopts, conf->pty_max);
9d28c4f9
CB
1602 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1603 return -1;
1604
29a7b484 1605 (void)umount2("/dev/pts", MNT_DETACH);
7e40254a 1606
70761e5e
CB
1607 /* Create mountpoint for devpts instance. */
1608 ret = mkdir("/dev/pts", 0755);
1609 if (ret < 0 && errno != EEXIST) {
0fd73091 1610 SYSERROR("Failed to create \"/dev/pts\" directory");
3c26f34e 1611 return -1;
1612 }
1613
ce155c60
CB
1614 /* gid=5 && max= */
1615 mntopt_sets[0] = devpts_mntopts;
dfbd4730 1616
ce155c60 1617 /* !gid=5 && max= */
6333c915 1618 mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1619
1620 /* gid=5 && !max= */
1621 mntopt_sets[2] = default_devpts_mntopts;
1622
1623 /* !gid=5 && !max= */
6333c915 1624 mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1625
1626 /* end */
1627 mntopt_sets[4] = NULL;
1628
1629 for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
1630 /* mount new devpts instance */
1631 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
1632 if (ret == 0)
1633 break;
1634 }
1635
1636 if (ret < 0) {
1637 SYSERROR("Failed to mount new devpts instance");
1638 return -1;
70761e5e 1639 }
ce155c60 1640 DEBUG("Mount new devpts instance with options \"%s\"", *opts);
70761e5e 1641
d5cb35d6 1642 /* Remove any pre-existing /dev/ptmx file. */
b29e05d6
CB
1643 ret = remove("/dev/ptmx");
1644 if (ret < 0) {
1645 if (errno != ENOENT) {
0fd73091 1646 SYSERROR("Failed to remove existing \"/dev/ptmx\" file");
d5cb35d6 1647 return -1;
70761e5e 1648 }
b29e05d6 1649 } else {
0fd73091 1650 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1651 }
1652
d5cb35d6 1653 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
3b7e332f
CB
1654 ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
1655 if (ret < 0 && errno != EEXIST) {
0fd73091 1656 SYSERROR("Failed to create dummy \"/dev/ptmx\" file as bind mount target");
d5cb35d6
CB
1657 return -1;
1658 }
0fd73091 1659 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1660
d5cb35d6 1661 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1662 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
d5cb35d6 1663 if (!ret) {
0fd73091 1664 DEBUG("Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1665 return 0;
1666 } else {
1667 /* Fallthrough and try to create a symlink. */
0fd73091 1668 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1669 }
1670
1671 /* Remove the dummy /dev/ptmx file we created above. */
1672 ret = remove("/dev/ptmx");
70761e5e 1673 if (ret < 0) {
0fd73091 1674 SYSERROR("Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1675 return -1;
1676 }
1677
1678 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1679 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
1680 if (ret < 0) {
0fd73091 1681 SYSERROR("Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1682 return -1;
1683 }
0fd73091 1684 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1685
3c26f34e 1686 return 0;
1687}
1688
/* Apply the execution domain @persona to the current process.
 *
 * A value of -1 means "leave unchanged". When the build has no
 * <sys/personality.h> support the function does nothing.
 *
 * Returns 0 on success (or when nothing is done) and -1 when personality()
 * fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	/* Declared here so it is not an unused variable in builds without
	 * personality support.
	 */
	int ret;

	if (persona == -1)
		return 0;

	ret = personality(persona);
	if (ret < 0) {
		SYSERROR("Failed to set personality to \"0x%x\"", persona);
		return -1;
	}

	INFO("Set personality to \"0x%x\"", persona);
#else
	(void)persona;
#endif

	return 0;
}
1708
3d7d929a 1709static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
dcad02f8 1710 const struct lxc_terminal *console)
6e590161 1711{
882671aa 1712 int ret;
6b5a54cd 1713 char path[PATH_MAX];
86530b0a 1714 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1715
8b1b1210
CB
1716 if (console->path && !strcmp(console->path, "none"))
1717 return 0;
1718
86530b0a 1719 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3d7d929a 1720 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1721 return -1;
52e35957 1722
8b1b1210
CB
1723 /* When we are asked to setup a console we remove any previous
1724 * /dev/console bind-mounts.
1725 */
a7ba3c7f
CB
1726 if (file_exists(path)) {
1727 ret = lxc_unstack_mountpoint(path, false);
1728 if (ret < 0) {
6d1400b5 1729 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1730 return -ret;
1731 } else {
86530b0a 1732 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1733 }
8b1b1210
CB
1734 }
1735
1736 /* For unprivileged containers autodev or automounts will already have
1737 * taken care of creating /dev/console.
1738 */
882671aa 1739 ret = mknod(path, S_IFREG | 0000, 0);
3b7e332f
CB
1740 if (ret < 0 && errno != EEXIST) {
1741 SYSERROR("Failed to create console");
1742 return -errno;
52e35957
DL
1743 }
1744
e581b9b5 1745 ret = fchmod(console->slave, S_IXUSR | S_IXGRP);
86530b0a 1746 if (ret < 0) {
0fd73091 1747 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
e581b9b5 1748 S_IXUSR | S_IXGRP, console->name);
3d7d929a 1749 return -errno;
63376d7d 1750 }
13954cce 1751
86530b0a
L
1752 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
1753 if (ret < 0) {
0fd73091 1754 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, path);
6e590161 1755 return -1;
1756 }
1757
86530b0a 1758 DEBUG("Mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1759 return 0;
1760}
1761
3d7d929a 1762static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
dcad02f8 1763 const struct lxc_terminal *console,
3d7d929a 1764 char *ttydir)
7c6ef2a2 1765{
3b7e332f 1766 int ret;
6b5a54cd 1767 char path[PATH_MAX], lxcpath[PATH_MAX];
86530b0a 1768 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2 1769
3dc035f1
L
1770 if (console->path && !strcmp(console->path, "none"))
1771 return 0;
1772
7c6ef2a2 1773 /* create rootfs/dev/<ttydir> directory */
86530b0a 1774 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
3d7d929a 1775 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1776 return -1;
3d7d929a 1777
7c6ef2a2
SH
1778 ret = mkdir(path, 0755);
1779 if (ret && errno != EEXIST) {
0fd73091 1780 SYSERROR("Failed to create \"%s\"", path);
3d7d929a 1781 return -errno;
7c6ef2a2 1782 }
4742cd9a 1783 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1784
86530b0a 1785 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
3d7d929a
CB
1786 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1787 return -1;
1788
3b7e332f
CB
1789 ret = mknod(lxcpath, S_IFREG | 0000, 0);
1790 if (ret < 0 && errno != EEXIST) {
0fd73091 1791 SYSERROR("Failed to create \"%s\"", lxcpath);
3d7d929a 1792 return -errno;
7c6ef2a2 1793 }
7c6ef2a2 1794
86530b0a 1795 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3dc035f1 1796 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1797 return -1;
2a12fefd 1798
3dc035f1 1799 if (file_exists(path)) {
a7ba3c7f 1800 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1801 if (ret < 0) {
6d1400b5 1802 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1803 return -ret;
1804 } else {
86530b0a 1805 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1806 }
3dc035f1 1807 }
2a12fefd 1808
3b7e332f
CB
1809 ret = mknod(path, S_IFREG | 0000, 0);
1810 if (ret < 0 && errno != EEXIST) {
1811 SYSERROR("Failed to create console");
1812 return -errno;
7c6ef2a2
SH
1813 }
1814
e581b9b5 1815 ret = fchmod(console->slave, S_IXUSR | S_IXGRP);
86530b0a 1816 if (ret < 0) {
0fd73091 1817 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
e581b9b5 1818 S_IXUSR | S_IXGRP, console->name);
2a12fefd
CB
1819 return -errno;
1820 }
1821
3dc035f1 1822 /* bind mount console->name to '/dev/<ttydir>/console' */
86530b0a
L
1823 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1824 if (ret < 0) {
0fd73091 1825 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1826 return -1;
1827 }
86530b0a 1828 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1
L
1829
1830 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
86530b0a
L
1831 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1832 if (ret < 0) {
0fd73091 1833 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
3dc035f1
L
1834 return -1;
1835 }
86530b0a 1836 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1837
86530b0a 1838 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1839 return 0;
1840}
1841
/* Set up the container console, either directly at "/dev/console" or, when
 * @ttydir is given, below "/dev/<ttydir>".
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir)
		      : lxc_setup_dev_console(rootfs, console);
}
1851
efed99a4 1852static void parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676
RT
1853{
1854 struct mount_opt *mo;
1855
85c2de39
MB
1856 /* If '=' is contained in opt, the option must go into data. */
1857 if (!strchr(opt, '=')) {
1858
1859 /* If opt is found in mount_opt, set or clear flags.
1860 * Otherwise append it to data. */
1861 size_t opt_len = strlen(opt);
1862 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1863 size_t mo_name_len = strlen(mo->name);
1864 if (opt_len == mo_name_len && strncmp(opt, mo->name, mo_name_len) == 0) {
1865 if (mo->clear)
1866 *flags &= ~mo->flag;
1867 else
1868 *flags |= mo->flag;
1869 return;
1870 }
998ac676
RT
1871 }
1872 }
1873
f1e05b90
DJ
1874 if (strlen(*data))
1875 (void)strlcat(*data, ",", size);
efed99a4 1876
f1e05b90 1877 (void)strlcat(*data, opt, size);
998ac676
RT
1878}
1879
0fd73091 1880int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
998ac676 1881{
a71f619c
CB
1882 __do_free char *data = NULL, *s = NULL;
1883 char *p;
efed99a4 1884 size_t size;
998ac676 1885
911324ef 1886 *mntdata = NULL;
91656ce5 1887 *mntflags = 0L;
911324ef
DL
1888
1889 if (!mntopts)
998ac676
RT
1890 return 0;
1891
911324ef 1892 s = strdup(mntopts);
0fd73091 1893 if (!s)
998ac676 1894 return -1;
998ac676 1895
efed99a4
DJ
1896 size = strlen(s) + 1;
1897 data = malloc(size);
a71f619c 1898 if (!data)
998ac676 1899 return -1;
998ac676
RT
1900 *data = 0;
1901
8db9d26f 1902 lxc_iterate_parts(p, s, ",")
efed99a4 1903 parse_mntopt(p, mntflags, &data, size);
998ac676
RT
1904
1905 if (*data)
a71f619c 1906 *mntdata = move_ptr(data);
998ac676
RT
1907
1908 return 0;
1909}
1910
d840039e
YT
1911static void parse_propagationopt(char *opt, unsigned long *flags)
1912{
1913 struct mount_opt *mo;
1914
1915 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1916 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1917 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1918 continue;
1919
1920 if (mo->clear)
1921 *flags &= ~mo->flag;
1922 else
1923 *flags |= mo->flag;
1924
1925 return;
d840039e
YT
1926 }
1927}
1928
8ce1abc2 1929int parse_propagationopts(const char *mntopts, unsigned long *pflags)
d840039e 1930{
dfd2e059
CB
1931 __do_free char *s = NULL;
1932 char *p;
d840039e
YT
1933
1934 if (!mntopts)
1935 return 0;
1936
1937 s = strdup(mntopts);
1938 if (!s) {
1939 SYSERROR("Failed to allocate memory");
1940 return -ENOMEM;
1941 }
1942
0fd73091 1943 *pflags = 0L;
8db9d26f 1944 lxc_iterate_parts(p, s, ",")
d840039e 1945 parse_propagationopt(p, pflags);
0fd73091 1946
d840039e
YT
1947 return 0;
1948}
1949
6fd5e769
SH
/* Terminate @word in place at its first space or tab (no-op if there is
 * none, since the existing terminator is simply rewritten).
 */
static void null_endofword(char *word)
{
	size_t word_len = strcspn(word, " \t");

	word[word_len] = '\0';
}
1956
/* Skip @nfields separators (a single space or tab each) in @src.
 *
 * Returns a pointer just past the @nfields-th separator, or a pointer to the
 * terminating NUL byte if the string ends first. Consecutive separators are
 * counted individually.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Advance to the next separator or to the end of the string. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;

		/* Step over the separator itself. */
		cur++;
	}

	return cur;
}
1975
911324ef
DL
1976static int mount_entry(const char *fsname, const char *target,
1977 const char *fstype, unsigned long mountflags,
d840039e
YT
1978 unsigned long pflags, const char *data, bool optional,
1979 bool dev, bool relative, const char *rootfs)
911324ef 1980{
0ac4b28a 1981 int ret;
6b5a54cd 1982 char srcbuf[PATH_MAX];
181437fd 1983 const char *srcpath = fsname;
614305f3 1984#ifdef HAVE_STATVFS
2938f7c8 1985 struct statvfs sb;
614305f3 1986#endif
2938f7c8 1987
181437fd 1988 if (relative) {
6b5a54cd
CB
1989 ret = snprintf(srcbuf, PATH_MAX, "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
1990 if (ret < 0 || ret >= PATH_MAX) {
181437fd
YT
1991 ERROR("source path is too long");
1992 return -1;
1993 }
1994 srcpath = srcbuf;
1995 }
1996
1997 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
0ac4b28a
CB
1998 rootfs);
1999 if (ret < 0) {
1fc64d22 2000 if (optional) {
7874d81a 2001 SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
2002 srcpath ? srcpath : "(null)", target);
1fc64d22
SG
2003 return 0;
2004 }
0ac4b28a 2005
0103eb53 2006 SYSERROR("Failed to mount \"%s\" on \"%s\"",
181437fd 2007 srcpath ? srcpath : "(null)", target);
0ac4b28a 2008 return -1;
911324ef
DL
2009 }
2010
2011 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
7c5b6e7c 2012 unsigned long rqd_flags = 0;
0ac4b28a
CB
2013
2014 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
181437fd 2015 "options", srcpath ? srcpath : "(none)", target ? target : "(none)");
0ac4b28a 2016
7c5b6e7c
AS
2017 if (mountflags & MS_RDONLY)
2018 rqd_flags |= MS_RDONLY;
614305f3 2019#ifdef HAVE_STATVFS
181437fd 2020 if (srcpath && statvfs(srcpath, &sb) == 0) {
7c5b6e7c 2021 unsigned long required_flags = rqd_flags;
0ac4b28a 2022
2938f7c8
SH
2023 if (sb.f_flag & MS_NOSUID)
2024 required_flags |= MS_NOSUID;
0ac4b28a 2025
ae7a770e 2026 if (sb.f_flag & MS_NODEV && !dev)
2938f7c8 2027 required_flags |= MS_NODEV;
0ac4b28a 2028
2938f7c8
SH
2029 if (sb.f_flag & MS_RDONLY)
2030 required_flags |= MS_RDONLY;
0ac4b28a 2031
2938f7c8
SH
2032 if (sb.f_flag & MS_NOEXEC)
2033 required_flags |= MS_NOEXEC;
0ac4b28a
CB
2034
2035 DEBUG("Flags for \"%s\" were %lu, required extra flags "
181437fd 2036 "are %lu", srcpath, sb.f_flag, required_flags);
0ac4b28a
CB
2037
2038 /* If this was a bind mount request, and required_flags
2938f7c8 2039 * does not have any flags which are not already in
0ac4b28a 2040 * mountflags, then skip the remount.
2938f7c8
SH
2041 */
2042 if (!(mountflags & MS_REMOUNT)) {
0ac4b28a
CB
2043 if (!(required_flags & ~mountflags) &&
2044 rqd_flags == 0) {
2045 DEBUG("Mountflags already were %lu, "
2046 "skipping remount", mountflags);
2938f7c8
SH
2047 goto skipremount;
2048 }
2049 }
0ac4b28a 2050
2938f7c8 2051 mountflags |= required_flags;
6fd5e769 2052 }
614305f3 2053#endif
911324ef 2054
181437fd 2055 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
0ac4b28a 2056 if (ret < 0) {
1fc64d22 2057 if (optional) {
7874d81a 2058 SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
2059 srcpath ? srcpath : "(null)", target);
1fc64d22
SG
2060 return 0;
2061 }
0ac4b28a 2062
0103eb53 2063 SYSERROR("Failed to mount \"%s\" on \"%s\"",
181437fd 2064 srcpath ? srcpath : "(null)", target);
0ac4b28a 2065 return -1;
911324ef
DL
2066 }
2067 }
2068
a3ed9b81 2069#ifdef HAVE_STATVFS
2070skipremount:
2071#endif
d840039e
YT
2072 if (pflags) {
2073 ret = mount(NULL, target, NULL, pflags, NULL);
2074 if (ret < 0) {
2075 if (optional) {
7874d81a 2076 SYSINFO("Failed to change mount propagation "
2077 "for \"%s\" (optional)", target);
d840039e
YT
2078 return 0;
2079 } else {
2080 SYSERROR("Failed to change mount propagation "
2081 "for \"%s\" (optional)", target);
2082 return -1;
2083 }
2084 }
2085 DEBUG("Changed mount propagation for \"%s\"", target);
2086 }
2087
0103eb53 2088 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
181437fd 2089 srcpath ? srcpath : "(null)", target, fstype);
911324ef
DL
2090
2091 return 0;
2092}
2093
/* Remove the LXC-only options "create=dir", "create=file", "optional" and
 * "relative" from mntent->mnt_opts, rewriting the string in place.
 *
 * Options are compared as whole comma-separated tokens, so an option that
 * merely contains one of these names (e.g. "nooptional") is left alone. Every
 * occurrence is removed, and no leading, trailing or doubled commas are left
 * behind. Empty tokens are dropped as well.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
		NULL
	};
	char *rd, *wr;

	if (!mntent || !mntent->mnt_opts)
		return;

	/* @rd walks the original tokens, @wr trails behind it and receives
	 * the tokens that are kept; @wr never overtakes @rd.
	 */
	rd = wr = mntent->mnt_opts;
	while (*rd) {
		size_t len = strcspn(rd, ",");
		bool drop = (len == 0);
		int i;

		for (i = 0; !drop && culled[i]; i++)
			drop = (strlen(culled[i]) == len &&
				strncmp(rd, culled[i], len) == 0);

		if (!drop) {
			if (wr != mntent->mnt_opts)
				*wr++ = ',';

			memmove(wr, rd, len);
			wr += len;
		}

		rd += len;
		if (*rd == ',')
			rd++;
	}

	*wr = '\0';
}
2123
4d5b72a1 2124static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2125 const char *path,
2126 const struct lxc_rootfs *rootfs,
0fd73091 2127 const char *lxc_name, const char *lxc_path)
0ad19a3f 2128{
7a76eeaa 2129 __do_free char *p1 = NULL;
3b7e332f 2130 int ret;
7a76eeaa 2131 char *p2;
911324ef 2132
12e6ab5d 2133 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
749f98d9 2134 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2135 if (ret < 0)
2136 return -1;
2137 }
6e46cc0d 2138
34cfffb3 2139 if (hasmntopt(mntent, "create=dir")) {
749f98d9
CB
2140 ret = mkdir_p(path, 0755);
2141 if (ret < 0 && errno != EEXIST) {
2142 SYSERROR("Failed to create directory \"%s\"", path);
2143 return -1;
34cfffb3
SG
2144 }
2145 }
2146
0fd73091
CB
2147 if (!hasmntopt(mntent, "create=file"))
2148 return 0;
749f98d9 2149
0fd73091
CB
2150 ret = access(path, F_OK);
2151 if (ret == 0)
2152 return 0;
749f98d9 2153
0fd73091
CB
2154 p1 = strdup(path);
2155 if (!p1)
2156 return -1;
749f98d9 2157
0fd73091 2158 p2 = dirname(p1);
749f98d9 2159
0fd73091 2160 ret = mkdir_p(p2, 0755);
0fd73091
CB
2161 if (ret < 0 && errno != EEXIST) {
2162 SYSERROR("Failed to create directory \"%s\"", path);
2163 return -1;
34cfffb3 2164 }
749f98d9 2165
3b7e332f
CB
2166 ret = mknod(path, S_IFREG | 0000, 0);
2167 if (ret < 0 && errno != EEXIST)
2168 return -errno;
0fd73091 2169
749f98d9 2170 return 0;
4d5b72a1
NC
2171}
2172
ec50007f
CB
2173/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2174 * without a rootfs. */
db4aba38 2175static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2176 const char *path,
2177 const struct lxc_rootfs *rootfs,
2178 const char *lxc_name,
2179 const char *lxc_path)
4d5b72a1 2180{
fd214f37 2181 __do_free char *mntdata = NULL;
d8b712bc 2182 int ret;
949d0338 2183 unsigned long mntflags;
181437fd 2184 bool dev, optional, relative;
949d0338 2185 unsigned long pflags = 0;
ec50007f 2186 char *rootfs_path = NULL;
d8b712bc
CB
2187
2188 optional = hasmntopt(mntent, "optional") != NULL;
2189 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2190 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2191
ec50007f
CB
2192 if (rootfs && rootfs->path)
2193 rootfs_path = rootfs->mount;
2194
d8b712bc
CB
2195 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2196 lxc_path);
2197 if (ret < 0) {
2198 if (optional)
2199 return 0;
608e3567 2200
d8b712bc
CB
2201 return -1;
2202 }
4e4ca161
SH
2203 cull_mntent_opt(mntent);
2204
d840039e
YT
2205 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2206 if (ret < 0)
2207 return -1;
2208
d8b712bc
CB
2209 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2210 if (ret < 0)
a17b1e65 2211 return -1;
a17b1e65 2212
6e46cc0d 2213 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2214 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2215
911324ef
DL
2216 return ret;
2217}
2218
db4aba38
NC
2219static inline int mount_entry_on_systemfs(struct mntent *mntent)
2220{
1433c9f9 2221 int ret;
6b5a54cd 2222 char path[PATH_MAX];
1433c9f9
CB
2223
2224 /* For containers created without a rootfs all mounts are treated as
07667a6a
CB
2225 * absolute paths starting at / on the host.
2226 */
1433c9f9
CB
2227 if (mntent->mnt_dir[0] != '/')
2228 ret = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);
2229 else
2230 ret = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
07667a6a 2231 if (ret < 0 || ret >= sizeof(path))
1433c9f9 2232 return -1;
1433c9f9
CB
2233
2234 return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
db4aba38
NC
2235}
2236
4e4ca161 2237static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2238 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2239 const char *lxc_name,
2240 const char *lxc_path)
911324ef 2241{
bdd2b34c 2242 int offset;
013bd428 2243 char *aux;
67e571de 2244 const char *lxcpath;
6b5a54cd 2245 char path[PATH_MAX];
bdd2b34c 2246 int ret = 0;
0ad19a3f 2247
593e8478 2248 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2249 if (!lxcpath)
2a59a681 2250 return -1;
2a59a681 2251
bdd2b34c
CB
2252 /* If rootfs->path is a blockdev path, allow container fstab to use
2253 * <lxcpath>/<name>/rootfs" as the target prefix.
2254 */
6b5a54cd
CB
2255 ret = snprintf(path, PATH_MAX, "%s/%s/rootfs", lxcpath, lxc_name);
2256 if (ret < 0 || ret >= PATH_MAX)
80a881b2
SH
2257 goto skipvarlib;
2258
2259 aux = strstr(mntent->mnt_dir, path);
2260 if (aux) {
2261 offset = strlen(path);
2262 goto skipabs;
2263 }
2264
2265skipvarlib:
013bd428
DL
2266 aux = strstr(mntent->mnt_dir, rootfs->path);
2267 if (!aux) {
bdd2b34c 2268 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2269 return ret;
013bd428 2270 }
80a881b2
SH
2271 offset = strlen(rootfs->path);
2272
2273skipabs:
6b5a54cd
CB
2274 ret = snprintf(path, PATH_MAX, "%s/%s", rootfs->mount, aux + offset);
2275 if (ret < 0 || ret >= PATH_MAX)
a17b1e65 2276 return -1;
a17b1e65 2277
0a2dddd4 2278 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2279}
d330fe7b 2280
4e4ca161 2281static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2282 const struct lxc_rootfs *rootfs,
2283 const char *lxc_name,
2284 const char *lxc_path)
911324ef 2285{
911324ef 2286 int ret;
6b5a54cd 2287 char path[PATH_MAX];
d330fe7b 2288
34cfffb3 2289 /* relative to root mount point */
6e46cc0d 2290 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
0fd73091 2291 if (ret < 0 || (size_t)ret >= sizeof(path))
9ba8130c 2292 return -1;
911324ef 2293
0a2dddd4 2294 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2295}
2296
06749971
CB
2297static int mount_file_entries(const struct lxc_conf *conf,
2298 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2299 const char *lxc_name, const char *lxc_path)
911324ef 2300{
9d03d857 2301 char buf[PATH_MAX];
0fd73091 2302 struct mntent mntent;
e76b8764 2303
aaf901be 2304 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
9d03d857
CB
2305 int ret;
2306
1ae3c19f
CB
2307 if (!rootfs->path)
2308 ret = mount_entry_on_systemfs(&mntent);
2309 else if (mntent.mnt_dir[0] != '/')
2310 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2311 lxc_name, lxc_path);
2312 else
2313 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
9d03d857 2314 lxc_name, lxc_path);
1ae3c19f
CB
2315 if (ret < 0)
2316 return -1;
0ad19a3f 2317 }
cd54d859 2318
9d03d857
CB
2319 if (!feof(file) || ferror(file)) {
2320 ERROR("Failed to parse mount entries");
2321 return -1;
2322 }
2323
2324 return 0;
e7938e9e
MN
2325}
2326
06749971
CB
/* Mount every entry listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success, -1 if
 * the file cannot be opened, or the (negative) result of
 * mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *mounts;
	int ret;

	if (fstab == NULL)
		return 0;

	mounts = setmntent(fstab, "r");
	if (mounts == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	ret = mount_file_entries(conf, rootfs, mounts, lxc_name, lxc_path);
	if (ret < 0)
		ERROR("Failed to set up mount entries");

	endmntent(mounts);

	return ret;
}
2350
1800f924
WB
/*
 * Extra fstab entries appended for containers that allow nesting.
 *
 * Nested containers can only mount /proc and /sys themselves if they can see
 * "pure" proc and sysfs mount points, i.e. ones with nothing (such as lxcfs)
 * mounted on top of them. Such instances are therefore provided under
 * /dev/.lxc/{proc,sys}, and an apparmor rule denies access to them.
 *
 * This is mostly a convenience: the container's root user is able to mount
 * both filesystems anyway and hence already has access to them. A non-root
 * user inside the container, however, should not gain access as a side
 * effect unless that has been allowed explicitly.
 */
static const char nesting_helpers[] =
"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
1800f924
WB
2364
2365FILE *make_anonymous_mount_file(struct lxc_list *mount,
2366 bool include_nesting_helpers)
e7938e9e 2367{
79bcf5ee 2368 __do_close_prot_errno int fd = -EBADF;
4110345b 2369 FILE *f;
5ef5c9a3 2370 int ret;
e7938e9e 2371 char *mount_entry;
5ef5c9a3 2372 struct lxc_list *iterator;
5ef5c9a3 2373
0fd73091 2374 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2375 if (fd < 0) {
a324e7eb
CB
2376 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2377
5ef5c9a3
CB
2378 if (errno != ENOSYS)
2379 return NULL;
a324e7eb
CB
2380
2381 fd = lxc_make_tmpfile(template, true);
0fd73091
CB
2382 if (fd < 0) {
2383 SYSERROR("Could not create temporary mount file");
2384 return NULL;
2385 }
2386
6bd04140 2387 TRACE("Created temporary mount file");
5ef5c9a3 2388 }
e7938e9e 2389
0fd73091
CB
2390 lxc_list_for_each (iterator, mount) {
2391 size_t len;
2392
e7938e9e 2393 mount_entry = iterator->elem;
0fd73091 2394 len = strlen(mount_entry);
5ef5c9a3 2395
489f39be 2396 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091 2397 if (ret != len)
79bcf5ee 2398 return NULL;
0fd73091 2399
489f39be 2400 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091 2401 if (ret != 1)
79bcf5ee 2402 return NULL;
e7938e9e
MN
2403 }
2404
1800f924
WB
2405 if (include_nesting_helpers) {
2406 ret = lxc_write_nointr(fd, nesting_helpers,
6333c915
CB
2407 STRARRAYLEN(nesting_helpers));
2408 if (ret != STRARRAYLEN(nesting_helpers))
79bcf5ee 2409 return NULL;
1800f924
WB
2410 }
2411
0fd73091
CB
2412 ret = lseek(fd, 0, SEEK_SET);
2413 if (ret < 0)
79bcf5ee 2414 return NULL;
0fd73091 2415
4110345b
CB
2416 f = fdopen(fd, "re+");
2417 if (f)
2418 move_fd(fd); /* Transfer ownership of fd. */
2419 return f;
9fc7f8c0
TA
2420}
2421
06749971
CB
2422static int setup_mount_entries(const struct lxc_conf *conf,
2423 const struct lxc_rootfs *rootfs,
5ef5c9a3
CB
2424 struct lxc_list *mount, const char *lxc_name,
2425 const char *lxc_path)
9fc7f8c0 2426{
c85ced65 2427 __do_fclose FILE *f = NULL;
9fc7f8c0 2428
1800f924 2429 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
19b5d755 2430 if (!f)
9fc7f8c0 2431 return -1;
e7938e9e 2432
c85ced65 2433 return mount_file_entries(conf, rootfs, f, lxc_name, lxc_path);
e7938e9e
MN
2434}
2435
bab88e68
CS
2436static int parse_cap(const char *cap)
2437{
84760c11 2438 size_t i;
2439 int capid = -1;
0fd73091
CB
2440 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2441 char *ptr = NULL;
bab88e68 2442
0fd73091 2443 if (strcmp(cap, "none") == 0)
7035407c
DE
2444 return -2;
2445
8560cd36 2446 for (i = 0; i < end; i++) {
bab88e68
CS
2447 if (strcmp(cap, caps_opt[i].name))
2448 continue;
2449
2450 capid = caps_opt[i].value;
2451 break;
2452 }
2453
2454 if (capid < 0) {
0fd73091
CB
2455 /* Try to see if it's numeric, so the user may specify
2456 * capabilities that the running kernel knows about but we
2457 * don't
2458 */
bab88e68
CS
2459 errno = 0;
2460 capid = strtol(cap, &ptr, 10);
2461 if (!ptr || *ptr != '\0' || errno != 0)
2462 /* not a valid number */
2463 capid = -1;
2464 else if (capid > lxc_caps_last_cap())
2465 /* we have a number but it's not a valid
2466 * capability */
2467 capid = -1;
2468 }
2469
2470 return capid;
2471}
2472
0769b82a
CS
2473int in_caplist(int cap, struct lxc_list *caps)
2474{
0769b82a 2475 int capid;
0fd73091 2476 struct lxc_list *iterator;
0769b82a 2477
0fd73091 2478 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2479 capid = parse_cap(iterator->elem);
2480 if (capid == cap)
2481 return 1;
2482 }
2483
2484 return 0;
2485}
2486
81810dd1
DL
2487static int setup_caps(struct lxc_list *caps)
2488{
bab88e68 2489 int capid;
0fd73091
CB
2490 char *drop_entry;
2491 struct lxc_list *iterator;
81810dd1 2492
0fd73091
CB
2493 lxc_list_for_each (iterator, caps) {
2494 int ret;
81810dd1
DL
2495
2496 drop_entry = iterator->elem;
2497
bab88e68 2498 capid = parse_cap(drop_entry);
0fd73091 2499 if (capid < 0) {
1e11be34
DL
2500 ERROR("unknown capability %s", drop_entry);
2501 return -1;
81810dd1
DL
2502 }
2503
b81689a1
CB
2504 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2505 prctl_arg(0), prctl_arg(0));
0fd73091
CB
2506 if (ret < 0) {
2507 SYSERROR("Failed to remove %s capability", drop_entry);
3ec1648d
SH
2508 return -1;
2509 }
0fd73091 2510 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2511 }
2512
0fd73091 2513 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2514 return 0;
2515}
2516
2517static int dropcaps_except(struct lxc_list *caps)
2518{
2f443e88 2519 __do_free int *caplist = NULL;
0fd73091 2520 int i, capid, numcaps;
1fb86a7c 2521 char *keep_entry;
0fd73091 2522 struct lxc_list *iterator;
1fb86a7c 2523
0fd73091 2524 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2525 if (numcaps <= 0 || numcaps > 200)
2526 return -1;
0fd73091 2527 TRACE("Found %d capabilities", numcaps);
2caf9a97 2528
1a0e70ac 2529 /* caplist[i] is 1 if we keep capability i */
2f443e88 2530 caplist = must_realloc(NULL, numcaps * sizeof(int));
1fb86a7c
SH
2531 memset(caplist, 0, numcaps * sizeof(int));
2532
0fd73091 2533 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2534 keep_entry = iterator->elem;
2535
bab88e68 2536 capid = parse_cap(keep_entry);
7035407c
DE
2537 if (capid == -2)
2538 continue;
2539
0fd73091
CB
2540 if (capid < 0) {
2541 ERROR("Unknown capability %s", keep_entry);
1fb86a7c
SH
2542 return -1;
2543 }
2544
0fd73091 2545 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2546 caplist[capid] = 1;
2547 }
0fd73091
CB
2548
2549 for (i = 0; i < numcaps; i++) {
2550 int ret;
2551
1fb86a7c
SH
2552 if (caplist[i])
2553 continue;
0fd73091 2554
b81689a1
CB
2555 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2556 prctl_arg(0), prctl_arg(0));
0fd73091
CB
2557 if (ret < 0) {
2558 SYSERROR("Failed to remove capability %d", i);
3ec1648d
SH
2559 return -1;
2560 }
1fb86a7c
SH
2561 }
2562
0fd73091 2563 DEBUG("Capabilities have been setup");
81810dd1
DL
2564 return 0;
2565}
2566
0fd73091
CB
2567static int parse_resource(const char *res)
2568{
2569 int ret;
c6d09e15
WB
2570 size_t i;
2571 int resid = -1;
2572
0fd73091 2573 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2574 if (strcmp(res, limit_opt[i].name) == 0)
2575 return limit_opt[i].value;
c6d09e15 2576
0fd73091 2577 /* Try to see if it's numeric, so the user may specify
c6d09e15 2578 * resources that the running kernel knows about but
0fd73091
CB
2579 * we don't.
2580 */
2581 ret = lxc_safe_int(res, &resid);
2582 if (ret < 0)
2583 return -1;
2584
2585 return resid;
c6d09e15
WB
2586}
2587
0fd73091
CB
2588int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2589{
2590 int resid;
c6d09e15
WB
2591 struct lxc_list *it;
2592 struct lxc_limit *lim;
c6d09e15 2593
0fd73091 2594 lxc_list_for_each (it, limits) {
c6d09e15
WB
2595 lim = it->elem;
2596
2597 resid = parse_resource(lim->resource);
2598 if (resid < 0) {
0fd73091 2599 ERROR("Unknown resource %s", lim->resource);
c6d09e15
WB
2600 return -1;
2601 }
2602
f48b5fd8 2603#if HAVE_PRLIMIT || HAVE_PRLIMIT64
c6d09e15 2604 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
6d1400b5 2605 SYSERROR("Failed to set limit %s", lim->resource);
c6d09e15
WB
2606 return -1;
2607 }
2de12765
CB
2608
2609 TRACE("Setup \"%s\" limit", lim->resource);
f48b5fd8 2610#else
2de12765 2611 ERROR("Cannot set limit \"%s\" as prlimit is missing", lim->resource);
f48b5fd8
FF
2612 return -1;
2613#endif
c6d09e15 2614 }
0fd73091 2615
c6d09e15
WB
2616 return 0;
2617}
2618
7edd0540
L
2619int setup_sysctl_parameters(struct lxc_list *sysctls)
2620{
e6f76452 2621 __do_free char *tmp = NULL;
7edd0540
L
2622 struct lxc_list *it;
2623 struct lxc_sysctl *elem;
0fd73091 2624 int ret = 0;
6b5a54cd 2625 char filename[PATH_MAX] = {0};
7edd0540 2626
0fd73091 2627 lxc_list_for_each (it, sysctls) {
7edd0540
L
2628 elem = it->elem;
2629 tmp = lxc_string_replace(".", "/", elem->key);
2630 if (!tmp) {
2631 ERROR("Failed to replace key %s", elem->key);
2632 return -1;
2633 }
2634
2635 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
7edd0540
L
2636 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2637 ERROR("Error setting up sysctl parameters path");
2638 return -1;
2639 }
2640
0fd73091 2641 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2642 strlen(elem->value), false, 0666);
7edd0540 2643 if (ret < 0) {
688e8982
WB
2644 SYSERROR("Failed to setup sysctl parameters %s to %s",
2645 elem->key, elem->value);
7edd0540
L
2646 return -1;
2647 }
2648 }
0fd73091 2649
7edd0540
L
2650 return 0;
2651}
2652
61d7a733
YT
2653int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2654{
0c669152 2655 __do_free char *tmp = NULL;
61d7a733
YT
2656 struct lxc_list *it;
2657 struct lxc_proc *elem;
0fd73091 2658 int ret = 0;
6b5a54cd 2659 char filename[PATH_MAX] = {0};
61d7a733 2660
0fd73091 2661 lxc_list_for_each (it, procs) {
61d7a733
YT
2662 elem = it->elem;
2663 tmp = lxc_string_replace(".", "/", elem->filename);
2664 if (!tmp) {
2665 ERROR("Failed to replace key %s", elem->filename);
2666 return -1;
2667 }
2668
2669 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
61d7a733
YT
2670 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2671 ERROR("Error setting up proc filesystem path");
2672 return -1;
2673 }
2674
0fd73091 2675 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2676 strlen(elem->value), false, 0666);
61d7a733 2677 if (ret < 0) {
688e8982
WB
2678 SYSERROR("Failed to setup proc filesystem %s to %s",
2679 elem->filename, elem->value);
61d7a733
YT
2680 return -1;
2681 }
2682 }
0fd73091 2683
61d7a733
YT
2684 return 0;
2685}
2686
ae9242c8
SH
2687static char *default_rootfs_mount = LXCROOTFSMOUNT;
2688
7b379ab3 2689struct lxc_conf *lxc_conf_init(void)
089cd8b8 2690{
26ddeedd 2691 int i;
0fd73091 2692 struct lxc_conf *new;
7b379ab3 2693
13277ec4 2694 new = malloc(sizeof(*new));
0fd73091 2695 if (!new)
7b379ab3 2696 return NULL;
7b379ab3
MN
2697 memset(new, 0, sizeof(*new));
2698
4b73005c 2699 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2700 new->personality = -1;
124fa0a8 2701 new->autodev = 1;
3a784510 2702 new->console.buffer_size = 0;
596a818d
DE
2703 new->console.log_path = NULL;
2704 new->console.log_fd = -1;
861813e5 2705 new->console.log_size = 0;
28a4b0e5 2706 new->console.path = NULL;
63376d7d 2707 new->console.peer = -1;
fb87aa6a
CB
2708 new->console.proxy.busy = -1;
2709 new->console.proxy.master = -1;
2710 new->console.proxy.slave = -1;
63376d7d
DL
2711 new->console.master = -1;
2712 new->console.slave = -1;
2713 new->console.name[0] = '\0';
732375f5 2714 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2715 new->maincmd_fd = -1;
258f8051 2716 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2717 new->nbd_idx = -1;
54c30e29 2718 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2719 if (!new->rootfs.mount) {
53f3f048
SH
2720 free(new);
2721 return NULL;
2722 }
6e54330c 2723 new->rootfs.managed = true;
858377e4 2724 new->logfd = -1;
7b379ab3 2725 lxc_list_init(&new->cgroup);
54860ed0 2726 lxc_list_init(&new->cgroup2);
4bfb655e 2727 lxc_list_init(&new->devices);
7b379ab3
MN
2728 lxc_list_init(&new->network);
2729 lxc_list_init(&new->mount_list);
81810dd1 2730 lxc_list_init(&new->caps);
1fb86a7c 2731 lxc_list_init(&new->keepcaps);
f6d3e3e4 2732 lxc_list_init(&new->id_map);
46ad64ab
CB
2733 new->root_nsuid_map = NULL;
2734 new->root_nsgid_map = NULL;
f979ac15 2735 lxc_list_init(&new->includes);
4184c3e1 2736 lxc_list_init(&new->aliens);
7c661726 2737 lxc_list_init(&new->environment);
c6d09e15 2738 lxc_list_init(&new->limits);
7edd0540 2739 lxc_list_init(&new->sysctls);
61d7a733 2740 lxc_list_init(&new->procs);
44ae0fb6 2741 new->hooks_version = 0;
28d9e29e 2742 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2743 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2744 lxc_list_init(&new->groups);
d39b10eb 2745 lxc_list_init(&new->state_clients);
fe4de9a6 2746 new->lsm_aa_profile = NULL;
1800f924 2747 lxc_list_init(&new->lsm_aa_raw);
fe4de9a6 2748 new->lsm_se_context = NULL;
4fef78bc 2749 new->lsm_se_keyring_context = NULL;
8f818a84 2750 new->keyring_disable_session = false;
7a0bcca3 2751 new->tmp_umount_proc = false;
7a41e857
LT
2752 new->tmp_umount_proc = 0;
2753 new->shmount.path_host = NULL;
2754 new->shmount.path_cont = NULL;
7b379ab3 2755
72bb04e4
PT
2756 /* if running in a new user namespace, init and COMMAND
2757 * default to running as UID/GID 0 when using lxc-execute */
2758 new->init_uid = 0;
2759 new->init_gid = 0;
43654d34 2760 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2761 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
c3e3c21a 2762 seccomp_conf_init(new);
72bb04e4 2763
7b379ab3 2764 return new;
089cd8b8
DL
2765}
2766
344c9d81 2767int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2768 size_t buf_size)
f6d3e3e4 2769{
41e00749 2770 __do_close_prot_errno int fd = -EBADF;
76bcd422 2771 int ret;
6b5a54cd 2772 char path[PATH_MAX];
f6d3e3e4 2773
a19b974f 2774 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
76bcd422 2775 __do_close_prot_errno int setgroups_fd = -EBADF;
a19b974f 2776
6b5a54cd
CB
2777 ret = snprintf(path, PATH_MAX, "/proc/%d/setgroups", pid);
2778 if (ret < 0 || ret >= PATH_MAX)
a19b974f 2779 return -E2BIG;
a19b974f 2780
76bcd422
CB
2781 setgroups_fd = open(path, O_WRONLY);
2782 if (setgroups_fd < 0 && errno != ENOENT) {
a19b974f
CB
2783 SYSERROR("Failed to open \"%s\"", path);
2784 return -1;
2785 }
2786
76bcd422
CB
2787 if (setgroups_fd >= 0) {
2788 ret = lxc_write_nointr(setgroups_fd, "deny\n",
2789 STRLITERALLEN("deny\n"));
2790 if (ret != STRLITERALLEN("deny\n")) {
2791 SYSERROR("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
2388737b
CB
2792 return -1;
2793 }
395b1a3e 2794 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2795 }
a19b974f
CB
2796 }
2797
6b5a54cd 2798 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid,
29053180 2799 idtype == ID_TYPE_UID ? 'u' : 'g');
6b5a54cd 2800 if (ret < 0 || ret >= PATH_MAX)
f6d3e3e4 2801 return -E2BIG;
29053180
CB
2802
2803 fd = open(path, O_WRONLY);
2804 if (fd < 0) {
a19b974f 2805 SYSERROR("Failed to open \"%s\"", path);
29053180 2806 return -1;
f6d3e3e4 2807 }
29053180 2808
29053180
CB
2809 ret = lxc_write_nointr(fd, buf, buf_size);
2810 if (ret != buf_size) {
a19b974f 2811 SYSERROR("Failed to write %cid mapping to \"%s\"",
29053180 2812 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2813 return -1;
2814 }
29053180
CB
2815
2816 return 0;
f6d3e3e4
SH
2817}
2818
6e50e704
CB
2819/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2820 *
2821 * @return 1 if functional binary was found
2822 * @return 0 if binary exists but is lacking privilege
2823 * @return -ENOENT if binary does not exist
2824 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2825 */
df6a2945
CB
2826static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2827{
48411df2 2828 __do_free char *path = NULL;
df6a2945
CB
2829 int ret;
2830 struct stat st;
df6a2945 2831
3275932b 2832 errno = EINVAL;
6e50e704 2833 if (cap != CAP_SETUID && cap != CAP_SETGID)
3275932b 2834 return -1;
6e50e704 2835
3275932b 2836 errno = ENOENT;
df6a2945
CB
2837 path = on_path(binary, NULL);
2838 if (!path)
3275932b 2839 return -1;
df6a2945
CB
2840
2841 ret = stat(path, &st);
3275932b
CB
2842 if (ret < 0)
2843 return -1;
df6a2945
CB
2844
2845 /* Check if the binary is setuid. */
2846 if (st.st_mode & S_ISUID) {
0fd73091 2847 DEBUG("The binary \"%s\" does have the setuid bit set", path);
3275932b 2848 return 1;
df6a2945
CB
2849 }
2850
0fd73091 2851#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2852 /* Check if it has the CAP_SETUID capability. */
2853 if ((cap & CAP_SETUID) &&
2854 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2855 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2856 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
0fd73091 2857 "and CAP_PERMITTED sets", path);
3275932b 2858 return 1;
df6a2945
CB
2859 }
2860
2861 /* Check if it has the CAP_SETGID capability. */
2862 if ((cap & CAP_SETGID) &&
2863 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2864 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2865 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
0fd73091 2866 "and CAP_PERMITTED sets", path);
3275932b 2867 return 1;
df6a2945 2868 }
0fd73091 2869#else
69924fff
CB
2870 /* If we cannot check for file capabilities we need to give the benefit
2871 * of the doubt. Otherwise we might fail even though all the necessary
2872 * file capabilities are set.
2873 */
ffc40301 2874 DEBUG("Cannot check for file capabilities as full capability support is "
0fd73091 2875 "missing. Manual intervention needed");
0fd73091 2876#endif
df6a2945 2877
3275932b 2878 return 1;
df6a2945
CB
2879}
2880
986ef930
CB
/* Run the command line passed in @args through "/bin/sh -c".
 *
 * execl() only returns on failure, in which case -1 is returned.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
2886
f6d3e3e4
SH
2887int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2888{
0fd73091 2889 int fill, left;
986ef930 2890 char u_or_g;
4bc3b759 2891 char *pos;
6b5a54cd 2892 char cmd_output[PATH_MAX];
0fd73091
CB
2893 struct id_map *map;
2894 struct lxc_list *iterator;
2895 enum idtype type;
986ef930
CB
2896 /* strlen("new@idmap") = 9
2897 * +
2898 * strlen(" ") = 1
2899 * +
d33968ad 2900 * INTTYPE_TO_STRLEN(uint32_t)
986ef930
CB
2901 * +
2902 * strlen(" ") = 1
2903 *
2904 * We add some additional space to make sure that we really have
2905 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2906 */
0fd73091 2907 int ret = 0, gidmap = 0, uidmap = 0;
d33968ad 2908 char mapbuf[9 + 1 + INTTYPE_TO_STRLEN(uint32_t) + 1 + LXC_IDMAPLEN] = {0};
0fd73091 2909 bool had_entry = false, use_shadow = false;
c724025c
JC
2910 int hostuid, hostgid;
2911
2912 hostuid = geteuid();
2913 hostgid = getegid();
df6a2945
CB
2914
2915 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2916 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2917 * will protected it by preventing another user from being handed the
2918 * range by shadow.
2919 */
df6a2945 2920 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2921 if (uidmap == -ENOENT)
2922 WARN("newuidmap binary is missing");
2923 else if (!uidmap)
2924 WARN("newuidmap is lacking necessary privileges");
2925
df6a2945 2926 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2927 if (gidmap == -ENOENT)
2928 WARN("newgidmap binary is missing");
2929 else if (!gidmap)
2930 WARN("newgidmap is lacking necessary privileges");
2931
df6a2945 2932 if (uidmap > 0 && gidmap > 0) {
0fd73091 2933 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2934 use_shadow = true;
df6a2945 2935 } else {
99d43365
CB
2936 /* In case unprivileged users run application containers via
2937 * execute() or a start*() there are valid cases where they may
2938 * only want to map their own {g,u}id. Let's not block them from
2939 * doing so by requiring geteuid() == 0.
2940 */
2941 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2942 "write directly with euid %d", hostuid);
2943 }
2944
2945 /* Check if we really need to use newuidmap and newgidmap.
2946 * If the user is only remapping his own {g,u}id, we don't need it.
2947 */
2948 if (use_shadow && lxc_list_len(idmap) == 2) {
2949 use_shadow = false;
2950 lxc_list_for_each(iterator, idmap) {
2951 map = iterator->elem;
2952 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2953 map->nsid == hostuid && map->hostid == hostuid)
2954 continue;
2955 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2956 map->nsid == hostgid && map->hostid == hostgid)
2957 continue;
2958 use_shadow = true;
2959 break;
2960 }
0e6e3a41 2961 }
251d0d2a 2962
986ef930
CB
2963 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2964 type++, u_or_g = 'g') {
2965 pos = mapbuf;
2966
0e6e3a41 2967 if (use_shadow)
986ef930 2968 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2969
cf3ef16d 2970 lxc_list_for_each(iterator, idmap) {
251d0d2a 2971 map = iterator->elem;
cf3ef16d
SH
2972 if (map->idtype != type)
2973 continue;
2974
4bc3b759
CB
2975 had_entry = true;
2976
986ef930 2977 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2978 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2979 use_shadow ? " " : "", map->nsid,
2980 map->hostid, map->range,
0e6e3a41 2981 use_shadow ? "" : "\n");
a427e268
CB
2982 if (fill <= 0 || fill >= left) {
2983 /* The kernel only takes <= 4k for writes to
2984 * /proc/<pid>/{g,u}id_map
2985 */
2986 SYSERROR("Too many %cid mappings defined", u_or_g);
2987 return -1;
2988 }
4bc3b759 2989
cf3ef16d 2990 pos += fill;
251d0d2a 2991 }
cf3ef16d 2992 if (!had_entry)
4f7521b4 2993 continue;
cf3ef16d 2994
d85813cd 2995 /* Try to catch the output of new{g,u}idmap to make debugging
986ef930
CB
2996 * easier.
2997 */
2998 if (use_shadow) {
2999 ret = run_command(cmd_output, sizeof(cmd_output),
3000 lxc_map_ids_exec_wrapper,
3001 (void *)mapbuf);
3002 if (ret < 0) {
54fbbeb5
CB
3003 ERROR("new%cidmap failed to write mapping \"%s\": %s",
3004 u_or_g, cmd_output, mapbuf);
986ef930
CB
3005 return -1;
3006 }
54fbbeb5 3007 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 3008 } else {
986ef930 3009 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5 3010 if (ret < 0) {
da0f9977 3011 ERROR("Failed to write mapping: %s", mapbuf);
986ef930 3012 return -1;
54fbbeb5
CB
3013 }
3014 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 3015 }
986ef930
CB
3016
3017 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3018 }
251d0d2a 3019
986ef930 3020 return 0;
f6d3e3e4
SH
3021}
3022
0fd73091 3023/* Return the host uid/gid to which the container root is mapped in val.
0b3a6504 3024 * Return true if id was found, false otherwise.
cf3ef16d 3025 */
2a9a80cb 3026bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
4160c3a0 3027 unsigned long *val)
cf3ef16d 3028{
4160c3a0 3029 unsigned nsid;
0fd73091
CB
3030 struct id_map *map;
3031 struct lxc_list *it;
4160c3a0
CB
3032
3033 if (idtype == ID_TYPE_UID)
3034 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
3035 else
3036 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 3037
0fd73091 3038 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3039 map = it->elem;
7b50c609 3040 if (map->idtype != idtype)
cf3ef16d 3041 continue;
4160c3a0 3042 if (map->nsid != nsid)
cf3ef16d 3043 continue;
2a9a80cb
SH
3044 *val = map->hostid;
3045 return true;
cf3ef16d 3046 }
4160c3a0 3047
2a9a80cb 3048 return false;
cf3ef16d
SH
3049}
3050
2133f58c 3051int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3052{
cf3ef16d 3053 struct id_map *map;
0fd73091
CB
3054 struct lxc_list *it;
3055
3056 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3057 map = it->elem;
2133f58c 3058 if (map->idtype != idtype)
cf3ef16d 3059 continue;
0fd73091 3060
cf3ef16d 3061 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3062 return (id - map->hostid) + map->nsid;
cf3ef16d 3063 }
0fd73091 3064
57d116ab 3065 return -1;
cf3ef16d
SH
3066}
3067
339efad9 3068int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3069{
cf3ef16d 3070 struct id_map *map;
0fd73091 3071 struct lxc_list *it;
2133f58c 3072 unsigned int freeid = 0;
0fd73091 3073
cf3ef16d 3074again:
0fd73091 3075 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3076 map = it->elem;
2133f58c 3077 if (map->idtype != idtype)
cf3ef16d 3078 continue;
0fd73091 3079
cf3ef16d
SH
3080 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3081 freeid = map->nsid + map->range;
3082 goto again;
3083 }
3084 }
0fd73091 3085
cf3ef16d
SH
3086 return freeid;
3087}
3088
f4f52cb5
CB
/* Callback handed to run_command() by chown_mapped_root(): executes
 * "lxc-usernsexec" with the NULL-terminated argument vector passed in @args.
 * Only returns (with -1) when the exec itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
3094
0fd73091 3095/* chown_mapped_root: for an unprivileged user with uid/gid X to
7b50c609
TS
3096 * chown a dir to subuid/subgid Y, he needs to run chown as root
3097 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3098 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3099 * root is privileged with respect to hostuid/hostgid X, allowing
3100 * him to do the chown.
f6d3e3e4 3101 */
41dc7155 3102int chown_mapped_root(const char *path, struct lxc_conf *conf)
f6d3e3e4 3103{
f4f52cb5 3104 uid_t rootuid, rootgid;
2a9a80cb 3105 unsigned long val;
f4f52cb5
CB
3106 int hostuid, hostgid, ret;
3107 struct stat sb;
3108 char map1[100], map2[100], map3[100], map4[100], map5[100];
3109 char ugid[100];
41dc7155 3110 const char *args1[] = {"lxc-usernsexec",
f4f52cb5
CB
3111 "-m", map1,
3112 "-m", map2,
3113 "-m", map3,
3114 "-m", map5,
3115 "--", "chown", ugid, path,
3116 NULL};
41dc7155 3117 const char *args2[] = {"lxc-usernsexec",
f4f52cb5
CB
3118 "-m", map1,
3119 "-m", map2,
3120 "-m", map3,
3121 "-m", map4,
3122 "-m", map5,
3123 "--", "chown", ugid, path,
3124 NULL};
6b5a54cd 3125 char cmd_output[PATH_MAX];
f4f52cb5
CB
3126
3127 hostuid = geteuid();
3128 hostgid = getegid();
f6d3e3e4 3129
2a9a80cb 3130 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3131 ERROR("No uid mapping for container root");
c4d10a05 3132 return -1;
f6d3e3e4 3133 }
f4f52cb5 3134 rootuid = (uid_t)val;
0fd73091 3135
7b50c609 3136 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3137 ERROR("No gid mapping for container root");
7b50c609
TS
3138 return -1;
3139 }
f4f52cb5 3140 rootgid = (gid_t)val;
2a9a80cb 3141
f4f52cb5 3142 if (hostuid == 0) {
7b50c609 3143 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3144 ERROR("Error chowning %s", path);
3145 return -1;
3146 }
0fd73091 3147
c4d10a05
SH
3148 return 0;
3149 }
f3d7e4ca 3150
f4f52cb5 3151 if (rootuid == hostuid) {
1a0e70ac 3152 /* nothing to do */
b103ceac 3153 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3154 return 0;
3155 }
3156
bbdbf8f0 3157 /* save the current gid of "path" */
f4f52cb5
CB
3158 if (stat(path, &sb) < 0) {
3159 ERROR("Error stat %s", path);
f6d3e3e4
SH
3160 return -1;
3161 }
7b50c609 3162
bbdbf8f0
CB
3163 /* Update the path argument in case this was overlayfs. */
3164 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3165 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3166
f4f52cb5
CB
3167 /*
3168 * A file has to be group-owned by a gid mapped into the
3169 * container, or the container won't be privileged over it.
3170 */
3171 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3172 if (sb.st_uid == hostuid &&
3173 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3174 chown(path, -1, hostgid) < 0) {
3175 ERROR("Failed chgrping %s", path);
3176 return -1;
3177 }
f6d3e3e4 3178
1a0e70ac 3179 /* "u:0:rootuid:1" */
f4f52cb5
CB
3180 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3181 if (ret < 0 || ret >= 100) {
3182 ERROR("Error uid printing map string");
3183 return -1;
3184 }
7b50c609 3185
1a0e70ac 3186 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
3187 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3188 if (ret < 0 || ret >= 100) {
3189 ERROR("Error uid printing map string");
3190 return -1;
3191 }
c4d10a05 3192
1a0e70ac 3193 /* "g:0:rootgid:1" */
f4f52cb5
CB
3194 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3195 if (ret < 0 || ret >= 100) {
3196 ERROR("Error gid printing map string");
3197 return -1;
3198 }
98e5ba51 3199
1a0e70ac 3200 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
3201 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3202 rootgid + (gid_t)sb.st_gid);
3203 if (ret < 0 || ret >= 100) {
3204 ERROR("Error gid printing map string");
3205 return -1;
3206 }
c4d10a05 3207
1a0e70ac 3208 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
3209 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3210 if (ret < 0 || ret >= 100) {
3211 ERROR("Error gid printing map string");
3212 return -1;
3213 }
7b50c609 3214
1a0e70ac 3215 /* "0:pathgid" (chown) */
f4f52cb5
CB
3216 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3217 if (ret < 0 || ret >= 100) {
3218 ERROR("Error owner printing format string for chown");
3219 return -1;
3220 }
7b50c609 3221
f4f52cb5
CB
3222 if (hostgid == sb.st_gid)
3223 ret = run_command(cmd_output, sizeof(cmd_output),
3224 chown_mapped_root_exec_wrapper,
3225 (void *)args1);
3226 else
3227 ret = run_command(cmd_output, sizeof(cmd_output),
3228 chown_mapped_root_exec_wrapper,
3229 (void *)args2);
3230 if (ret < 0)
3231 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3232
f4f52cb5 3233 return ret;
f6d3e3e4
SH
3234}
3235
943144d9
CB
3236/* NOTE: Must not be called from inside the container namespace! */
3237int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3238{
3239 int mounted;
3240
943144d9 3241 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3242 if (mounted == -1) {
0fd73091 3243 SYSERROR("Failed to mount proc in the container");
01958b1f 3244 /* continue only if there is no rootfs */
943144d9 3245 if (conf->rootfs.path)
01958b1f 3246 return -1;
5112cd70 3247 } else if (mounted == 1) {
7a0bcca3 3248 conf->tmp_umount_proc = true;
5112cd70 3249 }
943144d9 3250
5112cd70
SH
3251 return 0;
3252}
3253
3254void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3255{
7a0bcca3 3256 if (!lxc_conf->tmp_umount_proc)
0fd73091
CB
3257 return;
3258
7a0bcca3
CB
3259 (void)umount2("/proc", MNT_DETACH);
3260 lxc_conf->tmp_umount_proc = false;
5112cd70
SH
3261}
3262
0fd73091 3263/* Walk /proc/mounts and change any shared entries to slave. */
6a0c909a 3264void remount_all_slave(void)
e995d7a2 3265{
7969675f 3266 __do_free char *line = NULL;
003be47b
CB
3267 __do_fclose FILE *f = NULL;
3268 __do_close_prot_errno int memfd = -EBADF, mntinfo_fd = -EBADF;
3269 int ret;
6a49f05e 3270 ssize_t copied;
e995d7a2
SH
3271 size_t len = 0;
3272
6a49f05e 3273 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
3274 if (mntinfo_fd < 0) {
3275 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 3276 return;
fea3b91d 3277 }
6a49f05e
CB
3278
3279 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3280 if (memfd < 0) {
3281 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3282
3283 if (errno != ENOSYS) {
fea3b91d 3284 SYSERROR("Failed to create temporary in-memory file");
6a49f05e
CB
3285 return;
3286 }
3287
3288 memfd = lxc_make_tmpfile(template, true);
fea3b91d 3289 if (memfd < 0) {
fea3b91d
DJ
3290 WARN("Failed to create temporary file");
3291 return;
3292 }
6a49f05e
CB
3293 }
3294
6a49f05e 3295again:
7c4d9466 3296 copied = lxc_sendfile_nointr(memfd, mntinfo_fd, NULL, LXC_SENDFILE_MAX);
6a49f05e
CB
3297 if (copied < 0) {
3298 if (errno == EINTR)
3299 goto again;
3300
fea3b91d 3301 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
3302 return;
3303 }
6a49f05e 3304
6a49f05e
CB
3305 ret = lseek(memfd, 0, SEEK_SET);
3306 if (ret < 0) {
fea3b91d 3307 SYSERROR("Failed to reset file descriptor offset");
6a49f05e
CB
3308 return;
3309 }
3310
4110345b 3311 f = fdopen(memfd, "re");
e995d7a2 3312 if (!f) {
003be47b 3313 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark all shared. Continuing");
e995d7a2
SH
3314 return;
3315 }
3316
003be47b
CB
3317 /*
3318 * After a successful fdopen() memfd will be closed when calling
3319 * fclose(f). Calling close(memfd) afterwards is undefined.
3320 */
3321 move_fd(memfd);
3322
e995d7a2 3323 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3324 char *opts, *target;
3325
e995d7a2
SH
3326 target = get_field(line, 4);
3327 if (!target)
3328 continue;
0fd73091 3329
e995d7a2
SH
3330 opts = get_field(target, 2);
3331 if (!opts)
3332 continue;
0fd73091 3333
e995d7a2
SH
3334 null_endofword(opts);
3335 if (!strstr(opts, "shared"))
3336 continue;
0fd73091 3337
e995d7a2 3338 null_endofword(target);
0fd73091
CB
3339 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3340 if (ret < 0) {
3341 SYSERROR("Failed to make \"%s\" MS_SLAVE", target);
e995d7a2 3342 ERROR("Continuing...");
6a49f05e 3343 continue;
e995d7a2 3344 }
6a49f05e 3345 TRACE("Remounted \"%s\" as MS_SLAVE", target);
e995d7a2 3346 }
6a49f05e 3347 TRACE("Remounted all mount table entries as MS_SLAVE");
e995d7a2
SH
3348}
3349
794248d0 3350static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3351{
3352 int ret;
794248d0
CB
3353 char *p;
3354 char path[PATH_MAX], destpath[PATH_MAX];
3355 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3356
3357 /* If init exists in the container, don't bind mount a static one */
3358 p = choose_init(conf->rootfs.mount);
3359 if (p) {
22f835ba 3360 __do_free char *old = p;
41089848
TA
3361
3362 p = strdup(old + strlen(conf->rootfs.mount));
41089848
TA
3363 if (!p)
3364 return -ENOMEM;
3365
3366 INFO("Found existing init at \"%s\"", p);
3367 goto out;
9d9c111c 3368 }
2322903b
SH
3369
3370 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
0fd73091 3371 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3372 return -1;
2322903b
SH
3373
3374 if (!file_exists(path)) {
0fd73091 3375 ERROR("The file \"%s\" does not exist on host", path);
8353b4c9 3376 return -1;
2322903b
SH
3377 }
3378
794248d0 3379 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
0fd73091 3380 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3381 return -1;
2322903b
SH
3382
3383 if (!file_exists(destpath)) {
794248d0
CB
3384 ret = mknod(destpath, S_IFREG | 0000, 0);
3385 if (ret < 0 && errno != EEXIST) {
3386 SYSERROR("Failed to create dummy \"%s\" file as bind mount target", destpath);
8353b4c9 3387 return -1;
2322903b 3388 }
2322903b
SH
3389 }
3390
592fd47a 3391 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
8353b4c9 3392 if (ret < 0) {
0fd73091 3393 SYSERROR("Failed to bind mount lxc.init.static into container");
8353b4c9
CB
3394 return -1;
3395 }
3396
794248d0
CB
3397 p = strdup(destpath + strlen(conf->rootfs.mount));
3398 if (!p)
3399 return -ENOMEM;
794248d0 3400
8353b4c9 3401 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3402out:
4b5b3a2a 3403 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3404 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3405 return 0;
2322903b
SH
3406}
3407
0fd73091
CB
3408/* This does the work of remounting / if it is shared, calling the container
3409 * pre-mount hooks, and mounting the rootfs.
35120d9c 3410 */
8ce1abc2
CB
3411int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3412 const char *lxcpath)
0ad19a3f 3413{
0fd73091
CB
3414 int ret;
3415
35120d9c 3416 if (conf->rootfs_setup) {
35120d9c 3417 const char *path = conf->rootfs.mount;
0fd73091
CB
3418
3419 /* The rootfs was set up in another namespace. bind-mount it to
3420 * give us a mount in our own ns so we can pivot_root to it
3421 */
3422 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3423 if (ret < 0) {
3424 ERROR("Failed to bind mount container / onto itself");
145832ba 3425 return -1;
35120d9c 3426 }
0fd73091
CB
3427
3428 TRACE("Bind mounted container / onto itself");
145832ba 3429 return 0;
35120d9c 3430 }
d4ef7c50 3431
e995d7a2
SH
3432 remount_all_slave();
3433
0fd73091
CB
3434 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3435 if (ret < 0) {
3436 ERROR("Failed to run pre-mount hooks");
35120d9c
SH
3437 return -1;
3438 }
3439
8ce1abc2 3440 ret = lxc_mount_rootfs(conf);
0fd73091
CB
3441 if (ret < 0) {
3442 ERROR("Failed to setup rootfs for");
35120d9c
SH
3443 return -1;
3444 }
3445
3446 conf->rootfs_setup = true;
3447 return 0;
3448}
3449
1c1c7051
SH
3450static bool verify_start_hooks(struct lxc_conf *conf)
3451{
6b5a54cd 3452 char path[PATH_MAX];
0fd73091
CB
3453 struct lxc_list *it;
3454
3455 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3456 int ret;
0fd73091 3457 char *hookname = it->elem;
1c1c7051 3458
6b5a54cd 3459 ret = snprintf(path, PATH_MAX, "%s%s",
0fd73091
CB
3460 conf->rootfs.path ? conf->rootfs.mount : "",
3461 hookname);
6b5a54cd 3462 if (ret < 0 || ret >= PATH_MAX)
1c1c7051 3463 return false;
0fd73091 3464
75193660 3465 ret = access(path, X_OK);
0fd73091 3466 if (ret < 0) {
75193660 3467 SYSERROR("Start hook \"%s\" not found in container",
0fd73091 3468 hookname);
1c1c7051
SH
3469 return false;
3470 }
0fd73091 3471
6a0c909a 3472 return true;
1c1c7051
SH
3473 }
3474
3475 return true;
3476}
3477
4b5b3a2a
TA
3478static bool execveat_supported(void)
3479{
13be2733 3480 lxc_raw_execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
4b5b3a2a
TA
3481 if (errno == ENOSYS)
3482 return false;
3483
3484 return true;
4b5b3a2a
TA
3485}
3486
20502652
CB
3487static int lxc_setup_boot_id(void)
3488{
3489 int ret;
3490 const char *boot_id_path = "/proc/sys/kernel/random/boot_id";
3491 const char *mock_boot_id_path = "/dev/.lxc-boot-id";
3492 lxc_id128_t n;
3493
3494 if (access(boot_id_path, F_OK))
3495 return 0;
3496
3497 memset(&n, 0, sizeof(n));
3498 if (lxc_id128_randomize(&n)) {
3499 SYSERROR("Failed to generate random data for uuid");
3500 return -1;
3501 }
3502
3503 ret = lxc_id128_write(mock_boot_id_path, n);
3504 if (ret < 0) {
3505 SYSERROR("Failed to write uuid to %s", mock_boot_id_path);
3506 return -1;
3507 }
3508
3509 ret = chmod(mock_boot_id_path, 0444);
3510 if (ret < 0) {
3511 SYSERROR("Failed to chown %s", mock_boot_id_path);
3512 (void)unlink(mock_boot_id_path);
3513 return -1;
3514 }
3515
3516 ret = mount(mock_boot_id_path, boot_id_path, NULL, MS_BIND, NULL);
3517 if (ret < 0) {
3518 SYSERROR("Failed to mount %s to %s", mock_boot_id_path,
3519 boot_id_path);
3520 (void)unlink(mock_boot_id_path);
3521 return -1;
3522 }
3523
3524 ret = mount(NULL, boot_id_path, NULL,
3525 (MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOSUID | MS_NOEXEC |
3526 MS_NODEV),
3527 NULL);
3528 if (ret < 0) {
3529 SYSERROR("Failed to remount %s read-only", boot_id_path);
3530 (void)unlink(mock_boot_id_path);
3531 return -1;
3532 }
3533
3534 return 0;
3535}
3536
3b988b33 3537int lxc_setup(struct lxc_handler *handler)
35120d9c 3538{
2187efd3 3539 int ret;
0fd73091 3540 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3541 struct lxc_conf *lxc_conf = handler->conf;
4fef78bc 3542 char *keyring_context = NULL;
35120d9c 3543
8ce1abc2 3544 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
8353b4c9
CB
3545 if (ret < 0) {
3546 ERROR("Failed to setup rootfs");
35120d9c
SH
3547 return -1;
3548 }
3549
28d9e29e 3550 if (handler->nsfd[LXC_NS_UTS] == -1) {
8353b4c9
CB
3551 ret = setup_utsname(lxc_conf->utsname);
3552 if (ret < 0) {
0fd73091 3553 ERROR("Failed to setup the utsname %s", name);
6c544cb3
MM
3554 return -1;
3555 }
0ad19a3f 3556 }
3557
8f818a84
MB
3558 if (!lxc_conf->keyring_disable_session) {
3559 if (lxc_conf->lsm_se_keyring_context) {
3560 keyring_context = lxc_conf->lsm_se_keyring_context;
3561 } else if (lxc_conf->lsm_se_context) {
3562 keyring_context = lxc_conf->lsm_se_context;
3563 }
4fef78bc 3564
8f818a84
MB
3565 ret = lxc_setup_keyring(keyring_context);
3566 if (ret < 0)
3567 return -1;
3568 }
b25291da 3569
e389f2af
CB
3570 if (handler->ns_clone_flags & CLONE_NEWNET) {
3571 ret = lxc_setup_network_in_child_namespaces(lxc_conf,
3572 &lxc_conf->network);
3573 if (ret < 0) {
3574 ERROR("Failed to setup network");
3575 return -1;
3576 }
0ad19a3f 3577
e389f2af
CB
3578 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3579 if (ret < 0) {
3580 ERROR("Failed to send network device names and ifindices to parent");
3581 return -1;
3582 }
790255cf
CB
3583 }
3584
bc6928ff 3585 if (lxc_conf->autodev > 0) {
63012bdd 3586 ret = mount_autodev(name, &lxc_conf->rootfs, lxc_conf->autodevtmpfssize, lxcpath);
8353b4c9
CB
3587 if (ret < 0) {
3588 ERROR("Failed to mount \"/dev\"");
c6883f38
SH
3589 return -1;
3590 }
3591 }
3592
8353b4c9
CB
3593 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3594 * need to wait until other stuff has finished.
368bbc02 3595 */
8353b4c9
CB
3596 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3597 if (ret < 0) {
3598 ERROR("Failed to setup first automatic mounts");
368bbc02
CS
3599 return -1;
3600 }
3601
8353b4c9
CB
3602 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3603 if (ret < 0) {
3604 ERROR("Failed to setup mounts");
95b5ffaf 3605 return -1;
576f946d 3606 }
3607
c631115d
FA
3608 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3609 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3610 &lxc_conf->mount_list, name, lxcpath);
3611 if (ret < 0) {
3612 ERROR("Failed to setup mount entries");
3613 return -1;
3614 }
3615 }
3616
8353b4c9 3617 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3618 if (execveat_supported()) {
3619 int fd;
3620 char path[PATH_MAX];
3621
3622 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3623 if (ret < 0 || ret >= PATH_MAX) {
3624 ERROR("Path to init.lxc.static too long");
3625 return -1;
3626 }
3627
3628 fd = open(path, O_PATH | O_CLOEXEC);
3629 if (fd < 0) {
3630 SYSERROR("Unable to open lxc.init.static");
3631 return -1;
3632 }
3633
3634 ((struct execute_args *)handler->data)->init_fd = fd;
3635 ((struct execute_args *)handler->data)->init_path = NULL;
3636 } else {
3637 ret = lxc_execute_bind_init(handler);
3638 if (ret < 0) {
3639 ERROR("Failed to bind-mount the lxc init system");
3640 return -1;
3641 }
8353b4c9
CB
3642 }
3643 }
2322903b 3644
8353b4c9
CB
3645 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3646 * mounted. It is guaranteed to be mounted now either through
3647 * automatically or via fstab entries.
368bbc02 3648 */
8353b4c9
CB
3649 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3650 if (ret < 0) {
3651 ERROR("Failed to setup remaining automatic mounts");
368bbc02
CS
3652 return -1;
3653 }
3654
8353b4c9 3655 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
1a2cf89d 3656 if (ret < 0) {
8353b4c9 3657 ERROR("Failed to run mount hooks");
773fb9ca
SH
3658 return -1;
3659 }
3660
bc6928ff 3661 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3662 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3663 if (ret < 0) {
3664 ERROR("Failed to run autodev hooks");
f7bee6c6
MW
3665 return -1;
3666 }
06749971 3667
8353b4c9
CB
3668 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3669 if (ret < 0) {
3670 ERROR("Failed to populate \"/dev\"");
91c3830e
SH
3671 return -1;
3672 }
3673 }
368bbc02 3674
75193660
CB
3675 /* Make sure any start hooks are in the container */
3676 if (!verify_start_hooks(lxc_conf)) {
3677 ERROR("Failed to verify start hooks");
3678 return -1;
3679 }
3680
ed8704d0 3681 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
885766f5 3682 lxc_conf->ttys.dir);
ed8704d0
CB
3683 if (ret < 0) {
3684 ERROR("Failed to setup console");
95b5ffaf 3685 return -1;
6e590161 3686 }
3687
ed8704d0
CB
3688 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3689 if (ret < 0) {
8353b4c9 3690 ERROR("Failed to setup \"/dev\" symlinks");
69aa6655
DE
3691 return -1;
3692 }
3693
8353b4c9
CB
3694 ret = lxc_create_tmp_proc_mount(lxc_conf);
3695 if (ret < 0) {
3696 ERROR("Failed to \"/proc\" LSMs");
e075f5d9 3697 return -1;
e075f5d9 3698 }
e075f5d9 3699
8ce1abc2 3700 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
8353b4c9
CB
3701 if (ret < 0) {
3702 ERROR("Failed to pivot root into rootfs");
95b5ffaf 3703 return -1;
ed502555 3704 }
3705
20502652
CB
3706 /* Setting the boot-id is best-effort for now. */
3707 if (lxc_conf->autodev > 0)
3708 (void)lxc_setup_boot_id();
3709
8353b4c9
CB
3710 ret = lxc_setup_devpts(lxc_conf);
3711 if (ret < 0) {
3712 ERROR("Failed to setup new devpts instance");
95b5ffaf 3713 return -1;
3c26f34e 3714 }
3715
2187efd3
CB
3716 ret = lxc_create_ttys(handler);
3717 if (ret < 0)
e8bd4e43 3718 return -1;
e8bd4e43 3719
8353b4c9
CB
3720 ret = setup_personality(lxc_conf->personality);
3721 if (ret < 0) {
3722 ERROR("Failed to set personality");
cccc74b5
DL
3723 return -1;
3724 }
3725
8353b4c9
CB
3726 /* Set sysctl value to a path under /proc/sys as determined from the
3727 * key. For e.g. net.ipv4.ip_forward translated to
3728 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3729 */
3730 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3731 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
8353b4c9
CB
3732 if (ret < 0) {
3733 ERROR("Failed to setup sysctl parameters");
7edd0540 3734 return -1;
8353b4c9 3735 }
7edd0540
L
3736 }
3737
97a8f74f
SG
3738 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3739 if (!lxc_list_empty(&lxc_conf->caps)) {
8353b4c9
CB
3740 ERROR("Container requests lxc.cap.drop and "
3741 "lxc.cap.keep: either use lxc.cap.drop or "
3742 "lxc.cap.keep, not both");
f6d3e3e4
SH
3743 return -1;
3744 }
8353b4c9 3745
97a8f74f 3746 if (dropcaps_except(&lxc_conf->keepcaps)) {
8353b4c9 3747 ERROR("Failed to keep capabilities");
97a8f74f
SG
3748 return -1;
3749 }
3750 } else if (setup_caps(&lxc_conf->caps)) {
8353b4c9 3751 ERROR("Failed to drop capabilities");
97a8f74f 3752 return -1;
81810dd1
DL
3753 }
3754
8353b4c9 3755 NOTICE("The container \"%s\" is set up", name);
cd54d859 3756
0ad19a3f 3757 return 0;
3758}
26ddeedd 3759
3f60c2f7 3760int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3761 char *argv[])
26ddeedd 3762{
26ddeedd 3763 struct lxc_list *it;
3ea957c6
RK
3764 int which;
3765
3766 for (which = 0; which < NUM_LXC_HOOKS; which ++) {
3767 if (strcmp(hookname, lxchook_names[which]) == 0)
3768 break;
3769 }
3770
3771 if (which >= NUM_LXC_HOOKS)
26ddeedd 3772 return -1;
3f60c2f7 3773
0fd73091 3774 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3775 int ret;
3f60c2f7
CB
3776 char *hook = it->elem;
3777
3778 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3779 hookname, argv);
3f60c2f7
CB
3780 if (ret < 0)
3781 return -1;
26ddeedd 3782 }
3f60c2f7 3783
26ddeedd
SH
3784 return 0;
3785}
72d0e1cb 3786
72d0e1cb
SG
3787int lxc_clear_config_caps(struct lxc_conf *c)
3788{
1a0e70ac 3789 struct lxc_list *it, *next;
72d0e1cb 3790
0fd73091 3791 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3792 lxc_list_del(it);
3793 free(it->elem);
3794 free(it);
3795 }
0fd73091 3796
72d0e1cb
SG
3797 return 0;
3798}
3799
c7e345ae
CB
3800static int lxc_free_idmap(struct lxc_list *id_map)
3801{
27c27d73
SH
3802 struct lxc_list *it, *next;
3803
0fd73091 3804 lxc_list_for_each_safe (it, id_map, next) {
27c27d73
SH
3805 lxc_list_del(it);
3806 free(it->elem);
3807 free(it);
3808 }
c7e345ae 3809
27c27d73
SH
3810 return 0;
3811}
3812
4355ab5f
SH
3813int lxc_clear_idmaps(struct lxc_conf *c)
3814{
3815 return lxc_free_idmap(&c->id_map);
3816}
3817
1fb86a7c
SH
3818int lxc_clear_config_keepcaps(struct lxc_conf *c)
3819{
0fd73091 3820 struct lxc_list *it, *next;
1fb86a7c 3821
0fd73091 3822 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3823 lxc_list_del(it);
3824 free(it->elem);
3825 free(it);
3826 }
0fd73091 3827
1fb86a7c
SH
3828 return 0;
3829}
3830
a3ed9b81 3831int lxc_clear_namespace(struct lxc_conf *c)
3832{
3833 int i;
3834 for (i = 0; i < LXC_NS_MAX; i++) {
3835 free(c->ns_share[i]);
3836 c->ns_share[i] = NULL;
3837 }
3838 return 0;
3839}
3840
54860ed0 3841int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3842{
54860ed0 3843 char *global_token, *namespaced_token;
ab1a6cac 3844 size_t namespaced_token_len;
54860ed0 3845 struct lxc_list *it, *next, *list;
ab1a6cac 3846 const char *k = key;
54860ed0 3847 bool all = false;
72d0e1cb 3848
54860ed0
CB
3849 if (version == CGROUP2_SUPER_MAGIC) {
3850 global_token = "lxc.cgroup2";
3851 namespaced_token = "lxc.cgroup2.";
6333c915 3852 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
54860ed0
CB
3853 list = &c->cgroup2;
3854 } else if (version == CGROUP_SUPER_MAGIC) {
3855 global_token = "lxc.cgroup";
3856 namespaced_token = "lxc.cgroup.";
6333c915 3857 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
54860ed0
CB
3858 list = &c->cgroup;
3859 } else {
ab1a6cac 3860 return -EINVAL;
54860ed0
CB
3861 }
3862
3863 if (strcmp(key, global_token) == 0)
72d0e1cb 3864 all = true;
6333c915 3865 else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
ab1a6cac 3866 k += namespaced_token_len;
a6390f01 3867 else
ab1a6cac 3868 return -EINVAL;
72d0e1cb 3869
0fd73091 3870 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3871 struct lxc_cgroup *cg = it->elem;
54860ed0 3872
72d0e1cb
SG
3873 if (!all && strcmp(cg->subsystem, k) != 0)
3874 continue;
54860ed0 3875
72d0e1cb
SG
3876 lxc_list_del(it);
3877 free(cg->subsystem);
3878 free(cg->value);
3879 free(cg);
3880 free(it);
3881 }
e409b214 3882
72d0e1cb
SG
3883 return 0;
3884}
3885
4bfb655e
CB
3886static void lxc_clear_devices(struct lxc_conf *conf)
3887{
3888 struct lxc_list *list = &conf->devices;
3889 struct lxc_list *it, *next;
3890
3891 lxc_list_for_each_safe(it, list, next) {
3892 lxc_list_del(it);
3893 free(it);
3894 }
3895}
3896
c6d09e15
WB
3897int lxc_clear_limits(struct lxc_conf *c, const char *key)
3898{
3899 struct lxc_list *it, *next;
c6d09e15 3900 const char *k = NULL;
0fd73091 3901 bool all = false;
c6d09e15 3902
b668653c 3903 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3904 all = true;
6333c915
CB
3905 else if (strncmp(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")) == 0)
3906 k = key + STRLITERALLEN("lxc.limit.");
3907 else if (strncmp(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")) == 0)
3908 k = key + STRLITERALLEN("lxc.prlimit.");
c6d09e15
WB
3909 else
3910 return -1;
3911
0fd73091 3912 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3913 struct lxc_limit *lim = it->elem;
0fd73091 3914
c6d09e15
WB
3915 if (!all && strcmp(lim->resource, k) != 0)
3916 continue;
0fd73091 3917
c6d09e15
WB
3918 lxc_list_del(it);
3919 free(lim->resource);
3920 free(lim);
3921 free(it);
3922 }
b668653c 3923
c6d09e15
WB
3924 return 0;
3925}
3926
7edd0540
L
3927int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3928{
3929 struct lxc_list *it, *next;
7edd0540 3930 const char *k = NULL;
0fd73091 3931 bool all = false;
7edd0540
L
3932
3933 if (strcmp(key, "lxc.sysctl") == 0)
3934 all = true;
6333c915
CB
3935 else if (strncmp(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")) == 0)
3936 k = key + STRLITERALLEN("lxc.sysctl.");
7edd0540
L
3937 else
3938 return -1;
3939
0fd73091 3940 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3941 struct lxc_sysctl *elem = it->elem;
0fd73091 3942
7edd0540
L
3943 if (!all && strcmp(elem->key, k) != 0)
3944 continue;
0fd73091 3945
7edd0540
L
3946 lxc_list_del(it);
3947 free(elem->key);
3948 free(elem->value);
3949 free(elem);
3950 free(it);
3951 }
0fd73091 3952
7edd0540
L
3953 return 0;
3954}
3955
61d7a733
YT
3956int lxc_clear_procs(struct lxc_conf *c, const char *key)
3957{
0fd73091 3958 struct lxc_list *it, *next;
61d7a733 3959 const char *k = NULL;
0fd73091 3960 bool all = false;
61d7a733
YT
3961
3962 if (strcmp(key, "lxc.proc") == 0)
3963 all = true;
6333c915
CB
3964 else if (strncmp(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")) == 0)
3965 k = key + STRLITERALLEN("lxc.proc.");
61d7a733
YT
3966 else
3967 return -1;
3968
0fd73091 3969 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3970 struct lxc_proc *proc = it->elem;
0fd73091 3971
61d7a733
YT
3972 if (!all && strcmp(proc->filename, k) != 0)
3973 continue;
0fd73091 3974
61d7a733
YT
3975 lxc_list_del(it);
3976 free(proc->filename);
3977 free(proc->value);
3978 free(proc);
3979 free(it);
3980 }
3981
3982 return 0;
3983}
3984
ee1e7aa0
SG
3985int lxc_clear_groups(struct lxc_conf *c)
3986{
0fd73091 3987 struct lxc_list *it, *next;
ee1e7aa0 3988
0fd73091 3989 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3990 lxc_list_del(it);
3991 free(it->elem);
3992 free(it);
3993 }
0fd73091 3994
ee1e7aa0
SG
3995 return 0;
3996}
3997
ab799c0b
SG
3998int lxc_clear_environment(struct lxc_conf *c)
3999{
0fd73091 4000 struct lxc_list *it, *next;
ab799c0b 4001
0fd73091 4002 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
4003 lxc_list_del(it);
4004 free(it->elem);
4005 free(it);
4006 }
0fd73091 4007
ab799c0b
SG
4008 return 0;
4009}
4010
72d0e1cb
SG
4011int lxc_clear_mount_entries(struct lxc_conf *c)
4012{
0fd73091 4013 struct lxc_list *it, *next;
72d0e1cb 4014
0fd73091 4015 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
4016 lxc_list_del(it);
4017 free(it->elem);
4018 free(it);
4019 }
0fd73091 4020
72d0e1cb
SG
4021 return 0;
4022}
4023
b099e9e9
SH
4024int lxc_clear_automounts(struct lxc_conf *c)
4025{
4026 c->auto_mounts = 0;
4027 return 0;
4028}
4029
12a50cc6 4030int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4031{
72d0e1cb 4032 int i;
0fd73091
CB
4033 struct lxc_list *it, *next;
4034 const char *k = NULL;
4035 bool all = false, done = false;
72d0e1cb 4036
17ed13a3
SH
4037 if (strcmp(key, "lxc.hook") == 0)
4038 all = true;
6333c915
CB
4039 else if (strncmp(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")) == 0)
4040 k = key + STRLITERALLEN("lxc.hook.");
a6390f01
WB
4041 else
4042 return -1;
17ed13a3 4043
0fd73091 4044 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 4045 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 4046 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
4047 lxc_list_del(it);
4048 free(it->elem);
4049 free(it);
4050 }
0fd73091 4051
17ed13a3 4052 done = true;
72d0e1cb
SG
4053 }
4054 }
17ed13a3
SH
4055
4056 if (!done) {
4057 ERROR("Invalid hook key: %s", key);
4058 return -1;
4059 }
0fd73091 4060
72d0e1cb
SG
4061 return 0;
4062}
8eb5694b 4063
4184c3e1
SH
4064static inline void lxc_clear_aliens(struct lxc_conf *conf)
4065{
0fd73091 4066 struct lxc_list *it, *next;
4184c3e1 4067
0fd73091 4068 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
4069 lxc_list_del(it);
4070 free(it->elem);
4071 free(it);
4072 }
4073}
4074
c7b15d1e 4075void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 4076{
0fd73091 4077 struct lxc_list *it, *next;
f979ac15 4078
0fd73091 4079 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
4080 lxc_list_del(it);
4081 free(it->elem);
4082 free(it);
4083 }
4084}
4085
1800f924
WB
4086int lxc_clear_apparmor_raw(struct lxc_conf *c)
4087{
4088 struct lxc_list *it, *next;
4089
4090 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
4091 lxc_list_del(it);
4092 free(it->elem);
4093 free(it);
4094 }
4095
4096 return 0;
4097}
4098
8eb5694b
SH
4099void lxc_conf_free(struct lxc_conf *conf)
4100{
4101 if (!conf)
4102 return;
0fd73091 4103
858377e4
SH
4104 if (current_config == conf)
4105 current_config = NULL;
aed105d5 4106 lxc_terminal_conf_free(&conf->console);
f10fad2f 4107 free(conf->rootfs.mount);
b3b8c97f 4108 free(conf->rootfs.bdev_type);
f10fad2f
ME
4109 free(conf->rootfs.options);
4110 free(conf->rootfs.path);
9dd75981 4111 free(conf->rootfs.data);
f10fad2f 4112 free(conf->logfile);
858377e4
SH
4113 if (conf->logfd != -1)
4114 close(conf->logfd);
f10fad2f 4115 free(conf->utsname);
885766f5
CB
4116 free(conf->ttys.dir);
4117 free(conf->ttys.tty_names);
f10fad2f
ME
4118 free(conf->fstab);
4119 free(conf->rcfile);
5cda27c1 4120 free(conf->execute_cmd);
f10fad2f 4121 free(conf->init_cmd);
3c491553 4122 free(conf->init_cwd);
6b0d5538 4123 free(conf->unexpanded_config);
76d0127f 4124 free(conf->syslog);
c302b476 4125 lxc_free_networks(&conf->network);
f10fad2f 4126 free(conf->lsm_aa_profile);
1800f924 4127 free(conf->lsm_aa_profile_computed);
f10fad2f 4128 free(conf->lsm_se_context);
c3e3c21a 4129 lxc_seccomp_free(&conf->seccomp);
8eb5694b 4130 lxc_clear_config_caps(conf);
1fb86a7c 4131 lxc_clear_config_keepcaps(conf);
54860ed0
CB
4132 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
4133 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
4bfb655e 4134 lxc_clear_devices(conf);
bf651989 4135 lxc_clear_cgroup2_devices(conf);
17ed13a3 4136 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4137 lxc_clear_mount_entries(conf);
27c27d73 4138 lxc_clear_idmaps(conf);
ee1e7aa0 4139 lxc_clear_groups(conf);
f979ac15 4140 lxc_clear_includes(conf);
761d81ca 4141 lxc_clear_aliens(conf);
ab799c0b 4142 lxc_clear_environment(conf);
240d4b74 4143 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 4144 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 4145 lxc_clear_procs(conf, "lxc.proc");
1800f924 4146 lxc_clear_apparmor_raw(conf);
a3ed9b81 4147 lxc_clear_namespace(conf);
43654d34
CB
4148 free(conf->cgroup_meta.dir);
4149 free(conf->cgroup_meta.controllers);
7a41e857
LT
4150 free(conf->shmount.path_host);
4151 free(conf->shmount.path_cont);
8eb5694b
SH
4152 free(conf);
4153}
4355ab5f
SH
4154
/* Argument bundle consumed by run_userns_fn(). */
struct userns_fn_data {
	/* Callback invoked once the parent has signalled over the pipe. */
	int (*fn)(void *);
	/* Name used in the trace message; no message is logged when NULL. */
	const char *fn_name;
	/* Opaque argument forwarded to fn. */
	void *arg;
	/* Synchronisation pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4161
4162static int run_userns_fn(void *data)
4163{
adaffdd7 4164 int ret;
4355ab5f 4165 char c;
0fd73091 4166 struct userns_fn_data *d = data;
4355ab5f 4167
f8aa4bf3 4168 /* Close write end of the pipe. */
4355ab5f 4169 close(d->p[1]);
f8aa4bf3
CB
4170
4171 /* Wait for parent to finish establishing a new mapping in the user
4172 * namespace we are executing in.
4173 */
adaffdd7 4174 ret = lxc_read_nointr(d->p[0], &c, 1);
f8aa4bf3 4175 /* Close read end of the pipe. */
4355ab5f 4176 close(d->p[0]);
adaffdd7
CB
4177 if (ret != 1)
4178 return -1;
f8aa4bf3 4179
c9b7c33e 4180 if (d->fn_name)
adaffdd7 4181 TRACE("Calling function \"%s\"", d->fn_name);
0fd73091 4182
f8aa4bf3 4183 /* Call function to run. */
4355ab5f
SH
4184 return d->fn(d->arg);
4185}
4186
db7cfe23
CB
4187static struct id_map *mapped_nsid_add(struct lxc_conf *conf, unsigned id,
4188 enum idtype idtype)
4189{
5173b710
CB
4190 const struct id_map *map;
4191 struct id_map *retmap;
db7cfe23
CB
4192
4193 map = find_mapped_nsid_entry(conf, id, idtype);
4194 if (!map)
4195 return NULL;
4196
4197 retmap = malloc(sizeof(*retmap));
4198 if (!retmap)
4199 return NULL;
4200
4201 memcpy(retmap, map, sizeof(*retmap));
4202 return retmap;
4203}
4204
c4333195
CB
4205static struct id_map *find_mapped_hostid_entry(struct lxc_conf *conf,
4206 unsigned id, enum idtype idtype)
f8aa4bf3 4207{
f8aa4bf3 4208 struct id_map *map;
0fd73091 4209 struct lxc_list *it;
f8aa4bf3
CB
4210 struct id_map *retmap = NULL;
4211
0fd73091 4212 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4213 map = it->elem;
4214 if (map->idtype != idtype)
4215 continue;
4216
4217 if (id >= map->hostid && id < map->hostid + map->range) {
4218 retmap = map;
4219 break;
4220 }
4221 }
4222
f8aa4bf3
CB
4223 return retmap;
4224}
4225
0fd73091 4226/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4227 * existing one or establish a new one.
4355ab5f 4228 */
0fd73091
CB
4229static struct id_map *mapped_hostid_add(struct lxc_conf *conf, uid_t id,
4230 enum idtype type)
4355ab5f 4231{
28a2d9e7 4232 int hostid_mapped;
c4333195
CB
4233 struct id_map *entry = NULL, *tmp = NULL;
4234
4235 entry = malloc(sizeof(*entry));
4236 if (!entry)
4237 return NULL;
f8aa4bf3 4238
28a2d9e7 4239 /* Reuse existing mapping. */
c4333195
CB
4240 tmp = find_mapped_hostid_entry(conf, id, type);
4241 if (tmp)
4242 return memcpy(entry, tmp, sizeof(*entry));
f8aa4bf3 4243
28a2d9e7
CB
4244 /* Find new mapping. */
4245 hostid_mapped = find_unmapped_nsid(conf, type);
4246 if (hostid_mapped < 0) {
c4333195
CB
4247 DEBUG("Failed to find free mapping for id %d", id);
4248 free(entry);
28a2d9e7 4249 return NULL;
f8aa4bf3 4250 }
f8aa4bf3 4251
28a2d9e7
CB
4252 entry->idtype = type;
4253 entry->nsid = hostid_mapped;
4254 entry->hostid = (unsigned long)id;
4255 entry->range = 1;
4355ab5f 4256
28a2d9e7 4257 return entry;
4355ab5f
SH
4258}
4259
dcf0ffdf 4260struct lxc_list *get_minimal_idmap(struct lxc_conf *conf)
4355ab5f 4261{
00d6cfe2
CB
4262 __do_free struct id_map *container_root_uid = NULL,
4263 *container_root_gid = NULL,
4264 *host_uid_map = NULL, *host_gid_map = NULL;
4265 __do_free struct lxc_list *idmap = NULL;
f8aa4bf3 4266 uid_t euid, egid;
4160c3a0
CB
4267 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4268 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
00d6cfe2 4269 struct lxc_list *tmplist = NULL;
4355ab5f 4270
db7cfe23 4271 /* Find container root mappings. */
4160c3a0 4272 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
db7cfe23 4273 if (!container_root_uid) {
dcf0ffdf 4274 DEBUG("Failed to find mapping for namespace uid %d", 0);
00d6cfe2 4275 return NULL;
f8aa4bf3 4276 }
dcf0ffdf
CB
4277 euid = geteuid();
4278 if (euid >= container_root_uid->hostid &&
4279 euid < (container_root_uid->hostid + container_root_uid->range))
db7cfe23 4280 host_uid_map = container_root_uid;
f8aa4bf3 4281
4160c3a0 4282 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
db7cfe23 4283 if (!container_root_gid) {
dcf0ffdf 4284 DEBUG("Failed to find mapping for namespace gid %d", 0);
00d6cfe2 4285 return NULL;
f8aa4bf3 4286 }
dcf0ffdf
CB
4287 egid = getegid();
4288 if (egid >= container_root_gid->hostid &&
4289 egid < (container_root_gid->hostid + container_root_gid->range))
db7cfe23 4290 host_gid_map = container_root_gid;
f8aa4bf3
CB
4291
4292 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4293 if (!host_uid_map)
c4333195 4294 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
28a2d9e7 4295 if (!host_uid_map) {
db7cfe23 4296 DEBUG("Failed to find mapping for uid %d", euid);
00d6cfe2 4297 return NULL;
f8aa4bf3
CB
4298 }
4299
dcf0ffdf
CB
4300 if (!host_gid_map)
4301 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
28a2d9e7 4302 if (!host_gid_map) {
db7cfe23 4303 DEBUG("Failed to find mapping for gid %d", egid);
00d6cfe2 4304 return NULL;
28a2d9e7
CB
4305 }
4306
4307 /* Allocate new {g,u}id map list. */
4308 idmap = malloc(sizeof(*idmap));
4309 if (!idmap)
00d6cfe2 4310 return NULL;
28a2d9e7
CB
4311 lxc_list_init(idmap);
4312
f8aa4bf3
CB
4313 /* Add container root to the map. */
4314 tmplist = malloc(sizeof(*tmplist));
4315 if (!tmplist)
00d6cfe2 4316 return NULL;
f8aa4bf3
CB
4317 lxc_list_add_elem(tmplist, container_root_uid);
4318 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4319
1d90e064 4320 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7 4321 /* idmap will now keep track of that memory. */
00d6cfe2 4322 move_ptr(container_root_uid);
28a2d9e7
CB
4323
4324 /* Add container root to the map. */
4325 tmplist = malloc(sizeof(*tmplist));
4326 if (!tmplist)
00d6cfe2 4327 return NULL;
28a2d9e7
CB
4328 lxc_list_add_elem(tmplist, host_uid_map);
4329 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4330 }
1d90e064 4331 /* idmap will now keep track of that memory. */
00d6cfe2 4332 move_ptr(container_root_uid);
1d90e064 4333 /* idmap will now keep track of that memory. */
00d6cfe2 4334 move_ptr(host_uid_map);
f8aa4bf3
CB
4335
4336 tmplist = malloc(sizeof(*tmplist));
4337 if (!tmplist)
00d6cfe2 4338 return NULL;
f8aa4bf3
CB
4339 lxc_list_add_elem(tmplist, container_root_gid);
4340 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4341
1d90e064 4342 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7 4343 /* idmap will now keep track of that memory. */
00d6cfe2 4344 move_ptr(container_root_gid);
28a2d9e7
CB
4345
4346 tmplist = malloc(sizeof(*tmplist));
4347 if (!tmplist)
00d6cfe2 4348 return NULL;
28a2d9e7
CB
4349 lxc_list_add_elem(tmplist, host_gid_map);
4350 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4351 }
1d90e064 4352 /* idmap will now keep track of that memory. */
00d6cfe2 4353 move_ptr(container_root_gid);
1d90e064 4354 /* idmap will now keep track of that memory. */
00d6cfe2 4355 move_ptr(host_gid_map);
f8aa4bf3 4356
dcf0ffdf 4357 TRACE("Allocated minimal idmapping");
00d6cfe2 4358 return move_ptr(idmap);
dcf0ffdf
CB
4359}
4360
4361/* Run a function in a new user namespace.
4362 * The caller's euid/egid will be mapped if it is not already.
4363 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4364 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4365 * This means we require only to establish a mapping from:
4366 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4367 * - the container root -> some sub{g,u}id
915e3dbd 4368 * The former we add, if the user did not specify a mapping. The latter we
6f3fd27f 4369 * retrieve from the container's configured {g,u}id mappings as it must have been
dcf0ffdf
CB
4370 * there to start the container in the first place.
4371 */
4372int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4373 const char *fn_name)
4374{
4375 pid_t pid;
dcf0ffdf 4376 int p[2];
0fd73091 4377 struct userns_fn_data d;
dcf0ffdf 4378 struct lxc_list *idmap;
0fd73091
CB
4379 int ret = -1, status = -1;
4380 char c = '1';
dcf0ffdf 4381
2b2655a8
CB
4382 if (!conf)
4383 return -EINVAL;
4384
dcf0ffdf
CB
4385 idmap = get_minimal_idmap(conf);
4386 if (!idmap)
4387 return -1;
4388
979f9e34 4389 ret = pipe2(p, O_CLOEXEC);
dcf0ffdf
CB
4390 if (ret < 0) {
4391 SYSERROR("Failed to create pipe");
4392 return -1;
4393 }
4394 d.fn = fn;
4395 d.fn_name = fn_name;
4396 d.arg = data;
4397 d.p[0] = p[0];
4398 d.p[1] = p[1];
4399
4400 /* Clone child in new user namespace. */
a59440be 4401 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER, NULL);
dcf0ffdf 4402 if (pid < 0) {
0fd73091 4403 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4404 goto on_error;
4405 }
4406
4407 close(p[0]);
4408 p[0] = -1;
4409
4b73005c
CB
4410 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4411 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
dcf0ffdf 4412 struct id_map *map;
0fd73091 4413 struct lxc_list *it;
dcf0ffdf 4414
0fd73091 4415 lxc_list_for_each (it, idmap) {
f8aa4bf3 4416 map = it->elem;
dcf0ffdf 4417 TRACE("Establishing %cid mapping for \"%d\" in new "
f8aa4bf3 4418 "user namespace: nsuid %lu - hostid %lu - range "
0fd73091
CB
4419 "%lu",
4420 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4421 map->nsid, map->hostid, map->range);
f8aa4bf3 4422 }
4355ab5f
SH
4423 }
4424
f8aa4bf3 4425 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4426 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4427 if (ret < 0) {
0fd73091 4428 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4429 goto on_error;
4355ab5f
SH
4430 }
4431
f8aa4bf3 4432 /* Tell child to proceed. */
489f39be 4433 if (lxc_write_nointr(p[1], &c, 1) != 1) {
dcf0ffdf 4434 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4435 goto on_error;
4355ab5f
SH
4436 }
4437
686dd5d1 4438on_error:
4355ab5f
SH
4439 if (p[0] != -1)
4440 close(p[0]);
4441 close(p[1]);
f8aa4bf3 4442
ee1b16bc
TA
4443 /* Wait for child to finish. */
4444 if (pid > 0)
4445 status = wait_for_pid(pid);
4446
686dd5d1
CB
4447 if (status < 0)
4448 ret = -1;
4449
f8aa4bf3 4450 return ret;
4355ab5f 4451}
97e9cfa0 4452
415a8851
CB
4453int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4454 const char *fn_name)
4455{
4456 pid_t pid;
4457 uid_t euid, egid;
415a8851
CB
4458 int p[2];
4459 struct id_map *map;
4460 struct lxc_list *cur;
0fd73091 4461 struct userns_fn_data d;
415a8851 4462 int ret = -1;
0fd73091 4463 char c = '1';
415a8851
CB
4464 struct lxc_list *idmap = NULL, *tmplist = NULL;
4465 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4466 *host_uid_map = NULL, *host_gid_map = NULL;
4467
2b2655a8
CB
4468 if (!conf)
4469 return -EINVAL;
4470
979f9e34 4471 ret = pipe2(p, O_CLOEXEC);
415a8851
CB
4472 if (ret < 0) {
4473 SYSERROR("opening pipe");
4474 return -1;
4475 }
4476 d.fn = fn;
4477 d.fn_name = fn_name;
4478 d.arg = data;
4479 d.p[0] = p[0];
4480 d.p[1] = p[1];
4481
4482 /* Clone child in new user namespace. */
33258b95 4483 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER, NULL);
415a8851 4484 if (pid < 0) {
0fd73091 4485 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4486 goto on_error;
4487 }
4488
4489 close(p[0]);
4490 p[0] = -1;
4491
4492 euid = geteuid();
4493 egid = getegid();
4494
4495 /* Allocate new {g,u}id map list. */
4496 idmap = malloc(sizeof(*idmap));
4497 if (!idmap)
4498 goto on_error;
4499 lxc_list_init(idmap);
4500
4501 /* Find container root. */
0fd73091 4502 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4503 struct id_map *tmpmap;
4504
4505 tmplist = malloc(sizeof(*tmplist));
4506 if (!tmplist)
4507 goto on_error;
4508
4509 tmpmap = malloc(sizeof(*tmpmap));
4510 if (!tmpmap) {
4511 free(tmplist);
4512 goto on_error;
4513 }
4514
4515 memset(tmpmap, 0, sizeof(*tmpmap));
4516 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4517 tmplist->elem = tmpmap;
4518
4519 lxc_list_add_tail(idmap, tmplist);
4520
4521 map = cur->elem;
4522
4523 if (map->idtype == ID_TYPE_UID)
4524 if (euid >= map->hostid && euid < map->hostid + map->range)
4525 host_uid_map = map;
4526
4527 if (map->idtype == ID_TYPE_GID)
4528 if (egid >= map->hostid && egid < map->hostid + map->range)
4529 host_gid_map = map;
4530
4531 if (map->nsid != 0)
4532 continue;
4533
4534 if (map->idtype == ID_TYPE_UID)
4535 if (container_root_uid == NULL)
4536 container_root_uid = map;
4537
4538 if (map->idtype == ID_TYPE_GID)
4539 if (container_root_gid == NULL)
4540 container_root_gid = map;
4541 }
4542
4543 if (!container_root_uid || !container_root_gid) {
4544 ERROR("No mapping for container root found");
4545 goto on_error;
4546 }
4547
4548 /* Check whether the {g,u}id of the user has a mapping. */
4549 if (!host_uid_map)
c4333195 4550 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4551 else
4552 host_uid_map = container_root_uid;
4553
4554 if (!host_gid_map)
c4333195 4555 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4556 else
4557 host_gid_map = container_root_gid;
4558
4559 if (!host_uid_map) {
4560 DEBUG("Failed to find mapping for uid %d", euid);
4561 goto on_error;
4562 }
4563
4564 if (!host_gid_map) {
4565 DEBUG("Failed to find mapping for gid %d", egid);
4566 goto on_error;
4567 }
4568
4569 if (host_uid_map && (host_uid_map != container_root_uid)) {
4570 /* Add container root to the map. */
4571 tmplist = malloc(sizeof(*tmplist));
4572 if (!tmplist)
4573 goto on_error;
4574 lxc_list_add_elem(tmplist, host_uid_map);
4575 lxc_list_add_tail(idmap, tmplist);
4576 }
4577 /* idmap will now keep track of that memory. */
4578 host_uid_map = NULL;
4579
4580 if (host_gid_map && (host_gid_map != container_root_gid)) {
4581 tmplist = malloc(sizeof(*tmplist));
4582 if (!tmplist)
4583 goto on_error;
4584 lxc_list_add_elem(tmplist, host_gid_map);
4585 lxc_list_add_tail(idmap, tmplist);
4586 }
4587 /* idmap will now keep track of that memory. */
4588 host_gid_map = NULL;
4589
4590 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4591 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
0fd73091 4592 lxc_list_for_each (cur, idmap) {
415a8851
CB
4593 map = cur->elem;
4594 TRACE("establishing %cid mapping for \"%d\" in new "
4595 "user namespace: nsuid %lu - hostid %lu - range "
4596 "%lu",
4597 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4598 map->nsid, map->hostid, map->range);
4599 }
4600 }
4601
4602 /* Set up {g,u}id mapping for user namespace of child process. */
4603 ret = lxc_map_ids(idmap, pid);
4604 if (ret < 0) {
0fd73091 4605 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4606 goto on_error;
4607 }
4608
4609 /* Tell child to proceed. */
489f39be 4610 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4611 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4612 goto on_error;
4613 }
4614
686dd5d1 4615on_error:
ee1b16bc
TA
4616 if (p[0] != -1)
4617 close(p[0]);
4618 close(p[1]);
4619
415a8851 4620 /* Wait for child to finish. */
686dd5d1
CB
4621 if (pid > 0)
4622 ret = wait_for_pid(pid);
415a8851 4623
80758b4b 4624 if (idmap) {
415a8851 4625 lxc_free_idmap(idmap);
80758b4b
DJ
4626 free(idmap);
4627 }
4628
415a8851
CB
4629 if (host_uid_map && (host_uid_map != container_root_uid))
4630 free(host_uid_map);
4631 if (host_gid_map && (host_gid_map != container_root_gid))
4632 free(host_gid_map);
4633
415a8851
CB
4634 return ret;
4635}
4636
a96a8e8c 4637/* not thread-safe, do not use from api without first forking */
0fd73091 4638static char *getuname(void)
97e9cfa0 4639{
4f410b2a 4640 __do_free char *buf = NULL;
cb7aa5e8
DJ
4641 struct passwd pwent;
4642 struct passwd *pwentp = NULL;
cb7aa5e8
DJ
4643 size_t bufsize;
4644 int ret;
97e9cfa0 4645
cb7aa5e8
DJ
4646 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
4647 if (bufsize == -1)
4648 bufsize = 1024;
4649
4650 buf = malloc(bufsize);
4651 if (!buf)
97e9cfa0
SH
4652 return NULL;
4653
cb7aa5e8
DJ
4654 ret = getpwuid_r(geteuid(), &pwent, buf, bufsize, &pwentp);
4655 if (!pwentp) {
4656 if (ret == 0)
4657 WARN("Could not find matched password record.");
4658
4659 ERROR("Failed to get password record - %u", geteuid());
cb7aa5e8
DJ
4660 return NULL;
4661 }
4662
4f410b2a 4663 return strdup(pwent.pw_name);
97e9cfa0
SH
4664}
4665
a96a8e8c 4666/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4667static char *getgname(void)
4668{
4f410b2a 4669 __do_free char *buf = NULL;
3de9fb4c
DJ
4670 struct group grent;
4671 struct group *grentp = NULL;
3de9fb4c
DJ
4672 size_t bufsize;
4673 int ret;
4674
4675 bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
4676 if (bufsize == -1)
4677 bufsize = 1024;
4678
4679 buf = malloc(bufsize);
4680 if (!buf)
4681 return NULL;
4682
4683 ret = getgrgid_r(getegid(), &grent, buf, bufsize, &grentp);
4684 if (!grentp) {
4685 if (ret == 0)
4686 WARN("Could not find matched group record");
97e9cfa0 4687
3de9fb4c 4688 ERROR("Failed to get group record - %u", getegid());
97e9cfa0 4689 return NULL;
3de9fb4c
DJ
4690 }
4691
4f410b2a 4692 return strdup(grent.gr_name);
97e9cfa0
SH
4693}
4694
a96a8e8c 4695/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4696void suggest_default_idmap(void)
4697{
3a6e3bf5 4698 __do_free char *gname = NULL, *line = NULL, *uname = NULL;
4aae564f 4699 __do_fclose FILE *subuid_f = NULL, *subgid_f = NULL;
97e9cfa0 4700 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0
SH
4701 size_t len = 0;
4702
0fd73091
CB
4703 uname = getuname();
4704 if (!uname)
97e9cfa0
SH
4705 return;
4706
0fd73091 4707 gname = getgname();
3a6e3bf5 4708 if (!gname)
97e9cfa0 4709 return;
97e9cfa0 4710
4110345b 4711 subuid_f = fopen(subuidfile, "re");
4aae564f 4712 if (!subuid_f) {
97e9cfa0 4713 ERROR("Your system is not configured with subuids");
97e9cfa0
SH
4714 return;
4715 }
0fd73091 4716
4aae564f 4717 while (getline(&line, &len, subuid_f) != -1) {
0fd73091 4718 char *p, *p2;
b7930180 4719 size_t no_newline = 0;
0fd73091
CB
4720
4721 p = strchr(line, ':');
97e9cfa0
SH
4722 if (*line == '#')
4723 continue;
4724 if (!p)
4725 continue;
4726 *p = '\0';
4727 p++;
0fd73091 4728
97e9cfa0
SH
4729 if (strcmp(line, uname))
4730 continue;
0fd73091 4731
97e9cfa0
SH
4732 p2 = strchr(p, ':');
4733 if (!p2)
4734 continue;
4735 *p2 = '\0';
4736 p2++;
4737 if (!*p2)
4738 continue;
b7930180
CB
4739 no_newline = strcspn(p2, "\n");
4740 p2[no_newline] = '\0';
4741
b7b2fde4 4742 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4743 WARN("Could not parse UID");
b7b2fde4 4744 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4745 WARN("Could not parse UID range");
97e9cfa0 4746 }
97e9cfa0 4747
4110345b 4748 subgid_f = fopen(subgidfile, "re");
4aae564f 4749 if (!subgid_f) {
97e9cfa0 4750 ERROR("Your system is not configured with subgids");
97e9cfa0
SH
4751 return;
4752 }
0fd73091 4753
4aae564f 4754 while (getline(&line, &len, subgid_f) != -1) {
0fd73091 4755 char *p, *p2;
b7930180 4756 size_t no_newline = 0;
0fd73091
CB
4757
4758 p = strchr(line, ':');
97e9cfa0
SH
4759 if (*line == '#')
4760 continue;
4761 if (!p)
4762 continue;
4763 *p = '\0';
4764 p++;
0fd73091 4765
97e9cfa0
SH
4766 if (strcmp(line, uname))
4767 continue;
0fd73091 4768
97e9cfa0
SH
4769 p2 = strchr(p, ':');
4770 if (!p2)
4771 continue;
4772 *p2 = '\0';
4773 p2++;
4774 if (!*p2)
4775 continue;
b7930180
CB
4776 no_newline = strcspn(p2, "\n");
4777 p2[no_newline] = '\0';
4778
b7b2fde4 4779 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4780 WARN("Could not parse GID");
b7b2fde4 4781 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4782 WARN("Could not parse GID range");
97e9cfa0 4783 }
97e9cfa0 4784
97e9cfa0
SH
4785 if (!urange || !grange) {
4786 ERROR("You do not have subuids or subgids allocated");
4787 ERROR("Unprivileged containers require subuids and subgids");
4788 return;
4789 }
4790
4791 ERROR("You must either run as root, or define uid mappings");
4792 ERROR("To pass uid mappings to lxc-create, you could create");
4793 ERROR("~/.config/lxc/default.conf:");
4794 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4795 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4796 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0 4797}
aaf26830 4798
a7307747
SH
4799static void free_cgroup_settings(struct lxc_list *result)
4800{
4801 struct lxc_list *iterator, *next;
4802
0fd73091 4803 lxc_list_for_each_safe (iterator, result, next) {
a7307747
SH
4804 lxc_list_del(iterator);
4805 free(iterator);
4806 }
4807 free(result);
4808}
4809
0fd73091 4810/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4811 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4812 */
0fd73091 4813struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4814{
4815 struct lxc_list *result;
aaf26830 4816 struct lxc_cgroup *cg = NULL;
0fd73091 4817 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4818
4819 result = malloc(sizeof(*result));
0fd73091 4820 if (!result)
fac7c663 4821 return NULL;
aaf26830
KT
4822 lxc_list_init(result);
4823
0fd73091
CB
4824 /* Iterate over the cgroup settings and copy them to the output list. */
4825 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4826 item = malloc(sizeof(*item));
fac7c663 4827 if (!item) {
a7307747 4828 free_cgroup_settings(result);
fac7c663
KT
4829 return NULL;
4830 }
0fd73091 4831
aaf26830
KT
4832 item->elem = it->elem;
4833 cg = it->elem;
4834 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4835 /* Store the memsw_limit location */
4836 memsw_limit = item;
0fd73091
CB
4837 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4838 memsw_limit != NULL) {
4839 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4840 * before lxc.cgroup.memory.limit_in_bytes, swap these
4841 * two items */
aaf26830
KT
4842 item->elem = memsw_limit->elem;
4843 memsw_limit->elem = it->elem;
4844 }
4845 lxc_list_add_tail(result, item);
4846 }
4847
4848 return result;
a7307747 4849}