]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
macro: define TIOCGPTPEER if missing
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
cc73685d 1/* SPDX-License-Identifier: LGPL-2.1+ */
1d52bdf7 2
d38dd64a
CB
3#ifndef _GNU_SOURCE
4#define _GNU_SOURCE 1
5#endif
9d257a2a 6#include <arpa/inet.h>
8f3e280e
CB
7#include <dirent.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <grp.h>
11#include <inttypes.h>
12#include <libgen.h>
9d257a2a
CB
13#include <linux/loop.h>
14#include <net/if.h>
15#include <netinet/in.h>
8f3e280e
CB
16#include <pwd.h>
17#include <stdarg.h>
0ad19a3f 18#include <stdio.h>
0ad19a3f 19#include <stdlib.h>
0ad19a3f 20#include <string.h>
8f3e280e
CB
21#include <sys/mman.h>
22#include <sys/mount.h>
23#include <sys/param.h>
24#include <sys/prctl.h>
6a49f05e 25#include <sys/sendfile.h>
8f3e280e 26#include <sys/socket.h>
9d257a2a 27#include <sys/stat.h>
2d76d1d7 28#include <sys/syscall.h>
9d257a2a 29#include <sys/sysmacros.h>
97e9cfa0 30#include <sys/types.h>
8f3e280e
CB
31#include <sys/utsname.h>
32#include <sys/wait.h>
9d257a2a
CB
33#include <time.h>
34#include <unistd.h>
1d52bdf7 35
d38dd64a
CB
36#include "af_unix.h"
37#include "caps.h"
38#include "cgroup.h"
bf651989 39#include "cgroup2_devices.h"
d38dd64a
CB
40#include "conf.h"
41#include "config.h"
42#include "confile.h"
43#include "confile_utils.h"
44#include "error.h"
45#include "log.h"
46#include "lsm/lsm.h"
47#include "lxclock.h"
48#include "lxcseccomp.h"
49#include "macro.h"
2f443e88 50#include "memory_utils.h"
7f88a1a2 51#include "mount_utils.h"
d38dd64a
CB
52#include "namespace.h"
53#include "network.h"
54#include "parse.h"
f40988c7 55#include "process_utils.h"
d38dd64a
CB
56#include "ringbuf.h"
57#include "start.h"
58#include "storage.h"
59#include "storage/overlay.h"
6b3d24d7 60#include "syscall_wrappers.h"
d38dd64a
CB
61#include "terminal.h"
62#include "utils.h"
20502652 63#include "uuid.h"
d38dd64a 64
af6824fc 65#ifdef MAJOR_IN_MKDEV
9d257a2a 66#include <sys/mkdev.h>
af6824fc 67#endif
af6824fc 68
614305f3 69#ifdef HAVE_STATVFS
2938f7c8 70#include <sys/statvfs.h>
614305f3 71#endif
e827ff7e 72
35eb5cdc 73#if HAVE_OPENPTY
b0a33c1e 74#include <pty.h>
e827ff7e
SG
75#else
76#include <../include/openpty.h>
77#endif
0ad19a3f 78
9d257a2a
CB
79#if HAVE_LIBCAP
80#include <sys/capability.h>
81#endif
82
83#if HAVE_SYS_PERSONALITY_H
84#include <sys/personality.h>
85#endif
86
f1e05b90
DJ
87#ifndef HAVE_STRLCAT
88#include "include/strlcat.h"
89#endif
90
9d257a2a
CB
91#if IS_BIONIC
92#include <../include/lxcmntent.h>
93#else
94#include <mntent.h>
95#endif
96
97#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
98#include <../include/prlimit.h>
99#endif
100
ac2cecc4 101lxc_log_define(conf, lxc);
e5bda9ee 102
0fd73091
CB
/* Configuration of the container the current API call is working on; the
 * error calls consult it. Declared thread_local when HAVE_TLS is defined,
 * otherwise as a plain global.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
thread_local struct lxc_conf *current_config;
#endif
8912711c 111
0fd73091
CB
112char *lxchook_names[NUM_LXC_HOOKS] = {
113 "pre-start",
114 "pre-mount",
115 "mount",
116 "autodev",
117 "start",
118 "stop",
119 "post-stop",
120 "clone",
121 "destroy",
122 "start-host"
123};
72d0e1cb 124
998ac676
RT
/* One textual mount option and the mount flag bits it maps to. */
struct mount_opt {
	/* Option keyword as written in a mount entry. */
	char *name;
	/* Non-zero for the "negative" spellings in the tables (e.g. "rw" for
	 * MS_RDONLY); presumably tells the parser to clear the bits instead of
	 * setting them - confirm against the parser.
	 */
	int clear;
	/* MS_* flag bits associated with the keyword. */
	int flag;
};

/* Maps a capability keyword to its CAP_* constant. */
struct caps_opt {
	char *name;
	int value;
};

/* Maps a resource-limit keyword to its RLIMIT_* constant. */
struct limit_opt {
	char *name;
	int value;
};
140
998ac676 141static struct mount_opt mount_opt[] = {
470b359b
CB
142 { "async", 1, MS_SYNCHRONOUS },
143 { "atime", 1, MS_NOATIME },
144 { "bind", 0, MS_BIND },
88d413d5 145 { "defaults", 0, 0 },
88d413d5 146 { "dev", 1, MS_NODEV },
470b359b 147 { "diratime", 1, MS_NODIRATIME },
88d413d5 148 { "dirsync", 0, MS_DIRSYNC },
470b359b 149 { "exec", 1, MS_NOEXEC },
8912711c 150 { "lazytime", 0, MS_LAZYTIME },
88d413d5 151 { "mand", 0, MS_MANDLOCK },
88d413d5 152 { "noatime", 0, MS_NOATIME },
470b359b 153 { "nodev", 0, MS_NODEV },
88d413d5 154 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
155 { "noexec", 0, MS_NOEXEC },
156 { "nomand", 1, MS_MANDLOCK },
157 { "norelatime", 1, MS_RELATIME },
158 { "nostrictatime", 1, MS_STRICTATIME },
159 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
160 { "rbind", 0, MS_BIND|MS_REC },
161 { "relatime", 0, MS_RELATIME },
470b359b
CB
162 { "remount", 0, MS_REMOUNT },
163 { "ro", 0, MS_RDONLY },
164 { "rw", 1, MS_RDONLY },
88d413d5 165 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
166 { "suid", 1, MS_NOSUID },
167 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 168 { NULL, 0, 0 },
998ac676
RT
169};
170
d840039e 171static struct mount_opt propagation_opt[] = {
0fd73091
CB
172 { "private", 0, MS_PRIVATE },
173 { "shared", 0, MS_SHARED },
174 { "slave", 0, MS_SLAVE },
175 { "unbindable", 0, MS_UNBINDABLE },
176 { "rprivate", 0, MS_PRIVATE|MS_REC },
177 { "rshared", 0, MS_SHARED|MS_REC },
178 { "rslave", 0, MS_SLAVE|MS_REC },
179 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
180 { NULL, 0, 0 },
d840039e
YT
181};
182
81810dd1 183static struct caps_opt caps_opt[] = {
8560cd36 184#if HAVE_LIBCAP
0fd73091
CB
185 { "chown", CAP_CHOWN },
186 { "dac_override", CAP_DAC_OVERRIDE },
187 { "dac_read_search", CAP_DAC_READ_SEARCH },
188 { "fowner", CAP_FOWNER },
189 { "fsetid", CAP_FSETID },
190 { "kill", CAP_KILL },
191 { "setgid", CAP_SETGID },
192 { "setuid", CAP_SETUID },
193 { "setpcap", CAP_SETPCAP },
194 { "linux_immutable", CAP_LINUX_IMMUTABLE },
195 { "net_bind_service", CAP_NET_BIND_SERVICE },
196 { "net_broadcast", CAP_NET_BROADCAST },
197 { "net_admin", CAP_NET_ADMIN },
198 { "net_raw", CAP_NET_RAW },
199 { "ipc_lock", CAP_IPC_LOCK },
200 { "ipc_owner", CAP_IPC_OWNER },
201 { "sys_module", CAP_SYS_MODULE },
202 { "sys_rawio", CAP_SYS_RAWIO },
203 { "sys_chroot", CAP_SYS_CHROOT },
204 { "sys_ptrace", CAP_SYS_PTRACE },
205 { "sys_pacct", CAP_SYS_PACCT },
206 { "sys_admin", CAP_SYS_ADMIN },
207 { "sys_boot", CAP_SYS_BOOT },
208 { "sys_nice", CAP_SYS_NICE },
209 { "sys_resource", CAP_SYS_RESOURCE },
210 { "sys_time", CAP_SYS_TIME },
211 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
212 { "mknod", CAP_MKNOD },
213 { "lease", CAP_LEASE },
57b837e2 214#ifdef CAP_AUDIT_READ
0fd73091 215 { "audit_read", CAP_AUDIT_READ },
57b837e2 216#endif
9527e566 217#ifdef CAP_AUDIT_WRITE
0fd73091 218 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
219#endif
220#ifdef CAP_AUDIT_CONTROL
0fd73091 221 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 222#endif
0fd73091
CB
223 { "setfcap", CAP_SETFCAP },
224 { "mac_override", CAP_MAC_OVERRIDE },
225 { "mac_admin", CAP_MAC_ADMIN },
5170c716 226#ifdef CAP_SYSLOG
0fd73091 227 { "syslog", CAP_SYSLOG },
5170c716
CS
228#endif
229#ifdef CAP_WAKE_ALARM
0fd73091 230 { "wake_alarm", CAP_WAKE_ALARM },
5170c716 231#endif
2b54359b 232#ifdef CAP_BLOCK_SUSPEND
0fd73091 233 { "block_suspend", CAP_BLOCK_SUSPEND },
2b54359b 234#endif
495d2046 235#endif
8560cd36 236};
81810dd1 237
c6d09e15
WB
238static struct limit_opt limit_opt[] = {
239#ifdef RLIMIT_AS
240 { "as", RLIMIT_AS },
241#endif
242#ifdef RLIMIT_CORE
243 { "core", RLIMIT_CORE },
244#endif
245#ifdef RLIMIT_CPU
246 { "cpu", RLIMIT_CPU },
247#endif
248#ifdef RLIMIT_DATA
249 { "data", RLIMIT_DATA },
250#endif
251#ifdef RLIMIT_FSIZE
252 { "fsize", RLIMIT_FSIZE },
253#endif
254#ifdef RLIMIT_LOCKS
255 { "locks", RLIMIT_LOCKS },
256#endif
257#ifdef RLIMIT_MEMLOCK
258 { "memlock", RLIMIT_MEMLOCK },
259#endif
260#ifdef RLIMIT_MSGQUEUE
261 { "msgqueue", RLIMIT_MSGQUEUE },
262#endif
263#ifdef RLIMIT_NICE
264 { "nice", RLIMIT_NICE },
265#endif
266#ifdef RLIMIT_NOFILE
267 { "nofile", RLIMIT_NOFILE },
268#endif
269#ifdef RLIMIT_NPROC
270 { "nproc", RLIMIT_NPROC },
271#endif
272#ifdef RLIMIT_RSS
273 { "rss", RLIMIT_RSS },
274#endif
275#ifdef RLIMIT_RTPRIO
276 { "rtprio", RLIMIT_RTPRIO },
277#endif
278#ifdef RLIMIT_RTTIME
279 { "rttime", RLIMIT_RTTIME },
280#endif
281#ifdef RLIMIT_SIGPENDING
282 { "sigpending", RLIMIT_SIGPENDING },
283#endif
284#ifdef RLIMIT_STACK
285 { "stack", RLIMIT_STACK },
286#endif
287};
288
91c3830e
SH
289static int run_buffer(char *buffer)
290{
cc6a0e78 291 __do_free char *output = NULL;
55022530 292 __do_lxc_pclose struct lxc_popen_FILE *f = NULL;
ebf3a6af 293 int fd, ret;
91c3830e 294
ebec9176 295 f = lxc_popen(buffer);
55022530
CB
296 if (!f)
297 return log_error_errno(-1, errno, "Failed to popen() %s", buffer);
91c3830e
SH
298
299 output = malloc(LXC_LOG_BUFFER_SIZE);
55022530
CB
300 if (!output)
301 return log_error_errno(-1, ENOMEM, "Failed to allocate memory for %s", buffer);
91c3830e 302
ebf3a6af 303 fd = fileno(f->f);
55022530
CB
304 if (fd < 0)
305 return log_error_errno(-1, errno, "Failed to retrieve underlying file descriptor");
ebf3a6af
CB
306
307 for (int i = 0; i < 10; i++) {
308 ssize_t bytes_read;
309
310 bytes_read = lxc_read_nointr(fd, output, LXC_LOG_BUFFER_SIZE - 1);
311 if (bytes_read > 0) {
312 output[bytes_read] = '\0';
313 DEBUG("Script %s produced output: %s", buffer, output);
314 continue;
315 }
316
317 break;
318 }
91c3830e 319
55022530
CB
320 ret = lxc_pclose(move_ptr(f));
321 if (ret == -1)
322 return log_error_errno(-1, errno, "Script exited with error");
323 else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0)
324 return log_error(-1, "Script exited with status %d", WEXITSTATUS(ret));
325 else if (WIFSIGNALED(ret))
326 return log_error(-1, "Script terminated by signal %d", WTERMSIG(ret));
91c3830e
SH
327
328 return 0;
329}
330
14a7b0f9
CB
331int run_script_argv(const char *name, unsigned int hook_version,
332 const char *section, const char *script,
586b1ce7 333 const char *hookname, char **argv)
148e91f5 334{
e1a94937 335 __do_free char *buffer = NULL;
3f60c2f7 336 int buf_pos, i, ret;
d08e5708 337 size_t size = 0;
148e91f5 338
3f60c2f7 339 if (hook_version == 0)
55022530
CB
340 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
341 script, name, section);
3f60c2f7
CB
342 else
343 INFO("Executing script \"%s\" for container \"%s\"", script, name);
148e91f5 344
586b1ce7
CB
345 for (i = 0; argv && argv[i]; i++)
346 size += strlen(argv[i]) + 1;
148e91f5 347
6333c915
CB
348 size += STRLITERALLEN("exec");
349 size++;
148e91f5 350 size += strlen(script);
3f60c2f7
CB
351 size++;
352
148e91f5 353 if (size > INT_MAX)
3f60c2f7 354 return -EFBIG;
148e91f5 355
3f60c2f7 356 if (hook_version == 0) {
d08e5708
CB
357 size += strlen(hookname);
358 size++;
359
360 size += strlen(name);
361 size++;
362
363 size += strlen(section);
364 size++;
365
366 if (size > INT_MAX)
367 return -EFBIG;
327cce76 368 }
3f60c2f7 369
6f8d00d2
CB
370 buffer = malloc(size);
371 if (!buffer)
372 return -ENOMEM;
373
327cce76 374 if (hook_version == 0)
3f60c2f7 375 buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
327cce76 376 else
3f60c2f7 377 buf_pos = snprintf(buffer, size, "exec %s", script);
55022530
CB
378 if (buf_pos < 0 || (size_t)buf_pos >= size)
379 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 380
327cce76 381 if (hook_version == 1) {
3f60c2f7
CB
382 ret = setenv("LXC_HOOK_TYPE", hookname, 1);
383 if (ret < 0) {
55022530 384 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7 385 }
90f20466 386 TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);
3f60c2f7
CB
387
388 ret = setenv("LXC_HOOK_SECTION", section, 1);
55022530
CB
389 if (ret < 0)
390 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_SECTION=%s", section);
3f60c2f7 391 TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);
14a7b0f9
CB
392
393 if (strcmp(section, "net") == 0) {
394 char *parent;
395
586b1ce7 396 if (!argv || !argv[0])
e1a94937 397 return -1;
14a7b0f9 398
586b1ce7 399 ret = setenv("LXC_NET_TYPE", argv[0], 1);
55022530
CB
400 if (ret < 0)
401 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_TYPE=%s", argv[0]);
586b1ce7 402 TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);
14a7b0f9 403
586b1ce7 404 parent = argv[1] ? argv[1] : "";
14a7b0f9 405
a8144263 406 if (strcmp(argv[0], "macvlan") == 0) {
14a7b0f9 407 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
408 if (ret < 0)
409 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 410 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 411 } else if (strcmp(argv[0], "phys") == 0) {
14a7b0f9 412 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
413 if (ret < 0)
414 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9 415 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
a8144263 416 } else if (strcmp(argv[0], "veth") == 0) {
586b1ce7 417 char *peer = argv[2] ? argv[2] : "";
14a7b0f9
CB
418
419 ret = setenv("LXC_NET_PEER", peer, 1);
55022530
CB
420 if (ret < 0)
421 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PEER=%s", peer);
14a7b0f9
CB
422 TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
423
424 ret = setenv("LXC_NET_PARENT", parent, 1);
55022530
CB
425 if (ret < 0)
426 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
14a7b0f9
CB
427 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
428 }
429 }
148e91f5
SH
430 }
431
586b1ce7 432 for (i = 0; argv && argv[i]; i++) {
3f60c2f7
CB
433 size_t len = size - buf_pos;
434
586b1ce7 435 ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
55022530
CB
436 if (ret < 0 || (size_t)ret >= len)
437 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
3f60c2f7 438 buf_pos += ret;
148e91f5
SH
439 }
440
e1a94937 441 return run_buffer(buffer);
148e91f5
SH
442}
443
811ef482 444int run_script(const char *name, const char *section, const char *script, ...)
e3b4c4c4 445{
2f443e88 446 __do_free char *buffer = NULL;
abbfd20b 447 int ret;
2f443e88 448 char *p;
abbfd20b 449 va_list ap;
0fd73091 450 size_t size = 0;
751d9dcd 451
0fd73091 452 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
751d9dcd 453 script, name, section);
e3b4c4c4 454
abbfd20b
DL
455 va_start(ap, script);
456 while ((p = va_arg(ap, char *)))
95642a10 457 size += strlen(p) + 1;
abbfd20b
DL
458 va_end(ap);
459
6333c915 460 size += STRLITERALLEN("exec");
abbfd20b
DL
461 size += strlen(script);
462 size += strlen(name);
463 size += strlen(section);
6d1a5f93 464 size += 4;
abbfd20b 465
95642a10
MS
466 if (size > INT_MAX)
467 return -1;
468
2f443e88 469 buffer = must_realloc(NULL, size);
6d1a5f93 470 ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
0fd73091 471 if (ret < 0 || ret >= size)
9ba8130c 472 return -1;
751d9dcd 473
abbfd20b 474 va_start(ap, script);
9ba8130c 475 while ((p = va_arg(ap, char *))) {
062b72c6 476 int len = size - ret;
9ba8130c
SH
477 int rc;
478 rc = snprintf(buffer + ret, len, " %s", p);
7b5a2435
DJ
479 if (rc < 0 || rc >= len) {
480 va_end(ap);
9ba8130c 481 return -1;
7b5a2435 482 }
9ba8130c
SH
483 ret += rc;
484 }
abbfd20b 485 va_end(ap);
751d9dcd 486
91c3830e 487 return run_buffer(buffer);
e3b4c4c4
ST
488}
489
0fd73091 490/* pin_rootfs
63fc76c3 491 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
492 * the duration of the container run, to prevent the container from marking
493 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
494 * no name pollution is happens.
495 * don't unlink on NFS to avoid random named stale handles.
0c547523
SH
496 * return -1 on error.
497 * return -2 if nothing needed to be pinned.
498 * return an open fd (>=0) if we pinned it.
499 */
500int pin_rootfs(const char *rootfs)
501{
957c4704 502 __do_free char *absrootfs = NULL;
0fd73091 503 int fd, ret;
6b5a54cd 504 char absrootfspin[PATH_MAX];
0c547523 505 struct stat s;
63fc76c3 506 struct statfs sfs;
0c547523 507
e99ee0de 508 if (rootfs == NULL || strlen(rootfs) == 0)
0d03360a 509 return -2;
e99ee0de 510
74e7b662 511 absrootfs = realpath(rootfs, NULL);
512 if (!absrootfs)
9be53773 513 return -2;
0c547523 514
0fd73091 515 ret = stat(absrootfs, &s);
957c4704 516 if (ret < 0)
0c547523 517 return -1;
0c547523 518
957c4704 519 if (!S_ISDIR(s.st_mode))
0c547523
SH
520 return -2;
521
55022530
CB
522 ret = snprintf(absrootfspin, sizeof(absrootfspin), "%s/.lxc-keep", absrootfs);
523 if (ret < 0 || (size_t)ret >= sizeof(absrootfspin))
0c547523 524 return -1;
0c547523 525
55022530 526 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR | O_CLOEXEC);
b7ed4bf0
CS
527 if (fd < 0)
528 return fd;
0fd73091 529
205fc010
CB
530 ret = fstatfs (fd, &sfs);
531 if (ret < 0)
532 return fd;
63fc76c3 533
55022530
CB
534 if (sfs.f_type == NFS_SUPER_MAGIC)
535 return log_debug(fd, "Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
63fc76c3 536
b7ed4bf0 537 (void)unlink(absrootfspin);
0fd73091 538
0c547523
SH
539 return fd;
540}
541
0fd73091
CB
/* Extend @flags with the mount flags already in effect on the filesystem
 * holding @s (or @d when @s is NULL), so that a later mount call does not
 * silently drop them.
 *
 * The nosuid/nodev/ro/noexec restrictions are only carried over when @flags
 * contains MS_REMOUNT; the access-time settings are carried over always.
 * @flags is returned unchanged when both paths are NULL, when statvfs()
 * fails, or when the build lacks HAVE_STATVFS.
 *
 * NOTE(review): statvfs() reports ST_* bits in f_flag while this code tests
 * MS_* constants - confirm that every constant tested here has the same value
 * as its ST_* counterpart on the supported platforms.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long on_remount[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	static const unsigned long always[] = {
		MS_NOATIME, MS_NODIRATIME, MS_LAZYTIME, MS_RELATIME, MS_STRICTATIME,
	};
	const char *path = s ? s : d;
	unsigned long inherited = 0;
	struct statvfs vfs;

	if (!path)
		return flags;

	if (statvfs(path, &vfs) < 0)
		return flags;

	if (flags & MS_REMOUNT) {
		for (size_t i = 0; i < sizeof(on_remount) / sizeof(on_remount[0]); i++) {
			if (vfs.f_flag & on_remount[i])
				inherited |= on_remount[i];
		}
	}

	for (size_t i = 0; i < sizeof(always) / sizeof(always[0]); i++) {
		if (vfs.f_flag & always[i])
			inherited |= always[i];
	}

	return flags | inherited;
#else
	return flags;
#endif
}
590
6b741397
CB
591static int add_shmount_to_list(struct lxc_conf *conf)
592{
6b5a54cd 593 char new_mount[PATH_MAX];
0d190408 594 /* Offset for the leading '/' since the path_cont
6b741397
CB
595 * is absolute inside the container.
596 */
597 int offset = 1, ret = -1;
0d190408 598
6b741397
CB
599 ret = snprintf(new_mount, sizeof(new_mount),
600 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
601 conf->shmount.path_cont + offset);
60534030 602 if (ret < 0 || (size_t)ret >= sizeof(new_mount))
0d190408
LT
603 return -1;
604
6b741397 605 return add_elem_to_mount_list(new_mount, conf);
0d190408
LT
606}
607
4fb3cba5 608static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 609{
0fd73091 610 int i, r;
b06b8511
CS
611 static struct {
612 int match_mask;
613 int match_flag;
614 const char *source;
615 const char *destination;
616 const char *fstype;
617 unsigned long flags;
618 const char *options;
619 } default_mounts[] = {
0fd73091
CB
620 /* Read-only bind-mounting... In older kernels, doing that
621 * required to do one MS_BIND mount and then
622 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
623 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
624 * onwards. However, this apparently does not work on kernel
625 * 3.8. Unfortunately, on that very same kernel, doing the same
626 * trick as above doesn't seem to work either, there one needs
627 * to ALSO specify MS_BIND for the remount, otherwise the
628 * entire fs is remounted read-only or the mount fails because
629 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
630 * kernels as low as 2.6.32...
368bbc02 631 */
0fd73091 632 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a 633 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
0fd73091
CB
634 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
635 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
636 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
637 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
638 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
639 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
640 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
641 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
642 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
643 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
d1c203f4 644 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
0fd73091
CB
645 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
646 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
647 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
648 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
649 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 650 };
368bbc02 651
b06b8511 652 for (i = 0; default_mounts[i].match_mask; i++) {
8db92302 653 __do_free char *destination = NULL, *source = NULL;
0fd73091
CB
654 int saved_errno;
655 unsigned long mflags;
0fd73091
CB
656 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
657 continue;
658
659 if (default_mounts[i].source) {
cc4fd506 660 /* will act like strdup if %r is not present */
0fd73091
CB
661 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
662 if (!source)
cc4fd506 663 return -1;
0fd73091 664 }
f24a52d5 665
55022530
CB
666 if (!default_mounts[i].destination)
667 return log_error(-1, "BUG: auto mounts destination %d was NULL", i);
0fd73091
CB
668
669 /* will act like strdup if %r is not present */
670 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
55022530 671 if (!destination)
0fd73091 672 return -1;
0fd73091
CB
673
674 mflags = add_required_remount_flags(source, destination,
675 default_mounts[i].flags);
676 r = safe_mount(source, destination, default_mounts[i].fstype,
677 mflags, default_mounts[i].options,
678 conf->rootfs.path ? conf->rootfs.mount : NULL);
679 saved_errno = errno;
680 if (r < 0 && errno == ENOENT) {
55022530 681 INFO("Mount source or target for \"%s\" on \"%s\" does not exist. Skipping", source, destination);
0fd73091
CB
682 r = 0;
683 } else if (r < 0) {
684 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
685 }
686
0fd73091
CB
687 if (r < 0) {
688 errno = saved_errno;
689 return -1;
368bbc02 690 }
368bbc02
CS
691 }
692
b06b8511 693 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
694 int cg_flags;
695
3f69fb12 696 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
697 /* If the type of cgroup mount was not specified, it depends on
698 * the container's capabilities as to what makes sense: if we
699 * have CAP_SYS_ADMIN, the read-only part can be remounted
700 * read-write anyway, so we may as well default to read-write;
701 * then the admin will not be given a false sense of security.
702 * (And if they really want mixed r/o r/w, then they can
703 * explicitly specify :mixed.) OTOH, if the container lacks
704 * CAP_SYS_ADMIN, do only default to :mixed, because then the
705 * container can't remount it read-write.
706 */
0769b82a
CS
707 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
708 int has_sys_admin = 0;
b0ee5983
CB
709
710 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 711 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 712 else
0769b82a 713 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
714
715 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 716 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 717 else
0769b82a 718 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 719 }
0fd73091 720
3f69fb12 721 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
722 cg_flags |= LXC_AUTO_CGROUP_FORCE;
723
2202afc9
CB
724 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
725 handler,
726 conf->rootfs.path ? conf->rootfs.mount : "",
55022530
CB
727 cg_flags))
728 return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
368bbc02
CS
729 }
730
0d190408
LT
731 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
732 int ret = add_shmount_to_list(conf);
55022530
CB
733 if (ret < 0)
734 return log_error(-1, "Failed to add shmount entry to container config");
0d190408
LT
735 }
736
368bbc02 737 return 0;
368bbc02
CS
738}
739
/* Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to configure. Returns 0 on success
 * (or when there was nothing to do) and -1 when sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0)
		return log_error_errno(-1, errno, "Failed to set the hostname to \"%s\"",
				       hostname);

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
756
69aa6655
DE
/* One symbolic link to provide below the container's /dev directory. */
struct dev_symlinks {
	/* Path the link points to. */
	const char *oldpath;
	/* Name of the link inside /dev. */
	const char *name;
};

/* The standard file-descriptor links, all pointing into /proc/self. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
768
ed8704d0 769static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 770{
0fd73091 771 int i, ret;
6b5a54cd 772 char path[PATH_MAX];
09227be2 773 struct stat s;
69aa6655 774
69aa6655
DE
775 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
776 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091
CB
777
778 ret = snprintf(path, sizeof(path), "%s/dev/%s",
779 rootfs->path ? rootfs->mount : "", d->name);
55022530 780 if (ret < 0 || (size_t)ret >= sizeof(path))
69aa6655 781 return -1;
09227be2 782
0fd73091
CB
783 /* Stat the path first. If we don't get an error accept it as
784 * is and don't try to create it
09227be2 785 */
0fd73091
CB
786 ret = stat(path, &s);
787 if (ret == 0)
09227be2 788 continue;
09227be2 789
69aa6655
DE
790 ret = symlink(d->oldpath, path);
791 if (ret && errno != EEXIST) {
55022530 792 if (errno == EROFS)
0fd73091 793 WARN("Failed to create \"%s\". Read-only filesystem", path);
55022530
CB
794 else
795 return log_error_errno(-1, errno, "Failed to create \"%s\"", path);
69aa6655
DE
796 }
797 }
0fd73091 798
69aa6655
DE
799 return 0;
800}
801
/* Build a space-separated list of ptys to pass to systemd.
 *
 * When *pp is NULL a new string "container_ttys=<name>" is allocated;
 * otherwise " <name>" is appended to the existing string, which is grown
 * with realloc(). Returns false when memory cannot be obtained, in which
 * case *pp is left as it was.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t old_len;
	char *grown;

	if (!*pp) {
		char *fresh = malloc(name_len + prefix_len + 1);

		if (!fresh)
			return false;

		memcpy(fresh, prefix, prefix_len);
		/* Copies the terminating NUL byte of name as well. */
		memcpy(fresh + prefix_len, name, name_len + 1);
		*pp = fresh;
		return true;
	}

	old_len = strlen(*pp);
	/* Room for the separator, the name and the terminating NUL byte. */
	grown = realloc(*pp, old_len + name_len + 2);
	if (!grown)
		return false;

	grown[old_len] = ' ';
	memcpy(grown + old_len + 1, name, name_len + 1);
	*pp = grown;

	return true;
}
828
2187efd3 829static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 830{
9e1045e3 831 int i, ret;
0e4be3cf 832 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 833 char *ttydir = ttys->dir;
6b5a54cd 834 char path[PATH_MAX], lxcpath[PATH_MAX];
b0a33c1e 835
e8bd4e43 836 if (!conf->rootfs.path)
bc9bd0e3
DL
837 return 0;
838
885766f5 839 for (i = 0; i < ttys->max; i++) {
0e4be3cf 840 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 841
e8bd4e43 842 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 843 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 844 return -1;
9e1045e3 845
7c6ef2a2
SH
846 if (ttydir) {
847 /* create dev/lxc/tty%d" */
9e1045e3
CB
848 ret = snprintf(lxcpath, sizeof(lxcpath),
849 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 850 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 851 return -1;
9e1045e3 852
adc1c715 853 ret = mknod(lxcpath, S_IFREG | 0000, 0);
9e1045e3 854 if (ret < 0 && errno != EEXIST) {
73363c61 855 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
856 return -1;
857 }
9e1045e3 858
7c6ef2a2 859 ret = unlink(path);
9e1045e3 860 if (ret < 0 && errno != ENOENT) {
73363c61 861 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
862 return -1;
863 }
b0a33c1e 864
2520facd 865 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
9e1045e3 866 if (ret < 0) {
55022530 867 SYSWARN("Failed to bind mount \"%s\" onto \"%s\"", tty->name, lxcpath);
7c6ef2a2
SH
868 continue;
869 }
55022530 870 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, lxcpath);
13954cce 871
9e1045e3
CB
872 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
873 ttydir, i + 1);
73363c61 874 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 875 return -1;
9e1045e3 876
7c6ef2a2 877 ret = symlink(lxcpath, path);
55022530
CB
878 if (ret < 0)
879 return log_error_errno(-1, errno, "Failed to create symlink \"%s\" -> \"%s\"", path, lxcpath);
7c6ef2a2 880 } else {
9e1045e3
CB
881 /* If we populated /dev, then we need to create
882 * /dev/ttyN
883 */
d3ccc04e
CB
884 ret = mknod(path, S_IFREG | 0000, 0);
885 if (ret < 0) /* this isn't fatal, continue */
6d1400b5 886 SYSERROR("Failed to create \"%s\"", path);
9e1045e3 887
2520facd 888 ret = mount(tty->name, path, "none", MS_BIND, 0);
9e1045e3 889 if (ret < 0) {
2520facd 890 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
7c6ef2a2
SH
891 continue;
892 }
9e1045e3 893
d3ccc04e 894 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
393903d1 895 }
9e1045e3 896
55022530
CB
897 if (!append_ttyname(&conf->ttys.tty_names, tty->name))
898 return log_error(-1, "Error setting up container_ttys string");
b0a33c1e 899 }
900
885766f5 901 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 902 return 0;
903}
904
586a3fe8
CB
905define_cleanup_function(struct lxc_tty_info *, lxc_delete_tty);
906
59eac805 907static int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 908{
586a3fe8 909 struct lxc_terminal_info *tty_new = NULL;
fca23691 910 int ret;
586a3fe8 911 call_cleaner(lxc_delete_tty) struct lxc_tty_info *ttys = &conf->ttys;
2187efd3
CB
912
913 /* no tty in the configuration */
885766f5 914 if (ttys->max == 0)
2187efd3
CB
915 return 0;
916
55022530
CB
917 tty_new = malloc(sizeof(struct lxc_terminal_info) * ttys->max);
918 if (!tty_new)
2187efd3 919 return -ENOMEM;
55022530 920 ttys->tty = tty_new;
2187efd3 921
55022530 922 for (size_t i = 0; i < ttys->max; i++) {
0e4be3cf 923 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 924
36a94ce8 925 tty->ptx = -EBADF;
41808e20
CB
926 tty->pty = -EBADF;
927 ret = openpty(&tty->ptx, &tty->pty, NULL, NULL, NULL);
77a39805 928 if (ret < 0) {
885766f5 929 ttys->max = i;
55022530 930 return log_error_errno(-ENOTTY, ENOTTY, "Failed to create tty %zu", i);
2187efd3
CB
931 }
932
41808e20 933 ret = ttyname_r(tty->pty, tty->name, sizeof(tty->name));
77a39805 934 if (ret < 0) {
77a39805 935 ttys->max = i;
41808e20 936 return log_error_errno(-ENOTTY, ENOTTY, "Failed to retrieve name of tty %zu pty", i);
77a39805
CB
937 }
938
41808e20
CB
939 DEBUG("Created tty \"%s\" with ptx fd %d and pty fd %d",
940 tty->name, tty->ptx, tty->pty);
2187efd3
CB
941
942 /* Prevent leaking the file descriptors to the container */
36a94ce8 943 ret = fd_cloexec(tty->ptx, true);
2187efd3 944 if (ret < 0)
36a94ce8
CB
945 SYSWARN("Failed to set FD_CLOEXEC flag on ptx fd %d of tty device \"%s\"",
946 tty->ptx, tty->name);
2187efd3 947
41808e20 948 ret = fd_cloexec(tty->pty, true);
2187efd3 949 if (ret < 0)
41808e20
CB
950 SYSWARN("Failed to set FD_CLOEXEC flag on pty fd %d of tty device \"%s\"",
951 tty->pty, tty->name);
2187efd3 952
7581d645 953 tty->busy = -1;
2187efd3
CB
954 }
955
885766f5 956 INFO("Finished creating %zu tty devices", ttys->max);
586a3fe8 957 move_ptr(ttys);
2187efd3
CB
958 return 0;
959}
960
0e4be3cf 961void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3 962{
386e6768
CB
963 if (!ttys->tty)
964 return;
965
55022530 966 for (int i = 0; i < ttys->max; i++) {
0e4be3cf 967 struct lxc_terminal_info *tty = &ttys->tty[i];
36a94ce8 968 close_prot_errno_disarm(tty->ptx);
41808e20 969 close_prot_errno_disarm(tty->pty);
2187efd3
CB
970 }
971
55022530 972 free_disarm(ttys->tty);
2187efd3
CB
973}
974
975static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
976{
977 int i;
0fd73091 978 int ret = -1;
2187efd3 979 struct lxc_conf *conf = handler->conf;
0e4be3cf 980 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 981 int sock = handler->data_sock[0];
2187efd3 982
885766f5 983 if (ttys->max == 0)
2187efd3
CB
984 return 0;
985
885766f5 986 for (i = 0; i < ttys->max; i++) {
2187efd3 987 int ttyfds[2];
0e4be3cf 988 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 989
36a94ce8 990 ttyfds[0] = tty->ptx;
41808e20 991 ttyfds[1] = tty->pty;
2187efd3
CB
992
993 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
994 if (ret < 0)
995 break;
996
41808e20
CB
997 TRACE("Sent tty \"%s\" with ptx fd %d and pty fd %d to parent",
998 tty->name, tty->ptx, tty->pty);
2187efd3
CB
999 }
1000
1001 if (ret < 0)
6d1400b5 1002 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1003 else
885766f5 1004 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1005
1006 return ret;
1007}
1008
1009static int lxc_create_ttys(struct lxc_handler *handler)
1010{
1011 int ret = -1;
1012 struct lxc_conf *conf = handler->conf;
1013
663014ee 1014 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1015 if (ret < 0) {
1016 ERROR("Failed to allocate ttys");
1017 goto on_error;
1018 }
1019
1020 ret = lxc_send_ttys_to_parent(handler);
1021 if (ret < 0) {
1022 ERROR("Failed to send ttys to parent");
1023 goto on_error;
1024 }
1025
1026 if (!conf->is_execute) {
1027 ret = lxc_setup_ttys(conf);
1028 if (ret < 0) {
1029 ERROR("Failed to setup ttys");
1030 goto on_error;
1031 }
1032 }
1033
885766f5
CB
1034 if (conf->ttys.tty_names) {
1035 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1036 if (ret < 0)
885766f5 1037 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1038 }
1039
1040 ret = 0;
1041
1042on_error:
0e4be3cf 1043 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1044
1045 return ret;
1046}
1047
7133b912
CB
1048/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1049 * error, log it but don't fail yet.
91c3830e 1050 */
7133b912 1051static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
63012bdd 1052 int autodevtmpfssize, const char *lxcpath)
91c3830e 1053{
2f443e88 1054 __do_free char *path = NULL;
91c3830e 1055 int ret;
87da4ec3 1056 size_t clen;
87e0e273 1057 mode_t cur_mask;
63012bdd 1058 char mount_options[128];
91c3830e 1059
7133b912 1060 INFO("Preparing \"/dev\"");
bc6928ff 1061
14221cbb 1062 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1063 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
2f443e88 1064 path = must_realloc(NULL, clen);
63012bdd
CK
1065 sprintf(mount_options, "size=%d,mode=755", (autodevtmpfssize != 0) ? autodevtmpfssize : 500000);
1066 DEBUG("Using mount options: %s", mount_options);
bc6928ff 1067
ec50007f 1068 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1069 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1070 return -1;
bc6928ff 1071
87e0e273
CB
1072 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1073 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1074 if (ret < 0 && errno != EEXIST) {
1075 SYSERROR("Failed to create \"/dev\" directory");
1076 ret = -errno;
1077 goto reset_umask;
bc6928ff 1078 }
87da4ec3 1079
63012bdd
CK
1080 ret = safe_mount("none", path, "tmpfs", 0, mount_options,
1081 rootfs->path ? rootfs->mount : NULL );
7133b912
CB
1082 if (ret < 0) {
1083 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
87e0e273 1084 goto reset_umask;
91c3830e 1085 }
87e0e273 1086 TRACE("Mounted tmpfs on \"%s\"", path);
87da4ec3 1087
ec50007f 1088 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87e0e273
CB
1089 if (ret < 0 || (size_t)ret >= clen) {
1090 ret = -1;
1091 goto reset_umask;
1092 }
87da4ec3 1093
7133b912 1094 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1095 * If not, then create it and exit if that fails...
1096 */
87e0e273
CB
1097 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1098 if (ret < 0 && errno != EEXIST) {
1099 SYSERROR("Failed to create directory \"%s\"", path);
1100 ret = -errno;
1101 goto reset_umask;
91c3830e
SH
1102 }
1103
87e0e273
CB
1104 ret = 0;
1105
1106reset_umask:
1107 (void)umask(cur_mask);
1108
7133b912 1109 INFO("Prepared \"/dev\"");
87e0e273 1110 return ret;
91c3830e
SH
1111}
1112
/* Description of one device node that gets created below the container's /dev. */
struct lxc_device_node {
	const char *name;	/* file name relative to /dev */
	const mode_t mode;	/* file type and permission bits handed to mknod() */
	const int maj;		/* device major number */
	const int min;		/* device minor number */
};

/* Character devices that lxc_fill_autodev() provides in the container. */
static const struct lxc_device_node lxc_devices[] = {
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
};
1128
5067e4dd
CB
1129
/*
 * Strategy state used while populating /dev. The numeric order matters:
 * lxc_fill_autodev() compares against LXC_DEVNODE_MKNOD with ">=".
 */
enum {
	LXC_DEVNODE_BIND    = 0,
	LXC_DEVNODE_MKNOD   = 1,
	LXC_DEVNODE_PARTIAL = 2,
	LXC_DEVNODE_OPEN    = 3,
};
1136
27245ff7 1137static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38 1138{
5e73416f 1139 int i, ret;
6b5a54cd 1140 char path[PATH_MAX];
3a32201c 1141 mode_t cmask;
5067e4dd 1142 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1143
6b5a54cd 1144 ret = snprintf(path, PATH_MAX, "%s/dev",
3999be0a 1145 rootfs->path ? rootfs->mount : "");
6b5a54cd 1146 if (ret < 0 || ret >= PATH_MAX)
c6883f38 1147 return -1;
91c3830e 1148
0bbf8572
CB
1149 /* ignore, just don't try to fill in */
1150 if (!dir_exists(path))
9cb4d183
SH
1151 return 0;
1152
3999be0a
CB
1153 INFO("Populating \"/dev\"");
1154
3a32201c 1155 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f 1156 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
6b5a54cd 1157 char hostpath[PATH_MAX];
5e73416f 1158 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1159
6b5a54cd 1160 ret = snprintf(path, PATH_MAX, "%s/dev/%s",
5e73416f 1161 rootfs->path ? rootfs->mount : "", device->name);
6b5a54cd 1162 if (ret < 0 || ret >= PATH_MAX)
c6883f38 1163 return -1;
0bbf8572 1164
5067e4dd 1165 if (use_mknod >= LXC_DEVNODE_MKNOD) {
5e73416f
CB
1166 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1167 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1168 DEBUG("Created device node \"%s\"", path);
5067e4dd 1169 } else if (ret < 0) {
55022530
CB
1170 if (errno != EPERM)
1171 return log_error_errno(-1, errno, "Failed to create device node \"%s\"", path);
0bbf8572 1172
5067e4dd 1173 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1174 }
3999be0a 1175
5067e4dd
CB
1176 /* Device nodes are fully useable. */
1177 if (use_mknod == LXC_DEVNODE_OPEN)
1178 continue;
1179
1180 if (use_mknod == LXC_DEVNODE_MKNOD) {
1181 /* See
1182 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1183 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1184 */
1185 ret = open(path, O_RDONLY | O_CLOEXEC);
1186 if (ret >= 0) {
ae2a3d81 1187 close_prot_errno_disarm(ret);
5067e4dd
CB
1188 /* Device nodes are fully useable. */
1189 use_mknod = LXC_DEVNODE_OPEN;
1190 continue;
1191 }
1192
1193 SYSTRACE("Failed to open \"%s\" device", path);
1194 /* Device nodes are only partially useable. */
1195 use_mknod = LXC_DEVNODE_PARTIAL;
1196 }
5e73416f
CB
1197 }
1198
5067e4dd
CB
1199 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1200 /* If we are dealing with partially functional device
1201 * nodes the prio mknod() call will have created the
1202 * device node so we can use it as a bind-mount target.
1203 */
1204 ret = mknod(path, S_IFREG | 0000, 0);
55022530
CB
1205 if (ret < 0 && errno != EEXIST)
1206 return log_error_errno(-1, errno, "Failed to create file \"%s\"", path);
5e73416f
CB
1207 }
1208
1209 /* Fallback to bind-mounting the device from the host. */
6b5a54cd
CB
1210 ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
1211 if (ret < 0 || ret >= PATH_MAX)
5e73416f
CB
1212 return -1;
1213
1214 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1215 rootfs->path ? rootfs->mount : NULL);
55022530
CB
1216 if (ret < 0)
1217 return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" onto \"%s\"",
1218 hostpath, path);
1219 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"", hostpath, path);
c6883f38 1220 }
5e73416f 1221 (void)umask(cmask);
c6883f38 1222
3999be0a 1223 INFO("Populated \"/dev\"");
c6883f38
SH
1224 return 0;
1225}
1226
8ce1abc2 1227static int lxc_mount_rootfs(struct lxc_conf *conf)
0ad19a3f 1228{
9aa76a17 1229 int ret;
10bc1861 1230 struct lxc_storage *bdev;
8ce1abc2 1231 const struct lxc_rootfs *rootfs = &conf->rootfs;
cc28d0b0 1232
a0f379bf 1233 if (!rootfs->path) {
0fd73091 1234 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
55022530 1235 if (ret < 0)
9e61fb1f 1236 return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
0fd73091 1237
c69bd12f 1238 return 0;
a0f379bf 1239 }
0ad19a3f 1240
0fd73091 1241 ret = access(rootfs->mount, F_OK);
55022530
CB
1242 if (ret != 0)
1243 return log_error_errno(-1, errno, "Failed to access to \"%s\". Check it is present",
1244 rootfs->mount);
b1789442 1245
8a388ed4 1246 bdev = storage_init(conf);
55022530
CB
1247 if (!bdev)
1248 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1249 rootfs->path, rootfs->mount,
1250 rootfs->options ? rootfs->options : "(null)");
9aa76a17
CB
1251
1252 ret = bdev->ops->mount(bdev);
10bc1861 1253 storage_put(bdev);
55022530
CB
1254 if (ret < 0)
1255 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1256 rootfs->path, rootfs->mount,
1257 rootfs->options ? rootfs->options : "(null)");
0ad19a3f 1258
0fd73091 1259 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1260 rootfs->path, rootfs->mount,
1261 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1262
ac778708
DL
1263 return 0;
1264}
1265
59eac805 1266static int lxc_chroot(const struct lxc_rootfs *rootfs)
91e93c71 1267{
b8d88764 1268 __do_free char *nroot = NULL;
0fd73091 1269 int i, ret;
8ce1abc2 1270 char *root = rootfs->mount;
91e93c71 1271
74e7b662 1272 nroot = realpath(root, NULL);
55022530
CB
1273 if (!nroot)
1274 return log_error_errno(-1, errno, "Failed to resolve \"%s\"", root);
91e93c71 1275
0fd73091 1276 ret = chdir("/");
b8d88764 1277 if (ret < 0)
0fd73091 1278 return -1;
91e93c71 1279
0fd73091
CB
1280 /* We could use here MS_MOVE, but in userns this mount is locked and
1281 * can't be moved.
91e93c71 1282 */
8ce1abc2 1283 ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
55022530
CB
1284 if (ret < 0)
1285 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);
91e93c71 1286
0fd73091 1287 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
55022530
CB
1288 if (ret < 0)
1289 return log_error_errno(-1, errno, "Failed to remount \"/\"");
91e93c71 1290
aa899945 1291 /* The following code cleans up inherited mounts which are not required
0fd73091 1292 * for CT.
91e93c71
AV
1293 *
1294 * The mountinfo file shows not all mounts, if a few points have been
1295 * unmounted between read operations from the mountinfo. So we need to
1296 * read mountinfo a few times.
1297 *
7ded5fa7 1298 * This loop can be skipped if a container uses userns, because all
91e93c71
AV
1299 * inherited mounts are locked and we should live with all this trash.
1300 */
0fd73091 1301 for (;;) {
4fdd1f72 1302 __do_fclose FILE *f = NULL;
f3d38164
CB
1303 __do_free char *line = NULL;
1304 char *slider1, *slider2;
91e93c71 1305 int progress = 0;
f3d38164 1306 size_t len = 0;
91e93c71 1307
4110345b 1308 f = fopen("./proc/self/mountinfo", "re");
55022530
CB
1309 if (!f)
1310 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
0fd73091 1311
f3d38164
CB
1312 while (getline(&line, &len, f) > 0) {
1313 for (slider1 = line, i = 0; slider1 && i < 4; i++)
1314 slider1 = strchr(slider1 + 1, ' ');
0fd73091 1315
f3d38164 1316 if (!slider1)
91e93c71 1317 continue;
0fd73091 1318
f3d38164
CB
1319 slider2 = strchr(slider1 + 1, ' ');
1320 if (!slider2)
91e93c71
AV
1321 continue;
1322
f3d38164
CB
1323 *slider2 = '\0';
1324 *slider1 = '.';
91e93c71 1325
f3d38164 1326 if (strcmp(slider1 + 1, "/") == 0)
91e93c71 1327 continue;
0fd73091 1328
f3d38164 1329 if (strcmp(slider1 + 1, "/proc") == 0)
91e93c71
AV
1330 continue;
1331
f3d38164 1332 ret = umount2(slider1, MNT_DETACH);
0fd73091 1333 if (ret == 0)
91e93c71
AV
1334 progress++;
1335 }
0fd73091 1336
91e93c71
AV
1337 if (!progress)
1338 break;
1339 }
1340
7ded5fa7 1341 /* This also can be skipped if a container uses userns. */
0fd73091 1342 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1343
1344 /* It is weird, but chdir("..") moves us in a new root */
0fd73091 1345 ret = chdir("..");
55022530
CB
1346 if (ret < 0)
1347 return log_error_errno(-1, errno, "Failed to chdir(\"..\")");
91e93c71 1348
0fd73091 1349 ret = chroot(".");
55022530
CB
1350 if (ret < 0)
1351 return log_error_errno(-1, errno, "Failed to chroot(\".\")");
91e93c71
AV
1352
1353 return 0;
1354}
1355
8ce1abc2
CB
1356/* (The following explanation is copied verbatim from the kernel.)
1357 *
1358 * pivot_root Semantics:
1359 * Moves the root file system of the current process to the directory put_old,
1360 * makes new_root as the new root file system of the current process, and sets
1361 * root/cwd of all processes which had them on the current root to new_root.
1362 *
1363 * Restrictions:
1364 * The new_root and put_old must be directories, and must not be on the
1365 * same file system as the current process root. The put_old must be
1366 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1367 * pointed to by put_old must yield the same directory as new_root. No other
1368 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1369 *
1370 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1371 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1372 * in this situation.
1373 *
1374 * Notes:
1375 * - we don't move root/cwd if they are not at the root (reason: if something
1376 * cared enough to change them, it's probably wrong to force them elsewhere)
1377 * - it's okay to pick a root that isn't the root of a file system, e.g.
1378 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1379 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1380 * first.
1381 */
1382static int lxc_pivot_root(const char *rootfs)
ac778708 1383{
f62cf1d4 1384 __do_close int oldroot = -EBADF, newroot = -EBADF;
b0d7aac4 1385 int ret;
0fd73091 1386
7806ebd7 1387 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
55022530
CB
1388 if (oldroot < 0)
1389 return log_error_errno(-1, errno, "Failed to open old root directory");
ac778708 1390
7806ebd7 1391 newroot = open(rootfs, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
55022530
CB
1392 if (newroot < 0)
1393 return log_error_errno(-1, errno, "Failed to open new root directory");
0fd73091 1394
8ce1abc2
CB
1395 /* change into new root fs */
1396 ret = fchdir(newroot);
55022530
CB
1397 if (ret < 0)
1398 return log_error_errno(-1, errno, "Failed to change to new rootfs \"%s\"", rootfs);
39c7b795 1399
8ce1abc2
CB
1400 /* pivot_root into our new root fs */
1401 ret = pivot_root(".", ".");
55022530
CB
1402 if (ret < 0)
1403 return log_error_errno(-1, errno, "Failed to pivot_root()");
39c7b795 1404
8ce1abc2
CB
1405 /* At this point the old-root is mounted on top of our new-root. To
1406 * unmounted it we must not be chdir'd into it, so escape back to
1407 * old-root.
1408 */
1409 ret = fchdir(oldroot);
55022530
CB
1410 if (ret < 0)
1411 return log_error_errno(-1, errno, "Failed to enter old root directory");
c69bd12f 1412
9e61fb1f 1413 /* Make oldroot a depedent mount to make sure our umounts don't propagate to the
8ce1abc2
CB
1414 * host.
1415 */
1416 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
55022530 1417 if (ret < 0)
9e61fb1f 1418 return log_error_errno(-1, errno, "Failed to recursively turn old root mount tree into dependent mount");
8ce1abc2
CB
1419
1420 ret = umount2(".", MNT_DETACH);
55022530
CB
1421 if (ret < 0)
1422 return log_error_errno(-1, errno, "Failed to detach old root directory");
8ce1abc2
CB
1423
1424 ret = fchdir(newroot);
55022530
CB
1425 if (ret < 0)
1426 return log_error_errno(-1, errno, "Failed to re-enter new root directory");
8ce1abc2 1427
8ce1abc2
CB
1428 TRACE("pivot_root(\"%s\") successful", rootfs);
1429
b0d7aac4 1430 return 0;
0ad19a3f 1431}
1432
8ce1abc2
CB
1433static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1434{
55022530
CB
1435 if (!rootfs->path)
1436 return log_debug(0, "Container does not have a rootfs");
8ce1abc2
CB
1437
1438 if (detect_ramfs_rootfs())
1439 return lxc_chroot(rootfs);
1440
1441 return lxc_pivot_root(rootfs->mount);
0ad19a3f 1442}
1443
7581a82f 1444static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
8ce1abc2
CB
1445 unsigned id,
1446 enum idtype idtype)
f4900711
CB
1447{
1448 struct lxc_list *it;
1449 struct id_map *map;
1450 struct id_map *retmap = NULL;
1451
dcf0ffdf
CB
1452 /* Shortcut for container's root mappings. */
1453 if (id == 0) {
1454 if (idtype == ID_TYPE_UID)
1455 return conf->root_nsuid_map;
1456
1457 if (idtype == ID_TYPE_GID)
1458 return conf->root_nsgid_map;
1459 }
1460
f4900711
CB
1461 lxc_list_for_each(it, &conf->id_map) {
1462 map = it->elem;
1463 if (map->idtype != idtype)
1464 continue;
1465
1466 if (id >= map->nsid && id < map->nsid + map->range) {
1467 retmap = map;
1468 break;
1469 }
1470 }
1471
1472 return retmap;
1473}
1474
f797f05e 1475static int lxc_setup_devpts(struct lxc_handler *handler)
3c26f34e 1476{
f797f05e 1477 __do_close int devpts_fd = -EBADF;
70761e5e 1478 int ret;
ce155c60 1479 char **opts;
9d28c4f9 1480 char devpts_mntopts[256];
ce155c60
CB
1481 char *mntopt_sets[5];
1482 char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
f797f05e
CB
1483 struct lxc_conf *conf = handler->conf;
1484 int sock = handler->data_sock[0];
77890c6d 1485
55022530
CB
1486 if (conf->pty_max <= 0)
1487 return log_debug(0, "No new devpts instance will be mounted since no pts devices are requested");
3c26f34e 1488
e528c735
CB
1489 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1490 default_devpts_mntopts, conf->pty_max);
9d28c4f9
CB
1491 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1492 return -1;
1493
29a7b484 1494 (void)umount2("/dev/pts", MNT_DETACH);
7e40254a 1495
70761e5e
CB
1496 /* Create mountpoint for devpts instance. */
1497 ret = mkdir("/dev/pts", 0755);
55022530
CB
1498 if (ret < 0 && errno != EEXIST)
1499 return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");
3c26f34e 1500
ce155c60
CB
1501 /* gid=5 && max= */
1502 mntopt_sets[0] = devpts_mntopts;
dfbd4730 1503
ce155c60 1504 /* !gid=5 && max= */
6333c915 1505 mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1506
1507 /* gid=5 && !max= */
1508 mntopt_sets[2] = default_devpts_mntopts;
1509
1510 /* !gid=5 && !max= */
6333c915 1511 mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
ce155c60
CB
1512
1513 /* end */
1514 mntopt_sets[4] = NULL;
1515
1516 for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
1517 /* mount new devpts instance */
1518 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
1519 if (ret == 0)
1520 break;
1521 }
1522
55022530
CB
1523 if (ret < 0)
1524 return log_error_errno(-1, errno, "Failed to mount new devpts instance");
ce155c60 1525 DEBUG("Mount new devpts instance with options \"%s\"", *opts);
70761e5e 1526
f797f05e
CB
1527 devpts_fd = open_tree(-EBADF, "/dev/pts", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
1528 if (devpts_fd < 0) {
1529 TRACE("Failed to create detached devpts mount");
1530 ret = lxc_abstract_unix_send_fds(sock, NULL, 0, NULL, 0);
1531 } else {
1532 ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
1533 }
1534 if (ret < 0)
1535 return log_error_errno(-1, errno, "Failed to send devpts fd to parent");
1536
d5cb35d6 1537 /* Remove any pre-existing /dev/ptmx file. */
b29e05d6
CB
1538 ret = remove("/dev/ptmx");
1539 if (ret < 0) {
55022530
CB
1540 if (errno != ENOENT)
1541 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\" file");
b29e05d6 1542 } else {
0fd73091 1543 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1544 }
1545
d5cb35d6 1546 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
3b7e332f 1547 ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
55022530
CB
1548 if (ret < 0 && errno != EEXIST)
1549 return log_error_errno(-1, errno, "Failed to create dummy \"/dev/ptmx\" file as bind mount target");
0fd73091 1550 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1551
d5cb35d6 1552 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1553 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
55022530
CB
1554 if (!ret)
1555 return log_debug(0, "Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1556 else
d5cb35d6 1557 /* Fallthrough and try to create a symlink. */
0fd73091 1558 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1559
1560 /* Remove the dummy /dev/ptmx file we created above. */
1561 ret = remove("/dev/ptmx");
55022530
CB
1562 if (ret < 0)
1563 return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1564
1565 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1566 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
55022530
CB
1567 if (ret < 0)
1568 return log_error_errno(-1, errno, "Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
0fd73091 1569 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1570
3c26f34e 1571 return 0;
1572}
1573
cccc74b5
DL
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave unchanged". When the build has no
 * <sys/personality.h> support this is a no-op. Returns 0 on success or when
 * nothing had to be done, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	/* Declared here so builds without personality support have no unused local. */
	int ret;

	if (persona == -1)
		return 0;

	ret = personality(persona);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to set personality to \"0x%x\"", persona);

	INFO("Set personality to \"0x%x\"", persona);
#endif

	return 0;
}
1591
efbfe93f
CB
1592static inline bool wants_console(const struct lxc_terminal *terminal)
1593{
1594 return !terminal->path || strcmp(terminal->path, "none");
1595}
1596
3d7d929a 1597static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
cf68ffd9 1598 const struct lxc_terminal *console,
41808e20 1599 int pty_mnt_fd)
6e590161 1600{
882671aa 1601 int ret;
6b5a54cd 1602 char path[PATH_MAX];
86530b0a 1603 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1604
efbfe93f 1605 if (!wants_console(console))
8b1b1210
CB
1606 return 0;
1607
86530b0a 1608 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3d7d929a 1609 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1610 return -1;
52e35957 1611
cf68ffd9
CB
1612 /*
1613 * When we are asked to setup a console we remove any previous
8b1b1210
CB
1614 * /dev/console bind-mounts.
1615 */
a7ba3c7f
CB
1616 if (file_exists(path)) {
1617 ret = lxc_unstack_mountpoint(path, false);
55022530
CB
1618 if (ret < 0)
1619 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1620 else
86530b0a 1621 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
8b1b1210
CB
1622 }
1623
cf68ffd9
CB
1624 /*
1625 * For unprivileged containers autodev or automounts will already have
8b1b1210
CB
1626 * taken care of creating /dev/console.
1627 */
882671aa 1628 ret = mknod(path, S_IFREG | 0000, 0);
55022530
CB
1629 if (ret < 0 && errno != EEXIST)
1630 return log_error_errno(-errno, errno, "Failed to create console");
52e35957 1631
41808e20 1632 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
55022530
CB
1633 if (ret < 0)
1634 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
13954cce 1635
41808e20
CB
1636 if (pty_mnt_fd >= 0) {
1637 ret = move_mount(pty_mnt_fd, "", -EBADF, path, MOVE_MOUNT_F_EMPTY_PATH);
efbfe93f
CB
1638 if (!ret) {
1639 DEBUG("Moved mount \"%s\" onto \"%s\"", console->name, path);
1640 goto finish;
1641 }
1642
1643 if (ret && errno != ENOSYS)
1644 return log_error_errno(-1, errno,
1645 "Failed to mount %d(%s) on \"%s\"",
41808e20 1646 pty_mnt_fd, console->name, path);
efbfe93f
CB
1647 }
1648
1649 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
55022530 1650 if (ret < 0)
41808e20 1651 return log_error_errno(-1, errno, "Failed to mount %d(%s) on \"%s\"", pty_mnt_fd, console->name, path);
6e590161 1652
efbfe93f 1653finish:
41808e20 1654 DEBUG("Mounted pty device %d(%s) onto \"%s\"", pty_mnt_fd, console->name, path);
7c6ef2a2
SH
1655 return 0;
1656}
1657
3d7d929a 1658static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
dcad02f8 1659 const struct lxc_terminal *console,
41808e20 1660 char *ttydir, int pty_mnt_fd)
7c6ef2a2 1661{
3b7e332f 1662 int ret;
6b5a54cd 1663 char path[PATH_MAX], lxcpath[PATH_MAX];
86530b0a 1664 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2 1665
efbfe93f 1666 if (!wants_console(console))
3dc035f1
L
1667 return 0;
1668
7c6ef2a2 1669 /* create rootfs/dev/<ttydir> directory */
86530b0a 1670 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
3d7d929a 1671 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1672 return -1;
3d7d929a 1673
7c6ef2a2 1674 ret = mkdir(path, 0755);
55022530
CB
1675 if (ret && errno != EEXIST)
1676 return log_error_errno(-errno, errno, "Failed to create \"%s\"", path);
4742cd9a 1677 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1678
86530b0a 1679 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
3d7d929a
CB
1680 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1681 return -1;
1682
3b7e332f 1683 ret = mknod(lxcpath, S_IFREG | 0000, 0);
55022530
CB
1684 if (ret < 0 && errno != EEXIST)
1685 return log_error_errno(-errno, errno, "Failed to create \"%s\"", lxcpath);
7c6ef2a2 1686
86530b0a 1687 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3dc035f1 1688 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1689 return -1;
2a12fefd 1690
3dc035f1 1691 if (file_exists(path)) {
a7ba3c7f 1692 ret = lxc_unstack_mountpoint(path, false);
55022530
CB
1693 if (ret < 0)
1694 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1695 else
86530b0a 1696 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
3dc035f1 1697 }
2a12fefd 1698
3b7e332f 1699 ret = mknod(path, S_IFREG | 0000, 0);
55022530
CB
1700 if (ret < 0 && errno != EEXIST)
1701 return log_error_errno(-errno, errno, "Failed to create console");
7c6ef2a2 1702
41808e20 1703 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
55022530
CB
1704 if (ret < 0)
1705 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
2a12fefd 1706
3dc035f1 1707 /* bind mount console->name to '/dev/<ttydir>/console' */
41808e20
CB
1708 if (pty_mnt_fd >= 0) {
1709 ret = move_mount(pty_mnt_fd, "", -EBADF, lxcpath, MOVE_MOUNT_F_EMPTY_PATH);
efbfe93f
CB
1710 if (!ret) {
1711 DEBUG("Moved mount \"%s\" onto \"%s\"", console->name, lxcpath);
1712 goto finish;
1713 }
1714
1715 if (ret && errno != ENOSYS)
1716 return log_error_errno(-1, errno,
1717 "Failed to mount %d(%s) on \"%s\"",
41808e20 1718 pty_mnt_fd, console->name, lxcpath);
efbfe93f
CB
1719 }
1720
1721 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
55022530 1722 if (ret < 0)
41808e20 1723 return log_error_errno(-1, errno, "Failed to mount %d(%s) on \"%s\"", pty_mnt_fd, console->name, lxcpath);
86530b0a 1724 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1725
efbfe93f 1726finish:
3dc035f1 1727 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
86530b0a 1728 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
55022530
CB
1729 if (ret < 0)
1730 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
86530b0a 1731 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1732
86530b0a 1733 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1734 return 0;
1735}
1736
/*
 * Set up the container console: directly at /dev/console when no @ttydir is
 * given, otherwise below /dev/<ttydir> with a bind mount onto /dev/console.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir,
			     int pty_mnt_fd)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir, pty_mnt_fd)
		      : lxc_setup_dev_console(rootfs, console, pty_mnt_fd);
}
1747
a08bfbe3 1748static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676 1749{
a08bfbe3 1750 ssize_t ret;
998ac676 1751
85c2de39
MB
1752 /* If '=' is contained in opt, the option must go into data. */
1753 if (!strchr(opt, '=')) {
a08bfbe3
CB
1754 /*
1755 * If opt is found in mount_opt, set or clear flags.
1756 * Otherwise append it to data.
1757 */
85c2de39 1758 size_t opt_len = strlen(opt);
a08bfbe3 1759 for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) {
85c2de39 1760 size_t mo_name_len = strlen(mo->name);
a08bfbe3 1761
85c2de39
MB
1762 if (opt_len == mo_name_len && strncmp(opt, mo->name, mo_name_len) == 0) {
1763 if (mo->clear)
1764 *flags &= ~mo->flag;
1765 else
1766 *flags |= mo->flag;
a08bfbe3 1767 return 0;
85c2de39 1768 }
998ac676
RT
1769 }
1770 }
1771
a08bfbe3
CB
1772 if (strlen(*data)) {
1773 ret = strlcat(*data, ",", size);
1774 if (ret < 0)
1775 return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
1776 }
1777
1778 ret = strlcat(*data, opt, size);
1779 if (ret < 0)
1780 return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
efed99a4 1781
a08bfbe3 1782 return 0;
998ac676
RT
1783}
1784
0fd73091 1785int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
998ac676 1786{
a08bfbe3
CB
1787 __do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
1788 char *mntopt_cur = NULL;
efed99a4 1789 size_t size;
998ac676 1790
a08bfbe3
CB
1791 if (*mntdata || *mntflags)
1792 return ret_errno(EINVAL);
911324ef
DL
1793
1794 if (!mntopts)
998ac676
RT
1795 return 0;
1796
a08bfbe3
CB
1797 mntopts_dup = strdup(mntopts);
1798 if (!mntopts_dup)
1799 return ret_errno(ENOMEM);
998ac676 1800
a08bfbe3
CB
1801 size = strlen(mntopts_dup) + 1;
1802 mntopts_new = zalloc(size);
1803 if (!mntopts_new)
1804 return ret_errno(ENOMEM);
998ac676 1805
a08bfbe3
CB
1806 lxc_iterate_parts(mntopt_cur, mntopts_dup, ",")
1807 if (parse_mntopt(mntopt_cur, mntflags, &mntopts_new, size) < 0)
1808 return ret_errno(EINVAL);
998ac676 1809
a08bfbe3
CB
1810 if (*mntopts_new)
1811 *mntdata = move_ptr(mntopts_new);
998ac676
RT
1812
1813 return 0;
1814}
1815
d840039e
YT
1816static void parse_propagationopt(char *opt, unsigned long *flags)
1817{
1818 struct mount_opt *mo;
1819
1820 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1821 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1822 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1823 continue;
1824
1825 if (mo->clear)
1826 *flags &= ~mo->flag;
1827 else
1828 *flags |= mo->flag;
1829
1830 return;
d840039e
YT
1831 }
1832}
1833
8ce1abc2 1834int parse_propagationopts(const char *mntopts, unsigned long *pflags)
d840039e 1835{
dfd2e059
CB
1836 __do_free char *s = NULL;
1837 char *p;
d840039e
YT
1838
1839 if (!mntopts)
1840 return 0;
1841
1842 s = strdup(mntopts);
55022530
CB
1843 if (!s)
1844 return log_error_errno(-ENOMEM, errno, "Failed to allocate memory");
d840039e 1845
0fd73091 1846 *pflags = 0L;
8db9d26f 1847 lxc_iterate_parts(p, s, ",")
d840039e 1848 parse_propagationopt(p, pflags);
0fd73091 1849
d840039e
YT
1850 return 0;
1851}
1852
6fd5e769
SH
/* Terminate @word in place at its first space or tab (or leave it as is
 * when it contains neither).
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1859
/* Skip @nfields space/tab separated fields in @src and return a pointer to
 * what follows. Exactly one separator is consumed per field; when the string
 * ends early a pointer to its terminating NUL is returned.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		/* Advance to the separator ending the current field. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;

		cur++;
		skipped++;
	}

	return cur;
}
1878
911324ef
DL
1879static int mount_entry(const char *fsname, const char *target,
1880 const char *fstype, unsigned long mountflags,
d840039e
YT
1881 unsigned long pflags, const char *data, bool optional,
1882 bool dev, bool relative, const char *rootfs)
911324ef 1883{
0ac4b28a 1884 int ret;
6b5a54cd 1885 char srcbuf[PATH_MAX];
181437fd 1886 const char *srcpath = fsname;
614305f3 1887#ifdef HAVE_STATVFS
2938f7c8 1888 struct statvfs sb;
614305f3 1889#endif
2938f7c8 1890
181437fd 1891 if (relative) {
55022530
CB
1892 ret = snprintf(srcbuf, sizeof(srcbuf), "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
1893 if (ret < 0 || ret >= sizeof(srcbuf))
1894 return log_error_errno(-1, errno, "source path is too long");
181437fd
YT
1895 srcpath = srcbuf;
1896 }
1897
1898 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
0ac4b28a
CB
1899 rootfs);
1900 if (ret < 0) {
55022530
CB
1901 if (optional)
1902 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
1903 srcpath ? srcpath : "(null)", target);
0ac4b28a 1904
55022530
CB
1905 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
1906 srcpath ? srcpath : "(null)", target);
911324ef
DL
1907 }
1908
1909 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
0ac4b28a 1910
55022530
CB
1911 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
1912 srcpath ? srcpath : "(none)", target ? target : "(none)");
0ac4b28a 1913
614305f3 1914#ifdef HAVE_STATVFS
181437fd 1915 if (srcpath && statvfs(srcpath, &sb) == 0) {
94bef7e4
TA
1916 unsigned long required_flags = 0;
1917
2938f7c8
SH
1918 if (sb.f_flag & MS_NOSUID)
1919 required_flags |= MS_NOSUID;
0ac4b28a 1920
ae7a770e 1921 if (sb.f_flag & MS_NODEV && !dev)
2938f7c8 1922 required_flags |= MS_NODEV;
0ac4b28a 1923
2938f7c8
SH
1924 if (sb.f_flag & MS_RDONLY)
1925 required_flags |= MS_RDONLY;
0ac4b28a 1926
2938f7c8
SH
1927 if (sb.f_flag & MS_NOEXEC)
1928 required_flags |= MS_NOEXEC;
0ac4b28a 1929
55022530
CB
1930 DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
1931 srcpath, sb.f_flag, required_flags);
0ac4b28a
CB
1932
1933 /* If this was a bind mount request, and required_flags
2938f7c8 1934 * does not have any flags which are not already in
0ac4b28a 1935 * mountflags, then skip the remount.
2938f7c8 1936 */
94bef7e4
TA
1937 if (!(mountflags & MS_REMOUNT) &&
1938 (!(required_flags & ~mountflags) && !(mountflags & MS_RDONLY))) {
15f3e22b
CB
1939 DEBUG("Mountflags already were %lu, skipping remount", mountflags);
1940 goto skipremount;
2938f7c8 1941 }
0ac4b28a 1942
2938f7c8 1943 mountflags |= required_flags;
6fd5e769 1944 }
614305f3 1945#endif
911324ef 1946
181437fd 1947 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
0ac4b28a 1948 if (ret < 0) {
55022530
CB
1949 if (optional)
1950 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
1951 srcpath ? srcpath : "(null)",
1952 target);
1953
1954 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
1955 srcpath ? srcpath : "(null)",
1956 target);
911324ef
DL
1957 }
1958 }
1959
a3ed9b81 1960#ifdef HAVE_STATVFS
1961skipremount:
1962#endif
d840039e
YT
1963 if (pflags) {
1964 ret = mount(NULL, target, NULL, pflags, NULL);
1965 if (ret < 0) {
55022530
CB
1966 if (optional)
1967 return log_info_errno(0, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
1968 else
1969 return log_error_errno(-1, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
d840039e
YT
1970 }
1971 DEBUG("Changed mount propagation for \"%s\"", target);
1972 }
1973
0103eb53 1974 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
181437fd 1975 srcpath ? srcpath : "(null)", target, fstype);
911324ef
DL
1976
1977 return 0;
1978}
1979
/* Remove the LXC-only options "create=dir", "create=file", "optional" and
 * "relative" from mntent->mnt_opts, in place.
 *
 * Options are compared as whole comma-separated tokens, so an option that
 * merely contains one of these words is left alone. Every occurrence is
 * removed, and no leading, trailing or doubled commas are left behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
		NULL
	};
	char *in, *out;

	if (!mntent || !mntent->mnt_opts)
		return;

	/* Compact the kept tokens towards the front; out never passes in. */
	in = mntent->mnt_opts;
	out = mntent->mnt_opts;
	while (*in != '\0') {
		size_t len = strcspn(in, ",");
		int drop = (len == 0);

		for (size_t i = 0; !drop && culled[i]; i++)
			drop = strlen(culled[i]) == len &&
			       strncmp(in, culled[i], len) == 0;

		if (!drop) {
			if (out != mntent->mnt_opts)
				*out++ = ',';
			memmove(out, in, len);
			out += len;
		}

		in += len;
		if (*in == ',')
			in++;
	}

	*out = '\0';
}
2009
4d5b72a1 2010static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2011 const char *path,
2012 const struct lxc_rootfs *rootfs,
0fd73091 2013 const char *lxc_name, const char *lxc_path)
0ad19a3f 2014{
7a76eeaa 2015 __do_free char *p1 = NULL;
3b7e332f 2016 int ret;
7a76eeaa 2017 char *p2;
911324ef 2018
12e6ab5d 2019 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
749f98d9 2020 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2021 if (ret < 0)
2022 return -1;
2023 }
6e46cc0d 2024
34cfffb3 2025 if (hasmntopt(mntent, "create=dir")) {
749f98d9 2026 ret = mkdir_p(path, 0755);
55022530
CB
2027 if (ret < 0 && errno != EEXIST)
2028 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
34cfffb3
SG
2029 }
2030
0fd73091
CB
2031 if (!hasmntopt(mntent, "create=file"))
2032 return 0;
749f98d9 2033
0fd73091
CB
2034 ret = access(path, F_OK);
2035 if (ret == 0)
2036 return 0;
749f98d9 2037
0fd73091
CB
2038 p1 = strdup(path);
2039 if (!p1)
2040 return -1;
749f98d9 2041
0fd73091 2042 p2 = dirname(p1);
749f98d9 2043
0fd73091 2044 ret = mkdir_p(p2, 0755);
55022530
CB
2045 if (ret < 0 && errno != EEXIST)
2046 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
749f98d9 2047
3b7e332f
CB
2048 ret = mknod(path, S_IFREG | 0000, 0);
2049 if (ret < 0 && errno != EEXIST)
2050 return -errno;
0fd73091 2051
749f98d9 2052 return 0;
4d5b72a1
NC
2053}
2054
ec50007f
CB
2055/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2056 * without a rootfs. */
db4aba38 2057static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2058 const char *path,
2059 const struct lxc_rootfs *rootfs,
2060 const char *lxc_name,
2061 const char *lxc_path)
4d5b72a1 2062{
fd214f37 2063 __do_free char *mntdata = NULL;
a08bfbe3
CB
2064 unsigned long mntflags = 0, pflags = 0;
2065 char *rootfs_path = NULL;
d8b712bc 2066 int ret;
181437fd 2067 bool dev, optional, relative;
d8b712bc
CB
2068
2069 optional = hasmntopt(mntent, "optional") != NULL;
2070 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2071 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2072
ec50007f
CB
2073 if (rootfs && rootfs->path)
2074 rootfs_path = rootfs->mount;
2075
d8b712bc
CB
2076 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2077 lxc_path);
2078 if (ret < 0) {
2079 if (optional)
2080 return 0;
608e3567 2081
d8b712bc
CB
2082 return -1;
2083 }
4e4ca161
SH
2084 cull_mntent_opt(mntent);
2085
d840039e
YT
2086 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2087 if (ret < 0)
2088 return -1;
2089
d8b712bc
CB
2090 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2091 if (ret < 0)
a08bfbe3 2092 return ret;
a17b1e65 2093
6e46cc0d 2094 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2095 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2096
911324ef
DL
2097 return ret;
2098}
2099
db4aba38
NC
2100static inline int mount_entry_on_systemfs(struct mntent *mntent)
2101{
1433c9f9 2102 int ret;
6b5a54cd 2103 char path[PATH_MAX];
1433c9f9
CB
2104
2105 /* For containers created without a rootfs all mounts are treated as
07667a6a
CB
2106 * absolute paths starting at / on the host.
2107 */
1433c9f9
CB
2108 if (mntent->mnt_dir[0] != '/')
2109 ret = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);
2110 else
2111 ret = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
07667a6a 2112 if (ret < 0 || ret >= sizeof(path))
1433c9f9 2113 return -1;
1433c9f9
CB
2114
2115 return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
db4aba38
NC
2116}
2117
4e4ca161 2118static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2119 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2120 const char *lxc_name,
2121 const char *lxc_path)
911324ef 2122{
bdd2b34c 2123 int offset;
013bd428 2124 char *aux;
67e571de 2125 const char *lxcpath;
6b5a54cd 2126 char path[PATH_MAX];
bdd2b34c 2127 int ret = 0;
0ad19a3f 2128
593e8478 2129 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2130 if (!lxcpath)
2a59a681 2131 return -1;
2a59a681 2132
bdd2b34c
CB
2133 /* If rootfs->path is a blockdev path, allow container fstab to use
2134 * <lxcpath>/<name>/rootfs" as the target prefix.
2135 */
6b5a54cd
CB
2136 ret = snprintf(path, PATH_MAX, "%s/%s/rootfs", lxcpath, lxc_name);
2137 if (ret < 0 || ret >= PATH_MAX)
80a881b2
SH
2138 goto skipvarlib;
2139
2140 aux = strstr(mntent->mnt_dir, path);
2141 if (aux) {
2142 offset = strlen(path);
2143 goto skipabs;
2144 }
2145
2146skipvarlib:
013bd428 2147 aux = strstr(mntent->mnt_dir, rootfs->path);
55022530
CB
2148 if (!aux)
2149 return log_warn(ret, "Ignoring mount point \"%s\"", mntent->mnt_dir);
80a881b2
SH
2150 offset = strlen(rootfs->path);
2151
2152skipabs:
6b5a54cd
CB
2153 ret = snprintf(path, PATH_MAX, "%s/%s", rootfs->mount, aux + offset);
2154 if (ret < 0 || ret >= PATH_MAX)
a17b1e65 2155 return -1;
a17b1e65 2156
0a2dddd4 2157 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2158}
d330fe7b 2159
4e4ca161 2160static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2161 const struct lxc_rootfs *rootfs,
2162 const char *lxc_name,
2163 const char *lxc_path)
911324ef 2164{
911324ef 2165 int ret;
6b5a54cd 2166 char path[PATH_MAX];
d330fe7b 2167
34cfffb3 2168 /* relative to root mount point */
6e46cc0d 2169 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
0fd73091 2170 if (ret < 0 || (size_t)ret >= sizeof(path))
9ba8130c 2171 return -1;
911324ef 2172
0a2dddd4 2173 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2174}
2175
06749971
CB
2176static int mount_file_entries(const struct lxc_conf *conf,
2177 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2178 const char *lxc_name, const char *lxc_path)
911324ef 2179{
9d03d857 2180 char buf[PATH_MAX];
0fd73091 2181 struct mntent mntent;
e76b8764 2182
aaf901be 2183 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
9d03d857
CB
2184 int ret;
2185
1ae3c19f
CB
2186 if (!rootfs->path)
2187 ret = mount_entry_on_systemfs(&mntent);
2188 else if (mntent.mnt_dir[0] != '/')
2189 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2190 lxc_name, lxc_path);
2191 else
2192 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
9d03d857 2193 lxc_name, lxc_path);
1ae3c19f
CB
2194 if (ret < 0)
2195 return -1;
0ad19a3f 2196 }
cd54d859 2197
55022530
CB
2198 if (!feof(file) || ferror(file))
2199 return log_error(-1, "Failed to parse mount entries");
9d03d857
CB
2200
2201 return 0;
e7938e9e
MN
2202}
2203
55022530
CB
/* Cleanup handler: close the mntent stream *f with endmntent() unless it is
 * NULL.
 */
static inline void __auto_endmntent__(FILE **f)
{
	if (!*f)
		return;

	endmntent(*f);
}
2209
2210#define __do_endmntent __attribute__((__cleanup__(__auto_endmntent__)))
2211
06749971
CB
2212static int setup_mount(const struct lxc_conf *conf,
2213 const struct lxc_rootfs *rootfs, const char *fstab,
42dff448 2214 const char *lxc_name, const char *lxc_path)
e7938e9e 2215{
55022530 2216 __do_endmntent FILE *f = NULL;
e7938e9e
MN
2217 int ret;
2218
2219 if (!fstab)
2220 return 0;
2221
55022530
CB
2222 f = setmntent(fstab, "re");
2223 if (!f)
2224 return log_error_errno(-1, errno, "Failed to open \"%s\"", fstab);
e7938e9e 2225
06749971 2226 ret = mount_file_entries(conf, rootfs, f, lxc_name, lxc_path);
42dff448
CB
2227 if (ret < 0)
2228 ERROR("Failed to set up mount entries");
e7938e9e 2229
0ad19a3f 2230 return ret;
2231}
2232
1800f924
WB
2233/*
2234 * In order for nested containers to be able to mount /proc and /sys they need
2235 * to see a "pure" proc and sysfs mount points with nothing mounted on top
2236 * (like lxcfs).
2237 * For this we provide proc and sysfs in /dev/.lxc/{proc,sys} while using an
2238 * apparmor rule to deny access to them. This is mostly for convenience: The
2239 * container's root user can mount them anyway and thus has access to the two
2240 * file systems. But a non-root user in the container should not be allowed to
2241 * access them as a side effect without explicitly allowing it.
2242 */
2243static const char nesting_helpers[] =
dc691e34
CB
2244"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
2245"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
1800f924
WB
2246
2247FILE *make_anonymous_mount_file(struct lxc_list *mount,
2248 bool include_nesting_helpers)
e7938e9e 2249{
f62cf1d4 2250 __do_close int fd = -EBADF;
4110345b 2251 FILE *f;
5ef5c9a3 2252 int ret;
e7938e9e 2253 char *mount_entry;
5ef5c9a3 2254 struct lxc_list *iterator;
5ef5c9a3 2255
0fd73091 2256 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2257 if (fd < 0) {
a324e7eb
CB
2258 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2259
5ef5c9a3
CB
2260 if (errno != ENOSYS)
2261 return NULL;
a324e7eb
CB
2262
2263 fd = lxc_make_tmpfile(template, true);
55022530
CB
2264 if (fd < 0)
2265 return log_error_errno(NULL, errno, "Could not create temporary mount file");
0fd73091 2266
6bd04140 2267 TRACE("Created temporary mount file");
5ef5c9a3 2268 }
e7938e9e 2269
0fd73091
CB
2270 lxc_list_for_each (iterator, mount) {
2271 size_t len;
2272
e7938e9e 2273 mount_entry = iterator->elem;
0fd73091 2274 len = strlen(mount_entry);
5ef5c9a3 2275
489f39be 2276 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091 2277 if (ret != len)
79bcf5ee 2278 return NULL;
0fd73091 2279
489f39be 2280 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091 2281 if (ret != 1)
79bcf5ee 2282 return NULL;
e7938e9e
MN
2283 }
2284
1800f924
WB
2285 if (include_nesting_helpers) {
2286 ret = lxc_write_nointr(fd, nesting_helpers,
6333c915
CB
2287 STRARRAYLEN(nesting_helpers));
2288 if (ret != STRARRAYLEN(nesting_helpers))
79bcf5ee 2289 return NULL;
1800f924
WB
2290 }
2291
0fd73091
CB
2292 ret = lseek(fd, 0, SEEK_SET);
2293 if (ret < 0)
79bcf5ee 2294 return NULL;
0fd73091 2295
4110345b
CB
2296 f = fdopen(fd, "re+");
2297 if (f)
2298 move_fd(fd); /* Transfer ownership of fd. */
2299 return f;
9fc7f8c0
TA
2300}
2301
06749971
CB
2302static int setup_mount_entries(const struct lxc_conf *conf,
2303 const struct lxc_rootfs *rootfs,
5ef5c9a3
CB
2304 struct lxc_list *mount, const char *lxc_name,
2305 const char *lxc_path)
9fc7f8c0 2306{
c85ced65 2307 __do_fclose FILE *f = NULL;
9fc7f8c0 2308
1800f924 2309 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
19b5d755 2310 if (!f)
9fc7f8c0 2311 return -1;
e7938e9e 2312
c85ced65 2313 return mount_file_entries(conf, rootfs, f, lxc_name, lxc_path);
e7938e9e
MN
2314}
2315
bab88e68
CS
2316static int parse_cap(const char *cap)
2317{
84760c11 2318 size_t i;
2319 int capid = -1;
0fd73091
CB
2320 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2321 char *ptr = NULL;
bab88e68 2322
0fd73091 2323 if (strcmp(cap, "none") == 0)
7035407c
DE
2324 return -2;
2325
8560cd36 2326 for (i = 0; i < end; i++) {
bab88e68
CS
2327 if (strcmp(cap, caps_opt[i].name))
2328 continue;
2329
2330 capid = caps_opt[i].value;
2331 break;
2332 }
2333
2334 if (capid < 0) {
0fd73091
CB
2335 /* Try to see if it's numeric, so the user may specify
2336 * capabilities that the running kernel knows about but we
2337 * don't
2338 */
bab88e68
CS
2339 errno = 0;
2340 capid = strtol(cap, &ptr, 10);
2341 if (!ptr || *ptr != '\0' || errno != 0)
2342 /* not a valid number */
2343 capid = -1;
2344 else if (capid > lxc_caps_last_cap())
2345 /* we have a number but it's not a valid
2346 * capability */
2347 capid = -1;
2348 }
2349
2350 return capid;
2351}
2352
0769b82a
CS
2353int in_caplist(int cap, struct lxc_list *caps)
2354{
0769b82a 2355 int capid;
0fd73091 2356 struct lxc_list *iterator;
0769b82a 2357
0fd73091 2358 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2359 capid = parse_cap(iterator->elem);
2360 if (capid == cap)
2361 return 1;
2362 }
2363
2364 return 0;
2365}
2366
81810dd1
DL
2367static int setup_caps(struct lxc_list *caps)
2368{
bab88e68 2369 int capid;
0fd73091
CB
2370 char *drop_entry;
2371 struct lxc_list *iterator;
81810dd1 2372
0fd73091
CB
2373 lxc_list_for_each (iterator, caps) {
2374 int ret;
81810dd1
DL
2375
2376 drop_entry = iterator->elem;
2377
bab88e68 2378 capid = parse_cap(drop_entry);
55022530
CB
2379 if (capid < 0)
2380 return log_error(-1, "unknown capability %s", drop_entry);
81810dd1 2381
b81689a1
CB
2382 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2383 prctl_arg(0), prctl_arg(0));
55022530
CB
2384 if (ret < 0)
2385 return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
0fd73091 2386 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2387 }
2388
0fd73091 2389 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2390 return 0;
2391}
2392
2393static int dropcaps_except(struct lxc_list *caps)
2394{
2f443e88 2395 __do_free int *caplist = NULL;
0fd73091 2396 int i, capid, numcaps;
1fb86a7c 2397 char *keep_entry;
0fd73091 2398 struct lxc_list *iterator;
1fb86a7c 2399
0fd73091 2400 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2401 if (numcaps <= 0 || numcaps > 200)
2402 return -1;
0fd73091 2403 TRACE("Found %d capabilities", numcaps);
2caf9a97 2404
1a0e70ac 2405 /* caplist[i] is 1 if we keep capability i */
2f443e88 2406 caplist = must_realloc(NULL, numcaps * sizeof(int));
1fb86a7c
SH
2407 memset(caplist, 0, numcaps * sizeof(int));
2408
0fd73091 2409 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2410 keep_entry = iterator->elem;
2411
bab88e68 2412 capid = parse_cap(keep_entry);
7035407c
DE
2413 if (capid == -2)
2414 continue;
2415
55022530
CB
2416 if (capid < 0)
2417 return log_error(-1, "Unknown capability %s", keep_entry);
1fb86a7c 2418
0fd73091 2419 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2420 caplist[capid] = 1;
2421 }
0fd73091
CB
2422
2423 for (i = 0; i < numcaps; i++) {
2424 int ret;
2425
1fb86a7c
SH
2426 if (caplist[i])
2427 continue;
0fd73091 2428
b81689a1
CB
2429 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2430 prctl_arg(0), prctl_arg(0));
55022530
CB
2431 if (ret < 0)
2432 return log_error_errno(-1, errno, "Failed to remove capability %d", i);
1fb86a7c
SH
2433 }
2434
0fd73091 2435 DEBUG("Capabilities have been setup");
81810dd1
DL
2436 return 0;
2437}
2438
0fd73091
CB
2439static int parse_resource(const char *res)
2440{
2441 int ret;
c6d09e15
WB
2442 size_t i;
2443 int resid = -1;
2444
0fd73091 2445 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2446 if (strcmp(res, limit_opt[i].name) == 0)
2447 return limit_opt[i].value;
c6d09e15 2448
0fd73091 2449 /* Try to see if it's numeric, so the user may specify
c6d09e15 2450 * resources that the running kernel knows about but
0fd73091
CB
2451 * we don't.
2452 */
2453 ret = lxc_safe_int(res, &resid);
2454 if (ret < 0)
2455 return -1;
2456
2457 return resid;
c6d09e15
WB
2458}
2459
0fd73091
CB
2460int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2461{
2462 int resid;
c6d09e15
WB
2463 struct lxc_list *it;
2464 struct lxc_limit *lim;
c6d09e15 2465
0fd73091 2466 lxc_list_for_each (it, limits) {
c6d09e15
WB
2467 lim = it->elem;
2468
2469 resid = parse_resource(lim->resource);
55022530
CB
2470 if (resid < 0)
2471 return log_error(-1, "Unknown resource %s", lim->resource);
c6d09e15 2472
f48b5fd8 2473#if HAVE_PRLIMIT || HAVE_PRLIMIT64
55022530
CB
2474 if (prlimit(pid, resid, &lim->limit, NULL) != 0)
2475 return log_error_errno(-1, errno, "Failed to set limit %s", lim->resource);
2de12765
CB
2476
2477 TRACE("Setup \"%s\" limit", lim->resource);
f48b5fd8 2478#else
55022530 2479 return log_error(-1, "Cannot set limit \"%s\" as prlimit is missing", lim->resource);
f48b5fd8 2480#endif
c6d09e15 2481 }
0fd73091 2482
c6d09e15
WB
2483 return 0;
2484}
2485
7edd0540
L
2486int setup_sysctl_parameters(struct lxc_list *sysctls)
2487{
e6f76452 2488 __do_free char *tmp = NULL;
7edd0540
L
2489 struct lxc_list *it;
2490 struct lxc_sysctl *elem;
0fd73091 2491 int ret = 0;
6b5a54cd 2492 char filename[PATH_MAX] = {0};
7edd0540 2493
0fd73091 2494 lxc_list_for_each (it, sysctls) {
7edd0540
L
2495 elem = it->elem;
2496 tmp = lxc_string_replace(".", "/", elem->key);
55022530
CB
2497 if (!tmp)
2498 return log_error(-1, "Failed to replace key %s", elem->key);
7edd0540
L
2499
2500 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
55022530
CB
2501 if (ret < 0 || (size_t)ret >= sizeof(filename))
2502 return log_error(-1, "Error setting up sysctl parameters path");
7edd0540 2503
0fd73091 2504 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2505 strlen(elem->value), false, 0666);
55022530
CB
2506 if (ret < 0)
2507 return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
2508 elem->key, elem->value);
7edd0540 2509 }
0fd73091 2510
7edd0540
L
2511 return 0;
2512}
2513
61d7a733
YT
2514int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2515{
0c669152 2516 __do_free char *tmp = NULL;
61d7a733
YT
2517 struct lxc_list *it;
2518 struct lxc_proc *elem;
0fd73091 2519 int ret = 0;
6b5a54cd 2520 char filename[PATH_MAX] = {0};
61d7a733 2521
0fd73091 2522 lxc_list_for_each (it, procs) {
61d7a733
YT
2523 elem = it->elem;
2524 tmp = lxc_string_replace(".", "/", elem->filename);
55022530
CB
2525 if (!tmp)
2526 return log_error(-1, "Failed to replace key %s", elem->filename);
61d7a733
YT
2527
2528 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
55022530
CB
2529 if (ret < 0 || (size_t)ret >= sizeof(filename))
2530 return log_error(-1, "Error setting up proc filesystem path");
61d7a733 2531
0fd73091 2532 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2533 strlen(elem->value), false, 0666);
55022530
CB
2534 if (ret < 0)
2535 return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s", elem->filename, elem->value);
61d7a733 2536 }
0fd73091 2537
61d7a733
YT
2538 return 0;
2539}
2540
ae9242c8
SH
2541static char *default_rootfs_mount = LXCROOTFSMOUNT;
2542
7b379ab3 2543struct lxc_conf *lxc_conf_init(void)
089cd8b8 2544{
26ddeedd 2545 int i;
0fd73091 2546 struct lxc_conf *new;
7b379ab3 2547
13277ec4 2548 new = malloc(sizeof(*new));
0fd73091 2549 if (!new)
7b379ab3 2550 return NULL;
7b379ab3
MN
2551 memset(new, 0, sizeof(*new));
2552
4b73005c 2553 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2554 new->personality = -1;
124fa0a8 2555 new->autodev = 1;
3a784510 2556 new->console.buffer_size = 0;
596a818d
DE
2557 new->console.log_path = NULL;
2558 new->console.log_fd = -1;
861813e5 2559 new->console.log_size = 0;
28a4b0e5 2560 new->console.path = NULL;
63376d7d 2561 new->console.peer = -1;
fb87aa6a 2562 new->console.proxy.busy = -1;
36a94ce8 2563 new->console.proxy.ptx = -1;
41808e20 2564 new->console.proxy.pty = -1;
36a94ce8 2565 new->console.ptx = -1;
41808e20 2566 new->console.pty = -1;
63376d7d 2567 new->console.name[0] = '\0';
732375f5 2568 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2569 new->maincmd_fd = -1;
258f8051 2570 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2571 new->nbd_idx = -1;
54c30e29 2572 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2573 if (!new->rootfs.mount) {
53f3f048
SH
2574 free(new);
2575 return NULL;
2576 }
6e54330c 2577 new->rootfs.managed = true;
858377e4 2578 new->logfd = -1;
7b379ab3 2579 lxc_list_init(&new->cgroup);
54860ed0 2580 lxc_list_init(&new->cgroup2);
4bfb655e 2581 lxc_list_init(&new->devices);
7b379ab3
MN
2582 lxc_list_init(&new->network);
2583 lxc_list_init(&new->mount_list);
81810dd1 2584 lxc_list_init(&new->caps);
1fb86a7c 2585 lxc_list_init(&new->keepcaps);
f6d3e3e4 2586 lxc_list_init(&new->id_map);
46ad64ab
CB
2587 new->root_nsuid_map = NULL;
2588 new->root_nsgid_map = NULL;
f979ac15 2589 lxc_list_init(&new->includes);
4184c3e1 2590 lxc_list_init(&new->aliens);
7c661726 2591 lxc_list_init(&new->environment);
c6d09e15 2592 lxc_list_init(&new->limits);
7edd0540 2593 lxc_list_init(&new->sysctls);
61d7a733 2594 lxc_list_init(&new->procs);
44ae0fb6 2595 new->hooks_version = 0;
28d9e29e 2596 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2597 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2598 lxc_list_init(&new->groups);
d39b10eb 2599 lxc_list_init(&new->state_clients);
fe4de9a6 2600 new->lsm_aa_profile = NULL;
1800f924 2601 lxc_list_init(&new->lsm_aa_raw);
fe4de9a6 2602 new->lsm_se_context = NULL;
4fef78bc 2603 new->lsm_se_keyring_context = NULL;
8f818a84 2604 new->keyring_disable_session = false;
7a0bcca3 2605 new->tmp_umount_proc = false;
7a41e857
LT
2606 new->tmp_umount_proc = 0;
2607 new->shmount.path_host = NULL;
2608 new->shmount.path_cont = NULL;
7b379ab3 2609
72bb04e4
PT
2610 /* if running in a new user namespace, init and COMMAND
2611 * default to running as UID/GID 0 when using lxc-execute */
2612 new->init_uid = 0;
2613 new->init_gid = 0;
43654d34 2614 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2615 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
70fd7fc9 2616 memset(&new->timens, 0, sizeof(struct timens_offsets));
c3e3c21a 2617 seccomp_conf_init(new);
72bb04e4 2618
7b379ab3 2619 return new;
089cd8b8
DL
2620}
2621
344c9d81 2622int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2623 size_t buf_size)
f6d3e3e4 2624{
f62cf1d4 2625 __do_close int fd = -EBADF;
76bcd422 2626 int ret;
6b5a54cd 2627 char path[PATH_MAX];
f6d3e3e4 2628
a19b974f 2629 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
f62cf1d4 2630 __do_close int setgroups_fd = -EBADF;
a19b974f 2631
6b5a54cd
CB
2632 ret = snprintf(path, PATH_MAX, "/proc/%d/setgroups", pid);
2633 if (ret < 0 || ret >= PATH_MAX)
a19b974f 2634 return -E2BIG;
a19b974f 2635
76bcd422 2636 setgroups_fd = open(path, O_WRONLY);
55022530
CB
2637 if (setgroups_fd < 0 && errno != ENOENT)
2638 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
a19b974f 2639
76bcd422
CB
2640 if (setgroups_fd >= 0) {
2641 ret = lxc_write_nointr(setgroups_fd, "deny\n",
2642 STRLITERALLEN("deny\n"));
55022530
CB
2643 if (ret != STRLITERALLEN("deny\n"))
2644 return log_error_errno(-1, errno, "Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
395b1a3e 2645 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2646 }
a19b974f
CB
2647 }
2648
6b5a54cd 2649 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid,
29053180 2650 idtype == ID_TYPE_UID ? 'u' : 'g');
6b5a54cd 2651 if (ret < 0 || ret >= PATH_MAX)
f6d3e3e4 2652 return -E2BIG;
29053180 2653
55022530
CB
2654 fd = open(path, O_WRONLY | O_CLOEXEC);
2655 if (fd < 0)
2656 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
29053180 2657
29053180 2658 ret = lxc_write_nointr(fd, buf, buf_size);
55022530
CB
2659 if (ret != buf_size)
2660 return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
2661 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2662
2663 return 0;
f6d3e3e4
SH
2664}
2665
6e50e704
CB
2666/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2667 *
2668 * @return 1 if functional binary was found
2669 * @return 0 if binary exists but is lacking privilege
2670 * @return -ENOENT if binary does not exist
2671 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2672 */
df6a2945
CB
2673static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2674{
48411df2 2675 __do_free char *path = NULL;
df6a2945
CB
2676 int ret;
2677 struct stat st;
df6a2945 2678
3275932b 2679 errno = EINVAL;
6e50e704 2680 if (cap != CAP_SETUID && cap != CAP_SETGID)
3275932b 2681 return -1;
6e50e704 2682
3275932b 2683 errno = ENOENT;
df6a2945
CB
2684 path = on_path(binary, NULL);
2685 if (!path)
3275932b 2686 return -1;
df6a2945
CB
2687
2688 ret = stat(path, &st);
3275932b
CB
2689 if (ret < 0)
2690 return -1;
df6a2945
CB
2691
2692 /* Check if the binary is setuid. */
55022530
CB
2693 if (st.st_mode & S_ISUID)
2694 return log_debug(1, "The binary \"%s\" does have the setuid bit set", path);
df6a2945 2695
0fd73091 2696#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2697 /* Check if it has the CAP_SETUID capability. */
2698 if ((cap & CAP_SETUID) &&
2699 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
55022530
CB
2700 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED))
2701 return log_debug(1, "The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
df6a2945
CB
2702
2703 /* Check if it has the CAP_SETGID capability. */
2704 if ((cap & CAP_SETGID) &&
2705 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
55022530
CB
2706 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED))
2707 return log_debug(1, "The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
0fd73091 2708#else
69924fff
CB
2709 /* If we cannot check for file capabilities we need to give the benefit
2710 * of the doubt. Otherwise we might fail even though all the necessary
2711 * file capabilities are set.
2712 */
55022530 2713 DEBUG("Cannot check for file capabilities as full capability support is missing. Manual intervention needed");
0fd73091 2714#endif
df6a2945 2715
3275932b 2716 return 1;
df6a2945
CB
2717}
2718
/*
 * Callback that lxc_map_ids() hands to run_command(): executes the command
 * line passed in @args through "/bin/sh -c". Only returns (with -1) when
 * execl() fails.
 */
static int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	/* Reached only if the exec did not replace the process image. */
	return -1;
}
2724
f6d3e3e4
SH
2725int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2726{
0fd73091 2727 int fill, left;
986ef930 2728 char u_or_g;
4bc3b759 2729 char *pos;
6b5a54cd 2730 char cmd_output[PATH_MAX];
0fd73091
CB
2731 struct id_map *map;
2732 struct lxc_list *iterator;
2733 enum idtype type;
0fd73091 2734 int ret = 0, gidmap = 0, uidmap = 0;
c6ba8981
CB
2735 char mapbuf[STRLITERALLEN("new@idmap") + STRLITERALLEN(" ") +
2736 INTTYPE_TO_STRLEN(pid_t) + STRLITERALLEN(" ") +
2737 LXC_IDMAPLEN] = {0};
0fd73091 2738 bool had_entry = false, use_shadow = false;
c724025c
JC
2739 int hostuid, hostgid;
2740
2741 hostuid = geteuid();
2742 hostgid = getegid();
df6a2945
CB
2743
2744 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2745 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2746 * will protected it by preventing another user from being handed the
2747 * range by shadow.
2748 */
df6a2945 2749 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2750 if (uidmap == -ENOENT)
2751 WARN("newuidmap binary is missing");
2752 else if (!uidmap)
2753 WARN("newuidmap is lacking necessary privileges");
2754
df6a2945 2755 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2756 if (gidmap == -ENOENT)
2757 WARN("newgidmap binary is missing");
2758 else if (!gidmap)
2759 WARN("newgidmap is lacking necessary privileges");
2760
df6a2945 2761 if (uidmap > 0 && gidmap > 0) {
0fd73091 2762 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2763 use_shadow = true;
df6a2945 2764 } else {
99d43365
CB
2765 /* In case unprivileged users run application containers via
2766 * execute() or a start*() there are valid cases where they may
2767 * only want to map their own {g,u}id. Let's not block them from
2768 * doing so by requiring geteuid() == 0.
2769 */
2770 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2771 "write directly with euid %d", hostuid);
2772 }
2773
2774 /* Check if we really need to use newuidmap and newgidmap.
2775 * If the user is only remapping his own {g,u}id, we don't need it.
2776 */
2777 if (use_shadow && lxc_list_len(idmap) == 2) {
2778 use_shadow = false;
2779 lxc_list_for_each(iterator, idmap) {
2780 map = iterator->elem;
2781 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2782 map->nsid == hostuid && map->hostid == hostuid)
2783 continue;
2784 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2785 map->nsid == hostgid && map->hostid == hostgid)
2786 continue;
2787 use_shadow = true;
2788 break;
2789 }
0e6e3a41 2790 }
251d0d2a 2791
986ef930
CB
2792 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2793 type++, u_or_g = 'g') {
2794 pos = mapbuf;
2795
0e6e3a41 2796 if (use_shadow)
986ef930 2797 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2798
cf3ef16d 2799 lxc_list_for_each(iterator, idmap) {
251d0d2a 2800 map = iterator->elem;
cf3ef16d
SH
2801 if (map->idtype != type)
2802 continue;
2803
4bc3b759
CB
2804 had_entry = true;
2805
986ef930 2806 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2807 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2808 use_shadow ? " " : "", map->nsid,
2809 map->hostid, map->range,
0e6e3a41 2810 use_shadow ? "" : "\n");
55022530
CB
2811 /*
2812 * The kernel only takes <= 4k for writes to
2813 * /proc/<pid>/{g,u}id_map
2814 */
2815 if (fill <= 0 || fill >= left)
2816 return log_error_errno(-1, errno, "Too many %cid mappings defined", u_or_g);
4bc3b759 2817
cf3ef16d 2818 pos += fill;
251d0d2a 2819 }
cf3ef16d 2820 if (!had_entry)
4f7521b4 2821 continue;
cf3ef16d 2822
d85813cd 2823 /* Try to catch the output of new{g,u}idmap to make debugging
986ef930
CB
2824 * easier.
2825 */
2826 if (use_shadow) {
2827 ret = run_command(cmd_output, sizeof(cmd_output),
2828 lxc_map_ids_exec_wrapper,
2829 (void *)mapbuf);
55022530
CB
2830 if (ret < 0)
2831 return log_error(-1, "new%cidmap failed to write mapping \"%s\": %s", u_or_g, cmd_output, mapbuf);
54fbbeb5 2832 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2833 } else {
986ef930 2834 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
55022530
CB
2835 if (ret < 0)
2836 return log_error(-1, "Failed to write mapping: %s", mapbuf);
54fbbeb5 2837 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2838 }
986ef930
CB
2839
2840 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2841 }
251d0d2a 2842
986ef930 2843 return 0;
f6d3e3e4
SH
2844}
2845
234998b4
CB
2846/*
2847 * Return the host uid/gid to which the container root is mapped in val.
0b3a6504 2848 * Return true if id was found, false otherwise.
cf3ef16d 2849 */
234998b4 2850static id_t get_mapped_rootid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2851{
4160c3a0 2852 unsigned nsid;
0fd73091
CB
2853 struct id_map *map;
2854 struct lxc_list *it;
4160c3a0
CB
2855
2856 if (idtype == ID_TYPE_UID)
2857 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2858 else
2859 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 2860
0fd73091 2861 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2862 map = it->elem;
7b50c609 2863 if (map->idtype != idtype)
cf3ef16d 2864 continue;
4160c3a0 2865 if (map->nsid != nsid)
cf3ef16d 2866 continue;
234998b4 2867 return map->hostid;
cf3ef16d 2868 }
4160c3a0 2869
234998b4
CB
2870 if (idtype == ID_TYPE_UID)
2871 return LXC_INVALID_UID;
2872
2873 return LXC_INVALID_GID;
cf3ef16d
SH
2874}
2875
facdf925 2876int mapped_hostid(unsigned id, const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2877{
cf3ef16d 2878 struct id_map *map;
0fd73091
CB
2879 struct lxc_list *it;
2880
2881 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2882 map = it->elem;
2133f58c 2883 if (map->idtype != idtype)
cf3ef16d 2884 continue;
0fd73091 2885
cf3ef16d 2886 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2887 return (id - map->hostid) + map->nsid;
cf3ef16d 2888 }
0fd73091 2889
57d116ab 2890 return -1;
cf3ef16d
SH
2891}
2892
7581a82f 2893int find_unmapped_nsid(const struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2894{
cf3ef16d 2895 struct id_map *map;
0fd73091 2896 struct lxc_list *it;
2133f58c 2897 unsigned int freeid = 0;
0fd73091 2898
cf3ef16d 2899again:
0fd73091 2900 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2901 map = it->elem;
2133f58c 2902 if (map->idtype != idtype)
cf3ef16d 2903 continue;
0fd73091 2904
cf3ef16d
SH
2905 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2906 freeid = map->nsid + map->range;
2907 goto again;
2908 }
2909 }
0fd73091 2910
cf3ef16d
SH
2911 return freeid;
2912}
2913
943144d9 2914/* NOTE: Must not be called from inside the container namespace! */
59eac805 2915static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
2916{
2917 int mounted;
2918
943144d9 2919 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 2920 if (mounted == -1) {
0fd73091 2921 SYSERROR("Failed to mount proc in the container");
01958b1f 2922 /* continue only if there is no rootfs */
943144d9 2923 if (conf->rootfs.path)
01958b1f 2924 return -1;
5112cd70 2925 } else if (mounted == 1) {
7a0bcca3 2926 conf->tmp_umount_proc = true;
5112cd70 2927 }
943144d9 2928
5112cd70
SH
2929 return 0;
2930}
2931
2932void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2933{
7a0bcca3 2934 if (!lxc_conf->tmp_umount_proc)
0fd73091
CB
2935 return;
2936
7a0bcca3
CB
2937 (void)umount2("/proc", MNT_DETACH);
2938 lxc_conf->tmp_umount_proc = false;
5112cd70
SH
2939}
2940
9e61fb1f
CB
2941/* Walk /proc/mounts and change any shared entries to dependent mounts. */
2942void turn_into_dependent_mounts(void)
e995d7a2 2943{
7969675f 2944 __do_free char *line = NULL;
003be47b 2945 __do_fclose FILE *f = NULL;
f62cf1d4 2946 __do_close int memfd = -EBADF, mntinfo_fd = -EBADF;
003be47b 2947 int ret;
6a49f05e 2948 ssize_t copied;
e995d7a2
SH
2949 size_t len = 0;
2950
6a49f05e 2951 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
2952 if (mntinfo_fd < 0) {
2953 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 2954 return;
fea3b91d 2955 }
6a49f05e
CB
2956
2957 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
2958 if (memfd < 0) {
2959 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
2960
2961 if (errno != ENOSYS) {
fea3b91d 2962 SYSERROR("Failed to create temporary in-memory file");
6a49f05e
CB
2963 return;
2964 }
2965
2966 memfd = lxc_make_tmpfile(template, true);
fea3b91d 2967 if (memfd < 0) {
fea3b91d
DJ
2968 WARN("Failed to create temporary file");
2969 return;
2970 }
6a49f05e
CB
2971 }
2972
6a49f05e 2973again:
7c4d9466 2974 copied = lxc_sendfile_nointr(memfd, mntinfo_fd, NULL, LXC_SENDFILE_MAX);
6a49f05e
CB
2975 if (copied < 0) {
2976 if (errno == EINTR)
2977 goto again;
2978
fea3b91d 2979 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
2980 return;
2981 }
6a49f05e 2982
6a49f05e
CB
2983 ret = lseek(memfd, 0, SEEK_SET);
2984 if (ret < 0) {
fea3b91d 2985 SYSERROR("Failed to reset file descriptor offset");
6a49f05e
CB
2986 return;
2987 }
2988
4110345b 2989 f = fdopen(memfd, "re");
e995d7a2 2990 if (!f) {
003be47b 2991 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark all shared. Continuing");
e995d7a2
SH
2992 return;
2993 }
2994
003be47b
CB
2995 /*
2996 * After a successful fdopen() memfd will be closed when calling
2997 * fclose(f). Calling close(memfd) afterwards is undefined.
2998 */
2999 move_fd(memfd);
3000
e995d7a2 3001 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3002 char *opts, *target;
3003
e995d7a2
SH
3004 target = get_field(line, 4);
3005 if (!target)
3006 continue;
0fd73091 3007
e995d7a2
SH
3008 opts = get_field(target, 2);
3009 if (!opts)
3010 continue;
0fd73091 3011
e995d7a2
SH
3012 null_endofword(opts);
3013 if (!strstr(opts, "shared"))
3014 continue;
0fd73091 3015
e995d7a2 3016 null_endofword(target);
0fd73091
CB
3017 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3018 if (ret < 0) {
9e61fb1f 3019 SYSERROR("Failed to recursively turn old root mount tree into dependent mount. Continuing...");
6a49f05e 3020 continue;
e995d7a2 3021 }
9e61fb1f 3022 TRACE("Recursively turned old root mount tree into dependent mount");
e995d7a2 3023 }
9e61fb1f 3024 TRACE("Turned all mount table entries into dependent mount");
e995d7a2
SH
3025}
3026
794248d0 3027static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3028{
3029 int ret;
794248d0
CB
3030 char *p;
3031 char path[PATH_MAX], destpath[PATH_MAX];
3032 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3033
3034 /* If init exists in the container, don't bind mount a static one */
3035 p = choose_init(conf->rootfs.mount);
3036 if (p) {
22f835ba 3037 __do_free char *old = p;
41089848
TA
3038
3039 p = strdup(old + strlen(conf->rootfs.mount));
41089848
TA
3040 if (!p)
3041 return -ENOMEM;
3042
3043 INFO("Found existing init at \"%s\"", p);
3044 goto out;
9d9c111c 3045 }
2322903b
SH
3046
3047 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
0fd73091 3048 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3049 return -1;
2322903b 3050
55022530
CB
3051 if (!file_exists(path))
3052 return log_error_errno(-1, errno, "The file \"%s\" does not exist on host", path);
2322903b 3053
794248d0 3054 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
0fd73091 3055 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3056 return -1;
2322903b
SH
3057
3058 if (!file_exists(destpath)) {
794248d0 3059 ret = mknod(destpath, S_IFREG | 0000, 0);
55022530
CB
3060 if (ret < 0 && errno != EEXIST)
3061 return log_error_errno(-1, errno, "Failed to create dummy \"%s\" file as bind mount target", destpath);
2322903b
SH
3062 }
3063
592fd47a 3064 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
55022530
CB
3065 if (ret < 0)
3066 return log_error_errno(-1, errno, "Failed to bind mount lxc.init.static into container");
8353b4c9 3067
794248d0
CB
3068 p = strdup(destpath + strlen(conf->rootfs.mount));
3069 if (!p)
3070 return -ENOMEM;
794248d0 3071
8353b4c9 3072 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3073out:
4b5b3a2a 3074 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3075 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3076 return 0;
2322903b
SH
3077}
3078
0fd73091
CB
3079/* This does the work of remounting / if it is shared, calling the container
3080 * pre-mount hooks, and mounting the rootfs.
35120d9c 3081 */
8ce1abc2
CB
3082int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3083 const char *lxcpath)
0ad19a3f 3084{
0fd73091
CB
3085 int ret;
3086
35120d9c 3087 if (conf->rootfs_setup) {
35120d9c 3088 const char *path = conf->rootfs.mount;
0fd73091
CB
3089
3090 /* The rootfs was set up in another namespace. bind-mount it to
3091 * give us a mount in our own ns so we can pivot_root to it
3092 */
3093 ret = mount(path, path, "rootfs", MS_BIND, NULL);
55022530
CB
3094 if (ret < 0)
3095 return log_error(-1, "Failed to bind mount container / onto itself");
0fd73091 3096
55022530 3097 return log_trace(0, "Bind mounted container / onto itself");
35120d9c 3098 }
d4ef7c50 3099
9e61fb1f 3100 turn_into_dependent_mounts();
e995d7a2 3101
0fd73091 3102 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
55022530
CB
3103 if (ret < 0)
3104 return log_error(-1, "Failed to run pre-mount hooks");
35120d9c 3105
8ce1abc2 3106 ret = lxc_mount_rootfs(conf);
55022530
CB
3107 if (ret < 0)
3108 return log_error(-1, "Failed to setup rootfs for");
35120d9c
SH
3109
3110 conf->rootfs_setup = true;
3111 return 0;
3112}
3113
1c1c7051
SH
3114static bool verify_start_hooks(struct lxc_conf *conf)
3115{
6b5a54cd 3116 char path[PATH_MAX];
0fd73091
CB
3117 struct lxc_list *it;
3118
3119 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3120 int ret;
0fd73091 3121 char *hookname = it->elem;
1c1c7051 3122
6b5a54cd 3123 ret = snprintf(path, PATH_MAX, "%s%s",
0fd73091
CB
3124 conf->rootfs.path ? conf->rootfs.mount : "",
3125 hookname);
6b5a54cd 3126 if (ret < 0 || ret >= PATH_MAX)
1c1c7051 3127 return false;
0fd73091 3128
75193660 3129 ret = access(path, X_OK);
55022530
CB
3130 if (ret < 0)
3131 return log_error_errno(false, errno, "Start hook \"%s\" not found in container", hookname);
0fd73091 3132
6a0c909a 3133 return true;
1c1c7051
SH
3134 }
3135
3136 return true;
3137}
3138
4b5b3a2a
TA
3139static bool execveat_supported(void)
3140{
f40988c7 3141 execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
4b5b3a2a
TA
3142 if (errno == ENOSYS)
3143 return false;
3144
3145 return true;
4b5b3a2a
TA
3146}
3147
20502652
CB
3148static int lxc_setup_boot_id(void)
3149{
3150 int ret;
3151 const char *boot_id_path = "/proc/sys/kernel/random/boot_id";
3152 const char *mock_boot_id_path = "/dev/.lxc-boot-id";
3153 lxc_id128_t n;
3154
3155 if (access(boot_id_path, F_OK))
3156 return 0;
3157
3158 memset(&n, 0, sizeof(n));
3159 if (lxc_id128_randomize(&n)) {
3160 SYSERROR("Failed to generate random data for uuid");
3161 return -1;
3162 }
3163
3164 ret = lxc_id128_write(mock_boot_id_path, n);
3165 if (ret < 0) {
3166 SYSERROR("Failed to write uuid to %s", mock_boot_id_path);
3167 return -1;
3168 }
3169
3170 ret = chmod(mock_boot_id_path, 0444);
3171 if (ret < 0) {
3172 SYSERROR("Failed to chown %s", mock_boot_id_path);
3173 (void)unlink(mock_boot_id_path);
3174 return -1;
3175 }
3176
3177 ret = mount(mock_boot_id_path, boot_id_path, NULL, MS_BIND, NULL);
3178 if (ret < 0) {
3179 SYSERROR("Failed to mount %s to %s", mock_boot_id_path,
3180 boot_id_path);
3181 (void)unlink(mock_boot_id_path);
3182 return -1;
3183 }
3184
3185 ret = mount(NULL, boot_id_path, NULL,
3186 (MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOSUID | MS_NOEXEC |
3187 MS_NODEV),
3188 NULL);
3189 if (ret < 0) {
3190 SYSERROR("Failed to remount %s read-only", boot_id_path);
3191 (void)unlink(mock_boot_id_path);
3192 return -1;
3193 }
3194
3195 return 0;
3196}
3197
3b988b33 3198int lxc_setup(struct lxc_handler *handler)
35120d9c 3199{
41808e20 3200 __do_close int pty_mnt_fd = -EBADF;
2187efd3 3201 int ret;
0fd73091 3202 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3203 struct lxc_conf *lxc_conf = handler->conf;
4fef78bc 3204 char *keyring_context = NULL;
35120d9c 3205
8ce1abc2 3206 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
55022530
CB
3207 if (ret < 0)
3208 return log_error(-1, "Failed to setup rootfs");
35120d9c 3209
b87ee312 3210 if (handler->nsfd[LXC_NS_UTS] == -EBADF) {
8353b4c9 3211 ret = setup_utsname(lxc_conf->utsname);
55022530
CB
3212 if (ret < 0)
3213 return log_error(-1, "Failed to setup the utsname %s", name);
0ad19a3f 3214 }
3215
8f818a84
MB
3216 if (!lxc_conf->keyring_disable_session) {
3217 if (lxc_conf->lsm_se_keyring_context) {
3218 keyring_context = lxc_conf->lsm_se_keyring_context;
3219 } else if (lxc_conf->lsm_se_context) {
3220 keyring_context = lxc_conf->lsm_se_context;
3221 }
4fef78bc 3222
8f818a84
MB
3223 ret = lxc_setup_keyring(keyring_context);
3224 if (ret < 0)
3225 return -1;
3226 }
b25291da 3227
e389f2af
CB
3228 if (handler->ns_clone_flags & CLONE_NEWNET) {
3229 ret = lxc_setup_network_in_child_namespaces(lxc_conf,
3230 &lxc_conf->network);
55022530
CB
3231 if (ret < 0)
3232 return log_error(-1, "Failed to setup network");
0ad19a3f 3233
e389f2af 3234 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
55022530
CB
3235 if (ret < 0)
3236 return log_error(-1, "Failed to send network device names and ifindices to parent");
790255cf
CB
3237 }
3238
efbfe93f 3239 if (wants_console(&lxc_conf->console)) {
41808e20 3240 pty_mnt_fd = open_tree(-EBADF, lxc_conf->console.name,
efbfe93f 3241 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
41808e20 3242 if (pty_mnt_fd < 0)
efbfe93f
CB
3243 SYSTRACE("Failed to create detached mount for container's console \"%s\"",
3244 lxc_conf->console.name);
3245 else
3246 TRACE("Created detached mount for container's console \"%s\"",
3247 lxc_conf->console.name);
3248 }
cf68ffd9 3249
bc6928ff 3250 if (lxc_conf->autodev > 0) {
63012bdd 3251 ret = mount_autodev(name, &lxc_conf->rootfs, lxc_conf->autodevtmpfssize, lxcpath);
55022530
CB
3252 if (ret < 0)
3253 return log_error(-1, "Failed to mount \"/dev\"");
c6883f38
SH
3254 }
3255
8353b4c9
CB
3256 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3257 * need to wait until other stuff has finished.
368bbc02 3258 */
8353b4c9 3259 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
55022530
CB
3260 if (ret < 0)
3261 return log_error(-1, "Failed to setup first automatic mounts");
368bbc02 3262
8353b4c9 3263 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
55022530
CB
3264 if (ret < 0)
3265 return log_error(-1, "Failed to setup mounts");
576f946d 3266
c631115d
FA
3267 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3268 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3269 &lxc_conf->mount_list, name, lxcpath);
55022530
CB
3270 if (ret < 0)
3271 return log_error(-1, "Failed to setup mount entries");
c631115d
FA
3272 }
3273
8353b4c9 3274 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3275 if (execveat_supported()) {
3276 int fd;
3277 char path[PATH_MAX];
3278
3279 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
55022530
CB
3280 if (ret < 0 || ret >= PATH_MAX)
3281 return log_error(-1, "Path to init.lxc.static too long");
4b5b3a2a
TA
3282
3283 fd = open(path, O_PATH | O_CLOEXEC);
55022530
CB
3284 if (fd < 0)
3285 return log_error_errno(-1, errno, "Unable to open lxc.init.static");
4b5b3a2a
TA
3286
3287 ((struct execute_args *)handler->data)->init_fd = fd;
3288 ((struct execute_args *)handler->data)->init_path = NULL;
3289 } else {
3290 ret = lxc_execute_bind_init(handler);
55022530
CB
3291 if (ret < 0)
3292 return log_error(-1, "Failed to bind-mount the lxc init system");
8353b4c9
CB
3293 }
3294 }
2322903b 3295
8353b4c9
CB
3296 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3297 * mounted. It is guaranteed to be mounted now either through
3298 * automatically or via fstab entries.
368bbc02 3299 */
8353b4c9 3300 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
55022530
CB
3301 if (ret < 0)
3302 return log_error(-1, "Failed to setup remaining automatic mounts");
368bbc02 3303
8353b4c9 3304 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
55022530
CB
3305 if (ret < 0)
3306 return log_error(-1, "Failed to run mount hooks");
773fb9ca 3307
bc6928ff 3308 if (lxc_conf->autodev > 0) {
8353b4c9 3309 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
55022530
CB
3310 if (ret < 0)
3311 return log_error(-1, "Failed to run autodev hooks");
06749971 3312
8353b4c9 3313 ret = lxc_fill_autodev(&lxc_conf->rootfs);
55022530
CB
3314 if (ret < 0)
3315 return log_error(-1, "Failed to populate \"/dev\"");
91c3830e 3316 }
368bbc02 3317
75193660 3318 /* Make sure any start hooks are in the container */
55022530
CB
3319 if (!verify_start_hooks(lxc_conf))
3320 return log_error(-1, "Failed to verify start hooks");
75193660 3321
cf68ffd9
CB
3322 ret = lxc_create_tmp_proc_mount(lxc_conf);
3323 if (ret < 0)
3324 return log_error(-1, "Failed to \"/proc\" LSMs");
3325
ed8704d0 3326 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
41808e20 3327 lxc_conf->ttys.dir, pty_mnt_fd);
55022530
CB
3328 if (ret < 0)
3329 return log_error(-1, "Failed to setup console");
6e590161 3330
ed8704d0 3331 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
55022530
CB
3332 if (ret < 0)
3333 return log_error(-1, "Failed to setup \"/dev\" symlinks");
69aa6655 3334
8ce1abc2 3335 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
55022530
CB
3336 if (ret < 0)
3337 return log_error(-1, "Failed to pivot root into rootfs");
ed502555 3338
20502652
CB
3339 /* Setting the boot-id is best-effort for now. */
3340 if (lxc_conf->autodev > 0)
3341 (void)lxc_setup_boot_id();
3342
f797f05e 3343 ret = lxc_setup_devpts(handler);
55022530
CB
3344 if (ret < 0)
3345 return log_error(-1, "Failed to setup new devpts instance");
3c26f34e 3346
2187efd3
CB
3347 ret = lxc_create_ttys(handler);
3348 if (ret < 0)
e8bd4e43 3349 return -1;
e8bd4e43 3350
8353b4c9 3351 ret = setup_personality(lxc_conf->personality);
55022530
CB
3352 if (ret < 0)
3353 return log_error(-1, "Failed to set personality");
cccc74b5 3354
8353b4c9
CB
3355 /* Set sysctl value to a path under /proc/sys as determined from the
3356 * key. For e.g. net.ipv4.ip_forward translated to
3357 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3358 */
3359 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3360 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
55022530
CB
3361 if (ret < 0)
3362 return log_error(-1, "Failed to setup sysctl parameters");
7edd0540
L
3363 }
3364
97a8f74f 3365 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
55022530
CB
3366 if (!lxc_list_empty(&lxc_conf->caps))
3367 return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
8353b4c9 3368
55022530
CB
3369 if (dropcaps_except(&lxc_conf->keepcaps))
3370 return log_error(-1, "Failed to keep capabilities");
97a8f74f 3371 } else if (setup_caps(&lxc_conf->caps)) {
55022530 3372 return log_error(-1, "Failed to drop capabilities");
81810dd1
DL
3373 }
3374
8353b4c9 3375 NOTICE("The container \"%s\" is set up", name);
cd54d859 3376
0ad19a3f 3377 return 0;
3378}
26ddeedd 3379
3f60c2f7 3380int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3381 char *argv[])
26ddeedd 3382{
26ddeedd 3383 struct lxc_list *it;
3ea957c6
RK
3384 int which;
3385
3386 for (which = 0; which < NUM_LXC_HOOKS; which ++) {
3387 if (strcmp(hookname, lxchook_names[which]) == 0)
3388 break;
3389 }
3390
3391 if (which >= NUM_LXC_HOOKS)
26ddeedd 3392 return -1;
3f60c2f7 3393
0fd73091 3394 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3395 int ret;
3f60c2f7
CB
3396 char *hook = it->elem;
3397
3398 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3399 hookname, argv);
3f60c2f7
CB
3400 if (ret < 0)
3401 return -1;
26ddeedd 3402 }
3f60c2f7 3403
26ddeedd
SH
3404 return 0;
3405}
72d0e1cb 3406
72d0e1cb
SG
3407int lxc_clear_config_caps(struct lxc_conf *c)
3408{
1a0e70ac 3409 struct lxc_list *it, *next;
72d0e1cb 3410
0fd73091 3411 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3412 lxc_list_del(it);
3413 free(it->elem);
3414 free(it);
3415 }
0fd73091 3416
72d0e1cb
SG
3417 return 0;
3418}
3419
c7e345ae
CB
3420static int lxc_free_idmap(struct lxc_list *id_map)
3421{
27c27d73
SH
3422 struct lxc_list *it, *next;
3423
46bc6f2a 3424 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3425 lxc_list_del(it);
3426 free(it->elem);
3427 free(it);
3428 }
c7e345ae 3429
27c27d73
SH
3430 return 0;
3431}
7e621263
CB
3432
3433static int __lxc_free_idmap(struct lxc_list *id_map)
3434{
3435 lxc_free_idmap(id_map);
3436 free(id_map);
3437 return 0;
3438}
3439define_cleanup_function(struct lxc_list *, __lxc_free_idmap);
27c27d73 3440
4355ab5f
SH
3441int lxc_clear_idmaps(struct lxc_conf *c)
3442{
3443 return lxc_free_idmap(&c->id_map);
3444}
3445
1fb86a7c
SH
3446int lxc_clear_config_keepcaps(struct lxc_conf *c)
3447{
0fd73091 3448 struct lxc_list *it, *next;
1fb86a7c 3449
0fd73091 3450 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3451 lxc_list_del(it);
3452 free(it->elem);
3453 free(it);
3454 }
0fd73091 3455
1fb86a7c
SH
3456 return 0;
3457}
3458
a3ed9b81 3459int lxc_clear_namespace(struct lxc_conf *c)
3460{
3461 int i;
3462 for (i = 0; i < LXC_NS_MAX; i++) {
3463 free(c->ns_share[i]);
3464 c->ns_share[i] = NULL;
3465 }
3466 return 0;
3467}
3468
54860ed0 3469int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3470{
54860ed0 3471 char *global_token, *namespaced_token;
ab1a6cac 3472 size_t namespaced_token_len;
54860ed0 3473 struct lxc_list *it, *next, *list;
ab1a6cac 3474 const char *k = key;
54860ed0 3475 bool all = false;
72d0e1cb 3476
54860ed0
CB
3477 if (version == CGROUP2_SUPER_MAGIC) {
3478 global_token = "lxc.cgroup2";
3479 namespaced_token = "lxc.cgroup2.";
6333c915 3480 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
54860ed0
CB
3481 list = &c->cgroup2;
3482 } else if (version == CGROUP_SUPER_MAGIC) {
3483 global_token = "lxc.cgroup";
3484 namespaced_token = "lxc.cgroup.";
6333c915 3485 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
54860ed0
CB
3486 list = &c->cgroup;
3487 } else {
ab1a6cac 3488 return -EINVAL;
54860ed0
CB
3489 }
3490
3491 if (strcmp(key, global_token) == 0)
72d0e1cb 3492 all = true;
6333c915 3493 else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
ab1a6cac 3494 k += namespaced_token_len;
a6390f01 3495 else
ab1a6cac 3496 return -EINVAL;
72d0e1cb 3497
0fd73091 3498 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3499 struct lxc_cgroup *cg = it->elem;
54860ed0 3500
72d0e1cb
SG
3501 if (!all && strcmp(cg->subsystem, k) != 0)
3502 continue;
54860ed0 3503
72d0e1cb
SG
3504 lxc_list_del(it);
3505 free(cg->subsystem);
3506 free(cg->value);
3507 free(cg);
3508 free(it);
3509 }
e409b214 3510
72d0e1cb
SG
3511 return 0;
3512}
3513
4bfb655e
CB
3514static void lxc_clear_devices(struct lxc_conf *conf)
3515{
3516 struct lxc_list *list = &conf->devices;
3517 struct lxc_list *it, *next;
3518
3519 lxc_list_for_each_safe(it, list, next) {
3520 lxc_list_del(it);
3521 free(it);
3522 }
3523}
3524
c6d09e15
WB
3525int lxc_clear_limits(struct lxc_conf *c, const char *key)
3526{
3527 struct lxc_list *it, *next;
c6d09e15 3528 const char *k = NULL;
0fd73091 3529 bool all = false;
c6d09e15 3530
b668653c 3531 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3532 all = true;
6333c915
CB
3533 else if (strncmp(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")) == 0)
3534 k = key + STRLITERALLEN("lxc.limit.");
3535 else if (strncmp(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")) == 0)
3536 k = key + STRLITERALLEN("lxc.prlimit.");
c6d09e15
WB
3537 else
3538 return -1;
3539
0fd73091 3540 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3541 struct lxc_limit *lim = it->elem;
0fd73091 3542
c6d09e15
WB
3543 if (!all && strcmp(lim->resource, k) != 0)
3544 continue;
0fd73091 3545
c6d09e15
WB
3546 lxc_list_del(it);
3547 free(lim->resource);
3548 free(lim);
3549 free(it);
3550 }
b668653c 3551
c6d09e15
WB
3552 return 0;
3553}
3554
7edd0540
L
3555int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3556{
3557 struct lxc_list *it, *next;
7edd0540 3558 const char *k = NULL;
0fd73091 3559 bool all = false;
7edd0540
L
3560
3561 if (strcmp(key, "lxc.sysctl") == 0)
3562 all = true;
6333c915
CB
3563 else if (strncmp(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")) == 0)
3564 k = key + STRLITERALLEN("lxc.sysctl.");
7edd0540
L
3565 else
3566 return -1;
3567
0fd73091 3568 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3569 struct lxc_sysctl *elem = it->elem;
0fd73091 3570
7edd0540
L
3571 if (!all && strcmp(elem->key, k) != 0)
3572 continue;
0fd73091 3573
7edd0540
L
3574 lxc_list_del(it);
3575 free(elem->key);
3576 free(elem->value);
3577 free(elem);
3578 free(it);
3579 }
0fd73091 3580
7edd0540
L
3581 return 0;
3582}
3583
61d7a733
YT
3584int lxc_clear_procs(struct lxc_conf *c, const char *key)
3585{
0fd73091 3586 struct lxc_list *it, *next;
61d7a733 3587 const char *k = NULL;
0fd73091 3588 bool all = false;
61d7a733
YT
3589
3590 if (strcmp(key, "lxc.proc") == 0)
3591 all = true;
6333c915
CB
3592 else if (strncmp(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")) == 0)
3593 k = key + STRLITERALLEN("lxc.proc.");
61d7a733
YT
3594 else
3595 return -1;
3596
0fd73091 3597 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3598 struct lxc_proc *proc = it->elem;
0fd73091 3599
61d7a733
YT
3600 if (!all && strcmp(proc->filename, k) != 0)
3601 continue;
0fd73091 3602
61d7a733
YT
3603 lxc_list_del(it);
3604 free(proc->filename);
3605 free(proc->value);
3606 free(proc);
3607 free(it);
3608 }
3609
3610 return 0;
3611}
3612
ee1e7aa0
SG
3613int lxc_clear_groups(struct lxc_conf *c)
3614{
0fd73091 3615 struct lxc_list *it, *next;
ee1e7aa0 3616
0fd73091 3617 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3618 lxc_list_del(it);
3619 free(it->elem);
3620 free(it);
3621 }
0fd73091 3622
ee1e7aa0
SG
3623 return 0;
3624}
3625
ab799c0b
SG
3626int lxc_clear_environment(struct lxc_conf *c)
3627{
0fd73091 3628 struct lxc_list *it, *next;
ab799c0b 3629
0fd73091 3630 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3631 lxc_list_del(it);
3632 free(it->elem);
3633 free(it);
3634 }
0fd73091 3635
ab799c0b
SG
3636 return 0;
3637}
3638
72d0e1cb
SG
3639int lxc_clear_mount_entries(struct lxc_conf *c)
3640{
0fd73091 3641 struct lxc_list *it, *next;
72d0e1cb 3642
0fd73091 3643 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3644 lxc_list_del(it);
3645 free(it->elem);
3646 free(it);
3647 }
0fd73091 3648
72d0e1cb
SG
3649 return 0;
3650}
3651
b099e9e9
SH
3652int lxc_clear_automounts(struct lxc_conf *c)
3653{
3654 c->auto_mounts = 0;
3655 return 0;
3656}
3657
12a50cc6 3658int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3659{
72d0e1cb 3660 int i;
0fd73091
CB
3661 struct lxc_list *it, *next;
3662 const char *k = NULL;
3663 bool all = false, done = false;
72d0e1cb 3664
17ed13a3
SH
3665 if (strcmp(key, "lxc.hook") == 0)
3666 all = true;
6333c915
CB
3667 else if (strncmp(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")) == 0)
3668 k = key + STRLITERALLEN("lxc.hook.");
a6390f01
WB
3669 else
3670 return -1;
17ed13a3 3671
0fd73091 3672 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 3673 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 3674 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3675 lxc_list_del(it);
3676 free(it->elem);
3677 free(it);
3678 }
0fd73091 3679
17ed13a3 3680 done = true;
72d0e1cb
SG
3681 }
3682 }
17ed13a3 3683
55022530
CB
3684 if (!done)
3685 return log_error(-1, "Invalid hook key: %s", key);
0fd73091 3686
72d0e1cb
SG
3687 return 0;
3688}
8eb5694b 3689
4184c3e1
SH
3690static inline void lxc_clear_aliens(struct lxc_conf *conf)
3691{
0fd73091 3692 struct lxc_list *it, *next;
4184c3e1 3693
0fd73091 3694 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
3695 lxc_list_del(it);
3696 free(it->elem);
3697 free(it);
3698 }
3699}
3700
c7b15d1e 3701void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 3702{
0fd73091 3703 struct lxc_list *it, *next;
f979ac15 3704
0fd73091 3705 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
3706 lxc_list_del(it);
3707 free(it->elem);
3708 free(it);
3709 }
3710}
3711
1800f924
WB
3712int lxc_clear_apparmor_raw(struct lxc_conf *c)
3713{
3714 struct lxc_list *it, *next;
3715
3716 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
3717 lxc_list_del(it);
3718 free(it->elem);
3719 free(it);
3720 }
3721
3722 return 0;
3723}
3724
8eb5694b
SH
3725void lxc_conf_free(struct lxc_conf *conf)
3726{
3727 if (!conf)
3728 return;
0fd73091 3729
858377e4
SH
3730 if (current_config == conf)
3731 current_config = NULL;
aed105d5 3732 lxc_terminal_conf_free(&conf->console);
f10fad2f 3733 free(conf->rootfs.mount);
b3b8c97f 3734 free(conf->rootfs.bdev_type);
f10fad2f
ME
3735 free(conf->rootfs.options);
3736 free(conf->rootfs.path);
9dd75981 3737 free(conf->rootfs.data);
f10fad2f 3738 free(conf->logfile);
858377e4
SH
3739 if (conf->logfd != -1)
3740 close(conf->logfd);
f10fad2f 3741 free(conf->utsname);
885766f5
CB
3742 free(conf->ttys.dir);
3743 free(conf->ttys.tty_names);
f10fad2f
ME
3744 free(conf->fstab);
3745 free(conf->rcfile);
5cda27c1 3746 free(conf->execute_cmd);
f10fad2f 3747 free(conf->init_cmd);
3c491553 3748 free(conf->init_cwd);
6b0d5538 3749 free(conf->unexpanded_config);
76d0127f 3750 free(conf->syslog);
c302b476 3751 lxc_free_networks(&conf->network);
f10fad2f 3752 free(conf->lsm_aa_profile);
1800f924 3753 free(conf->lsm_aa_profile_computed);
f10fad2f 3754 free(conf->lsm_se_context);
c3e3c21a 3755 lxc_seccomp_free(&conf->seccomp);
8eb5694b 3756 lxc_clear_config_caps(conf);
1fb86a7c 3757 lxc_clear_config_keepcaps(conf);
54860ed0
CB
3758 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
3759 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
4bfb655e 3760 lxc_clear_devices(conf);
bf651989 3761 lxc_clear_cgroup2_devices(conf);
17ed13a3 3762 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3763 lxc_clear_mount_entries(conf);
27c27d73 3764 lxc_clear_idmaps(conf);
ee1e7aa0 3765 lxc_clear_groups(conf);
f979ac15 3766 lxc_clear_includes(conf);
761d81ca 3767 lxc_clear_aliens(conf);
ab799c0b 3768 lxc_clear_environment(conf);
240d4b74 3769 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 3770 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 3771 lxc_clear_procs(conf, "lxc.proc");
1800f924 3772 lxc_clear_apparmor_raw(conf);
a3ed9b81 3773 lxc_clear_namespace(conf);
43654d34 3774 free(conf->cgroup_meta.dir);
a900cbaf
WB
3775 free(conf->cgroup_meta.monitor_dir);
3776 free(conf->cgroup_meta.container_dir);
3777 free(conf->cgroup_meta.namespace_dir);
43654d34 3778 free(conf->cgroup_meta.controllers);
7a41e857
LT
3779 free(conf->shmount.path_host);
3780 free(conf->shmount.path_cont);
8eb5694b
SH
3781 free(conf);
3782}
4355ab5f
SH
3783
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback the child runs once the parent has signalled it. */
	int (*fn)(void *);
	/* Optional name of @fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Sync pipe: the child reads one byte from p[0] and closes p[1]. */
	int p[2];
};
3790
3791static int run_userns_fn(void *data)
3792{
766c5b6d 3793 struct userns_fn_data *d = data;
adaffdd7 3794 int ret;
4355ab5f 3795 char c;
4355ab5f 3796
766c5b6d 3797 close_prot_errno_disarm(d->p[1]);
f8aa4bf3 3798
766c5b6d
CB
3799 /*
3800 * Wait for parent to finish establishing a new mapping in the user
f8aa4bf3
CB
3801 * namespace we are executing in.
3802 */
adaffdd7 3803 ret = lxc_read_nointr(d->p[0], &c, 1);
766c5b6d 3804 close_prot_errno_disarm(d->p[0]);
adaffdd7
CB
3805 if (ret != 1)
3806 return -1;
f8aa4bf3 3807
c9b7c33e 3808 if (d->fn_name)
adaffdd7 3809 TRACE("Calling function \"%s\"", d->fn_name);
0fd73091 3810
f8aa4bf3 3811 /* Call function to run. */
4355ab5f
SH
3812 return d->fn(d->arg);
3813}
3814
7581a82f 3815static struct id_map *mapped_nsid_add(const struct lxc_conf *conf, unsigned id,
db7cfe23
CB
3816 enum idtype idtype)
3817{
5173b710
CB
3818 const struct id_map *map;
3819 struct id_map *retmap;
db7cfe23
CB
3820
3821 map = find_mapped_nsid_entry(conf, id, idtype);
3822 if (!map)
3823 return NULL;
3824
3825 retmap = malloc(sizeof(*retmap));
3826 if (!retmap)
3827 return NULL;
3828
3829 memcpy(retmap, map, sizeof(*retmap));
3830 return retmap;
3831}
3832
7581a82f 3833static struct id_map *find_mapped_hostid_entry(const struct lxc_conf *conf,
c4333195 3834 unsigned id, enum idtype idtype)
f8aa4bf3 3835{
f8aa4bf3 3836 struct id_map *map;
0fd73091 3837 struct lxc_list *it;
f8aa4bf3
CB
3838 struct id_map *retmap = NULL;
3839
0fd73091 3840 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
3841 map = it->elem;
3842 if (map->idtype != idtype)
3843 continue;
3844
3845 if (id >= map->hostid && id < map->hostid + map->range) {
3846 retmap = map;
3847 break;
3848 }
3849 }
3850
f8aa4bf3
CB
3851 return retmap;
3852}
3853
0fd73091 3854/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 3855 * existing one or establish a new one.
4355ab5f 3856 */
7581a82f 3857static struct id_map *mapped_hostid_add(const struct lxc_conf *conf, uid_t id,
0fd73091 3858 enum idtype type)
4355ab5f 3859{
55022530 3860 __do_free struct id_map *entry = NULL;
28a2d9e7 3861 int hostid_mapped;
55022530 3862 struct id_map *tmp = NULL;
c4333195
CB
3863
3864 entry = malloc(sizeof(*entry));
3865 if (!entry)
3866 return NULL;
f8aa4bf3 3867
28a2d9e7 3868 /* Reuse existing mapping. */
c4333195 3869 tmp = find_mapped_hostid_entry(conf, id, type);
1758c195
CB
3870 if (tmp) {
3871 memcpy(entry, tmp, sizeof(*entry));
3872 } else {
3873 /* Find new mapping. */
3874 hostid_mapped = find_unmapped_nsid(conf, type);
3875 if (hostid_mapped < 0)
3876 return log_debug(NULL, "Failed to find free mapping for id %d", id);
3877
3878 entry->idtype = type;
3879 entry->nsid = hostid_mapped;
3880 entry->hostid = (unsigned long)id;
3881 entry->range = 1;
3882 }
4355ab5f 3883
55022530 3884 return move_ptr(entry);
4355ab5f
SH
3885}
3886
dbfcdf86
CB
3887static struct lxc_list *get_minimal_idmap(const struct lxc_conf *conf,
3888 uid_t *resuid, gid_t *resgid)
4355ab5f 3889{
00d6cfe2
CB
3890 __do_free struct id_map *container_root_uid = NULL,
3891 *container_root_gid = NULL,
3892 *host_uid_map = NULL, *host_gid_map = NULL;
3893 __do_free struct lxc_list *idmap = NULL;
f8aa4bf3 3894 uid_t euid, egid;
4160c3a0
CB
3895 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
3896 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
00d6cfe2 3897 struct lxc_list *tmplist = NULL;
4355ab5f 3898
db7cfe23 3899 /* Find container root mappings. */
4160c3a0 3900 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
55022530
CB
3901 if (!container_root_uid)
3902 return log_debug(NULL, "Failed to find mapping for namespace uid %d", 0);
dcf0ffdf
CB
3903 euid = geteuid();
3904 if (euid >= container_root_uid->hostid &&
3905 euid < (container_root_uid->hostid + container_root_uid->range))
2c996219 3906 host_uid_map = move_ptr(container_root_uid);
f8aa4bf3 3907
4160c3a0 3908 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
55022530
CB
3909 if (!container_root_gid)
3910 return log_debug(NULL, "Failed to find mapping for namespace gid %d", 0);
dcf0ffdf
CB
3911 egid = getegid();
3912 if (egid >= container_root_gid->hostid &&
3913 egid < (container_root_gid->hostid + container_root_gid->range))
2c996219 3914 host_gid_map = move_ptr(container_root_gid);
f8aa4bf3
CB
3915
3916 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 3917 if (!host_uid_map)
c4333195 3918 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
55022530
CB
3919 if (!host_uid_map)
3920 return log_debug(NULL, "Failed to find mapping for uid %d", euid);
f8aa4bf3 3921
dcf0ffdf
CB
3922 if (!host_gid_map)
3923 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
55022530
CB
3924 if (!host_gid_map)
3925 return log_debug(NULL, "Failed to find mapping for gid %d", egid);
28a2d9e7
CB
3926
3927 /* Allocate new {g,u}id map list. */
3928 idmap = malloc(sizeof(*idmap));
3929 if (!idmap)
00d6cfe2 3930 return NULL;
28a2d9e7
CB
3931 lxc_list_init(idmap);
3932
f8aa4bf3
CB
3933 /* Add container root to the map. */
3934 tmplist = malloc(sizeof(*tmplist));
3935 if (!tmplist)
00d6cfe2 3936 return NULL;
47649d5b
CB
3937 /* idmap will now keep track of that memory. */
3938 lxc_list_add_elem(tmplist, move_ptr(host_uid_map));
f8aa4bf3 3939 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3940
2c996219 3941 if (container_root_uid) {
28a2d9e7
CB
3942 /* Add container root to the map. */
3943 tmplist = malloc(sizeof(*tmplist));
3944 if (!tmplist)
00d6cfe2 3945 return NULL;
47649d5b
CB
3946 /* idmap will now keep track of that memory. */
3947 lxc_list_add_elem(tmplist, move_ptr(container_root_uid));
28a2d9e7 3948 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3949 }
f8aa4bf3
CB
3950
3951 tmplist = malloc(sizeof(*tmplist));
3952 if (!tmplist)
00d6cfe2 3953 return NULL;
47649d5b
CB
3954 /* idmap will now keep track of that memory. */
3955 lxc_list_add_elem(tmplist, move_ptr(host_gid_map));
f8aa4bf3 3956 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3957
2c996219 3958 if (container_root_gid) {
28a2d9e7
CB
3959 tmplist = malloc(sizeof(*tmplist));
3960 if (!tmplist)
00d6cfe2 3961 return NULL;
47649d5b
CB
3962 /* idmap will now keep track of that memory. */
3963 lxc_list_add_elem(tmplist, move_ptr(container_root_gid));
28a2d9e7 3964 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3965 }
f8aa4bf3 3966
dbfcdf86
CB
3967 TRACE("Allocated minimal idmapping for ns uid %d and ns gid %d", nsuid, nsgid);
3968
3969 if (resuid)
3970 *resuid = nsuid;
3971 if (resgid)
3972 *resgid = nsgid;
00d6cfe2 3973 return move_ptr(idmap);
dcf0ffdf
CB
3974}
3975
766c5b6d
CB
3976/*
3977 * Run a function in a new user namespace.
dcf0ffdf
CB
3978 * The caller's euid/egid will be mapped if it is not already.
3979 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3980 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3981 * This means we require only to establish a mapping from:
3982 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3983 * - the container root -> some sub{g,u}id
915e3dbd 3984 * The former we add, if the user did not specify a mapping. The latter we
6f3fd27f 3985 * retrieve from the container's configured {g,u}id mappings as it must have been
dcf0ffdf
CB
3986 * there to start the container in the first place.
3987 */
7581a82f 3988int userns_exec_1(const struct lxc_conf *conf, int (*fn)(void *), void *data,
dcf0ffdf
CB
3989 const char *fn_name)
3990{
7e621263 3991 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
0fd73091
CB
3992 int ret = -1, status = -1;
3993 char c = '1';
46bc6f2a
CB
3994 struct userns_fn_data d = {
3995 .arg = data,
3996 .fn = fn,
3997 .fn_name = fn_name,
3998 };
766c5b6d
CB
3999 pid_t pid;
4000 int pipe_fds[2];
dcf0ffdf 4001
2b2655a8
CB
4002 if (!conf)
4003 return -EINVAL;
4004
dbfcdf86 4005 idmap = get_minimal_idmap(conf, NULL, NULL);
dcf0ffdf 4006 if (!idmap)
766c5b6d 4007 return ret_errno(ENOENT);
dcf0ffdf 4008
766c5b6d
CB
4009 ret = pipe2(pipe_fds, O_CLOEXEC);
4010 if (ret < 0)
4011 return -errno;
4012
766c5b6d
CB
4013 d.p[0] = pipe_fds[0];
4014 d.p[1] = pipe_fds[1];
dcf0ffdf
CB
4015
4016 /* Clone child in new user namespace. */
a59440be 4017 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER, NULL);
dcf0ffdf 4018 if (pid < 0) {
0fd73091 4019 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4020 goto on_error;
4021 }
4022
766c5b6d 4023 close_prot_errno_disarm(pipe_fds[0]);
dcf0ffdf 4024
4b73005c
CB
4025 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4026 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
dcf0ffdf 4027 struct id_map *map;
0fd73091 4028 struct lxc_list *it;
dcf0ffdf 4029
766c5b6d 4030 lxc_list_for_each(it, idmap) {
f8aa4bf3 4031 map = it->elem;
766c5b6d
CB
4032 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4033 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
f8aa4bf3 4034 }
4355ab5f
SH
4035 }
4036
f8aa4bf3 4037 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4038 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4039 if (ret < 0) {
0fd73091 4040 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4041 goto on_error;
4355ab5f
SH
4042 }
4043
f8aa4bf3 4044 /* Tell child to proceed. */
766c5b6d 4045 if (lxc_write_nointr(pipe_fds[1], &c, 1) != 1) {
dcf0ffdf 4046 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4047 goto on_error;
4355ab5f
SH
4048 }
4049
686dd5d1 4050on_error:
766c5b6d
CB
4051 close_prot_errno_disarm(pipe_fds[0]);
4052 close_prot_errno_disarm(pipe_fds[1]);
f8aa4bf3 4053
ee1b16bc
TA
4054 /* Wait for child to finish. */
4055 if (pid > 0)
4056 status = wait_for_pid(pid);
4057
686dd5d1
CB
4058 if (status < 0)
4059 ret = -1;
4060
f8aa4bf3 4061 return ret;
4355ab5f 4062}
97e9cfa0 4063
d1783ef4
CB
4064int userns_exec_minimal(const struct lxc_conf *conf,
4065 int (*fn_parent)(void *), void *fn_parent_data,
4066 int (*fn_child)(void *), void *fn_child_data)
edf88289 4067{
7e621263 4068 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
dbfcdf86
CB
4069 uid_t resuid = LXC_INVALID_UID;
4070 gid_t resgid = LXC_INVALID_GID;
edf88289 4071 char c = '1';
dbfcdf86 4072 ssize_t ret;
edf88289
CB
4073 pid_t pid;
4074 int sock_fds[2];
4075
d1783ef4 4076 if (!conf || !fn_child)
dbfcdf86 4077 return ret_errno(EINVAL);
edf88289 4078
dbfcdf86 4079 idmap = get_minimal_idmap(conf, &resuid, &resgid);
edf88289
CB
4080 if (!idmap)
4081 return ret_errno(ENOENT);
4082
4083 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4084 if (ret < 0)
4085 return -errno;
4086
4087 pid = fork();
4088 if (pid < 0) {
dbfcdf86 4089 SYSERROR("Failed to create new process");
edf88289
CB
4090 goto on_error;
4091 }
4092
4093 if (pid == 0) {
4094 close_prot_errno_disarm(sock_fds[1]);
4095
4096 ret = unshare(CLONE_NEWUSER);
dbfcdf86
CB
4097 if (ret < 0) {
4098 SYSERROR("Failed to unshare new user namespace");
edf88289 4099 _exit(EXIT_FAILURE);
dbfcdf86 4100 }
edf88289 4101
dbfcdf86
CB
4102 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4103 if (ret != 1)
edf88289
CB
4104 _exit(EXIT_FAILURE);
4105
4106 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4107 if (ret != 1)
4108 _exit(EXIT_FAILURE);
4109
4110 close_prot_errno_disarm(sock_fds[0]);
4111
4112 if (!lxc_setgroups(0, NULL) && errno != EPERM)
4113 _exit(EXIT_FAILURE);
4114
dbfcdf86
CB
4115 ret = setresgid(resgid, resgid, resgid);
4116 if (ret < 0) {
4117 SYSERROR("Failed to setresgid(%d, %d, %d)",
4118 resgid, resgid, resgid);
edf88289 4119 _exit(EXIT_FAILURE);
dbfcdf86
CB
4120 }
4121
4122 ret = setresuid(resuid, resuid, resuid);
4123 if (ret < 0) {
4124 SYSERROR("Failed to setresuid(%d, %d, %d)",
4125 resuid, resuid, resuid);
4126 _exit(EXIT_FAILURE);
4127 }
edf88289 4128
d1783ef4 4129 ret = fn_child(fn_child_data);
dbfcdf86
CB
4130 if (ret) {
4131 SYSERROR("Running function in new user namespace failed");
edf88289 4132 _exit(EXIT_FAILURE);
dbfcdf86 4133 }
edf88289
CB
4134
4135 _exit(EXIT_SUCCESS);
4136 }
4137
4138 close_prot_errno_disarm(sock_fds[0]);
4139
4140 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4141 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4142 struct id_map *map;
4143 struct lxc_list *it;
4144
4145 lxc_list_for_each(it, idmap) {
4146 map = it->elem;
4147 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4148 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4149 }
4150 }
4151
4152 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4153 if (ret != 1) {
4154 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4155 goto on_error;
4156 }
4157
4158 /* Set up {g,u}id mapping for user namespace of child process. */
4159 ret = lxc_map_ids(idmap, pid);
4160 if (ret < 0) {
4161 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4162 goto on_error;
4163 }
4164
4165 /* Tell child to proceed. */
4166 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4167 if (ret != 1) {
4168 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4169 goto on_error;
4170 }
4171
d1783ef4
CB
4172 if (fn_parent && fn_parent(fn_parent_data)) {
4173 SYSERROR("Running parent function failed");
4174 _exit(EXIT_FAILURE);
4175 }
4176
edf88289
CB
4177on_error:
4178 close_prot_errno_disarm(sock_fds[0]);
4179 close_prot_errno_disarm(sock_fds[1]);
4180
4181 /* Wait for child to finish. */
dbfcdf86
CB
4182 if (pid < 0)
4183 return -1;
edf88289 4184
dbfcdf86 4185 return wait_for_pid(pid);
edf88289
CB
4186}
4187
415a8851
CB
4188int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4189 const char *fn_name)
4190{
4191 pid_t pid;
4192 uid_t euid, egid;
415a8851
CB
4193 int p[2];
4194 struct id_map *map;
4195 struct lxc_list *cur;
0fd73091 4196 struct userns_fn_data d;
415a8851 4197 int ret = -1;
0fd73091 4198 char c = '1';
415a8851
CB
4199 struct lxc_list *idmap = NULL, *tmplist = NULL;
4200 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4201 *host_uid_map = NULL, *host_gid_map = NULL;
4202
2b2655a8
CB
4203 if (!conf)
4204 return -EINVAL;
4205
979f9e34 4206 ret = pipe2(p, O_CLOEXEC);
415a8851
CB
4207 if (ret < 0) {
4208 SYSERROR("opening pipe");
4209 return -1;
4210 }
4211 d.fn = fn;
4212 d.fn_name = fn_name;
4213 d.arg = data;
4214 d.p[0] = p[0];
4215 d.p[1] = p[1];
4216
4217 /* Clone child in new user namespace. */
33258b95 4218 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER, NULL);
415a8851 4219 if (pid < 0) {
0fd73091 4220 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4221 goto on_error;
4222 }
4223
4224 close(p[0]);
4225 p[0] = -1;
4226
4227 euid = geteuid();
4228 egid = getegid();
4229
4230 /* Allocate new {g,u}id map list. */
4231 idmap = malloc(sizeof(*idmap));
4232 if (!idmap)
4233 goto on_error;
4234 lxc_list_init(idmap);
4235
4236 /* Find container root. */
0fd73091 4237 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4238 struct id_map *tmpmap;
4239
4240 tmplist = malloc(sizeof(*tmplist));
4241 if (!tmplist)
4242 goto on_error;
4243
4244 tmpmap = malloc(sizeof(*tmpmap));
4245 if (!tmpmap) {
4246 free(tmplist);
4247 goto on_error;
4248 }
4249
4250 memset(tmpmap, 0, sizeof(*tmpmap));
4251 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4252 tmplist->elem = tmpmap;
4253
4254 lxc_list_add_tail(idmap, tmplist);
4255
4256 map = cur->elem;
4257
4258 if (map->idtype == ID_TYPE_UID)
4259 if (euid >= map->hostid && euid < map->hostid + map->range)
4260 host_uid_map = map;
4261
4262 if (map->idtype == ID_TYPE_GID)
4263 if (egid >= map->hostid && egid < map->hostid + map->range)
4264 host_gid_map = map;
4265
4266 if (map->nsid != 0)
4267 continue;
4268
4269 if (map->idtype == ID_TYPE_UID)
4270 if (container_root_uid == NULL)
4271 container_root_uid = map;
4272
4273 if (map->idtype == ID_TYPE_GID)
4274 if (container_root_gid == NULL)
4275 container_root_gid = map;
4276 }
4277
4278 if (!container_root_uid || !container_root_gid) {
4279 ERROR("No mapping for container root found");
4280 goto on_error;
4281 }
4282
4283 /* Check whether the {g,u}id of the user has a mapping. */
4284 if (!host_uid_map)
c4333195 4285 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4286 else
4287 host_uid_map = container_root_uid;
4288
4289 if (!host_gid_map)
c4333195 4290 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4291 else
4292 host_gid_map = container_root_gid;
4293
4294 if (!host_uid_map) {
4295 DEBUG("Failed to find mapping for uid %d", euid);
4296 goto on_error;
4297 }
4298
4299 if (!host_gid_map) {
4300 DEBUG("Failed to find mapping for gid %d", egid);
4301 goto on_error;
4302 }
4303
4304 if (host_uid_map && (host_uid_map != container_root_uid)) {
4305 /* Add container root to the map. */
4306 tmplist = malloc(sizeof(*tmplist));
4307 if (!tmplist)
4308 goto on_error;
4309 lxc_list_add_elem(tmplist, host_uid_map);
4310 lxc_list_add_tail(idmap, tmplist);
4311 }
4312 /* idmap will now keep track of that memory. */
4313 host_uid_map = NULL;
4314
4315 if (host_gid_map && (host_gid_map != container_root_gid)) {
4316 tmplist = malloc(sizeof(*tmplist));
4317 if (!tmplist)
4318 goto on_error;
4319 lxc_list_add_elem(tmplist, host_gid_map);
4320 lxc_list_add_tail(idmap, tmplist);
4321 }
4322 /* idmap will now keep track of that memory. */
4323 host_gid_map = NULL;
4324
4325 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4326 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
0fd73091 4327 lxc_list_for_each (cur, idmap) {
415a8851
CB
4328 map = cur->elem;
4329 TRACE("establishing %cid mapping for \"%d\" in new "
4330 "user namespace: nsuid %lu - hostid %lu - range "
4331 "%lu",
4332 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4333 map->nsid, map->hostid, map->range);
4334 }
4335 }
4336
4337 /* Set up {g,u}id mapping for user namespace of child process. */
4338 ret = lxc_map_ids(idmap, pid);
4339 if (ret < 0) {
0fd73091 4340 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4341 goto on_error;
4342 }
4343
4344 /* Tell child to proceed. */
489f39be 4345 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4346 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4347 goto on_error;
4348 }
4349
686dd5d1 4350on_error:
ee1b16bc
TA
4351 if (p[0] != -1)
4352 close(p[0]);
4353 close(p[1]);
4354
415a8851 4355 /* Wait for child to finish. */
686dd5d1
CB
4356 if (pid > 0)
4357 ret = wait_for_pid(pid);
415a8851 4358
7e621263
CB
4359 if (idmap)
4360 __lxc_free_idmap(idmap);
80758b4b 4361
415a8851
CB
4362 if (host_uid_map && (host_uid_map != container_root_uid))
4363 free(host_uid_map);
4364 if (host_gid_map && (host_gid_map != container_root_gid))
4365 free(host_gid_map);
4366
415a8851
CB
4367 return ret;
4368}
4369
234998b4
CB
4370static int add_idmap_entry(struct lxc_list *idmap, enum idtype idtype,
4371 unsigned long nsid, unsigned long hostid,
4372 unsigned long range)
4373{
4374 __do_free struct id_map *new_idmap = NULL;
4375 __do_free struct lxc_list *new_list = NULL;
4376
4377 new_idmap = zalloc(sizeof(*new_idmap));
4378 if (!new_idmap)
4379 return ret_errno(ENOMEM);
4380
4381 new_idmap->idtype = idtype;
4382 new_idmap->hostid = hostid;
4383 new_idmap->nsid = nsid;
4384 new_idmap->range = range;
4385
4386 new_list = zalloc(sizeof(*new_list));
4387 if (!new_list)
4388 return ret_errno(ENOMEM);
4389
4390 new_list->elem = move_ptr(new_idmap);
4391 lxc_list_add_tail(idmap, move_ptr(new_list));
4392
4393 INFO("Adding id map: type %c nsid %lu hostid %lu range %lu",
4394 idtype == ID_TYPE_UID ? 'u' : 'g', nsid, hostid, range);
4395 return 0;
4396}
4397
4398int userns_exec_mapped_root(const char *path, int path_fd,
4399 const struct lxc_conf *conf)
4400{
7e621263 4401 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
234998b4
CB
4402 __do_close int fd = -EBADF;
4403 int target_fd = -EBADF;
4404 char c = '1';
4405 ssize_t ret;
4406 pid_t pid;
4407 int sock_fds[2];
4408 uid_t container_host_uid, hostuid;
4409 gid_t container_host_gid, hostgid;
4410 struct stat st;
4411
4412 if (!conf || (!path && path_fd < 0))
4413 return ret_errno(EINVAL);
4414
4415 if (!path)
4416 path = "(null)";
4417
4418 container_host_uid = get_mapped_rootid(conf, ID_TYPE_UID);
4419 if (!uid_valid(container_host_uid))
4420 return log_error(-1, "No uid mapping for container root");
4421
4422 container_host_gid = get_mapped_rootid(conf, ID_TYPE_GID);
4423 if (!gid_valid(container_host_gid))
4424 return log_error(-1, "No gid mapping for container root");
4425
cf68ffd9 4426 if (path_fd < 0) {
a72c68f7 4427 fd = open(path, O_CLOEXEC | O_NOCTTY);
234998b4
CB
4428 if (fd < 0)
4429 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
4430 target_fd = fd;
4431 } else {
4432 target_fd = path_fd;
4433 }
4434
4435 hostuid = geteuid();
4436 /* We are root so chown directly. */
4437 if (hostuid == 0) {
4438 ret = fchown(target_fd, container_host_uid, container_host_gid);
4439 if (ret)
4440 return log_error_errno(-errno, errno,
4441 "Failed to fchown(%d(%s), %d, %d)",
4442 target_fd, path, container_host_uid,
4443 container_host_gid);
4444 return log_trace(0, "Chowned %d(%s) to uid %d and %d", target_fd, path,
4445 container_host_uid, container_host_gid);
4446 }
4447
4448 /* The container's root host id matches */
4449 if (container_host_uid == hostuid)
4450 return log_info(0, "Container root id is mapped to our uid");
4451
4452 /* Get the current ids of our target. */
4453 ret = fstat(target_fd, &st);
4454 if (ret)
4455 return log_error_errno(-errno, errno, "Failed to stat \"%s\"", path);
4456
4457 hostgid = getegid();
4458 if (st.st_uid == hostuid && mapped_hostid(st.st_gid, conf, ID_TYPE_GID) < 0) {
4459 ret = fchown(target_fd, -1, hostgid);
4460 if (ret)
4461 return log_error_errno(-errno, errno,
4462 "Failed to fchown(%d(%s), -1, %d)",
4463 target_fd, path, hostgid);
2e8013f9 4464 TRACE("Chowned %d(%s) to -1:%d", target_fd, path, hostgid);
234998b4
CB
4465 }
4466
4467 idmap = malloc(sizeof(*idmap));
4468 if (!idmap)
4469 return -ENOMEM;
4470 lxc_list_init(idmap);
4471
4472 /* "u:0:rootuid:1" */
4473 ret = add_idmap_entry(idmap, ID_TYPE_UID, 0, container_host_uid, 1);
4474 if (ret < 0)
4475 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4476
4477 /* "u:hostuid:hostuid:1" */
4478 ret = add_idmap_entry(idmap, ID_TYPE_UID, hostuid, hostuid, 1);
4479 if (ret < 0)
4480 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4481
4482 /* "g:0:rootgid:1" */
4483 ret = add_idmap_entry(idmap, ID_TYPE_GID, 0, container_host_gid, 1);
4484 if (ret < 0)
4485 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4486
4487 /* "g:hostgid:hostgid:1" */
4488 ret = add_idmap_entry(idmap, ID_TYPE_GID, hostgid, hostgid, 1);
4489 if (ret < 0)
4490 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4491
4492 if (hostgid != st.st_gid) {
4493 /* "g:pathgid:rootgid+pathgid:1" */
4494 ret = add_idmap_entry(idmap, ID_TYPE_GID, st.st_gid,
4495 container_host_gid + (gid_t)st.st_gid, 1);
4496 if (ret < 0)
4497 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4498 }
4499
4500 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4501 if (ret < 0)
4502 return -errno;
4503
4504 pid = fork();
4505 if (pid < 0) {
4506 SYSERROR("Failed to create new process");
4507 goto on_error;
4508 }
4509
4510 if (pid == 0) {
4511 close_prot_errno_disarm(sock_fds[1]);
4512
4513 ret = unshare(CLONE_NEWUSER);
4514 if (ret < 0) {
4515 SYSERROR("Failed to unshare new user namespace");
4516 _exit(EXIT_FAILURE);
4517 }
4518
4519 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4520 if (ret != 1)
4521 _exit(EXIT_FAILURE);
4522
4523 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4524 if (ret != 1)
4525 _exit(EXIT_FAILURE);
4526
4527 close_prot_errno_disarm(sock_fds[0]);
4528
4529 if (!lxc_switch_uid_gid(0, 0))
4530 _exit(EXIT_FAILURE);
4531
4532 if (!lxc_setgroups(0, NULL))
4533 _exit(EXIT_FAILURE);
4534
8053a085 4535 ret = fchown(target_fd, 0, st.st_gid);
234998b4 4536 if (ret) {
2e8013f9 4537 SYSERROR("Failed to chown %d(%s) to -1:%d", target_fd, path, st.st_gid);
234998b4
CB
4538 _exit(EXIT_FAILURE);
4539 }
4540
2e8013f9 4541 TRACE("Chowned %d(%s) to 0:%d", target_fd, path, st.st_gid);
234998b4
CB
4542 _exit(EXIT_SUCCESS);
4543 }
4544
4545 close_prot_errno_disarm(sock_fds[0]);
4546
4547 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4548 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4549 struct id_map *map;
4550 struct lxc_list *it;
4551
4552 lxc_list_for_each(it, idmap) {
4553 map = it->elem;
4554 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4555 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4556 }
4557 }
4558
4559 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4560 if (ret != 1) {
4561 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4562 goto on_error;
4563 }
4564
4565 /* Set up {g,u}id mapping for user namespace of child process. */
4566 ret = lxc_map_ids(idmap, pid);
4567 if (ret < 0) {
4568 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4569 goto on_error;
4570 }
4571
4572 /* Tell child to proceed. */
4573 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4574 if (ret != 1) {
4575 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4576 goto on_error;
4577 }
4578
4579on_error:
4580 close_prot_errno_disarm(sock_fds[0]);
4581 close_prot_errno_disarm(sock_fds[1]);
4582
4583 /* Wait for child to finish. */
4584 if (pid < 0)
4585 return -1;
4586
4587 return wait_for_pid(pid);
4588}
4589
a96a8e8c 4590/* not thread-safe, do not use from api without first forking */
0fd73091 4591static char *getuname(void)
97e9cfa0 4592{
4f410b2a 4593 __do_free char *buf = NULL;
cb7aa5e8
DJ
4594 struct passwd pwent;
4595 struct passwd *pwentp = NULL;
cb7aa5e8
DJ
4596 size_t bufsize;
4597 int ret;
97e9cfa0 4598
cb7aa5e8
DJ
4599 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
4600 if (bufsize == -1)
4601 bufsize = 1024;
4602
4603 buf = malloc(bufsize);
4604 if (!buf)
97e9cfa0
SH
4605 return NULL;
4606
cb7aa5e8
DJ
4607 ret = getpwuid_r(geteuid(), &pwent, buf, bufsize, &pwentp);
4608 if (!pwentp) {
4609 if (ret == 0)
4610 WARN("Could not find matched password record.");
4611
55022530 4612 return log_error(NULL, "Failed to get password record - %u", geteuid());
cb7aa5e8
DJ
4613 }
4614
4f410b2a 4615 return strdup(pwent.pw_name);
97e9cfa0
SH
4616}
4617
a96a8e8c 4618/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4619static char *getgname(void)
4620{
4f410b2a 4621 __do_free char *buf = NULL;
3de9fb4c
DJ
4622 struct group grent;
4623 struct group *grentp = NULL;
3de9fb4c
DJ
4624 size_t bufsize;
4625 int ret;
4626
4627 bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
4628 if (bufsize == -1)
4629 bufsize = 1024;
4630
4631 buf = malloc(bufsize);
4632 if (!buf)
4633 return NULL;
4634
4635 ret = getgrgid_r(getegid(), &grent, buf, bufsize, &grentp);
4636 if (!grentp) {
4637 if (ret == 0)
4638 WARN("Could not find matched group record");
97e9cfa0 4639
55022530 4640 return log_error(NULL, "Failed to get group record - %u", getegid());
3de9fb4c
DJ
4641 }
4642
4f410b2a 4643 return strdup(grent.gr_name);
97e9cfa0
SH
4644}
4645
a96a8e8c 4646/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4647void suggest_default_idmap(void)
4648{
3a6e3bf5 4649 __do_free char *gname = NULL, *line = NULL, *uname = NULL;
4aae564f 4650 __do_fclose FILE *subuid_f = NULL, *subgid_f = NULL;
97e9cfa0 4651 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0
SH
4652 size_t len = 0;
4653
0fd73091
CB
4654 uname = getuname();
4655 if (!uname)
97e9cfa0
SH
4656 return;
4657
0fd73091 4658 gname = getgname();
3a6e3bf5 4659 if (!gname)
97e9cfa0 4660 return;
97e9cfa0 4661
4110345b 4662 subuid_f = fopen(subuidfile, "re");
4aae564f 4663 if (!subuid_f) {
97e9cfa0 4664 ERROR("Your system is not configured with subuids");
97e9cfa0
SH
4665 return;
4666 }
0fd73091 4667
4aae564f 4668 while (getline(&line, &len, subuid_f) != -1) {
0fd73091 4669 char *p, *p2;
b7930180 4670 size_t no_newline = 0;
0fd73091
CB
4671
4672 p = strchr(line, ':');
97e9cfa0
SH
4673 if (*line == '#')
4674 continue;
4675 if (!p)
4676 continue;
4677 *p = '\0';
4678 p++;
0fd73091 4679
97e9cfa0
SH
4680 if (strcmp(line, uname))
4681 continue;
0fd73091 4682
97e9cfa0
SH
4683 p2 = strchr(p, ':');
4684 if (!p2)
4685 continue;
4686 *p2 = '\0';
4687 p2++;
4688 if (!*p2)
4689 continue;
b7930180
CB
4690 no_newline = strcspn(p2, "\n");
4691 p2[no_newline] = '\0';
4692
b7b2fde4 4693 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4694 WARN("Could not parse UID");
b7b2fde4 4695 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4696 WARN("Could not parse UID range");
97e9cfa0 4697 }
97e9cfa0 4698
4110345b 4699 subgid_f = fopen(subgidfile, "re");
4aae564f 4700 if (!subgid_f) {
97e9cfa0 4701 ERROR("Your system is not configured with subgids");
97e9cfa0
SH
4702 return;
4703 }
0fd73091 4704
4aae564f 4705 while (getline(&line, &len, subgid_f) != -1) {
0fd73091 4706 char *p, *p2;
b7930180 4707 size_t no_newline = 0;
0fd73091
CB
4708
4709 p = strchr(line, ':');
97e9cfa0
SH
4710 if (*line == '#')
4711 continue;
4712 if (!p)
4713 continue;
4714 *p = '\0';
4715 p++;
0fd73091 4716
97e9cfa0
SH
4717 if (strcmp(line, uname))
4718 continue;
0fd73091 4719
97e9cfa0
SH
4720 p2 = strchr(p, ':');
4721 if (!p2)
4722 continue;
4723 *p2 = '\0';
4724 p2++;
4725 if (!*p2)
4726 continue;
b7930180
CB
4727 no_newline = strcspn(p2, "\n");
4728 p2[no_newline] = '\0';
4729
b7b2fde4 4730 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4731 WARN("Could not parse GID");
b7b2fde4 4732 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4733 WARN("Could not parse GID range");
97e9cfa0 4734 }
97e9cfa0 4735
97e9cfa0
SH
4736 if (!urange || !grange) {
4737 ERROR("You do not have subuids or subgids allocated");
4738 ERROR("Unprivileged containers require subuids and subgids");
4739 return;
4740 }
4741
4742 ERROR("You must either run as root, or define uid mappings");
4743 ERROR("To pass uid mappings to lxc-create, you could create");
4744 ERROR("~/.config/lxc/default.conf:");
4745 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4746 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4747 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0 4748}
aaf26830 4749
a7307747
SH
4750static void free_cgroup_settings(struct lxc_list *result)
4751{
4752 struct lxc_list *iterator, *next;
4753
0fd73091 4754 lxc_list_for_each_safe (iterator, result, next) {
a7307747 4755 lxc_list_del(iterator);
55022530 4756 free_disarm(iterator);
a7307747 4757 }
55022530 4758 free_disarm(result);
a7307747
SH
4759}
4760
0fd73091 4761/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4762 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4763 */
0fd73091 4764struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4765{
4766 struct lxc_list *result;
aaf26830 4767 struct lxc_cgroup *cg = NULL;
0fd73091 4768 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4769
4770 result = malloc(sizeof(*result));
0fd73091 4771 if (!result)
fac7c663 4772 return NULL;
aaf26830
KT
4773 lxc_list_init(result);
4774
0fd73091
CB
4775 /* Iterate over the cgroup settings and copy them to the output list. */
4776 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4777 item = malloc(sizeof(*item));
fac7c663 4778 if (!item) {
a7307747 4779 free_cgroup_settings(result);
fac7c663
KT
4780 return NULL;
4781 }
0fd73091 4782
aaf26830
KT
4783 item->elem = it->elem;
4784 cg = it->elem;
4785 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4786 /* Store the memsw_limit location */
4787 memsw_limit = item;
0fd73091
CB
4788 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4789 memsw_limit != NULL) {
4790 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4791 * before lxc.cgroup.memory.limit_in_bytes, swap these
4792 * two items */
aaf26830
KT
4793 item->elem = memsw_limit->elem;
4794 memsw_limit->elem = it->elem;
4795 }
4796 lxc_list_add_tail(result, item);
4797 }
4798
4799 return result;
a7307747 4800}