]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
conf, confile: introduce basic structs for shared mount point
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
9d257a2a 27#include <arpa/inet.h>
8f3e280e
CB
28#include <dirent.h>
29#include <errno.h>
30#include <fcntl.h>
31#include <grp.h>
32#include <inttypes.h>
33#include <libgen.h>
9d257a2a
CB
34#include <linux/loop.h>
35#include <net/if.h>
36#include <netinet/in.h>
8f3e280e
CB
37#include <pwd.h>
38#include <stdarg.h>
0ad19a3f 39#include <stdio.h>
0ad19a3f 40#include <stdlib.h>
0ad19a3f 41#include <string.h>
8f3e280e
CB
42#include <sys/mman.h>
43#include <sys/mount.h>
44#include <sys/param.h>
45#include <sys/prctl.h>
6a49f05e 46#include <sys/sendfile.h>
8f3e280e 47#include <sys/socket.h>
9d257a2a 48#include <sys/stat.h>
2d76d1d7 49#include <sys/syscall.h>
9d257a2a 50#include <sys/sysmacros.h>
97e9cfa0 51#include <sys/types.h>
8f3e280e
CB
52#include <sys/utsname.h>
53#include <sys/wait.h>
9d257a2a
CB
54#include <time.h>
55#include <unistd.h>
1d52bdf7 56
af6824fc 57#ifdef MAJOR_IN_MKDEV
9d257a2a 58#include <sys/mkdev.h>
af6824fc 59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
9d257a2a
CB
71#if HAVE_LIBCAP
72#include <sys/capability.h>
73#endif
74
75#if HAVE_SYS_PERSONALITY_H
76#include <sys/personality.h>
77#endif
78
f1e05b90
DJ
79#ifndef HAVE_STRLCAT
80#include "include/strlcat.h"
81#endif
82
9d257a2a
CB
83#if IS_BIONIC
84#include <../include/lxcmntent.h>
85#else
86#include <mntent.h>
87#endif
88
89#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
90#include <../include/prlimit.h>
91#endif
92
e8bd4e43 93#include "af_unix.h"
9d257a2a 94#include "caps.h"
8f3e280e 95#include "cgroup.h"
1b09f2c0 96#include "conf.h"
1ed6ba91 97#include "confile_utils.h"
8f3e280e 98#include "error.h"
1b09f2c0 99#include "log.h"
0ed9b1bc 100#include "lsm/lsm.h"
025ed0f3 101#include "lxclock.h"
8f3e280e 102#include "lxcseccomp.h"
4355ab5f 103#include "namespace.h"
8f3e280e
CB
104#include "network.h"
105#include "parse.h"
732375f5 106#include "ringbuf.h"
794248d0 107#include "start.h"
28d832c4 108#include "storage.h"
28d832c4 109#include "storage/overlay.h"
0ed9b1bc 110#include "terminal.h"
8f3e280e 111#include "utils.h"
d0a36f2c 112
9d257a2a
CB
113#ifndef MS_PRIVATE
114#define MS_PRIVATE (1<<18)
edaf8b1b
SG
115#endif
116
9d257a2a
CB
117#ifndef MS_LAZYTIME
118#define MS_LAZYTIME (1<<25)
f48b5fd8
FF
119#endif
120
ac2cecc4 121lxc_log_define(conf, lxc);
e5bda9ee 122
0fd73091
CB
/* Configuration of the container the current API call is operating on.
 * The error reporting calls consult it. It is thread-local whenever the
 * toolchain supports TLS.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
131
2d76d1d7
SG
/* Make pivot_root() available: use the C library's symbol when it has one,
 * otherwise fall back to a thin wrapper around the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number is known at build time either. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
146
0fd73091
CB
147char *lxchook_names[NUM_LXC_HOOKS] = {
148 "pre-start",
149 "pre-mount",
150 "mount",
151 "autodev",
152 "start",
153 "stop",
154 "post-stop",
155 "clone",
156 "destroy",
157 "start-host"
158};
72d0e1cb 159
998ac676
RT
/* One entry of a mount option lookup table. */
struct mount_opt {
	/* Option keyword as written in a mount option string. */
	char *name;
	/* NOTE(review): presumably non-zero means the keyword clears @flag
	 * rather than setting it (e.g. "rw" vs. MS_RDONLY) - confirm against
	 * the option parser.
	 */
	int clear;
	/* MS_* bit(s) associated with the keyword. */
	int flag;
};
165
81810dd1
DL
/* Maps a capability keyword to its numeric value. */
struct caps_opt {
	/* Keyword, e.g. "sys_admin". */
	char *name;
	/* Matching CAP_* constant. */
	int value;
};
170
c6d09e15
WB
/* Maps a resource limit keyword to its numeric value. */
struct limit_opt {
	/* Keyword, e.g. "nofile". */
	char *name;
	/* Matching RLIMIT_* constant. */
	int value;
};
175
998ac676 176static struct mount_opt mount_opt[] = {
470b359b
CB
177 { "async", 1, MS_SYNCHRONOUS },
178 { "atime", 1, MS_NOATIME },
179 { "bind", 0, MS_BIND },
88d413d5 180 { "defaults", 0, 0 },
88d413d5 181 { "dev", 1, MS_NODEV },
470b359b 182 { "diratime", 1, MS_NODIRATIME },
88d413d5 183 { "dirsync", 0, MS_DIRSYNC },
470b359b 184 { "exec", 1, MS_NOEXEC },
8912711c 185 { "lazytime", 0, MS_LAZYTIME },
88d413d5 186 { "mand", 0, MS_MANDLOCK },
88d413d5 187 { "noatime", 0, MS_NOATIME },
470b359b 188 { "nodev", 0, MS_NODEV },
88d413d5 189 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
190 { "noexec", 0, MS_NOEXEC },
191 { "nomand", 1, MS_MANDLOCK },
192 { "norelatime", 1, MS_RELATIME },
193 { "nostrictatime", 1, MS_STRICTATIME },
194 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
195 { "rbind", 0, MS_BIND|MS_REC },
196 { "relatime", 0, MS_RELATIME },
470b359b
CB
197 { "remount", 0, MS_REMOUNT },
198 { "ro", 0, MS_RDONLY },
199 { "rw", 1, MS_RDONLY },
88d413d5 200 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
201 { "suid", 1, MS_NOSUID },
202 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 203 { NULL, 0, 0 },
998ac676
RT
204};
205
d840039e 206static struct mount_opt propagation_opt[] = {
0fd73091
CB
207 { "private", 0, MS_PRIVATE },
208 { "shared", 0, MS_SHARED },
209 { "slave", 0, MS_SLAVE },
210 { "unbindable", 0, MS_UNBINDABLE },
211 { "rprivate", 0, MS_PRIVATE|MS_REC },
212 { "rshared", 0, MS_SHARED|MS_REC },
213 { "rslave", 0, MS_SLAVE|MS_REC },
214 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
215 { NULL, 0, 0 },
d840039e
YT
216};
217
81810dd1 218static struct caps_opt caps_opt[] = {
8560cd36 219#if HAVE_LIBCAP
0fd73091
CB
220 { "chown", CAP_CHOWN },
221 { "dac_override", CAP_DAC_OVERRIDE },
222 { "dac_read_search", CAP_DAC_READ_SEARCH },
223 { "fowner", CAP_FOWNER },
224 { "fsetid", CAP_FSETID },
225 { "kill", CAP_KILL },
226 { "setgid", CAP_SETGID },
227 { "setuid", CAP_SETUID },
228 { "setpcap", CAP_SETPCAP },
229 { "linux_immutable", CAP_LINUX_IMMUTABLE },
230 { "net_bind_service", CAP_NET_BIND_SERVICE },
231 { "net_broadcast", CAP_NET_BROADCAST },
232 { "net_admin", CAP_NET_ADMIN },
233 { "net_raw", CAP_NET_RAW },
234 { "ipc_lock", CAP_IPC_LOCK },
235 { "ipc_owner", CAP_IPC_OWNER },
236 { "sys_module", CAP_SYS_MODULE },
237 { "sys_rawio", CAP_SYS_RAWIO },
238 { "sys_chroot", CAP_SYS_CHROOT },
239 { "sys_ptrace", CAP_SYS_PTRACE },
240 { "sys_pacct", CAP_SYS_PACCT },
241 { "sys_admin", CAP_SYS_ADMIN },
242 { "sys_boot", CAP_SYS_BOOT },
243 { "sys_nice", CAP_SYS_NICE },
244 { "sys_resource", CAP_SYS_RESOURCE },
245 { "sys_time", CAP_SYS_TIME },
246 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
247 { "mknod", CAP_MKNOD },
248 { "lease", CAP_LEASE },
57b837e2 249#ifdef CAP_AUDIT_READ
0fd73091 250 { "audit_read", CAP_AUDIT_READ },
57b837e2 251#endif
9527e566 252#ifdef CAP_AUDIT_WRITE
0fd73091 253 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
254#endif
255#ifdef CAP_AUDIT_CONTROL
0fd73091 256 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 257#endif
0fd73091
CB
258 { "setfcap", CAP_SETFCAP },
259 { "mac_override", CAP_MAC_OVERRIDE },
260 { "mac_admin", CAP_MAC_ADMIN },
5170c716 261#ifdef CAP_SYSLOG
0fd73091 262 { "syslog", CAP_SYSLOG },
5170c716
CS
263#endif
264#ifdef CAP_WAKE_ALARM
0fd73091 265 { "wake_alarm", CAP_WAKE_ALARM },
5170c716 266#endif
2b54359b 267#ifdef CAP_BLOCK_SUSPEND
0fd73091 268 { "block_suspend", CAP_BLOCK_SUSPEND },
2b54359b 269#endif
495d2046 270#endif
8560cd36 271};
81810dd1 272
c6d09e15
WB
273static struct limit_opt limit_opt[] = {
274#ifdef RLIMIT_AS
275 { "as", RLIMIT_AS },
276#endif
277#ifdef RLIMIT_CORE
278 { "core", RLIMIT_CORE },
279#endif
280#ifdef RLIMIT_CPU
281 { "cpu", RLIMIT_CPU },
282#endif
283#ifdef RLIMIT_DATA
284 { "data", RLIMIT_DATA },
285#endif
286#ifdef RLIMIT_FSIZE
287 { "fsize", RLIMIT_FSIZE },
288#endif
289#ifdef RLIMIT_LOCKS
290 { "locks", RLIMIT_LOCKS },
291#endif
292#ifdef RLIMIT_MEMLOCK
293 { "memlock", RLIMIT_MEMLOCK },
294#endif
295#ifdef RLIMIT_MSGQUEUE
296 { "msgqueue", RLIMIT_MSGQUEUE },
297#endif
298#ifdef RLIMIT_NICE
299 { "nice", RLIMIT_NICE },
300#endif
301#ifdef RLIMIT_NOFILE
302 { "nofile", RLIMIT_NOFILE },
303#endif
304#ifdef RLIMIT_NPROC
305 { "nproc", RLIMIT_NPROC },
306#endif
307#ifdef RLIMIT_RSS
308 { "rss", RLIMIT_RSS },
309#endif
310#ifdef RLIMIT_RTPRIO
311 { "rtprio", RLIMIT_RTPRIO },
312#endif
313#ifdef RLIMIT_RTTIME
314 { "rttime", RLIMIT_RTTIME },
315#endif
316#ifdef RLIMIT_SIGPENDING
317 { "sigpending", RLIMIT_SIGPENDING },
318#endif
319#ifdef RLIMIT_STACK
320 { "stack", RLIMIT_STACK },
321#endif
322};
323
91c3830e
SH
324static int run_buffer(char *buffer)
325{
8e7da691 326 int ret;
0fd73091
CB
327 char *output;
328 struct lxc_popen_FILE *f;
91c3830e 329
ebec9176 330 f = lxc_popen(buffer);
91c3830e 331 if (!f) {
3f60c2f7 332 SYSERROR("Failed to popen() %s", buffer);
91c3830e
SH
333 return -1;
334 }
335
336 output = malloc(LXC_LOG_BUFFER_SIZE);
337 if (!output) {
3f60c2f7 338 ERROR("Failed to allocate memory for %s", buffer);
ebec9176 339 lxc_pclose(f);
91c3830e
SH
340 return -1;
341 }
342
062b72c6 343 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
3f60c2f7 344 DEBUG("Script %s with output: %s", buffer, output);
91c3830e
SH
345
346 free(output);
347
ebec9176 348 ret = lxc_pclose(f);
8e7da691 349 if (ret == -1) {
3f60c2f7 350 SYSERROR("Script exited with error");
91c3830e 351 return -1;
8e7da691 352 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
3f60c2f7 353 ERROR("Script exited with status %d", WEXITSTATUS(ret));
8e7da691
DE
354 return -1;
355 } else if (WIFSIGNALED(ret)) {
3f60c2f7 356 ERROR("Script terminated by signal %d", WTERMSIG(ret));
8e7da691 357 return -1;
91c3830e
SH
358 }
359
360 return 0;
361}
362
14a7b0f9
CB
/* Export @key=@value into the environment, overwriting any previous value,
 * and log the outcome. Returns 0 on success and -1 on failure.
 */
static int run_script_setenv(const char *key, const char *value)
{
	if (setenv(key, value, 1) < 0) {
		SYSERROR("Failed to set environment variable: %s=%s", key, value);
		return -1;
	}

	TRACE("Set environment variable: %s=%s", key, value);
	return 0;
}

/* Build the command line for a hook script and execute it via run_buffer().
 *
 * @name         container name
 * @hook_version 0: name, section and hook name are passed as positional
 *               arguments to the script; 1: hook type/section (and, for the
 *               "net" section, network details taken from @argv) are passed
 *               through LXC_* environment variables instead
 * @section      configuration section the hook belongs to
 * @script       script to execute
 * @hookname     name of the hook
 * @argv         optional NULL-terminated list of extra arguments that are
 *               appended to the command line
 *
 * Returns the result of run_buffer() when the command was run, -EFBIG if the
 * command line would exceed INT_MAX bytes, -ENOMEM if the buffer cannot be
 * allocated and -1 on any other failure.
 */
int run_script_argv(const char *name, unsigned int hook_version,
		    const char *section, const char *script,
		    const char *hookname, char **argv)
{
	int buf_pos, i, ret;
	char *buffer;
	int fret = -1;
	size_t size = 0;

	if (hook_version == 0)
		INFO("Executing script \"%s\" for container \"%s\", config "
		     "section \"%s\"", script, name, section);
	else
		INFO("Executing script \"%s\" for container \"%s\"", script, name);

	/* Each extra argument is preceded by a single space. */
	for (i = 0; argv && argv[i]; i++)
		size += strlen(argv[i]) + 1;

	size += sizeof("exec");
	size += strlen(script);
	size++;

	if (size > INT_MAX)
		return -EFBIG;

	if (hook_version == 0) {
		size += strlen(hookname);
		size++;

		size += strlen(name);
		size++;

		size += strlen(section);
		size++;

		if (size > INT_MAX)
			return -EFBIG;
	}

	buffer = malloc(size);
	if (!buffer)
		return -ENOMEM;

	if (hook_version == 0)
		buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
	else
		buf_pos = snprintf(buffer, size, "exec %s", script);
	if (buf_pos < 0 || (size_t)buf_pos >= size) {
		ERROR("Failed to create command line for script \"%s\"", script);
		goto on_error;
	}

	if (hook_version == 1) {
		if (run_script_setenv("LXC_HOOK_TYPE", hookname) < 0)
			goto on_error;

		if (run_script_setenv("LXC_HOOK_SECTION", section) < 0)
			goto on_error;

		if (strcmp(section, "net") == 0) {
			const char *parent, *peer;

			if (!argv || !argv[0])
				goto on_error;

			if (run_script_setenv("LXC_NET_TYPE", argv[0]) < 0)
				goto on_error;

			parent = argv[1] ? argv[1] : "";
			/* argv[2] only exists when argv[1] is not already the
			 * NULL terminator; never read past the end of the
			 * array.
			 */
			peer = (argv[1] && argv[2]) ? argv[2] : "";

			if (strcmp(argv[0], "macvlan") == 0 ||
			    strcmp(argv[0], "phys") == 0) {
				if (run_script_setenv("LXC_NET_PARENT", parent) < 0)
					goto on_error;
			} else if (strcmp(argv[0], "veth") == 0) {
				if (run_script_setenv("LXC_NET_PEER", peer) < 0)
					goto on_error;

				if (run_script_setenv("LXC_NET_PARENT", parent) < 0)
					goto on_error;
			}
		}
	}

	for (i = 0; argv && argv[i]; i++) {
		size_t len = size - buf_pos;

		ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto on_error;
		}
		buf_pos += ret;
	}

	fret = run_buffer(buffer);

on_error:
	free(buffer);
	return fret;
}
503
/* Build "exec <script> <name> <section> [args...]" and execute it via
 * run_buffer().
 *
 * The variadic arguments are char pointers and must be terminated by a NULL
 * pointer; each one is appended to the command line preceded by a space.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * built (too long, allocation failure or formatting error).
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int pos;
	int fret = -1;
	char *buffer, *arg;
	va_list ap;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
	     script, name, section);

	va_start(ap, script);
	while ((arg = va_arg(ap, char *)))
		size += strlen(arg) + 1;
	va_end(ap);

	size += strlen("exec");
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces plus the terminating NUL byte. */
	size += 4;

	if (size > INT_MAX)
		return -1;

	/* The length is derived from caller supplied strings, so the buffer
	 * lives on the heap rather than on the stack.
	 */
	buffer = malloc(size);
	if (!buffer)
		return -1;

	pos = snprintf(buffer, size, "exec %s %s %s", script, name, section);
	if (pos < 0 || (size_t)pos >= size)
		goto out;

	va_start(ap, script);
	while ((arg = va_arg(ap, char *))) {
		size_t len = size - pos;
		int rc;

		rc = snprintf(buffer + pos, len, " %s", arg);
		if (rc < 0 || (size_t)rc >= len) {
			va_end(ap);
			goto out;
		}
		pos += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
548
0fd73091 549/* pin_rootfs
63fc76c3 550 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
551 * the duration of the container run, to prevent the container from marking
552 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
553 * no name pollution is happens.
554 * don't unlink on NFS to avoid random named stale handles.
0c547523
SH
555 * return -1 on error.
556 * return -2 if nothing needed to be pinned.
557 * return an open fd (>=0) if we pinned it.
558 */
559int pin_rootfs(const char *rootfs)
560{
0fd73091
CB
561 int fd, ret;
562 char absrootfs[MAXPATHLEN], absrootfspin[MAXPATHLEN];
0c547523 563 struct stat s;
63fc76c3 564 struct statfs sfs;
0c547523 565
e99ee0de 566 if (rootfs == NULL || strlen(rootfs) == 0)
0d03360a 567 return -2;
e99ee0de 568
00ec333b 569 if (!realpath(rootfs, absrootfs))
9be53773 570 return -2;
0c547523 571
0fd73091
CB
572 ret = stat(absrootfs, &s);
573 if (ret < 0)
0c547523 574 return -1;
0c547523 575
72f919c4 576 if (!S_ISDIR(s.st_mode))
0c547523
SH
577 return -2;
578
63fc76c3 579 ret = snprintf(absrootfspin, MAXPATHLEN, "%s/.lxc-keep", absrootfs);
00ec333b 580 if (ret >= MAXPATHLEN)
0c547523 581 return -1;
0c547523 582
0fd73091 583 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
b7ed4bf0
CS
584 if (fd < 0)
585 return fd;
0fd73091 586
205fc010
CB
587 ret = fstatfs (fd, &sfs);
588 if (ret < 0)
589 return fd;
63fc76c3
GJ
590
591 if (sfs.f_type == NFS_SUPER_MAGIC) {
205fc010 592 DEBUG("Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
63fc76c3
GJ
593 return fd;
594 }
595
b7ed4bf0 596 (void)unlink(absrootfspin);
0fd73091 597
0c547523
SH
598 return fd;
599}
600
0fd73091
CB
601/* If we are asking to remount something, make sure that any NOEXEC etc are
602 * honored.
e2a7e8dc 603 */
5ae72b98 604unsigned long add_required_remount_flags(const char *s, const char *d,
5285689c 605 unsigned long flags)
e2a7e8dc 606{
614305f3 607#ifdef HAVE_STATVFS
0fd73091 608 int ret;
e2a7e8dc
SH
609 struct statvfs sb;
610 unsigned long required_flags = 0;
611
e2a7e8dc
SH
612 if (!s)
613 s = d;
614
615 if (!s)
616 return flags;
0fd73091
CB
617
618 ret = statvfs(s, &sb);
619 if (ret < 0)
e2a7e8dc
SH
620 return flags;
621
69eadddb
CB
622 if (flags & MS_REMOUNT) {
623 if (sb.f_flag & MS_NOSUID)
624 required_flags |= MS_NOSUID;
625 if (sb.f_flag & MS_NODEV)
626 required_flags |= MS_NODEV;
627 if (sb.f_flag & MS_RDONLY)
628 required_flags |= MS_RDONLY;
629 if (sb.f_flag & MS_NOEXEC)
630 required_flags |= MS_NOEXEC;
631 }
632
633 if (sb.f_flag & MS_NOATIME)
634 required_flags |= MS_NOATIME;
635 if (sb.f_flag & MS_NODIRATIME)
636 required_flags |= MS_NODIRATIME;
637 if (sb.f_flag & MS_LAZYTIME)
638 required_flags |= MS_LAZYTIME;
639 if (sb.f_flag & MS_RELATIME)
640 required_flags |= MS_RELATIME;
641 if (sb.f_flag & MS_STRICTATIME)
642 required_flags |= MS_STRICTATIME;
e2a7e8dc
SH
643
644 return flags | required_flags;
614305f3
SH
645#else
646 return flags;
647#endif
e2a7e8dc
SH
648}
649
4fb3cba5 650static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 651{
0fd73091 652 int i, r;
b06b8511
CS
653 static struct {
654 int match_mask;
655 int match_flag;
656 const char *source;
657 const char *destination;
658 const char *fstype;
659 unsigned long flags;
660 const char *options;
661 } default_mounts[] = {
0fd73091
CB
662 /* Read-only bind-mounting... In older kernels, doing that
663 * required to do one MS_BIND mount and then
664 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
665 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
666 * onwards. However, this apparently does not work on kernel
667 * 3.8. Unfortunately, on that very same kernel, doing the same
668 * trick as above doesn't seem to work either, there one needs
669 * to ALSO specify MS_BIND for the remount, otherwise the
670 * entire fs is remounted read-only or the mount fails because
671 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
672 * kernels as low as 2.6.32...
368bbc02 673 */
0fd73091 674 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a 675 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
0fd73091
CB
676 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
677 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
678 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
679 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
680 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
681 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
682 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
683 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
684 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
685 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
686 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
687 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
688 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
689 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
690 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
691 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 692 };
368bbc02 693
b06b8511 694 for (i = 0; default_mounts[i].match_mask; i++) {
0fd73091
CB
695 int saved_errno;
696 unsigned long mflags;
697 char *destination = NULL;
698 char *source = NULL;
699 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
700 continue;
701
702 if (default_mounts[i].source) {
cc4fd506 703 /* will act like strdup if %r is not present */
0fd73091
CB
704 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
705 if (!source)
cc4fd506 706 return -1;
0fd73091 707 }
f24a52d5 708
0fd73091
CB
709 if (!default_mounts[i].destination) {
710 ERROR("BUG: auto mounts destination %d was NULL", i);
b06b8511 711 free(source);
0fd73091
CB
712 return -1;
713 }
714
715 /* will act like strdup if %r is not present */
716 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
717 if (!destination) {
718 saved_errno = errno;
719 free(source);
720 errno = saved_errno;
721 return -1;
722 }
723
724 mflags = add_required_remount_flags(source, destination,
725 default_mounts[i].flags);
726 r = safe_mount(source, destination, default_mounts[i].fstype,
727 mflags, default_mounts[i].options,
728 conf->rootfs.path ? conf->rootfs.mount : NULL);
729 saved_errno = errno;
730 if (r < 0 && errno == ENOENT) {
731 INFO("Mount source or target for \"%s\" on \"%s\" does "
732 "not exist. Skipping", source, destination);
733 r = 0;
734 } else if (r < 0) {
735 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
736 }
737
738 free(source);
739 free(destination);
740 if (r < 0) {
741 errno = saved_errno;
742 return -1;
368bbc02 743 }
368bbc02
CS
744 }
745
b06b8511 746 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
747 int cg_flags;
748
3f69fb12 749 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
750 /* If the type of cgroup mount was not specified, it depends on
751 * the container's capabilities as to what makes sense: if we
752 * have CAP_SYS_ADMIN, the read-only part can be remounted
753 * read-write anyway, so we may as well default to read-write;
754 * then the admin will not be given a false sense of security.
755 * (And if they really want mixed r/o r/w, then they can
756 * explicitly specify :mixed.) OTOH, if the container lacks
757 * CAP_SYS_ADMIN, do only default to :mixed, because then the
758 * container can't remount it read-write.
759 */
0769b82a
CS
760 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
761 int has_sys_admin = 0;
b0ee5983
CB
762
763 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 764 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 765 else
0769b82a 766 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
767
768 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 769 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 770 else
0769b82a 771 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 772 }
0fd73091 773
3f69fb12 774 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
775 cg_flags |= LXC_AUTO_CGROUP_FORCE;
776
2202afc9
CB
777 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
778 handler,
779 conf->rootfs.path ? conf->rootfs.mount : "",
780 cg_flags)) {
0fd73091 781 SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
b06b8511 782 return -1;
368bbc02
CS
783 }
784 }
785
368bbc02 786 return 0;
368bbc02
CS
787}
788
/* Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to do. Returns 0 on success (or
 * when nothing was done) and -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0) {
		SYSERROR("Failed to set the hostname to \"%s\"", hostname);
		return -1;
	}

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
806
69aa6655
DE
/* A symlink to create below the container's /dev. */
struct dev_symlinks {
	/* Target the symlink points to. */
	const char *oldpath;
	/* Name of the symlink inside /dev. */
	const char *name;
};

/* Standard file descriptor symlinks created by lxc_setup_dev_symlinks(). */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
818
ed8704d0 819static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 820{
0fd73091 821 int i, ret;
69aa6655 822 char path[MAXPATHLEN];
09227be2 823 struct stat s;
69aa6655 824
69aa6655
DE
825 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
826 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091
CB
827
828 ret = snprintf(path, sizeof(path), "%s/dev/%s",
829 rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
830 if (ret < 0 || ret >= MAXPATHLEN)
831 return -1;
09227be2 832
0fd73091
CB
833 /* Stat the path first. If we don't get an error accept it as
834 * is and don't try to create it
09227be2 835 */
0fd73091
CB
836 ret = stat(path, &s);
837 if (ret == 0)
09227be2 838 continue;
09227be2 839
69aa6655
DE
840 ret = symlink(d->oldpath, path);
841 if (ret && errno != EEXIST) {
0fd73091
CB
842 if (errno == EROFS) {
843 WARN("Failed to create \"%s\". Read-only filesystem", path);
09227be2 844 } else {
0fd73091 845 SYSERROR("Failed to create \"%s\"", path);
09227be2
MW
846 return -1;
847 }
69aa6655
DE
848 }
849 }
0fd73091 850
69aa6655
DE
851 return 0;
852}
853
/* Build a space-separated list of ptys to pass to systemd.
 *
 * When *pp is NULL a new string "container_ttys=<name>" is allocated;
 * otherwise " <name>" is appended to the existing string, which is
 * reallocated as needed. Returns false if memory cannot be obtained, in
 * which case an existing *pp is left untouched.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *list;

	if (*pp == NULL) {
		list = malloc(name_len + prefix_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		/* Copies the terminating NUL byte along with the name. */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	list = realloc(*pp, cur_len + name_len + 2);
	if (list == NULL)
		return false;

	list[cur_len] = ' ';
	memcpy(list + cur_len + 1, name, name_len + 1);
	*pp = list;

	return true;
}
880
2187efd3 881static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 882{
9e1045e3 883 int i, ret;
0e4be3cf 884 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 885 char *ttydir = ttys->dir;
7c6ef2a2 886 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 887
e8bd4e43 888 if (!conf->rootfs.path)
bc9bd0e3
DL
889 return 0;
890
885766f5 891 for (i = 0; i < ttys->max; i++) {
0e4be3cf 892 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 893
e8bd4e43 894 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 895 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 896 return -1;
9e1045e3 897
7c6ef2a2
SH
898 if (ttydir) {
899 /* create dev/lxc/tty%d" */
9e1045e3
CB
900 ret = snprintf(lxcpath, sizeof(lxcpath),
901 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 902 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 903 return -1;
9e1045e3 904
3b7e332f 905 ret = mknod(path, S_IFREG | 0000, 0);
9e1045e3 906 if (ret < 0 && errno != EEXIST) {
73363c61 907 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
908 return -1;
909 }
9e1045e3 910
7c6ef2a2 911 ret = unlink(path);
9e1045e3 912 if (ret < 0 && errno != ENOENT) {
73363c61 913 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
914 return -1;
915 }
b0a33c1e 916
2520facd 917 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
9e1045e3 918 if (ret < 0) {
73363c61 919 WARN("Failed to bind mount \"%s\" onto \"%s\"",
2520facd 920 tty->name, path);
7c6ef2a2
SH
921 continue;
922 }
0fd73091 923 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name,
9e1045e3 924 path);
13954cce 925
9e1045e3
CB
926 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
927 ttydir, i + 1);
73363c61 928 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 929 return -1;
9e1045e3 930
7c6ef2a2 931 ret = symlink(lxcpath, path);
9e1045e3 932 if (ret < 0) {
73363c61 933 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 934 path, lxcpath);
7c6ef2a2
SH
935 return -1;
936 }
937 } else {
9e1045e3
CB
938 /* If we populated /dev, then we need to create
939 * /dev/ttyN
940 */
d3ccc04e
CB
941 ret = mknod(path, S_IFREG | 0000, 0);
942 if (ret < 0) /* this isn't fatal, continue */
6d1400b5 943 SYSERROR("Failed to create \"%s\"", path);
9e1045e3 944
2520facd 945 ret = mount(tty->name, path, "none", MS_BIND, 0);
9e1045e3 946 if (ret < 0) {
2520facd 947 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
7c6ef2a2
SH
948 continue;
949 }
9e1045e3 950
d3ccc04e 951 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
393903d1 952 }
9e1045e3 953
885766f5 954 if (!append_ttyname(&conf->ttys.tty_names, tty->name)) {
393903d1
SH
955 ERROR("Error setting up container_ttys string");
956 return -1;
b0a33c1e 957 }
958 }
959
885766f5 960 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 961 return 0;
962}
963
663014ee 964int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 965{
2187efd3 966 int i, ret;
0fd73091 967 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3
CB
968
969 /* no tty in the configuration */
885766f5 970 if (ttys->max == 0)
2187efd3
CB
971 return 0;
972
885766f5 973 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
0e4be3cf 974 if (!ttys->tty)
2187efd3 975 return -ENOMEM;
2187efd3 976
885766f5 977 for (i = 0; i < ttys->max; i++) {
0e4be3cf 978 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 979
386e6768
CB
980 tty->master = -EBADF;
981 tty->slave = -EBADF;
77a39805
CB
982 ret = openpty(&tty->master, &tty->slave, NULL, NULL, NULL);
983 if (ret < 0) {
0fd73091 984 SYSERROR("Failed to create tty %d", i);
885766f5 985 ttys->max = i;
0e4be3cf 986 lxc_delete_tty(ttys);
2187efd3
CB
987 return -ENOTTY;
988 }
989
77a39805
CB
990 ret = ttyname_r(tty->slave, tty->name, sizeof(tty->name));
991 if (ret < 0) {
992 SYSERROR("Failed to retrieve name of tty %d slave", i);
993 ttys->max = i;
994 lxc_delete_tty(ttys);
995 return -ENOTTY;
996 }
997
0fd73091 998 DEBUG("Created tty \"%s\" with master fd %d and slave fd %d",
2520facd 999 tty->name, tty->master, tty->slave);
2187efd3
CB
1000
1001 /* Prevent leaking the file descriptors to the container */
615f24ff 1002 ret = fd_cloexec(tty->master, true);
2187efd3 1003 if (ret < 0)
a24c5678 1004 SYSWARN("Failed to set FD_CLOEXEC flag on master fd %d of "
1005 "tty device \"%s\"", tty->master, tty->name);
2187efd3 1006
615f24ff 1007 ret = fd_cloexec(tty->slave, true);
2187efd3 1008 if (ret < 0)
a24c5678 1009 SYSWARN("Failed to set FD_CLOEXEC flag on slave fd %d of "
1010 "tty device \"%s\"", tty->slave, tty->name);
2187efd3 1011
2520facd 1012 tty->busy = 0;
2187efd3
CB
1013 }
1014
885766f5 1015 INFO("Finished creating %zu tty devices", ttys->max);
2187efd3
CB
1016 return 0;
1017}
1018
0e4be3cf 1019void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3
CB
1020{
1021 int i;
1022
386e6768
CB
1023 if (!ttys->tty)
1024 return;
1025
885766f5 1026 for (i = 0; i < ttys->max; i++) {
0e4be3cf 1027 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1028
386e6768
CB
1029 if (tty->master >= 0) {
1030 close(tty->master);
1031 tty->master = -EBADF;
1032 }
1033
1034 if (tty->slave >= 0) {
1035 close(tty->slave);
1036 tty->slave = -EBADF;
1037 }
2187efd3
CB
1038 }
1039
0e4be3cf
CB
1040 free(ttys->tty);
1041 ttys->tty = NULL;
2187efd3
CB
1042}
1043
1044static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1045{
1046 int i;
0fd73091 1047 int ret = -1;
2187efd3 1048 struct lxc_conf *conf = handler->conf;
0e4be3cf 1049 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 1050 int sock = handler->data_sock[0];
2187efd3 1051
885766f5 1052 if (ttys->max == 0)
2187efd3
CB
1053 return 0;
1054
885766f5 1055 for (i = 0; i < ttys->max; i++) {
2187efd3 1056 int ttyfds[2];
0e4be3cf 1057 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1058
2520facd
CB
1059 ttyfds[0] = tty->master;
1060 ttyfds[1] = tty->slave;
2187efd3
CB
1061
1062 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1063 if (ret < 0)
1064 break;
1065
0fd73091 1066 TRACE("Sent ty \"%s\" with master fd %d and slave fd %d to "
2520facd 1067 "parent", tty->name, tty->master, tty->slave);
2187efd3
CB
1068 }
1069
1070 if (ret < 0)
6d1400b5 1071 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1072 else
885766f5 1073 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1074
1075 return ret;
1076}
1077
1078static int lxc_create_ttys(struct lxc_handler *handler)
1079{
1080 int ret = -1;
1081 struct lxc_conf *conf = handler->conf;
1082
663014ee 1083 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1084 if (ret < 0) {
1085 ERROR("Failed to allocate ttys");
1086 goto on_error;
1087 }
1088
1089 ret = lxc_send_ttys_to_parent(handler);
1090 if (ret < 0) {
1091 ERROR("Failed to send ttys to parent");
1092 goto on_error;
1093 }
1094
1095 if (!conf->is_execute) {
1096 ret = lxc_setup_ttys(conf);
1097 if (ret < 0) {
1098 ERROR("Failed to setup ttys");
1099 goto on_error;
1100 }
1101 }
1102
885766f5
CB
1103 if (conf->ttys.tty_names) {
1104 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1105 if (ret < 0)
885766f5 1106 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1107 }
1108
1109 ret = 0;
1110
1111on_error:
0e4be3cf 1112 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1113
1114 return ret;
1115}
1116
59bb8698 1117static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1118{
0fd73091
CB
1119 int ret;
1120 int newroot = -1, oldroot = -1;
bf601689 1121
2d489f9e
SH
1122 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1123 if (oldroot < 0) {
0fd73091 1124 SYSERROR("Failed to open old root directory");
9ba8130c
SH
1125 return -1;
1126 }
0fd73091 1127
2d489f9e
SH
1128 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1129 if (newroot < 0) {
0fd73091
CB
1130 SYSERROR("Failed to open new root directory");
1131 goto on_error;
c08556c6 1132 }
bf601689 1133
cc6f6dd7 1134 /* change into new root fs */
0fd73091
CB
1135 ret = fchdir(newroot);
1136 if (ret < 0) {
1137 SYSERROR("Failed to change to new rootfs \"%s\"", rootfs);
1138 goto on_error;
cc6f6dd7
DL
1139 }
1140
cc6f6dd7 1141 /* pivot_root into our new root fs */
0fd73091
CB
1142 ret = pivot_root(".", ".");
1143 if (ret < 0) {
1144 SYSERROR("Failed to pivot_root()");
1145 goto on_error;
bf601689 1146 }
cc6f6dd7 1147
e599717b 1148 /* At this point the old-root is mounted on top of our new-root. To
0fd73091
CB
1149 * unmounted it we must not be chdir'd into it, so escape back to
1150 * old-root.
2d489f9e 1151 */
0fd73091
CB
1152 ret = fchdir(oldroot);
1153 if (ret < 0) {
1154 SYSERROR("Failed to enter old root directory");
1155 goto on_error;
2d489f9e 1156 }
0fd73091 1157
e599717b
FW
1158 /* Make oldroot rslave to make sure our umounts don't propagate to the
1159 * host.
1160 */
1161 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
1162 if (ret < 0) {
1163 SYSERROR("Failed to make oldroot rslave");
1164 goto on_error;
1165 }
1166
0fd73091
CB
1167 ret = umount2(".", MNT_DETACH);
1168 if (ret < 0) {
1169 SYSERROR("Failed to detach old root directory");
1170 goto on_error;
cc6f6dd7
DL
1171 }
1172
0fd73091
CB
1173 ret = fchdir(newroot);
1174 if (ret < 0) {
1175 SYSERROR("Failed to re-enter new root directory");
1176 goto on_error;
2d489f9e 1177 }
cc6f6dd7 1178
2d489f9e
SH
1179 close(oldroot);
1180 close(newroot);
bf601689 1181
0fd73091 1182 DEBUG("pivot_root(\"%s\") successful", rootfs);
bf601689 1183
bf601689 1184 return 0;
2d489f9e 1185
0fd73091 1186on_error:
2d489f9e
SH
1187 if (oldroot != -1)
1188 close(oldroot);
1189 if (newroot != -1)
1190 close(newroot);
0fd73091 1191
2d489f9e 1192 return -1;
bf601689
MH
1193}
1194
7133b912
CB
1195/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1196 * error, log it but don't fail yet.
91c3830e 1197 */
7133b912
CB
1198static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1199 const char *lxcpath)
91c3830e
SH
1200{
1201 int ret;
87da4ec3
SH
1202 size_t clen;
1203 char *path;
87e0e273 1204 mode_t cur_mask;
91c3830e 1205
7133b912 1206 INFO("Preparing \"/dev\"");
bc6928ff 1207
14221cbb 1208 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1209 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1210 path = alloca(clen);
bc6928ff 1211
ec50007f 1212 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1213 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1214 return -1;
bc6928ff 1215
87e0e273
CB
1216 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1217 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1218 if (ret < 0 && errno != EEXIST) {
1219 SYSERROR("Failed to create \"/dev\" directory");
1220 ret = -errno;
1221 goto reset_umask;
bc6928ff 1222 }
87da4ec3 1223
1ec0e8e3 1224 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1225 rootfs->path ? rootfs->mount : NULL);
1226 if (ret < 0) {
1227 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
87e0e273 1228 goto reset_umask;
91c3830e 1229 }
87e0e273 1230 TRACE("Mounted tmpfs on \"%s\"", path);
87da4ec3 1231
ec50007f 1232 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87e0e273
CB
1233 if (ret < 0 || (size_t)ret >= clen) {
1234 ret = -1;
1235 goto reset_umask;
1236 }
87da4ec3 1237
7133b912 1238 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1239 * If not, then create it and exit if that fails...
1240 */
87e0e273
CB
1241 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1242 if (ret < 0 && errno != EEXIST) {
1243 SYSERROR("Failed to create directory \"%s\"", path);
1244 ret = -errno;
1245 goto reset_umask;
91c3830e
SH
1246 }
1247
87e0e273
CB
1248 ret = 0;
1249
1250reset_umask:
1251 (void)umask(cur_mask);
1252
7133b912 1253 INFO("Prepared \"/dev\"");
87e0e273 1254 return ret;
91c3830e
SH
1255}
1256
/* Description of one device node to provide in the container's "/dev". */
struct lxc_device_node {
	/* Name of the node below "/dev". */
	const char *name;
	/* File type and permission bits passed to mknod(). */
	const mode_t mode;
	/* Major device number. */
	const int maj;
	/* Minor device number. */
	const int min;
};
1263
5e73416f 1264static const struct lxc_device_node lxc_devices[] = {
06749971 1265 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
5e73416f 1266 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
06749971
CB
1267 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1268 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
5e73416f
CB
1269 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1270 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
c6883f38
SH
1271};
1272
5067e4dd
CB
1273
/* Strategy state used by lxc_fill_autodev(). The numeric order matters: the
 * code there tests "state >= LXC_DEVNODE_MKNOD" to decide whether mknod()
 * should still be attempted.
 */
enum {
	/* mknod() is not permitted: bind-mount the host's device nodes. */
	LXC_DEVNODE_BIND = 0,
	/* Initial state: try to create device nodes with mknod(). */
	LXC_DEVNODE_MKNOD = 1,
	/* Nodes can be created but not opened; they serve as mount targets. */
	LXC_DEVNODE_PARTIAL = 2,
	/* Nodes can be created and opened: nothing else is required. */
	LXC_DEVNODE_OPEN = 3,
};
1280
27245ff7 1281static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38 1282{
5e73416f 1283 int i, ret;
c6883f38 1284 char path[MAXPATHLEN];
3a32201c 1285 mode_t cmask;
5067e4dd 1286 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1287
3999be0a
CB
1288 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1289 rootfs->path ? rootfs->mount : "");
1290 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1291 return -1;
91c3830e 1292
0bbf8572
CB
1293 /* ignore, just don't try to fill in */
1294 if (!dir_exists(path))
9cb4d183
SH
1295 return 0;
1296
3999be0a
CB
1297 INFO("Populating \"/dev\"");
1298
3a32201c 1299 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f
CB
1300 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
1301 char hostpath[MAXPATHLEN];
1302 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1303
3999be0a 1304 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
5e73416f 1305 rootfs->path ? rootfs->mount : "", device->name);
c6883f38
SH
1306 if (ret < 0 || ret >= MAXPATHLEN)
1307 return -1;
0bbf8572 1308
5067e4dd 1309 if (use_mknod >= LXC_DEVNODE_MKNOD) {
5e73416f
CB
1310 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1311 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1312 DEBUG("Created device node \"%s\"", path);
5067e4dd
CB
1313 } else if (ret < 0) {
1314 if (errno != EPERM) {
1315 SYSERROR("Failed to create device node \"%s\"", path);
1316 return -1;
1317 }
0bbf8572 1318
5067e4dd 1319 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1320 }
3999be0a 1321
5067e4dd
CB
1322 /* Device nodes are fully useable. */
1323 if (use_mknod == LXC_DEVNODE_OPEN)
1324 continue;
1325
1326 if (use_mknod == LXC_DEVNODE_MKNOD) {
1327 /* See
1328 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1329 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1330 */
1331 ret = open(path, O_RDONLY | O_CLOEXEC);
1332 if (ret >= 0) {
1333 close(ret);
1334 /* Device nodes are fully useable. */
1335 use_mknod = LXC_DEVNODE_OPEN;
1336 continue;
1337 }
1338
1339 SYSTRACE("Failed to open \"%s\" device", path);
1340 /* Device nodes are only partially useable. */
1341 use_mknod = LXC_DEVNODE_PARTIAL;
1342 }
5e73416f
CB
1343 }
1344
5067e4dd
CB
1345 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1346 /* If we are dealing with partially functional device
1347 * nodes the prio mknod() call will have created the
1348 * device node so we can use it as a bind-mount target.
1349 */
1350 ret = mknod(path, S_IFREG | 0000, 0);
1351 if (ret < 0 && errno != EEXIST) {
1352 SYSERROR("Failed to create file \"%s\"", path);
1353 return -1;
1354 }
5e73416f
CB
1355 }
1356
1357 /* Fallback to bind-mounting the device from the host. */
1358 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", device->name);
1359 if (ret < 0 || ret >= MAXPATHLEN)
1360 return -1;
1361
1362 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1363 rootfs->path ? rootfs->mount : NULL);
1364 if (ret < 0) {
1365 SYSERROR("Failed to bind mount host device node \"%s\" "
1366 "onto \"%s\"", hostpath, path);
1367 return -1;
c6883f38 1368 }
5e73416f
CB
1369 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
1370 hostpath, path);
c6883f38 1371 }
5e73416f 1372 (void)umask(cmask);
c6883f38 1373
3999be0a 1374 INFO("Populated \"/dev\"");
c6883f38
SH
1375 return 0;
1376}
1377
9aa76a17 1378static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1379{
9aa76a17 1380 int ret;
10bc1861 1381 struct lxc_storage *bdev;
91c3e281 1382 const struct lxc_rootfs *rootfs;
cc28d0b0 1383
91c3e281 1384 rootfs = &conf->rootfs;
a0f379bf 1385 if (!rootfs->path) {
0fd73091
CB
1386 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1387 if (ret < 0) {
1388 SYSERROR("Failed to make / rslave");
a0f379bf
DW
1389 return -1;
1390 }
0fd73091 1391
c69bd12f 1392 return 0;
a0f379bf 1393 }
0ad19a3f 1394
0fd73091
CB
1395 ret = access(rootfs->mount, F_OK);
1396 if (ret != 0) {
1397 SYSERROR("Failed to access to \"%s\". Check it is present",
12297168 1398 rootfs->mount);
b1789442
DL
1399 return -1;
1400 }
1401
8a388ed4 1402 bdev = storage_init(conf);
9aa76a17 1403 if (!bdev) {
0fd73091 1404 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1405 rootfs->path, rootfs->mount,
1406 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1407 return -1;
9be53773 1408 }
9aa76a17
CB
1409
1410 ret = bdev->ops->mount(bdev);
10bc1861 1411 storage_put(bdev);
9aa76a17 1412 if (ret < 0) {
0fd73091 1413 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1414 rootfs->path, rootfs->mount,
1415 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1416 return -1;
1417 }
0ad19a3f 1418
0fd73091 1419 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1420 rootfs->path, rootfs->mount,
1421 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1422
ac778708
DL
1423 return 0;
1424}
1425
91e93c71
AV
1426int prepare_ramfs_root(char *root)
1427{
0fd73091
CB
1428 int i, ret;
1429 char *p, *p2;
1430 char buf[LXC_LINELEN], nroot[PATH_MAX];
91e93c71 1431 FILE *f;
91e93c71 1432
0fd73091
CB
1433 if (!realpath(root, nroot))
1434 return -1;
91e93c71 1435
0fd73091
CB
1436 ret = chdir("/");
1437 if (ret < 0)
1438 return -1;
91e93c71 1439
0fd73091
CB
1440 /* We could use here MS_MOVE, but in userns this mount is locked and
1441 * can't be moved.
91e93c71 1442 */
0fd73091
CB
1443 ret = mount(root, "/", NULL, MS_REC | MS_BIND, NULL);
1444 if (ret < 0) {
1445 SYSERROR("Failed to move \"%s\" into \"/\"", root);
1446 return -1;
91e93c71
AV
1447 }
1448
0fd73091
CB
1449 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
1450 if (ret < 0) {
1451 SYSERROR("Failed to make \"/\" rprivate");
1452 return -1;
91e93c71
AV
1453 }
1454
0fd73091
CB
1455 /* The following code cleans up inhereted mounts which are not required
1456 * for CT.
91e93c71
AV
1457 *
1458 * The mountinfo file shows not all mounts, if a few points have been
1459 * unmounted between read operations from the mountinfo. So we need to
1460 * read mountinfo a few times.
1461 *
1462 * This loop can be skipped if a container uses unserns, because all
1463 * inherited mounts are locked and we should live with all this trash.
1464 */
0fd73091 1465 for (;;) {
91e93c71
AV
1466 int progress = 0;
1467
1468 f = fopen("./proc/self/mountinfo", "r");
1469 if (!f) {
1470 SYSERROR("Unable to open /proc/self/mountinfo");
1471 return -1;
1472 }
0fd73091 1473
eab15c1e 1474 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1475 for (p = buf, i=0; p && i < 4; i++)
1476 p = strchr(p+1, ' ');
0fd73091 1477
91e93c71
AV
1478 if (!p)
1479 continue;
0fd73091 1480
91e93c71
AV
1481 p2 = strchr(p+1, ' ');
1482 if (!p2)
1483 continue;
1484
1485 *p2 = '\0';
1486 *p = '.';
1487
1488 if (strcmp(p + 1, "/") == 0)
1489 continue;
0fd73091 1490
91e93c71
AV
1491 if (strcmp(p + 1, "/proc") == 0)
1492 continue;
1493
0fd73091
CB
1494 ret = umount2(p, MNT_DETACH);
1495 if (ret == 0)
91e93c71
AV
1496 progress++;
1497 }
0fd73091 1498
91e93c71 1499 fclose(f);
0fd73091 1500
91e93c71
AV
1501 if (!progress)
1502 break;
1503 }
1504
0fd73091
CB
1505 /* This also can be skipped if a container uses unserns. */
1506 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1507
1508 /* It is weird, but chdir("..") moves us in a new root */
0fd73091
CB
1509 ret = chdir("..");
1510 if (ret < 0) {
91e93c71
AV
1511 SYSERROR("Unable to change working directory");
1512 return -1;
1513 }
1514
0fd73091
CB
1515 ret = chroot(".");
1516 if (ret < 0) {
91e93c71
AV
1517 SYSERROR("Unable to chroot");
1518 return -1;
1519 }
1520
1521 return 0;
1522}
1523
74a3920a 1524static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1525{
0fd73091
CB
1526 int ret;
1527
39c7b795 1528 if (!rootfs->path) {
0fd73091 1529 DEBUG("Container does not have a rootfs");
ac778708 1530 return 0;
39c7b795 1531 }
ac778708 1532
91e93c71 1533 if (detect_ramfs_rootfs()) {
0fd73091
CB
1534 DEBUG("Detected that container is on ramfs");
1535
1536 ret = prepare_ramfs_root(rootfs->mount);
1537 if (ret < 0) {
1538 ERROR("Failed to prepare minimal ramfs root");
91e93c71 1539 return -1;
39c7b795
CB
1540 }
1541
0fd73091 1542 DEBUG("Prepared ramfs root for container");
39c7b795
CB
1543 return 0;
1544 }
1545
0fd73091
CB
1546 ret = setup_rootfs_pivot_root(rootfs->mount);
1547 if (ret < 0) {
1548 ERROR("Failed to pivot_root()");
25368b52 1549 return -1;
c69bd12f
DL
1550 }
1551
0fd73091 1552 DEBUG("Finished pivot_root()");
25368b52 1553 return 0;
0ad19a3f 1554}
1555
5173b710 1556static const struct id_map *find_mapped_nsid_entry(struct lxc_conf *conf, unsigned id,
f4900711
CB
1557 enum idtype idtype)
1558{
1559 struct lxc_list *it;
1560 struct id_map *map;
1561 struct id_map *retmap = NULL;
1562
dcf0ffdf
CB
1563 /* Shortcut for container's root mappings. */
1564 if (id == 0) {
1565 if (idtype == ID_TYPE_UID)
1566 return conf->root_nsuid_map;
1567
1568 if (idtype == ID_TYPE_GID)
1569 return conf->root_nsgid_map;
1570 }
1571
f4900711
CB
1572 lxc_list_for_each(it, &conf->id_map) {
1573 map = it->elem;
1574 if (map->idtype != idtype)
1575 continue;
1576
1577 if (id >= map->nsid && id < map->nsid + map->range) {
1578 retmap = map;
1579 break;
1580 }
1581 }
1582
1583 return retmap;
1584}
1585
1586static int lxc_setup_devpts(struct lxc_conf *conf)
3c26f34e 1587{
70761e5e 1588 int ret;
11293068 1589 const char *default_devpts_mntopts = "gid=5,newinstance,ptmxmode=0666,mode=0620";
9d28c4f9 1590 char devpts_mntopts[256];
77890c6d 1591
e528c735 1592 if (conf->pty_max <= 0) {
0fd73091 1593 DEBUG("No new devpts instance will be mounted since no pts "
70761e5e 1594 "devices are requested");
d852c78c 1595 return 0;
3c26f34e 1596 }
1597
e528c735
CB
1598 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1599 default_devpts_mntopts, conf->pty_max);
9d28c4f9
CB
1600 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1601 return -1;
1602
77f94854
CB
1603 ret = umount2("/dev/pts", MNT_DETACH);
1604 if (ret < 0)
a24c5678 1605 SYSWARN("Failed to unmount old devpts instance");
77f94854 1606 else
0fd73091 1607 DEBUG("Unmounted old devpts instance");
7e40254a 1608
70761e5e
CB
1609 /* Create mountpoint for devpts instance. */
1610 ret = mkdir("/dev/pts", 0755);
1611 if (ret < 0 && errno != EEXIST) {
0fd73091 1612 SYSERROR("Failed to create \"/dev/pts\" directory");
3c26f34e 1613 return -1;
1614 }
1615
11293068 1616 /* mount new devpts instance */
f4900711 1617 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, devpts_mntopts);
70761e5e 1618 if (ret < 0) {
11293068
CB
1619 /* try mounting without gid=5 */
1620 ret = mount("devpts", "/dev/pts", "devpts",
1621 MS_NOSUID | MS_NOEXEC, devpts_mntopts + sizeof("gid=5"));
1622 if (ret < 0) {
1623 SYSERROR("Failed to mount new devpts instance");
1624 return -1;
1625 }
70761e5e 1626 }
0fd73091 1627 DEBUG("Mount new devpts instance with options \"%s\"", devpts_mntopts);
70761e5e 1628
d5cb35d6 1629 /* Remove any pre-existing /dev/ptmx file. */
b29e05d6
CB
1630 ret = remove("/dev/ptmx");
1631 if (ret < 0) {
1632 if (errno != ENOENT) {
0fd73091 1633 SYSERROR("Failed to remove existing \"/dev/ptmx\" file");
d5cb35d6 1634 return -1;
70761e5e 1635 }
b29e05d6 1636 } else {
0fd73091 1637 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1638 }
1639
d5cb35d6 1640 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
3b7e332f
CB
1641 ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
1642 if (ret < 0 && errno != EEXIST) {
0fd73091 1643 SYSERROR("Failed to create dummy \"/dev/ptmx\" file as bind mount target");
d5cb35d6
CB
1644 return -1;
1645 }
0fd73091 1646 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1647
d5cb35d6 1648 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1649 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
d5cb35d6 1650 if (!ret) {
0fd73091 1651 DEBUG("Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1652 return 0;
1653 } else {
1654 /* Fallthrough and try to create a symlink. */
0fd73091 1655 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1656 }
1657
1658 /* Remove the dummy /dev/ptmx file we created above. */
1659 ret = remove("/dev/ptmx");
70761e5e 1660 if (ret < 0) {
0fd73091 1661 SYSERROR("Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1662 return -1;
1663 }
1664
1665 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1666 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
1667 if (ret < 0) {
0fd73091 1668 SYSERROR("Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1669 return -1;
1670 }
0fd73091 1671 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1672
3c26f34e 1673 return 0;
1674}
1675
cccc74b5
DL
/* Apply the execution domain @persona with personality().
 *
 * Nothing is done when @persona is -1 or when the build does not have
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset).
 *
 * Returns 0 on success and -1 when personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	int ret;

	if (persona == -1)
		return 0;

	ret = personality(persona);
	if (ret < 0) {
		SYSERROR("Failed to set personality to \"0x%x\"", persona);
		return -1;
	}

	INFO("Set personality to \"0x%x\"", persona);
#else
	/* Avoid an unused parameter warning on builds without the header. */
	(void)persona;
#endif

	return 0;
}
1695
3d7d929a 1696static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
dcad02f8 1697 const struct lxc_terminal *console)
6e590161 1698{
882671aa 1699 int ret;
63376d7d 1700 char path[MAXPATHLEN];
86530b0a 1701 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1702
8b1b1210
CB
1703 if (console->path && !strcmp(console->path, "none"))
1704 return 0;
1705
86530b0a 1706 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3d7d929a 1707 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1708 return -1;
52e35957 1709
8b1b1210
CB
1710 /* When we are asked to setup a console we remove any previous
1711 * /dev/console bind-mounts.
1712 */
a7ba3c7f
CB
1713 if (file_exists(path)) {
1714 ret = lxc_unstack_mountpoint(path, false);
1715 if (ret < 0) {
6d1400b5 1716 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1717 return -ret;
1718 } else {
86530b0a 1719 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1720 }
8b1b1210
CB
1721 }
1722
1723 /* For unprivileged containers autodev or automounts will already have
1724 * taken care of creating /dev/console.
1725 */
882671aa 1726 ret = mknod(path, S_IFREG | 0000, 0);
3b7e332f
CB
1727 if (ret < 0 && errno != EEXIST) {
1728 SYSERROR("Failed to create console");
1729 return -errno;
52e35957
DL
1730 }
1731
882671aa 1732 ret = fchmod(console->slave, S_IXUSR | S_IXGRP | S_IXOTH);
86530b0a 1733 if (ret < 0) {
0fd73091
CB
1734 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1735 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
3d7d929a 1736 return -errno;
63376d7d 1737 }
13954cce 1738
86530b0a
L
1739 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
1740 if (ret < 0) {
0fd73091 1741 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, path);
6e590161 1742 return -1;
1743 }
1744
86530b0a 1745 DEBUG("Mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1746 return 0;
1747}
1748
3d7d929a 1749static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
dcad02f8 1750 const struct lxc_terminal *console,
3d7d929a 1751 char *ttydir)
7c6ef2a2 1752{
3b7e332f 1753 int ret;
3d7d929a 1754 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
86530b0a 1755 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2 1756
3dc035f1
L
1757 if (console->path && !strcmp(console->path, "none"))
1758 return 0;
1759
7c6ef2a2 1760 /* create rootfs/dev/<ttydir> directory */
86530b0a 1761 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
3d7d929a 1762 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1763 return -1;
3d7d929a 1764
7c6ef2a2
SH
1765 ret = mkdir(path, 0755);
1766 if (ret && errno != EEXIST) {
0fd73091 1767 SYSERROR("Failed to create \"%s\"", path);
3d7d929a 1768 return -errno;
7c6ef2a2 1769 }
4742cd9a 1770 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1771
86530b0a 1772 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
3d7d929a
CB
1773 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1774 return -1;
1775
3b7e332f
CB
1776 ret = mknod(lxcpath, S_IFREG | 0000, 0);
1777 if (ret < 0 && errno != EEXIST) {
0fd73091 1778 SYSERROR("Failed to create \"%s\"", lxcpath);
3d7d929a 1779 return -errno;
7c6ef2a2 1780 }
7c6ef2a2 1781
86530b0a 1782 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3dc035f1 1783 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1784 return -1;
2a12fefd 1785
3dc035f1 1786 if (file_exists(path)) {
a7ba3c7f 1787 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1788 if (ret < 0) {
6d1400b5 1789 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1790 return -ret;
1791 } else {
86530b0a 1792 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1793 }
3dc035f1 1794 }
2a12fefd 1795
3b7e332f
CB
1796 ret = mknod(path, S_IFREG | 0000, 0);
1797 if (ret < 0 && errno != EEXIST) {
1798 SYSERROR("Failed to create console");
1799 return -errno;
7c6ef2a2
SH
1800 }
1801
3b7e332f 1802 ret = fchmod(console->slave, S_IXUSR | S_IXGRP | S_IXOTH);
86530b0a 1803 if (ret < 0) {
0fd73091
CB
1804 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1805 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
2a12fefd
CB
1806 return -errno;
1807 }
1808
3dc035f1 1809 /* bind mount console->name to '/dev/<ttydir>/console' */
86530b0a
L
1810 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1811 if (ret < 0) {
0fd73091 1812 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1813 return -1;
1814 }
86530b0a 1815 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1
L
1816
1817 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
86530b0a
L
1818 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1819 if (ret < 0) {
0fd73091 1820 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
3dc035f1
L
1821 return -1;
1822 }
86530b0a 1823 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1824
86530b0a 1825 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1826 return 0;
1827}
1828
/* Set up the container's console: below "/dev/<ttydir>" when @ttydir is given,
 * directly on "/dev/console" otherwise. Returns what the selected helper
 * returns.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir)
{
	if (ttydir)
		return lxc_setup_ttydir_console(rootfs, console, ttydir);

	return lxc_setup_dev_console(rootfs, console);
}
1838
efed99a4 1839static void parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676
RT
1840{
1841 struct mount_opt *mo;
1842
1843 /* If opt is found in mount_opt, set or clear flags.
1844 * Otherwise append it to data. */
1845
1846 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
0fd73091 1847 if (strncmp(opt, mo->name, strlen(mo->name)) == 0) {
998ac676
RT
1848 if (mo->clear)
1849 *flags &= ~mo->flag;
1850 else
1851 *flags |= mo->flag;
1852 return;
1853 }
1854 }
1855
f1e05b90
DJ
1856 if (strlen(*data))
1857 (void)strlcat(*data, ",", size);
efed99a4 1858
f1e05b90 1859 (void)strlcat(*data, opt, size);
998ac676
RT
1860}
1861
/* Split the comma separated option string @mntopts into mount flags and mount
 * data.
 *
 * @mntflags receives the flags collected by parse_mntopt(). @mntdata receives
 * a newly allocated string with the remaining options, which the caller must
 * free, or NULL when there are none. Both are reset before parsing starts.
 *
 * Returns 0 on success (also when @mntopts is NULL) and -1 when memory cannot
 * be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
{
	char *data, *copy, *p, *s;
	char *saveptr = NULL;
	size_t size;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	copy = strdup(mntopts);
	if (!copy)
		return -1;

	size = strlen(copy) + 1;
	data = malloc(size);
	if (!data) {
		free(copy);
		return -1;
	}
	*data = 0;

	/* The cursor handed to strtok_r() becomes NULL after the first token,
	 * so the allocation is tracked separately in "copy" to be able to free
	 * it afterwards.
	 */
	for (s = copy; (p = strtok_r(s, ",", &saveptr)); s = NULL)
		parse_mntopt(p, mntflags, &data, size);

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(copy);

	return 0;
}
1897
d840039e
YT
1898static void parse_propagationopt(char *opt, unsigned long *flags)
1899{
1900 struct mount_opt *mo;
1901
1902 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1903 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1904 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1905 continue;
1906
1907 if (mo->clear)
1908 *flags &= ~mo->flag;
1909 else
1910 *flags |= mo->flag;
1911
1912 return;
d840039e
YT
1913 }
1914}
1915
/* Collect the mount propagation flags found in the comma separated option
 * string @mntopts into @pflags.
 *
 * @pflags is left untouched when @mntopts is NULL or memory cannot be
 * allocated; otherwise it is reset to 0 before the options are parsed.
 *
 * Returns 0 on success and -ENOMEM when @mntopts cannot be duplicated.
 */
static int parse_propagationopts(const char *mntopts, unsigned long *pflags)
{
	char *copy, *p, *s;
	char *saveptr = NULL;

	if (!mntopts)
		return 0;

	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("Failed to allocate memory");
		return -ENOMEM;
	}

	*pflags = 0L;

	/* The cursor handed to strtok_r() becomes NULL after the first token,
	 * so the allocation is tracked separately in "copy" to be able to free
	 * it afterwards.
	 */
	for (s = copy; (p = strtok_r(s, ",", &saveptr)); s = NULL)
		parse_propagationopt(p, pflags);
	free(copy);

	return 0;
}
1937
6fd5e769
SH
/* Terminate @word in place at its first space or tab. A string without
 * either is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1944
/* Skip @nfields fields in @src. A field ends at a single space or tab, so
 * consecutive separators delimit empty fields. Returns a pointer to the start
 * of the field that follows, or to the terminating NUL byte when @src holds
 * fewer separators than requested.
 */
static char *get_field(char *src, int nfields)
{
	char *cursor = src;
	int remaining;

	for (remaining = nfields; remaining > 0; remaining--) {
		/* Jump to the separator that ends the current field. */
		cursor += strcspn(cursor, " \t");

		if (*cursor == '\0')
			break;

		/* Step over the separator itself. */
		cursor++;
	}

	return cursor;
}
1963
911324ef
DL
/* Mount @fsname on @target, then apply the bind/remount flags and the
 * propagation flags requested for the entry.
 *
 * @fsname     : mount source; may be NULL
 * @target     : mount point
 * @fstype     : filesystem type handed to the mount calls
 * @mountflags : mount flags; MS_REMOUNT is masked out for the initial mount
 * @pflags     : propagation flags; 0 leaves the propagation untouched
 * @data       : filesystem specific mount data
 * @optional   : when true, failures are logged and reported as success
 * @dev        : when true, MS_NODEV is not inherited from the source
 * @relative   : when true, @fsname is prefixed with @rootfs (or "/")
 * @rootfs     : rootfs mount point; may be NULL
 *
 * Returns 0 on success or when an optional entry fails, -1 otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       unsigned long pflags, const char *data, bool optional,
		       bool dev, bool relative, const char *rootfs)
{
	int ret;
	char srcbuf[MAXPATHLEN];
	const char *srcpath = fsname;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (relative) {
		ret = snprintf(srcbuf, MAXPATHLEN, "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
		if (ret < 0 || ret >= MAXPATHLEN) {
			ERROR("source path is too long");
			return -1;
		}
		srcpath = srcbuf;
	}

	ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
				srcpath ? srcpath : "(null)", target);
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"",
			 srcpath ? srcpath : "(null)", target);
		return -1;
	}

	if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
		unsigned long rqd_flags = 0;

		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options", srcpath ? srcpath : "(none)", target ? target : "(none)");

		if (mountflags & MS_RDONLY)
			rqd_flags |= MS_RDONLY;
#ifdef HAVE_STATVFS
		if (srcpath && statvfs(srcpath, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if (sb.f_flag & MS_NODEV && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", srcpath, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT)) {
				if (!(required_flags & ~mountflags) &&
				    rqd_flags == 0) {
					DEBUG("Mountflags already were %lu, "
					      "skipping remount", mountflags);
					goto skipremount;
				}
			}

			mountflags |= required_flags;
		}
#endif

		ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
					srcpath ? srcpath : "(null)", target);
				return 0;
			}

			SYSERROR("Failed to mount \"%s\" on \"%s\"",
				 srcpath ? srcpath : "(null)", target);
			return -1;
		}
	}

	/* The label sits in front of the propagation step: skipping the
	 * remount must not skip a requested propagation change as well.
	 */
#ifdef HAVE_STATVFS
skipremount:
#endif
	if (pflags) {
		ret = mount(NULL, target, NULL, pflags, NULL);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to change mount propagation "
					"for \"%s\" (optional)", target);
				return 0;
			}

			SYSERROR("Failed to change mount propagation "
				 "for \"%s\"", target);
			return -1;
		}
		DEBUG("Changed mount propagation for \"%s\"", target);
	}

	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
	      srcpath ? srcpath : "(null)", target, fstype ? fstype : "(null)");

	return 0;
}
2082
/* Strip the options "create=dir", "create=file", "optional" and "relative"
 * from @mntent->mnt_opts. The option string is edited in place and, for each
 * of the four names, only the first match found by strstr() is removed.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, culled[idx]);
		if (start == NULL)
			continue;

		comma = strchr(start, ',');
		if (comma != NULL)
			/* Pull the remaining options, including the NUL
			 * byte, over the culled one.
			 */
			memmove(start, comma + 1, strlen(comma + 1) + 1);
		else
			/* Nothing follows, so simply truncate here. */
			*start = '\0';
	}
}
2112
4d5b72a1 2113static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2114 const char *path,
2115 const struct lxc_rootfs *rootfs,
0fd73091 2116 const char *lxc_name, const char *lxc_path)
0ad19a3f 2117{
3b7e332f 2118 int ret;
12e6ab5d 2119 char *p1, *p2;
911324ef 2120
12e6ab5d 2121 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
749f98d9 2122 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2123 if (ret < 0)
2124 return -1;
2125 }
6e46cc0d 2126
34cfffb3 2127 if (hasmntopt(mntent, "create=dir")) {
749f98d9
CB
2128 ret = mkdir_p(path, 0755);
2129 if (ret < 0 && errno != EEXIST) {
2130 SYSERROR("Failed to create directory \"%s\"", path);
2131 return -1;
34cfffb3
SG
2132 }
2133 }
2134
0fd73091
CB
2135 if (!hasmntopt(mntent, "create=file"))
2136 return 0;
749f98d9 2137
0fd73091
CB
2138 ret = access(path, F_OK);
2139 if (ret == 0)
2140 return 0;
749f98d9 2141
0fd73091
CB
2142 p1 = strdup(path);
2143 if (!p1)
2144 return -1;
749f98d9 2145
0fd73091 2146 p2 = dirname(p1);
749f98d9 2147
0fd73091
CB
2148 ret = mkdir_p(p2, 0755);
2149 free(p1);
2150 if (ret < 0 && errno != EEXIST) {
2151 SYSERROR("Failed to create directory \"%s\"", path);
2152 return -1;
34cfffb3 2153 }
749f98d9 2154
3b7e332f
CB
2155 ret = mknod(path, S_IFREG | 0000, 0);
2156 if (ret < 0 && errno != EEXIST)
2157 return -errno;
0fd73091 2158
749f98d9 2159 return 0;
4d5b72a1
NC
2160}
2161
ec50007f
CB
2162/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2163 * without a rootfs. */
db4aba38 2164static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2165 const char *path,
2166 const struct lxc_rootfs *rootfs,
2167 const char *lxc_name,
2168 const char *lxc_path)
4d5b72a1 2169{
d8b712bc 2170 int ret;
949d0338 2171 unsigned long mntflags;
4d5b72a1 2172 char *mntdata;
181437fd 2173 bool dev, optional, relative;
949d0338 2174 unsigned long pflags = 0;
ec50007f 2175 char *rootfs_path = NULL;
d8b712bc
CB
2176
2177 optional = hasmntopt(mntent, "optional") != NULL;
2178 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2179 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2180
ec50007f
CB
2181 if (rootfs && rootfs->path)
2182 rootfs_path = rootfs->mount;
2183
d8b712bc
CB
2184 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2185 lxc_path);
2186 if (ret < 0) {
2187 if (optional)
2188 return 0;
608e3567 2189
d8b712bc
CB
2190 return -1;
2191 }
4e4ca161
SH
2192 cull_mntent_opt(mntent);
2193
d840039e
YT
2194 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2195 if (ret < 0)
2196 return -1;
2197
d8b712bc
CB
2198 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2199 if (ret < 0)
a17b1e65 2200 return -1;
a17b1e65 2201
6e46cc0d 2202 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2203 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2204
911324ef 2205 free(mntdata);
911324ef
DL
2206 return ret;
2207}
2208
db4aba38
NC
/* Mount @mntent for a container that was created without a rootfs.
 *
 * For such containers all mounts are treated as absolute paths starting at /
 * on the host, so a leading "/" is added to the mount directory if missing.
 *
 * Returns -1 if the path does not fit into the buffer, otherwise the result
 * of mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *prefix = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(path, sizeof(path), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2226
4e4ca161 2227static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2228 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2229 const char *lxc_name,
2230 const char *lxc_path)
911324ef 2231{
bdd2b34c 2232 int offset;
013bd428 2233 char *aux;
67e571de 2234 const char *lxcpath;
bdd2b34c
CB
2235 char path[MAXPATHLEN];
2236 int ret = 0;
0ad19a3f 2237
593e8478 2238 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2239 if (!lxcpath)
2a59a681 2240 return -1;
2a59a681 2241
bdd2b34c
CB
2242 /* If rootfs->path is a blockdev path, allow container fstab to use
2243 * <lxcpath>/<name>/rootfs" as the target prefix.
2244 */
2245 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2246 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
2247 goto skipvarlib;
2248
2249 aux = strstr(mntent->mnt_dir, path);
2250 if (aux) {
2251 offset = strlen(path);
2252 goto skipabs;
2253 }
2254
2255skipvarlib:
013bd428
DL
2256 aux = strstr(mntent->mnt_dir, rootfs->path);
2257 if (!aux) {
bdd2b34c 2258 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2259 return ret;
013bd428 2260 }
80a881b2
SH
2261 offset = strlen(rootfs->path);
2262
2263skipabs:
bdd2b34c
CB
2264 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2265 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 2266 return -1;
a17b1e65 2267
0a2dddd4 2268 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2269}
d330fe7b 2270
4e4ca161 2271static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2272 const struct lxc_rootfs *rootfs,
2273 const char *lxc_name,
2274 const char *lxc_path)
911324ef 2275{
911324ef 2276 int ret;
0fd73091 2277 char path[MAXPATHLEN];
d330fe7b 2278
34cfffb3 2279 /* relative to root mount point */
6e46cc0d 2280 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
0fd73091 2281 if (ret < 0 || (size_t)ret >= sizeof(path))
9ba8130c 2282 return -1;
911324ef 2283
0a2dddd4 2284 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2285}
2286
06749971
CB
2287static int mount_file_entries(const struct lxc_conf *conf,
2288 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2289 const char *lxc_name, const char *lxc_path)
911324ef 2290{
aaf901be 2291 char buf[4096];
0fd73091 2292 struct mntent mntent;
911324ef 2293 int ret = -1;
e76b8764 2294
aaf901be 2295 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
1ae3c19f
CB
2296 if (!rootfs->path)
2297 ret = mount_entry_on_systemfs(&mntent);
2298 else if (mntent.mnt_dir[0] != '/')
2299 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2300 lxc_name, lxc_path);
2301 else
2302 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2303 lxc_name, lxc_path);
2304 if (ret < 0)
2305 return -1;
0ad19a3f 2306 }
2307 ret = 0;
cd54d859 2308
0fd73091 2309 INFO("Finished setting up mounts");
e7938e9e
MN
2310 return ret;
2311}
2312
06749971
CB
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success, -1 if the
 * file cannot be opened, or the negative result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fstab_file;
	int ret;

	if (fstab == NULL)
		return 0;

	fstab_file = setmntent(fstab, "r");
	if (fstab_file == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	ret = mount_file_entries(conf, rootfs, fstab_file, lxc_name, lxc_path);
	if (ret < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fstab_file);
	return ret;
}
2336
5ef5c9a3 2337FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2338{
5ef5c9a3 2339 int ret;
e7938e9e 2340 char *mount_entry;
5ef5c9a3 2341 struct lxc_list *iterator;
5ef5c9a3
CB
2342 int fd = -1;
2343
0fd73091 2344 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2345 if (fd < 0) {
a324e7eb
CB
2346 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2347
5ef5c9a3
CB
2348 if (errno != ENOSYS)
2349 return NULL;
a324e7eb
CB
2350
2351 fd = lxc_make_tmpfile(template, true);
0fd73091
CB
2352 if (fd < 0) {
2353 SYSERROR("Could not create temporary mount file");
2354 return NULL;
2355 }
2356
6bd04140 2357 TRACE("Created temporary mount file");
5ef5c9a3 2358 }
0fd73091
CB
2359 if (fd < 0) {
2360 SYSERROR("Could not create temporary mount file");
9fc7f8c0 2361 return NULL;
e7938e9e
MN
2362 }
2363
0fd73091
CB
2364 lxc_list_for_each (iterator, mount) {
2365 size_t len;
2366
e7938e9e 2367 mount_entry = iterator->elem;
0fd73091 2368 len = strlen(mount_entry);
5ef5c9a3 2369
489f39be 2370 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091
CB
2371 if (ret != len)
2372 goto on_error;
2373
489f39be 2374 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091
CB
2375 if (ret != 1)
2376 goto on_error;
e7938e9e
MN
2377 }
2378
0fd73091
CB
2379 ret = lseek(fd, 0, SEEK_SET);
2380 if (ret < 0)
2381 goto on_error;
2382
2383 return fdopen(fd, "r+");
2384
2385on_error:
2386 SYSERROR("Failed to write mount entry to temporary mount file");
2387 close(fd);
2388 return NULL;
9fc7f8c0
TA
2389}
2390
06749971
CB
/* Mount the entries of the list @mount by first writing them to an anonymous
 * mount file and then processing that file with mount_file_entries().
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int ret;
	FILE *mount_file = make_anonymous_mount_file(mount);

	if (mount_file == NULL)
		return -1;

	ret = mount_file_entries(conf, rootfs, mount_file, lxc_name, lxc_path);
	fclose(mount_file);

	return ret;
}
2408
bab88e68
CS
2409static int parse_cap(const char *cap)
2410{
84760c11 2411 size_t i;
2412 int capid = -1;
0fd73091
CB
2413 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2414 char *ptr = NULL;
bab88e68 2415
0fd73091 2416 if (strcmp(cap, "none") == 0)
7035407c
DE
2417 return -2;
2418
8560cd36 2419 for (i = 0; i < end; i++) {
bab88e68
CS
2420 if (strcmp(cap, caps_opt[i].name))
2421 continue;
2422
2423 capid = caps_opt[i].value;
2424 break;
2425 }
2426
2427 if (capid < 0) {
0fd73091
CB
2428 /* Try to see if it's numeric, so the user may specify
2429 * capabilities that the running kernel knows about but we
2430 * don't
2431 */
bab88e68
CS
2432 errno = 0;
2433 capid = strtol(cap, &ptr, 10);
2434 if (!ptr || *ptr != '\0' || errno != 0)
2435 /* not a valid number */
2436 capid = -1;
2437 else if (capid > lxc_caps_last_cap())
2438 /* we have a number but it's not a valid
2439 * capability */
2440 capid = -1;
2441 }
2442
2443 return capid;
2444}
2445
0769b82a
CS
2446int in_caplist(int cap, struct lxc_list *caps)
2447{
0769b82a 2448 int capid;
0fd73091 2449 struct lxc_list *iterator;
0769b82a 2450
0fd73091 2451 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2452 capid = parse_cap(iterator->elem);
2453 if (capid == cap)
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
81810dd1
DL
2460static int setup_caps(struct lxc_list *caps)
2461{
bab88e68 2462 int capid;
0fd73091
CB
2463 char *drop_entry;
2464 struct lxc_list *iterator;
81810dd1 2465
0fd73091
CB
2466 lxc_list_for_each (iterator, caps) {
2467 int ret;
81810dd1
DL
2468
2469 drop_entry = iterator->elem;
2470
bab88e68 2471 capid = parse_cap(drop_entry);
0fd73091 2472 if (capid < 0) {
1e11be34
DL
2473 ERROR("unknown capability %s", drop_entry);
2474 return -1;
81810dd1
DL
2475 }
2476
0fd73091
CB
2477 ret = prctl(PR_CAPBSET_DROP, capid, 0, 0, 0);
2478 if (ret < 0) {
2479 SYSERROR("Failed to remove %s capability", drop_entry);
3ec1648d
SH
2480 return -1;
2481 }
0fd73091 2482 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2483 }
2484
0fd73091 2485 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2486 return 0;
2487}
2488
2489static int dropcaps_except(struct lxc_list *caps)
2490{
0fd73091 2491 int i, capid, numcaps;
1fb86a7c 2492 char *keep_entry;
0fd73091 2493 struct lxc_list *iterator;
1fb86a7c 2494
0fd73091 2495 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2496 if (numcaps <= 0 || numcaps > 200)
2497 return -1;
0fd73091 2498 TRACE("Found %d capabilities", numcaps);
2caf9a97 2499
1a0e70ac 2500 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2501 int *caplist = alloca(numcaps * sizeof(int));
2502 memset(caplist, 0, numcaps * sizeof(int));
2503
0fd73091 2504 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2505 keep_entry = iterator->elem;
2506
bab88e68 2507 capid = parse_cap(keep_entry);
7035407c
DE
2508 if (capid == -2)
2509 continue;
2510
0fd73091
CB
2511 if (capid < 0) {
2512 ERROR("Unknown capability %s", keep_entry);
1fb86a7c
SH
2513 return -1;
2514 }
2515
0fd73091 2516 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2517 caplist[capid] = 1;
2518 }
0fd73091
CB
2519
2520 for (i = 0; i < numcaps; i++) {
2521 int ret;
2522
1fb86a7c
SH
2523 if (caplist[i])
2524 continue;
0fd73091
CB
2525
2526 ret = prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
2527 if (ret < 0) {
2528 SYSERROR("Failed to remove capability %d", i);
3ec1648d
SH
2529 return -1;
2530 }
1fb86a7c
SH
2531 }
2532
0fd73091 2533 DEBUG("Capabilities have been setup");
81810dd1
DL
2534 return 0;
2535}
2536
0fd73091
CB
2537static int parse_resource(const char *res)
2538{
2539 int ret;
c6d09e15
WB
2540 size_t i;
2541 int resid = -1;
2542
0fd73091 2543 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2544 if (strcmp(res, limit_opt[i].name) == 0)
2545 return limit_opt[i].value;
c6d09e15 2546
0fd73091 2547 /* Try to see if it's numeric, so the user may specify
c6d09e15 2548 * resources that the running kernel knows about but
0fd73091
CB
2549 * we don't.
2550 */
2551 ret = lxc_safe_int(res, &resid);
2552 if (ret < 0)
2553 return -1;
2554
2555 return resid;
c6d09e15
WB
2556}
2557
0fd73091
CB
2558int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2559{
2560 int resid;
c6d09e15
WB
2561 struct lxc_list *it;
2562 struct lxc_limit *lim;
c6d09e15 2563
0fd73091 2564 lxc_list_for_each (it, limits) {
c6d09e15
WB
2565 lim = it->elem;
2566
2567 resid = parse_resource(lim->resource);
2568 if (resid < 0) {
0fd73091 2569 ERROR("Unknown resource %s", lim->resource);
c6d09e15
WB
2570 return -1;
2571 }
2572
f48b5fd8 2573#if HAVE_PRLIMIT || HAVE_PRLIMIT64
c6d09e15 2574 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
6d1400b5 2575 SYSERROR("Failed to set limit %s", lim->resource);
c6d09e15
WB
2576 return -1;
2577 }
f48b5fd8
FF
2578#else
2579 ERROR("Cannot set limit %s as prlimit is missing", lim->resource);
2580 return -1;
2581#endif
c6d09e15 2582 }
0fd73091 2583
c6d09e15
WB
2584 return 0;
2585}
2586
7edd0540
L
2587int setup_sysctl_parameters(struct lxc_list *sysctls)
2588{
2589 struct lxc_list *it;
2590 struct lxc_sysctl *elem;
0fd73091 2591 int ret = 0;
7edd0540
L
2592 char *tmp = NULL;
2593 char filename[MAXPATHLEN] = {0};
7edd0540 2594
0fd73091 2595 lxc_list_for_each (it, sysctls) {
7edd0540
L
2596 elem = it->elem;
2597 tmp = lxc_string_replace(".", "/", elem->key);
2598 if (!tmp) {
2599 ERROR("Failed to replace key %s", elem->key);
2600 return -1;
2601 }
2602
2603 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2604 free(tmp);
2605 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2606 ERROR("Error setting up sysctl parameters path");
2607 return -1;
2608 }
2609
0fd73091 2610 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2611 strlen(elem->value), false, 0666);
7edd0540 2612 if (ret < 0) {
0fd73091
CB
2613 ERROR("Failed to setup sysctl parameters %s to %s",
2614 elem->key, elem->value);
7edd0540
L
2615 return -1;
2616 }
2617 }
0fd73091 2618
7edd0540
L
2619 return 0;
2620}
2621
61d7a733
YT
2622int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2623{
2624 struct lxc_list *it;
2625 struct lxc_proc *elem;
0fd73091 2626 int ret = 0;
61d7a733
YT
2627 char *tmp = NULL;
2628 char filename[MAXPATHLEN] = {0};
61d7a733 2629
0fd73091 2630 lxc_list_for_each (it, procs) {
61d7a733
YT
2631 elem = it->elem;
2632 tmp = lxc_string_replace(".", "/", elem->filename);
2633 if (!tmp) {
2634 ERROR("Failed to replace key %s", elem->filename);
2635 return -1;
2636 }
2637
2638 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2639 free(tmp);
2640 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2641 ERROR("Error setting up proc filesystem path");
2642 return -1;
2643 }
2644
0fd73091 2645 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2646 strlen(elem->value), false, 0666);
61d7a733 2647 if (ret < 0) {
0fd73091
CB
2648 ERROR("Failed to setup proc filesystem %s to %s",
2649 elem->filename, elem->value);
61d7a733
YT
2650 return -1;
2651 }
2652 }
0fd73091 2653
61d7a733
YT
2654 return 0;
2655}
2656
ae9242c8
SH
2657static char *default_rootfs_mount = LXCROOTFSMOUNT;
2658
7b379ab3 2659struct lxc_conf *lxc_conf_init(void)
089cd8b8 2660{
26ddeedd 2661 int i;
0fd73091 2662 struct lxc_conf *new;
7b379ab3 2663
13277ec4 2664 new = malloc(sizeof(*new));
0fd73091 2665 if (!new)
7b379ab3 2666 return NULL;
7b379ab3
MN
2667 memset(new, 0, sizeof(*new));
2668
4b73005c 2669 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2670 new->personality = -1;
124fa0a8 2671 new->autodev = 1;
3a784510 2672 new->console.buffer_size = 0;
596a818d
DE
2673 new->console.log_path = NULL;
2674 new->console.log_fd = -1;
861813e5 2675 new->console.log_size = 0;
28a4b0e5 2676 new->console.path = NULL;
63376d7d 2677 new->console.peer = -1;
fb87aa6a
CB
2678 new->console.proxy.busy = -1;
2679 new->console.proxy.master = -1;
2680 new->console.proxy.slave = -1;
63376d7d
DL
2681 new->console.master = -1;
2682 new->console.slave = -1;
2683 new->console.name[0] = '\0';
732375f5 2684 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2685 new->maincmd_fd = -1;
258f8051 2686 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2687 new->nbd_idx = -1;
54c30e29 2688 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2689 if (!new->rootfs.mount) {
53f3f048
SH
2690 free(new);
2691 return NULL;
2692 }
858377e4 2693 new->logfd = -1;
7b379ab3 2694 lxc_list_init(&new->cgroup);
54860ed0 2695 lxc_list_init(&new->cgroup2);
7b379ab3
MN
2696 lxc_list_init(&new->network);
2697 lxc_list_init(&new->mount_list);
81810dd1 2698 lxc_list_init(&new->caps);
1fb86a7c 2699 lxc_list_init(&new->keepcaps);
f6d3e3e4 2700 lxc_list_init(&new->id_map);
46ad64ab
CB
2701 new->root_nsuid_map = NULL;
2702 new->root_nsgid_map = NULL;
f979ac15 2703 lxc_list_init(&new->includes);
4184c3e1 2704 lxc_list_init(&new->aliens);
7c661726 2705 lxc_list_init(&new->environment);
c6d09e15 2706 lxc_list_init(&new->limits);
7edd0540 2707 lxc_list_init(&new->sysctls);
61d7a733 2708 lxc_list_init(&new->procs);
44ae0fb6 2709 new->hooks_version = 0;
28d9e29e 2710 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2711 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2712 lxc_list_init(&new->groups);
d39b10eb 2713 lxc_list_init(&new->state_clients);
fe4de9a6
DE
2714 new->lsm_aa_profile = NULL;
2715 new->lsm_se_context = NULL;
7a0bcca3 2716 new->tmp_umount_proc = false;
adf0ba1f
LT
2717 new->lxc_shmount.path_host = NULL;
2718 new->lxc_shmount.path_cont = NULL;
7b379ab3 2719
72bb04e4
PT
2720 /* if running in a new user namespace, init and COMMAND
2721 * default to running as UID/GID 0 when using lxc-execute */
2722 new->init_uid = 0;
2723 new->init_gid = 0;
43654d34 2724 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2725 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
72bb04e4 2726
7b379ab3 2727 return new;
089cd8b8
DL
2728}
2729
344c9d81 2730int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2731 size_t buf_size)
f6d3e3e4 2732{
29053180 2733 int fd, ret;
0fd73091 2734 char path[MAXPATHLEN];
f6d3e3e4 2735
a19b974f
CB
2736 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
2737 size_t buflen;
2738
2739 ret = snprintf(path, MAXPATHLEN, "/proc/%d/setgroups", pid);
0fd73091 2740 if (ret < 0 || ret >= MAXPATHLEN)
a19b974f 2741 return -E2BIG;
a19b974f
CB
2742
2743 fd = open(path, O_WRONLY);
2744 if (fd < 0 && errno != ENOENT) {
2745 SYSERROR("Failed to open \"%s\"", path);
2746 return -1;
2747 }
2748
2388737b
CB
2749 if (fd >= 0) {
2750 buflen = sizeof("deny\n") - 1;
2751 errno = 0;
2752 ret = lxc_write_nointr(fd, "deny\n", buflen);
395b1a3e 2753 close(fd);
2388737b 2754 if (ret != buflen) {
0fd73091
CB
2755 SYSERROR("Failed to write \"deny\" to "
2756 "\"/proc/%d/setgroups\"", pid);
2388737b
CB
2757 return -1;
2758 }
395b1a3e 2759 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2760 }
a19b974f
CB
2761 }
2762
29053180
CB
2763 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2764 idtype == ID_TYPE_UID ? 'u' : 'g');
0fd73091 2765 if (ret < 0 || ret >= MAXPATHLEN)
f6d3e3e4 2766 return -E2BIG;
29053180
CB
2767
2768 fd = open(path, O_WRONLY);
2769 if (fd < 0) {
a19b974f 2770 SYSERROR("Failed to open \"%s\"", path);
29053180 2771 return -1;
f6d3e3e4 2772 }
29053180
CB
2773
2774 errno = 0;
2775 ret = lxc_write_nointr(fd, buf, buf_size);
395b1a3e 2776 close(fd);
29053180 2777 if (ret != buf_size) {
a19b974f 2778 SYSERROR("Failed to write %cid mapping to \"%s\"",
29053180 2779 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2780 return -1;
2781 }
29053180
CB
2782
2783 return 0;
f6d3e3e4
SH
2784}
2785
6e50e704
CB
2786/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2787 *
2788 * @return 1 if functional binary was found
2789 * @return 0 if binary exists but is lacking privilege
2790 * @return -ENOENT if binary does not exist
2791 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2792 */
df6a2945
CB
2793static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2794{
2795 char *path;
2796 int ret;
2797 struct stat st;
2798 int fret = 0;
2799
6e50e704
CB
2800 if (cap != CAP_SETUID && cap != CAP_SETGID)
2801 return -EINVAL;
2802
df6a2945
CB
2803 path = on_path(binary, NULL);
2804 if (!path)
2805 return -ENOENT;
2806
2807 ret = stat(path, &st);
2808 if (ret < 0) {
2809 fret = -errno;
2810 goto cleanup;
2811 }
2812
2813 /* Check if the binary is setuid. */
2814 if (st.st_mode & S_ISUID) {
0fd73091 2815 DEBUG("The binary \"%s\" does have the setuid bit set", path);
df6a2945
CB
2816 fret = 1;
2817 goto cleanup;
2818 }
2819
0fd73091 2820#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2821 /* Check if it has the CAP_SETUID capability. */
2822 if ((cap & CAP_SETUID) &&
2823 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2824 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2825 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
0fd73091 2826 "and CAP_PERMITTED sets", path);
df6a2945
CB
2827 fret = 1;
2828 goto cleanup;
2829 }
2830
2831 /* Check if it has the CAP_SETGID capability. */
2832 if ((cap & CAP_SETGID) &&
2833 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2834 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2835 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
0fd73091 2836 "and CAP_PERMITTED sets", path);
df6a2945
CB
2837 fret = 1;
2838 goto cleanup;
2839 }
0fd73091 2840#else
69924fff
CB
2841 /* If we cannot check for file capabilities we need to give the benefit
2842 * of the doubt. Otherwise we might fail even though all the necessary
2843 * file capabilities are set.
2844 */
d6018f88 2845 DEBUG("Cannot check for file capabilites as full capability support is "
0fd73091 2846 "missing. Manual intervention needed");
d6018f88 2847 fret = 1;
0fd73091 2848#endif
df6a2945
CB
2849
2850cleanup:
2851 free(path);
2852 return fret;
2853}
2854
986ef930
CB
/* Run the command line passed in @args through "/bin/sh -c".
 *
 * Returns -1; this is only reached when execl() did not replace the process.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
2860
f6d3e3e4
SH
2861int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2862{
0fd73091 2863 int fill, left;
986ef930 2864 char u_or_g;
4bc3b759 2865 char *pos;
986ef930 2866 char cmd_output[MAXPATHLEN];
0fd73091
CB
2867 struct id_map *map;
2868 struct lxc_list *iterator;
2869 enum idtype type;
986ef930
CB
2870 /* strlen("new@idmap") = 9
2871 * +
2872 * strlen(" ") = 1
2873 * +
2874 * LXC_NUMSTRLEN64
2875 * +
2876 * strlen(" ") = 1
2877 *
2878 * We add some additional space to make sure that we really have
2879 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2880 */
0fd73091 2881 int ret = 0, gidmap = 0, uidmap = 0;
986ef930 2882 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
0fd73091 2883 bool had_entry = false, use_shadow = false;
c724025c
JC
2884 int hostuid, hostgid;
2885
2886 hostuid = geteuid();
2887 hostgid = getegid();
df6a2945
CB
2888
2889 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2890 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2891 * will protected it by preventing another user from being handed the
2892 * range by shadow.
2893 */
df6a2945 2894 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2895 if (uidmap == -ENOENT)
2896 WARN("newuidmap binary is missing");
2897 else if (!uidmap)
2898 WARN("newuidmap is lacking necessary privileges");
2899
df6a2945 2900 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2901 if (gidmap == -ENOENT)
2902 WARN("newgidmap binary is missing");
2903 else if (!gidmap)
2904 WARN("newgidmap is lacking necessary privileges");
2905
df6a2945 2906 if (uidmap > 0 && gidmap > 0) {
0fd73091 2907 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2908 use_shadow = true;
df6a2945 2909 } else {
99d43365
CB
2910 /* In case unprivileged users run application containers via
2911 * execute() or a start*() there are valid cases where they may
2912 * only want to map their own {g,u}id. Let's not block them from
2913 * doing so by requiring geteuid() == 0.
2914 */
2915 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2916 "write directly with euid %d", hostuid);
2917 }
2918
2919 /* Check if we really need to use newuidmap and newgidmap.
2920 * If the user is only remapping his own {g,u}id, we don't need it.
2921 */
2922 if (use_shadow && lxc_list_len(idmap) == 2) {
2923 use_shadow = false;
2924 lxc_list_for_each(iterator, idmap) {
2925 map = iterator->elem;
2926 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2927 map->nsid == hostuid && map->hostid == hostuid)
2928 continue;
2929 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2930 map->nsid == hostgid && map->hostid == hostgid)
2931 continue;
2932 use_shadow = true;
2933 break;
2934 }
0e6e3a41 2935 }
251d0d2a 2936
986ef930
CB
2937 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2938 type++, u_or_g = 'g') {
2939 pos = mapbuf;
2940
0e6e3a41 2941 if (use_shadow)
986ef930 2942 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2943
cf3ef16d 2944 lxc_list_for_each(iterator, idmap) {
251d0d2a 2945 map = iterator->elem;
cf3ef16d
SH
2946 if (map->idtype != type)
2947 continue;
2948
4bc3b759
CB
2949 had_entry = true;
2950
986ef930 2951 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2952 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2953 use_shadow ? " " : "", map->nsid,
2954 map->hostid, map->range,
0e6e3a41 2955 use_shadow ? "" : "\n");
a427e268
CB
2956 if (fill <= 0 || fill >= left) {
2957 /* The kernel only takes <= 4k for writes to
2958 * /proc/<pid>/{g,u}id_map
2959 */
2960 SYSERROR("Too many %cid mappings defined", u_or_g);
2961 return -1;
2962 }
4bc3b759 2963
cf3ef16d 2964 pos += fill;
251d0d2a 2965 }
cf3ef16d 2966 if (!had_entry)
4f7521b4 2967 continue;
cf3ef16d 2968
986ef930
CB
2969 /* Try to catch the ouput of new{g,u}idmap to make debugging
2970 * easier.
2971 */
2972 if (use_shadow) {
2973 ret = run_command(cmd_output, sizeof(cmd_output),
2974 lxc_map_ids_exec_wrapper,
2975 (void *)mapbuf);
2976 if (ret < 0) {
54fbbeb5
CB
2977 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2978 u_or_g, cmd_output, mapbuf);
986ef930
CB
2979 return -1;
2980 }
54fbbeb5 2981 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2982 } else {
986ef930 2983 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5 2984 if (ret < 0) {
da0f9977 2985 ERROR("Failed to write mapping: %s", mapbuf);
986ef930 2986 return -1;
54fbbeb5
CB
2987 }
2988 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2989 }
986ef930
CB
2990
2991 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2992 }
251d0d2a 2993
986ef930 2994 return 0;
f6d3e3e4
SH
2995}
2996
0fd73091 2997/* Return the host uid/gid to which the container root is mapped in val.
0b3a6504 2998 * Return true if id was found, false otherwise.
cf3ef16d 2999 */
2a9a80cb 3000bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
4160c3a0 3001 unsigned long *val)
cf3ef16d 3002{
4160c3a0 3003 unsigned nsid;
0fd73091
CB
3004 struct id_map *map;
3005 struct lxc_list *it;
4160c3a0
CB
3006
3007 if (idtype == ID_TYPE_UID)
3008 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
3009 else
3010 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 3011
0fd73091 3012 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3013 map = it->elem;
7b50c609 3014 if (map->idtype != idtype)
cf3ef16d 3015 continue;
4160c3a0 3016 if (map->nsid != nsid)
cf3ef16d 3017 continue;
2a9a80cb
SH
3018 *val = map->hostid;
3019 return true;
cf3ef16d 3020 }
4160c3a0 3021
2a9a80cb 3022 return false;
cf3ef16d
SH
3023}
3024
2133f58c 3025int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3026{
cf3ef16d 3027 struct id_map *map;
0fd73091
CB
3028 struct lxc_list *it;
3029
3030 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3031 map = it->elem;
2133f58c 3032 if (map->idtype != idtype)
cf3ef16d 3033 continue;
0fd73091 3034
cf3ef16d 3035 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3036 return (id - map->hostid) + map->nsid;
cf3ef16d 3037 }
0fd73091 3038
57d116ab 3039 return -1;
cf3ef16d
SH
3040}
3041
339efad9 3042int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3043{
cf3ef16d 3044 struct id_map *map;
0fd73091 3045 struct lxc_list *it;
2133f58c 3046 unsigned int freeid = 0;
0fd73091 3047
cf3ef16d 3048again:
0fd73091 3049 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3050 map = it->elem;
2133f58c 3051 if (map->idtype != idtype)
cf3ef16d 3052 continue;
0fd73091 3053
cf3ef16d
SH
3054 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3055 freeid = map->nsid + map->range;
3056 goto again;
3057 }
3058 }
0fd73091 3059
cf3ef16d
SH
3060 return freeid;
3061}
3062
f4f52cb5
CB
/* Callback that replaces the current process image with lxc-usernsexec.
 *
 * args is handed to execvp() as the argument vector. execvp() only returns
 * when it failed, in which case -1 is reported.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);

	/* Only reached when exec failed. */
	return -1;
}
3068
0fd73091 3069/* chown_mapped_root: for an unprivileged user with uid/gid X to
7b50c609
TS
3070 * chown a dir to subuid/subgid Y, he needs to run chown as root
3071 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3072 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3073 * root is privileged with respect to hostuid/hostgid X, allowing
3074 * him to do the chown.
f6d3e3e4 3075 */
41dc7155 3076int chown_mapped_root(const char *path, struct lxc_conf *conf)
f6d3e3e4 3077{
f4f52cb5 3078 uid_t rootuid, rootgid;
2a9a80cb 3079 unsigned long val;
f4f52cb5
CB
3080 int hostuid, hostgid, ret;
3081 struct stat sb;
3082 char map1[100], map2[100], map3[100], map4[100], map5[100];
3083 char ugid[100];
41dc7155 3084 const char *args1[] = {"lxc-usernsexec",
f4f52cb5
CB
3085 "-m", map1,
3086 "-m", map2,
3087 "-m", map3,
3088 "-m", map5,
3089 "--", "chown", ugid, path,
3090 NULL};
41dc7155 3091 const char *args2[] = {"lxc-usernsexec",
f4f52cb5
CB
3092 "-m", map1,
3093 "-m", map2,
3094 "-m", map3,
3095 "-m", map4,
3096 "-m", map5,
3097 "--", "chown", ugid, path,
3098 NULL};
3099 char cmd_output[MAXPATHLEN];
3100
3101 hostuid = geteuid();
3102 hostgid = getegid();
f6d3e3e4 3103
2a9a80cb 3104 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3105 ERROR("No uid mapping for container root");
c4d10a05 3106 return -1;
f6d3e3e4 3107 }
f4f52cb5 3108 rootuid = (uid_t)val;
0fd73091 3109
7b50c609 3110 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3111 ERROR("No gid mapping for container root");
7b50c609
TS
3112 return -1;
3113 }
f4f52cb5 3114 rootgid = (gid_t)val;
2a9a80cb 3115
f4f52cb5 3116 if (hostuid == 0) {
7b50c609 3117 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3118 ERROR("Error chowning %s", path);
3119 return -1;
3120 }
0fd73091 3121
c4d10a05
SH
3122 return 0;
3123 }
f3d7e4ca 3124
f4f52cb5 3125 if (rootuid == hostuid) {
1a0e70ac 3126 /* nothing to do */
b103ceac 3127 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3128 return 0;
3129 }
3130
bbdbf8f0 3131 /* save the current gid of "path" */
f4f52cb5
CB
3132 if (stat(path, &sb) < 0) {
3133 ERROR("Error stat %s", path);
f6d3e3e4
SH
3134 return -1;
3135 }
7b50c609 3136
bbdbf8f0
CB
3137 /* Update the path argument in case this was overlayfs. */
3138 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3139 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3140
f4f52cb5
CB
3141 /*
3142 * A file has to be group-owned by a gid mapped into the
3143 * container, or the container won't be privileged over it.
3144 */
3145 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3146 if (sb.st_uid == hostuid &&
3147 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3148 chown(path, -1, hostgid) < 0) {
3149 ERROR("Failed chgrping %s", path);
3150 return -1;
3151 }
f6d3e3e4 3152
1a0e70ac 3153 /* "u:0:rootuid:1" */
f4f52cb5
CB
3154 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3155 if (ret < 0 || ret >= 100) {
3156 ERROR("Error uid printing map string");
3157 return -1;
3158 }
7b50c609 3159
1a0e70ac 3160 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
3161 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3162 if (ret < 0 || ret >= 100) {
3163 ERROR("Error uid printing map string");
3164 return -1;
3165 }
c4d10a05 3166
1a0e70ac 3167 /* "g:0:rootgid:1" */
f4f52cb5
CB
3168 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3169 if (ret < 0 || ret >= 100) {
3170 ERROR("Error gid printing map string");
3171 return -1;
3172 }
98e5ba51 3173
1a0e70ac 3174 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
3175 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3176 rootgid + (gid_t)sb.st_gid);
3177 if (ret < 0 || ret >= 100) {
3178 ERROR("Error gid printing map string");
3179 return -1;
3180 }
c4d10a05 3181
1a0e70ac 3182 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
3183 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3184 if (ret < 0 || ret >= 100) {
3185 ERROR("Error gid printing map string");
3186 return -1;
3187 }
7b50c609 3188
1a0e70ac 3189 /* "0:pathgid" (chown) */
f4f52cb5
CB
3190 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3191 if (ret < 0 || ret >= 100) {
3192 ERROR("Error owner printing format string for chown");
3193 return -1;
3194 }
7b50c609 3195
f4f52cb5
CB
3196 if (hostgid == sb.st_gid)
3197 ret = run_command(cmd_output, sizeof(cmd_output),
3198 chown_mapped_root_exec_wrapper,
3199 (void *)args1);
3200 else
3201 ret = run_command(cmd_output, sizeof(cmd_output),
3202 chown_mapped_root_exec_wrapper,
3203 (void *)args2);
3204 if (ret < 0)
3205 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3206
f4f52cb5 3207 return ret;
f6d3e3e4
SH
3208}
3209
943144d9
CB
3210/* NOTE: Must not be called from inside the container namespace! */
3211int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3212{
3213 int mounted;
3214
943144d9 3215 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3216 if (mounted == -1) {
0fd73091 3217 SYSERROR("Failed to mount proc in the container");
01958b1f 3218 /* continue only if there is no rootfs */
943144d9 3219 if (conf->rootfs.path)
01958b1f 3220 return -1;
5112cd70 3221 } else if (mounted == 1) {
7a0bcca3 3222 conf->tmp_umount_proc = true;
5112cd70 3223 }
943144d9 3224
5112cd70
SH
3225 return 0;
3226}
3227
3228void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3229{
7a0bcca3 3230 if (!lxc_conf->tmp_umount_proc)
0fd73091
CB
3231 return;
3232
7a0bcca3
CB
3233 (void)umount2("/proc", MNT_DETACH);
3234 lxc_conf->tmp_umount_proc = false;
5112cd70
SH
3235}
3236
0fd73091 3237/* Walk /proc/mounts and change any shared entries to slave. */
6a0c909a 3238void remount_all_slave(void)
e995d7a2 3239{
6a49f05e
CB
3240 int memfd, mntinfo_fd, ret;
3241 ssize_t copied;
0fd73091 3242 FILE *f;
e995d7a2 3243 size_t len = 0;
0fd73091 3244 char *line = NULL;
e995d7a2 3245
6a49f05e 3246 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
3247 if (mntinfo_fd < 0) {
3248 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 3249 return;
fea3b91d 3250 }
6a49f05e
CB
3251
3252 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3253 if (memfd < 0) {
3254 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3255
3256 if (errno != ENOSYS) {
fea3b91d 3257 SYSERROR("Failed to create temporary in-memory file");
6a49f05e 3258 close(mntinfo_fd);
6a49f05e
CB
3259 return;
3260 }
3261
3262 memfd = lxc_make_tmpfile(template, true);
fea3b91d
DJ
3263 if (memfd < 0) {
3264 close(mntinfo_fd);
3265 WARN("Failed to create temporary file");
3266 return;
3267 }
6a49f05e
CB
3268 }
3269
3270#define __LXC_SENDFILE_MAX 0x7ffff000 /* maximum number of bytes sendfile can handle */
3271again:
3272 copied = sendfile(memfd, mntinfo_fd, NULL, __LXC_SENDFILE_MAX);
3273 if (copied < 0) {
3274 if (errno == EINTR)
3275 goto again;
3276
fea3b91d 3277 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
3278 close(mntinfo_fd);
3279 close(memfd);
6a49f05e
CB
3280 return;
3281 }
3282 close(mntinfo_fd);
3283
3284 /* After a successful fdopen() memfd will be closed when calling
3285 * fclose(f). Calling close(memfd) afterwards is undefined.
3286 */
3287 ret = lseek(memfd, 0, SEEK_SET);
3288 if (ret < 0) {
fea3b91d 3289 SYSERROR("Failed to reset file descriptor offset");
6a49f05e 3290 close(memfd);
6a49f05e
CB
3291 return;
3292 }
3293
3294 f = fdopen(memfd, "r");
e995d7a2 3295 if (!f) {
fea3b91d
DJ
3296 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark "
3297 "all shared. Continuing");
6a49f05e 3298 close(memfd);
e995d7a2
SH
3299 return;
3300 }
3301
3302 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3303 int ret;
3304 char *opts, *target;
3305
e995d7a2
SH
3306 target = get_field(line, 4);
3307 if (!target)
3308 continue;
0fd73091 3309
e995d7a2
SH
3310 opts = get_field(target, 2);
3311 if (!opts)
3312 continue;
0fd73091 3313
e995d7a2
SH
3314 null_endofword(opts);
3315 if (!strstr(opts, "shared"))
3316 continue;
0fd73091 3317
e995d7a2 3318 null_endofword(target);
0fd73091
CB
3319 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3320 if (ret < 0) {
3321 SYSERROR("Failed to make \"%s\" MS_SLAVE", target);
e995d7a2 3322 ERROR("Continuing...");
6a49f05e 3323 continue;
e995d7a2 3324 }
6a49f05e 3325 TRACE("Remounted \"%s\" as MS_SLAVE", target);
e995d7a2
SH
3326 }
3327 fclose(f);
f10fad2f 3328 free(line);
6a49f05e 3329 TRACE("Remounted all mount table entries as MS_SLAVE");
e995d7a2
SH
3330}
3331
794248d0 3332static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3333{
3334 int ret;
794248d0
CB
3335 char *p;
3336 char path[PATH_MAX], destpath[PATH_MAX];
3337 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3338
3339 /* If init exists in the container, don't bind mount a static one */
3340 p = choose_init(conf->rootfs.mount);
3341 if (p) {
41089848
TA
3342 char *old = p;
3343
3344 p = strdup(old + strlen(conf->rootfs.mount));
3345 free(old);
3346 if (!p)
3347 return -ENOMEM;
3348
3349 INFO("Found existing init at \"%s\"", p);
3350 goto out;
9d9c111c 3351 }
2322903b
SH
3352
3353 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
0fd73091 3354 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3355 return -1;
2322903b
SH
3356
3357 if (!file_exists(path)) {
0fd73091 3358 ERROR("The file \"%s\" does not exist on host", path);
8353b4c9 3359 return -1;
2322903b
SH
3360 }
3361
794248d0 3362 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
0fd73091 3363 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3364 return -1;
2322903b
SH
3365
3366 if (!file_exists(destpath)) {
794248d0
CB
3367 ret = mknod(destpath, S_IFREG | 0000, 0);
3368 if (ret < 0 && errno != EEXIST) {
3369 SYSERROR("Failed to create dummy \"%s\" file as bind mount target", destpath);
8353b4c9 3370 return -1;
2322903b 3371 }
2322903b
SH
3372 }
3373
592fd47a 3374 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
8353b4c9 3375 if (ret < 0) {
0fd73091 3376 SYSERROR("Failed to bind mount lxc.init.static into container");
8353b4c9
CB
3377 return -1;
3378 }
3379
794248d0
CB
3380 p = strdup(destpath + strlen(conf->rootfs.mount));
3381 if (!p)
3382 return -ENOMEM;
794248d0 3383
8353b4c9 3384 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3385out:
4b5b3a2a 3386 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3387 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3388 return 0;
2322903b
SH
3389}
3390
0fd73091
CB
3391/* This does the work of remounting / if it is shared, calling the container
3392 * pre-mount hooks, and mounting the rootfs.
35120d9c
SH
3393 */
3394int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3395{
0fd73091
CB
3396 int ret;
3397
35120d9c 3398 if (conf->rootfs_setup) {
35120d9c 3399 const char *path = conf->rootfs.mount;
0fd73091
CB
3400
3401 /* The rootfs was set up in another namespace. bind-mount it to
3402 * give us a mount in our own ns so we can pivot_root to it
3403 */
3404 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3405 if (ret < 0) {
3406 ERROR("Failed to bind mount container / onto itself");
145832ba 3407 return -1;
35120d9c 3408 }
0fd73091
CB
3409
3410 TRACE("Bind mounted container / onto itself");
145832ba 3411 return 0;
35120d9c 3412 }
d4ef7c50 3413
e995d7a2
SH
3414 remount_all_slave();
3415
0fd73091
CB
3416 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3417 if (ret < 0) {
3418 ERROR("Failed to run pre-mount hooks");
35120d9c
SH
3419 return -1;
3420 }
3421
0fd73091
CB
3422 ret = lxc_setup_rootfs(conf);
3423 if (ret < 0) {
3424 ERROR("Failed to setup rootfs for");
35120d9c
SH
3425 return -1;
3426 }
3427
3428 conf->rootfs_setup = true;
3429 return 0;
3430}
3431
1c1c7051
SH
3432static bool verify_start_hooks(struct lxc_conf *conf)
3433{
1c1c7051 3434 char path[MAXPATHLEN];
0fd73091
CB
3435 struct lxc_list *it;
3436
3437 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3438 int ret;
0fd73091
CB
3439 struct stat st;
3440 char *hookname = it->elem;
1c1c7051
SH
3441
3442 ret = snprintf(path, MAXPATHLEN, "%s%s",
0fd73091
CB
3443 conf->rootfs.path ? conf->rootfs.mount : "",
3444 hookname);
1c1c7051
SH
3445 if (ret < 0 || ret >= MAXPATHLEN)
3446 return false;
0fd73091 3447
1c1c7051 3448 ret = stat(path, &st);
0fd73091 3449 if (ret < 0) {
7b6753e7 3450 SYSERROR("Start hook %s not found in container",
0fd73091 3451 hookname);
1c1c7051
SH
3452 return false;
3453 }
0fd73091 3454
6a0c909a 3455 return true;
1c1c7051
SH
3456 }
3457
3458 return true;
3459}
3460
4b5b3a2a
TA
3461static bool execveat_supported(void)
3462{
3463#ifdef __NR_execveat
3464 /*
3465 * We use the syscall here, because it was introduced in kernel 3.19,
3466 * while glibc got support for using the syscall much later, in 2.27.
3467 * We don't want to use glibc because it falls back to /proc, and the
3468 * container may not have /proc mounted depending on its configuration.
3469 */
3470 syscall(__NR_execveat, -1, "", NULL, NULL, AT_EMPTY_PATH);
3471 if (errno == ENOSYS)
3472 return false;
3473
3474 return true;
3475#else
3476 return false;
3477#endif
3478}
3479
3b988b33 3480int lxc_setup(struct lxc_handler *handler)
35120d9c 3481{
2187efd3 3482 int ret;
0fd73091 3483 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3484 struct lxc_conf *lxc_conf = handler->conf;
35120d9c 3485
8353b4c9
CB
3486 ret = do_rootfs_setup(lxc_conf, name, lxcpath);
3487 if (ret < 0) {
3488 ERROR("Failed to setup rootfs");
35120d9c
SH
3489 return -1;
3490 }
3491
28d9e29e 3492 if (handler->nsfd[LXC_NS_UTS] == -1) {
8353b4c9
CB
3493 ret = setup_utsname(lxc_conf->utsname);
3494 if (ret < 0) {
0fd73091 3495 ERROR("Failed to setup the utsname %s", name);
6c544cb3
MM
3496 return -1;
3497 }
0ad19a3f 3498 }
3499
8353b4c9
CB
3500 ret = lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network);
3501 if (ret < 0) {
3502 ERROR("Failed to setup network");
95b5ffaf 3503 return -1;
0ad19a3f 3504 }
3505
8353b4c9
CB
3506 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3507 if (ret < 0) {
3508 ERROR("Failed to send network device names and ifindices to parent");
790255cf
CB
3509 return -1;
3510 }
3511
bc6928ff 3512 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3513 ret = mount_autodev(name, &lxc_conf->rootfs, lxcpath);
3514 if (ret < 0) {
3515 ERROR("Failed to mount \"/dev\"");
c6883f38
SH
3516 return -1;
3517 }
3518 }
3519
8353b4c9
CB
3520 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3521 * need to wait until other stuff has finished.
368bbc02 3522 */
8353b4c9
CB
3523 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3524 if (ret < 0) {
3525 ERROR("Failed to setup first automatic mounts");
368bbc02
CS
3526 return -1;
3527 }
3528
8353b4c9
CB
3529 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3530 if (ret < 0) {
3531 ERROR("Failed to setup mounts");
95b5ffaf 3532 return -1;
576f946d 3533 }
3534
7b6753e7 3535 /* Make sure any start hooks are in the container */
1c1c7051
SH
3536 if (!verify_start_hooks(lxc_conf))
3537 return -1;
3538
8353b4c9 3539 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3540 if (execveat_supported()) {
3541 int fd;
3542 char path[PATH_MAX];
3543
3544 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3545 if (ret < 0 || ret >= PATH_MAX) {
3546 ERROR("Path to init.lxc.static too long");
3547 return -1;
3548 }
3549
3550 fd = open(path, O_PATH | O_CLOEXEC);
3551 if (fd < 0) {
3552 SYSERROR("Unable to open lxc.init.static");
3553 return -1;
3554 }
3555
3556 ((struct execute_args *)handler->data)->init_fd = fd;
3557 ((struct execute_args *)handler->data)->init_path = NULL;
3558 } else {
3559 ret = lxc_execute_bind_init(handler);
3560 if (ret < 0) {
3561 ERROR("Failed to bind-mount the lxc init system");
3562 return -1;
3563 }
8353b4c9
CB
3564 }
3565 }
2322903b 3566
8353b4c9
CB
3567 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3568 * mounted. It is guaranteed to be mounted now either through
3569 * automatically or via fstab entries.
368bbc02 3570 */
8353b4c9
CB
3571 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3572 if (ret < 0) {
3573 ERROR("Failed to setup remaining automatic mounts");
368bbc02
CS
3574 return -1;
3575 }
3576
8353b4c9 3577 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
1a2cf89d 3578 if (ret < 0) {
8353b4c9 3579 ERROR("Failed to run mount hooks");
773fb9ca
SH
3580 return -1;
3581 }
3582
bc6928ff 3583 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3584 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3585 if (ret < 0) {
3586 ERROR("Failed to run autodev hooks");
f7bee6c6
MW
3587 return -1;
3588 }
06749971 3589
8353b4c9
CB
3590 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3591 if (ret < 0) {
3592 ERROR("Failed to populate \"/dev\"");
91c3830e
SH
3593 return -1;
3594 }
3595 }
368bbc02 3596
8353b4c9
CB
3597 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3598 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3599 &lxc_conf->mount_list, name, lxcpath);
3600 if (ret < 0) {
3601 ERROR("Failed to setup mount entries");
3602 return -1;
3603 }
181437fd
YT
3604 }
3605
ed8704d0 3606 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
885766f5 3607 lxc_conf->ttys.dir);
ed8704d0
CB
3608 if (ret < 0) {
3609 ERROR("Failed to setup console");
95b5ffaf 3610 return -1;
6e590161 3611 }
3612
ed8704d0
CB
3613 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3614 if (ret < 0) {
8353b4c9 3615 ERROR("Failed to setup \"/dev\" symlinks");
69aa6655
DE
3616 return -1;
3617 }
3618
8353b4c9
CB
3619 ret = lxc_create_tmp_proc_mount(lxc_conf);
3620 if (ret < 0) {
3621 ERROR("Failed to \"/proc\" LSMs");
e075f5d9 3622 return -1;
e075f5d9 3623 }
e075f5d9 3624
8353b4c9
CB
3625 ret = setup_pivot_root(&lxc_conf->rootfs);
3626 if (ret < 0) {
3627 ERROR("Failed to pivot root into rootfs");
95b5ffaf 3628 return -1;
ed502555 3629 }
3630
8353b4c9
CB
3631 ret = lxc_setup_devpts(lxc_conf);
3632 if (ret < 0) {
3633 ERROR("Failed to setup new devpts instance");
95b5ffaf 3634 return -1;
3c26f34e 3635 }
3636
2187efd3
CB
3637 ret = lxc_create_ttys(handler);
3638 if (ret < 0)
e8bd4e43 3639 return -1;
e8bd4e43 3640
8353b4c9
CB
3641 ret = setup_personality(lxc_conf->personality);
3642 if (ret < 0) {
3643 ERROR("Failed to set personality");
cccc74b5
DL
3644 return -1;
3645 }
3646
8353b4c9
CB
3647 /* Set sysctl value to a path under /proc/sys as determined from the
3648 * key. For e.g. net.ipv4.ip_forward translated to
3649 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3650 */
3651 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3652 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
8353b4c9
CB
3653 if (ret < 0) {
3654 ERROR("Failed to setup sysctl parameters");
7edd0540 3655 return -1;
8353b4c9 3656 }
7edd0540
L
3657 }
3658
97a8f74f
SG
3659 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3660 if (!lxc_list_empty(&lxc_conf->caps)) {
8353b4c9
CB
3661 ERROR("Container requests lxc.cap.drop and "
3662 "lxc.cap.keep: either use lxc.cap.drop or "
3663 "lxc.cap.keep, not both");
f6d3e3e4
SH
3664 return -1;
3665 }
8353b4c9 3666
97a8f74f 3667 if (dropcaps_except(&lxc_conf->keepcaps)) {
8353b4c9 3668 ERROR("Failed to keep capabilities");
97a8f74f
SG
3669 return -1;
3670 }
3671 } else if (setup_caps(&lxc_conf->caps)) {
8353b4c9 3672 ERROR("Failed to drop capabilities");
97a8f74f 3673 return -1;
81810dd1
DL
3674 }
3675
8353b4c9 3676 NOTICE("The container \"%s\" is set up", name);
cd54d859 3677
0ad19a3f 3678 return 0;
3679}
26ddeedd 3680
3f60c2f7 3681int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3682 char *argv[])
26ddeedd 3683{
26ddeedd 3684 struct lxc_list *it;
3f60c2f7 3685 int which = -1;
26ddeedd 3686
3f60c2f7 3687 if (strcmp(hookname, "pre-start") == 0)
26ddeedd 3688 which = LXCHOOK_PRESTART;
3f60c2f7 3689 else if (strcmp(hookname, "start-host") == 0)
08dd2805 3690 which = LXCHOOK_START_HOST;
3f60c2f7 3691 else if (strcmp(hookname, "pre-mount") == 0)
5ea6163a 3692 which = LXCHOOK_PREMOUNT;
3f60c2f7 3693 else if (strcmp(hookname, "mount") == 0)
26ddeedd 3694 which = LXCHOOK_MOUNT;
3f60c2f7 3695 else if (strcmp(hookname, "autodev") == 0)
f7bee6c6 3696 which = LXCHOOK_AUTODEV;
3f60c2f7 3697 else if (strcmp(hookname, "start") == 0)
26ddeedd 3698 which = LXCHOOK_START;
3f60c2f7 3699 else if (strcmp(hookname, "stop") == 0)
52492063 3700 which = LXCHOOK_STOP;
3f60c2f7 3701 else if (strcmp(hookname, "post-stop") == 0)
26ddeedd 3702 which = LXCHOOK_POSTSTOP;
3f60c2f7 3703 else if (strcmp(hookname, "clone") == 0)
148e91f5 3704 which = LXCHOOK_CLONE;
3f60c2f7 3705 else if (strcmp(hookname, "destroy") == 0)
37cf711b 3706 which = LXCHOOK_DESTROY;
26ddeedd
SH
3707 else
3708 return -1;
3f60c2f7 3709
0fd73091 3710 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3711 int ret;
3f60c2f7
CB
3712 char *hook = it->elem;
3713
3714 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3715 hookname, argv);
3f60c2f7
CB
3716 if (ret < 0)
3717 return -1;
26ddeedd 3718 }
3f60c2f7 3719
26ddeedd
SH
3720 return 0;
3721}
72d0e1cb 3722
72d0e1cb
SG
3723int lxc_clear_config_caps(struct lxc_conf *c)
3724{
1a0e70ac 3725 struct lxc_list *it, *next;
72d0e1cb 3726
0fd73091 3727 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3728 lxc_list_del(it);
3729 free(it->elem);
3730 free(it);
3731 }
0fd73091 3732
72d0e1cb
SG
3733 return 0;
3734}
3735
c7e345ae
CB
3736static int lxc_free_idmap(struct lxc_list *id_map)
3737{
27c27d73
SH
3738 struct lxc_list *it, *next;
3739
0fd73091 3740 lxc_list_for_each_safe (it, id_map, next) {
27c27d73
SH
3741 lxc_list_del(it);
3742 free(it->elem);
3743 free(it);
3744 }
c7e345ae 3745
27c27d73
SH
3746 return 0;
3747}
3748
4355ab5f
SH
3749int lxc_clear_idmaps(struct lxc_conf *c)
3750{
3751 return lxc_free_idmap(&c->id_map);
3752}
3753
1fb86a7c
SH
3754int lxc_clear_config_keepcaps(struct lxc_conf *c)
3755{
0fd73091 3756 struct lxc_list *it, *next;
1fb86a7c 3757
0fd73091 3758 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3759 lxc_list_del(it);
3760 free(it->elem);
3761 free(it);
3762 }
0fd73091 3763
1fb86a7c
SH
3764 return 0;
3765}
3766
54860ed0 3767int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3768{
54860ed0 3769 char *global_token, *namespaced_token;
ab1a6cac 3770 size_t namespaced_token_len;
54860ed0 3771 struct lxc_list *it, *next, *list;
ab1a6cac 3772 const char *k = key;
54860ed0 3773 bool all = false;
72d0e1cb 3774
54860ed0
CB
3775 if (version == CGROUP2_SUPER_MAGIC) {
3776 global_token = "lxc.cgroup2";
3777 namespaced_token = "lxc.cgroup2.";
0fd73091 3778 namespaced_token_len = sizeof("lxc.cgroup2.") - 1;
54860ed0
CB
3779 list = &c->cgroup2;
3780 } else if (version == CGROUP_SUPER_MAGIC) {
3781 global_token = "lxc.cgroup";
3782 namespaced_token = "lxc.cgroup.";
0fd73091 3783 namespaced_token_len = sizeof("lxc.cgroup.") - 1;
54860ed0
CB
3784 list = &c->cgroup;
3785 } else {
ab1a6cac 3786 return -EINVAL;
54860ed0
CB
3787 }
3788
3789 if (strcmp(key, global_token) == 0)
72d0e1cb 3790 all = true;
54860ed0 3791 else if (strncmp(key, namespaced_token, sizeof(namespaced_token) - 1) == 0)
ab1a6cac 3792 k += namespaced_token_len;
a6390f01 3793 else
ab1a6cac 3794 return -EINVAL;
72d0e1cb 3795
0fd73091 3796 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3797 struct lxc_cgroup *cg = it->elem;
54860ed0 3798
72d0e1cb
SG
3799 if (!all && strcmp(cg->subsystem, k) != 0)
3800 continue;
54860ed0 3801
72d0e1cb
SG
3802 lxc_list_del(it);
3803 free(cg->subsystem);
3804 free(cg->value);
3805 free(cg);
3806 free(it);
3807 }
e409b214 3808
72d0e1cb
SG
3809 return 0;
3810}
3811
c6d09e15
WB
3812int lxc_clear_limits(struct lxc_conf *c, const char *key)
3813{
3814 struct lxc_list *it, *next;
c6d09e15 3815 const char *k = NULL;
0fd73091 3816 bool all = false;
c6d09e15 3817
b668653c 3818 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3819 all = true;
b668653c
CB
3820 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.") - 1) == 0)
3821 k = key + sizeof("lxc.limit.") - 1;
3822 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.") - 1) == 0)
3823 k = key + sizeof("lxc.prlimit.") - 1;
c6d09e15
WB
3824 else
3825 return -1;
3826
0fd73091 3827 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3828 struct lxc_limit *lim = it->elem;
0fd73091 3829
c6d09e15
WB
3830 if (!all && strcmp(lim->resource, k) != 0)
3831 continue;
0fd73091 3832
c6d09e15
WB
3833 lxc_list_del(it);
3834 free(lim->resource);
3835 free(lim);
3836 free(it);
3837 }
b668653c 3838
c6d09e15
WB
3839 return 0;
3840}
3841
7edd0540
L
3842int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3843{
3844 struct lxc_list *it, *next;
7edd0540 3845 const char *k = NULL;
0fd73091 3846 bool all = false;
7edd0540
L
3847
3848 if (strcmp(key, "lxc.sysctl") == 0)
3849 all = true;
3850 else if (strncmp(key, "lxc.sysctl.", sizeof("lxc.sysctl.") - 1) == 0)
3851 k = key + sizeof("lxc.sysctl.") - 1;
3852 else
3853 return -1;
3854
0fd73091 3855 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3856 struct lxc_sysctl *elem = it->elem;
0fd73091 3857
7edd0540
L
3858 if (!all && strcmp(elem->key, k) != 0)
3859 continue;
0fd73091 3860
7edd0540
L
3861 lxc_list_del(it);
3862 free(elem->key);
3863 free(elem->value);
3864 free(elem);
3865 free(it);
3866 }
0fd73091 3867
7edd0540
L
3868 return 0;
3869}
3870
61d7a733
YT
3871int lxc_clear_procs(struct lxc_conf *c, const char *key)
3872{
0fd73091 3873 struct lxc_list *it, *next;
61d7a733 3874 const char *k = NULL;
0fd73091 3875 bool all = false;
61d7a733
YT
3876
3877 if (strcmp(key, "lxc.proc") == 0)
3878 all = true;
3879 else if (strncmp(key, "lxc.proc.", sizeof("lxc.proc.") - 1) == 0)
3880 k = key + sizeof("lxc.proc.") - 1;
3881 else
3882 return -1;
3883
0fd73091 3884 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3885 struct lxc_proc *proc = it->elem;
0fd73091 3886
61d7a733
YT
3887 if (!all && strcmp(proc->filename, k) != 0)
3888 continue;
0fd73091 3889
61d7a733
YT
3890 lxc_list_del(it);
3891 free(proc->filename);
3892 free(proc->value);
3893 free(proc);
3894 free(it);
3895 }
3896
3897 return 0;
3898}
3899
ee1e7aa0
SG
3900int lxc_clear_groups(struct lxc_conf *c)
3901{
0fd73091 3902 struct lxc_list *it, *next;
ee1e7aa0 3903
0fd73091 3904 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3905 lxc_list_del(it);
3906 free(it->elem);
3907 free(it);
3908 }
0fd73091 3909
ee1e7aa0
SG
3910 return 0;
3911}
3912
ab799c0b
SG
3913int lxc_clear_environment(struct lxc_conf *c)
3914{
0fd73091 3915 struct lxc_list *it, *next;
ab799c0b 3916
0fd73091 3917 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3918 lxc_list_del(it);
3919 free(it->elem);
3920 free(it);
3921 }
0fd73091 3922
ab799c0b
SG
3923 return 0;
3924}
3925
72d0e1cb
SG
3926int lxc_clear_mount_entries(struct lxc_conf *c)
3927{
0fd73091 3928 struct lxc_list *it, *next;
72d0e1cb 3929
0fd73091 3930 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3931 lxc_list_del(it);
3932 free(it->elem);
3933 free(it);
3934 }
0fd73091 3935
72d0e1cb
SG
3936 return 0;
3937}
3938
b099e9e9
SH
3939int lxc_clear_automounts(struct lxc_conf *c)
3940{
3941 c->auto_mounts = 0;
3942 return 0;
3943}
3944
12a50cc6 3945int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3946{
72d0e1cb 3947 int i;
0fd73091
CB
3948 struct lxc_list *it, *next;
3949 const char *k = NULL;
3950 bool all = false, done = false;
72d0e1cb 3951
17ed13a3
SH
3952 if (strcmp(key, "lxc.hook") == 0)
3953 all = true;
0fd73091
CB
3954 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.") - 1) == 0)
3955 k = key + sizeof("lxc.hook.") - 1;
a6390f01
WB
3956 else
3957 return -1;
17ed13a3 3958
0fd73091 3959 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 3960 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 3961 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3962 lxc_list_del(it);
3963 free(it->elem);
3964 free(it);
3965 }
0fd73091 3966
17ed13a3 3967 done = true;
72d0e1cb
SG
3968 }
3969 }
17ed13a3
SH
3970
3971 if (!done) {
3972 ERROR("Invalid hook key: %s", key);
3973 return -1;
3974 }
0fd73091 3975
72d0e1cb
SG
3976 return 0;
3977}
8eb5694b 3978
4184c3e1
SH
3979static inline void lxc_clear_aliens(struct lxc_conf *conf)
3980{
0fd73091 3981 struct lxc_list *it, *next;
4184c3e1 3982
0fd73091 3983 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
3984 lxc_list_del(it);
3985 free(it->elem);
3986 free(it);
3987 }
3988}
3989
c7b15d1e 3990void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 3991{
0fd73091 3992 struct lxc_list *it, *next;
f979ac15 3993
0fd73091 3994 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
3995 lxc_list_del(it);
3996 free(it->elem);
3997 free(it);
3998 }
3999}
4000
8eb5694b
SH
4001void lxc_conf_free(struct lxc_conf *conf)
4002{
4003 if (!conf)
4004 return;
0fd73091 4005
858377e4
SH
4006 if (current_config == conf)
4007 current_config = NULL;
aed105d5 4008 lxc_terminal_conf_free(&conf->console);
f10fad2f 4009 free(conf->rootfs.mount);
b3b8c97f 4010 free(conf->rootfs.bdev_type);
f10fad2f
ME
4011 free(conf->rootfs.options);
4012 free(conf->rootfs.path);
f10fad2f 4013 free(conf->logfile);
858377e4
SH
4014 if (conf->logfd != -1)
4015 close(conf->logfd);
f10fad2f 4016 free(conf->utsname);
885766f5
CB
4017 free(conf->ttys.dir);
4018 free(conf->ttys.tty_names);
f10fad2f
ME
4019 free(conf->fstab);
4020 free(conf->rcfile);
5cda27c1 4021 free(conf->execute_cmd);
f10fad2f 4022 free(conf->init_cmd);
3c491553 4023 free(conf->init_cwd);
6b0d5538 4024 free(conf->unexpanded_config);
76d0127f 4025 free(conf->syslog);
c302b476 4026 lxc_free_networks(&conf->network);
f10fad2f
ME
4027 free(conf->lsm_aa_profile);
4028 free(conf->lsm_se_context);
769872f9 4029 lxc_seccomp_free(conf);
8eb5694b 4030 lxc_clear_config_caps(conf);
1fb86a7c 4031 lxc_clear_config_keepcaps(conf);
54860ed0
CB
4032 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
4033 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
17ed13a3 4034 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4035 lxc_clear_mount_entries(conf);
27c27d73 4036 lxc_clear_idmaps(conf);
ee1e7aa0 4037 lxc_clear_groups(conf);
f979ac15 4038 lxc_clear_includes(conf);
761d81ca 4039 lxc_clear_aliens(conf);
ab799c0b 4040 lxc_clear_environment(conf);
240d4b74 4041 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 4042 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 4043 lxc_clear_procs(conf, "lxc.proc");
43654d34
CB
4044 free(conf->cgroup_meta.dir);
4045 free(conf->cgroup_meta.controllers);
adf0ba1f
LT
4046 free(conf->lxc_shmount.path_host);
4047 free(conf->lxc_shmount.path_cont);
8eb5694b
SH
4048 free(conf);
4049}
4355ab5f
SH
4050
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback to run once the parent has signalled the child. */
	int (*fn)(void *);

	/* Name of @fn used for trace logging; may be NULL. */
	const char *fn_name;

	/* Opaque argument passed to @fn. */
	void *arg;

	/* Sync pipe: the child reads from p[0] and closes p[1]. */
	int p[2];
};
4057
4058static int run_userns_fn(void *data)
4059{
4355ab5f 4060 char c;
0fd73091 4061 struct userns_fn_data *d = data;
4355ab5f 4062
f8aa4bf3 4063 /* Close write end of the pipe. */
4355ab5f 4064 close(d->p[1]);
f8aa4bf3
CB
4065
4066 /* Wait for parent to finish establishing a new mapping in the user
4067 * namespace we are executing in.
4068 */
489f39be 4069 if (lxc_read_nointr(d->p[0], &c, 1) != 1)
4355ab5f 4070 return -1;
f8aa4bf3
CB
4071
4072 /* Close read end of the pipe. */
4355ab5f 4073 close(d->p[0]);
f8aa4bf3 4074
c9b7c33e
CB
4075 if (d->fn_name)
4076 TRACE("calling function \"%s\"", d->fn_name);
0fd73091 4077
f8aa4bf3 4078 /* Call function to run. */
4355ab5f
SH
4079 return d->fn(d->arg);
4080}
4081
db7cfe23
CB
4082static struct id_map *mapped_nsid_add(struct lxc_conf *conf, unsigned id,
4083 enum idtype idtype)
4084{
5173b710
CB
4085 const struct id_map *map;
4086 struct id_map *retmap;
db7cfe23
CB
4087
4088 map = find_mapped_nsid_entry(conf, id, idtype);
4089 if (!map)
4090 return NULL;
4091
4092 retmap = malloc(sizeof(*retmap));
4093 if (!retmap)
4094 return NULL;
4095
4096 memcpy(retmap, map, sizeof(*retmap));
4097 return retmap;
4098}
4099
c4333195
CB
4100static struct id_map *find_mapped_hostid_entry(struct lxc_conf *conf,
4101 unsigned id, enum idtype idtype)
f8aa4bf3 4102{
f8aa4bf3 4103 struct id_map *map;
0fd73091 4104 struct lxc_list *it;
f8aa4bf3
CB
4105 struct id_map *retmap = NULL;
4106
0fd73091 4107 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4108 map = it->elem;
4109 if (map->idtype != idtype)
4110 continue;
4111
4112 if (id >= map->hostid && id < map->hostid + map->range) {
4113 retmap = map;
4114 break;
4115 }
4116 }
4117
f8aa4bf3
CB
4118 return retmap;
4119}
4120
0fd73091 4121/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4122 * existing one or establish a new one.
4355ab5f 4123 */
0fd73091
CB
4124static struct id_map *mapped_hostid_add(struct lxc_conf *conf, uid_t id,
4125 enum idtype type)
4355ab5f 4126{
28a2d9e7 4127 int hostid_mapped;
c4333195
CB
4128 struct id_map *entry = NULL, *tmp = NULL;
4129
4130 entry = malloc(sizeof(*entry));
4131 if (!entry)
4132 return NULL;
f8aa4bf3 4133
28a2d9e7 4134 /* Reuse existing mapping. */
c4333195
CB
4135 tmp = find_mapped_hostid_entry(conf, id, type);
4136 if (tmp)
4137 return memcpy(entry, tmp, sizeof(*entry));
f8aa4bf3 4138
28a2d9e7
CB
4139 /* Find new mapping. */
4140 hostid_mapped = find_unmapped_nsid(conf, type);
4141 if (hostid_mapped < 0) {
c4333195
CB
4142 DEBUG("Failed to find free mapping for id %d", id);
4143 free(entry);
28a2d9e7 4144 return NULL;
f8aa4bf3 4145 }
f8aa4bf3 4146
28a2d9e7
CB
4147 entry->idtype = type;
4148 entry->nsid = hostid_mapped;
4149 entry->hostid = (unsigned long)id;
4150 entry->range = 1;
4355ab5f 4151
28a2d9e7 4152 return entry;
4355ab5f
SH
4153}
4154
dcf0ffdf 4155struct lxc_list *get_minimal_idmap(struct lxc_conf *conf)
4355ab5f 4156{
f8aa4bf3 4157 uid_t euid, egid;
4160c3a0
CB
4158 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4159 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
f8aa4bf3 4160 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4161 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4162 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4163
db7cfe23 4164 /* Find container root mappings. */
4160c3a0 4165 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
db7cfe23 4166 if (!container_root_uid) {
dcf0ffdf 4167 DEBUG("Failed to find mapping for namespace uid %d", 0);
db7cfe23 4168 goto on_error;
f8aa4bf3 4169 }
dcf0ffdf
CB
4170 euid = geteuid();
4171 if (euid >= container_root_uid->hostid &&
4172 euid < (container_root_uid->hostid + container_root_uid->range))
db7cfe23 4173 host_uid_map = container_root_uid;
f8aa4bf3 4174
4160c3a0 4175 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
db7cfe23 4176 if (!container_root_gid) {
dcf0ffdf 4177 DEBUG("Failed to find mapping for namespace gid %d", 0);
f8aa4bf3
CB
4178 goto on_error;
4179 }
dcf0ffdf
CB
4180 egid = getegid();
4181 if (egid >= container_root_gid->hostid &&
4182 egid < (container_root_gid->hostid + container_root_gid->range))
db7cfe23 4183 host_gid_map = container_root_gid;
f8aa4bf3
CB
4184
4185 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4186 if (!host_uid_map)
c4333195 4187 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
28a2d9e7 4188 if (!host_uid_map) {
db7cfe23 4189 DEBUG("Failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4190 goto on_error;
4191 }
4192
dcf0ffdf
CB
4193 if (!host_gid_map)
4194 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
28a2d9e7 4195 if (!host_gid_map) {
db7cfe23 4196 DEBUG("Failed to find mapping for gid %d", egid);
28a2d9e7
CB
4197 goto on_error;
4198 }
4199
4200 /* Allocate new {g,u}id map list. */
4201 idmap = malloc(sizeof(*idmap));
4202 if (!idmap)
4203 goto on_error;
4204 lxc_list_init(idmap);
4205
f8aa4bf3
CB
4206 /* Add container root to the map. */
4207 tmplist = malloc(sizeof(*tmplist));
4208 if (!tmplist)
4209 goto on_error;
4210 lxc_list_add_elem(tmplist, container_root_uid);
4211 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4212
1d90e064 4213 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4214 /* idmap will now keep track of that memory. */
4215 container_root_uid = NULL;
4216
4217 /* Add container root to the map. */
4218 tmplist = malloc(sizeof(*tmplist));
4219 if (!tmplist)
4220 goto on_error;
4221 lxc_list_add_elem(tmplist, host_uid_map);
4222 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4223 }
1d90e064
CB
4224 /* idmap will now keep track of that memory. */
4225 container_root_uid = NULL;
4226 /* idmap will now keep track of that memory. */
4227 host_uid_map = NULL;
f8aa4bf3
CB
4228
4229 tmplist = malloc(sizeof(*tmplist));
4230 if (!tmplist)
4231 goto on_error;
4232 lxc_list_add_elem(tmplist, container_root_gid);
4233 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4234
1d90e064 4235 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4236 /* idmap will now keep track of that memory. */
4237 container_root_gid = NULL;
4238
4239 tmplist = malloc(sizeof(*tmplist));
4240 if (!tmplist)
4241 goto on_error;
4242 lxc_list_add_elem(tmplist, host_gid_map);
4243 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4244 }
1d90e064
CB
4245 /* idmap will now keep track of that memory. */
4246 container_root_gid = NULL;
4247 /* idmap will now keep track of that memory. */
4248 host_gid_map = NULL;
f8aa4bf3 4249
dcf0ffdf
CB
4250 TRACE("Allocated minimal idmapping");
4251 return idmap;
4252
4253on_error:
4dc41f99 4254 if (idmap) {
dcf0ffdf 4255 lxc_free_idmap(idmap);
4dc41f99
SX
4256 free(idmap);
4257 }
dcf0ffdf
CB
4258 if (container_root_uid)
4259 free(container_root_uid);
4260 if (container_root_gid)
4261 free(container_root_gid);
4262 if (host_uid_map && (host_uid_map != container_root_uid))
4263 free(host_uid_map);
4264 if (host_gid_map && (host_gid_map != container_root_gid))
4265 free(host_gid_map);
4266
4267 return NULL;
4268}
4269
4270/* Run a function in a new user namespace.
4271 * The caller's euid/egid will be mapped if it is not already.
4272 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4273 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4274 * This means we require only to establish a mapping from:
4275 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4276 * - the container root -> some sub{g,u}id
4277 * The former we add, if the user did not specifiy a mapping. The latter we
4278 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4279 * there to start the container in the first place.
4280 */
4281int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4282 const char *fn_name)
4283{
4284 pid_t pid;
dcf0ffdf 4285 int p[2];
0fd73091 4286 struct userns_fn_data d;
dcf0ffdf 4287 struct lxc_list *idmap;
0fd73091
CB
4288 int ret = -1, status = -1;
4289 char c = '1';
dcf0ffdf 4290
2b2655a8
CB
4291 if (!conf)
4292 return -EINVAL;
4293
dcf0ffdf
CB
4294 idmap = get_minimal_idmap(conf);
4295 if (!idmap)
4296 return -1;
4297
4298 ret = pipe(p);
4299 if (ret < 0) {
4300 SYSERROR("Failed to create pipe");
4301 return -1;
4302 }
4303 d.fn = fn;
4304 d.fn_name = fn_name;
4305 d.arg = data;
4306 d.p[0] = p[0];
4307 d.p[1] = p[1];
4308
4309 /* Clone child in new user namespace. */
4310 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER);
4311 if (pid < 0) {
0fd73091 4312 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4313 goto on_error;
4314 }
4315
4316 close(p[0]);
4317 p[0] = -1;
4318
4b73005c
CB
4319 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4320 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
dcf0ffdf 4321 struct id_map *map;
0fd73091 4322 struct lxc_list *it;
dcf0ffdf 4323
0fd73091 4324 lxc_list_for_each (it, idmap) {
f8aa4bf3 4325 map = it->elem;
dcf0ffdf 4326 TRACE("Establishing %cid mapping for \"%d\" in new "
f8aa4bf3 4327 "user namespace: nsuid %lu - hostid %lu - range "
0fd73091
CB
4328 "%lu",
4329 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4330 map->nsid, map->hostid, map->range);
f8aa4bf3 4331 }
4355ab5f
SH
4332 }
4333
f8aa4bf3 4334 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4335 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4336 if (ret < 0) {
0fd73091 4337 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4338 goto on_error;
4355ab5f
SH
4339 }
4340
f8aa4bf3 4341 /* Tell child to proceed. */
489f39be 4342 if (lxc_write_nointr(p[1], &c, 1) != 1) {
dcf0ffdf 4343 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4344 goto on_error;
4355ab5f
SH
4345 }
4346
686dd5d1 4347on_error:
4355ab5f
SH
4348 if (p[0] != -1)
4349 close(p[0]);
4350 close(p[1]);
f8aa4bf3 4351
ee1b16bc
TA
4352 /* Wait for child to finish. */
4353 if (pid > 0)
4354 status = wait_for_pid(pid);
4355
686dd5d1
CB
4356 if (status < 0)
4357 ret = -1;
4358
f8aa4bf3 4359 return ret;
4355ab5f 4360}
97e9cfa0 4361
415a8851
CB
4362int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4363 const char *fn_name)
4364{
4365 pid_t pid;
4366 uid_t euid, egid;
415a8851
CB
4367 int p[2];
4368 struct id_map *map;
4369 struct lxc_list *cur;
0fd73091 4370 struct userns_fn_data d;
415a8851 4371 int ret = -1;
0fd73091 4372 char c = '1';
415a8851
CB
4373 struct lxc_list *idmap = NULL, *tmplist = NULL;
4374 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4375 *host_uid_map = NULL, *host_gid_map = NULL;
4376
2b2655a8
CB
4377 if (!conf)
4378 return -EINVAL;
4379
415a8851
CB
4380 ret = pipe(p);
4381 if (ret < 0) {
4382 SYSERROR("opening pipe");
4383 return -1;
4384 }
4385 d.fn = fn;
4386 d.fn_name = fn_name;
4387 d.arg = data;
4388 d.p[0] = p[0];
4389 d.p[1] = p[1];
4390
4391 /* Clone child in new user namespace. */
4392 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4393 if (pid < 0) {
0fd73091 4394 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4395 goto on_error;
4396 }
4397
4398 close(p[0]);
4399 p[0] = -1;
4400
4401 euid = geteuid();
4402 egid = getegid();
4403
4404 /* Allocate new {g,u}id map list. */
4405 idmap = malloc(sizeof(*idmap));
4406 if (!idmap)
4407 goto on_error;
4408 lxc_list_init(idmap);
4409
4410 /* Find container root. */
0fd73091 4411 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4412 struct id_map *tmpmap;
4413
4414 tmplist = malloc(sizeof(*tmplist));
4415 if (!tmplist)
4416 goto on_error;
4417
4418 tmpmap = malloc(sizeof(*tmpmap));
4419 if (!tmpmap) {
4420 free(tmplist);
4421 goto on_error;
4422 }
4423
4424 memset(tmpmap, 0, sizeof(*tmpmap));
4425 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4426 tmplist->elem = tmpmap;
4427
4428 lxc_list_add_tail(idmap, tmplist);
4429
4430 map = cur->elem;
4431
4432 if (map->idtype == ID_TYPE_UID)
4433 if (euid >= map->hostid && euid < map->hostid + map->range)
4434 host_uid_map = map;
4435
4436 if (map->idtype == ID_TYPE_GID)
4437 if (egid >= map->hostid && egid < map->hostid + map->range)
4438 host_gid_map = map;
4439
4440 if (map->nsid != 0)
4441 continue;
4442
4443 if (map->idtype == ID_TYPE_UID)
4444 if (container_root_uid == NULL)
4445 container_root_uid = map;
4446
4447 if (map->idtype == ID_TYPE_GID)
4448 if (container_root_gid == NULL)
4449 container_root_gid = map;
4450 }
4451
4452 if (!container_root_uid || !container_root_gid) {
4453 ERROR("No mapping for container root found");
4454 goto on_error;
4455 }
4456
4457 /* Check whether the {g,u}id of the user has a mapping. */
4458 if (!host_uid_map)
c4333195 4459 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4460 else
4461 host_uid_map = container_root_uid;
4462
4463 if (!host_gid_map)
c4333195 4464 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4465 else
4466 host_gid_map = container_root_gid;
4467
4468 if (!host_uid_map) {
4469 DEBUG("Failed to find mapping for uid %d", euid);
4470 goto on_error;
4471 }
4472
4473 if (!host_gid_map) {
4474 DEBUG("Failed to find mapping for gid %d", egid);
4475 goto on_error;
4476 }
4477
4478 if (host_uid_map && (host_uid_map != container_root_uid)) {
4479 /* Add container root to the map. */
4480 tmplist = malloc(sizeof(*tmplist));
4481 if (!tmplist)
4482 goto on_error;
4483 lxc_list_add_elem(tmplist, host_uid_map);
4484 lxc_list_add_tail(idmap, tmplist);
4485 }
4486 /* idmap will now keep track of that memory. */
4487 host_uid_map = NULL;
4488
4489 if (host_gid_map && (host_gid_map != container_root_gid)) {
4490 tmplist = malloc(sizeof(*tmplist));
4491 if (!tmplist)
4492 goto on_error;
4493 lxc_list_add_elem(tmplist, host_gid_map);
4494 lxc_list_add_tail(idmap, tmplist);
4495 }
4496 /* idmap will now keep track of that memory. */
4497 host_gid_map = NULL;
4498
4499 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4500 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
0fd73091 4501 lxc_list_for_each (cur, idmap) {
415a8851
CB
4502 map = cur->elem;
4503 TRACE("establishing %cid mapping for \"%d\" in new "
4504 "user namespace: nsuid %lu - hostid %lu - range "
4505 "%lu",
4506 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4507 map->nsid, map->hostid, map->range);
4508 }
4509 }
4510
4511 /* Set up {g,u}id mapping for user namespace of child process. */
4512 ret = lxc_map_ids(idmap, pid);
4513 if (ret < 0) {
0fd73091 4514 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4515 goto on_error;
4516 }
4517
4518 /* Tell child to proceed. */
489f39be 4519 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4520 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4521 goto on_error;
4522 }
4523
686dd5d1 4524on_error:
ee1b16bc
TA
4525 if (p[0] != -1)
4526 close(p[0]);
4527 close(p[1]);
4528
415a8851 4529 /* Wait for child to finish. */
686dd5d1
CB
4530 if (pid > 0)
4531 ret = wait_for_pid(pid);
415a8851 4532
80758b4b 4533 if (idmap) {
415a8851 4534 lxc_free_idmap(idmap);
80758b4b
DJ
4535 free(idmap);
4536 }
4537
415a8851
CB
4538 if (host_uid_map && (host_uid_map != container_root_uid))
4539 free(host_uid_map);
4540 if (host_gid_map && (host_gid_map != container_root_gid))
4541 free(host_gid_map);
4542
415a8851
CB
4543 return ret;
4544}
4545
/* not thread-safe, do not use from api without first forking */
/* Return a newly allocated copy of the login name belonging to the calling
 * process's effective uid, or NULL if the lookup or an allocation fails. The
 * caller frees the result.
 */
static char *getuname(void)
{
	struct passwd pw;
	struct passwd *found = NULL;
	char *scratch, *name;
	size_t scratch_len;
	int err;

	/* sysconf() reports -1 when it has no size hint; use 1024 then. */
	scratch_len = sysconf(_SC_GETPW_R_SIZE_MAX);
	if (scratch_len == (size_t)-1)
		scratch_len = 1024;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getpwuid_r(geteuid(), &pw, scratch, scratch_len, &found);
	if (found) {
		/* pw's strings live in scratch, so copy before freeing. */
		name = strdup(pw.pw_name);
		free(scratch);
		return name;
	}

	if (err == 0)
		WARN("Could not find matched password record.");

	ERROR("Failed to get password record - %u", geteuid());
	free(scratch);
	return NULL;
}
4579
/* not thread-safe, do not use from api without first forking */
/* Return a newly allocated copy of the group name belonging to the calling
 * process's effective gid, or NULL if the lookup or an allocation fails. The
 * caller frees the result.
 */
static char *getgname(void)
{
	struct group gr;
	struct group *found = NULL;
	char *scratch, *name;
	size_t scratch_len;
	int err;

	/* sysconf() reports -1 when it has no size hint; use 1024 then. */
	scratch_len = sysconf(_SC_GETGR_R_SIZE_MAX);
	if (scratch_len == (size_t)-1)
		scratch_len = 1024;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getgrgid_r(getegid(), &gr, scratch, scratch_len, &found);
	if (found) {
		/* gr's strings live in scratch, so copy before freeing. */
		name = strdup(gr.gr_name);
		free(scratch);
		return name;
	}

	if (err == 0)
		WARN("Could not find matched group record");

	ERROR("Failed to get group record - %u", getegid());
	free(scratch);
	return NULL;
}
4613
a96a8e8c 4614/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4615void suggest_default_idmap(void)
4616{
0fd73091 4617 char *uname, *gname;
97e9cfa0
SH
4618 FILE *f;
4619 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0 4620 size_t len = 0;
0fd73091 4621 char *line = NULL;
97e9cfa0 4622
0fd73091
CB
4623 uname = getuname();
4624 if (!uname)
97e9cfa0
SH
4625 return;
4626
0fd73091
CB
4627 gname = getgname();
4628 if (!gname) {
97e9cfa0
SH
4629 free(uname);
4630 return;
4631 }
4632
4633 f = fopen(subuidfile, "r");
4634 if (!f) {
4635 ERROR("Your system is not configured with subuids");
4636 free(gname);
4637 free(uname);
4638 return;
4639 }
0fd73091 4640
97e9cfa0 4641 while (getline(&line, &len, f) != -1) {
0fd73091 4642 char *p, *p2;
b7930180 4643 size_t no_newline = 0;
0fd73091
CB
4644
4645 p = strchr(line, ':');
97e9cfa0
SH
4646 if (*line == '#')
4647 continue;
4648 if (!p)
4649 continue;
4650 *p = '\0';
4651 p++;
0fd73091 4652
97e9cfa0
SH
4653 if (strcmp(line, uname))
4654 continue;
0fd73091 4655
97e9cfa0
SH
4656 p2 = strchr(p, ':');
4657 if (!p2)
4658 continue;
4659 *p2 = '\0';
4660 p2++;
4661 if (!*p2)
4662 continue;
b7930180
CB
4663 no_newline = strcspn(p2, "\n");
4664 p2[no_newline] = '\0';
4665
b7b2fde4 4666 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4667 WARN("Could not parse UID");
b7b2fde4 4668 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4669 WARN("Could not parse UID range");
97e9cfa0
SH
4670 }
4671 fclose(f);
4672
6be7389a 4673 f = fopen(subgidfile, "r");
97e9cfa0
SH
4674 if (!f) {
4675 ERROR("Your system is not configured with subgids");
4676 free(gname);
4677 free(uname);
4678 return;
4679 }
0fd73091 4680
97e9cfa0 4681 while (getline(&line, &len, f) != -1) {
0fd73091 4682 char *p, *p2;
b7930180 4683 size_t no_newline = 0;
0fd73091
CB
4684
4685 p = strchr(line, ':');
97e9cfa0
SH
4686 if (*line == '#')
4687 continue;
4688 if (!p)
4689 continue;
4690 *p = '\0';
4691 p++;
0fd73091 4692
97e9cfa0
SH
4693 if (strcmp(line, uname))
4694 continue;
0fd73091 4695
97e9cfa0
SH
4696 p2 = strchr(p, ':');
4697 if (!p2)
4698 continue;
4699 *p2 = '\0';
4700 p2++;
4701 if (!*p2)
4702 continue;
b7930180
CB
4703 no_newline = strcspn(p2, "\n");
4704 p2[no_newline] = '\0';
4705
b7b2fde4 4706 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4707 WARN("Could not parse GID");
b7b2fde4 4708 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4709 WARN("Could not parse GID range");
97e9cfa0
SH
4710 }
4711 fclose(f);
4712
f10fad2f 4713 free(line);
97e9cfa0
SH
4714
4715 if (!urange || !grange) {
4716 ERROR("You do not have subuids or subgids allocated");
4717 ERROR("Unprivileged containers require subuids and subgids");
fbd4a4d1 4718 free(uname);
1e7cd2f7 4719 free(gname);
97e9cfa0
SH
4720 return;
4721 }
4722
4723 ERROR("You must either run as root, or define uid mappings");
4724 ERROR("To pass uid mappings to lxc-create, you could create");
4725 ERROR("~/.config/lxc/default.conf:");
4726 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4727 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4728 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0
SH
4729
4730 free(gname);
4731 free(uname);
4732}
aaf26830 4733
a7307747
SH
4734static void free_cgroup_settings(struct lxc_list *result)
4735{
4736 struct lxc_list *iterator, *next;
4737
0fd73091 4738 lxc_list_for_each_safe (iterator, result, next) {
a7307747
SH
4739 lxc_list_del(iterator);
4740 free(iterator);
4741 }
4742 free(result);
4743}
4744
0fd73091 4745/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4746 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4747 */
0fd73091 4748struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4749{
4750 struct lxc_list *result;
aaf26830 4751 struct lxc_cgroup *cg = NULL;
0fd73091 4752 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4753
4754 result = malloc(sizeof(*result));
0fd73091 4755 if (!result)
fac7c663 4756 return NULL;
aaf26830
KT
4757 lxc_list_init(result);
4758
0fd73091
CB
4759 /* Iterate over the cgroup settings and copy them to the output list. */
4760 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4761 item = malloc(sizeof(*item));
fac7c663 4762 if (!item) {
a7307747 4763 free_cgroup_settings(result);
fac7c663
KT
4764 return NULL;
4765 }
0fd73091 4766
aaf26830
KT
4767 item->elem = it->elem;
4768 cg = it->elem;
4769 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4770 /* Store the memsw_limit location */
4771 memsw_limit = item;
0fd73091
CB
4772 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4773 memsw_limit != NULL) {
4774 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4775 * before lxc.cgroup.memory.limit_in_bytes, swap these
4776 * two items */
aaf26830
KT
4777 item->elem = memsw_limit->elem;
4778 memsw_limit->elem = it->elem;
4779 }
4780 lxc_list_add_tail(result, item);
4781 }
4782
4783 return result;
a7307747 4784}