]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
conf: use fd_cloexec()
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
9d257a2a 27#include <arpa/inet.h>
8f3e280e
CB
28#include <dirent.h>
29#include <errno.h>
30#include <fcntl.h>
31#include <grp.h>
32#include <inttypes.h>
33#include <libgen.h>
9d257a2a
CB
34#include <linux/loop.h>
35#include <net/if.h>
36#include <netinet/in.h>
8f3e280e
CB
37#include <pwd.h>
38#include <stdarg.h>
0ad19a3f 39#include <stdio.h>
0ad19a3f 40#include <stdlib.h>
0ad19a3f 41#include <string.h>
8f3e280e
CB
42#include <sys/mman.h>
43#include <sys/mount.h>
44#include <sys/param.h>
45#include <sys/prctl.h>
6a49f05e 46#include <sys/sendfile.h>
8f3e280e 47#include <sys/socket.h>
9d257a2a 48#include <sys/stat.h>
2d76d1d7 49#include <sys/syscall.h>
9d257a2a 50#include <sys/sysmacros.h>
97e9cfa0 51#include <sys/types.h>
8f3e280e
CB
52#include <sys/utsname.h>
53#include <sys/wait.h>
9d257a2a
CB
54#include <time.h>
55#include <unistd.h>
1d52bdf7 56
af6824fc 57#ifdef MAJOR_IN_MKDEV
9d257a2a 58#include <sys/mkdev.h>
af6824fc 59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
9d257a2a
CB
71#if HAVE_LIBCAP
72#include <sys/capability.h>
73#endif
74
75#if HAVE_SYS_PERSONALITY_H
76#include <sys/personality.h>
77#endif
78
f1e05b90
DJ
79#ifndef HAVE_STRLCAT
80#include "include/strlcat.h"
81#endif
82
9d257a2a
CB
83#if IS_BIONIC
84#include <../include/lxcmntent.h>
85#else
86#include <mntent.h>
87#endif
88
89#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
90#include <../include/prlimit.h>
91#endif
92
e8bd4e43 93#include "af_unix.h"
9d257a2a 94#include "caps.h"
8f3e280e 95#include "cgroup.h"
1b09f2c0 96#include "conf.h"
1ed6ba91 97#include "confile_utils.h"
8f3e280e 98#include "error.h"
1b09f2c0 99#include "log.h"
0ed9b1bc 100#include "lsm/lsm.h"
025ed0f3 101#include "lxclock.h"
8f3e280e 102#include "lxcseccomp.h"
4355ab5f 103#include "namespace.h"
8f3e280e
CB
104#include "network.h"
105#include "parse.h"
732375f5 106#include "ringbuf.h"
794248d0 107#include "start.h"
28d832c4 108#include "storage.h"
28d832c4 109#include "storage/overlay.h"
0ed9b1bc 110#include "terminal.h"
8f3e280e 111#include "utils.h"
d0a36f2c 112
9d257a2a
CB
113#ifndef MS_PRIVATE
114#define MS_PRIVATE (1<<18)
edaf8b1b
SG
115#endif
116
9d257a2a
CB
117#ifndef MS_LAZYTIME
118#define MS_LAZYTIME (1<<25)
f48b5fd8
FF
119#endif
120
ac2cecc4 121lxc_log_define(conf, lxc);
e5bda9ee 122
0fd73091
CB
/* Configuration of the container that the API call in progress operates on.
 * The error calls read it. The variable is thread-local when HAVE_TLS is
 * defined and a plain global otherwise.
 */
#ifdef HAVE_TLS
__thread
#endif
struct lxc_conf *current_config;
131
2d76d1d7
SG
/* pivot_root() wrapper.
 *
 * When the C library already provides pivot_root() only a prototype is
 * declared. Otherwise a local fallback is defined that issues the raw system
 * call, or fails with ENOSYS if the syscall number is unknown at build time.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
146
0fd73091
CB
147char *lxchook_names[NUM_LXC_HOOKS] = {
148 "pre-start",
149 "pre-mount",
150 "mount",
151 "autodev",
152 "start",
153 "stop",
154 "post-stop",
155 "clone",
156 "destroy",
157 "start-host"
158};
72d0e1cb 159
998ac676
RT
/* One entry of a mount option table: an option keyword and the mount flag it
 * is associated with.
 */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro"; NULL terminates a table */
	int clear;	/* 1 for the keywords that negate a flag ("rw", "suid", ...) */
	int flag;	/* MS_* bit(s) associated with the keyword */
};
165
81810dd1
DL
/* One entry of the capability table: a capability name and its CAP_* value. */
struct caps_opt {
	char *name;	/* lower-case capability name without the "CAP_" prefix */
	int value;	/* matching CAP_* constant */
};
170
c6d09e15
WB
/* One entry of the resource limit table: a limit name and its RLIMIT_* value. */
struct limit_opt {
	char *name;	/* lower-case limit name without the "RLIMIT_" prefix */
	int value;	/* matching RLIMIT_* constant */
};
175
998ac676 176static struct mount_opt mount_opt[] = {
470b359b
CB
177 { "async", 1, MS_SYNCHRONOUS },
178 { "atime", 1, MS_NOATIME },
179 { "bind", 0, MS_BIND },
88d413d5 180 { "defaults", 0, 0 },
88d413d5 181 { "dev", 1, MS_NODEV },
470b359b 182 { "diratime", 1, MS_NODIRATIME },
88d413d5 183 { "dirsync", 0, MS_DIRSYNC },
470b359b 184 { "exec", 1, MS_NOEXEC },
8912711c 185 { "lazytime", 0, MS_LAZYTIME },
88d413d5 186 { "mand", 0, MS_MANDLOCK },
88d413d5 187 { "noatime", 0, MS_NOATIME },
470b359b 188 { "nodev", 0, MS_NODEV },
88d413d5 189 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
190 { "noexec", 0, MS_NOEXEC },
191 { "nomand", 1, MS_MANDLOCK },
192 { "norelatime", 1, MS_RELATIME },
193 { "nostrictatime", 1, MS_STRICTATIME },
194 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
195 { "rbind", 0, MS_BIND|MS_REC },
196 { "relatime", 0, MS_RELATIME },
470b359b
CB
197 { "remount", 0, MS_REMOUNT },
198 { "ro", 0, MS_RDONLY },
199 { "rw", 1, MS_RDONLY },
88d413d5 200 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
201 { "suid", 1, MS_NOSUID },
202 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 203 { NULL, 0, 0 },
998ac676
RT
204};
205
d840039e 206static struct mount_opt propagation_opt[] = {
0fd73091
CB
207 { "private", 0, MS_PRIVATE },
208 { "shared", 0, MS_SHARED },
209 { "slave", 0, MS_SLAVE },
210 { "unbindable", 0, MS_UNBINDABLE },
211 { "rprivate", 0, MS_PRIVATE|MS_REC },
212 { "rshared", 0, MS_SHARED|MS_REC },
213 { "rslave", 0, MS_SLAVE|MS_REC },
214 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
215 { NULL, 0, 0 },
d840039e
YT
216};
217
81810dd1 218static struct caps_opt caps_opt[] = {
8560cd36 219#if HAVE_LIBCAP
0fd73091
CB
220 { "chown", CAP_CHOWN },
221 { "dac_override", CAP_DAC_OVERRIDE },
222 { "dac_read_search", CAP_DAC_READ_SEARCH },
223 { "fowner", CAP_FOWNER },
224 { "fsetid", CAP_FSETID },
225 { "kill", CAP_KILL },
226 { "setgid", CAP_SETGID },
227 { "setuid", CAP_SETUID },
228 { "setpcap", CAP_SETPCAP },
229 { "linux_immutable", CAP_LINUX_IMMUTABLE },
230 { "net_bind_service", CAP_NET_BIND_SERVICE },
231 { "net_broadcast", CAP_NET_BROADCAST },
232 { "net_admin", CAP_NET_ADMIN },
233 { "net_raw", CAP_NET_RAW },
234 { "ipc_lock", CAP_IPC_LOCK },
235 { "ipc_owner", CAP_IPC_OWNER },
236 { "sys_module", CAP_SYS_MODULE },
237 { "sys_rawio", CAP_SYS_RAWIO },
238 { "sys_chroot", CAP_SYS_CHROOT },
239 { "sys_ptrace", CAP_SYS_PTRACE },
240 { "sys_pacct", CAP_SYS_PACCT },
241 { "sys_admin", CAP_SYS_ADMIN },
242 { "sys_boot", CAP_SYS_BOOT },
243 { "sys_nice", CAP_SYS_NICE },
244 { "sys_resource", CAP_SYS_RESOURCE },
245 { "sys_time", CAP_SYS_TIME },
246 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
247 { "mknod", CAP_MKNOD },
248 { "lease", CAP_LEASE },
57b837e2 249#ifdef CAP_AUDIT_READ
0fd73091 250 { "audit_read", CAP_AUDIT_READ },
57b837e2 251#endif
9527e566 252#ifdef CAP_AUDIT_WRITE
0fd73091 253 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
254#endif
255#ifdef CAP_AUDIT_CONTROL
0fd73091 256 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 257#endif
0fd73091
CB
258 { "setfcap", CAP_SETFCAP },
259 { "mac_override", CAP_MAC_OVERRIDE },
260 { "mac_admin", CAP_MAC_ADMIN },
5170c716 261#ifdef CAP_SYSLOG
0fd73091 262 { "syslog", CAP_SYSLOG },
5170c716
CS
263#endif
264#ifdef CAP_WAKE_ALARM
0fd73091 265 { "wake_alarm", CAP_WAKE_ALARM },
5170c716 266#endif
2b54359b 267#ifdef CAP_BLOCK_SUSPEND
0fd73091 268 { "block_suspend", CAP_BLOCK_SUSPEND },
2b54359b 269#endif
495d2046 270#endif
8560cd36 271};
81810dd1 272
c6d09e15
WB
273static struct limit_opt limit_opt[] = {
274#ifdef RLIMIT_AS
275 { "as", RLIMIT_AS },
276#endif
277#ifdef RLIMIT_CORE
278 { "core", RLIMIT_CORE },
279#endif
280#ifdef RLIMIT_CPU
281 { "cpu", RLIMIT_CPU },
282#endif
283#ifdef RLIMIT_DATA
284 { "data", RLIMIT_DATA },
285#endif
286#ifdef RLIMIT_FSIZE
287 { "fsize", RLIMIT_FSIZE },
288#endif
289#ifdef RLIMIT_LOCKS
290 { "locks", RLIMIT_LOCKS },
291#endif
292#ifdef RLIMIT_MEMLOCK
293 { "memlock", RLIMIT_MEMLOCK },
294#endif
295#ifdef RLIMIT_MSGQUEUE
296 { "msgqueue", RLIMIT_MSGQUEUE },
297#endif
298#ifdef RLIMIT_NICE
299 { "nice", RLIMIT_NICE },
300#endif
301#ifdef RLIMIT_NOFILE
302 { "nofile", RLIMIT_NOFILE },
303#endif
304#ifdef RLIMIT_NPROC
305 { "nproc", RLIMIT_NPROC },
306#endif
307#ifdef RLIMIT_RSS
308 { "rss", RLIMIT_RSS },
309#endif
310#ifdef RLIMIT_RTPRIO
311 { "rtprio", RLIMIT_RTPRIO },
312#endif
313#ifdef RLIMIT_RTTIME
314 { "rttime", RLIMIT_RTTIME },
315#endif
316#ifdef RLIMIT_SIGPENDING
317 { "sigpending", RLIMIT_SIGPENDING },
318#endif
319#ifdef RLIMIT_STACK
320 { "stack", RLIMIT_STACK },
321#endif
322};
323
91c3830e
SH
324static int run_buffer(char *buffer)
325{
8e7da691 326 int ret;
0fd73091
CB
327 char *output;
328 struct lxc_popen_FILE *f;
91c3830e 329
ebec9176 330 f = lxc_popen(buffer);
91c3830e 331 if (!f) {
3f60c2f7 332 SYSERROR("Failed to popen() %s", buffer);
91c3830e
SH
333 return -1;
334 }
335
336 output = malloc(LXC_LOG_BUFFER_SIZE);
337 if (!output) {
3f60c2f7 338 ERROR("Failed to allocate memory for %s", buffer);
ebec9176 339 lxc_pclose(f);
91c3830e
SH
340 return -1;
341 }
342
062b72c6 343 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
3f60c2f7 344 DEBUG("Script %s with output: %s", buffer, output);
91c3830e
SH
345
346 free(output);
347
ebec9176 348 ret = lxc_pclose(f);
8e7da691 349 if (ret == -1) {
3f60c2f7 350 SYSERROR("Script exited with error");
91c3830e 351 return -1;
8e7da691 352 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
3f60c2f7 353 ERROR("Script exited with status %d", WEXITSTATUS(ret));
8e7da691
DE
354 return -1;
355 } else if (WIFSIGNALED(ret)) {
3f60c2f7 356 ERROR("Script terminated by signal %d", WTERMSIG(ret));
8e7da691 357 return -1;
91c3830e
SH
358 }
359
360 return 0;
361}
362
14a7b0f9
CB
/* Export one hook environment variable (overwriting any previous value) and
 * log the outcome. Returns 0 on success, -1 if setenv() failed.
 */
static int run_script_setenv(const char *key, const char *value)
{
	if (setenv(key, value, 1) < 0) {
		SYSERROR("Failed to set environment variable: %s=%s", key, value);
		return -1;
	}

	TRACE("Set environment variable: %s=%s", key, value);
	return 0;
}

/* Build the command line for a hook script and run it.
 *
 * @name         container name
 * @hook_version 0: name, section and hook name are passed as positional
 *               arguments; 1: the hook type and section are exported as
 *               LXC_HOOK_TYPE / LXC_HOOK_SECTION instead, plus LXC_NET_*
 *               variables for the "net" section
 * @section      configuration section the hook belongs to
 * @script       script to execute
 * @hookname     name of the hook being run
 * @argv         NULL-terminated list of extra arguments appended to the
 *               command line (may be NULL, except for version 1 "net" hooks
 *               where argv[0] must carry the network type)
 *
 * Returns 0 on success, -EFBIG if the command line would exceed INT_MAX,
 * -ENOMEM if the buffer cannot be allocated and -1 on any other failure.
 */
int run_script_argv(const char *name, unsigned int hook_version,
		    const char *section, const char *script,
		    const char *hookname, char **argv)
{
	int buf_pos, i, ret;
	char *buffer;
	int fret = -1;
	size_t size = 0;

	if (hook_version == 0)
		INFO("Executing script \"%s\" for container \"%s\", config "
		     "section \"%s\"", script, name, section);
	else
		INFO("Executing script \"%s\" for container \"%s\"", script, name);

	/* Every extra argument is preceded by one space. */
	for (i = 0; argv && argv[i]; i++)
		size += strlen(argv[i]) + 1;

	/* "exec " (sizeof counts the space), the script and the final NUL. */
	size += sizeof("exec");
	size += strlen(script);
	size++;

	if (hook_version == 0) {
		/* Each positional argument plus its separating space. */
		size += strlen(hookname) + 1;
		size += strlen(name) + 1;
		size += strlen(section) + 1;
	}

	if (size > INT_MAX)
		return -EFBIG;

	buffer = malloc(size);
	if (!buffer)
		return -ENOMEM;

	if (hook_version == 0)
		buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
	else
		buf_pos = snprintf(buffer, size, "exec %s", script);
	if (buf_pos < 0 || (size_t)buf_pos >= size) {
		ERROR("Failed to create command line for script \"%s\"", script);
		goto on_error;
	}

	if (hook_version == 1) {
		if (run_script_setenv("LXC_HOOK_TYPE", hookname) < 0)
			goto on_error;

		if (run_script_setenv("LXC_HOOK_SECTION", section) < 0)
			goto on_error;

		if (strcmp(section, "net") == 0) {
			const char *type, *parent, *peer;

			if (!argv || !argv[0])
				goto on_error;

			type = argv[0];
			parent = argv[1] ? argv[1] : "";
			/* argv[2] only exists when argv[1] is not the list
			 * terminator; never read past the terminator.
			 */
			peer = (argv[1] && argv[2]) ? argv[2] : "";

			if (run_script_setenv("LXC_NET_TYPE", type) < 0)
				goto on_error;

			if (strcmp(type, "macvlan") == 0 ||
			    strcmp(type, "phys") == 0) {
				if (run_script_setenv("LXC_NET_PARENT", parent) < 0)
					goto on_error;
			} else if (strcmp(type, "veth") == 0) {
				if (run_script_setenv("LXC_NET_PEER", peer) < 0)
					goto on_error;

				if (run_script_setenv("LXC_NET_PARENT", parent) < 0)
					goto on_error;
			}
		}
	}

	for (i = 0; argv && argv[i]; i++) {
		size_t len = size - buf_pos;

		ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto on_error;
		}
		buf_pos += ret;
	}

	fret = run_buffer(buffer);

on_error:
	free(buffer);
	return fret;
}
503
/* Build "exec <script> <name> <section> [args...]" and run it.
 *
 * @name    container name
 * @section configuration section the script belongs to
 * @script  script to execute
 * @...     extra "char *" arguments, terminated by a NULL pointer
 *
 * The command line is built in a heap buffer: its length depends on the
 * caller-supplied strings and is only bounded by INT_MAX, which is too much
 * to place on the stack.
 *
 * Returns 0 on success and -1 on any failure.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	va_list ap;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
	     script, name, section);

	/* Every extra argument is preceded by one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen("exec");
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces and the terminating NUL. */
	size += 4;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer)
		return -1;

	ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size)
		goto on_error;

	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || (size_t)rc >= len) {
			va_end(ap);
			goto on_error;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

on_error:
	free(buffer);
	return fret;
}
548
0fd73091 549/* pin_rootfs
63fc76c3 550 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
551 * the duration of the container run, to prevent the container from marking
552 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
553 * no name pollution is happens.
554 * don't unlink on NFS to avoid random named stale handles.
0c547523
SH
555 * return -1 on error.
556 * return -2 if nothing needed to be pinned.
557 * return an open fd (>=0) if we pinned it.
558 */
559int pin_rootfs(const char *rootfs)
560{
0fd73091
CB
561 int fd, ret;
562 char absrootfs[MAXPATHLEN], absrootfspin[MAXPATHLEN];
0c547523 563 struct stat s;
63fc76c3 564 struct statfs sfs;
0c547523 565
e99ee0de 566 if (rootfs == NULL || strlen(rootfs) == 0)
0d03360a 567 return -2;
e99ee0de 568
00ec333b 569 if (!realpath(rootfs, absrootfs))
9be53773 570 return -2;
0c547523 571
0fd73091
CB
572 ret = stat(absrootfs, &s);
573 if (ret < 0)
0c547523 574 return -1;
0c547523 575
72f919c4 576 if (!S_ISDIR(s.st_mode))
0c547523
SH
577 return -2;
578
63fc76c3 579 ret = snprintf(absrootfspin, MAXPATHLEN, "%s/.lxc-keep", absrootfs);
00ec333b 580 if (ret >= MAXPATHLEN)
0c547523 581 return -1;
0c547523 582
0fd73091 583 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
b7ed4bf0
CS
584 if (fd < 0)
585 return fd;
0fd73091 586
205fc010
CB
587 ret = fstatfs (fd, &sfs);
588 if (ret < 0)
589 return fd;
63fc76c3
GJ
590
591 if (sfs.f_type == NFS_SUPER_MAGIC) {
205fc010 592 DEBUG("Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
63fc76c3
GJ
593 return fd;
594 }
595
b7ed4bf0 596 (void)unlink(absrootfspin);
0fd73091 597
0c547523
SH
598 return fd;
599}
600
0fd73091
CB
/* When a remount is requested, carry over the nosuid, nodev, read-only and
 * noexec flags that are currently set on the filesystem so they stay honored.
 *
 * @s     mount source; when NULL, @d is inspected instead
 * @d     mount destination
 * @flags requested mount flags
 *
 * Returns @flags unchanged when MS_REMOUNT is not set, when no path is
 * available, when statvfs() fails or when built without HAVE_STATVFS.
 * Otherwise returns @flags with the flags found on the filesystem added.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *path = s ? s : d;
	struct statvfs sb;

	if (!(flags & MS_REMOUNT) || !path)
		return flags;

	if (statvfs(path, &sb) < 0)
		return flags;

	return flags | (sb.f_flag & sticky);
#else
	return flags;
#endif
}
639
4fb3cba5 640static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 641{
0fd73091 642 int i, r;
b06b8511
CS
643 static struct {
644 int match_mask;
645 int match_flag;
646 const char *source;
647 const char *destination;
648 const char *fstype;
649 unsigned long flags;
650 const char *options;
651 } default_mounts[] = {
0fd73091
CB
652 /* Read-only bind-mounting... In older kernels, doing that
653 * required to do one MS_BIND mount and then
654 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
655 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
656 * onwards. However, this apparently does not work on kernel
657 * 3.8. Unfortunately, on that very same kernel, doing the same
658 * trick as above doesn't seem to work either, there one needs
659 * to ALSO specify MS_BIND for the remount, otherwise the
660 * entire fs is remounted read-only or the mount fails because
661 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
662 * kernels as low as 2.6.32...
368bbc02 663 */
0fd73091 664 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a 665 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
0fd73091
CB
666 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
667 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
668 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
669 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
670 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
671 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
672 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
673 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
674 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
675 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
676 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
677 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
678 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
679 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
680 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
681 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 682 };
368bbc02 683
b06b8511 684 for (i = 0; default_mounts[i].match_mask; i++) {
0fd73091
CB
685 int saved_errno;
686 unsigned long mflags;
687 char *destination = NULL;
688 char *source = NULL;
689 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
690 continue;
691
692 if (default_mounts[i].source) {
cc4fd506 693 /* will act like strdup if %r is not present */
0fd73091
CB
694 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
695 if (!source)
cc4fd506 696 return -1;
0fd73091 697 }
f24a52d5 698
0fd73091
CB
699 if (!default_mounts[i].destination) {
700 ERROR("BUG: auto mounts destination %d was NULL", i);
b06b8511 701 free(source);
0fd73091
CB
702 return -1;
703 }
704
705 /* will act like strdup if %r is not present */
706 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
707 if (!destination) {
708 saved_errno = errno;
709 free(source);
710 errno = saved_errno;
711 return -1;
712 }
713
714 mflags = add_required_remount_flags(source, destination,
715 default_mounts[i].flags);
716 r = safe_mount(source, destination, default_mounts[i].fstype,
717 mflags, default_mounts[i].options,
718 conf->rootfs.path ? conf->rootfs.mount : NULL);
719 saved_errno = errno;
720 if (r < 0 && errno == ENOENT) {
721 INFO("Mount source or target for \"%s\" on \"%s\" does "
722 "not exist. Skipping", source, destination);
723 r = 0;
724 } else if (r < 0) {
725 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
726 }
727
728 free(source);
729 free(destination);
730 if (r < 0) {
731 errno = saved_errno;
732 return -1;
368bbc02 733 }
368bbc02
CS
734 }
735
b06b8511 736 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
737 int cg_flags;
738
3f69fb12 739 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
740 /* If the type of cgroup mount was not specified, it depends on
741 * the container's capabilities as to what makes sense: if we
742 * have CAP_SYS_ADMIN, the read-only part can be remounted
743 * read-write anyway, so we may as well default to read-write;
744 * then the admin will not be given a false sense of security.
745 * (And if they really want mixed r/o r/w, then they can
746 * explicitly specify :mixed.) OTOH, if the container lacks
747 * CAP_SYS_ADMIN, do only default to :mixed, because then the
748 * container can't remount it read-write.
749 */
0769b82a
CS
750 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
751 int has_sys_admin = 0;
b0ee5983
CB
752
753 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 754 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 755 else
0769b82a 756 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
757
758 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 759 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 760 else
0769b82a 761 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 762 }
0fd73091 763
3f69fb12 764 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
765 cg_flags |= LXC_AUTO_CGROUP_FORCE;
766
2202afc9
CB
767 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
768 handler,
769 conf->rootfs.path ? conf->rootfs.mount : "",
770 cg_flags)) {
0fd73091 771 SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
b06b8511 772 return -1;
368bbc02
CS
773 }
774 }
775
368bbc02 776 return 0;
368bbc02
CS
777}
778
/* Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means no hostname was requested and is not an error.
 * Returns 0 on success (or when there is nothing to do) and -1 if
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;

	if (sethostname(hostname, strlen(hostname)) < 0) {
		SYSERROR("Failed to set the hostname to \"%s\"", hostname);
		return -1;
	}

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
796
69aa6655
DE
/* A symlink to create below the container's /dev directory. */
struct dev_symlinks {
	const char *oldpath;	/* symlink target */
	const char *name;	/* link name, relative to /dev */
};

/* Symlinks under /dev that point at the /proc/self/fd entries. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
808
ed8704d0 809static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 810{
0fd73091 811 int i, ret;
69aa6655 812 char path[MAXPATHLEN];
09227be2 813 struct stat s;
69aa6655 814
69aa6655
DE
815 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
816 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091
CB
817
818 ret = snprintf(path, sizeof(path), "%s/dev/%s",
819 rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
820 if (ret < 0 || ret >= MAXPATHLEN)
821 return -1;
09227be2 822
0fd73091
CB
823 /* Stat the path first. If we don't get an error accept it as
824 * is and don't try to create it
09227be2 825 */
0fd73091
CB
826 ret = stat(path, &s);
827 if (ret == 0)
09227be2 828 continue;
09227be2 829
69aa6655
DE
830 ret = symlink(d->oldpath, path);
831 if (ret && errno != EEXIST) {
0fd73091
CB
832 if (errno == EROFS) {
833 WARN("Failed to create \"%s\". Read-only filesystem", path);
09227be2 834 } else {
0fd73091 835 SYSERROR("Failed to create \"%s\"", path);
09227be2
MW
836 return -1;
837 }
69aa6655
DE
838 }
839 }
0fd73091 840
69aa6655
DE
841 return 0;
842}
843
/* Build a space-separated list of ptys to pass to systemd.
 *
 * @pp   address of the list; when *pp is NULL a new string of the form
 *       "container_ttys=<name>" is allocated, otherwise " <name>" is appended
 *       to the existing string, which is grown with realloc()
 * @name tty name to add
 *
 * Returns true on success. On allocation failure returns false and leaves
 * *pp untouched.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	size_t name_len = strlen(name);
	size_t old_len;
	char *grown;

	if (*pp == NULL) {
		/* sizeof(prefix) already accounts for the terminating NUL. */
		size_t total = sizeof(prefix) + name_len;

		*pp = malloc(total);
		if (*pp == NULL)
			return false;

		(void)snprintf(*pp, total, "%s%s", prefix, name);
		return true;
	}

	old_len = strlen(*pp);

	/* Room for the old list, a space, the new name and the NUL. */
	grown = realloc(*pp, old_len + name_len + 2);
	if (grown == NULL)
		return false;

	*pp = grown;
	grown[old_len] = ' ';
	memcpy(grown + old_len + 1, name, name_len + 1);

	return true;
}
870
2187efd3 871static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 872{
9e1045e3 873 int i, ret;
0e4be3cf 874 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 875 char *ttydir = ttys->dir;
7c6ef2a2 876 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 877
e8bd4e43 878 if (!conf->rootfs.path)
bc9bd0e3
DL
879 return 0;
880
885766f5 881 for (i = 0; i < ttys->max; i++) {
0e4be3cf 882 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 883
e8bd4e43 884 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 885 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 886 return -1;
9e1045e3 887
7c6ef2a2
SH
888 if (ttydir) {
889 /* create dev/lxc/tty%d" */
9e1045e3
CB
890 ret = snprintf(lxcpath, sizeof(lxcpath),
891 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 892 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 893 return -1;
9e1045e3 894
7c6ef2a2 895 ret = creat(lxcpath, 0660);
9e1045e3 896 if (ret < 0 && errno != EEXIST) {
73363c61 897 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
898 return -1;
899 }
4d44e274
SH
900 if (ret >= 0)
901 close(ret);
9e1045e3 902
7c6ef2a2 903 ret = unlink(path);
9e1045e3 904 if (ret < 0 && errno != ENOENT) {
73363c61 905 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
906 return -1;
907 }
b0a33c1e 908
2520facd 909 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
9e1045e3 910 if (ret < 0) {
73363c61 911 WARN("Failed to bind mount \"%s\" onto \"%s\"",
2520facd 912 tty->name, path);
7c6ef2a2
SH
913 continue;
914 }
0fd73091 915 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name,
9e1045e3 916 path);
13954cce 917
9e1045e3
CB
918 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
919 ttydir, i + 1);
73363c61 920 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 921 return -1;
9e1045e3 922
7c6ef2a2 923 ret = symlink(lxcpath, path);
9e1045e3 924 if (ret < 0) {
73363c61 925 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 926 path, lxcpath);
7c6ef2a2
SH
927 return -1;
928 }
929 } else {
9e1045e3
CB
930 /* If we populated /dev, then we need to create
931 * /dev/ttyN
932 */
d3ccc04e
CB
933 ret = mknod(path, S_IFREG | 0000, 0);
934 if (ret < 0) /* this isn't fatal, continue */
6d1400b5 935 SYSERROR("Failed to create \"%s\"", path);
9e1045e3 936
2520facd 937 ret = mount(tty->name, path, "none", MS_BIND, 0);
9e1045e3 938 if (ret < 0) {
2520facd 939 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
7c6ef2a2
SH
940 continue;
941 }
9e1045e3 942
d3ccc04e 943 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
393903d1 944 }
9e1045e3 945
885766f5 946 if (!append_ttyname(&conf->ttys.tty_names, tty->name)) {
393903d1
SH
947 ERROR("Error setting up container_ttys string");
948 return -1;
b0a33c1e 949 }
950 }
951
885766f5 952 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 953 return 0;
954}
955
663014ee 956int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 957{
2187efd3 958 int i, ret;
0fd73091 959 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3
CB
960
961 /* no tty in the configuration */
885766f5 962 if (ttys->max == 0)
2187efd3
CB
963 return 0;
964
885766f5 965 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
0e4be3cf 966 if (!ttys->tty)
2187efd3 967 return -ENOMEM;
2187efd3 968
885766f5 969 for (i = 0; i < ttys->max; i++) {
0e4be3cf 970 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 971
386e6768
CB
972 tty->master = -EBADF;
973 tty->slave = -EBADF;
2520facd
CB
974 ret = openpty(&tty->master, &tty->slave,
975 tty->name, NULL, NULL);
2187efd3 976 if (ret) {
0fd73091 977 SYSERROR("Failed to create tty %d", i);
885766f5 978 ttys->max = i;
0e4be3cf 979 lxc_delete_tty(ttys);
2187efd3
CB
980 return -ENOTTY;
981 }
982
0fd73091 983 DEBUG("Created tty \"%s\" with master fd %d and slave fd %d",
2520facd 984 tty->name, tty->master, tty->slave);
2187efd3
CB
985
986 /* Prevent leaking the file descriptors to the container */
615f24ff 987 ret = fd_cloexec(tty->master, true);
2187efd3 988 if (ret < 0)
a24c5678 989 SYSWARN("Failed to set FD_CLOEXEC flag on master fd %d of "
990 "tty device \"%s\"", tty->master, tty->name);
2187efd3 991
615f24ff 992 ret = fd_cloexec(tty->slave, true);
2187efd3 993 if (ret < 0)
a24c5678 994 SYSWARN("Failed to set FD_CLOEXEC flag on slave fd %d of "
995 "tty device \"%s\"", tty->slave, tty->name);
2187efd3 996
2520facd 997 tty->busy = 0;
2187efd3
CB
998 }
999
885766f5 1000 INFO("Finished creating %zu tty devices", ttys->max);
2187efd3
CB
1001 return 0;
1002}
1003
0e4be3cf 1004void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3
CB
1005{
1006 int i;
1007
386e6768
CB
1008 if (!ttys->tty)
1009 return;
1010
885766f5 1011 for (i = 0; i < ttys->max; i++) {
0e4be3cf 1012 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1013
386e6768
CB
1014 if (tty->master >= 0) {
1015 close(tty->master);
1016 tty->master = -EBADF;
1017 }
1018
1019 if (tty->slave >= 0) {
1020 close(tty->slave);
1021 tty->slave = -EBADF;
1022 }
2187efd3
CB
1023 }
1024
0e4be3cf
CB
1025 free(ttys->tty);
1026 ttys->tty = NULL;
2187efd3
CB
1027}
1028
1029static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1030{
1031 int i;
0fd73091 1032 int ret = -1;
2187efd3 1033 struct lxc_conf *conf = handler->conf;
0e4be3cf 1034 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 1035 int sock = handler->data_sock[0];
2187efd3 1036
885766f5 1037 if (ttys->max == 0)
2187efd3
CB
1038 return 0;
1039
885766f5 1040 for (i = 0; i < ttys->max; i++) {
2187efd3 1041 int ttyfds[2];
0e4be3cf 1042 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1043
2520facd
CB
1044 ttyfds[0] = tty->master;
1045 ttyfds[1] = tty->slave;
2187efd3
CB
1046
1047 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1048 if (ret < 0)
1049 break;
1050
0fd73091 1051 TRACE("Sent ty \"%s\" with master fd %d and slave fd %d to "
2520facd 1052 "parent", tty->name, tty->master, tty->slave);
2187efd3
CB
1053 }
1054
1055 if (ret < 0)
6d1400b5 1056 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1057 else
885766f5 1058 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1059
1060 return ret;
1061}
1062
1063static int lxc_create_ttys(struct lxc_handler *handler)
1064{
1065 int ret = -1;
1066 struct lxc_conf *conf = handler->conf;
1067
663014ee 1068 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1069 if (ret < 0) {
1070 ERROR("Failed to allocate ttys");
1071 goto on_error;
1072 }
1073
1074 ret = lxc_send_ttys_to_parent(handler);
1075 if (ret < 0) {
1076 ERROR("Failed to send ttys to parent");
1077 goto on_error;
1078 }
1079
1080 if (!conf->is_execute) {
1081 ret = lxc_setup_ttys(conf);
1082 if (ret < 0) {
1083 ERROR("Failed to setup ttys");
1084 goto on_error;
1085 }
1086 }
1087
885766f5
CB
1088 if (conf->ttys.tty_names) {
1089 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1090 if (ret < 0)
885766f5 1091 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1092 }
1093
1094 ret = 0;
1095
1096on_error:
0e4be3cf 1097 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1098
1099 return ret;
1100}
1101
59bb8698 1102static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1103{
0fd73091
CB
1104 int ret;
1105 int newroot = -1, oldroot = -1;
bf601689 1106
2d489f9e
SH
1107 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1108 if (oldroot < 0) {
0fd73091 1109 SYSERROR("Failed to open old root directory");
9ba8130c
SH
1110 return -1;
1111 }
0fd73091 1112
2d489f9e
SH
1113 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1114 if (newroot < 0) {
0fd73091
CB
1115 SYSERROR("Failed to open new root directory");
1116 goto on_error;
c08556c6 1117 }
bf601689 1118
cc6f6dd7 1119 /* change into new root fs */
0fd73091
CB
1120 ret = fchdir(newroot);
1121 if (ret < 0) {
1122 SYSERROR("Failed to change to new rootfs \"%s\"", rootfs);
1123 goto on_error;
cc6f6dd7
DL
1124 }
1125
cc6f6dd7 1126 /* pivot_root into our new root fs */
0fd73091
CB
1127 ret = pivot_root(".", ".");
1128 if (ret < 0) {
1129 SYSERROR("Failed to pivot_root()");
1130 goto on_error;
bf601689 1131 }
cc6f6dd7 1132
e599717b 1133 /* At this point the old-root is mounted on top of our new-root. To
0fd73091
CB
1134 * unmounted it we must not be chdir'd into it, so escape back to
1135 * old-root.
2d489f9e 1136 */
0fd73091
CB
1137 ret = fchdir(oldroot);
1138 if (ret < 0) {
1139 SYSERROR("Failed to enter old root directory");
1140 goto on_error;
2d489f9e 1141 }
0fd73091 1142
e599717b
FW
1143 /* Make oldroot rslave to make sure our umounts don't propagate to the
1144 * host.
1145 */
1146 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
1147 if (ret < 0) {
1148 SYSERROR("Failed to make oldroot rslave");
1149 goto on_error;
1150 }
1151
0fd73091
CB
1152 ret = umount2(".", MNT_DETACH);
1153 if (ret < 0) {
1154 SYSERROR("Failed to detach old root directory");
1155 goto on_error;
cc6f6dd7
DL
1156 }
1157
0fd73091
CB
1158 ret = fchdir(newroot);
1159 if (ret < 0) {
1160 SYSERROR("Failed to re-enter new root directory");
1161 goto on_error;
2d489f9e 1162 }
cc6f6dd7 1163
2d489f9e
SH
1164 close(oldroot);
1165 close(newroot);
bf601689 1166
0fd73091 1167 DEBUG("pivot_root(\"%s\") successful", rootfs);
bf601689 1168
bf601689 1169 return 0;
2d489f9e 1170
0fd73091 1171on_error:
2d489f9e
SH
1172 if (oldroot != -1)
1173 close(oldroot);
1174 if (newroot != -1)
1175 close(newroot);
0fd73091 1176
2d489f9e 1177 return -1;
bf601689
MH
1178}
1179
7133b912
CB
1180/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1181 * error, log it but don't fail yet.
91c3830e 1182 */
7133b912
CB
1183static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1184 const char *lxcpath)
91c3830e
SH
1185{
1186 int ret;
87da4ec3
SH
1187 size_t clen;
1188 char *path;
91c3830e 1189
7133b912 1190 INFO("Preparing \"/dev\"");
bc6928ff 1191
14221cbb 1192 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1193 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1194 path = alloca(clen);
bc6928ff 1195
ec50007f 1196 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1197 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1198 return -1;
bc6928ff 1199
87da4ec3 1200 if (!dir_exists(path)) {
7133b912
CB
1201 WARN("\"/dev\" directory does not exist. Proceeding without "
1202 "autodev being set up");
87da4ec3 1203 return 0;
bc6928ff 1204 }
87da4ec3 1205
1ec0e8e3 1206 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1207 rootfs->path ? rootfs->mount : NULL);
1208 if (ret < 0) {
1209 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1ec0e8e3 1210 return -1;
91c3830e 1211 }
7133b912 1212 INFO("Mounted tmpfs on \"%s\"", path);
87da4ec3 1213
ec50007f 1214 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
7133b912 1215 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1216 return -1;
87da4ec3 1217
7133b912 1218 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1219 * If not, then create it and exit if that fails...
1220 */
87da4ec3 1221 if (!dir_exists(path)) {
bc6928ff 1222 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
7133b912
CB
1223 if (ret < 0) {
1224 SYSERROR("Failed to create directory \"%s\"", path);
bc6928ff
MW
1225 return -1;
1226 }
91c3830e
SH
1227 }
1228
7133b912 1229 INFO("Prepared \"/dev\"");
91c3830e
SH
1230 return 0;
1231}
1232
/* Description of one device node that is populated below /dev. */
struct lxc_device_node {
	const char *name;	/* file name below /dev */
	const mode_t mode;	/* file type and permission bits given to mknod() */
	const int maj;		/* device major number */
	const int min;		/* device minor number */
};

/* Device nodes created by lxc_fill_autodev(). */
static const struct lxc_device_node lxc_devices[] = {
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
};
1248
27245ff7 1249static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38 1250{
5e73416f 1251 int i, ret;
c6883f38 1252 char path[MAXPATHLEN];
3a32201c 1253 mode_t cmask;
3e04a608 1254 int can_mknod = 1;
c6883f38 1255
3999be0a
CB
1256 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1257 rootfs->path ? rootfs->mount : "");
1258 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1259 return -1;
91c3830e 1260
0bbf8572
CB
1261 /* ignore, just don't try to fill in */
1262 if (!dir_exists(path))
9cb4d183
SH
1263 return 0;
1264
3999be0a
CB
1265 INFO("Populating \"/dev\"");
1266
3a32201c 1267 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f
CB
1268 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
1269 char hostpath[MAXPATHLEN];
1270 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1271
3999be0a 1272 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
5e73416f 1273 rootfs->path ? rootfs->mount : "", device->name);
c6883f38
SH
1274 if (ret < 0 || ret >= MAXPATHLEN)
1275 return -1;
0bbf8572 1276
3e04a608
CB
1277 /* See
1278 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1279 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1280 */
1281 if (can_mknod == 2 || (can_mknod == 1 && !am_host_unpriv())) {
5e73416f
CB
1282 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1283 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1284 DEBUG("Created device node \"%s\"", path);
0bbf8572
CB
1285 continue;
1286 }
1287
5e73416f
CB
1288 if (errno != EPERM) {
1289 SYSERROR("Failed to create device node \"%s\"", path);
9cb4d183
SH
1290 return -1;
1291 }
3999be0a 1292
5e73416f
CB
1293 /* This can e.g. happen when the container is
1294 * unprivileged or CAP_MKNOD has been dropped.
1295 */
3e04a608
CB
1296 can_mknod = 2;
1297 } else {
1298 can_mknod = 0;
5e73416f
CB
1299 }
1300
1301 ret = mknod(path, S_IFREG, 0);
1302 if (ret < 0 && errno != EEXIST) {
1303 SYSERROR("Failed to create file \"%s\"", path);
1304 return -1;
1305 }
1306
1307 /* Fallback to bind-mounting the device from the host. */
1308 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", device->name);
1309 if (ret < 0 || ret >= MAXPATHLEN)
1310 return -1;
1311
1312 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1313 rootfs->path ? rootfs->mount : NULL);
1314 if (ret < 0) {
1315 SYSERROR("Failed to bind mount host device node \"%s\" "
1316 "onto \"%s\"", hostpath, path);
1317 return -1;
c6883f38 1318 }
5e73416f
CB
1319 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
1320 hostpath, path);
c6883f38 1321 }
5e73416f 1322 (void)umask(cmask);
c6883f38 1323
3999be0a 1324 INFO("Populated \"/dev\"");
c6883f38
SH
1325 return 0;
1326}
1327
9aa76a17 1328static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1329{
9aa76a17 1330 int ret;
10bc1861 1331 struct lxc_storage *bdev;
91c3e281 1332 const struct lxc_rootfs *rootfs;
cc28d0b0 1333
91c3e281 1334 rootfs = &conf->rootfs;
a0f379bf 1335 if (!rootfs->path) {
0fd73091
CB
1336 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1337 if (ret < 0) {
1338 SYSERROR("Failed to make / rslave");
a0f379bf
DW
1339 return -1;
1340 }
0fd73091 1341
c69bd12f 1342 return 0;
a0f379bf 1343 }
0ad19a3f 1344
0fd73091
CB
1345 ret = access(rootfs->mount, F_OK);
1346 if (ret != 0) {
1347 SYSERROR("Failed to access to \"%s\". Check it is present",
12297168 1348 rootfs->mount);
b1789442
DL
1349 return -1;
1350 }
1351
8a388ed4 1352 bdev = storage_init(conf);
9aa76a17 1353 if (!bdev) {
0fd73091 1354 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1355 rootfs->path, rootfs->mount,
1356 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1357 return -1;
9be53773 1358 }
9aa76a17
CB
1359
1360 ret = bdev->ops->mount(bdev);
10bc1861 1361 storage_put(bdev);
9aa76a17 1362 if (ret < 0) {
0fd73091 1363 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1364 rootfs->path, rootfs->mount,
1365 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1366 return -1;
1367 }
0ad19a3f 1368
0fd73091 1369 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1370 rootfs->path, rootfs->mount,
1371 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1372
ac778708
DL
1373 return 0;
1374}
1375
91e93c71
AV
1376int prepare_ramfs_root(char *root)
1377{
0fd73091
CB
1378 int i, ret;
1379 char *p, *p2;
1380 char buf[LXC_LINELEN], nroot[PATH_MAX];
91e93c71 1381 FILE *f;
91e93c71 1382
0fd73091
CB
1383 if (!realpath(root, nroot))
1384 return -1;
91e93c71 1385
0fd73091
CB
1386 ret = chdir("/");
1387 if (ret < 0)
1388 return -1;
91e93c71 1389
0fd73091
CB
1390 /* We could use here MS_MOVE, but in userns this mount is locked and
1391 * can't be moved.
91e93c71 1392 */
0fd73091
CB
1393 ret = mount(root, "/", NULL, MS_REC | MS_BIND, NULL);
1394 if (ret < 0) {
1395 SYSERROR("Failed to move \"%s\" into \"/\"", root);
1396 return -1;
91e93c71
AV
1397 }
1398
0fd73091
CB
1399 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
1400 if (ret < 0) {
1401 SYSERROR("Failed to make \"/\" rprivate");
1402 return -1;
91e93c71
AV
1403 }
1404
0fd73091
CB
1405 /* The following code cleans up inhereted mounts which are not required
1406 * for CT.
91e93c71
AV
1407 *
1408 * The mountinfo file shows not all mounts, if a few points have been
1409 * unmounted between read operations from the mountinfo. So we need to
1410 * read mountinfo a few times.
1411 *
1412 * This loop can be skipped if a container uses unserns, because all
1413 * inherited mounts are locked and we should live with all this trash.
1414 */
0fd73091 1415 for (;;) {
91e93c71
AV
1416 int progress = 0;
1417
1418 f = fopen("./proc/self/mountinfo", "r");
1419 if (!f) {
1420 SYSERROR("Unable to open /proc/self/mountinfo");
1421 return -1;
1422 }
0fd73091 1423
eab15c1e 1424 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1425 for (p = buf, i=0; p && i < 4; i++)
1426 p = strchr(p+1, ' ');
0fd73091 1427
91e93c71
AV
1428 if (!p)
1429 continue;
0fd73091 1430
91e93c71
AV
1431 p2 = strchr(p+1, ' ');
1432 if (!p2)
1433 continue;
1434
1435 *p2 = '\0';
1436 *p = '.';
1437
1438 if (strcmp(p + 1, "/") == 0)
1439 continue;
0fd73091 1440
91e93c71
AV
1441 if (strcmp(p + 1, "/proc") == 0)
1442 continue;
1443
0fd73091
CB
1444 ret = umount2(p, MNT_DETACH);
1445 if (ret == 0)
91e93c71
AV
1446 progress++;
1447 }
0fd73091 1448
91e93c71 1449 fclose(f);
0fd73091 1450
91e93c71
AV
1451 if (!progress)
1452 break;
1453 }
1454
0fd73091
CB
1455 /* This also can be skipped if a container uses unserns. */
1456 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1457
1458 /* It is weird, but chdir("..") moves us in a new root */
0fd73091
CB
1459 ret = chdir("..");
1460 if (ret < 0) {
91e93c71
AV
1461 SYSERROR("Unable to change working directory");
1462 return -1;
1463 }
1464
0fd73091
CB
1465 ret = chroot(".");
1466 if (ret < 0) {
91e93c71
AV
1467 SYSERROR("Unable to chroot");
1468 return -1;
1469 }
1470
1471 return 0;
1472}
1473
74a3920a 1474static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1475{
0fd73091
CB
1476 int ret;
1477
39c7b795 1478 if (!rootfs->path) {
0fd73091 1479 DEBUG("Container does not have a rootfs");
ac778708 1480 return 0;
39c7b795 1481 }
ac778708 1482
91e93c71 1483 if (detect_ramfs_rootfs()) {
0fd73091
CB
1484 DEBUG("Detected that container is on ramfs");
1485
1486 ret = prepare_ramfs_root(rootfs->mount);
1487 if (ret < 0) {
1488 ERROR("Failed to prepare minimal ramfs root");
91e93c71 1489 return -1;
39c7b795
CB
1490 }
1491
0fd73091 1492 DEBUG("Prepared ramfs root for container");
39c7b795
CB
1493 return 0;
1494 }
1495
0fd73091
CB
1496 ret = setup_rootfs_pivot_root(rootfs->mount);
1497 if (ret < 0) {
1498 ERROR("Failed to pivot_root()");
25368b52 1499 return -1;
c69bd12f
DL
1500 }
1501
0fd73091 1502 DEBUG("Finished pivot_root()");
25368b52 1503 return 0;
0ad19a3f 1504}
1505
5173b710 1506static const struct id_map *find_mapped_nsid_entry(struct lxc_conf *conf, unsigned id,
f4900711
CB
1507 enum idtype idtype)
1508{
1509 struct lxc_list *it;
1510 struct id_map *map;
1511 struct id_map *retmap = NULL;
1512
dcf0ffdf
CB
1513 /* Shortcut for container's root mappings. */
1514 if (id == 0) {
1515 if (idtype == ID_TYPE_UID)
1516 return conf->root_nsuid_map;
1517
1518 if (idtype == ID_TYPE_GID)
1519 return conf->root_nsgid_map;
1520 }
1521
f4900711
CB
1522 lxc_list_for_each(it, &conf->id_map) {
1523 map = it->elem;
1524 if (map->idtype != idtype)
1525 continue;
1526
1527 if (id >= map->nsid && id < map->nsid + map->range) {
1528 retmap = map;
1529 break;
1530 }
1531 }
1532
1533 return retmap;
1534}
1535
1536static int lxc_setup_devpts(struct lxc_conf *conf)
3c26f34e 1537{
70761e5e 1538 int ret;
11293068 1539 const char *default_devpts_mntopts = "gid=5,newinstance,ptmxmode=0666,mode=0620";
9d28c4f9 1540 char devpts_mntopts[256];
77890c6d 1541
e528c735 1542 if (conf->pty_max <= 0) {
0fd73091 1543 DEBUG("No new devpts instance will be mounted since no pts "
70761e5e 1544 "devices are requested");
d852c78c 1545 return 0;
3c26f34e 1546 }
1547
e528c735
CB
1548 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1549 default_devpts_mntopts, conf->pty_max);
9d28c4f9
CB
1550 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1551 return -1;
1552
77f94854
CB
1553 ret = umount2("/dev/pts", MNT_DETACH);
1554 if (ret < 0)
a24c5678 1555 SYSWARN("Failed to unmount old devpts instance");
77f94854 1556 else
0fd73091 1557 DEBUG("Unmounted old devpts instance");
7e40254a 1558
70761e5e
CB
1559 /* Create mountpoint for devpts instance. */
1560 ret = mkdir("/dev/pts", 0755);
1561 if (ret < 0 && errno != EEXIST) {
0fd73091 1562 SYSERROR("Failed to create \"/dev/pts\" directory");
3c26f34e 1563 return -1;
1564 }
1565
11293068 1566 /* mount new devpts instance */
f4900711 1567 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, devpts_mntopts);
70761e5e 1568 if (ret < 0) {
11293068
CB
1569 /* try mounting without gid=5 */
1570 ret = mount("devpts", "/dev/pts", "devpts",
1571 MS_NOSUID | MS_NOEXEC, devpts_mntopts + sizeof("gid=5"));
1572 if (ret < 0) {
1573 SYSERROR("Failed to mount new devpts instance");
1574 return -1;
1575 }
70761e5e 1576 }
0fd73091 1577 DEBUG("Mount new devpts instance with options \"%s\"", devpts_mntopts);
70761e5e 1578
d5cb35d6 1579 /* Remove any pre-existing /dev/ptmx file. */
b29e05d6
CB
1580 ret = remove("/dev/ptmx");
1581 if (ret < 0) {
1582 if (errno != ENOENT) {
0fd73091 1583 SYSERROR("Failed to remove existing \"/dev/ptmx\" file");
d5cb35d6 1584 return -1;
70761e5e 1585 }
b29e05d6 1586 } else {
0fd73091 1587 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1588 }
1589
d5cb35d6
CB
1590 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
1591 ret = open("/dev/ptmx", O_CREAT, 0666);
1592 if (ret < 0) {
0fd73091 1593 SYSERROR("Failed to create dummy \"/dev/ptmx\" file as bind mount target");
d5cb35d6
CB
1594 return -1;
1595 }
e87bd19c 1596 close(ret);
0fd73091 1597 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1598
d5cb35d6 1599 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1600 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
d5cb35d6 1601 if (!ret) {
0fd73091 1602 DEBUG("Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1603 return 0;
1604 } else {
1605 /* Fallthrough and try to create a symlink. */
0fd73091 1606 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1607 }
1608
1609 /* Remove the dummy /dev/ptmx file we created above. */
1610 ret = remove("/dev/ptmx");
70761e5e 1611 if (ret < 0) {
0fd73091 1612 SYSERROR("Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1613 return -1;
1614 }
1615
1616 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1617 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
1618 if (ret < 0) {
0fd73091 1619 SYSERROR("Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1620 return -1;
1621 }
0fd73091 1622 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1623
3c26f34e 1624 return 0;
1625}
1626
cccc74b5
DL
/* Apply the execution domain @persona through personality().
 *
 * A @persona of -1 means "leave unchanged". When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset) this is a no-op.
 *
 * Returns 0 on success and -1 when personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	int ret;

	if (persona == -1)
		return 0;

	ret = personality(persona);
	if (ret < 0) {
		SYSERROR("Failed to set personality to \"0x%x\"", persona);
		return -1;
	}

	INFO("Set personality to \"0x%x\"", persona);
#else
	/* Nothing to do; silence the unused-parameter warning. */
	(void)persona;
#endif

	return 0;
}
1646
3d7d929a 1647static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
dcad02f8 1648 const struct lxc_terminal *console)
6e590161 1649{
882671aa 1650 int ret;
63376d7d 1651 char path[MAXPATHLEN];
86530b0a 1652 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1653
8b1b1210
CB
1654 if (console->path && !strcmp(console->path, "none"))
1655 return 0;
1656
86530b0a 1657 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3d7d929a 1658 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1659 return -1;
52e35957 1660
8b1b1210
CB
1661 /* When we are asked to setup a console we remove any previous
1662 * /dev/console bind-mounts.
1663 */
a7ba3c7f
CB
1664 if (file_exists(path)) {
1665 ret = lxc_unstack_mountpoint(path, false);
1666 if (ret < 0) {
6d1400b5 1667 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1668 return -ret;
1669 } else {
86530b0a 1670 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1671 }
8b1b1210
CB
1672 }
1673
1674 /* For unprivileged containers autodev or automounts will already have
1675 * taken care of creating /dev/console.
1676 */
882671aa
CB
1677 ret = mknod(path, S_IFREG | 0000, 0);
1678 if (ret < 0) {
0728ebf4 1679 if (errno != EEXIST) {
86530b0a 1680 SYSERROR("Failed to create console");
3d7d929a 1681 return -errno;
0728ebf4 1682 }
52e35957
DL
1683 }
1684
882671aa 1685 ret = fchmod(console->slave, S_IXUSR | S_IXGRP | S_IXOTH);
86530b0a 1686 if (ret < 0) {
0fd73091
CB
1687 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1688 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
3d7d929a 1689 return -errno;
63376d7d 1690 }
13954cce 1691
86530b0a
L
1692 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
1693 if (ret < 0) {
0fd73091 1694 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, path);
6e590161 1695 return -1;
1696 }
1697
86530b0a 1698 DEBUG("Mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1699 return 0;
1700}
1701
3d7d929a 1702static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
dcad02f8 1703 const struct lxc_terminal *console,
3d7d929a 1704 char *ttydir)
7c6ef2a2 1705{
3dc035f1 1706 int ret, fd;
3d7d929a 1707 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
86530b0a 1708 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2 1709
3dc035f1
L
1710 if (console->path && !strcmp(console->path, "none"))
1711 return 0;
1712
7c6ef2a2 1713 /* create rootfs/dev/<ttydir> directory */
86530b0a 1714 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
3d7d929a 1715 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1716 return -1;
3d7d929a 1717
7c6ef2a2
SH
1718 ret = mkdir(path, 0755);
1719 if (ret && errno != EEXIST) {
0fd73091 1720 SYSERROR("Failed to create \"%s\"", path);
3d7d929a 1721 return -errno;
7c6ef2a2 1722 }
4742cd9a 1723 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1724
86530b0a 1725 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
3d7d929a
CB
1726 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1727 return -1;
1728
7c6ef2a2 1729 ret = creat(lxcpath, 0660);
3d7d929a 1730 if (ret == -1 && errno != EEXIST) {
0fd73091 1731 SYSERROR("Failed to create \"%s\"", lxcpath);
3d7d929a 1732 return -errno;
7c6ef2a2 1733 }
4d44e274
SH
1734 if (ret >= 0)
1735 close(ret);
7c6ef2a2 1736
86530b0a 1737 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3dc035f1 1738 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1739 return -1;
2a12fefd 1740
3dc035f1 1741 if (file_exists(path)) {
a7ba3c7f 1742 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1743 if (ret < 0) {
6d1400b5 1744 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1745 return -ret;
1746 } else {
86530b0a 1747 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1748 }
3dc035f1 1749 }
2a12fefd 1750
3dc035f1
L
1751 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1752 if (fd < 0) {
1753 if (errno != EEXIST) {
86530b0a 1754 SYSERROR("Failed to create console");
3dc035f1 1755 return -errno;
2a12fefd 1756 }
3dc035f1
L
1757 } else {
1758 close(fd);
7c6ef2a2
SH
1759 }
1760
86530b0a
L
1761 ret = chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH);
1762 if (ret < 0) {
0fd73091
CB
1763 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1764 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
2a12fefd
CB
1765 return -errno;
1766 }
1767
3dc035f1 1768 /* bind mount console->name to '/dev/<ttydir>/console' */
86530b0a
L
1769 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1770 if (ret < 0) {
0fd73091 1771 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1772 return -1;
1773 }
86530b0a 1774 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1
L
1775
1776 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
86530b0a
L
1777 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1778 if (ret < 0) {
0fd73091 1779 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
3dc035f1
L
1780 return -1;
1781 }
86530b0a 1782 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1783
86530b0a 1784 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1785 return 0;
1786}
1787
/* Set up the container console: below /dev/<ttydir> when @ttydir is given,
 * directly on /dev/console otherwise.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir)
		      : lxc_setup_dev_console(rootfs, console);
}
1797
efed99a4 1798static void parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676
RT
1799{
1800 struct mount_opt *mo;
1801
1802 /* If opt is found in mount_opt, set or clear flags.
1803 * Otherwise append it to data. */
1804
1805 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
0fd73091 1806 if (strncmp(opt, mo->name, strlen(mo->name)) == 0) {
998ac676
RT
1807 if (mo->clear)
1808 *flags &= ~mo->flag;
1809 else
1810 *flags |= mo->flag;
1811 return;
1812 }
1813 }
1814
f1e05b90
DJ
1815 if (strlen(*data))
1816 (void)strlcat(*data, ",", size);
efed99a4 1817
f1e05b90 1818 (void)strlcat(*data, opt, size);
998ac676
RT
1819}
1820
/* Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * @mntflags receives the flags of all recognised options. @mntdata receives a
 * newly allocated string with the remaining options, which the caller must
 * free, or NULL when there are none. Both are reset before parsing.
 *
 * Returns 0 on success (also for a NULL @mntopts) and -1 when out of memory.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
{
	char *data, *dup, *cursor, *tok;
	char *saveptr = NULL;
	size_t size;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	dup = strdup(mntopts);
	if (!dup)
		return -1;

	/* The data string can never be longer than the input. */
	size = strlen(dup) + 1;
	data = malloc(size);
	if (!data) {
		free(dup);
		return -1;
	}
	*data = 0;

	/* strtok_r() wants NULL after the first call, so iterate with a
	 * separate cursor and keep @dup intact for the free() below.
	 */
	for (cursor = dup; (tok = strtok_r(cursor, ",", &saveptr)); cursor = NULL)
		parse_mntopt(tok, mntflags, &data, size);

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(dup);

	return 0;
}
1856
d840039e
YT
1857static void parse_propagationopt(char *opt, unsigned long *flags)
1858{
1859 struct mount_opt *mo;
1860
1861 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1862 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1863 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1864 continue;
1865
1866 if (mo->clear)
1867 *flags &= ~mo->flag;
1868 else
1869 *flags |= mo->flag;
1870
1871 return;
d840039e
YT
1872 }
1873}
1874
/* Collect the mount propagation flags named in the comma-separated option
 * string @mntopts into @pflags.
 *
 * @pflags is left untouched when @mntopts is NULL or when memory allocation
 * fails; otherwise it is reset before parsing.
 *
 * Returns 0 on success and -ENOMEM when out of memory.
 */
static int parse_propagationopts(const char *mntopts, unsigned long *pflags)
{
	char *dup, *cursor, *tok;
	char *saveptr = NULL;

	if (!mntopts)
		return 0;

	dup = strdup(mntopts);
	if (!dup) {
		SYSERROR("Failed to allocate memory");
		return -ENOMEM;
	}

	*pflags = 0L;
	/* strtok_r() wants NULL after the first call, so iterate with a
	 * separate cursor and keep @dup intact for the free() below.
	 */
	for (cursor = dup; (tok = strtok_r(cursor, ",", &saveptr)); cursor = NULL)
		parse_propagationopt(tok, pflags);
	free(dup);

	return 0;
}
1896
6fd5e769
SH
/* Terminate @word in place at its first space or tab. */
static void null_endofword(char *word)
{
	/* strcspn() yields the index of the first delimiter, or of the
	 * existing terminator when there is none.
	 */
	word[strcspn(word, " \t")] = '\0';
}
1903
/* Skip @nfields whitespace-delimited fields in @src.
 *
 * Each space or tab counts as exactly one separator, so consecutive
 * separators delimit empty fields. Returns a pointer to the start of the
 * following field, or to the terminating NUL when @src ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Jump to the next separator or to the end of the string. */
		cur += strcspn(cur, " \t");

		if (*cur == '\0')
			break;

		cur++;
	}

	return cur;
}
1922
911324ef
DL
/* Perform one mount described by a mount entry.
 *
 * @fsname is the mount source; with @relative it is taken relative to
 * @rootfs. The filesystem is first mounted without MS_REMOUNT. For bind and
 * remount requests a second mount() with MS_REMOUNT applies the requested
 * flags, merged (when statvfs() is available) with the restrictive flags of
 * the source; MS_NODEV is only carried over when @dev is false. Finally the
 * mount propagation in @pflags is applied, if any.
 *
 * With @optional, failures are logged and reported as success.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       unsigned long pflags, const char *data, bool optional,
		       bool dev, bool relative, const char *rootfs)
{
	int ret;
	char srcbuf[MAXPATHLEN];
	const char *srcpath = fsname;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (relative) {
		ret = snprintf(srcbuf, MAXPATHLEN, "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
		if (ret < 0 || ret >= MAXPATHLEN) {
			ERROR("source path is too long");
			return -1;
		}
		srcpath = srcbuf;
	}

	ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
				srcpath ? srcpath : "(null)", target);
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"",
			 srcpath ? srcpath : "(null)", target);
		return -1;
	}

	if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
		unsigned long rqd_flags = 0;

		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options", srcpath ? srcpath : "(none)", target ? target : "(none)");

		if (mountflags & MS_RDONLY)
			rqd_flags |= MS_RDONLY;
#ifdef HAVE_STATVFS
		if (srcpath && statvfs(srcpath, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if (sb.f_flag & MS_NODEV && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", srcpath, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT)) {
				if (!(required_flags & ~mountflags) &&
				    rqd_flags == 0) {
					DEBUG("Mountflags already were %lu, "
					      "skipping remount", mountflags);
					goto skipremount;
				}
			}

			mountflags |= required_flags;
		}
#endif

		ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
					srcpath ? srcpath : "(null)", target);
				return 0;
			}

			SYSERROR("Failed to mount \"%s\" on \"%s\"",
				 srcpath ? srcpath : "(null)", target);
			return -1;
		}
	}

	/* Skipping the remount must not skip the propagation change, so the
	 * label sits in front of it.
	 */
#ifdef HAVE_STATVFS
skipremount:
#endif
	if (pflags) {
		ret = mount(NULL, target, NULL, pflags, NULL);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to change mount propagation "
					"for \"%s\" (optional)", target);
				return 0;
			} else {
				SYSERROR("Failed to change mount propagation "
					 "for \"%s\"", target);
				return -1;
			}
		}
		DEBUG("Changed mount propagation for \"%s\"", target);
	}

	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
	      srcpath ? srcpath : "(null)", target, fstype);

	return 0;
}
2041
/* Strip the lxc-only options "create=dir", "create=file", "optional" and
 * "relative" from mntent->mnt_opts. The string is edited in place.
 *
 * Only the first occurrence of each option is removed. When the option is the
 * last one in the list the string is simply cut at the start of the option, so
 * a preceding comma is left behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const lxc_only_opts[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(lxc_only_opts) / sizeof(lxc_only_opts[0]); idx++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, lxc_only_opts[idx]);
		if (!start)
			continue;

		comma = strchr(start, ',');
		if (comma) {
			/* Pull the remaining options (and the terminating
			 * NUL byte) forward over the culled one.
			 */
			char *rest = comma + 1;

			memmove(start, rest, strlen(rest) + 1);
		} else {
			/* No more options follow, so just chop it here. */
			*start = '\0';
		}
	}
}
2071
/* Create the mount target for @mntent before anything is mounted onto it.
 *
 * - For filesystem types starting with "overlay", ovl_mkdir() is called first.
 * - With the "create=dir" option, @path is created via mkdir_p().
 * - With the "create=file" option, and only if @path does not exist yet, the
 *   parent directory of @path is created and @path is created as a file.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int fd, ret;
	char *p1, *p2;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
		if (ret < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (!hasmntopt(mntent, "create=file"))
		return 0;

	ret = access(path, F_OK);
	if (ret == 0)
		return 0;

	/* dirname() may modify its argument, so work on a private copy. */
	p1 = strdup(path);
	if (!p1)
		return -1;

	p2 = dirname(p1);

	ret = mkdir_p(p2, 0755);
	if (ret < 0 && errno != EEXIST) {
		/* Inspect and log errno before free() gets a chance to
		 * overwrite it, and report the directory that actually failed.
		 */
		SYSERROR("Failed to create directory \"%s\"", p2);
		free(p1);
		return -1;
	}
	free(p1);

	fd = open(path, O_CREAT, 0644);
	if (fd < 0) {
		SYSERROR("Failed to create file \"%s\"", path);
		return -1;
	}
	close(fd);

	return 0;
}
2121
ec50007f
CB
2122/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2123 * without a rootfs. */
db4aba38 2124static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2125 const char *path,
2126 const struct lxc_rootfs *rootfs,
2127 const char *lxc_name,
2128 const char *lxc_path)
4d5b72a1 2129{
d8b712bc 2130 int ret;
949d0338 2131 unsigned long mntflags;
4d5b72a1 2132 char *mntdata;
181437fd 2133 bool dev, optional, relative;
949d0338 2134 unsigned long pflags = 0;
ec50007f 2135 char *rootfs_path = NULL;
d8b712bc
CB
2136
2137 optional = hasmntopt(mntent, "optional") != NULL;
2138 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2139 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2140
ec50007f
CB
2141 if (rootfs && rootfs->path)
2142 rootfs_path = rootfs->mount;
2143
d8b712bc
CB
2144 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2145 lxc_path);
2146 if (ret < 0) {
2147 if (optional)
2148 return 0;
608e3567 2149
d8b712bc
CB
2150 return -1;
2151 }
4e4ca161
SH
2152 cull_mntent_opt(mntent);
2153
d840039e
YT
2154 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2155 if (ret < 0)
2156 return -1;
2157
d8b712bc
CB
2158 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2159 if (ret < 0)
a17b1e65 2160 return -1;
a17b1e65 2161
6e46cc0d 2162 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2163 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2164
911324ef 2165 free(mntdata);
911324ef
DL
2166 return ret;
2167}
2168
db4aba38
NC
/* Mount @mntent for a container without a rootfs.
 *
 * For containers created without a rootfs all mounts are treated as absolute
 * paths starting at / on the host, so a leading '/' is added to the mount
 * directory when it is missing.
 *
 * Returns -1 if the resulting path does not fit into MAXPATHLEN bytes,
 * otherwise the result of mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *prefix = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(path, sizeof(path), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0)
		return -1;
	if ((size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2186
4e4ca161 2187static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2188 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2189 const char *lxc_name,
2190 const char *lxc_path)
911324ef 2191{
bdd2b34c 2192 int offset;
013bd428 2193 char *aux;
67e571de 2194 const char *lxcpath;
bdd2b34c
CB
2195 char path[MAXPATHLEN];
2196 int ret = 0;
0ad19a3f 2197
593e8478 2198 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2199 if (!lxcpath)
2a59a681 2200 return -1;
2a59a681 2201
bdd2b34c
CB
2202 /* If rootfs->path is a blockdev path, allow container fstab to use
2203 * <lxcpath>/<name>/rootfs" as the target prefix.
2204 */
2205 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2206 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
2207 goto skipvarlib;
2208
2209 aux = strstr(mntent->mnt_dir, path);
2210 if (aux) {
2211 offset = strlen(path);
2212 goto skipabs;
2213 }
2214
2215skipvarlib:
013bd428
DL
2216 aux = strstr(mntent->mnt_dir, rootfs->path);
2217 if (!aux) {
bdd2b34c 2218 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2219 return ret;
013bd428 2220 }
80a881b2
SH
2221 offset = strlen(rootfs->path);
2222
2223skipabs:
bdd2b34c
CB
2224 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2225 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 2226 return -1;
a17b1e65 2227
0a2dddd4 2228 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2229}
d330fe7b 2230
4e4ca161 2231static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2232 const struct lxc_rootfs *rootfs,
2233 const char *lxc_name,
2234 const char *lxc_path)
911324ef 2235{
911324ef 2236 int ret;
0fd73091 2237 char path[MAXPATHLEN];
d330fe7b 2238
34cfffb3 2239 /* relative to root mount point */
6e46cc0d 2240 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
0fd73091 2241 if (ret < 0 || (size_t)ret >= sizeof(path))
9ba8130c 2242 return -1;
911324ef 2243
0a2dddd4 2244 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2245}
2246
06749971
CB
2247static int mount_file_entries(const struct lxc_conf *conf,
2248 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2249 const char *lxc_name, const char *lxc_path)
911324ef 2250{
aaf901be 2251 char buf[4096];
0fd73091 2252 struct mntent mntent;
911324ef 2253 int ret = -1;
e76b8764 2254
aaf901be 2255 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
1ae3c19f
CB
2256 if (!rootfs->path)
2257 ret = mount_entry_on_systemfs(&mntent);
2258 else if (mntent.mnt_dir[0] != '/')
2259 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2260 lxc_name, lxc_path);
2261 else
2262 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2263 lxc_name, lxc_path);
2264 if (ret < 0)
2265 return -1;
0ad19a3f 2266 }
2267 ret = 0;
cd54d859 2268
0fd73091 2269 INFO("Finished setting up mounts");
e7938e9e
MN
2270 return ret;
2271}
2272
06749971
CB
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success, -1 if the
 * file cannot be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fstab_file;
	int err;

	if (fstab == NULL)
		return 0;

	fstab_file = setmntent(fstab, "r");
	if (fstab_file == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	err = mount_file_entries(conf, rootfs, fstab_file, lxc_name, lxc_path);
	if (err < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fstab_file);
	return err;
}
2296
5ef5c9a3 2297FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2298{
5ef5c9a3 2299 int ret;
e7938e9e 2300 char *mount_entry;
5ef5c9a3 2301 struct lxc_list *iterator;
5ef5c9a3
CB
2302 int fd = -1;
2303
0fd73091 2304 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2305 if (fd < 0) {
a324e7eb
CB
2306 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2307
5ef5c9a3
CB
2308 if (errno != ENOSYS)
2309 return NULL;
a324e7eb
CB
2310
2311 fd = lxc_make_tmpfile(template, true);
0fd73091
CB
2312 if (fd < 0) {
2313 SYSERROR("Could not create temporary mount file");
2314 return NULL;
2315 }
2316
6bd04140 2317 TRACE("Created temporary mount file");
5ef5c9a3 2318 }
0fd73091
CB
2319 if (fd < 0) {
2320 SYSERROR("Could not create temporary mount file");
9fc7f8c0 2321 return NULL;
e7938e9e
MN
2322 }
2323
0fd73091
CB
2324 lxc_list_for_each (iterator, mount) {
2325 size_t len;
2326
e7938e9e 2327 mount_entry = iterator->elem;
0fd73091 2328 len = strlen(mount_entry);
5ef5c9a3 2329
489f39be 2330 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091
CB
2331 if (ret != len)
2332 goto on_error;
2333
489f39be 2334 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091
CB
2335 if (ret != 1)
2336 goto on_error;
e7938e9e
MN
2337 }
2338
0fd73091
CB
2339 ret = lseek(fd, 0, SEEK_SET);
2340 if (ret < 0)
2341 goto on_error;
2342
2343 return fdopen(fd, "r+");
2344
2345on_error:
2346 SYSERROR("Failed to write mount entry to temporary mount file");
2347 close(fd);
2348 return NULL;
9fc7f8c0
TA
2349}
2350
06749971
CB
/* Mount all entries of the list @mount by first writing them to an anonymous
 * mount file and then processing that file like an fstab.
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int err;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	err = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return err;
}
2368
bab88e68
CS
2369static int parse_cap(const char *cap)
2370{
84760c11 2371 size_t i;
2372 int capid = -1;
0fd73091
CB
2373 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2374 char *ptr = NULL;
bab88e68 2375
0fd73091 2376 if (strcmp(cap, "none") == 0)
7035407c
DE
2377 return -2;
2378
8560cd36 2379 for (i = 0; i < end; i++) {
bab88e68
CS
2380 if (strcmp(cap, caps_opt[i].name))
2381 continue;
2382
2383 capid = caps_opt[i].value;
2384 break;
2385 }
2386
2387 if (capid < 0) {
0fd73091
CB
2388 /* Try to see if it's numeric, so the user may specify
2389 * capabilities that the running kernel knows about but we
2390 * don't
2391 */
bab88e68
CS
2392 errno = 0;
2393 capid = strtol(cap, &ptr, 10);
2394 if (!ptr || *ptr != '\0' || errno != 0)
2395 /* not a valid number */
2396 capid = -1;
2397 else if (capid > lxc_caps_last_cap())
2398 /* we have a number but it's not a valid
2399 * capability */
2400 capid = -1;
2401 }
2402
2403 return capid;
2404}
2405
0769b82a
CS
2406int in_caplist(int cap, struct lxc_list *caps)
2407{
0769b82a 2408 int capid;
0fd73091 2409 struct lxc_list *iterator;
0769b82a 2410
0fd73091 2411 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2412 capid = parse_cap(iterator->elem);
2413 if (capid == cap)
2414 return 1;
2415 }
2416
2417 return 0;
2418}
2419
81810dd1
DL
2420static int setup_caps(struct lxc_list *caps)
2421{
bab88e68 2422 int capid;
0fd73091
CB
2423 char *drop_entry;
2424 struct lxc_list *iterator;
81810dd1 2425
0fd73091
CB
2426 lxc_list_for_each (iterator, caps) {
2427 int ret;
81810dd1
DL
2428
2429 drop_entry = iterator->elem;
2430
bab88e68 2431 capid = parse_cap(drop_entry);
0fd73091 2432 if (capid < 0) {
1e11be34
DL
2433 ERROR("unknown capability %s", drop_entry);
2434 return -1;
81810dd1
DL
2435 }
2436
0fd73091
CB
2437 ret = prctl(PR_CAPBSET_DROP, capid, 0, 0, 0);
2438 if (ret < 0) {
2439 SYSERROR("Failed to remove %s capability", drop_entry);
3ec1648d
SH
2440 return -1;
2441 }
0fd73091 2442 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2443 }
2444
0fd73091 2445 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2446 return 0;
2447}
2448
2449static int dropcaps_except(struct lxc_list *caps)
2450{
0fd73091 2451 int i, capid, numcaps;
1fb86a7c 2452 char *keep_entry;
0fd73091 2453 struct lxc_list *iterator;
1fb86a7c 2454
0fd73091 2455 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2456 if (numcaps <= 0 || numcaps > 200)
2457 return -1;
0fd73091 2458 TRACE("Found %d capabilities", numcaps);
2caf9a97 2459
1a0e70ac 2460 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2461 int *caplist = alloca(numcaps * sizeof(int));
2462 memset(caplist, 0, numcaps * sizeof(int));
2463
0fd73091 2464 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2465 keep_entry = iterator->elem;
2466
bab88e68 2467 capid = parse_cap(keep_entry);
7035407c
DE
2468 if (capid == -2)
2469 continue;
2470
0fd73091
CB
2471 if (capid < 0) {
2472 ERROR("Unknown capability %s", keep_entry);
1fb86a7c
SH
2473 return -1;
2474 }
2475
0fd73091 2476 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2477 caplist[capid] = 1;
2478 }
0fd73091
CB
2479
2480 for (i = 0; i < numcaps; i++) {
2481 int ret;
2482
1fb86a7c
SH
2483 if (caplist[i])
2484 continue;
0fd73091
CB
2485
2486 ret = prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
2487 if (ret < 0) {
2488 SYSERROR("Failed to remove capability %d", i);
3ec1648d
SH
2489 return -1;
2490 }
1fb86a7c
SH
2491 }
2492
0fd73091 2493 DEBUG("Capabilities have been setup");
81810dd1
DL
2494 return 0;
2495}
2496
0fd73091
CB
2497static int parse_resource(const char *res)
2498{
2499 int ret;
c6d09e15
WB
2500 size_t i;
2501 int resid = -1;
2502
0fd73091 2503 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2504 if (strcmp(res, limit_opt[i].name) == 0)
2505 return limit_opt[i].value;
c6d09e15 2506
0fd73091 2507 /* Try to see if it's numeric, so the user may specify
c6d09e15 2508 * resources that the running kernel knows about but
0fd73091
CB
2509 * we don't.
2510 */
2511 ret = lxc_safe_int(res, &resid);
2512 if (ret < 0)
2513 return -1;
2514
2515 return resid;
c6d09e15
WB
2516}
2517
0fd73091
CB
2518int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2519{
2520 int resid;
c6d09e15
WB
2521 struct lxc_list *it;
2522 struct lxc_limit *lim;
c6d09e15 2523
0fd73091 2524 lxc_list_for_each (it, limits) {
c6d09e15
WB
2525 lim = it->elem;
2526
2527 resid = parse_resource(lim->resource);
2528 if (resid < 0) {
0fd73091 2529 ERROR("Unknown resource %s", lim->resource);
c6d09e15
WB
2530 return -1;
2531 }
2532
f48b5fd8 2533#if HAVE_PRLIMIT || HAVE_PRLIMIT64
c6d09e15 2534 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
6d1400b5 2535 SYSERROR("Failed to set limit %s", lim->resource);
c6d09e15
WB
2536 return -1;
2537 }
f48b5fd8
FF
2538#else
2539 ERROR("Cannot set limit %s as prlimit is missing", lim->resource);
2540 return -1;
2541#endif
c6d09e15 2542 }
0fd73091 2543
c6d09e15
WB
2544 return 0;
2545}
2546
7edd0540
L
2547int setup_sysctl_parameters(struct lxc_list *sysctls)
2548{
2549 struct lxc_list *it;
2550 struct lxc_sysctl *elem;
0fd73091 2551 int ret = 0;
7edd0540
L
2552 char *tmp = NULL;
2553 char filename[MAXPATHLEN] = {0};
7edd0540 2554
0fd73091 2555 lxc_list_for_each (it, sysctls) {
7edd0540
L
2556 elem = it->elem;
2557 tmp = lxc_string_replace(".", "/", elem->key);
2558 if (!tmp) {
2559 ERROR("Failed to replace key %s", elem->key);
2560 return -1;
2561 }
2562
2563 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2564 free(tmp);
2565 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2566 ERROR("Error setting up sysctl parameters path");
2567 return -1;
2568 }
2569
0fd73091 2570 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2571 strlen(elem->value), false, 0666);
7edd0540 2572 if (ret < 0) {
0fd73091
CB
2573 ERROR("Failed to setup sysctl parameters %s to %s",
2574 elem->key, elem->value);
7edd0540
L
2575 return -1;
2576 }
2577 }
0fd73091 2578
7edd0540
L
2579 return 0;
2580}
2581
61d7a733
YT
2582int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2583{
2584 struct lxc_list *it;
2585 struct lxc_proc *elem;
0fd73091 2586 int ret = 0;
61d7a733
YT
2587 char *tmp = NULL;
2588 char filename[MAXPATHLEN] = {0};
61d7a733 2589
0fd73091 2590 lxc_list_for_each (it, procs) {
61d7a733
YT
2591 elem = it->elem;
2592 tmp = lxc_string_replace(".", "/", elem->filename);
2593 if (!tmp) {
2594 ERROR("Failed to replace key %s", elem->filename);
2595 return -1;
2596 }
2597
2598 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2599 free(tmp);
2600 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2601 ERROR("Error setting up proc filesystem path");
2602 return -1;
2603 }
2604
0fd73091 2605 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2606 strlen(elem->value), false, 0666);
61d7a733 2607 if (ret < 0) {
0fd73091
CB
2608 ERROR("Failed to setup proc filesystem %s to %s",
2609 elem->filename, elem->value);
61d7a733
YT
2610 return -1;
2611 }
2612 }
0fd73091 2613
61d7a733
YT
2614 return 0;
2615}
2616
ae9242c8
SH
2617static char *default_rootfs_mount = LXCROOTFSMOUNT;
2618
7b379ab3 2619struct lxc_conf *lxc_conf_init(void)
089cd8b8 2620{
26ddeedd 2621 int i;
0fd73091 2622 struct lxc_conf *new;
7b379ab3 2623
13277ec4 2624 new = malloc(sizeof(*new));
0fd73091 2625 if (!new)
7b379ab3 2626 return NULL;
7b379ab3
MN
2627 memset(new, 0, sizeof(*new));
2628
4b73005c 2629 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2630 new->personality = -1;
124fa0a8 2631 new->autodev = 1;
3a784510 2632 new->console.buffer_size = 0;
596a818d
DE
2633 new->console.log_path = NULL;
2634 new->console.log_fd = -1;
861813e5 2635 new->console.log_size = 0;
28a4b0e5 2636 new->console.path = NULL;
63376d7d 2637 new->console.peer = -1;
fb87aa6a
CB
2638 new->console.proxy.busy = -1;
2639 new->console.proxy.master = -1;
2640 new->console.proxy.slave = -1;
63376d7d
DL
2641 new->console.master = -1;
2642 new->console.slave = -1;
2643 new->console.name[0] = '\0';
732375f5 2644 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2645 new->maincmd_fd = -1;
76a26f55 2646 new->nbd_idx = -1;
54c30e29 2647 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2648 if (!new->rootfs.mount) {
53f3f048
SH
2649 free(new);
2650 return NULL;
2651 }
858377e4 2652 new->logfd = -1;
7b379ab3 2653 lxc_list_init(&new->cgroup);
54860ed0 2654 lxc_list_init(&new->cgroup2);
7b379ab3
MN
2655 lxc_list_init(&new->network);
2656 lxc_list_init(&new->mount_list);
81810dd1 2657 lxc_list_init(&new->caps);
1fb86a7c 2658 lxc_list_init(&new->keepcaps);
f6d3e3e4 2659 lxc_list_init(&new->id_map);
46ad64ab
CB
2660 new->root_nsuid_map = NULL;
2661 new->root_nsgid_map = NULL;
f979ac15 2662 lxc_list_init(&new->includes);
4184c3e1 2663 lxc_list_init(&new->aliens);
7c661726 2664 lxc_list_init(&new->environment);
c6d09e15 2665 lxc_list_init(&new->limits);
7edd0540 2666 lxc_list_init(&new->sysctls);
61d7a733 2667 lxc_list_init(&new->procs);
44ae0fb6 2668 new->hooks_version = 0;
28d9e29e 2669 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2670 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2671 lxc_list_init(&new->groups);
d39b10eb 2672 lxc_list_init(&new->state_clients);
fe4de9a6
DE
2673 new->lsm_aa_profile = NULL;
2674 new->lsm_se_context = NULL;
7a0bcca3 2675 new->tmp_umount_proc = false;
7b379ab3 2676
72bb04e4
PT
2677 /* if running in a new user namespace, init and COMMAND
2678 * default to running as UID/GID 0 when using lxc-execute */
2679 new->init_uid = 0;
2680 new->init_gid = 0;
43654d34 2681 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2682 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
72bb04e4 2683
7b379ab3 2684 return new;
089cd8b8
DL
2685}
2686
344c9d81 2687int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2688 size_t buf_size)
f6d3e3e4 2689{
29053180 2690 int fd, ret;
0fd73091 2691 char path[MAXPATHLEN];
f6d3e3e4 2692
a19b974f
CB
2693 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
2694 size_t buflen;
2695
2696 ret = snprintf(path, MAXPATHLEN, "/proc/%d/setgroups", pid);
0fd73091 2697 if (ret < 0 || ret >= MAXPATHLEN)
a19b974f 2698 return -E2BIG;
a19b974f
CB
2699
2700 fd = open(path, O_WRONLY);
2701 if (fd < 0 && errno != ENOENT) {
2702 SYSERROR("Failed to open \"%s\"", path);
2703 return -1;
2704 }
2705
2388737b
CB
2706 if (fd >= 0) {
2707 buflen = sizeof("deny\n") - 1;
2708 errno = 0;
2709 ret = lxc_write_nointr(fd, "deny\n", buflen);
395b1a3e 2710 close(fd);
2388737b 2711 if (ret != buflen) {
0fd73091
CB
2712 SYSERROR("Failed to write \"deny\" to "
2713 "\"/proc/%d/setgroups\"", pid);
2388737b
CB
2714 return -1;
2715 }
395b1a3e 2716 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2717 }
a19b974f
CB
2718 }
2719
29053180
CB
2720 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2721 idtype == ID_TYPE_UID ? 'u' : 'g');
0fd73091 2722 if (ret < 0 || ret >= MAXPATHLEN)
f6d3e3e4 2723 return -E2BIG;
29053180
CB
2724
2725 fd = open(path, O_WRONLY);
2726 if (fd < 0) {
a19b974f 2727 SYSERROR("Failed to open \"%s\"", path);
29053180 2728 return -1;
f6d3e3e4 2729 }
29053180
CB
2730
2731 errno = 0;
2732 ret = lxc_write_nointr(fd, buf, buf_size);
395b1a3e 2733 close(fd);
29053180 2734 if (ret != buf_size) {
a19b974f 2735 SYSERROR("Failed to write %cid mapping to \"%s\"",
29053180 2736 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2737 return -1;
2738 }
29053180
CB
2739
2740 return 0;
f6d3e3e4
SH
2741}
2742
6e50e704
CB
2743/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2744 *
2745 * @return 1 if functional binary was found
2746 * @return 0 if binary exists but is lacking privilege
2747 * @return -ENOENT if binary does not exist
2748 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2749 */
df6a2945
CB
2750static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2751{
2752 char *path;
2753 int ret;
2754 struct stat st;
2755 int fret = 0;
2756
6e50e704
CB
2757 if (cap != CAP_SETUID && cap != CAP_SETGID)
2758 return -EINVAL;
2759
df6a2945
CB
2760 path = on_path(binary, NULL);
2761 if (!path)
2762 return -ENOENT;
2763
2764 ret = stat(path, &st);
2765 if (ret < 0) {
2766 fret = -errno;
2767 goto cleanup;
2768 }
2769
2770 /* Check if the binary is setuid. */
2771 if (st.st_mode & S_ISUID) {
0fd73091 2772 DEBUG("The binary \"%s\" does have the setuid bit set", path);
df6a2945
CB
2773 fret = 1;
2774 goto cleanup;
2775 }
2776
0fd73091 2777#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2778 /* Check if it has the CAP_SETUID capability. */
2779 if ((cap & CAP_SETUID) &&
2780 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2781 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2782 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
0fd73091 2783 "and CAP_PERMITTED sets", path);
df6a2945
CB
2784 fret = 1;
2785 goto cleanup;
2786 }
2787
2788 /* Check if it has the CAP_SETGID capability. */
2789 if ((cap & CAP_SETGID) &&
2790 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2791 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2792 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
0fd73091 2793 "and CAP_PERMITTED sets", path);
df6a2945
CB
2794 fret = 1;
2795 goto cleanup;
2796 }
0fd73091 2797#else
69924fff
CB
2798 /* If we cannot check for file capabilities we need to give the benefit
2799 * of the doubt. Otherwise we might fail even though all the necessary
2800 * file capabilities are set.
2801 */
d6018f88 2802 DEBUG("Cannot check for file capabilites as full capability support is "
0fd73091 2803 "missing. Manual intervention needed");
d6018f88 2804 fret = 1;
0fd73091 2805#endif
df6a2945
CB
2806
2807cleanup:
2808 free(path);
2809 return fret;
2810}
2811
986ef930
CB
/* Execute the command line passed in @args through "/bin/sh -c".
 *
 * Only returns, with -1, if execl() fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *shell_cmd = args;

	execl("/bin/sh", "sh", "-c", shell_cmd, (char *)NULL);
	return -1;
}
2817
f6d3e3e4
SH
2818int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2819{
0fd73091 2820 int fill, left;
986ef930 2821 char u_or_g;
4bc3b759 2822 char *pos;
986ef930 2823 char cmd_output[MAXPATHLEN];
0fd73091
CB
2824 struct id_map *map;
2825 struct lxc_list *iterator;
2826 enum idtype type;
986ef930
CB
2827 /* strlen("new@idmap") = 9
2828 * +
2829 * strlen(" ") = 1
2830 * +
2831 * LXC_NUMSTRLEN64
2832 * +
2833 * strlen(" ") = 1
2834 *
2835 * We add some additional space to make sure that we really have
2836 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2837 */
0fd73091 2838 int ret = 0, gidmap = 0, uidmap = 0;
986ef930 2839 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
0fd73091 2840 bool had_entry = false, use_shadow = false;
c724025c
JC
2841 int hostuid, hostgid;
2842
2843 hostuid = geteuid();
2844 hostgid = getegid();
df6a2945
CB
2845
2846 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2847 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2848 * will protected it by preventing another user from being handed the
2849 * range by shadow.
2850 */
df6a2945 2851 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2852 if (uidmap == -ENOENT)
2853 WARN("newuidmap binary is missing");
2854 else if (!uidmap)
2855 WARN("newuidmap is lacking necessary privileges");
2856
df6a2945 2857 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2858 if (gidmap == -ENOENT)
2859 WARN("newgidmap binary is missing");
2860 else if (!gidmap)
2861 WARN("newgidmap is lacking necessary privileges");
2862
df6a2945 2863 if (uidmap > 0 && gidmap > 0) {
0fd73091 2864 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2865 use_shadow = true;
df6a2945 2866 } else {
99d43365
CB
2867 /* In case unprivileged users run application containers via
2868 * execute() or a start*() there are valid cases where they may
2869 * only want to map their own {g,u}id. Let's not block them from
2870 * doing so by requiring geteuid() == 0.
2871 */
2872 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2873 "write directly with euid %d", hostuid);
2874 }
2875
2876 /* Check if we really need to use newuidmap and newgidmap.
2877 * If the user is only remapping his own {g,u}id, we don't need it.
2878 */
2879 if (use_shadow && lxc_list_len(idmap) == 2) {
2880 use_shadow = false;
2881 lxc_list_for_each(iterator, idmap) {
2882 map = iterator->elem;
2883 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2884 map->nsid == hostuid && map->hostid == hostuid)
2885 continue;
2886 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2887 map->nsid == hostgid && map->hostid == hostgid)
2888 continue;
2889 use_shadow = true;
2890 break;
2891 }
0e6e3a41 2892 }
251d0d2a 2893
986ef930
CB
2894 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2895 type++, u_or_g = 'g') {
2896 pos = mapbuf;
2897
0e6e3a41 2898 if (use_shadow)
986ef930 2899 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2900
cf3ef16d 2901 lxc_list_for_each(iterator, idmap) {
251d0d2a 2902 map = iterator->elem;
cf3ef16d
SH
2903 if (map->idtype != type)
2904 continue;
2905
4bc3b759
CB
2906 had_entry = true;
2907
986ef930 2908 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2909 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2910 use_shadow ? " " : "", map->nsid,
2911 map->hostid, map->range,
0e6e3a41 2912 use_shadow ? "" : "\n");
a427e268
CB
2913 if (fill <= 0 || fill >= left) {
2914 /* The kernel only takes <= 4k for writes to
2915 * /proc/<pid>/{g,u}id_map
2916 */
2917 SYSERROR("Too many %cid mappings defined", u_or_g);
2918 return -1;
2919 }
4bc3b759 2920
cf3ef16d 2921 pos += fill;
251d0d2a 2922 }
cf3ef16d 2923 if (!had_entry)
4f7521b4 2924 continue;
cf3ef16d 2925
986ef930
CB
2926 /* Try to catch the ouput of new{g,u}idmap to make debugging
2927 * easier.
2928 */
2929 if (use_shadow) {
2930 ret = run_command(cmd_output, sizeof(cmd_output),
2931 lxc_map_ids_exec_wrapper,
2932 (void *)mapbuf);
2933 if (ret < 0) {
54fbbeb5
CB
2934 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2935 u_or_g, cmd_output, mapbuf);
986ef930
CB
2936 return -1;
2937 }
54fbbeb5 2938 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2939 } else {
986ef930 2940 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5 2941 if (ret < 0) {
da0f9977 2942 ERROR("Failed to write mapping: %s", mapbuf);
986ef930 2943 return -1;
54fbbeb5
CB
2944 }
2945 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2946 }
986ef930
CB
2947
2948 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2949 }
251d0d2a 2950
986ef930 2951 return 0;
f6d3e3e4
SH
2952}
2953
0fd73091 2954/* Return the host uid/gid to which the container root is mapped in val.
0b3a6504 2955 * Return true if id was found, false otherwise.
cf3ef16d 2956 */
2a9a80cb 2957bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
4160c3a0 2958 unsigned long *val)
cf3ef16d 2959{
4160c3a0 2960 unsigned nsid;
0fd73091
CB
2961 struct id_map *map;
2962 struct lxc_list *it;
4160c3a0
CB
2963
2964 if (idtype == ID_TYPE_UID)
2965 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2966 else
2967 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 2968
0fd73091 2969 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2970 map = it->elem;
7b50c609 2971 if (map->idtype != idtype)
cf3ef16d 2972 continue;
4160c3a0 2973 if (map->nsid != nsid)
cf3ef16d 2974 continue;
2a9a80cb
SH
2975 *val = map->hostid;
2976 return true;
cf3ef16d 2977 }
4160c3a0 2978
2a9a80cb 2979 return false;
cf3ef16d
SH
2980}
2981
2133f58c 2982int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 2983{
cf3ef16d 2984 struct id_map *map;
0fd73091
CB
2985 struct lxc_list *it;
2986
2987 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 2988 map = it->elem;
2133f58c 2989 if (map->idtype != idtype)
cf3ef16d 2990 continue;
0fd73091 2991
cf3ef16d 2992 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2993 return (id - map->hostid) + map->nsid;
cf3ef16d 2994 }
0fd73091 2995
57d116ab 2996 return -1;
cf3ef16d
SH
2997}
2998
339efad9 2999int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3000{
cf3ef16d 3001 struct id_map *map;
0fd73091 3002 struct lxc_list *it;
2133f58c 3003 unsigned int freeid = 0;
0fd73091 3004
cf3ef16d 3005again:
0fd73091 3006 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3007 map = it->elem;
2133f58c 3008 if (map->idtype != idtype)
cf3ef16d 3009 continue;
0fd73091 3010
cf3ef16d
SH
3011 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3012 freeid = map->nsid + map->range;
3013 goto again;
3014 }
3015 }
0fd73091 3016
cf3ef16d
SH
3017 return freeid;
3018}
3019
/* Replace the current process image with "lxc-usernsexec", using @args as the
 * NULL-terminated argument vector. Only returns (with -1) when the exec fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
3025
0fd73091 3026/* chown_mapped_root: for an unprivileged user with uid/gid X to
7b50c609
TS
3027 * chown a dir to subuid/subgid Y, he needs to run chown as root
3028 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3029 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3030 * root is privileged with respect to hostuid/hostgid X, allowing
3031 * him to do the chown.
f6d3e3e4 3032 */
41dc7155 3033int chown_mapped_root(const char *path, struct lxc_conf *conf)
f6d3e3e4 3034{
f4f52cb5 3035 uid_t rootuid, rootgid;
2a9a80cb 3036 unsigned long val;
f4f52cb5
CB
3037 int hostuid, hostgid, ret;
3038 struct stat sb;
3039 char map1[100], map2[100], map3[100], map4[100], map5[100];
3040 char ugid[100];
41dc7155 3041 const char *args1[] = {"lxc-usernsexec",
f4f52cb5
CB
3042 "-m", map1,
3043 "-m", map2,
3044 "-m", map3,
3045 "-m", map5,
3046 "--", "chown", ugid, path,
3047 NULL};
41dc7155 3048 const char *args2[] = {"lxc-usernsexec",
f4f52cb5
CB
3049 "-m", map1,
3050 "-m", map2,
3051 "-m", map3,
3052 "-m", map4,
3053 "-m", map5,
3054 "--", "chown", ugid, path,
3055 NULL};
3056 char cmd_output[MAXPATHLEN];
3057
3058 hostuid = geteuid();
3059 hostgid = getegid();
f6d3e3e4 3060
2a9a80cb 3061 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3062 ERROR("No uid mapping for container root");
c4d10a05 3063 return -1;
f6d3e3e4 3064 }
f4f52cb5 3065 rootuid = (uid_t)val;
0fd73091 3066
7b50c609 3067 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3068 ERROR("No gid mapping for container root");
7b50c609
TS
3069 return -1;
3070 }
f4f52cb5 3071 rootgid = (gid_t)val;
2a9a80cb 3072
f4f52cb5 3073 if (hostuid == 0) {
7b50c609 3074 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3075 ERROR("Error chowning %s", path);
3076 return -1;
3077 }
0fd73091 3078
c4d10a05
SH
3079 return 0;
3080 }
f3d7e4ca 3081
f4f52cb5 3082 if (rootuid == hostuid) {
1a0e70ac 3083 /* nothing to do */
b103ceac 3084 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3085 return 0;
3086 }
3087
bbdbf8f0 3088 /* save the current gid of "path" */
f4f52cb5
CB
3089 if (stat(path, &sb) < 0) {
3090 ERROR("Error stat %s", path);
f6d3e3e4
SH
3091 return -1;
3092 }
7b50c609 3093
bbdbf8f0
CB
3094 /* Update the path argument in case this was overlayfs. */
3095 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3096 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3097
f4f52cb5
CB
3098 /*
3099 * A file has to be group-owned by a gid mapped into the
3100 * container, or the container won't be privileged over it.
3101 */
3102 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3103 if (sb.st_uid == hostuid &&
3104 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3105 chown(path, -1, hostgid) < 0) {
3106 ERROR("Failed chgrping %s", path);
3107 return -1;
3108 }
f6d3e3e4 3109
1a0e70ac 3110 /* "u:0:rootuid:1" */
f4f52cb5
CB
3111 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3112 if (ret < 0 || ret >= 100) {
3113 ERROR("Error uid printing map string");
3114 return -1;
3115 }
7b50c609 3116
1a0e70ac 3117 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
3118 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3119 if (ret < 0 || ret >= 100) {
3120 ERROR("Error uid printing map string");
3121 return -1;
3122 }
c4d10a05 3123
1a0e70ac 3124 /* "g:0:rootgid:1" */
f4f52cb5
CB
3125 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3126 if (ret < 0 || ret >= 100) {
3127 ERROR("Error gid printing map string");
3128 return -1;
3129 }
98e5ba51 3130
1a0e70ac 3131 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
3132 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3133 rootgid + (gid_t)sb.st_gid);
3134 if (ret < 0 || ret >= 100) {
3135 ERROR("Error gid printing map string");
3136 return -1;
3137 }
c4d10a05 3138
1a0e70ac 3139 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
3140 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3141 if (ret < 0 || ret >= 100) {
3142 ERROR("Error gid printing map string");
3143 return -1;
3144 }
7b50c609 3145
1a0e70ac 3146 /* "0:pathgid" (chown) */
f4f52cb5
CB
3147 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3148 if (ret < 0 || ret >= 100) {
3149 ERROR("Error owner printing format string for chown");
3150 return -1;
3151 }
7b50c609 3152
f4f52cb5
CB
3153 if (hostgid == sb.st_gid)
3154 ret = run_command(cmd_output, sizeof(cmd_output),
3155 chown_mapped_root_exec_wrapper,
3156 (void *)args1);
3157 else
3158 ret = run_command(cmd_output, sizeof(cmd_output),
3159 chown_mapped_root_exec_wrapper,
3160 (void *)args2);
3161 if (ret < 0)
3162 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3163
f4f52cb5 3164 return ret;
f6d3e3e4
SH
3165}
3166
943144d9
CB
3167/* NOTE: Must not be called from inside the container namespace! */
3168int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3169{
3170 int mounted;
3171
943144d9 3172 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3173 if (mounted == -1) {
0fd73091 3174 SYSERROR("Failed to mount proc in the container");
01958b1f 3175 /* continue only if there is no rootfs */
943144d9 3176 if (conf->rootfs.path)
01958b1f 3177 return -1;
5112cd70 3178 } else if (mounted == 1) {
7a0bcca3 3179 conf->tmp_umount_proc = true;
5112cd70 3180 }
943144d9 3181
5112cd70
SH
3182 return 0;
3183}
3184
3185void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3186{
7a0bcca3 3187 if (!lxc_conf->tmp_umount_proc)
0fd73091
CB
3188 return;
3189
7a0bcca3
CB
3190 (void)umount2("/proc", MNT_DETACH);
3191 lxc_conf->tmp_umount_proc = false;
5112cd70
SH
3192}
3193
/* Walk /proc/self/mountinfo and change any shared entries to slave.
 *
 * The mount table is first copied into a memfd (or, when memfd_create() is
 * not implemented, into a temporary file) and that snapshot is then parsed
 * line by line. Failures are logged and otherwise ignored: the function is
 * best-effort and has no return value.
 */
void remount_all_slave(void)
{
	int memfd, mntinfo_fd, ret;
	ssize_t copied;
	FILE *f;
	size_t len = 0;
	char *line = NULL;

	mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
	if (mntinfo_fd < 0) {
		SYSERROR("Failed to open \"/proc/self/mountinfo\"");
		return;
	}

	memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
	if (memfd < 0) {
		char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";

		if (errno != ENOSYS) {
			SYSERROR("Failed to create temporary in-memory file");
			close(mntinfo_fd);
			return;
		}

		memfd = lxc_make_tmpfile(template, true);
		if (memfd < 0) {
			close(mntinfo_fd);
			WARN("Failed to create temporary file");
			return;
		}
	}

#define __LXC_SENDFILE_MAX 0x7ffff000 /* maximum number of bytes sendfile can handle */
	/* A successful sendfile() may transfer fewer bytes than requested, so
	 * keep copying until it reports end-of-file (0). Both file offsets
	 * advance automatically because no explicit offset is passed.
	 */
	for (;;) {
		copied = sendfile(memfd, mntinfo_fd, NULL, __LXC_SENDFILE_MAX);
		if (copied > 0)
			continue;

		if (copied == 0)
			break;

		if (errno == EINTR)
			continue;

		SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
		close(mntinfo_fd);
		close(memfd);
		return;
	}
	close(mntinfo_fd);

	/* After a successful fdopen() memfd will be closed when calling
	 * fclose(f). Calling close(memfd) afterwards is undefined.
	 */
	if (lseek(memfd, 0, SEEK_SET) < 0) {
		SYSERROR("Failed to reset file descriptor offset");
		close(memfd);
		return;
	}

	f = fdopen(memfd, "r");
	if (!f) {
		SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark "
			 "all shared. Continuing");
		close(memfd);
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *opts, *target;

		/* Field 4 of the line is used as the mount target. */
		target = get_field(line, 4);
		if (!target)
			continue;

		/* Two fields after the target is where "shared" is looked for. */
		opts = get_field(target, 2);
		if (!opts)
			continue;

		null_endofword(opts);
		if (!strstr(opts, "shared"))
			continue;

		null_endofword(target);
		ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
		if (ret < 0) {
			SYSERROR("Failed to make \"%s\" MS_SLAVE", target);
			ERROR("Continuing...");
			continue;
		}
		TRACE("Remounted \"%s\" as MS_SLAVE", target);
	}
	fclose(f);
	free(line);
	TRACE("Remounted all mount table entries as MS_SLAVE");
}
3288
794248d0 3289static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3290{
3291 int ret;
794248d0
CB
3292 char *p;
3293 char path[PATH_MAX], destpath[PATH_MAX];
3294 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3295
3296 /* If init exists in the container, don't bind mount a static one */
3297 p = choose_init(conf->rootfs.mount);
3298 if (p) {
41089848
TA
3299 char *old = p;
3300
3301 p = strdup(old + strlen(conf->rootfs.mount));
3302 free(old);
3303 if (!p)
3304 return -ENOMEM;
3305
3306 INFO("Found existing init at \"%s\"", p);
3307 goto out;
9d9c111c 3308 }
2322903b
SH
3309
3310 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
0fd73091 3311 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3312 return -1;
2322903b
SH
3313
3314 if (!file_exists(path)) {
0fd73091 3315 ERROR("The file \"%s\" does not exist on host", path);
8353b4c9 3316 return -1;
2322903b
SH
3317 }
3318
794248d0 3319 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
0fd73091 3320 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3321 return -1;
2322903b
SH
3322
3323 if (!file_exists(destpath)) {
794248d0
CB
3324 ret = mknod(destpath, S_IFREG | 0000, 0);
3325 if (ret < 0 && errno != EEXIST) {
3326 SYSERROR("Failed to create dummy \"%s\" file as bind mount target", destpath);
8353b4c9 3327 return -1;
2322903b 3328 }
2322903b
SH
3329 }
3330
592fd47a 3331 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
8353b4c9 3332 if (ret < 0) {
0fd73091 3333 SYSERROR("Failed to bind mount lxc.init.static into container");
8353b4c9
CB
3334 return -1;
3335 }
3336
794248d0
CB
3337 p = strdup(destpath + strlen(conf->rootfs.mount));
3338 if (!p)
3339 return -ENOMEM;
794248d0 3340
8353b4c9 3341 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3342out:
4b5b3a2a 3343 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3344 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3345 return 0;
2322903b
SH
3346}
3347
0fd73091
CB
3348/* This does the work of remounting / if it is shared, calling the container
3349 * pre-mount hooks, and mounting the rootfs.
35120d9c
SH
3350 */
3351int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3352{
0fd73091
CB
3353 int ret;
3354
35120d9c 3355 if (conf->rootfs_setup) {
35120d9c 3356 const char *path = conf->rootfs.mount;
0fd73091
CB
3357
3358 /* The rootfs was set up in another namespace. bind-mount it to
3359 * give us a mount in our own ns so we can pivot_root to it
3360 */
3361 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3362 if (ret < 0) {
3363 ERROR("Failed to bind mount container / onto itself");
145832ba 3364 return -1;
35120d9c 3365 }
0fd73091
CB
3366
3367 TRACE("Bind mounted container / onto itself");
145832ba 3368 return 0;
35120d9c 3369 }
d4ef7c50 3370
e995d7a2
SH
3371 remount_all_slave();
3372
0fd73091
CB
3373 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3374 if (ret < 0) {
3375 ERROR("Failed to run pre-mount hooks");
35120d9c
SH
3376 return -1;
3377 }
3378
0fd73091
CB
3379 ret = lxc_setup_rootfs(conf);
3380 if (ret < 0) {
3381 ERROR("Failed to setup rootfs for");
35120d9c
SH
3382 return -1;
3383 }
3384
3385 conf->rootfs_setup = true;
3386 return 0;
3387}
3388
1c1c7051
SH
3389static bool verify_start_hooks(struct lxc_conf *conf)
3390{
1c1c7051 3391 char path[MAXPATHLEN];
0fd73091
CB
3392 struct lxc_list *it;
3393
3394 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3395 int ret;
0fd73091
CB
3396 struct stat st;
3397 char *hookname = it->elem;
1c1c7051
SH
3398
3399 ret = snprintf(path, MAXPATHLEN, "%s%s",
0fd73091
CB
3400 conf->rootfs.path ? conf->rootfs.mount : "",
3401 hookname);
1c1c7051
SH
3402 if (ret < 0 || ret >= MAXPATHLEN)
3403 return false;
0fd73091 3404
1c1c7051 3405 ret = stat(path, &st);
0fd73091 3406 if (ret < 0) {
7b6753e7 3407 SYSERROR("Start hook %s not found in container",
0fd73091 3408 hookname);
1c1c7051
SH
3409 return false;
3410 }
0fd73091 3411
6a0c909a 3412 return true;
1c1c7051
SH
3413 }
3414
3415 return true;
3416}
3417
/* Report whether the running kernel implements the execveat() system call.
 *
 * Returns false when the syscall number is unknown at build time or when the
 * probe fails with ENOSYS, true otherwise.
 */
static bool execveat_supported(void)
{
#ifdef __NR_execveat
	long ret;

	/*
	 * We use the syscall here, because it was introduced in kernel 3.19,
	 * while glibc got support for using the syscall much later, in 2.27.
	 * We don't want to use glibc because it falls back to /proc, and the
	 * container may not have /proc mounted depending on its configuration.
	 *
	 * The probe uses an invalid fd, so it is expected to fail; only the
	 * reason for the failure is of interest. errno is meaningful solely
	 * when the call actually failed.
	 */
	ret = syscall(__NR_execveat, -1, "", NULL, NULL, AT_EMPTY_PATH);
	if (ret < 0 && errno == ENOSYS)
		return false;

	return true;
#else
	return false;
#endif
}
3436
3b988b33 3437int lxc_setup(struct lxc_handler *handler)
35120d9c 3438{
2187efd3 3439 int ret;
0fd73091 3440 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3441 struct lxc_conf *lxc_conf = handler->conf;
35120d9c 3442
8353b4c9
CB
3443 ret = do_rootfs_setup(lxc_conf, name, lxcpath);
3444 if (ret < 0) {
3445 ERROR("Failed to setup rootfs");
35120d9c
SH
3446 return -1;
3447 }
3448
28d9e29e 3449 if (handler->nsfd[LXC_NS_UTS] == -1) {
8353b4c9
CB
3450 ret = setup_utsname(lxc_conf->utsname);
3451 if (ret < 0) {
0fd73091 3452 ERROR("Failed to setup the utsname %s", name);
6c544cb3
MM
3453 return -1;
3454 }
0ad19a3f 3455 }
3456
8353b4c9
CB
3457 ret = lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network);
3458 if (ret < 0) {
3459 ERROR("Failed to setup network");
95b5ffaf 3460 return -1;
0ad19a3f 3461 }
3462
8353b4c9
CB
3463 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3464 if (ret < 0) {
3465 ERROR("Failed to send network device names and ifindices to parent");
790255cf
CB
3466 return -1;
3467 }
3468
bc6928ff 3469 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3470 ret = mount_autodev(name, &lxc_conf->rootfs, lxcpath);
3471 if (ret < 0) {
3472 ERROR("Failed to mount \"/dev\"");
c6883f38
SH
3473 return -1;
3474 }
3475 }
3476
8353b4c9
CB
3477 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3478 * need to wait until other stuff has finished.
368bbc02 3479 */
8353b4c9
CB
3480 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3481 if (ret < 0) {
3482 ERROR("Failed to setup first automatic mounts");
368bbc02
CS
3483 return -1;
3484 }
3485
8353b4c9
CB
3486 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3487 if (ret < 0) {
3488 ERROR("Failed to setup mounts");
95b5ffaf 3489 return -1;
576f946d 3490 }
3491
7b6753e7 3492 /* Make sure any start hooks are in the container */
1c1c7051
SH
3493 if (!verify_start_hooks(lxc_conf))
3494 return -1;
3495
8353b4c9 3496 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3497 if (execveat_supported()) {
3498 int fd;
3499 char path[PATH_MAX];
3500
3501 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3502 if (ret < 0 || ret >= PATH_MAX) {
3503 ERROR("Path to init.lxc.static too long");
3504 return -1;
3505 }
3506
3507 fd = open(path, O_PATH | O_CLOEXEC);
3508 if (fd < 0) {
3509 SYSERROR("Unable to open lxc.init.static");
3510 return -1;
3511 }
3512
3513 ((struct execute_args *)handler->data)->init_fd = fd;
3514 ((struct execute_args *)handler->data)->init_path = NULL;
3515 } else {
3516 ret = lxc_execute_bind_init(handler);
3517 if (ret < 0) {
3518 ERROR("Failed to bind-mount the lxc init system");
3519 return -1;
3520 }
8353b4c9
CB
3521 }
3522 }
2322903b 3523
8353b4c9
CB
3524 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3525 * mounted. It is guaranteed to be mounted now either through
3526 * automatically or via fstab entries.
368bbc02 3527 */
8353b4c9
CB
3528 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3529 if (ret < 0) {
3530 ERROR("Failed to setup remaining automatic mounts");
368bbc02
CS
3531 return -1;
3532 }
3533
8353b4c9 3534 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
1a2cf89d 3535 if (ret < 0) {
8353b4c9 3536 ERROR("Failed to run mount hooks");
773fb9ca
SH
3537 return -1;
3538 }
3539
bc6928ff 3540 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3541 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3542 if (ret < 0) {
3543 ERROR("Failed to run autodev hooks");
f7bee6c6
MW
3544 return -1;
3545 }
06749971 3546
8353b4c9
CB
3547 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3548 if (ret < 0) {
3549 ERROR("Failed to populate \"/dev\"");
91c3830e
SH
3550 return -1;
3551 }
3552 }
368bbc02 3553
8353b4c9
CB
3554 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3555 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3556 &lxc_conf->mount_list, name, lxcpath);
3557 if (ret < 0) {
3558 ERROR("Failed to setup mount entries");
3559 return -1;
3560 }
181437fd
YT
3561 }
3562
ed8704d0 3563 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
885766f5 3564 lxc_conf->ttys.dir);
ed8704d0
CB
3565 if (ret < 0) {
3566 ERROR("Failed to setup console");
95b5ffaf 3567 return -1;
6e590161 3568 }
3569
ed8704d0
CB
3570 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3571 if (ret < 0) {
8353b4c9 3572 ERROR("Failed to setup \"/dev\" symlinks");
69aa6655
DE
3573 return -1;
3574 }
3575
8353b4c9
CB
3576 ret = lxc_create_tmp_proc_mount(lxc_conf);
3577 if (ret < 0) {
3578 ERROR("Failed to \"/proc\" LSMs");
e075f5d9 3579 return -1;
e075f5d9 3580 }
e075f5d9 3581
8353b4c9
CB
3582 ret = setup_pivot_root(&lxc_conf->rootfs);
3583 if (ret < 0) {
3584 ERROR("Failed to pivot root into rootfs");
95b5ffaf 3585 return -1;
ed502555 3586 }
3587
8353b4c9
CB
3588 ret = lxc_setup_devpts(lxc_conf);
3589 if (ret < 0) {
3590 ERROR("Failed to setup new devpts instance");
95b5ffaf 3591 return -1;
3c26f34e 3592 }
3593
2187efd3
CB
3594 ret = lxc_create_ttys(handler);
3595 if (ret < 0)
e8bd4e43 3596 return -1;
e8bd4e43 3597
8353b4c9
CB
3598 ret = setup_personality(lxc_conf->personality);
3599 if (ret < 0) {
3600 ERROR("Failed to set personality");
cccc74b5
DL
3601 return -1;
3602 }
3603
8353b4c9
CB
3604 /* Set sysctl value to a path under /proc/sys as determined from the
3605 * key. For e.g. net.ipv4.ip_forward translated to
3606 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3607 */
3608 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3609 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
8353b4c9
CB
3610 if (ret < 0) {
3611 ERROR("Failed to setup sysctl parameters");
7edd0540 3612 return -1;
8353b4c9 3613 }
7edd0540
L
3614 }
3615
97a8f74f
SG
3616 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3617 if (!lxc_list_empty(&lxc_conf->caps)) {
8353b4c9
CB
3618 ERROR("Container requests lxc.cap.drop and "
3619 "lxc.cap.keep: either use lxc.cap.drop or "
3620 "lxc.cap.keep, not both");
f6d3e3e4
SH
3621 return -1;
3622 }
8353b4c9 3623
97a8f74f 3624 if (dropcaps_except(&lxc_conf->keepcaps)) {
8353b4c9 3625 ERROR("Failed to keep capabilities");
97a8f74f
SG
3626 return -1;
3627 }
3628 } else if (setup_caps(&lxc_conf->caps)) {
8353b4c9 3629 ERROR("Failed to drop capabilities");
97a8f74f 3630 return -1;
81810dd1
DL
3631 }
3632
8353b4c9 3633 NOTICE("The container \"%s\" is set up", name);
cd54d859 3634
0ad19a3f 3635 return 0;
3636}
26ddeedd 3637
3f60c2f7 3638int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3639 char *argv[])
26ddeedd 3640{
26ddeedd 3641 struct lxc_list *it;
3f60c2f7 3642 int which = -1;
26ddeedd 3643
3f60c2f7 3644 if (strcmp(hookname, "pre-start") == 0)
26ddeedd 3645 which = LXCHOOK_PRESTART;
3f60c2f7 3646 else if (strcmp(hookname, "start-host") == 0)
08dd2805 3647 which = LXCHOOK_START_HOST;
3f60c2f7 3648 else if (strcmp(hookname, "pre-mount") == 0)
5ea6163a 3649 which = LXCHOOK_PREMOUNT;
3f60c2f7 3650 else if (strcmp(hookname, "mount") == 0)
26ddeedd 3651 which = LXCHOOK_MOUNT;
3f60c2f7 3652 else if (strcmp(hookname, "autodev") == 0)
f7bee6c6 3653 which = LXCHOOK_AUTODEV;
3f60c2f7 3654 else if (strcmp(hookname, "start") == 0)
26ddeedd 3655 which = LXCHOOK_START;
3f60c2f7 3656 else if (strcmp(hookname, "stop") == 0)
52492063 3657 which = LXCHOOK_STOP;
3f60c2f7 3658 else if (strcmp(hookname, "post-stop") == 0)
26ddeedd 3659 which = LXCHOOK_POSTSTOP;
3f60c2f7 3660 else if (strcmp(hookname, "clone") == 0)
148e91f5 3661 which = LXCHOOK_CLONE;
3f60c2f7 3662 else if (strcmp(hookname, "destroy") == 0)
37cf711b 3663 which = LXCHOOK_DESTROY;
26ddeedd
SH
3664 else
3665 return -1;
3f60c2f7 3666
0fd73091 3667 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3668 int ret;
3f60c2f7
CB
3669 char *hook = it->elem;
3670
3671 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3672 hookname, argv);
3f60c2f7
CB
3673 if (ret < 0)
3674 return -1;
26ddeedd 3675 }
3f60c2f7 3676
26ddeedd
SH
3677 return 0;
3678}
72d0e1cb 3679
72d0e1cb
SG
3680int lxc_clear_config_caps(struct lxc_conf *c)
3681{
1a0e70ac 3682 struct lxc_list *it, *next;
72d0e1cb 3683
0fd73091 3684 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3685 lxc_list_del(it);
3686 free(it->elem);
3687 free(it);
3688 }
0fd73091 3689
72d0e1cb
SG
3690 return 0;
3691}
3692
c7e345ae
CB
3693static int lxc_free_idmap(struct lxc_list *id_map)
3694{
27c27d73
SH
3695 struct lxc_list *it, *next;
3696
0fd73091 3697 lxc_list_for_each_safe (it, id_map, next) {
27c27d73
SH
3698 lxc_list_del(it);
3699 free(it->elem);
3700 free(it);
3701 }
c7e345ae 3702
27c27d73
SH
3703 return 0;
3704}
3705
4355ab5f
SH
3706int lxc_clear_idmaps(struct lxc_conf *c)
3707{
3708 return lxc_free_idmap(&c->id_map);
3709}
3710
1fb86a7c
SH
3711int lxc_clear_config_keepcaps(struct lxc_conf *c)
3712{
0fd73091 3713 struct lxc_list *it, *next;
1fb86a7c 3714
0fd73091 3715 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3716 lxc_list_del(it);
3717 free(it->elem);
3718 free(it);
3719 }
0fd73091 3720
1fb86a7c
SH
3721 return 0;
3722}
3723
54860ed0 3724int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3725{
54860ed0 3726 char *global_token, *namespaced_token;
ab1a6cac 3727 size_t namespaced_token_len;
54860ed0 3728 struct lxc_list *it, *next, *list;
ab1a6cac 3729 const char *k = key;
54860ed0 3730 bool all = false;
72d0e1cb 3731
54860ed0
CB
3732 if (version == CGROUP2_SUPER_MAGIC) {
3733 global_token = "lxc.cgroup2";
3734 namespaced_token = "lxc.cgroup2.";
0fd73091 3735 namespaced_token_len = sizeof("lxc.cgroup2.") - 1;
54860ed0
CB
3736 list = &c->cgroup2;
3737 } else if (version == CGROUP_SUPER_MAGIC) {
3738 global_token = "lxc.cgroup";
3739 namespaced_token = "lxc.cgroup.";
0fd73091 3740 namespaced_token_len = sizeof("lxc.cgroup.") - 1;
54860ed0
CB
3741 list = &c->cgroup;
3742 } else {
ab1a6cac 3743 return -EINVAL;
54860ed0
CB
3744 }
3745
3746 if (strcmp(key, global_token) == 0)
72d0e1cb 3747 all = true;
54860ed0 3748 else if (strncmp(key, namespaced_token, sizeof(namespaced_token) - 1) == 0)
ab1a6cac 3749 k += namespaced_token_len;
a6390f01 3750 else
ab1a6cac 3751 return -EINVAL;
72d0e1cb 3752
0fd73091 3753 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3754 struct lxc_cgroup *cg = it->elem;
54860ed0 3755
72d0e1cb
SG
3756 if (!all && strcmp(cg->subsystem, k) != 0)
3757 continue;
54860ed0 3758
72d0e1cb
SG
3759 lxc_list_del(it);
3760 free(cg->subsystem);
3761 free(cg->value);
3762 free(cg);
3763 free(it);
3764 }
e409b214 3765
72d0e1cb
SG
3766 return 0;
3767}
3768
c6d09e15
WB
3769int lxc_clear_limits(struct lxc_conf *c, const char *key)
3770{
3771 struct lxc_list *it, *next;
c6d09e15 3772 const char *k = NULL;
0fd73091 3773 bool all = false;
c6d09e15 3774
b668653c 3775 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3776 all = true;
b668653c
CB
3777 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.") - 1) == 0)
3778 k = key + sizeof("lxc.limit.") - 1;
3779 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.") - 1) == 0)
3780 k = key + sizeof("lxc.prlimit.") - 1;
c6d09e15
WB
3781 else
3782 return -1;
3783
0fd73091 3784 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3785 struct lxc_limit *lim = it->elem;
0fd73091 3786
c6d09e15
WB
3787 if (!all && strcmp(lim->resource, k) != 0)
3788 continue;
0fd73091 3789
c6d09e15
WB
3790 lxc_list_del(it);
3791 free(lim->resource);
3792 free(lim);
3793 free(it);
3794 }
b668653c 3795
c6d09e15
WB
3796 return 0;
3797}
3798
7edd0540
L
3799int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3800{
3801 struct lxc_list *it, *next;
7edd0540 3802 const char *k = NULL;
0fd73091 3803 bool all = false;
7edd0540
L
3804
3805 if (strcmp(key, "lxc.sysctl") == 0)
3806 all = true;
3807 else if (strncmp(key, "lxc.sysctl.", sizeof("lxc.sysctl.") - 1) == 0)
3808 k = key + sizeof("lxc.sysctl.") - 1;
3809 else
3810 return -1;
3811
0fd73091 3812 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3813 struct lxc_sysctl *elem = it->elem;
0fd73091 3814
7edd0540
L
3815 if (!all && strcmp(elem->key, k) != 0)
3816 continue;
0fd73091 3817
7edd0540
L
3818 lxc_list_del(it);
3819 free(elem->key);
3820 free(elem->value);
3821 free(elem);
3822 free(it);
3823 }
0fd73091 3824
7edd0540
L
3825 return 0;
3826}
3827
61d7a733
YT
3828int lxc_clear_procs(struct lxc_conf *c, const char *key)
3829{
0fd73091 3830 struct lxc_list *it, *next;
61d7a733 3831 const char *k = NULL;
0fd73091 3832 bool all = false;
61d7a733
YT
3833
3834 if (strcmp(key, "lxc.proc") == 0)
3835 all = true;
3836 else if (strncmp(key, "lxc.proc.", sizeof("lxc.proc.") - 1) == 0)
3837 k = key + sizeof("lxc.proc.") - 1;
3838 else
3839 return -1;
3840
0fd73091 3841 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3842 struct lxc_proc *proc = it->elem;
0fd73091 3843
61d7a733
YT
3844 if (!all && strcmp(proc->filename, k) != 0)
3845 continue;
0fd73091 3846
61d7a733
YT
3847 lxc_list_del(it);
3848 free(proc->filename);
3849 free(proc->value);
3850 free(proc);
3851 free(it);
3852 }
3853
3854 return 0;
3855}
3856
ee1e7aa0
SG
3857int lxc_clear_groups(struct lxc_conf *c)
3858{
0fd73091 3859 struct lxc_list *it, *next;
ee1e7aa0 3860
0fd73091 3861 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3862 lxc_list_del(it);
3863 free(it->elem);
3864 free(it);
3865 }
0fd73091 3866
ee1e7aa0
SG
3867 return 0;
3868}
3869
ab799c0b
SG
3870int lxc_clear_environment(struct lxc_conf *c)
3871{
0fd73091 3872 struct lxc_list *it, *next;
ab799c0b 3873
0fd73091 3874 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3875 lxc_list_del(it);
3876 free(it->elem);
3877 free(it);
3878 }
0fd73091 3879
ab799c0b
SG
3880 return 0;
3881}
3882
72d0e1cb
SG
3883int lxc_clear_mount_entries(struct lxc_conf *c)
3884{
0fd73091 3885 struct lxc_list *it, *next;
72d0e1cb 3886
0fd73091 3887 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3888 lxc_list_del(it);
3889 free(it->elem);
3890 free(it);
3891 }
0fd73091 3892
72d0e1cb
SG
3893 return 0;
3894}
3895
b099e9e9
SH
3896int lxc_clear_automounts(struct lxc_conf *c)
3897{
3898 c->auto_mounts = 0;
3899 return 0;
3900}
3901
12a50cc6 3902int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3903{
72d0e1cb 3904 int i;
0fd73091
CB
3905 struct lxc_list *it, *next;
3906 const char *k = NULL;
3907 bool all = false, done = false;
72d0e1cb 3908
17ed13a3
SH
3909 if (strcmp(key, "lxc.hook") == 0)
3910 all = true;
0fd73091
CB
3911 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.") - 1) == 0)
3912 k = key + sizeof("lxc.hook.") - 1;
a6390f01
WB
3913 else
3914 return -1;
17ed13a3 3915
0fd73091 3916 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 3917 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 3918 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3919 lxc_list_del(it);
3920 free(it->elem);
3921 free(it);
3922 }
0fd73091 3923
17ed13a3 3924 done = true;
72d0e1cb
SG
3925 }
3926 }
17ed13a3
SH
3927
3928 if (!done) {
3929 ERROR("Invalid hook key: %s", key);
3930 return -1;
3931 }
0fd73091 3932
72d0e1cb
SG
3933 return 0;
3934}
8eb5694b 3935
4184c3e1
SH
3936static inline void lxc_clear_aliens(struct lxc_conf *conf)
3937{
0fd73091 3938 struct lxc_list *it, *next;
4184c3e1 3939
0fd73091 3940 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
3941 lxc_list_del(it);
3942 free(it->elem);
3943 free(it);
3944 }
3945}
3946
c7b15d1e 3947void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 3948{
0fd73091 3949 struct lxc_list *it, *next;
f979ac15 3950
0fd73091 3951 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
3952 lxc_list_del(it);
3953 free(it->elem);
3954 free(it);
3955 }
3956}
3957
8eb5694b
SH
3958void lxc_conf_free(struct lxc_conf *conf)
3959{
3960 if (!conf)
3961 return;
0fd73091 3962
858377e4
SH
3963 if (current_config == conf)
3964 current_config = NULL;
aed105d5 3965 lxc_terminal_conf_free(&conf->console);
f10fad2f 3966 free(conf->rootfs.mount);
b3b8c97f 3967 free(conf->rootfs.bdev_type);
f10fad2f
ME
3968 free(conf->rootfs.options);
3969 free(conf->rootfs.path);
f10fad2f 3970 free(conf->logfile);
858377e4
SH
3971 if (conf->logfd != -1)
3972 close(conf->logfd);
f10fad2f 3973 free(conf->utsname);
885766f5
CB
3974 free(conf->ttys.dir);
3975 free(conf->ttys.tty_names);
f10fad2f
ME
3976 free(conf->fstab);
3977 free(conf->rcfile);
5cda27c1 3978 free(conf->execute_cmd);
f10fad2f 3979 free(conf->init_cmd);
3c491553 3980 free(conf->init_cwd);
6b0d5538 3981 free(conf->unexpanded_config);
76d0127f 3982 free(conf->syslog);
c302b476 3983 lxc_free_networks(&conf->network);
f10fad2f
ME
3984 free(conf->lsm_aa_profile);
3985 free(conf->lsm_se_context);
769872f9 3986 lxc_seccomp_free(conf);
8eb5694b 3987 lxc_clear_config_caps(conf);
1fb86a7c 3988 lxc_clear_config_keepcaps(conf);
54860ed0
CB
3989 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
3990 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
17ed13a3 3991 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3992 lxc_clear_mount_entries(conf);
27c27d73 3993 lxc_clear_idmaps(conf);
ee1e7aa0 3994 lxc_clear_groups(conf);
f979ac15 3995 lxc_clear_includes(conf);
761d81ca 3996 lxc_clear_aliens(conf);
ab799c0b 3997 lxc_clear_environment(conf);
240d4b74 3998 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 3999 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 4000 lxc_clear_procs(conf, "lxc.proc");
43654d34
CB
4001 free(conf->cgroup_meta.dir);
4002 free(conf->cgroup_meta.controllers);
8eb5694b
SH
4003 free(conf);
4004}
4355ab5f
SH
4005
/* Arguments handed to run_userns_fn(). */
struct userns_fn_data {
	/* Function to call once the parent has signalled via the pipe. */
	int (*fn)(void *);
	/* Optional name of @fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Synchronization pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4012
4013static int run_userns_fn(void *data)
4014{
4355ab5f 4015 char c;
0fd73091 4016 struct userns_fn_data *d = data;
4355ab5f 4017
f8aa4bf3 4018 /* Close write end of the pipe. */
4355ab5f 4019 close(d->p[1]);
f8aa4bf3
CB
4020
4021 /* Wait for parent to finish establishing a new mapping in the user
4022 * namespace we are executing in.
4023 */
489f39be 4024 if (lxc_read_nointr(d->p[0], &c, 1) != 1)
4355ab5f 4025 return -1;
f8aa4bf3
CB
4026
4027 /* Close read end of the pipe. */
4355ab5f 4028 close(d->p[0]);
f8aa4bf3 4029
c9b7c33e
CB
4030 if (d->fn_name)
4031 TRACE("calling function \"%s\"", d->fn_name);
0fd73091 4032
f8aa4bf3 4033 /* Call function to run. */
4355ab5f
SH
4034 return d->fn(d->arg);
4035}
4036
db7cfe23
CB
4037static struct id_map *mapped_nsid_add(struct lxc_conf *conf, unsigned id,
4038 enum idtype idtype)
4039{
5173b710
CB
4040 const struct id_map *map;
4041 struct id_map *retmap;
db7cfe23
CB
4042
4043 map = find_mapped_nsid_entry(conf, id, idtype);
4044 if (!map)
4045 return NULL;
4046
4047 retmap = malloc(sizeof(*retmap));
4048 if (!retmap)
4049 return NULL;
4050
4051 memcpy(retmap, map, sizeof(*retmap));
4052 return retmap;
4053}
4054
c4333195
CB
4055static struct id_map *find_mapped_hostid_entry(struct lxc_conf *conf,
4056 unsigned id, enum idtype idtype)
f8aa4bf3 4057{
f8aa4bf3 4058 struct id_map *map;
0fd73091 4059 struct lxc_list *it;
f8aa4bf3
CB
4060 struct id_map *retmap = NULL;
4061
0fd73091 4062 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4063 map = it->elem;
4064 if (map->idtype != idtype)
4065 continue;
4066
4067 if (id >= map->hostid && id < map->hostid + map->range) {
4068 retmap = map;
4069 break;
4070 }
4071 }
4072
f8aa4bf3
CB
4073 return retmap;
4074}
4075
0fd73091 4076/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4077 * existing one or establish a new one.
4355ab5f 4078 */
0fd73091
CB
4079static struct id_map *mapped_hostid_add(struct lxc_conf *conf, uid_t id,
4080 enum idtype type)
4355ab5f 4081{
28a2d9e7 4082 int hostid_mapped;
c4333195
CB
4083 struct id_map *entry = NULL, *tmp = NULL;
4084
4085 entry = malloc(sizeof(*entry));
4086 if (!entry)
4087 return NULL;
f8aa4bf3 4088
28a2d9e7 4089 /* Reuse existing mapping. */
c4333195
CB
4090 tmp = find_mapped_hostid_entry(conf, id, type);
4091 if (tmp)
4092 return memcpy(entry, tmp, sizeof(*entry));
f8aa4bf3 4093
28a2d9e7
CB
4094 /* Find new mapping. */
4095 hostid_mapped = find_unmapped_nsid(conf, type);
4096 if (hostid_mapped < 0) {
c4333195
CB
4097 DEBUG("Failed to find free mapping for id %d", id);
4098 free(entry);
28a2d9e7 4099 return NULL;
f8aa4bf3 4100 }
f8aa4bf3 4101
28a2d9e7
CB
4102 entry->idtype = type;
4103 entry->nsid = hostid_mapped;
4104 entry->hostid = (unsigned long)id;
4105 entry->range = 1;
4355ab5f 4106
28a2d9e7 4107 return entry;
4355ab5f
SH
4108}
4109
dcf0ffdf 4110struct lxc_list *get_minimal_idmap(struct lxc_conf *conf)
4355ab5f 4111{
f8aa4bf3 4112 uid_t euid, egid;
4160c3a0
CB
4113 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4114 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
f8aa4bf3 4115 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4116 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4117 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4118
db7cfe23 4119 /* Find container root mappings. */
4160c3a0 4120 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
db7cfe23 4121 if (!container_root_uid) {
dcf0ffdf 4122 DEBUG("Failed to find mapping for namespace uid %d", 0);
db7cfe23 4123 goto on_error;
f8aa4bf3 4124 }
dcf0ffdf
CB
4125 euid = geteuid();
4126 if (euid >= container_root_uid->hostid &&
4127 euid < (container_root_uid->hostid + container_root_uid->range))
db7cfe23 4128 host_uid_map = container_root_uid;
f8aa4bf3 4129
4160c3a0 4130 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
db7cfe23 4131 if (!container_root_gid) {
dcf0ffdf 4132 DEBUG("Failed to find mapping for namespace gid %d", 0);
f8aa4bf3
CB
4133 goto on_error;
4134 }
dcf0ffdf
CB
4135 egid = getegid();
4136 if (egid >= container_root_gid->hostid &&
4137 egid < (container_root_gid->hostid + container_root_gid->range))
db7cfe23 4138 host_gid_map = container_root_gid;
f8aa4bf3
CB
4139
4140 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4141 if (!host_uid_map)
c4333195 4142 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
28a2d9e7 4143 if (!host_uid_map) {
db7cfe23 4144 DEBUG("Failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4145 goto on_error;
4146 }
4147
dcf0ffdf
CB
4148 if (!host_gid_map)
4149 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
28a2d9e7 4150 if (!host_gid_map) {
db7cfe23 4151 DEBUG("Failed to find mapping for gid %d", egid);
28a2d9e7
CB
4152 goto on_error;
4153 }
4154
4155 /* Allocate new {g,u}id map list. */
4156 idmap = malloc(sizeof(*idmap));
4157 if (!idmap)
4158 goto on_error;
4159 lxc_list_init(idmap);
4160
f8aa4bf3
CB
4161 /* Add container root to the map. */
4162 tmplist = malloc(sizeof(*tmplist));
4163 if (!tmplist)
4164 goto on_error;
4165 lxc_list_add_elem(tmplist, container_root_uid);
4166 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4167
1d90e064 4168 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4169 /* idmap will now keep track of that memory. */
4170 container_root_uid = NULL;
4171
4172 /* Add container root to the map. */
4173 tmplist = malloc(sizeof(*tmplist));
4174 if (!tmplist)
4175 goto on_error;
4176 lxc_list_add_elem(tmplist, host_uid_map);
4177 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4178 }
1d90e064
CB
4179 /* idmap will now keep track of that memory. */
4180 container_root_uid = NULL;
4181 /* idmap will now keep track of that memory. */
4182 host_uid_map = NULL;
f8aa4bf3
CB
4183
4184 tmplist = malloc(sizeof(*tmplist));
4185 if (!tmplist)
4186 goto on_error;
4187 lxc_list_add_elem(tmplist, container_root_gid);
4188 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4189
1d90e064 4190 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4191 /* idmap will now keep track of that memory. */
4192 container_root_gid = NULL;
4193
4194 tmplist = malloc(sizeof(*tmplist));
4195 if (!tmplist)
4196 goto on_error;
4197 lxc_list_add_elem(tmplist, host_gid_map);
4198 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4199 }
1d90e064
CB
4200 /* idmap will now keep track of that memory. */
4201 container_root_gid = NULL;
4202 /* idmap will now keep track of that memory. */
4203 host_gid_map = NULL;
f8aa4bf3 4204
dcf0ffdf
CB
4205 TRACE("Allocated minimal idmapping");
4206 return idmap;
4207
4208on_error:
4dc41f99 4209 if (idmap) {
dcf0ffdf 4210 lxc_free_idmap(idmap);
4dc41f99
SX
4211 free(idmap);
4212 }
dcf0ffdf
CB
4213 if (container_root_uid)
4214 free(container_root_uid);
4215 if (container_root_gid)
4216 free(container_root_gid);
4217 if (host_uid_map && (host_uid_map != container_root_uid))
4218 free(host_uid_map);
4219 if (host_gid_map && (host_gid_map != container_root_gid))
4220 free(host_gid_map);
4221
4222 return NULL;
4223}
4224
4225/* Run a function in a new user namespace.
4226 * The caller's euid/egid will be mapped if it is not already.
4227 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4228 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4229 * This means we require only to establish a mapping from:
4230 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4231 * - the container root -> some sub{g,u}id
4232 * The former we add, if the user did not specifiy a mapping. The latter we
4233 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4234 * there to start the container in the first place.
4235 */
4236int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4237 const char *fn_name)
4238{
4239 pid_t pid;
dcf0ffdf 4240 int p[2];
0fd73091 4241 struct userns_fn_data d;
dcf0ffdf 4242 struct lxc_list *idmap;
0fd73091
CB
4243 int ret = -1, status = -1;
4244 char c = '1';
dcf0ffdf 4245
2b2655a8
CB
4246 if (!conf)
4247 return -EINVAL;
4248
dcf0ffdf
CB
4249 idmap = get_minimal_idmap(conf);
4250 if (!idmap)
4251 return -1;
4252
4253 ret = pipe(p);
4254 if (ret < 0) {
4255 SYSERROR("Failed to create pipe");
4256 return -1;
4257 }
4258 d.fn = fn;
4259 d.fn_name = fn_name;
4260 d.arg = data;
4261 d.p[0] = p[0];
4262 d.p[1] = p[1];
4263
4264 /* Clone child in new user namespace. */
4265 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER);
4266 if (pid < 0) {
0fd73091 4267 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4268 goto on_error;
4269 }
4270
4271 close(p[0]);
4272 p[0] = -1;
4273
4b73005c
CB
4274 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4275 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
dcf0ffdf 4276 struct id_map *map;
0fd73091 4277 struct lxc_list *it;
dcf0ffdf 4278
0fd73091 4279 lxc_list_for_each (it, idmap) {
f8aa4bf3 4280 map = it->elem;
dcf0ffdf 4281 TRACE("Establishing %cid mapping for \"%d\" in new "
f8aa4bf3 4282 "user namespace: nsuid %lu - hostid %lu - range "
0fd73091
CB
4283 "%lu",
4284 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4285 map->nsid, map->hostid, map->range);
f8aa4bf3 4286 }
4355ab5f
SH
4287 }
4288
f8aa4bf3 4289 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4290 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4291 if (ret < 0) {
0fd73091 4292 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4293 goto on_error;
4355ab5f
SH
4294 }
4295
f8aa4bf3 4296 /* Tell child to proceed. */
489f39be 4297 if (lxc_write_nointr(p[1], &c, 1) != 1) {
dcf0ffdf 4298 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4299 goto on_error;
4355ab5f
SH
4300 }
4301
686dd5d1 4302on_error:
4355ab5f
SH
4303 if (p[0] != -1)
4304 close(p[0]);
4305 close(p[1]);
f8aa4bf3 4306
ee1b16bc
TA
4307 /* Wait for child to finish. */
4308 if (pid > 0)
4309 status = wait_for_pid(pid);
4310
686dd5d1
CB
4311 if (status < 0)
4312 ret = -1;
4313
f8aa4bf3 4314 return ret;
4355ab5f 4315}
97e9cfa0 4316
415a8851
CB
4317int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4318 const char *fn_name)
4319{
4320 pid_t pid;
4321 uid_t euid, egid;
415a8851
CB
4322 int p[2];
4323 struct id_map *map;
4324 struct lxc_list *cur;
0fd73091 4325 struct userns_fn_data d;
415a8851 4326 int ret = -1;
0fd73091 4327 char c = '1';
415a8851
CB
4328 struct lxc_list *idmap = NULL, *tmplist = NULL;
4329 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4330 *host_uid_map = NULL, *host_gid_map = NULL;
4331
2b2655a8
CB
4332 if (!conf)
4333 return -EINVAL;
4334
415a8851
CB
4335 ret = pipe(p);
4336 if (ret < 0) {
4337 SYSERROR("opening pipe");
4338 return -1;
4339 }
4340 d.fn = fn;
4341 d.fn_name = fn_name;
4342 d.arg = data;
4343 d.p[0] = p[0];
4344 d.p[1] = p[1];
4345
4346 /* Clone child in new user namespace. */
4347 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4348 if (pid < 0) {
0fd73091 4349 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4350 goto on_error;
4351 }
4352
4353 close(p[0]);
4354 p[0] = -1;
4355
4356 euid = geteuid();
4357 egid = getegid();
4358
4359 /* Allocate new {g,u}id map list. */
4360 idmap = malloc(sizeof(*idmap));
4361 if (!idmap)
4362 goto on_error;
4363 lxc_list_init(idmap);
4364
4365 /* Find container root. */
0fd73091 4366 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4367 struct id_map *tmpmap;
4368
4369 tmplist = malloc(sizeof(*tmplist));
4370 if (!tmplist)
4371 goto on_error;
4372
4373 tmpmap = malloc(sizeof(*tmpmap));
4374 if (!tmpmap) {
4375 free(tmplist);
4376 goto on_error;
4377 }
4378
4379 memset(tmpmap, 0, sizeof(*tmpmap));
4380 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4381 tmplist->elem = tmpmap;
4382
4383 lxc_list_add_tail(idmap, tmplist);
4384
4385 map = cur->elem;
4386
4387 if (map->idtype == ID_TYPE_UID)
4388 if (euid >= map->hostid && euid < map->hostid + map->range)
4389 host_uid_map = map;
4390
4391 if (map->idtype == ID_TYPE_GID)
4392 if (egid >= map->hostid && egid < map->hostid + map->range)
4393 host_gid_map = map;
4394
4395 if (map->nsid != 0)
4396 continue;
4397
4398 if (map->idtype == ID_TYPE_UID)
4399 if (container_root_uid == NULL)
4400 container_root_uid = map;
4401
4402 if (map->idtype == ID_TYPE_GID)
4403 if (container_root_gid == NULL)
4404 container_root_gid = map;
4405 }
4406
4407 if (!container_root_uid || !container_root_gid) {
4408 ERROR("No mapping for container root found");
4409 goto on_error;
4410 }
4411
4412 /* Check whether the {g,u}id of the user has a mapping. */
4413 if (!host_uid_map)
c4333195 4414 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4415 else
4416 host_uid_map = container_root_uid;
4417
4418 if (!host_gid_map)
c4333195 4419 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4420 else
4421 host_gid_map = container_root_gid;
4422
4423 if (!host_uid_map) {
4424 DEBUG("Failed to find mapping for uid %d", euid);
4425 goto on_error;
4426 }
4427
4428 if (!host_gid_map) {
4429 DEBUG("Failed to find mapping for gid %d", egid);
4430 goto on_error;
4431 }
4432
4433 if (host_uid_map && (host_uid_map != container_root_uid)) {
4434 /* Add container root to the map. */
4435 tmplist = malloc(sizeof(*tmplist));
4436 if (!tmplist)
4437 goto on_error;
4438 lxc_list_add_elem(tmplist, host_uid_map);
4439 lxc_list_add_tail(idmap, tmplist);
4440 }
4441 /* idmap will now keep track of that memory. */
4442 host_uid_map = NULL;
4443
4444 if (host_gid_map && (host_gid_map != container_root_gid)) {
4445 tmplist = malloc(sizeof(*tmplist));
4446 if (!tmplist)
4447 goto on_error;
4448 lxc_list_add_elem(tmplist, host_gid_map);
4449 lxc_list_add_tail(idmap, tmplist);
4450 }
4451 /* idmap will now keep track of that memory. */
4452 host_gid_map = NULL;
4453
4454 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4455 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
0fd73091 4456 lxc_list_for_each (cur, idmap) {
415a8851
CB
4457 map = cur->elem;
4458 TRACE("establishing %cid mapping for \"%d\" in new "
4459 "user namespace: nsuid %lu - hostid %lu - range "
4460 "%lu",
4461 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4462 map->nsid, map->hostid, map->range);
4463 }
4464 }
4465
4466 /* Set up {g,u}id mapping for user namespace of child process. */
4467 ret = lxc_map_ids(idmap, pid);
4468 if (ret < 0) {
0fd73091 4469 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4470 goto on_error;
4471 }
4472
4473 /* Tell child to proceed. */
489f39be 4474 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4475 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4476 goto on_error;
4477 }
4478
686dd5d1 4479on_error:
ee1b16bc
TA
4480 if (p[0] != -1)
4481 close(p[0]);
4482 close(p[1]);
4483
415a8851 4484 /* Wait for child to finish. */
686dd5d1
CB
4485 if (pid > 0)
4486 ret = wait_for_pid(pid);
415a8851 4487
80758b4b 4488 if (idmap) {
415a8851 4489 lxc_free_idmap(idmap);
80758b4b
DJ
4490 free(idmap);
4491 }
4492
415a8851
CB
4493 if (host_uid_map && (host_uid_map != container_root_uid))
4494 free(host_uid_map);
4495 if (host_gid_map && (host_gid_map != container_root_gid))
4496 free(host_gid_map);
4497
415a8851
CB
4498 return ret;
4499}
4500
/* Return a newly allocated copy of the login name belonging to the effective
 * uid, or NULL if the lookup or an allocation fails. The caller frees the
 * result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd pwbuf;
	struct passwd *found = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long size_hint;
	int err;

	/* sysconf() yields -1 when it has no size to suggest. */
	size_hint = sysconf(_SC_GETPW_R_SIZE_MAX);
	scratch_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getpwuid_r(geteuid(), &pwbuf, scratch, scratch_len, &found);
	if (found) {
		/* pw_name points into scratch, so copy before releasing it. */
		name = strdup(pwbuf.pw_name);
		free(scratch);
		return name;
	}

	/* A zero return with no result means there simply is no entry. */
	if (err == 0)
		WARN("Could not find matched password record.");

	ERROR("Failed to get password record - %u", geteuid());
	free(scratch);

	return NULL;
}
4534
/* Return a newly allocated copy of the group name belonging to the effective
 * gid, or NULL if the lookup or an allocation fails. The caller frees the
 * result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group grbuf;
	struct group *found = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long size_hint;
	int err;

	/* sysconf() yields -1 when it has no size to suggest. */
	size_hint = sysconf(_SC_GETGR_R_SIZE_MAX);
	scratch_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getgrgid_r(getegid(), &grbuf, scratch, scratch_len, &found);
	if (found) {
		/* gr_name points into scratch, so copy before releasing it. */
		name = strdup(grbuf.gr_name);
		free(scratch);
		return name;
	}

	/* A zero return with no result means there simply is no entry. */
	if (err == 0)
		WARN("Could not find matched group record");

	ERROR("Failed to get group record - %u", getegid());
	free(scratch);

	return NULL;
}
4568
a96a8e8c 4569/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4570void suggest_default_idmap(void)
4571{
0fd73091 4572 char *uname, *gname;
97e9cfa0
SH
4573 FILE *f;
4574 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0 4575 size_t len = 0;
0fd73091 4576 char *line = NULL;
97e9cfa0 4577
0fd73091
CB
4578 uname = getuname();
4579 if (!uname)
97e9cfa0
SH
4580 return;
4581
0fd73091
CB
4582 gname = getgname();
4583 if (!gname) {
97e9cfa0
SH
4584 free(uname);
4585 return;
4586 }
4587
4588 f = fopen(subuidfile, "r");
4589 if (!f) {
4590 ERROR("Your system is not configured with subuids");
4591 free(gname);
4592 free(uname);
4593 return;
4594 }
0fd73091 4595
97e9cfa0 4596 while (getline(&line, &len, f) != -1) {
0fd73091 4597 char *p, *p2;
b7930180 4598 size_t no_newline = 0;
0fd73091
CB
4599
4600 p = strchr(line, ':');
97e9cfa0
SH
4601 if (*line == '#')
4602 continue;
4603 if (!p)
4604 continue;
4605 *p = '\0';
4606 p++;
0fd73091 4607
97e9cfa0
SH
4608 if (strcmp(line, uname))
4609 continue;
0fd73091 4610
97e9cfa0
SH
4611 p2 = strchr(p, ':');
4612 if (!p2)
4613 continue;
4614 *p2 = '\0';
4615 p2++;
4616 if (!*p2)
4617 continue;
b7930180
CB
4618 no_newline = strcspn(p2, "\n");
4619 p2[no_newline] = '\0';
4620
b7b2fde4 4621 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4622 WARN("Could not parse UID");
b7b2fde4 4623 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4624 WARN("Could not parse UID range");
97e9cfa0
SH
4625 }
4626 fclose(f);
4627
6be7389a 4628 f = fopen(subgidfile, "r");
97e9cfa0
SH
4629 if (!f) {
4630 ERROR("Your system is not configured with subgids");
4631 free(gname);
4632 free(uname);
4633 return;
4634 }
0fd73091 4635
97e9cfa0 4636 while (getline(&line, &len, f) != -1) {
0fd73091 4637 char *p, *p2;
b7930180 4638 size_t no_newline = 0;
0fd73091
CB
4639
4640 p = strchr(line, ':');
97e9cfa0
SH
4641 if (*line == '#')
4642 continue;
4643 if (!p)
4644 continue;
4645 *p = '\0';
4646 p++;
0fd73091 4647
97e9cfa0
SH
4648 if (strcmp(line, uname))
4649 continue;
0fd73091 4650
97e9cfa0
SH
4651 p2 = strchr(p, ':');
4652 if (!p2)
4653 continue;
4654 *p2 = '\0';
4655 p2++;
4656 if (!*p2)
4657 continue;
b7930180
CB
4658 no_newline = strcspn(p2, "\n");
4659 p2[no_newline] = '\0';
4660
b7b2fde4 4661 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4662 WARN("Could not parse GID");
b7b2fde4 4663 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4664 WARN("Could not parse GID range");
97e9cfa0
SH
4665 }
4666 fclose(f);
4667
f10fad2f 4668 free(line);
97e9cfa0
SH
4669
4670 if (!urange || !grange) {
4671 ERROR("You do not have subuids or subgids allocated");
4672 ERROR("Unprivileged containers require subuids and subgids");
fbd4a4d1 4673 free(uname);
1e7cd2f7 4674 free(gname);
97e9cfa0
SH
4675 return;
4676 }
4677
4678 ERROR("You must either run as root, or define uid mappings");
4679 ERROR("To pass uid mappings to lxc-create, you could create");
4680 ERROR("~/.config/lxc/default.conf:");
4681 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4682 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4683 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0
SH
4684
4685 free(gname);
4686 free(uname);
4687}
aaf26830 4688
a7307747
SH
4689static void free_cgroup_settings(struct lxc_list *result)
4690{
4691 struct lxc_list *iterator, *next;
4692
0fd73091 4693 lxc_list_for_each_safe (iterator, result, next) {
a7307747
SH
4694 lxc_list_del(iterator);
4695 free(iterator);
4696 }
4697 free(result);
4698}
4699
0fd73091 4700/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4701 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4702 */
0fd73091 4703struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4704{
4705 struct lxc_list *result;
aaf26830 4706 struct lxc_cgroup *cg = NULL;
0fd73091 4707 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4708
4709 result = malloc(sizeof(*result));
0fd73091 4710 if (!result)
fac7c663 4711 return NULL;
aaf26830
KT
4712 lxc_list_init(result);
4713
0fd73091
CB
4714 /* Iterate over the cgroup settings and copy them to the output list. */
4715 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4716 item = malloc(sizeof(*item));
fac7c663 4717 if (!item) {
a7307747 4718 free_cgroup_settings(result);
fac7c663
KT
4719 return NULL;
4720 }
0fd73091 4721
aaf26830
KT
4722 item->elem = it->elem;
4723 cg = it->elem;
4724 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4725 /* Store the memsw_limit location */
4726 memsw_limit = item;
0fd73091
CB
4727 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4728 memsw_limit != NULL) {
4729 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4730 * before lxc.cgroup.memory.limit_in_bytes, swap these
4731 * two items */
aaf26830
KT
4732 item->elem = memsw_limit->elem;
4733 memsw_limit->elem = it->elem;
4734 }
4735 lxc_list_add_tail(result, item);
4736 }
4737
4738 return result;
a7307747 4739}