]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
lxccontainer: reword create_mount_target()
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8 25#include "config.h"
0d190408 26#include "confile.h"
d06245b8 27
9d257a2a 28#include <arpa/inet.h>
8f3e280e
CB
29#include <dirent.h>
30#include <errno.h>
31#include <fcntl.h>
32#include <grp.h>
33#include <inttypes.h>
34#include <libgen.h>
9d257a2a
CB
35#include <linux/loop.h>
36#include <net/if.h>
37#include <netinet/in.h>
8f3e280e
CB
38#include <pwd.h>
39#include <stdarg.h>
0ad19a3f 40#include <stdio.h>
0ad19a3f 41#include <stdlib.h>
0ad19a3f 42#include <string.h>
8f3e280e
CB
43#include <sys/mman.h>
44#include <sys/mount.h>
45#include <sys/param.h>
46#include <sys/prctl.h>
6a49f05e 47#include <sys/sendfile.h>
8f3e280e 48#include <sys/socket.h>
9d257a2a 49#include <sys/stat.h>
2d76d1d7 50#include <sys/syscall.h>
9d257a2a 51#include <sys/sysmacros.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
9d257a2a
CB
55#include <time.h>
56#include <unistd.h>
1d52bdf7 57
af6824fc 58#ifdef MAJOR_IN_MKDEV
9d257a2a 59#include <sys/mkdev.h>
af6824fc 60#endif
af6824fc 61
614305f3 62#ifdef HAVE_STATVFS
2938f7c8 63#include <sys/statvfs.h>
614305f3 64#endif
e827ff7e
SG
65
66#if HAVE_PTY_H
b0a33c1e 67#include <pty.h>
e827ff7e
SG
68#else
69#include <../include/openpty.h>
70#endif
0ad19a3f 71
9d257a2a
CB
72#if HAVE_LIBCAP
73#include <sys/capability.h>
74#endif
75
76#if HAVE_SYS_PERSONALITY_H
77#include <sys/personality.h>
78#endif
79
f1e05b90
DJ
80#ifndef HAVE_STRLCAT
81#include "include/strlcat.h"
82#endif
83
9d257a2a
CB
84#if IS_BIONIC
85#include <../include/lxcmntent.h>
86#else
87#include <mntent.h>
88#endif
89
90#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
91#include <../include/prlimit.h>
92#endif
93
e8bd4e43 94#include "af_unix.h"
9d257a2a 95#include "caps.h"
8f3e280e 96#include "cgroup.h"
1b09f2c0 97#include "conf.h"
1ed6ba91 98#include "confile_utils.h"
8f3e280e 99#include "error.h"
1b09f2c0 100#include "log.h"
0ed9b1bc 101#include "lsm/lsm.h"
025ed0f3 102#include "lxclock.h"
8f3e280e 103#include "lxcseccomp.h"
4355ab5f 104#include "namespace.h"
8f3e280e
CB
105#include "network.h"
106#include "parse.h"
732375f5 107#include "ringbuf.h"
794248d0 108#include "start.h"
28d832c4 109#include "storage.h"
28d832c4 110#include "storage/overlay.h"
0ed9b1bc 111#include "terminal.h"
8f3e280e 112#include "utils.h"
d0a36f2c 113
9d257a2a
CB
114#ifndef MS_PRIVATE
115#define MS_PRIVATE (1<<18)
edaf8b1b
SG
116#endif
117
9d257a2a
CB
118#ifndef MS_LAZYTIME
119#define MS_LAZYTIME (1<<25)
f48b5fd8
FF
120#endif
121
ac2cecc4 122lxc_log_define(conf, lxc);
e5bda9ee 123
0fd73091
CB
/* Configuration of the container the current API call operates on.
 * The error reporting calls consult it. It is thread-local whenever the
 * toolchain supports TLS.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
132
2d76d1d7
SG
/* Supply pivot_root() ourselves when the C library does not provide it. */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
/* Issues the raw pivot_root system call. When the syscall number is not
 * known at build time the call fails with errno set to ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
147
0fd73091
CB
148char *lxchook_names[NUM_LXC_HOOKS] = {
149 "pre-start",
150 "pre-mount",
151 "mount",
152 "autodev",
153 "start",
154 "stop",
155 "post-stop",
156 "clone",
157 "destroy",
158 "start-host"
159};
72d0e1cb 160
998ac676
RT
/* One textual mount option and the mount(2) flag bits it maps to. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* presumably: non-zero when the option clears flag instead of setting it */
	int flag;	/* MS_* bits associated with the option */
};

/* One capability name and its numeric CAP_* value. */
struct caps_opt {
	char *name;
	int value;
};

/* One resource limit name and its numeric RLIMIT_* value. */
struct limit_opt {
	char *name;
	int value;
};
176
998ac676 177static struct mount_opt mount_opt[] = {
470b359b
CB
178 { "async", 1, MS_SYNCHRONOUS },
179 { "atime", 1, MS_NOATIME },
180 { "bind", 0, MS_BIND },
88d413d5 181 { "defaults", 0, 0 },
88d413d5 182 { "dev", 1, MS_NODEV },
470b359b 183 { "diratime", 1, MS_NODIRATIME },
88d413d5 184 { "dirsync", 0, MS_DIRSYNC },
470b359b 185 { "exec", 1, MS_NOEXEC },
8912711c 186 { "lazytime", 0, MS_LAZYTIME },
88d413d5 187 { "mand", 0, MS_MANDLOCK },
88d413d5 188 { "noatime", 0, MS_NOATIME },
470b359b 189 { "nodev", 0, MS_NODEV },
88d413d5 190 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
191 { "noexec", 0, MS_NOEXEC },
192 { "nomand", 1, MS_MANDLOCK },
193 { "norelatime", 1, MS_RELATIME },
194 { "nostrictatime", 1, MS_STRICTATIME },
195 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
196 { "rbind", 0, MS_BIND|MS_REC },
197 { "relatime", 0, MS_RELATIME },
470b359b
CB
198 { "remount", 0, MS_REMOUNT },
199 { "ro", 0, MS_RDONLY },
200 { "rw", 1, MS_RDONLY },
88d413d5 201 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
202 { "suid", 1, MS_NOSUID },
203 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 204 { NULL, 0, 0 },
998ac676
RT
205};
206
d840039e 207static struct mount_opt propagation_opt[] = {
0fd73091
CB
208 { "private", 0, MS_PRIVATE },
209 { "shared", 0, MS_SHARED },
210 { "slave", 0, MS_SLAVE },
211 { "unbindable", 0, MS_UNBINDABLE },
212 { "rprivate", 0, MS_PRIVATE|MS_REC },
213 { "rshared", 0, MS_SHARED|MS_REC },
214 { "rslave", 0, MS_SLAVE|MS_REC },
215 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
216 { NULL, 0, 0 },
d840039e
YT
217};
218
81810dd1 219static struct caps_opt caps_opt[] = {
8560cd36 220#if HAVE_LIBCAP
0fd73091
CB
221 { "chown", CAP_CHOWN },
222 { "dac_override", CAP_DAC_OVERRIDE },
223 { "dac_read_search", CAP_DAC_READ_SEARCH },
224 { "fowner", CAP_FOWNER },
225 { "fsetid", CAP_FSETID },
226 { "kill", CAP_KILL },
227 { "setgid", CAP_SETGID },
228 { "setuid", CAP_SETUID },
229 { "setpcap", CAP_SETPCAP },
230 { "linux_immutable", CAP_LINUX_IMMUTABLE },
231 { "net_bind_service", CAP_NET_BIND_SERVICE },
232 { "net_broadcast", CAP_NET_BROADCAST },
233 { "net_admin", CAP_NET_ADMIN },
234 { "net_raw", CAP_NET_RAW },
235 { "ipc_lock", CAP_IPC_LOCK },
236 { "ipc_owner", CAP_IPC_OWNER },
237 { "sys_module", CAP_SYS_MODULE },
238 { "sys_rawio", CAP_SYS_RAWIO },
239 { "sys_chroot", CAP_SYS_CHROOT },
240 { "sys_ptrace", CAP_SYS_PTRACE },
241 { "sys_pacct", CAP_SYS_PACCT },
242 { "sys_admin", CAP_SYS_ADMIN },
243 { "sys_boot", CAP_SYS_BOOT },
244 { "sys_nice", CAP_SYS_NICE },
245 { "sys_resource", CAP_SYS_RESOURCE },
246 { "sys_time", CAP_SYS_TIME },
247 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
248 { "mknod", CAP_MKNOD },
249 { "lease", CAP_LEASE },
57b837e2 250#ifdef CAP_AUDIT_READ
0fd73091 251 { "audit_read", CAP_AUDIT_READ },
57b837e2 252#endif
9527e566 253#ifdef CAP_AUDIT_WRITE
0fd73091 254 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
255#endif
256#ifdef CAP_AUDIT_CONTROL
0fd73091 257 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 258#endif
0fd73091
CB
259 { "setfcap", CAP_SETFCAP },
260 { "mac_override", CAP_MAC_OVERRIDE },
261 { "mac_admin", CAP_MAC_ADMIN },
5170c716 262#ifdef CAP_SYSLOG
0fd73091 263 { "syslog", CAP_SYSLOG },
5170c716
CS
264#endif
265#ifdef CAP_WAKE_ALARM
0fd73091 266 { "wake_alarm", CAP_WAKE_ALARM },
5170c716 267#endif
2b54359b 268#ifdef CAP_BLOCK_SUSPEND
0fd73091 269 { "block_suspend", CAP_BLOCK_SUSPEND },
2b54359b 270#endif
495d2046 271#endif
8560cd36 272};
81810dd1 273
c6d09e15
WB
274static struct limit_opt limit_opt[] = {
275#ifdef RLIMIT_AS
276 { "as", RLIMIT_AS },
277#endif
278#ifdef RLIMIT_CORE
279 { "core", RLIMIT_CORE },
280#endif
281#ifdef RLIMIT_CPU
282 { "cpu", RLIMIT_CPU },
283#endif
284#ifdef RLIMIT_DATA
285 { "data", RLIMIT_DATA },
286#endif
287#ifdef RLIMIT_FSIZE
288 { "fsize", RLIMIT_FSIZE },
289#endif
290#ifdef RLIMIT_LOCKS
291 { "locks", RLIMIT_LOCKS },
292#endif
293#ifdef RLIMIT_MEMLOCK
294 { "memlock", RLIMIT_MEMLOCK },
295#endif
296#ifdef RLIMIT_MSGQUEUE
297 { "msgqueue", RLIMIT_MSGQUEUE },
298#endif
299#ifdef RLIMIT_NICE
300 { "nice", RLIMIT_NICE },
301#endif
302#ifdef RLIMIT_NOFILE
303 { "nofile", RLIMIT_NOFILE },
304#endif
305#ifdef RLIMIT_NPROC
306 { "nproc", RLIMIT_NPROC },
307#endif
308#ifdef RLIMIT_RSS
309 { "rss", RLIMIT_RSS },
310#endif
311#ifdef RLIMIT_RTPRIO
312 { "rtprio", RLIMIT_RTPRIO },
313#endif
314#ifdef RLIMIT_RTTIME
315 { "rttime", RLIMIT_RTTIME },
316#endif
317#ifdef RLIMIT_SIGPENDING
318 { "sigpending", RLIMIT_SIGPENDING },
319#endif
320#ifdef RLIMIT_STACK
321 { "stack", RLIMIT_STACK },
322#endif
323};
324
91c3830e
SH
325static int run_buffer(char *buffer)
326{
8e7da691 327 int ret;
0fd73091
CB
328 char *output;
329 struct lxc_popen_FILE *f;
91c3830e 330
ebec9176 331 f = lxc_popen(buffer);
91c3830e 332 if (!f) {
3f60c2f7 333 SYSERROR("Failed to popen() %s", buffer);
91c3830e
SH
334 return -1;
335 }
336
337 output = malloc(LXC_LOG_BUFFER_SIZE);
338 if (!output) {
3f60c2f7 339 ERROR("Failed to allocate memory for %s", buffer);
ebec9176 340 lxc_pclose(f);
91c3830e
SH
341 return -1;
342 }
343
062b72c6 344 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
3f60c2f7 345 DEBUG("Script %s with output: %s", buffer, output);
91c3830e
SH
346
347 free(output);
348
ebec9176 349 ret = lxc_pclose(f);
8e7da691 350 if (ret == -1) {
3f60c2f7 351 SYSERROR("Script exited with error");
91c3830e 352 return -1;
8e7da691 353 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
3f60c2f7 354 ERROR("Script exited with status %d", WEXITSTATUS(ret));
8e7da691
DE
355 return -1;
356 } else if (WIFSIGNALED(ret)) {
3f60c2f7 357 ERROR("Script terminated by signal %d", WTERMSIG(ret));
8e7da691 358 return -1;
91c3830e
SH
359 }
360
361 return 0;
362}
363
14a7b0f9
CB
/* Exports a single hook environment variable and logs the outcome.
 * Returns 0 on success and -1 when setenv() failed.
 */
static int run_script_setenv(const char *key, const char *value)
{
	if (setenv(key, value, 1) < 0) {
		SYSERROR("Failed to set environment variable: %s=%s", key, value);
		return -1;
	}

	TRACE("Set environment variable: %s=%s", key, value);
	return 0;
}

/* Builds the command line for a hook script and executes it.
 *
 * @name         container name
 * @hook_version 0: name, section and hook name are passed as positional
 *               arguments; 1: hook name and section are exported through
 *               LXC_HOOK_TYPE / LXC_HOOK_SECTION (plus LXC_NET_* for the
 *               "net" section) instead
 * @section      config section the hook belongs to
 * @script       script to execute
 * @hookname     name of the hook
 * @argv         optional NULL-terminated list of extra arguments, appended
 *               to the command line
 *
 * Returns 0 on success, -EFBIG if the command line would exceed INT_MAX,
 * -ENOMEM on allocation failure and -1 on any other failure.
 */
int run_script_argv(const char *name, unsigned int hook_version,
		    const char *section, const char *script,
		    const char *hookname, char **argv)
{
	int buf_pos, i, ret;
	char *buffer;
	int fret = -1;
	size_t size = 0;

	if (hook_version == 0)
		INFO("Executing script \"%s\" for container \"%s\", config "
		     "section \"%s\"", script, name, section);
	else
		INFO("Executing script \"%s\" for container \"%s\"", script, name);

	/* Every extra argument is preceded by one space. */
	for (i = 0; argv && argv[i]; i++)
		size += strlen(argv[i]) + 1;

	/* sizeof() accounts for the space following "exec". */
	size += sizeof("exec");
	size += strlen(script);
	size++;

	if (size > INT_MAX)
		return -EFBIG;

	if (hook_version == 0) {
		size += strlen(hookname);
		size++;

		size += strlen(name);
		size++;

		size += strlen(section);
		size++;

		if (size > INT_MAX)
			return -EFBIG;
	}

	buffer = malloc(size);
	if (!buffer)
		return -ENOMEM;

	if (hook_version == 0)
		buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
	else
		buf_pos = snprintf(buffer, size, "exec %s", script);
	if (buf_pos < 0 || (size_t)buf_pos >= size) {
		ERROR("Failed to create command line for script \"%s\"", script);
		goto on_error;
	}

	if (hook_version == 1) {
		if (run_script_setenv("LXC_HOOK_TYPE", hookname) < 0)
			goto on_error;

		if (run_script_setenv("LXC_HOOK_SECTION", section) < 0)
			goto on_error;

		if (strcmp(section, "net") == 0) {
			const char *type, *parent, *peer;

			if (!argv || !argv[0])
				goto on_error;

			type = argv[0];
			parent = argv[1] ? argv[1] : "";
			/* argv[2] only exists when argv[1] is not the
			 * terminating NULL; never read past the terminator.
			 */
			peer = (argv[1] && argv[2]) ? argv[2] : "";

			if (run_script_setenv("LXC_NET_TYPE", type) < 0)
				goto on_error;

			if (strcmp(type, "macvlan") == 0 ||
			    strcmp(type, "phys") == 0) {
				if (run_script_setenv("LXC_NET_PARENT", parent) < 0)
					goto on_error;
			} else if (strcmp(type, "veth") == 0) {
				if (run_script_setenv("LXC_NET_PEER", peer) < 0)
					goto on_error;

				if (run_script_setenv("LXC_NET_PARENT", parent) < 0)
					goto on_error;
			}
		}
	}

	for (i = 0; argv && argv[i]; i++) {
		size_t len = size - buf_pos;

		ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto on_error;
		}
		buf_pos += ret;
	}

	fret = run_buffer(buffer);

on_error:
	free(buffer);
	return fret;
}
504
/* Builds "exec <script> <name> <section> [args...]" and executes it.
 *
 * The variadic arguments are char pointers and must be terminated by a NULL
 * pointer; each one is appended to the command line.
 *
 * The command line lives on the heap: its length is controlled by the caller
 * and may be far larger than what is safe to place on the stack.
 *
 * Returns the result of run_buffer() (0 on success, -1 on failure), or -1 if
 * the command line could not be assembled.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	va_list ap;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
	     script, name, section);

	/* Every extra argument is preceded by one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen("exec");
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces plus the terminating NUL byte. */
	size += 4;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer)
		return -1;

	ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size)
		goto out;

	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || (size_t)rc >= len) {
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
549
0fd73091 550/* pin_rootfs
63fc76c3 551 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
b7ed4bf0
CS
552 * the duration of the container run, to prevent the container from marking
553 * the underlying fs readonly on shutdown. unlink the file immediately so
63fc76c3
GJ
554 * no name pollution is happens.
555 * don't unlink on NFS to avoid random named stale handles.
0c547523
SH
556 * return -1 on error.
557 * return -2 if nothing needed to be pinned.
558 * return an open fd (>=0) if we pinned it.
559 */
560int pin_rootfs(const char *rootfs)
561{
0fd73091
CB
562 int fd, ret;
563 char absrootfs[MAXPATHLEN], absrootfspin[MAXPATHLEN];
0c547523 564 struct stat s;
63fc76c3 565 struct statfs sfs;
0c547523 566
e99ee0de 567 if (rootfs == NULL || strlen(rootfs) == 0)
0d03360a 568 return -2;
e99ee0de 569
00ec333b 570 if (!realpath(rootfs, absrootfs))
9be53773 571 return -2;
0c547523 572
0fd73091
CB
573 ret = stat(absrootfs, &s);
574 if (ret < 0)
0c547523 575 return -1;
0c547523 576
72f919c4 577 if (!S_ISDIR(s.st_mode))
0c547523
SH
578 return -2;
579
63fc76c3 580 ret = snprintf(absrootfspin, MAXPATHLEN, "%s/.lxc-keep", absrootfs);
00ec333b 581 if (ret >= MAXPATHLEN)
0c547523 582 return -1;
0c547523 583
0fd73091 584 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
b7ed4bf0
CS
585 if (fd < 0)
586 return fd;
0fd73091 587
205fc010
CB
588 ret = fstatfs (fd, &sfs);
589 if (ret < 0)
590 return fd;
63fc76c3
GJ
591
592 if (sfs.f_type == NFS_SUPER_MAGIC) {
205fc010 593 DEBUG("Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
63fc76c3
GJ
594 return fd;
595 }
596
b7ed4bf0 597 (void)unlink(absrootfspin);
0fd73091 598
0c547523
SH
599 return fd;
600}
601
0fd73091
CB
/* If we are asking to remount something, make sure that any NOEXEC etc. are
 * honored, and carry over the access-time flags of the existing mount.
 *
 * @s     mount source; when NULL, d is inspected instead
 * @d     mount destination
 * @flags mount flags requested by the caller
 *
 * Returns flags with the required MS_* bits added. flags is returned
 * unchanged when neither path is given, when statvfs() fails, or when the
 * build has no statvfs() support.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	int ret;
	struct statvfs sb;
	unsigned long required_flags = 0;

	if (!s)
		s = d;

	if (!s)
		return flags;

	ret = statvfs(s, &sb);
	if (ret < 0)
		return flags;

	if (flags & MS_REMOUNT) {
		if (sb.f_flag & MS_NOSUID)
			required_flags |= MS_NOSUID;
		if (sb.f_flag & MS_NODEV)
			required_flags |= MS_NODEV;
		if (sb.f_flag & MS_RDONLY)
			required_flags |= MS_RDONLY;
		if (sb.f_flag & MS_NOEXEC)
			required_flags |= MS_NOEXEC;
	}

	if (sb.f_flag & MS_NOATIME)
		required_flags |= MS_NOATIME;
	if (sb.f_flag & MS_NODIRATIME)
		required_flags |= MS_NODIRATIME;
	if (sb.f_flag & MS_LAZYTIME)
		required_flags |= MS_LAZYTIME;
	/* statvfs() reports relatime through ST_RELATIME, whose value differs
	 * from MS_RELATIME, so the ST_* constant has to be tested when the
	 * headers provide it.
	 */
#ifdef ST_RELATIME
	if (sb.f_flag & ST_RELATIME)
		required_flags |= MS_RELATIME;
#else
	if (sb.f_flag & MS_RELATIME)
		required_flags |= MS_RELATIME;
#endif
	if (sb.f_flag & MS_STRICTATIME)
		required_flags |= MS_STRICTATIME;

	return flags | required_flags;
#else
	return flags;
#endif
}
650
0d190408
LT
651static int add_shmount_to_list(struct lxc_conf *conf) {
652 char new_mount[MAXPATHLEN];
653 size_t len_mount;
654 /* Offset for the leading '/' since the path_cont
655 * is absolute inside the container */
656 int ret = -1, offset = 1;
657
658 /* +1 for the separating whitespace */
659 len_mount = strlen(conf->lxc_shmount.path_host) + 1
660 + strlen(conf->lxc_shmount.path_cont) - offset
661 + sizeof(" none bind,create=dir 0 0") - 1;
662
663 ret = snprintf(new_mount, len_mount + 1, "%s %s none bind,create=dir 0 0",
664 conf->lxc_shmount.path_host, conf->lxc_shmount.path_cont + offset);
665 if (ret < 0 || (size_t)ret >= len_mount + 1)
666 return -1;
667
668 ret = add_elem_to_mount_list(new_mount, conf);
669 if (ret < 0)
670 ERROR("Failed to add new mount \"%s\" to the config", new_mount);
671
672 return ret;
673}
674
4fb3cba5 675static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 676{
0fd73091 677 int i, r;
b06b8511
CS
678 static struct {
679 int match_mask;
680 int match_flag;
681 const char *source;
682 const char *destination;
683 const char *fstype;
684 unsigned long flags;
685 const char *options;
686 } default_mounts[] = {
0fd73091
CB
687 /* Read-only bind-mounting... In older kernels, doing that
688 * required to do one MS_BIND mount and then
689 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
690 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
691 * onwards. However, this apparently does not work on kernel
692 * 3.8. Unfortunately, on that very same kernel, doing the same
693 * trick as above doesn't seem to work either, there one needs
694 * to ALSO specify MS_BIND for the remount, otherwise the
695 * entire fs is remounted read-only or the mount fails because
696 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
697 * kernels as low as 2.6.32...
368bbc02 698 */
0fd73091 699 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a 700 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
0fd73091
CB
701 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
702 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
703 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
704 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
705 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
706 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
707 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
708 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
709 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
710 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
711 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
712 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
713 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
714 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
715 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
716 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 717 };
368bbc02 718
b06b8511 719 for (i = 0; default_mounts[i].match_mask; i++) {
0fd73091
CB
720 int saved_errno;
721 unsigned long mflags;
722 char *destination = NULL;
723 char *source = NULL;
724 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
725 continue;
726
727 if (default_mounts[i].source) {
cc4fd506 728 /* will act like strdup if %r is not present */
0fd73091
CB
729 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
730 if (!source)
cc4fd506 731 return -1;
0fd73091 732 }
f24a52d5 733
0fd73091
CB
734 if (!default_mounts[i].destination) {
735 ERROR("BUG: auto mounts destination %d was NULL", i);
b06b8511 736 free(source);
0fd73091
CB
737 return -1;
738 }
739
740 /* will act like strdup if %r is not present */
741 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
742 if (!destination) {
743 saved_errno = errno;
744 free(source);
745 errno = saved_errno;
746 return -1;
747 }
748
749 mflags = add_required_remount_flags(source, destination,
750 default_mounts[i].flags);
751 r = safe_mount(source, destination, default_mounts[i].fstype,
752 mflags, default_mounts[i].options,
753 conf->rootfs.path ? conf->rootfs.mount : NULL);
754 saved_errno = errno;
755 if (r < 0 && errno == ENOENT) {
756 INFO("Mount source or target for \"%s\" on \"%s\" does "
757 "not exist. Skipping", source, destination);
758 r = 0;
759 } else if (r < 0) {
760 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
761 }
762
763 free(source);
764 free(destination);
765 if (r < 0) {
766 errno = saved_errno;
767 return -1;
368bbc02 768 }
368bbc02
CS
769 }
770
b06b8511 771 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
772 int cg_flags;
773
3f69fb12 774 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
0fd73091
CB
775 /* If the type of cgroup mount was not specified, it depends on
776 * the container's capabilities as to what makes sense: if we
777 * have CAP_SYS_ADMIN, the read-only part can be remounted
778 * read-write anyway, so we may as well default to read-write;
779 * then the admin will not be given a false sense of security.
780 * (And if they really want mixed r/o r/w, then they can
781 * explicitly specify :mixed.) OTOH, if the container lacks
782 * CAP_SYS_ADMIN, do only default to :mixed, because then the
783 * container can't remount it read-write.
784 */
0769b82a
CS
785 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
786 int has_sys_admin = 0;
b0ee5983
CB
787
788 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 789 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 790 else
0769b82a 791 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
792
793 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 794 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 795 else
0769b82a 796 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a 797 }
0fd73091 798
3f69fb12 799 if (flags & LXC_AUTO_CGROUP_FORCE)
0fd73091
CB
800 cg_flags |= LXC_AUTO_CGROUP_FORCE;
801
2202afc9
CB
802 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
803 handler,
804 conf->rootfs.path ? conf->rootfs.mount : "",
805 cg_flags)) {
0fd73091 806 SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
b06b8511 807 return -1;
368bbc02
CS
808 }
809 }
810
0d190408
LT
811 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
812 int ret = add_shmount_to_list(conf);
813 if (ret < 0) {
814 ERROR("Failed to add shmount entry to container config");
815 return ret;
816 }
817 }
818
368bbc02 819 return 0;
368bbc02
CS
820}
821
/* Applies the configured hostname. Nothing is done when no utsname is given.
 *
 * Returns 0 on success (or when there is nothing to do) and -1 when
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	if (utsname == NULL)
		return 0;

	if (sethostname(utsname->nodename, strlen(utsname->nodename)) < 0) {
		SYSERROR("Failed to set the hostname to \"%s\"", utsname->nodename);
		return -1;
	}

	INFO("Set hostname to \"%s\"", utsname->nodename);
	return 0;
}
839
69aa6655
DE
/* A symlink to create below the container's /dev. */
struct dev_symlinks {
	const char *oldpath;	/* what the link points at */
	const char *name;	/* link name, relative to /dev */
};

/* Standard descriptor links that are expected to exist below /dev. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
851
ed8704d0 852static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655 853{
0fd73091 854 int i, ret;
69aa6655 855 char path[MAXPATHLEN];
09227be2 856 struct stat s;
69aa6655 857
69aa6655
DE
858 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
859 const struct dev_symlinks *d = &dev_symlinks[i];
0fd73091
CB
860
861 ret = snprintf(path, sizeof(path), "%s/dev/%s",
862 rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
863 if (ret < 0 || ret >= MAXPATHLEN)
864 return -1;
09227be2 865
0fd73091
CB
866 /* Stat the path first. If we don't get an error accept it as
867 * is and don't try to create it
09227be2 868 */
0fd73091
CB
869 ret = stat(path, &s);
870 if (ret == 0)
09227be2 871 continue;
09227be2 872
69aa6655
DE
873 ret = symlink(d->oldpath, path);
874 if (ret && errno != EEXIST) {
0fd73091
CB
875 if (errno == EROFS) {
876 WARN("Failed to create \"%s\". Read-only filesystem", path);
09227be2 877 } else {
0fd73091 878 SYSERROR("Failed to create \"%s\"", path);
09227be2
MW
879 return -1;
880 }
69aa6655
DE
881 }
882 }
0fd73091 883
69aa6655
DE
884 return 0;
885}
886
/* Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) the string "container_ttys=<name>" is
 * allocated; later calls grow it and append " <name>".
 *
 * Returns true on success and false on allocation failure, in which case
 * *pp is left as it was.
 */
static bool append_ttyname(char **pp, char *name)
{
	char *grown;
	size_t oldlen, newsize;

	if (*pp == NULL) {
		newsize = strlen("container_ttys=") + strlen(name) + 1;

		*pp = malloc(newsize);
		if (*pp == NULL)
			return false;

		(void)snprintf(*pp, newsize, "container_ttys=%s", name);
		return true;
	}

	oldlen = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	newsize = oldlen + strlen(name) + 2;

	grown = realloc(*pp, newsize);
	if (grown == NULL)
		return false;

	*pp = grown;
	(void)snprintf(grown + oldlen, newsize - oldlen, " %s", name);

	return true;
}
913
2187efd3 914static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 915{
9e1045e3 916 int i, ret;
0e4be3cf 917 const struct lxc_tty_info *ttys = &conf->ttys;
885766f5 918 char *ttydir = ttys->dir;
7c6ef2a2 919 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 920
e8bd4e43 921 if (!conf->rootfs.path)
bc9bd0e3
DL
922 return 0;
923
885766f5 924 for (i = 0; i < ttys->max; i++) {
0e4be3cf 925 struct lxc_terminal_info *tty = &ttys->tty[i];
b0a33c1e 926
e8bd4e43 927 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 928 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 929 return -1;
9e1045e3 930
7c6ef2a2
SH
931 if (ttydir) {
932 /* create dev/lxc/tty%d" */
9e1045e3
CB
933 ret = snprintf(lxcpath, sizeof(lxcpath),
934 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 935 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 936 return -1;
9e1045e3 937
3b7e332f 938 ret = mknod(path, S_IFREG | 0000, 0);
9e1045e3 939 if (ret < 0 && errno != EEXIST) {
73363c61 940 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
941 return -1;
942 }
9e1045e3 943
7c6ef2a2 944 ret = unlink(path);
9e1045e3 945 if (ret < 0 && errno != ENOENT) {
73363c61 946 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
947 return -1;
948 }
b0a33c1e 949
2520facd 950 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
9e1045e3 951 if (ret < 0) {
73363c61 952 WARN("Failed to bind mount \"%s\" onto \"%s\"",
2520facd 953 tty->name, path);
7c6ef2a2
SH
954 continue;
955 }
0fd73091 956 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name,
9e1045e3 957 path);
13954cce 958
9e1045e3
CB
959 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
960 ttydir, i + 1);
73363c61 961 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 962 return -1;
9e1045e3 963
7c6ef2a2 964 ret = symlink(lxcpath, path);
9e1045e3 965 if (ret < 0) {
73363c61 966 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 967 path, lxcpath);
7c6ef2a2
SH
968 return -1;
969 }
970 } else {
9e1045e3
CB
971 /* If we populated /dev, then we need to create
972 * /dev/ttyN
973 */
d3ccc04e
CB
974 ret = mknod(path, S_IFREG | 0000, 0);
975 if (ret < 0) /* this isn't fatal, continue */
6d1400b5 976 SYSERROR("Failed to create \"%s\"", path);
9e1045e3 977
2520facd 978 ret = mount(tty->name, path, "none", MS_BIND, 0);
9e1045e3 979 if (ret < 0) {
2520facd 980 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
7c6ef2a2
SH
981 continue;
982 }
9e1045e3 983
d3ccc04e 984 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
393903d1 985 }
9e1045e3 986
885766f5 987 if (!append_ttyname(&conf->ttys.tty_names, tty->name)) {
393903d1
SH
988 ERROR("Error setting up container_ttys string");
989 return -1;
b0a33c1e 990 }
991 }
992
885766f5 993 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
b0a33c1e 994 return 0;
995}
996
663014ee 997int lxc_allocate_ttys(struct lxc_conf *conf)
2187efd3 998{
2187efd3 999 int i, ret;
0fd73091 1000 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3
CB
1001
1002 /* no tty in the configuration */
885766f5 1003 if (ttys->max == 0)
2187efd3
CB
1004 return 0;
1005
885766f5 1006 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
0e4be3cf 1007 if (!ttys->tty)
2187efd3 1008 return -ENOMEM;
2187efd3 1009
885766f5 1010 for (i = 0; i < ttys->max; i++) {
0e4be3cf 1011 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1012
386e6768
CB
1013 tty->master = -EBADF;
1014 tty->slave = -EBADF;
77a39805
CB
1015 ret = openpty(&tty->master, &tty->slave, NULL, NULL, NULL);
1016 if (ret < 0) {
0fd73091 1017 SYSERROR("Failed to create tty %d", i);
885766f5 1018 ttys->max = i;
0e4be3cf 1019 lxc_delete_tty(ttys);
2187efd3
CB
1020 return -ENOTTY;
1021 }
1022
77a39805
CB
1023 ret = ttyname_r(tty->slave, tty->name, sizeof(tty->name));
1024 if (ret < 0) {
1025 SYSERROR("Failed to retrieve name of tty %d slave", i);
1026 ttys->max = i;
1027 lxc_delete_tty(ttys);
1028 return -ENOTTY;
1029 }
1030
0fd73091 1031 DEBUG("Created tty \"%s\" with master fd %d and slave fd %d",
2520facd 1032 tty->name, tty->master, tty->slave);
2187efd3
CB
1033
1034 /* Prevent leaking the file descriptors to the container */
615f24ff 1035 ret = fd_cloexec(tty->master, true);
2187efd3 1036 if (ret < 0)
a24c5678 1037 SYSWARN("Failed to set FD_CLOEXEC flag on master fd %d of "
1038 "tty device \"%s\"", tty->master, tty->name);
2187efd3 1039
615f24ff 1040 ret = fd_cloexec(tty->slave, true);
2187efd3 1041 if (ret < 0)
a24c5678 1042 SYSWARN("Failed to set FD_CLOEXEC flag on slave fd %d of "
1043 "tty device \"%s\"", tty->slave, tty->name);
2187efd3 1044
2520facd 1045 tty->busy = 0;
2187efd3
CB
1046 }
1047
885766f5 1048 INFO("Finished creating %zu tty devices", ttys->max);
2187efd3
CB
1049 return 0;
1050}
1051
0e4be3cf 1052void lxc_delete_tty(struct lxc_tty_info *ttys)
2187efd3
CB
1053{
1054 int i;
1055
386e6768
CB
1056 if (!ttys->tty)
1057 return;
1058
885766f5 1059 for (i = 0; i < ttys->max; i++) {
0e4be3cf 1060 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1061
386e6768
CB
1062 if (tty->master >= 0) {
1063 close(tty->master);
1064 tty->master = -EBADF;
1065 }
1066
1067 if (tty->slave >= 0) {
1068 close(tty->slave);
1069 tty->slave = -EBADF;
1070 }
2187efd3
CB
1071 }
1072
0e4be3cf
CB
1073 free(ttys->tty);
1074 ttys->tty = NULL;
2187efd3
CB
1075}
1076
1077static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1078{
1079 int i;
0fd73091 1080 int ret = -1;
2187efd3 1081 struct lxc_conf *conf = handler->conf;
0e4be3cf 1082 struct lxc_tty_info *ttys = &conf->ttys;
2187efd3 1083 int sock = handler->data_sock[0];
2187efd3 1084
885766f5 1085 if (ttys->max == 0)
2187efd3
CB
1086 return 0;
1087
885766f5 1088 for (i = 0; i < ttys->max; i++) {
2187efd3 1089 int ttyfds[2];
0e4be3cf 1090 struct lxc_terminal_info *tty = &ttys->tty[i];
2187efd3 1091
2520facd
CB
1092 ttyfds[0] = tty->master;
1093 ttyfds[1] = tty->slave;
2187efd3
CB
1094
1095 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1096 if (ret < 0)
1097 break;
1098
0fd73091 1099 TRACE("Sent ty \"%s\" with master fd %d and slave fd %d to "
2520facd 1100 "parent", tty->name, tty->master, tty->slave);
2187efd3
CB
1101 }
1102
1103 if (ret < 0)
6d1400b5 1104 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
2187efd3 1105 else
885766f5 1106 TRACE("Sent %zu ttys to parent", ttys->max);
2187efd3
CB
1107
1108 return ret;
1109}
1110
1111static int lxc_create_ttys(struct lxc_handler *handler)
1112{
1113 int ret = -1;
1114 struct lxc_conf *conf = handler->conf;
1115
663014ee 1116 ret = lxc_allocate_ttys(conf);
2187efd3
CB
1117 if (ret < 0) {
1118 ERROR("Failed to allocate ttys");
1119 goto on_error;
1120 }
1121
1122 ret = lxc_send_ttys_to_parent(handler);
1123 if (ret < 0) {
1124 ERROR("Failed to send ttys to parent");
1125 goto on_error;
1126 }
1127
1128 if (!conf->is_execute) {
1129 ret = lxc_setup_ttys(conf);
1130 if (ret < 0) {
1131 ERROR("Failed to setup ttys");
1132 goto on_error;
1133 }
1134 }
1135
885766f5
CB
1136 if (conf->ttys.tty_names) {
1137 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
2187efd3 1138 if (ret < 0)
885766f5 1139 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
2187efd3
CB
1140 }
1141
1142 ret = 0;
1143
1144on_error:
0e4be3cf 1145 lxc_delete_tty(&conf->ttys);
2187efd3
CB
1146
1147 return ret;
1148}
1149
/* pivot_root() into @rootfs without needing a put_old directory.
 *
 * Sequence: open fds for the old and the new root, fchdir() into the new
 * root, pivot_root(".", "."), fchdir() back into the old root, make it
 * rslave, lazily detach it and finally fchdir() into the new root again.
 *
 * Returns 0 on success and -1 on failure; both fds are closed either way.
 */
static int setup_rootfs_pivot_root(const char *rootfs)
{
	int rc = -1;
	int old_fd, new_fd = -1;

	old_fd = open("/", O_DIRECTORY | O_RDONLY);
	if (old_fd < 0) {
		SYSERROR("Failed to open old root directory");
		return -1;
	}

	new_fd = open(rootfs, O_DIRECTORY | O_RDONLY);
	if (new_fd < 0) {
		SYSERROR("Failed to open new root directory");
		goto out;
	}

	/* Enter the new root filesystem. */
	if (fchdir(new_fd) < 0) {
		SYSERROR("Failed to change to new rootfs \"%s\"", rootfs);
		goto out;
	}

	/* Make the new root filesystem our root. */
	if (pivot_root(".", ".") < 0) {
		SYSERROR("Failed to pivot_root()");
		goto out;
	}

	/* The old root is now stacked on top of the new root. It cannot be
	 * unmounted while it is our working directory's parent mount, so step
	 * back into the old root first.
	 */
	if (fchdir(old_fd) < 0) {
		SYSERROR("Failed to enter old root directory");
		goto out;
	}

	/* Mark the old root rslave so the umount below does not propagate to
	 * the host.
	 */
	if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
		SYSERROR("Failed to make oldroot rslave");
		goto out;
	}

	if (umount2(".", MNT_DETACH) < 0) {
		SYSERROR("Failed to detach old root directory");
		goto out;
	}

	if (fchdir(new_fd) < 0) {
		SYSERROR("Failed to re-enter new root directory");
		goto out;
	}

	DEBUG("pivot_root(\"%s\") successful", rootfs);
	rc = 0;

out:
	if (old_fd >= 0)
		close(old_fd);
	if (new_fd >= 0)
		close(new_fd);

	return rc;
}
1227
7133b912
CB
1228/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1229 * error, log it but don't fail yet.
91c3830e 1230 */
7133b912
CB
1231static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1232 const char *lxcpath)
91c3830e
SH
1233{
1234 int ret;
87da4ec3
SH
1235 size_t clen;
1236 char *path;
87e0e273 1237 mode_t cur_mask;
91c3830e 1238
7133b912 1239 INFO("Preparing \"/dev\"");
bc6928ff 1240
14221cbb 1241 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1242 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1243 path = alloca(clen);
bc6928ff 1244
ec50007f 1245 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1246 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1247 return -1;
bc6928ff 1248
87e0e273
CB
1249 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1250 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1251 if (ret < 0 && errno != EEXIST) {
1252 SYSERROR("Failed to create \"/dev\" directory");
1253 ret = -errno;
1254 goto reset_umask;
bc6928ff 1255 }
87da4ec3 1256
1ec0e8e3 1257 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1258 rootfs->path ? rootfs->mount : NULL);
1259 if (ret < 0) {
1260 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
87e0e273 1261 goto reset_umask;
91c3830e 1262 }
87e0e273 1263 TRACE("Mounted tmpfs on \"%s\"", path);
87da4ec3 1264
ec50007f 1265 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87e0e273
CB
1266 if (ret < 0 || (size_t)ret >= clen) {
1267 ret = -1;
1268 goto reset_umask;
1269 }
87da4ec3 1270
7133b912 1271 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1272 * If not, then create it and exit if that fails...
1273 */
87e0e273
CB
1274 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1275 if (ret < 0 && errno != EEXIST) {
1276 SYSERROR("Failed to create directory \"%s\"", path);
1277 ret = -errno;
1278 goto reset_umask;
91c3830e
SH
1279 }
1280
87e0e273
CB
1281 ret = 0;
1282
1283reset_umask:
1284 (void)umask(cur_mask);
1285
7133b912 1286 INFO("Prepared \"/dev\"");
87e0e273 1287 return ret;
91c3830e
SH
1288}
1289
/* Description of one device node placed into the container's /dev. */
struct lxc_device_node {
	const char *name;  /* file name below /dev */
	const mode_t mode; /* file type and permission bits handed to mknod() */
	const int maj;     /* device major number */
	const int min;     /* device minor number */
};

/* Character devices that lxc_fill_autodev() creates or bind-mounts. */
static const struct lxc_device_node lxc_devices[] = {
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
};
1305
5067e4dd
CB
1306
/* Strategy state used by lxc_fill_autodev(). The numeric order matters: the
 * function compares against LXC_DEVNODE_MKNOD with ">=".
 */
enum {
	LXC_DEVNODE_BIND    = 0, /* mknod() hit EPERM; bind-mount host nodes */
	LXC_DEVNODE_MKNOD   = 1, /* initial state: try mknod() */
	LXC_DEVNODE_PARTIAL = 2, /* node was created but could not be opened */
	LXC_DEVNODE_OPEN    = 3, /* node was created and could be opened */
};
1313
27245ff7 1314static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38 1315{
5e73416f 1316 int i, ret;
c6883f38 1317 char path[MAXPATHLEN];
3a32201c 1318 mode_t cmask;
5067e4dd 1319 int use_mknod = LXC_DEVNODE_MKNOD;
c6883f38 1320
3999be0a
CB
1321 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1322 rootfs->path ? rootfs->mount : "");
1323 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1324 return -1;
91c3830e 1325
0bbf8572
CB
1326 /* ignore, just don't try to fill in */
1327 if (!dir_exists(path))
9cb4d183
SH
1328 return 0;
1329
3999be0a
CB
1330 INFO("Populating \"/dev\"");
1331
3a32201c 1332 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
5e73416f
CB
1333 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
1334 char hostpath[MAXPATHLEN];
1335 const struct lxc_device_node *device = &lxc_devices[i];
0728ebf4 1336
3999be0a 1337 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
5e73416f 1338 rootfs->path ? rootfs->mount : "", device->name);
c6883f38
SH
1339 if (ret < 0 || ret >= MAXPATHLEN)
1340 return -1;
0bbf8572 1341
5067e4dd 1342 if (use_mknod >= LXC_DEVNODE_MKNOD) {
5e73416f
CB
1343 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1344 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1345 DEBUG("Created device node \"%s\"", path);
5067e4dd
CB
1346 } else if (ret < 0) {
1347 if (errno != EPERM) {
1348 SYSERROR("Failed to create device node \"%s\"", path);
1349 return -1;
1350 }
0bbf8572 1351
5067e4dd 1352 use_mknod = LXC_DEVNODE_BIND;
9cb4d183 1353 }
3999be0a 1354
5067e4dd
CB
1355 /* Device nodes are fully useable. */
1356 if (use_mknod == LXC_DEVNODE_OPEN)
1357 continue;
1358
1359 if (use_mknod == LXC_DEVNODE_MKNOD) {
1360 /* See
1361 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1362 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1363 */
1364 ret = open(path, O_RDONLY | O_CLOEXEC);
1365 if (ret >= 0) {
1366 close(ret);
1367 /* Device nodes are fully useable. */
1368 use_mknod = LXC_DEVNODE_OPEN;
1369 continue;
1370 }
1371
1372 SYSTRACE("Failed to open \"%s\" device", path);
1373 /* Device nodes are only partially useable. */
1374 use_mknod = LXC_DEVNODE_PARTIAL;
1375 }
5e73416f
CB
1376 }
1377
5067e4dd
CB
1378 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1379 /* If we are dealing with partially functional device
1380 * nodes the prio mknod() call will have created the
1381 * device node so we can use it as a bind-mount target.
1382 */
1383 ret = mknod(path, S_IFREG | 0000, 0);
1384 if (ret < 0 && errno != EEXIST) {
1385 SYSERROR("Failed to create file \"%s\"", path);
1386 return -1;
1387 }
5e73416f
CB
1388 }
1389
1390 /* Fallback to bind-mounting the device from the host. */
1391 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", device->name);
1392 if (ret < 0 || ret >= MAXPATHLEN)
1393 return -1;
1394
1395 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1396 rootfs->path ? rootfs->mount : NULL);
1397 if (ret < 0) {
1398 SYSERROR("Failed to bind mount host device node \"%s\" "
1399 "onto \"%s\"", hostpath, path);
1400 return -1;
c6883f38 1401 }
5e73416f
CB
1402 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
1403 hostpath, path);
c6883f38 1404 }
5e73416f 1405 (void)umask(cmask);
c6883f38 1406
3999be0a 1407 INFO("Populated \"/dev\"");
c6883f38
SH
1408 return 0;
1409}
1410
9aa76a17 1411static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1412{
9aa76a17 1413 int ret;
10bc1861 1414 struct lxc_storage *bdev;
91c3e281 1415 const struct lxc_rootfs *rootfs;
cc28d0b0 1416
91c3e281 1417 rootfs = &conf->rootfs;
a0f379bf 1418 if (!rootfs->path) {
0fd73091
CB
1419 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1420 if (ret < 0) {
1421 SYSERROR("Failed to make / rslave");
a0f379bf
DW
1422 return -1;
1423 }
0fd73091 1424
c69bd12f 1425 return 0;
a0f379bf 1426 }
0ad19a3f 1427
0fd73091
CB
1428 ret = access(rootfs->mount, F_OK);
1429 if (ret != 0) {
1430 SYSERROR("Failed to access to \"%s\". Check it is present",
12297168 1431 rootfs->mount);
b1789442
DL
1432 return -1;
1433 }
1434
8a388ed4 1435 bdev = storage_init(conf);
9aa76a17 1436 if (!bdev) {
0fd73091 1437 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1438 rootfs->path, rootfs->mount,
1439 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1440 return -1;
9be53773 1441 }
9aa76a17
CB
1442
1443 ret = bdev->ops->mount(bdev);
10bc1861 1444 storage_put(bdev);
9aa76a17 1445 if (ret < 0) {
0fd73091 1446 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1447 rootfs->path, rootfs->mount,
1448 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1449 return -1;
1450 }
0ad19a3f 1451
0fd73091 1452 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
91c3e281
CB
1453 rootfs->path, rootfs->mount,
1454 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1455
ac778708
DL
1456 return 0;
1457}
1458
91e93c71
AV
1459int prepare_ramfs_root(char *root)
1460{
0fd73091
CB
1461 int i, ret;
1462 char *p, *p2;
1463 char buf[LXC_LINELEN], nroot[PATH_MAX];
91e93c71 1464 FILE *f;
91e93c71 1465
0fd73091
CB
1466 if (!realpath(root, nroot))
1467 return -1;
91e93c71 1468
0fd73091
CB
1469 ret = chdir("/");
1470 if (ret < 0)
1471 return -1;
91e93c71 1472
0fd73091
CB
1473 /* We could use here MS_MOVE, but in userns this mount is locked and
1474 * can't be moved.
91e93c71 1475 */
0fd73091
CB
1476 ret = mount(root, "/", NULL, MS_REC | MS_BIND, NULL);
1477 if (ret < 0) {
1478 SYSERROR("Failed to move \"%s\" into \"/\"", root);
1479 return -1;
91e93c71
AV
1480 }
1481
0fd73091
CB
1482 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
1483 if (ret < 0) {
1484 SYSERROR("Failed to make \"/\" rprivate");
1485 return -1;
91e93c71
AV
1486 }
1487
0fd73091
CB
1488 /* The following code cleans up inhereted mounts which are not required
1489 * for CT.
91e93c71
AV
1490 *
1491 * The mountinfo file shows not all mounts, if a few points have been
1492 * unmounted between read operations from the mountinfo. So we need to
1493 * read mountinfo a few times.
1494 *
1495 * This loop can be skipped if a container uses unserns, because all
1496 * inherited mounts are locked and we should live with all this trash.
1497 */
0fd73091 1498 for (;;) {
91e93c71
AV
1499 int progress = 0;
1500
1501 f = fopen("./proc/self/mountinfo", "r");
1502 if (!f) {
1503 SYSERROR("Unable to open /proc/self/mountinfo");
1504 return -1;
1505 }
0fd73091 1506
eab15c1e 1507 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1508 for (p = buf, i=0; p && i < 4; i++)
1509 p = strchr(p+1, ' ');
0fd73091 1510
91e93c71
AV
1511 if (!p)
1512 continue;
0fd73091 1513
91e93c71
AV
1514 p2 = strchr(p+1, ' ');
1515 if (!p2)
1516 continue;
1517
1518 *p2 = '\0';
1519 *p = '.';
1520
1521 if (strcmp(p + 1, "/") == 0)
1522 continue;
0fd73091 1523
91e93c71
AV
1524 if (strcmp(p + 1, "/proc") == 0)
1525 continue;
1526
0fd73091
CB
1527 ret = umount2(p, MNT_DETACH);
1528 if (ret == 0)
91e93c71
AV
1529 progress++;
1530 }
0fd73091 1531
91e93c71 1532 fclose(f);
0fd73091 1533
91e93c71
AV
1534 if (!progress)
1535 break;
1536 }
1537
0fd73091
CB
1538 /* This also can be skipped if a container uses unserns. */
1539 (void)umount2("./proc", MNT_DETACH);
91e93c71
AV
1540
1541 /* It is weird, but chdir("..") moves us in a new root */
0fd73091
CB
1542 ret = chdir("..");
1543 if (ret < 0) {
91e93c71
AV
1544 SYSERROR("Unable to change working directory");
1545 return -1;
1546 }
1547
0fd73091
CB
1548 ret = chroot(".");
1549 if (ret < 0) {
91e93c71
AV
1550 SYSERROR("Unable to chroot");
1551 return -1;
1552 }
1553
1554 return 0;
1555}
1556
74a3920a 1557static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1558{
0fd73091
CB
1559 int ret;
1560
39c7b795 1561 if (!rootfs->path) {
0fd73091 1562 DEBUG("Container does not have a rootfs");
ac778708 1563 return 0;
39c7b795 1564 }
ac778708 1565
91e93c71 1566 if (detect_ramfs_rootfs()) {
0fd73091
CB
1567 DEBUG("Detected that container is on ramfs");
1568
1569 ret = prepare_ramfs_root(rootfs->mount);
1570 if (ret < 0) {
1571 ERROR("Failed to prepare minimal ramfs root");
91e93c71 1572 return -1;
39c7b795
CB
1573 }
1574
0fd73091 1575 DEBUG("Prepared ramfs root for container");
39c7b795
CB
1576 return 0;
1577 }
1578
0fd73091
CB
1579 ret = setup_rootfs_pivot_root(rootfs->mount);
1580 if (ret < 0) {
1581 ERROR("Failed to pivot_root()");
25368b52 1582 return -1;
c69bd12f
DL
1583 }
1584
0fd73091 1585 DEBUG("Finished pivot_root()");
25368b52 1586 return 0;
0ad19a3f 1587}
1588
5173b710 1589static const struct id_map *find_mapped_nsid_entry(struct lxc_conf *conf, unsigned id,
f4900711
CB
1590 enum idtype idtype)
1591{
1592 struct lxc_list *it;
1593 struct id_map *map;
1594 struct id_map *retmap = NULL;
1595
dcf0ffdf
CB
1596 /* Shortcut for container's root mappings. */
1597 if (id == 0) {
1598 if (idtype == ID_TYPE_UID)
1599 return conf->root_nsuid_map;
1600
1601 if (idtype == ID_TYPE_GID)
1602 return conf->root_nsgid_map;
1603 }
1604
f4900711
CB
1605 lxc_list_for_each(it, &conf->id_map) {
1606 map = it->elem;
1607 if (map->idtype != idtype)
1608 continue;
1609
1610 if (id >= map->nsid && id < map->nsid + map->range) {
1611 retmap = map;
1612 break;
1613 }
1614 }
1615
1616 return retmap;
1617}
1618
1619static int lxc_setup_devpts(struct lxc_conf *conf)
3c26f34e 1620{
70761e5e 1621 int ret;
11293068 1622 const char *default_devpts_mntopts = "gid=5,newinstance,ptmxmode=0666,mode=0620";
9d28c4f9 1623 char devpts_mntopts[256];
77890c6d 1624
e528c735 1625 if (conf->pty_max <= 0) {
0fd73091 1626 DEBUG("No new devpts instance will be mounted since no pts "
70761e5e 1627 "devices are requested");
d852c78c 1628 return 0;
3c26f34e 1629 }
1630
e528c735
CB
1631 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1632 default_devpts_mntopts, conf->pty_max);
9d28c4f9
CB
1633 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1634 return -1;
1635
77f94854
CB
1636 ret = umount2("/dev/pts", MNT_DETACH);
1637 if (ret < 0)
a24c5678 1638 SYSWARN("Failed to unmount old devpts instance");
77f94854 1639 else
0fd73091 1640 DEBUG("Unmounted old devpts instance");
7e40254a 1641
70761e5e
CB
1642 /* Create mountpoint for devpts instance. */
1643 ret = mkdir("/dev/pts", 0755);
1644 if (ret < 0 && errno != EEXIST) {
0fd73091 1645 SYSERROR("Failed to create \"/dev/pts\" directory");
3c26f34e 1646 return -1;
1647 }
1648
11293068 1649 /* mount new devpts instance */
f4900711 1650 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, devpts_mntopts);
70761e5e 1651 if (ret < 0) {
11293068
CB
1652 /* try mounting without gid=5 */
1653 ret = mount("devpts", "/dev/pts", "devpts",
1654 MS_NOSUID | MS_NOEXEC, devpts_mntopts + sizeof("gid=5"));
1655 if (ret < 0) {
1656 SYSERROR("Failed to mount new devpts instance");
1657 return -1;
1658 }
70761e5e 1659 }
0fd73091 1660 DEBUG("Mount new devpts instance with options \"%s\"", devpts_mntopts);
70761e5e 1661
d5cb35d6 1662 /* Remove any pre-existing /dev/ptmx file. */
b29e05d6
CB
1663 ret = remove("/dev/ptmx");
1664 if (ret < 0) {
1665 if (errno != ENOENT) {
0fd73091 1666 SYSERROR("Failed to remove existing \"/dev/ptmx\" file");
d5cb35d6 1667 return -1;
70761e5e 1668 }
b29e05d6 1669 } else {
0fd73091 1670 DEBUG("Removed existing \"/dev/ptmx\" file");
3c26f34e 1671 }
1672
d5cb35d6 1673 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
3b7e332f
CB
1674 ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
1675 if (ret < 0 && errno != EEXIST) {
0fd73091 1676 SYSERROR("Failed to create dummy \"/dev/ptmx\" file as bind mount target");
d5cb35d6
CB
1677 return -1;
1678 }
0fd73091 1679 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
77890c6d 1680
d5cb35d6 1681 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
e87bd19c 1682 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
d5cb35d6 1683 if (!ret) {
0fd73091 1684 DEBUG("Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1685 return 0;
1686 } else {
1687 /* Fallthrough and try to create a symlink. */
0fd73091 1688 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
d5cb35d6
CB
1689 }
1690
1691 /* Remove the dummy /dev/ptmx file we created above. */
1692 ret = remove("/dev/ptmx");
70761e5e 1693 if (ret < 0) {
0fd73091 1694 SYSERROR("Failed to remove existing \"/dev/ptmx\"");
d5cb35d6
CB
1695 return -1;
1696 }
1697
1698 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1699 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
1700 if (ret < 0) {
0fd73091 1701 SYSERROR("Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
3c26f34e 1702 return -1;
1703 }
0fd73091 1704 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
cd54d859 1705
3c26f34e 1706 return 0;
1707}
1708
cccc74b5
DL
/* Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build lacks
 * <sys/personality.h> support (HAVE_SYS_PERSONALITY_H unset) the function is
 * a no-op.
 *
 * Returns 0 on success or when nothing had to be done, -1 on failure.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		int rc = personality(persona);

		if (rc < 0) {
			SYSERROR("Failed to set personality to \"0x%x\"", persona);
			return -1;
		}

		INFO("Set personality to \"0x%x\"", persona);
	}
#endif

	return 0;
}
1728
3d7d929a 1729static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
dcad02f8 1730 const struct lxc_terminal *console)
6e590161 1731{
882671aa 1732 int ret;
63376d7d 1733 char path[MAXPATHLEN];
86530b0a 1734 char *rootfs_path = rootfs->path ? rootfs->mount : "";
52e35957 1735
8b1b1210
CB
1736 if (console->path && !strcmp(console->path, "none"))
1737 return 0;
1738
86530b0a 1739 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3d7d929a 1740 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1741 return -1;
52e35957 1742
8b1b1210
CB
1743 /* When we are asked to setup a console we remove any previous
1744 * /dev/console bind-mounts.
1745 */
a7ba3c7f
CB
1746 if (file_exists(path)) {
1747 ret = lxc_unstack_mountpoint(path, false);
1748 if (ret < 0) {
6d1400b5 1749 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1750 return -ret;
1751 } else {
86530b0a 1752 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1753 }
8b1b1210
CB
1754 }
1755
1756 /* For unprivileged containers autodev or automounts will already have
1757 * taken care of creating /dev/console.
1758 */
882671aa 1759 ret = mknod(path, S_IFREG | 0000, 0);
3b7e332f
CB
1760 if (ret < 0 && errno != EEXIST) {
1761 SYSERROR("Failed to create console");
1762 return -errno;
52e35957
DL
1763 }
1764
882671aa 1765 ret = fchmod(console->slave, S_IXUSR | S_IXGRP | S_IXOTH);
86530b0a 1766 if (ret < 0) {
0fd73091
CB
1767 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1768 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
3d7d929a 1769 return -errno;
63376d7d 1770 }
13954cce 1771
86530b0a
L
1772 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
1773 if (ret < 0) {
0fd73091 1774 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, path);
6e590161 1775 return -1;
1776 }
1777
86530b0a 1778 DEBUG("Mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1779 return 0;
1780}
1781
3d7d929a 1782static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
dcad02f8 1783 const struct lxc_terminal *console,
3d7d929a 1784 char *ttydir)
7c6ef2a2 1785{
3b7e332f 1786 int ret;
3d7d929a 1787 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
86530b0a 1788 char *rootfs_path = rootfs->path ? rootfs->mount : "";
7c6ef2a2 1789
3dc035f1
L
1790 if (console->path && !strcmp(console->path, "none"))
1791 return 0;
1792
7c6ef2a2 1793 /* create rootfs/dev/<ttydir> directory */
86530b0a 1794 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
3d7d929a 1795 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1796 return -1;
3d7d929a 1797
7c6ef2a2
SH
1798 ret = mkdir(path, 0755);
1799 if (ret && errno != EEXIST) {
0fd73091 1800 SYSERROR("Failed to create \"%s\"", path);
3d7d929a 1801 return -errno;
7c6ef2a2 1802 }
4742cd9a 1803 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1804
86530b0a 1805 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
3d7d929a
CB
1806 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1807 return -1;
1808
3b7e332f
CB
1809 ret = mknod(lxcpath, S_IFREG | 0000, 0);
1810 if (ret < 0 && errno != EEXIST) {
0fd73091 1811 SYSERROR("Failed to create \"%s\"", lxcpath);
3d7d929a 1812 return -errno;
7c6ef2a2 1813 }
7c6ef2a2 1814
86530b0a 1815 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
3dc035f1 1816 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1817 return -1;
2a12fefd 1818
3dc035f1 1819 if (file_exists(path)) {
a7ba3c7f 1820 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1821 if (ret < 0) {
6d1400b5 1822 SYSERROR("Failed to unmount \"%s\"", path);
a7ba3c7f
CB
1823 return -ret;
1824 } else {
86530b0a 1825 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
a7ba3c7f 1826 }
3dc035f1 1827 }
2a12fefd 1828
3b7e332f
CB
1829 ret = mknod(path, S_IFREG | 0000, 0);
1830 if (ret < 0 && errno != EEXIST) {
1831 SYSERROR("Failed to create console");
1832 return -errno;
7c6ef2a2
SH
1833 }
1834
3b7e332f 1835 ret = fchmod(console->slave, S_IXUSR | S_IXGRP | S_IXOTH);
86530b0a 1836 if (ret < 0) {
0fd73091
CB
1837 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1838 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
2a12fefd
CB
1839 return -errno;
1840 }
1841
3dc035f1 1842 /* bind mount console->name to '/dev/<ttydir>/console' */
86530b0a
L
1843 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1844 if (ret < 0) {
0fd73091 1845 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1846 return -1;
1847 }
86530b0a 1848 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1
L
1849
1850 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
86530b0a
L
1851 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1852 if (ret < 0) {
0fd73091 1853 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
3dc035f1
L
1854 return -1;
1855 }
86530b0a 1856 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
3dc035f1 1857
86530b0a 1858 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
6e590161 1859 return 0;
1860}
1861
/* Set up the container console, either directly at /dev/console or, when
 * @ttydir is given, below /dev/<ttydir>. Returns what the chosen helper
 * returns.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir)
		      : lxc_setup_dev_console(rootfs, console);
}
1871
efed99a4 1872static void parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
998ac676
RT
1873{
1874 struct mount_opt *mo;
1875
1876 /* If opt is found in mount_opt, set or clear flags.
1877 * Otherwise append it to data. */
1878
1879 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
0fd73091 1880 if (strncmp(opt, mo->name, strlen(mo->name)) == 0) {
998ac676
RT
1881 if (mo->clear)
1882 *flags &= ~mo->flag;
1883 else
1884 *flags |= mo->flag;
1885 return;
1886 }
1887 }
1888
f1e05b90
DJ
1889 if (strlen(*data))
1890 (void)strlcat(*data, ",", size);
efed99a4 1891
f1e05b90 1892 (void)strlcat(*data, opt, size);
998ac676
RT
1893}
1894
/* Split the comma-separated option string @mntopts into mount flags and
 * filesystem-specific data.
 *
 * *@mntflags receives the flags of all recognised options. *@mntdata
 * receives a newly allocated comma-separated string of the remaining
 * options, which the caller must free(), or NULL when there are none.
 *
 * Returns 0 on success (also for a NULL @mntopts) and -1 when memory cannot
 * be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
{
	char *data, *copy, *tok;
	char *saveptr = NULL;
	size_t size;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy)
		return -1;

	/* The leftover options can never be longer than the input. */
	size = strlen(copy) + 1;
	data = malloc(size);
	if (!data) {
		free(copy);
		return -1;
	}
	*data = 0;

	/* Keep @copy pointing at the allocation so that it can be freed
	 * below; only the first strtok_r() call takes the string itself.
	 */
	for (tok = strtok_r(copy, ",", &saveptr); tok;
	     tok = strtok_r(NULL, ",", &saveptr))
		parse_mntopt(tok, mntflags, &data, size);

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(copy);

	return 0;
}
1930
d840039e
YT
1931static void parse_propagationopt(char *opt, unsigned long *flags)
1932{
1933 struct mount_opt *mo;
1934
1935 /* If opt is found in propagation_opt, set or clear flags. */
d840039e 1936 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
0fd73091
CB
1937 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1938 continue;
1939
1940 if (mo->clear)
1941 *flags &= ~mo->flag;
1942 else
1943 *flags |= mo->flag;
1944
1945 return;
d840039e
YT
1946 }
1947}
1948
/* Extract the mount propagation flags from the comma-separated option
 * string @mntopts into *@pflags.
 *
 * *@pflags is reset to 0 before parsing; it is left untouched when @mntopts
 * is NULL or memory cannot be allocated.
 *
 * Returns 0 on success and -ENOMEM on allocation failure.
 */
static int parse_propagationopts(const char *mntopts, unsigned long *pflags)
{
	char *copy, *tok;
	char *saveptr = NULL;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("Failed to allocate memory");
		return -ENOMEM;
	}

	*pflags = 0L;

	/* Keep @copy pointing at the allocation so that it can be freed
	 * below; only the first strtok_r() call takes the string itself.
	 */
	for (tok = strtok_r(copy, ",", &saveptr); tok;
	     tok = strtok_r(NULL, ",", &saveptr))
		parse_propagationopt(tok, pflags);

	free(copy);

	return 0;
}
1970
6fd5e769
SH
/* Terminate @word in place at its first space or tab. A string without
 * either is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1977
/* Skip @nfields whitespace-separated fields in @src.
 *
 * Each step advances past the characters up to the next space or tab and
 * then past that single separator. Returns a pointer to where the scan
 * stopped: just after the last skipped separator, or at the terminating NUL
 * when @src has fewer fields.
 */
static char *get_field(char *src, int nfields)
{
	char *pos = src;

	while (nfields-- > 0) {
		pos += strcspn(pos, " \t");

		if (*pos == '\0')
			break;

		pos++;
	}

	return pos;
}
1996
911324ef
DL
/* Mount @fsname on @target, then apply remount and propagation settings.
 *
 * @fsname     mount source; prefixed with @rootfs (or "/") when @relative
 * @target     mount point
 * @fstype     filesystem type handed to the mount calls
 * @mountflags mount flags; MS_REMOUNT is masked out of the initial mount
 * @pflags     propagation flags, applied to @target with a separate mount()
 *             call when non-zero
 * @data       filesystem specific mount data
 * @optional   when true, mount failures are logged and reported as success
 * @dev        when true, MS_NODEV is not inherited from the source filesystem
 * @relative   when true, @fsname is interpreted relative to @rootfs
 * @rootfs     rootfs mount path, passed on to safe_mount(); may be NULL
 *
 * Returns 0 on success (or on a tolerated failure of an optional mount) and
 * -1 on error.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       unsigned long pflags, const char *data, bool optional,
		       bool dev, bool relative, const char *rootfs)
{
	int ret;
	char srcbuf[MAXPATHLEN];
	const char *srcpath = fsname;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (relative) {
		ret = snprintf(srcbuf, MAXPATHLEN, "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
		if (ret < 0 || ret >= MAXPATHLEN) {
			ERROR("source path is too long");
			return -1;
		}
		srcpath = srcbuf;
	}

	/* The initial mount never carries MS_REMOUNT; a remount, if needed, is
	 * issued as a second step below.
	 */
	ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
				srcpath ? srcpath : "(null)", target);
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"",
			 srcpath ? srcpath : "(null)", target);
		return -1;
	}

	if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
		unsigned long rqd_flags = 0;

		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options", srcpath ? srcpath : "(none)", target ? target : "(none)");

		if (mountflags & MS_RDONLY)
			rqd_flags |= MS_RDONLY;
#ifdef HAVE_STATVFS
		if (srcpath && statvfs(srcpath, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			/* Carry over restrictive flags of the source
			 * filesystem into the remount request.
			 */
			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if (sb.f_flag & MS_NODEV && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", srcpath, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT)) {
				if (!(required_flags & ~mountflags) &&
				    rqd_flags == 0) {
					DEBUG("Mountflags already were %lu, "
					      "skipping remount", mountflags);
					goto skipremount;
				}
			}

			mountflags |= required_flags;
		}
#endif

		ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
					srcpath ? srcpath : "(null)", target);
				return 0;
			}

			SYSERROR("Failed to mount \"%s\" on \"%s\"",
				 srcpath ? srcpath : "(null)", target);
			return -1;
		}
	}

	/* Skipping the remount must not skip the propagation change: the label
	 * sits in front of the propagation block so that requested propagation
	 * flags are applied on every successful path.
	 */
#ifdef HAVE_STATVFS
skipremount:
#endif
	if (pflags) {
		ret = mount(NULL, target, NULL, pflags, NULL);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to change mount propagation "
					"for \"%s\" (optional)", target);
				return 0;
			} else {
				SYSERROR("Failed to change mount propagation "
					 "for \"%s\"", target);
				return -1;
			}
		}
		DEBUG("Changed mount propagation for \"%s\"", target);
	}

	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
	      srcpath ? srcpath : "(null)", target, fstype);

	return 0;
}
2115
/* Remove "create=dir", "create=file", "optional" and "relative" from
 * mntent->mnt_opts in place.
 *
 * For each of these strings only the first occurrence (located with strstr())
 * is removed, together with everything up to and including the next comma. If
 * no comma follows, the option string is simply cut off at the match.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, culled[idx]);
		if (!start)
			continue;

		comma = strchr(start, ',');
		if (comma)
			/* Shift the remainder (including its NUL) over the match. */
			memmove(start, comma + 1, strlen(comma + 1) + 1);
		else
			/* Last option in the string: chop it off here. */
			*start = '\0';
	}
}
2145
4d5b72a1 2146static int mount_entry_create_dir_file(const struct mntent *mntent,
749f98d9
CB
2147 const char *path,
2148 const struct lxc_rootfs *rootfs,
0fd73091 2149 const char *lxc_name, const char *lxc_path)
0ad19a3f 2150{
3b7e332f 2151 int ret;
12e6ab5d 2152 char *p1, *p2;
911324ef 2153
12e6ab5d 2154 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
749f98d9 2155 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
12e6ab5d
CB
2156 if (ret < 0)
2157 return -1;
2158 }
6e46cc0d 2159
34cfffb3 2160 if (hasmntopt(mntent, "create=dir")) {
749f98d9
CB
2161 ret = mkdir_p(path, 0755);
2162 if (ret < 0 && errno != EEXIST) {
2163 SYSERROR("Failed to create directory \"%s\"", path);
2164 return -1;
34cfffb3
SG
2165 }
2166 }
2167
0fd73091
CB
2168 if (!hasmntopt(mntent, "create=file"))
2169 return 0;
749f98d9 2170
0fd73091
CB
2171 ret = access(path, F_OK);
2172 if (ret == 0)
2173 return 0;
749f98d9 2174
0fd73091
CB
2175 p1 = strdup(path);
2176 if (!p1)
2177 return -1;
749f98d9 2178
0fd73091 2179 p2 = dirname(p1);
749f98d9 2180
0fd73091
CB
2181 ret = mkdir_p(p2, 0755);
2182 free(p1);
2183 if (ret < 0 && errno != EEXIST) {
2184 SYSERROR("Failed to create directory \"%s\"", path);
2185 return -1;
34cfffb3 2186 }
749f98d9 2187
3b7e332f
CB
2188 ret = mknod(path, S_IFREG | 0000, 0);
2189 if (ret < 0 && errno != EEXIST)
2190 return -errno;
0fd73091 2191
749f98d9 2192 return 0;
4d5b72a1
NC
2193}
2194
ec50007f
CB
2195/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2196 * without a rootfs. */
db4aba38 2197static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2198 const char *path,
2199 const struct lxc_rootfs *rootfs,
2200 const char *lxc_name,
2201 const char *lxc_path)
4d5b72a1 2202{
d8b712bc 2203 int ret;
949d0338 2204 unsigned long mntflags;
4d5b72a1 2205 char *mntdata;
181437fd 2206 bool dev, optional, relative;
949d0338 2207 unsigned long pflags = 0;
ec50007f 2208 char *rootfs_path = NULL;
d8b712bc
CB
2209
2210 optional = hasmntopt(mntent, "optional") != NULL;
2211 dev = hasmntopt(mntent, "dev") != NULL;
181437fd 2212 relative = hasmntopt(mntent, "relative") != NULL;
d8b712bc 2213
ec50007f
CB
2214 if (rootfs && rootfs->path)
2215 rootfs_path = rootfs->mount;
2216
d8b712bc
CB
2217 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2218 lxc_path);
2219 if (ret < 0) {
2220 if (optional)
2221 return 0;
608e3567 2222
d8b712bc
CB
2223 return -1;
2224 }
4e4ca161
SH
2225 cull_mntent_opt(mntent);
2226
d840039e
YT
2227 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2228 if (ret < 0)
2229 return -1;
2230
d8b712bc
CB
2231 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2232 if (ret < 0)
a17b1e65 2233 return -1;
a17b1e65 2234
6e46cc0d 2235 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
d840039e 2236 pflags, mntdata, optional, dev, relative, rootfs_path);
68c152ef 2237
911324ef 2238 free(mntdata);
911324ef
DL
2239 return ret;
2240}
2241
db4aba38
NC
/* Mount @mntent for a container that has no rootfs.
 *
 * All mount targets are treated as absolute paths starting at / on the host,
 * so a leading '/' is added when mnt_dir lacks one.
 *
 * Returns -1 if the path does not fit, otherwise the result of
 * mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *prefix = "";
	int len;

	if (mntent->mnt_dir[0] != '/')
		prefix = "/";

	len = snprintf(path, sizeof(path), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2259
4e4ca161 2260static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2261 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2262 const char *lxc_name,
2263 const char *lxc_path)
911324ef 2264{
bdd2b34c 2265 int offset;
013bd428 2266 char *aux;
67e571de 2267 const char *lxcpath;
bdd2b34c
CB
2268 char path[MAXPATHLEN];
2269 int ret = 0;
0ad19a3f 2270
593e8478 2271 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2272 if (!lxcpath)
2a59a681 2273 return -1;
2a59a681 2274
bdd2b34c
CB
2275 /* If rootfs->path is a blockdev path, allow container fstab to use
2276 * <lxcpath>/<name>/rootfs" as the target prefix.
2277 */
2278 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2279 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
2280 goto skipvarlib;
2281
2282 aux = strstr(mntent->mnt_dir, path);
2283 if (aux) {
2284 offset = strlen(path);
2285 goto skipabs;
2286 }
2287
2288skipvarlib:
013bd428
DL
2289 aux = strstr(mntent->mnt_dir, rootfs->path);
2290 if (!aux) {
bdd2b34c 2291 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2292 return ret;
013bd428 2293 }
80a881b2
SH
2294 offset = strlen(rootfs->path);
2295
2296skipabs:
bdd2b34c
CB
2297 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2298 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 2299 return -1;
a17b1e65 2300
0a2dddd4 2301 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2302}
d330fe7b 2303
4e4ca161 2304static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2305 const struct lxc_rootfs *rootfs,
2306 const char *lxc_name,
2307 const char *lxc_path)
911324ef 2308{
911324ef 2309 int ret;
0fd73091 2310 char path[MAXPATHLEN];
d330fe7b 2311
34cfffb3 2312 /* relative to root mount point */
6e46cc0d 2313 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
0fd73091 2314 if (ret < 0 || (size_t)ret >= sizeof(path))
9ba8130c 2315 return -1;
911324ef 2316
0a2dddd4 2317 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2318}
2319
06749971
CB
2320static int mount_file_entries(const struct lxc_conf *conf,
2321 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2322 const char *lxc_name, const char *lxc_path)
911324ef 2323{
aaf901be 2324 char buf[4096];
0fd73091 2325 struct mntent mntent;
911324ef 2326 int ret = -1;
e76b8764 2327
aaf901be 2328 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
1ae3c19f
CB
2329 if (!rootfs->path)
2330 ret = mount_entry_on_systemfs(&mntent);
2331 else if (mntent.mnt_dir[0] != '/')
2332 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2333 lxc_name, lxc_path);
2334 else
2335 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2336 lxc_name, lxc_path);
2337 if (ret < 0)
2338 return -1;
0ad19a3f 2339 }
2340 ret = 0;
cd54d859 2341
0fd73091 2342 INFO("Finished setting up mounts");
e7938e9e
MN
2343 return ret;
2344}
2345
06749971
CB
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success and a
 * negative value if the file cannot be opened or an entry fails to mount.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *stream;
	int ret;

	if (fstab == NULL)
		return 0;

	stream = setmntent(fstab, "r");
	if (stream == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	ret = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);
	if (ret < 0)
		ERROR("Failed to set up mount entries");

	/* The stream is closed no matter how the mounting went. */
	endmntent(stream);
	return ret;
}
2369
5ef5c9a3 2370FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2371{
5ef5c9a3 2372 int ret;
e7938e9e 2373 char *mount_entry;
5ef5c9a3 2374 struct lxc_list *iterator;
5ef5c9a3
CB
2375 int fd = -1;
2376
0fd73091 2377 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
5ef5c9a3 2378 if (fd < 0) {
a324e7eb
CB
2379 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2380
5ef5c9a3
CB
2381 if (errno != ENOSYS)
2382 return NULL;
a324e7eb
CB
2383
2384 fd = lxc_make_tmpfile(template, true);
0fd73091
CB
2385 if (fd < 0) {
2386 SYSERROR("Could not create temporary mount file");
2387 return NULL;
2388 }
2389
6bd04140 2390 TRACE("Created temporary mount file");
5ef5c9a3 2391 }
0fd73091
CB
2392 if (fd < 0) {
2393 SYSERROR("Could not create temporary mount file");
9fc7f8c0 2394 return NULL;
e7938e9e
MN
2395 }
2396
0fd73091
CB
2397 lxc_list_for_each (iterator, mount) {
2398 size_t len;
2399
e7938e9e 2400 mount_entry = iterator->elem;
0fd73091 2401 len = strlen(mount_entry);
5ef5c9a3 2402
489f39be 2403 ret = lxc_write_nointr(fd, mount_entry, len);
0fd73091
CB
2404 if (ret != len)
2405 goto on_error;
2406
489f39be 2407 ret = lxc_write_nointr(fd, "\n", 1);
0fd73091
CB
2408 if (ret != 1)
2409 goto on_error;
e7938e9e
MN
2410 }
2411
0fd73091
CB
2412 ret = lseek(fd, 0, SEEK_SET);
2413 if (ret < 0)
2414 goto on_error;
2415
2416 return fdopen(fd, "r+");
2417
2418on_error:
2419 SYSERROR("Failed to write mount entry to temporary mount file");
2420 close(fd);
2421 return NULL;
9fc7f8c0
TA
2422}
2423
06749971
CB
/* Mount all entries of the list @mount by first writing them to an anonymous
 * fstab-style file and then processing that file.
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int ret;
	FILE *stream = make_anonymous_mount_file(mount);

	if (stream == NULL)
		return -1;

	ret = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);

	fclose(stream);
	return ret;
}
2441
bab88e68
CS
2442static int parse_cap(const char *cap)
2443{
84760c11 2444 size_t i;
2445 int capid = -1;
0fd73091
CB
2446 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2447 char *ptr = NULL;
bab88e68 2448
0fd73091 2449 if (strcmp(cap, "none") == 0)
7035407c
DE
2450 return -2;
2451
8560cd36 2452 for (i = 0; i < end; i++) {
bab88e68
CS
2453 if (strcmp(cap, caps_opt[i].name))
2454 continue;
2455
2456 capid = caps_opt[i].value;
2457 break;
2458 }
2459
2460 if (capid < 0) {
0fd73091
CB
2461 /* Try to see if it's numeric, so the user may specify
2462 * capabilities that the running kernel knows about but we
2463 * don't
2464 */
bab88e68
CS
2465 errno = 0;
2466 capid = strtol(cap, &ptr, 10);
2467 if (!ptr || *ptr != '\0' || errno != 0)
2468 /* not a valid number */
2469 capid = -1;
2470 else if (capid > lxc_caps_last_cap())
2471 /* we have a number but it's not a valid
2472 * capability */
2473 capid = -1;
2474 }
2475
2476 return capid;
2477}
2478
0769b82a
CS
2479int in_caplist(int cap, struct lxc_list *caps)
2480{
0769b82a 2481 int capid;
0fd73091 2482 struct lxc_list *iterator;
0769b82a 2483
0fd73091 2484 lxc_list_for_each (iterator, caps) {
0769b82a
CS
2485 capid = parse_cap(iterator->elem);
2486 if (capid == cap)
2487 return 1;
2488 }
2489
2490 return 0;
2491}
2492
81810dd1
DL
2493static int setup_caps(struct lxc_list *caps)
2494{
bab88e68 2495 int capid;
0fd73091
CB
2496 char *drop_entry;
2497 struct lxc_list *iterator;
81810dd1 2498
0fd73091
CB
2499 lxc_list_for_each (iterator, caps) {
2500 int ret;
81810dd1
DL
2501
2502 drop_entry = iterator->elem;
2503
bab88e68 2504 capid = parse_cap(drop_entry);
0fd73091 2505 if (capid < 0) {
1e11be34
DL
2506 ERROR("unknown capability %s", drop_entry);
2507 return -1;
81810dd1
DL
2508 }
2509
0fd73091
CB
2510 ret = prctl(PR_CAPBSET_DROP, capid, 0, 0, 0);
2511 if (ret < 0) {
2512 SYSERROR("Failed to remove %s capability", drop_entry);
3ec1648d
SH
2513 return -1;
2514 }
0fd73091 2515 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
81810dd1
DL
2516 }
2517
0fd73091 2518 DEBUG("Capabilities have been setup");
1fb86a7c
SH
2519 return 0;
2520}
2521
2522static int dropcaps_except(struct lxc_list *caps)
2523{
0fd73091 2524 int i, capid, numcaps;
1fb86a7c 2525 char *keep_entry;
0fd73091 2526 struct lxc_list *iterator;
1fb86a7c 2527
0fd73091 2528 numcaps = lxc_caps_last_cap() + 1;
2caf9a97
SH
2529 if (numcaps <= 0 || numcaps > 200)
2530 return -1;
0fd73091 2531 TRACE("Found %d capabilities", numcaps);
2caf9a97 2532
1a0e70ac 2533 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2534 int *caplist = alloca(numcaps * sizeof(int));
2535 memset(caplist, 0, numcaps * sizeof(int));
2536
0fd73091 2537 lxc_list_for_each (iterator, caps) {
1fb86a7c
SH
2538 keep_entry = iterator->elem;
2539
bab88e68 2540 capid = parse_cap(keep_entry);
7035407c
DE
2541 if (capid == -2)
2542 continue;
2543
0fd73091
CB
2544 if (capid < 0) {
2545 ERROR("Unknown capability %s", keep_entry);
1fb86a7c
SH
2546 return -1;
2547 }
2548
0fd73091 2549 DEBUG("Keep capability %s (%d)", keep_entry, capid);
1fb86a7c
SH
2550 caplist[capid] = 1;
2551 }
0fd73091
CB
2552
2553 for (i = 0; i < numcaps; i++) {
2554 int ret;
2555
1fb86a7c
SH
2556 if (caplist[i])
2557 continue;
0fd73091
CB
2558
2559 ret = prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
2560 if (ret < 0) {
2561 SYSERROR("Failed to remove capability %d", i);
3ec1648d
SH
2562 return -1;
2563 }
1fb86a7c
SH
2564 }
2565
0fd73091 2566 DEBUG("Capabilities have been setup");
81810dd1
DL
2567 return 0;
2568}
2569
0fd73091
CB
2570static int parse_resource(const char *res)
2571{
2572 int ret;
c6d09e15
WB
2573 size_t i;
2574 int resid = -1;
2575
0fd73091 2576 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
c6d09e15
WB
2577 if (strcmp(res, limit_opt[i].name) == 0)
2578 return limit_opt[i].value;
c6d09e15 2579
0fd73091 2580 /* Try to see if it's numeric, so the user may specify
c6d09e15 2581 * resources that the running kernel knows about but
0fd73091
CB
2582 * we don't.
2583 */
2584 ret = lxc_safe_int(res, &resid);
2585 if (ret < 0)
2586 return -1;
2587
2588 return resid;
c6d09e15
WB
2589}
2590
0fd73091
CB
2591int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2592{
2593 int resid;
c6d09e15
WB
2594 struct lxc_list *it;
2595 struct lxc_limit *lim;
c6d09e15 2596
0fd73091 2597 lxc_list_for_each (it, limits) {
c6d09e15
WB
2598 lim = it->elem;
2599
2600 resid = parse_resource(lim->resource);
2601 if (resid < 0) {
0fd73091 2602 ERROR("Unknown resource %s", lim->resource);
c6d09e15
WB
2603 return -1;
2604 }
2605
f48b5fd8 2606#if HAVE_PRLIMIT || HAVE_PRLIMIT64
c6d09e15 2607 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
6d1400b5 2608 SYSERROR("Failed to set limit %s", lim->resource);
c6d09e15
WB
2609 return -1;
2610 }
f48b5fd8
FF
2611#else
2612 ERROR("Cannot set limit %s as prlimit is missing", lim->resource);
2613 return -1;
2614#endif
c6d09e15 2615 }
0fd73091 2616
c6d09e15
WB
2617 return 0;
2618}
2619
7edd0540
L
2620int setup_sysctl_parameters(struct lxc_list *sysctls)
2621{
2622 struct lxc_list *it;
2623 struct lxc_sysctl *elem;
0fd73091 2624 int ret = 0;
7edd0540
L
2625 char *tmp = NULL;
2626 char filename[MAXPATHLEN] = {0};
7edd0540 2627
0fd73091 2628 lxc_list_for_each (it, sysctls) {
7edd0540
L
2629 elem = it->elem;
2630 tmp = lxc_string_replace(".", "/", elem->key);
2631 if (!tmp) {
2632 ERROR("Failed to replace key %s", elem->key);
2633 return -1;
2634 }
2635
2636 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2637 free(tmp);
2638 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2639 ERROR("Error setting up sysctl parameters path");
2640 return -1;
2641 }
2642
0fd73091 2643 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2644 strlen(elem->value), false, 0666);
7edd0540 2645 if (ret < 0) {
0fd73091
CB
2646 ERROR("Failed to setup sysctl parameters %s to %s",
2647 elem->key, elem->value);
7edd0540
L
2648 return -1;
2649 }
2650 }
0fd73091 2651
7edd0540
L
2652 return 0;
2653}
2654
61d7a733
YT
2655int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2656{
2657 struct lxc_list *it;
2658 struct lxc_proc *elem;
0fd73091 2659 int ret = 0;
61d7a733
YT
2660 char *tmp = NULL;
2661 char filename[MAXPATHLEN] = {0};
61d7a733 2662
0fd73091 2663 lxc_list_for_each (it, procs) {
61d7a733
YT
2664 elem = it->elem;
2665 tmp = lxc_string_replace(".", "/", elem->filename);
2666 if (!tmp) {
2667 ERROR("Failed to replace key %s", elem->filename);
2668 return -1;
2669 }
2670
2671 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2672 free(tmp);
2673 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2674 ERROR("Error setting up proc filesystem path");
2675 return -1;
2676 }
2677
0fd73091 2678 ret = lxc_write_to_file(filename, elem->value,
7cea5905 2679 strlen(elem->value), false, 0666);
61d7a733 2680 if (ret < 0) {
0fd73091
CB
2681 ERROR("Failed to setup proc filesystem %s to %s",
2682 elem->filename, elem->value);
61d7a733
YT
2683 return -1;
2684 }
2685 }
0fd73091 2686
61d7a733
YT
2687 return 0;
2688}
2689
ae9242c8
SH
2690static char *default_rootfs_mount = LXCROOTFSMOUNT;
2691
7b379ab3 2692struct lxc_conf *lxc_conf_init(void)
089cd8b8 2693{
26ddeedd 2694 int i;
0fd73091 2695 struct lxc_conf *new;
7b379ab3 2696
13277ec4 2697 new = malloc(sizeof(*new));
0fd73091 2698 if (!new)
7b379ab3 2699 return NULL;
7b379ab3
MN
2700 memset(new, 0, sizeof(*new));
2701
4b73005c 2702 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2703 new->personality = -1;
124fa0a8 2704 new->autodev = 1;
3a784510 2705 new->console.buffer_size = 0;
596a818d
DE
2706 new->console.log_path = NULL;
2707 new->console.log_fd = -1;
861813e5 2708 new->console.log_size = 0;
28a4b0e5 2709 new->console.path = NULL;
63376d7d 2710 new->console.peer = -1;
fb87aa6a
CB
2711 new->console.proxy.busy = -1;
2712 new->console.proxy.master = -1;
2713 new->console.proxy.slave = -1;
63376d7d
DL
2714 new->console.master = -1;
2715 new->console.slave = -1;
2716 new->console.name[0] = '\0';
732375f5 2717 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2718 new->maincmd_fd = -1;
258f8051 2719 new->monitor_signal_pdeath = SIGKILL;
76a26f55 2720 new->nbd_idx = -1;
54c30e29 2721 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2722 if (!new->rootfs.mount) {
53f3f048
SH
2723 free(new);
2724 return NULL;
2725 }
858377e4 2726 new->logfd = -1;
7b379ab3 2727 lxc_list_init(&new->cgroup);
54860ed0 2728 lxc_list_init(&new->cgroup2);
7b379ab3
MN
2729 lxc_list_init(&new->network);
2730 lxc_list_init(&new->mount_list);
81810dd1 2731 lxc_list_init(&new->caps);
1fb86a7c 2732 lxc_list_init(&new->keepcaps);
f6d3e3e4 2733 lxc_list_init(&new->id_map);
46ad64ab
CB
2734 new->root_nsuid_map = NULL;
2735 new->root_nsgid_map = NULL;
f979ac15 2736 lxc_list_init(&new->includes);
4184c3e1 2737 lxc_list_init(&new->aliens);
7c661726 2738 lxc_list_init(&new->environment);
c6d09e15 2739 lxc_list_init(&new->limits);
7edd0540 2740 lxc_list_init(&new->sysctls);
61d7a733 2741 lxc_list_init(&new->procs);
44ae0fb6 2742 new->hooks_version = 0;
28d9e29e 2743 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2744 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2745 lxc_list_init(&new->groups);
d39b10eb 2746 lxc_list_init(&new->state_clients);
fe4de9a6
DE
2747 new->lsm_aa_profile = NULL;
2748 new->lsm_se_context = NULL;
7a0bcca3 2749 new->tmp_umount_proc = false;
adf0ba1f
LT
2750 new->lxc_shmount.path_host = NULL;
2751 new->lxc_shmount.path_cont = NULL;
7b379ab3 2752
72bb04e4
PT
2753 /* if running in a new user namespace, init and COMMAND
2754 * default to running as UID/GID 0 when using lxc-execute */
2755 new->init_uid = 0;
2756 new->init_gid = 0;
43654d34 2757 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
b074bbf1 2758 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
72bb04e4 2759
7b379ab3 2760 return new;
089cd8b8
DL
2761}
2762
344c9d81 2763int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
a19b974f 2764 size_t buf_size)
f6d3e3e4 2765{
29053180 2766 int fd, ret;
0fd73091 2767 char path[MAXPATHLEN];
f6d3e3e4 2768
a19b974f
CB
2769 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
2770 size_t buflen;
2771
2772 ret = snprintf(path, MAXPATHLEN, "/proc/%d/setgroups", pid);
0fd73091 2773 if (ret < 0 || ret >= MAXPATHLEN)
a19b974f 2774 return -E2BIG;
a19b974f
CB
2775
2776 fd = open(path, O_WRONLY);
2777 if (fd < 0 && errno != ENOENT) {
2778 SYSERROR("Failed to open \"%s\"", path);
2779 return -1;
2780 }
2781
2388737b
CB
2782 if (fd >= 0) {
2783 buflen = sizeof("deny\n") - 1;
2784 errno = 0;
2785 ret = lxc_write_nointr(fd, "deny\n", buflen);
395b1a3e 2786 close(fd);
2388737b 2787 if (ret != buflen) {
0fd73091
CB
2788 SYSERROR("Failed to write \"deny\" to "
2789 "\"/proc/%d/setgroups\"", pid);
2388737b
CB
2790 return -1;
2791 }
395b1a3e 2792 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
a19b974f 2793 }
a19b974f
CB
2794 }
2795
29053180
CB
2796 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2797 idtype == ID_TYPE_UID ? 'u' : 'g');
0fd73091 2798 if (ret < 0 || ret >= MAXPATHLEN)
f6d3e3e4 2799 return -E2BIG;
29053180
CB
2800
2801 fd = open(path, O_WRONLY);
2802 if (fd < 0) {
a19b974f 2803 SYSERROR("Failed to open \"%s\"", path);
29053180 2804 return -1;
f6d3e3e4 2805 }
29053180
CB
2806
2807 errno = 0;
2808 ret = lxc_write_nointr(fd, buf, buf_size);
395b1a3e 2809 close(fd);
29053180 2810 if (ret != buf_size) {
a19b974f 2811 SYSERROR("Failed to write %cid mapping to \"%s\"",
29053180 2812 idtype == ID_TYPE_UID ? 'u' : 'g', path);
29053180
CB
2813 return -1;
2814 }
29053180
CB
2815
2816 return 0;
f6d3e3e4
SH
2817}
2818
6e50e704
CB
2819/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2820 *
2821 * @return 1 if functional binary was found
2822 * @return 0 if binary exists but is lacking privilege
2823 * @return -ENOENT if binary does not exist
2824 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
6e50e704 2825 */
df6a2945
CB
2826static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2827{
2828 char *path;
2829 int ret;
2830 struct stat st;
2831 int fret = 0;
2832
6e50e704
CB
2833 if (cap != CAP_SETUID && cap != CAP_SETGID)
2834 return -EINVAL;
2835
df6a2945
CB
2836 path = on_path(binary, NULL);
2837 if (!path)
2838 return -ENOENT;
2839
2840 ret = stat(path, &st);
2841 if (ret < 0) {
2842 fret = -errno;
2843 goto cleanup;
2844 }
2845
2846 /* Check if the binary is setuid. */
2847 if (st.st_mode & S_ISUID) {
0fd73091 2848 DEBUG("The binary \"%s\" does have the setuid bit set", path);
df6a2945
CB
2849 fret = 1;
2850 goto cleanup;
2851 }
2852
0fd73091 2853#if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2854 /* Check if it has the CAP_SETUID capability. */
2855 if ((cap & CAP_SETUID) &&
2856 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2857 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2858 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
0fd73091 2859 "and CAP_PERMITTED sets", path);
df6a2945
CB
2860 fret = 1;
2861 goto cleanup;
2862 }
2863
2864 /* Check if it has the CAP_SETGID capability. */
2865 if ((cap & CAP_SETGID) &&
2866 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2867 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2868 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
0fd73091 2869 "and CAP_PERMITTED sets", path);
df6a2945
CB
2870 fret = 1;
2871 goto cleanup;
2872 }
0fd73091 2873#else
69924fff
CB
2874 /* If we cannot check for file capabilities we need to give the benefit
2875 * of the doubt. Otherwise we might fail even though all the necessary
2876 * file capabilities are set.
2877 */
d6018f88 2878 DEBUG("Cannot check for file capabilites as full capability support is "
0fd73091 2879 "missing. Manual intervention needed");
d6018f88 2880 fret = 1;
0fd73091 2881#endif
df6a2945
CB
2882
2883cleanup:
2884 free(path);
2885 return fret;
2886}
2887
986ef930
CB
/* Replace the current process image with "/bin/sh -c <args>", where @args is
 * the command string. Returns -1, which only happens if execl() fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *command = args;

	execl("/bin/sh", "sh", "-c", command, (char *)NULL);

	/* Only reached when execl() did not replace the process image. */
	return -1;
}
2893
f6d3e3e4
SH
2894int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2895{
0fd73091 2896 int fill, left;
986ef930 2897 char u_or_g;
4bc3b759 2898 char *pos;
986ef930 2899 char cmd_output[MAXPATHLEN];
0fd73091
CB
2900 struct id_map *map;
2901 struct lxc_list *iterator;
2902 enum idtype type;
986ef930
CB
2903 /* strlen("new@idmap") = 9
2904 * +
2905 * strlen(" ") = 1
2906 * +
2907 * LXC_NUMSTRLEN64
2908 * +
2909 * strlen(" ") = 1
2910 *
2911 * We add some additional space to make sure that we really have
2912 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2913 */
0fd73091 2914 int ret = 0, gidmap = 0, uidmap = 0;
986ef930 2915 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
0fd73091 2916 bool had_entry = false, use_shadow = false;
c724025c
JC
2917 int hostuid, hostgid;
2918
2919 hostuid = geteuid();
2920 hostgid = getegid();
df6a2945
CB
2921
2922 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2923 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2924 * will protected it by preventing another user from being handed the
2925 * range by shadow.
2926 */
df6a2945 2927 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2928 if (uidmap == -ENOENT)
2929 WARN("newuidmap binary is missing");
2930 else if (!uidmap)
2931 WARN("newuidmap is lacking necessary privileges");
2932
df6a2945 2933 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2934 if (gidmap == -ENOENT)
2935 WARN("newgidmap binary is missing");
2936 else if (!gidmap)
2937 WARN("newgidmap is lacking necessary privileges");
2938
df6a2945 2939 if (uidmap > 0 && gidmap > 0) {
0fd73091 2940 DEBUG("Functional newuidmap and newgidmap binary found");
4bc3b759 2941 use_shadow = true;
df6a2945 2942 } else {
99d43365
CB
2943 /* In case unprivileged users run application containers via
2944 * execute() or a start*() there are valid cases where they may
2945 * only want to map their own {g,u}id. Let's not block them from
2946 * doing so by requiring geteuid() == 0.
2947 */
2948 DEBUG("No newuidmap and newgidmap binary found. Trying to "
c724025c
JC
2949 "write directly with euid %d", hostuid);
2950 }
2951
2952 /* Check if we really need to use newuidmap and newgidmap.
2953 * If the user is only remapping his own {g,u}id, we don't need it.
2954 */
2955 if (use_shadow && lxc_list_len(idmap) == 2) {
2956 use_shadow = false;
2957 lxc_list_for_each(iterator, idmap) {
2958 map = iterator->elem;
2959 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2960 map->nsid == hostuid && map->hostid == hostuid)
2961 continue;
2962 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2963 map->nsid == hostgid && map->hostid == hostgid)
2964 continue;
2965 use_shadow = true;
2966 break;
2967 }
0e6e3a41 2968 }
251d0d2a 2969
986ef930
CB
2970 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2971 type++, u_or_g = 'g') {
2972 pos = mapbuf;
2973
0e6e3a41 2974 if (use_shadow)
986ef930 2975 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2976
cf3ef16d 2977 lxc_list_for_each(iterator, idmap) {
251d0d2a 2978 map = iterator->elem;
cf3ef16d
SH
2979 if (map->idtype != type)
2980 continue;
2981
4bc3b759
CB
2982 had_entry = true;
2983
986ef930 2984 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2985 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2986 use_shadow ? " " : "", map->nsid,
2987 map->hostid, map->range,
0e6e3a41 2988 use_shadow ? "" : "\n");
a427e268
CB
2989 if (fill <= 0 || fill >= left) {
2990 /* The kernel only takes <= 4k for writes to
2991 * /proc/<pid>/{g,u}id_map
2992 */
2993 SYSERROR("Too many %cid mappings defined", u_or_g);
2994 return -1;
2995 }
4bc3b759 2996
cf3ef16d 2997 pos += fill;
251d0d2a 2998 }
cf3ef16d 2999 if (!had_entry)
4f7521b4 3000 continue;
cf3ef16d 3001
986ef930
CB
3002 /* Try to catch the ouput of new{g,u}idmap to make debugging
3003 * easier.
3004 */
3005 if (use_shadow) {
3006 ret = run_command(cmd_output, sizeof(cmd_output),
3007 lxc_map_ids_exec_wrapper,
3008 (void *)mapbuf);
3009 if (ret < 0) {
54fbbeb5
CB
3010 ERROR("new%cidmap failed to write mapping \"%s\": %s",
3011 u_or_g, cmd_output, mapbuf);
986ef930
CB
3012 return -1;
3013 }
54fbbeb5 3014 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 3015 } else {
986ef930 3016 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5 3017 if (ret < 0) {
da0f9977 3018 ERROR("Failed to write mapping: %s", mapbuf);
986ef930 3019 return -1;
54fbbeb5
CB
3020 }
3021 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 3022 }
986ef930
CB
3023
3024 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3025 }
251d0d2a 3026
986ef930 3027 return 0;
f6d3e3e4
SH
3028}
3029
0fd73091 3030/* Return the host uid/gid to which the container root is mapped in val.
0b3a6504 3031 * Return true if id was found, false otherwise.
cf3ef16d 3032 */
2a9a80cb 3033bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
4160c3a0 3034 unsigned long *val)
cf3ef16d 3035{
4160c3a0 3036 unsigned nsid;
0fd73091
CB
3037 struct id_map *map;
3038 struct lxc_list *it;
4160c3a0
CB
3039
3040 if (idtype == ID_TYPE_UID)
3041 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
3042 else
3043 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
cf3ef16d 3044
0fd73091 3045 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3046 map = it->elem;
7b50c609 3047 if (map->idtype != idtype)
cf3ef16d 3048 continue;
4160c3a0 3049 if (map->nsid != nsid)
cf3ef16d 3050 continue;
2a9a80cb
SH
3051 *val = map->hostid;
3052 return true;
cf3ef16d 3053 }
4160c3a0 3054
2a9a80cb 3055 return false;
cf3ef16d
SH
3056}
3057
2133f58c 3058int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3059{
cf3ef16d 3060 struct id_map *map;
0fd73091
CB
3061 struct lxc_list *it;
3062
3063 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3064 map = it->elem;
2133f58c 3065 if (map->idtype != idtype)
cf3ef16d 3066 continue;
0fd73091 3067
cf3ef16d 3068 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3069 return (id - map->hostid) + map->nsid;
cf3ef16d 3070 }
0fd73091 3071
57d116ab 3072 return -1;
cf3ef16d
SH
3073}
3074
339efad9 3075int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d 3076{
cf3ef16d 3077 struct id_map *map;
0fd73091 3078 struct lxc_list *it;
2133f58c 3079 unsigned int freeid = 0;
0fd73091 3080
cf3ef16d 3081again:
0fd73091 3082 lxc_list_for_each (it, &conf->id_map) {
cf3ef16d 3083 map = it->elem;
2133f58c 3084 if (map->idtype != idtype)
cf3ef16d 3085 continue;
0fd73091 3086
cf3ef16d
SH
3087 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3088 freeid = map->nsid + map->range;
3089 goto again;
3090 }
3091 }
0fd73091 3092
cf3ef16d
SH
3093 return freeid;
3094}
3095
f4f52cb5
CB
/* Exec "lxc-usernsexec" with @args as its argument vector.
 *
 * execvp() only returns on failure, so reaching the return statement means
 * the exec did not happen and -1 is reported.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	(void)execvp("lxc-usernsexec", args);

	return -1;
}
3101
0fd73091 3102/* chown_mapped_root: for an unprivileged user with uid/gid X to
7b50c609
TS
3103 * chown a dir to subuid/subgid Y, he needs to run chown as root
3104 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3105 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3106 * root is privileged with respect to hostuid/hostgid X, allowing
3107 * him to do the chown.
f6d3e3e4 3108 */
41dc7155 3109int chown_mapped_root(const char *path, struct lxc_conf *conf)
f6d3e3e4 3110{
f4f52cb5 3111 uid_t rootuid, rootgid;
2a9a80cb 3112 unsigned long val;
f4f52cb5
CB
3113 int hostuid, hostgid, ret;
3114 struct stat sb;
3115 char map1[100], map2[100], map3[100], map4[100], map5[100];
3116 char ugid[100];
41dc7155 3117 const char *args1[] = {"lxc-usernsexec",
f4f52cb5
CB
3118 "-m", map1,
3119 "-m", map2,
3120 "-m", map3,
3121 "-m", map5,
3122 "--", "chown", ugid, path,
3123 NULL};
41dc7155 3124 const char *args2[] = {"lxc-usernsexec",
f4f52cb5
CB
3125 "-m", map1,
3126 "-m", map2,
3127 "-m", map3,
3128 "-m", map4,
3129 "-m", map5,
3130 "--", "chown", ugid, path,
3131 NULL};
3132 char cmd_output[MAXPATHLEN];
3133
3134 hostuid = geteuid();
3135 hostgid = getegid();
f6d3e3e4 3136
2a9a80cb 3137 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3138 ERROR("No uid mapping for container root");
c4d10a05 3139 return -1;
f6d3e3e4 3140 }
f4f52cb5 3141 rootuid = (uid_t)val;
0fd73091 3142
7b50c609 3143 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3144 ERROR("No gid mapping for container root");
7b50c609
TS
3145 return -1;
3146 }
f4f52cb5 3147 rootgid = (gid_t)val;
2a9a80cb 3148
f4f52cb5 3149 if (hostuid == 0) {
7b50c609 3150 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3151 ERROR("Error chowning %s", path);
3152 return -1;
3153 }
0fd73091 3154
c4d10a05
SH
3155 return 0;
3156 }
f3d7e4ca 3157
f4f52cb5 3158 if (rootuid == hostuid) {
1a0e70ac 3159 /* nothing to do */
b103ceac 3160 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3161 return 0;
3162 }
3163
bbdbf8f0 3164 /* save the current gid of "path" */
f4f52cb5
CB
3165 if (stat(path, &sb) < 0) {
3166 ERROR("Error stat %s", path);
f6d3e3e4
SH
3167 return -1;
3168 }
7b50c609 3169
bbdbf8f0
CB
3170 /* Update the path argument in case this was overlayfs. */
3171 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3172 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3173
f4f52cb5
CB
3174 /*
3175 * A file has to be group-owned by a gid mapped into the
3176 * container, or the container won't be privileged over it.
3177 */
3178 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3179 if (sb.st_uid == hostuid &&
3180 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3181 chown(path, -1, hostgid) < 0) {
3182 ERROR("Failed chgrping %s", path);
3183 return -1;
3184 }
f6d3e3e4 3185
1a0e70ac 3186 /* "u:0:rootuid:1" */
f4f52cb5
CB
3187 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3188 if (ret < 0 || ret >= 100) {
3189 ERROR("Error uid printing map string");
3190 return -1;
3191 }
7b50c609 3192
1a0e70ac 3193 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
3194 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3195 if (ret < 0 || ret >= 100) {
3196 ERROR("Error uid printing map string");
3197 return -1;
3198 }
c4d10a05 3199
1a0e70ac 3200 /* "g:0:rootgid:1" */
f4f52cb5
CB
3201 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3202 if (ret < 0 || ret >= 100) {
3203 ERROR("Error gid printing map string");
3204 return -1;
3205 }
98e5ba51 3206
1a0e70ac 3207 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
3208 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3209 rootgid + (gid_t)sb.st_gid);
3210 if (ret < 0 || ret >= 100) {
3211 ERROR("Error gid printing map string");
3212 return -1;
3213 }
c4d10a05 3214
1a0e70ac 3215 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
3216 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3217 if (ret < 0 || ret >= 100) {
3218 ERROR("Error gid printing map string");
3219 return -1;
3220 }
7b50c609 3221
1a0e70ac 3222 /* "0:pathgid" (chown) */
f4f52cb5
CB
3223 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3224 if (ret < 0 || ret >= 100) {
3225 ERROR("Error owner printing format string for chown");
3226 return -1;
3227 }
7b50c609 3228
f4f52cb5
CB
3229 if (hostgid == sb.st_gid)
3230 ret = run_command(cmd_output, sizeof(cmd_output),
3231 chown_mapped_root_exec_wrapper,
3232 (void *)args1);
3233 else
3234 ret = run_command(cmd_output, sizeof(cmd_output),
3235 chown_mapped_root_exec_wrapper,
3236 (void *)args2);
3237 if (ret < 0)
3238 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3239
f4f52cb5 3240 return ret;
f6d3e3e4
SH
3241}
3242
943144d9
CB
3243/* NOTE: Must not be called from inside the container namespace! */
3244int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3245{
3246 int mounted;
3247
943144d9 3248 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3249 if (mounted == -1) {
0fd73091 3250 SYSERROR("Failed to mount proc in the container");
01958b1f 3251 /* continue only if there is no rootfs */
943144d9 3252 if (conf->rootfs.path)
01958b1f 3253 return -1;
5112cd70 3254 } else if (mounted == 1) {
7a0bcca3 3255 conf->tmp_umount_proc = true;
5112cd70 3256 }
943144d9 3257
5112cd70
SH
3258 return 0;
3259}
3260
3261void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3262{
7a0bcca3 3263 if (!lxc_conf->tmp_umount_proc)
0fd73091
CB
3264 return;
3265
7a0bcca3
CB
3266 (void)umount2("/proc", MNT_DETACH);
3267 lxc_conf->tmp_umount_proc = false;
5112cd70
SH
3268}
3269
0fd73091 3270/* Walk /proc/mounts and change any shared entries to slave. */
6a0c909a 3271void remount_all_slave(void)
e995d7a2 3272{
6a49f05e
CB
3273 int memfd, mntinfo_fd, ret;
3274 ssize_t copied;
0fd73091 3275 FILE *f;
e995d7a2 3276 size_t len = 0;
0fd73091 3277 char *line = NULL;
e995d7a2 3278
6a49f05e 3279 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
fea3b91d
DJ
3280 if (mntinfo_fd < 0) {
3281 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
6a49f05e 3282 return;
fea3b91d 3283 }
6a49f05e
CB
3284
3285 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3286 if (memfd < 0) {
3287 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3288
3289 if (errno != ENOSYS) {
fea3b91d 3290 SYSERROR("Failed to create temporary in-memory file");
6a49f05e 3291 close(mntinfo_fd);
6a49f05e
CB
3292 return;
3293 }
3294
3295 memfd = lxc_make_tmpfile(template, true);
fea3b91d
DJ
3296 if (memfd < 0) {
3297 close(mntinfo_fd);
3298 WARN("Failed to create temporary file");
3299 return;
3300 }
6a49f05e
CB
3301 }
3302
3303#define __LXC_SENDFILE_MAX 0x7ffff000 /* maximum number of bytes sendfile can handle */
3304again:
3305 copied = sendfile(memfd, mntinfo_fd, NULL, __LXC_SENDFILE_MAX);
3306 if (copied < 0) {
3307 if (errno == EINTR)
3308 goto again;
3309
fea3b91d 3310 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
6a49f05e
CB
3311 close(mntinfo_fd);
3312 close(memfd);
6a49f05e
CB
3313 return;
3314 }
3315 close(mntinfo_fd);
3316
3317 /* After a successful fdopen() memfd will be closed when calling
3318 * fclose(f). Calling close(memfd) afterwards is undefined.
3319 */
3320 ret = lseek(memfd, 0, SEEK_SET);
3321 if (ret < 0) {
fea3b91d 3322 SYSERROR("Failed to reset file descriptor offset");
6a49f05e 3323 close(memfd);
6a49f05e
CB
3324 return;
3325 }
3326
3327 f = fdopen(memfd, "r");
e995d7a2 3328 if (!f) {
fea3b91d
DJ
3329 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark "
3330 "all shared. Continuing");
6a49f05e 3331 close(memfd);
e995d7a2
SH
3332 return;
3333 }
3334
3335 while (getline(&line, &len, f) != -1) {
0fd73091
CB
3336 int ret;
3337 char *opts, *target;
3338
e995d7a2
SH
3339 target = get_field(line, 4);
3340 if (!target)
3341 continue;
0fd73091 3342
e995d7a2
SH
3343 opts = get_field(target, 2);
3344 if (!opts)
3345 continue;
0fd73091 3346
e995d7a2
SH
3347 null_endofword(opts);
3348 if (!strstr(opts, "shared"))
3349 continue;
0fd73091 3350
e995d7a2 3351 null_endofword(target);
0fd73091
CB
3352 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3353 if (ret < 0) {
3354 SYSERROR("Failed to make \"%s\" MS_SLAVE", target);
e995d7a2 3355 ERROR("Continuing...");
6a49f05e 3356 continue;
e995d7a2 3357 }
6a49f05e 3358 TRACE("Remounted \"%s\" as MS_SLAVE", target);
e995d7a2
SH
3359 }
3360 fclose(f);
f10fad2f 3361 free(line);
6a49f05e 3362 TRACE("Remounted all mount table entries as MS_SLAVE");
e995d7a2
SH
3363}
3364
794248d0 3365static int lxc_execute_bind_init(struct lxc_handler *handler)
2322903b
SH
3366{
3367 int ret;
794248d0
CB
3368 char *p;
3369 char path[PATH_MAX], destpath[PATH_MAX];
3370 struct lxc_conf *conf = handler->conf;
9d9c111c
SH
3371
3372 /* If init exists in the container, don't bind mount a static one */
3373 p = choose_init(conf->rootfs.mount);
3374 if (p) {
41089848
TA
3375 char *old = p;
3376
3377 p = strdup(old + strlen(conf->rootfs.mount));
3378 free(old);
3379 if (!p)
3380 return -ENOMEM;
3381
3382 INFO("Found existing init at \"%s\"", p);
3383 goto out;
9d9c111c 3384 }
2322903b
SH
3385
3386 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
0fd73091 3387 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3388 return -1;
2322903b
SH
3389
3390 if (!file_exists(path)) {
0fd73091 3391 ERROR("The file \"%s\" does not exist on host", path);
8353b4c9 3392 return -1;
2322903b
SH
3393 }
3394
794248d0 3395 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
0fd73091 3396 if (ret < 0 || ret >= PATH_MAX)
8353b4c9 3397 return -1;
2322903b
SH
3398
3399 if (!file_exists(destpath)) {
794248d0
CB
3400 ret = mknod(destpath, S_IFREG | 0000, 0);
3401 if (ret < 0 && errno != EEXIST) {
3402 SYSERROR("Failed to create dummy \"%s\" file as bind mount target", destpath);
8353b4c9 3403 return -1;
2322903b 3404 }
2322903b
SH
3405 }
3406
592fd47a 3407 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
8353b4c9 3408 if (ret < 0) {
0fd73091 3409 SYSERROR("Failed to bind mount lxc.init.static into container");
8353b4c9
CB
3410 return -1;
3411 }
3412
794248d0
CB
3413 p = strdup(destpath + strlen(conf->rootfs.mount));
3414 if (!p)
3415 return -ENOMEM;
794248d0 3416
8353b4c9 3417 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
41089848 3418out:
4b5b3a2a 3419 ((struct execute_args *)handler->data)->init_fd = -1;
41089848 3420 ((struct execute_args *)handler->data)->init_path = p;
8353b4c9 3421 return 0;
2322903b
SH
3422}
3423
0fd73091
CB
3424/* This does the work of remounting / if it is shared, calling the container
3425 * pre-mount hooks, and mounting the rootfs.
35120d9c
SH
3426 */
3427int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3428{
0fd73091
CB
3429 int ret;
3430
35120d9c 3431 if (conf->rootfs_setup) {
35120d9c 3432 const char *path = conf->rootfs.mount;
0fd73091
CB
3433
3434 /* The rootfs was set up in another namespace. bind-mount it to
3435 * give us a mount in our own ns so we can pivot_root to it
3436 */
3437 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3438 if (ret < 0) {
3439 ERROR("Failed to bind mount container / onto itself");
145832ba 3440 return -1;
35120d9c 3441 }
0fd73091
CB
3442
3443 TRACE("Bind mounted container / onto itself");
145832ba 3444 return 0;
35120d9c 3445 }
d4ef7c50 3446
e995d7a2
SH
3447 remount_all_slave();
3448
0fd73091
CB
3449 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3450 if (ret < 0) {
3451 ERROR("Failed to run pre-mount hooks");
35120d9c
SH
3452 return -1;
3453 }
3454
0fd73091
CB
3455 ret = lxc_setup_rootfs(conf);
3456 if (ret < 0) {
3457 ERROR("Failed to setup rootfs for");
35120d9c
SH
3458 return -1;
3459 }
3460
3461 conf->rootfs_setup = true;
3462 return 0;
3463}
3464
1c1c7051
SH
3465static bool verify_start_hooks(struct lxc_conf *conf)
3466{
1c1c7051 3467 char path[MAXPATHLEN];
0fd73091
CB
3468 struct lxc_list *it;
3469
3470 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
1c1c7051 3471 int ret;
0fd73091
CB
3472 struct stat st;
3473 char *hookname = it->elem;
1c1c7051
SH
3474
3475 ret = snprintf(path, MAXPATHLEN, "%s%s",
0fd73091
CB
3476 conf->rootfs.path ? conf->rootfs.mount : "",
3477 hookname);
1c1c7051
SH
3478 if (ret < 0 || ret >= MAXPATHLEN)
3479 return false;
0fd73091 3480
1c1c7051 3481 ret = stat(path, &st);
0fd73091 3482 if (ret < 0) {
7b6753e7 3483 SYSERROR("Start hook %s not found in container",
0fd73091 3484 hookname);
1c1c7051
SH
3485 return false;
3486 }
0fd73091 3487
6a0c909a 3488 return true;
1c1c7051
SH
3489 }
3490
3491 return true;
3492}
3493
4b5b3a2a
TA
/* Probe whether the running kernel implements execveat().
 *
 * The probe call is made with arguments that cannot succeed; only the errno
 * it leaves behind matters. ENOSYS means the syscall is not implemented.
 * Always returns false when __NR_execveat is not defined at build time.
 */
static bool execveat_supported(void)
{
#ifdef __NR_execveat
	/*
	 * The raw syscall is used on purpose: it was introduced in kernel
	 * 3.19, while glibc only gained a wrapper much later, in 2.27. The
	 * glibc wrapper is also avoided because it falls back to /proc, and
	 * the container may not have /proc mounted depending on its
	 * configuration.
	 */
	(void)syscall(__NR_execveat, -1, "", NULL, NULL, AT_EMPTY_PATH);

	return errno != ENOSYS;
#else
	return false;
#endif
}
3512
3b988b33 3513int lxc_setup(struct lxc_handler *handler)
35120d9c 3514{
2187efd3 3515 int ret;
0fd73091 3516 const char *lxcpath = handler->lxcpath, *name = handler->name;
35120d9c 3517 struct lxc_conf *lxc_conf = handler->conf;
35120d9c 3518
8353b4c9
CB
3519 ret = do_rootfs_setup(lxc_conf, name, lxcpath);
3520 if (ret < 0) {
3521 ERROR("Failed to setup rootfs");
35120d9c
SH
3522 return -1;
3523 }
3524
28d9e29e 3525 if (handler->nsfd[LXC_NS_UTS] == -1) {
8353b4c9
CB
3526 ret = setup_utsname(lxc_conf->utsname);
3527 if (ret < 0) {
0fd73091 3528 ERROR("Failed to setup the utsname %s", name);
6c544cb3
MM
3529 return -1;
3530 }
0ad19a3f 3531 }
3532
8353b4c9
CB
3533 ret = lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network);
3534 if (ret < 0) {
3535 ERROR("Failed to setup network");
95b5ffaf 3536 return -1;
0ad19a3f 3537 }
3538
8353b4c9
CB
3539 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3540 if (ret < 0) {
3541 ERROR("Failed to send network device names and ifindices to parent");
790255cf
CB
3542 return -1;
3543 }
3544
bc6928ff 3545 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3546 ret = mount_autodev(name, &lxc_conf->rootfs, lxcpath);
3547 if (ret < 0) {
3548 ERROR("Failed to mount \"/dev\"");
c6883f38
SH
3549 return -1;
3550 }
3551 }
3552
8353b4c9
CB
3553 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3554 * need to wait until other stuff has finished.
368bbc02 3555 */
8353b4c9
CB
3556 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3557 if (ret < 0) {
3558 ERROR("Failed to setup first automatic mounts");
368bbc02
CS
3559 return -1;
3560 }
3561
8353b4c9
CB
3562 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3563 if (ret < 0) {
3564 ERROR("Failed to setup mounts");
95b5ffaf 3565 return -1;
576f946d 3566 }
3567
7b6753e7 3568 /* Make sure any start hooks are in the container */
1c1c7051
SH
3569 if (!verify_start_hooks(lxc_conf))
3570 return -1;
3571
8353b4c9 3572 if (lxc_conf->is_execute) {
4b5b3a2a
TA
3573 if (execveat_supported()) {
3574 int fd;
3575 char path[PATH_MAX];
3576
3577 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3578 if (ret < 0 || ret >= PATH_MAX) {
3579 ERROR("Path to init.lxc.static too long");
3580 return -1;
3581 }
3582
3583 fd = open(path, O_PATH | O_CLOEXEC);
3584 if (fd < 0) {
3585 SYSERROR("Unable to open lxc.init.static");
3586 return -1;
3587 }
3588
3589 ((struct execute_args *)handler->data)->init_fd = fd;
3590 ((struct execute_args *)handler->data)->init_path = NULL;
3591 } else {
3592 ret = lxc_execute_bind_init(handler);
3593 if (ret < 0) {
3594 ERROR("Failed to bind-mount the lxc init system");
3595 return -1;
3596 }
8353b4c9
CB
3597 }
3598 }
2322903b 3599
8353b4c9
CB
3600 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3601 * mounted. It is guaranteed to be mounted now either through
3602 * automatically or via fstab entries.
368bbc02 3603 */
8353b4c9
CB
3604 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3605 if (ret < 0) {
3606 ERROR("Failed to setup remaining automatic mounts");
368bbc02
CS
3607 return -1;
3608 }
3609
8353b4c9 3610 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
1a2cf89d 3611 if (ret < 0) {
8353b4c9 3612 ERROR("Failed to run mount hooks");
773fb9ca
SH
3613 return -1;
3614 }
3615
bc6928ff 3616 if (lxc_conf->autodev > 0) {
8353b4c9
CB
3617 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3618 if (ret < 0) {
3619 ERROR("Failed to run autodev hooks");
f7bee6c6
MW
3620 return -1;
3621 }
06749971 3622
8353b4c9
CB
3623 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3624 if (ret < 0) {
3625 ERROR("Failed to populate \"/dev\"");
91c3830e
SH
3626 return -1;
3627 }
3628 }
368bbc02 3629
8353b4c9
CB
3630 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3631 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3632 &lxc_conf->mount_list, name, lxcpath);
3633 if (ret < 0) {
3634 ERROR("Failed to setup mount entries");
3635 return -1;
3636 }
181437fd
YT
3637 }
3638
ed8704d0 3639 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
885766f5 3640 lxc_conf->ttys.dir);
ed8704d0
CB
3641 if (ret < 0) {
3642 ERROR("Failed to setup console");
95b5ffaf 3643 return -1;
6e590161 3644 }
3645
ed8704d0
CB
3646 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3647 if (ret < 0) {
8353b4c9 3648 ERROR("Failed to setup \"/dev\" symlinks");
69aa6655
DE
3649 return -1;
3650 }
3651
8353b4c9
CB
3652 ret = lxc_create_tmp_proc_mount(lxc_conf);
3653 if (ret < 0) {
3654 ERROR("Failed to \"/proc\" LSMs");
e075f5d9 3655 return -1;
e075f5d9 3656 }
e075f5d9 3657
8353b4c9
CB
3658 ret = setup_pivot_root(&lxc_conf->rootfs);
3659 if (ret < 0) {
3660 ERROR("Failed to pivot root into rootfs");
95b5ffaf 3661 return -1;
ed502555 3662 }
3663
8353b4c9
CB
3664 ret = lxc_setup_devpts(lxc_conf);
3665 if (ret < 0) {
3666 ERROR("Failed to setup new devpts instance");
95b5ffaf 3667 return -1;
3c26f34e 3668 }
3669
2187efd3
CB
3670 ret = lxc_create_ttys(handler);
3671 if (ret < 0)
e8bd4e43 3672 return -1;
e8bd4e43 3673
8353b4c9
CB
3674 ret = setup_personality(lxc_conf->personality);
3675 if (ret < 0) {
3676 ERROR("Failed to set personality");
cccc74b5
DL
3677 return -1;
3678 }
3679
8353b4c9
CB
3680 /* Set sysctl value to a path under /proc/sys as determined from the
3681 * key. For e.g. net.ipv4.ip_forward translated to
3682 * /proc/sys/net/ipv4/ip_forward.
7edd0540
L
3683 */
3684 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3685 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
8353b4c9
CB
3686 if (ret < 0) {
3687 ERROR("Failed to setup sysctl parameters");
7edd0540 3688 return -1;
8353b4c9 3689 }
7edd0540
L
3690 }
3691
97a8f74f
SG
3692 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3693 if (!lxc_list_empty(&lxc_conf->caps)) {
8353b4c9
CB
3694 ERROR("Container requests lxc.cap.drop and "
3695 "lxc.cap.keep: either use lxc.cap.drop or "
3696 "lxc.cap.keep, not both");
f6d3e3e4
SH
3697 return -1;
3698 }
8353b4c9 3699
97a8f74f 3700 if (dropcaps_except(&lxc_conf->keepcaps)) {
8353b4c9 3701 ERROR("Failed to keep capabilities");
97a8f74f
SG
3702 return -1;
3703 }
3704 } else if (setup_caps(&lxc_conf->caps)) {
8353b4c9 3705 ERROR("Failed to drop capabilities");
97a8f74f 3706 return -1;
81810dd1
DL
3707 }
3708
8353b4c9 3709 NOTICE("The container \"%s\" is set up", name);
cd54d859 3710
0ad19a3f 3711 return 0;
3712}
26ddeedd 3713
3f60c2f7 3714int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3715 char *argv[])
26ddeedd 3716{
26ddeedd 3717 struct lxc_list *it;
3f60c2f7 3718 int which = -1;
26ddeedd 3719
3f60c2f7 3720 if (strcmp(hookname, "pre-start") == 0)
26ddeedd 3721 which = LXCHOOK_PRESTART;
3f60c2f7 3722 else if (strcmp(hookname, "start-host") == 0)
08dd2805 3723 which = LXCHOOK_START_HOST;
3f60c2f7 3724 else if (strcmp(hookname, "pre-mount") == 0)
5ea6163a 3725 which = LXCHOOK_PREMOUNT;
3f60c2f7 3726 else if (strcmp(hookname, "mount") == 0)
26ddeedd 3727 which = LXCHOOK_MOUNT;
3f60c2f7 3728 else if (strcmp(hookname, "autodev") == 0)
f7bee6c6 3729 which = LXCHOOK_AUTODEV;
3f60c2f7 3730 else if (strcmp(hookname, "start") == 0)
26ddeedd 3731 which = LXCHOOK_START;
3f60c2f7 3732 else if (strcmp(hookname, "stop") == 0)
52492063 3733 which = LXCHOOK_STOP;
3f60c2f7 3734 else if (strcmp(hookname, "post-stop") == 0)
26ddeedd 3735 which = LXCHOOK_POSTSTOP;
3f60c2f7 3736 else if (strcmp(hookname, "clone") == 0)
148e91f5 3737 which = LXCHOOK_CLONE;
3f60c2f7 3738 else if (strcmp(hookname, "destroy") == 0)
37cf711b 3739 which = LXCHOOK_DESTROY;
26ddeedd
SH
3740 else
3741 return -1;
3f60c2f7 3742
0fd73091 3743 lxc_list_for_each (it, &conf->hooks[which]) {
26ddeedd 3744 int ret;
3f60c2f7
CB
3745 char *hook = it->elem;
3746
3747 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3748 hookname, argv);
3f60c2f7
CB
3749 if (ret < 0)
3750 return -1;
26ddeedd 3751 }
3f60c2f7 3752
26ddeedd
SH
3753 return 0;
3754}
72d0e1cb 3755
72d0e1cb
SG
3756int lxc_clear_config_caps(struct lxc_conf *c)
3757{
1a0e70ac 3758 struct lxc_list *it, *next;
72d0e1cb 3759
0fd73091 3760 lxc_list_for_each_safe (it, &c->caps, next) {
72d0e1cb
SG
3761 lxc_list_del(it);
3762 free(it->elem);
3763 free(it);
3764 }
0fd73091 3765
72d0e1cb
SG
3766 return 0;
3767}
3768
c7e345ae
CB
3769static int lxc_free_idmap(struct lxc_list *id_map)
3770{
27c27d73
SH
3771 struct lxc_list *it, *next;
3772
0fd73091 3773 lxc_list_for_each_safe (it, id_map, next) {
27c27d73
SH
3774 lxc_list_del(it);
3775 free(it->elem);
3776 free(it);
3777 }
c7e345ae 3778
27c27d73
SH
3779 return 0;
3780}
3781
4355ab5f
SH
3782int lxc_clear_idmaps(struct lxc_conf *c)
3783{
3784 return lxc_free_idmap(&c->id_map);
3785}
3786
1fb86a7c
SH
3787int lxc_clear_config_keepcaps(struct lxc_conf *c)
3788{
0fd73091 3789 struct lxc_list *it, *next;
1fb86a7c 3790
0fd73091 3791 lxc_list_for_each_safe (it, &c->keepcaps, next) {
1fb86a7c
SH
3792 lxc_list_del(it);
3793 free(it->elem);
3794 free(it);
3795 }
0fd73091 3796
1fb86a7c
SH
3797 return 0;
3798}
3799
54860ed0 3800int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
72d0e1cb 3801{
54860ed0 3802 char *global_token, *namespaced_token;
ab1a6cac 3803 size_t namespaced_token_len;
54860ed0 3804 struct lxc_list *it, *next, *list;
ab1a6cac 3805 const char *k = key;
54860ed0 3806 bool all = false;
72d0e1cb 3807
54860ed0
CB
3808 if (version == CGROUP2_SUPER_MAGIC) {
3809 global_token = "lxc.cgroup2";
3810 namespaced_token = "lxc.cgroup2.";
0fd73091 3811 namespaced_token_len = sizeof("lxc.cgroup2.") - 1;
54860ed0
CB
3812 list = &c->cgroup2;
3813 } else if (version == CGROUP_SUPER_MAGIC) {
3814 global_token = "lxc.cgroup";
3815 namespaced_token = "lxc.cgroup.";
0fd73091 3816 namespaced_token_len = sizeof("lxc.cgroup.") - 1;
54860ed0
CB
3817 list = &c->cgroup;
3818 } else {
ab1a6cac 3819 return -EINVAL;
54860ed0
CB
3820 }
3821
3822 if (strcmp(key, global_token) == 0)
72d0e1cb 3823 all = true;
54860ed0 3824 else if (strncmp(key, namespaced_token, sizeof(namespaced_token) - 1) == 0)
ab1a6cac 3825 k += namespaced_token_len;
a6390f01 3826 else
ab1a6cac 3827 return -EINVAL;
72d0e1cb 3828
0fd73091 3829 lxc_list_for_each_safe (it, list, next) {
72d0e1cb 3830 struct lxc_cgroup *cg = it->elem;
54860ed0 3831
72d0e1cb
SG
3832 if (!all && strcmp(cg->subsystem, k) != 0)
3833 continue;
54860ed0 3834
72d0e1cb
SG
3835 lxc_list_del(it);
3836 free(cg->subsystem);
3837 free(cg->value);
3838 free(cg);
3839 free(it);
3840 }
e409b214 3841
72d0e1cb
SG
3842 return 0;
3843}
3844
c6d09e15
WB
3845int lxc_clear_limits(struct lxc_conf *c, const char *key)
3846{
3847 struct lxc_list *it, *next;
c6d09e15 3848 const char *k = NULL;
0fd73091 3849 bool all = false;
c6d09e15 3850
b668653c 3851 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3852 all = true;
b668653c
CB
3853 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.") - 1) == 0)
3854 k = key + sizeof("lxc.limit.") - 1;
3855 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.") - 1) == 0)
3856 k = key + sizeof("lxc.prlimit.") - 1;
c6d09e15
WB
3857 else
3858 return -1;
3859
0fd73091 3860 lxc_list_for_each_safe (it, &c->limits, next) {
c6d09e15 3861 struct lxc_limit *lim = it->elem;
0fd73091 3862
c6d09e15
WB
3863 if (!all && strcmp(lim->resource, k) != 0)
3864 continue;
0fd73091 3865
c6d09e15
WB
3866 lxc_list_del(it);
3867 free(lim->resource);
3868 free(lim);
3869 free(it);
3870 }
b668653c 3871
c6d09e15
WB
3872 return 0;
3873}
3874
7edd0540
L
3875int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3876{
3877 struct lxc_list *it, *next;
7edd0540 3878 const char *k = NULL;
0fd73091 3879 bool all = false;
7edd0540
L
3880
3881 if (strcmp(key, "lxc.sysctl") == 0)
3882 all = true;
3883 else if (strncmp(key, "lxc.sysctl.", sizeof("lxc.sysctl.") - 1) == 0)
3884 k = key + sizeof("lxc.sysctl.") - 1;
3885 else
3886 return -1;
3887
0fd73091 3888 lxc_list_for_each_safe (it, &c->sysctls, next) {
7edd0540 3889 struct lxc_sysctl *elem = it->elem;
0fd73091 3890
7edd0540
L
3891 if (!all && strcmp(elem->key, k) != 0)
3892 continue;
0fd73091 3893
7edd0540
L
3894 lxc_list_del(it);
3895 free(elem->key);
3896 free(elem->value);
3897 free(elem);
3898 free(it);
3899 }
0fd73091 3900
7edd0540
L
3901 return 0;
3902}
3903
61d7a733
YT
3904int lxc_clear_procs(struct lxc_conf *c, const char *key)
3905{
0fd73091 3906 struct lxc_list *it, *next;
61d7a733 3907 const char *k = NULL;
0fd73091 3908 bool all = false;
61d7a733
YT
3909
3910 if (strcmp(key, "lxc.proc") == 0)
3911 all = true;
3912 else if (strncmp(key, "lxc.proc.", sizeof("lxc.proc.") - 1) == 0)
3913 k = key + sizeof("lxc.proc.") - 1;
3914 else
3915 return -1;
3916
0fd73091 3917 lxc_list_for_each_safe (it, &c->procs, next) {
61d7a733 3918 struct lxc_proc *proc = it->elem;
0fd73091 3919
61d7a733
YT
3920 if (!all && strcmp(proc->filename, k) != 0)
3921 continue;
0fd73091 3922
61d7a733
YT
3923 lxc_list_del(it);
3924 free(proc->filename);
3925 free(proc->value);
3926 free(proc);
3927 free(it);
3928 }
3929
3930 return 0;
3931}
3932
ee1e7aa0
SG
3933int lxc_clear_groups(struct lxc_conf *c)
3934{
0fd73091 3935 struct lxc_list *it, *next;
ee1e7aa0 3936
0fd73091 3937 lxc_list_for_each_safe (it, &c->groups, next) {
ee1e7aa0
SG
3938 lxc_list_del(it);
3939 free(it->elem);
3940 free(it);
3941 }
0fd73091 3942
ee1e7aa0
SG
3943 return 0;
3944}
3945
ab799c0b
SG
3946int lxc_clear_environment(struct lxc_conf *c)
3947{
0fd73091 3948 struct lxc_list *it, *next;
ab799c0b 3949
0fd73091 3950 lxc_list_for_each_safe (it, &c->environment, next) {
ab799c0b
SG
3951 lxc_list_del(it);
3952 free(it->elem);
3953 free(it);
3954 }
0fd73091 3955
ab799c0b
SG
3956 return 0;
3957}
3958
72d0e1cb
SG
3959int lxc_clear_mount_entries(struct lxc_conf *c)
3960{
0fd73091 3961 struct lxc_list *it, *next;
72d0e1cb 3962
0fd73091 3963 lxc_list_for_each_safe (it, &c->mount_list, next) {
72d0e1cb
SG
3964 lxc_list_del(it);
3965 free(it->elem);
3966 free(it);
3967 }
0fd73091 3968
72d0e1cb
SG
3969 return 0;
3970}
3971
b099e9e9
SH
3972int lxc_clear_automounts(struct lxc_conf *c)
3973{
3974 c->auto_mounts = 0;
3975 return 0;
3976}
3977
12a50cc6 3978int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3979{
72d0e1cb 3980 int i;
0fd73091
CB
3981 struct lxc_list *it, *next;
3982 const char *k = NULL;
3983 bool all = false, done = false;
72d0e1cb 3984
17ed13a3
SH
3985 if (strcmp(key, "lxc.hook") == 0)
3986 all = true;
0fd73091
CB
3987 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.") - 1) == 0)
3988 k = key + sizeof("lxc.hook.") - 1;
a6390f01
WB
3989 else
3990 return -1;
17ed13a3 3991
0fd73091 3992 for (i = 0; i < NUM_LXC_HOOKS; i++) {
17ed13a3 3993 if (all || strcmp(k, lxchook_names[i]) == 0) {
0fd73091 3994 lxc_list_for_each_safe (it, &c->hooks[i], next) {
17ed13a3
SH
3995 lxc_list_del(it);
3996 free(it->elem);
3997 free(it);
3998 }
0fd73091 3999
17ed13a3 4000 done = true;
72d0e1cb
SG
4001 }
4002 }
17ed13a3
SH
4003
4004 if (!done) {
4005 ERROR("Invalid hook key: %s", key);
4006 return -1;
4007 }
0fd73091 4008
72d0e1cb
SG
4009 return 0;
4010}
8eb5694b 4011
4184c3e1
SH
4012static inline void lxc_clear_aliens(struct lxc_conf *conf)
4013{
0fd73091 4014 struct lxc_list *it, *next;
4184c3e1 4015
0fd73091 4016 lxc_list_for_each_safe (it, &conf->aliens, next) {
4184c3e1
SH
4017 lxc_list_del(it);
4018 free(it->elem);
4019 free(it);
4020 }
4021}
4022
c7b15d1e 4023void lxc_clear_includes(struct lxc_conf *conf)
f979ac15 4024{
0fd73091 4025 struct lxc_list *it, *next;
f979ac15 4026
0fd73091 4027 lxc_list_for_each_safe (it, &conf->includes, next) {
f979ac15
SH
4028 lxc_list_del(it);
4029 free(it->elem);
4030 free(it);
4031 }
4032}
4033
8eb5694b
SH
4034void lxc_conf_free(struct lxc_conf *conf)
4035{
4036 if (!conf)
4037 return;
0fd73091 4038
858377e4
SH
4039 if (current_config == conf)
4040 current_config = NULL;
aed105d5 4041 lxc_terminal_conf_free(&conf->console);
f10fad2f 4042 free(conf->rootfs.mount);
b3b8c97f 4043 free(conf->rootfs.bdev_type);
f10fad2f
ME
4044 free(conf->rootfs.options);
4045 free(conf->rootfs.path);
f10fad2f 4046 free(conf->logfile);
858377e4
SH
4047 if (conf->logfd != -1)
4048 close(conf->logfd);
f10fad2f 4049 free(conf->utsname);
885766f5
CB
4050 free(conf->ttys.dir);
4051 free(conf->ttys.tty_names);
f10fad2f
ME
4052 free(conf->fstab);
4053 free(conf->rcfile);
5cda27c1 4054 free(conf->execute_cmd);
f10fad2f 4055 free(conf->init_cmd);
3c491553 4056 free(conf->init_cwd);
6b0d5538 4057 free(conf->unexpanded_config);
76d0127f 4058 free(conf->syslog);
c302b476 4059 lxc_free_networks(&conf->network);
f10fad2f
ME
4060 free(conf->lsm_aa_profile);
4061 free(conf->lsm_se_context);
769872f9 4062 lxc_seccomp_free(conf);
8eb5694b 4063 lxc_clear_config_caps(conf);
1fb86a7c 4064 lxc_clear_config_keepcaps(conf);
54860ed0
CB
4065 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
4066 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
17ed13a3 4067 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4068 lxc_clear_mount_entries(conf);
27c27d73 4069 lxc_clear_idmaps(conf);
ee1e7aa0 4070 lxc_clear_groups(conf);
f979ac15 4071 lxc_clear_includes(conf);
761d81ca 4072 lxc_clear_aliens(conf);
ab799c0b 4073 lxc_clear_environment(conf);
240d4b74 4074 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 4075 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 4076 lxc_clear_procs(conf, "lxc.proc");
43654d34
CB
4077 free(conf->cgroup_meta.dir);
4078 free(conf->cgroup_meta.controllers);
adf0ba1f
LT
4079 free(conf->lxc_shmount.path_host);
4080 free(conf->lxc_shmount.path_cont);
8eb5694b
SH
4081 free(conf);
4082}
4355ab5f
SH
4083
/* Arguments handed to run_userns_fn() by the userns_exec_*() helpers. */
struct userns_fn_data {
	/* Callback to invoke once the parent has signalled readiness. */
	int (*fn)(void *);
	/* Optional name of @fn, used for trace logging only; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed through to @fn. */
	void *arg;
	/* Synchronisation pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4090
4091static int run_userns_fn(void *data)
4092{
4355ab5f 4093 char c;
0fd73091 4094 struct userns_fn_data *d = data;
4355ab5f 4095
f8aa4bf3 4096 /* Close write end of the pipe. */
4355ab5f 4097 close(d->p[1]);
f8aa4bf3
CB
4098
4099 /* Wait for parent to finish establishing a new mapping in the user
4100 * namespace we are executing in.
4101 */
489f39be 4102 if (lxc_read_nointr(d->p[0], &c, 1) != 1)
4355ab5f 4103 return -1;
f8aa4bf3
CB
4104
4105 /* Close read end of the pipe. */
4355ab5f 4106 close(d->p[0]);
f8aa4bf3 4107
c9b7c33e
CB
4108 if (d->fn_name)
4109 TRACE("calling function \"%s\"", d->fn_name);
0fd73091 4110
f8aa4bf3 4111 /* Call function to run. */
4355ab5f
SH
4112 return d->fn(d->arg);
4113}
4114
db7cfe23
CB
4115static struct id_map *mapped_nsid_add(struct lxc_conf *conf, unsigned id,
4116 enum idtype idtype)
4117{
5173b710
CB
4118 const struct id_map *map;
4119 struct id_map *retmap;
db7cfe23
CB
4120
4121 map = find_mapped_nsid_entry(conf, id, idtype);
4122 if (!map)
4123 return NULL;
4124
4125 retmap = malloc(sizeof(*retmap));
4126 if (!retmap)
4127 return NULL;
4128
4129 memcpy(retmap, map, sizeof(*retmap));
4130 return retmap;
4131}
4132
c4333195
CB
4133static struct id_map *find_mapped_hostid_entry(struct lxc_conf *conf,
4134 unsigned id, enum idtype idtype)
f8aa4bf3 4135{
f8aa4bf3 4136 struct id_map *map;
0fd73091 4137 struct lxc_list *it;
f8aa4bf3
CB
4138 struct id_map *retmap = NULL;
4139
0fd73091 4140 lxc_list_for_each (it, &conf->id_map) {
f8aa4bf3
CB
4141 map = it->elem;
4142 if (map->idtype != idtype)
4143 continue;
4144
4145 if (id >= map->hostid && id < map->hostid + map->range) {
4146 retmap = map;
4147 break;
4148 }
4149 }
4150
f8aa4bf3
CB
4151 return retmap;
4152}
4153
0fd73091 4154/* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
f8aa4bf3 4155 * existing one or establish a new one.
4355ab5f 4156 */
0fd73091
CB
4157static struct id_map *mapped_hostid_add(struct lxc_conf *conf, uid_t id,
4158 enum idtype type)
4355ab5f 4159{
28a2d9e7 4160 int hostid_mapped;
c4333195
CB
4161 struct id_map *entry = NULL, *tmp = NULL;
4162
4163 entry = malloc(sizeof(*entry));
4164 if (!entry)
4165 return NULL;
f8aa4bf3 4166
28a2d9e7 4167 /* Reuse existing mapping. */
c4333195
CB
4168 tmp = find_mapped_hostid_entry(conf, id, type);
4169 if (tmp)
4170 return memcpy(entry, tmp, sizeof(*entry));
f8aa4bf3 4171
28a2d9e7
CB
4172 /* Find new mapping. */
4173 hostid_mapped = find_unmapped_nsid(conf, type);
4174 if (hostid_mapped < 0) {
c4333195
CB
4175 DEBUG("Failed to find free mapping for id %d", id);
4176 free(entry);
28a2d9e7 4177 return NULL;
f8aa4bf3 4178 }
f8aa4bf3 4179
28a2d9e7
CB
4180 entry->idtype = type;
4181 entry->nsid = hostid_mapped;
4182 entry->hostid = (unsigned long)id;
4183 entry->range = 1;
4355ab5f 4184
28a2d9e7 4185 return entry;
4355ab5f
SH
4186}
4187
dcf0ffdf 4188struct lxc_list *get_minimal_idmap(struct lxc_conf *conf)
4355ab5f 4189{
f8aa4bf3 4190 uid_t euid, egid;
4160c3a0
CB
4191 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4192 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
f8aa4bf3 4193 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4194 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4195 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4196
db7cfe23 4197 /* Find container root mappings. */
4160c3a0 4198 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
db7cfe23 4199 if (!container_root_uid) {
dcf0ffdf 4200 DEBUG("Failed to find mapping for namespace uid %d", 0);
db7cfe23 4201 goto on_error;
f8aa4bf3 4202 }
dcf0ffdf
CB
4203 euid = geteuid();
4204 if (euid >= container_root_uid->hostid &&
4205 euid < (container_root_uid->hostid + container_root_uid->range))
db7cfe23 4206 host_uid_map = container_root_uid;
f8aa4bf3 4207
4160c3a0 4208 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
db7cfe23 4209 if (!container_root_gid) {
dcf0ffdf 4210 DEBUG("Failed to find mapping for namespace gid %d", 0);
f8aa4bf3
CB
4211 goto on_error;
4212 }
dcf0ffdf
CB
4213 egid = getegid();
4214 if (egid >= container_root_gid->hostid &&
4215 egid < (container_root_gid->hostid + container_root_gid->range))
db7cfe23 4216 host_gid_map = container_root_gid;
f8aa4bf3
CB
4217
4218 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 4219 if (!host_uid_map)
c4333195 4220 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
28a2d9e7 4221 if (!host_uid_map) {
db7cfe23 4222 DEBUG("Failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4223 goto on_error;
4224 }
4225
dcf0ffdf
CB
4226 if (!host_gid_map)
4227 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
28a2d9e7 4228 if (!host_gid_map) {
db7cfe23 4229 DEBUG("Failed to find mapping for gid %d", egid);
28a2d9e7
CB
4230 goto on_error;
4231 }
4232
4233 /* Allocate new {g,u}id map list. */
4234 idmap = malloc(sizeof(*idmap));
4235 if (!idmap)
4236 goto on_error;
4237 lxc_list_init(idmap);
4238
f8aa4bf3
CB
4239 /* Add container root to the map. */
4240 tmplist = malloc(sizeof(*tmplist));
4241 if (!tmplist)
4242 goto on_error;
4243 lxc_list_add_elem(tmplist, container_root_uid);
4244 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4245
1d90e064 4246 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4247 /* idmap will now keep track of that memory. */
4248 container_root_uid = NULL;
4249
4250 /* Add container root to the map. */
4251 tmplist = malloc(sizeof(*tmplist));
4252 if (!tmplist)
4253 goto on_error;
4254 lxc_list_add_elem(tmplist, host_uid_map);
4255 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4256 }
1d90e064
CB
4257 /* idmap will now keep track of that memory. */
4258 container_root_uid = NULL;
4259 /* idmap will now keep track of that memory. */
4260 host_uid_map = NULL;
f8aa4bf3
CB
4261
4262 tmplist = malloc(sizeof(*tmplist));
4263 if (!tmplist)
4264 goto on_error;
4265 lxc_list_add_elem(tmplist, container_root_gid);
4266 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4267
1d90e064 4268 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4269 /* idmap will now keep track of that memory. */
4270 container_root_gid = NULL;
4271
4272 tmplist = malloc(sizeof(*tmplist));
4273 if (!tmplist)
4274 goto on_error;
4275 lxc_list_add_elem(tmplist, host_gid_map);
4276 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4277 }
1d90e064
CB
4278 /* idmap will now keep track of that memory. */
4279 container_root_gid = NULL;
4280 /* idmap will now keep track of that memory. */
4281 host_gid_map = NULL;
f8aa4bf3 4282
dcf0ffdf
CB
4283 TRACE("Allocated minimal idmapping");
4284 return idmap;
4285
4286on_error:
4dc41f99 4287 if (idmap) {
dcf0ffdf 4288 lxc_free_idmap(idmap);
4dc41f99
SX
4289 free(idmap);
4290 }
dcf0ffdf
CB
4291 if (container_root_uid)
4292 free(container_root_uid);
4293 if (container_root_gid)
4294 free(container_root_gid);
4295 if (host_uid_map && (host_uid_map != container_root_uid))
4296 free(host_uid_map);
4297 if (host_gid_map && (host_gid_map != container_root_gid))
4298 free(host_gid_map);
4299
4300 return NULL;
4301}
4302
4303/* Run a function in a new user namespace.
4304 * The caller's euid/egid will be mapped if it is not already.
4305 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4306 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4307 * This means we require only to establish a mapping from:
4308 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4309 * - the container root -> some sub{g,u}id
4310 * The former we add, if the user did not specifiy a mapping. The latter we
4311 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4312 * there to start the container in the first place.
4313 */
4314int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4315 const char *fn_name)
4316{
4317 pid_t pid;
dcf0ffdf 4318 int p[2];
0fd73091 4319 struct userns_fn_data d;
dcf0ffdf 4320 struct lxc_list *idmap;
0fd73091
CB
4321 int ret = -1, status = -1;
4322 char c = '1';
dcf0ffdf 4323
2b2655a8
CB
4324 if (!conf)
4325 return -EINVAL;
4326
dcf0ffdf
CB
4327 idmap = get_minimal_idmap(conf);
4328 if (!idmap)
4329 return -1;
4330
4331 ret = pipe(p);
4332 if (ret < 0) {
4333 SYSERROR("Failed to create pipe");
4334 return -1;
4335 }
4336 d.fn = fn;
4337 d.fn_name = fn_name;
4338 d.arg = data;
4339 d.p[0] = p[0];
4340 d.p[1] = p[1];
4341
4342 /* Clone child in new user namespace. */
4343 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER);
4344 if (pid < 0) {
0fd73091 4345 ERROR("Failed to clone process in new user namespace");
dcf0ffdf
CB
4346 goto on_error;
4347 }
4348
4349 close(p[0]);
4350 p[0] = -1;
4351
4b73005c
CB
4352 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4353 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
dcf0ffdf 4354 struct id_map *map;
0fd73091 4355 struct lxc_list *it;
dcf0ffdf 4356
0fd73091 4357 lxc_list_for_each (it, idmap) {
f8aa4bf3 4358 map = it->elem;
dcf0ffdf 4359 TRACE("Establishing %cid mapping for \"%d\" in new "
f8aa4bf3 4360 "user namespace: nsuid %lu - hostid %lu - range "
0fd73091
CB
4361 "%lu",
4362 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4363 map->nsid, map->hostid, map->range);
f8aa4bf3 4364 }
4355ab5f
SH
4365 }
4366
f8aa4bf3 4367 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4368 ret = lxc_map_ids(idmap, pid);
f8aa4bf3 4369 if (ret < 0) {
0fd73091 4370 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
f8aa4bf3 4371 goto on_error;
4355ab5f
SH
4372 }
4373
f8aa4bf3 4374 /* Tell child to proceed. */
489f39be 4375 if (lxc_write_nointr(p[1], &c, 1) != 1) {
dcf0ffdf 4376 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
f8aa4bf3 4377 goto on_error;
4355ab5f
SH
4378 }
4379
686dd5d1 4380on_error:
4355ab5f
SH
4381 if (p[0] != -1)
4382 close(p[0]);
4383 close(p[1]);
f8aa4bf3 4384
ee1b16bc
TA
4385 /* Wait for child to finish. */
4386 if (pid > 0)
4387 status = wait_for_pid(pid);
4388
686dd5d1
CB
4389 if (status < 0)
4390 ret = -1;
4391
f8aa4bf3 4392 return ret;
4355ab5f 4393}
97e9cfa0 4394
415a8851
CB
4395int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4396 const char *fn_name)
4397{
4398 pid_t pid;
4399 uid_t euid, egid;
415a8851
CB
4400 int p[2];
4401 struct id_map *map;
4402 struct lxc_list *cur;
0fd73091 4403 struct userns_fn_data d;
415a8851 4404 int ret = -1;
0fd73091 4405 char c = '1';
415a8851
CB
4406 struct lxc_list *idmap = NULL, *tmplist = NULL;
4407 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4408 *host_uid_map = NULL, *host_gid_map = NULL;
4409
2b2655a8
CB
4410 if (!conf)
4411 return -EINVAL;
4412
415a8851
CB
4413 ret = pipe(p);
4414 if (ret < 0) {
4415 SYSERROR("opening pipe");
4416 return -1;
4417 }
4418 d.fn = fn;
4419 d.fn_name = fn_name;
4420 d.arg = data;
4421 d.p[0] = p[0];
4422 d.p[1] = p[1];
4423
4424 /* Clone child in new user namespace. */
4425 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4426 if (pid < 0) {
0fd73091 4427 ERROR("Failed to clone process in new user namespace");
415a8851
CB
4428 goto on_error;
4429 }
4430
4431 close(p[0]);
4432 p[0] = -1;
4433
4434 euid = geteuid();
4435 egid = getegid();
4436
4437 /* Allocate new {g,u}id map list. */
4438 idmap = malloc(sizeof(*idmap));
4439 if (!idmap)
4440 goto on_error;
4441 lxc_list_init(idmap);
4442
4443 /* Find container root. */
0fd73091 4444 lxc_list_for_each (cur, &conf->id_map) {
415a8851
CB
4445 struct id_map *tmpmap;
4446
4447 tmplist = malloc(sizeof(*tmplist));
4448 if (!tmplist)
4449 goto on_error;
4450
4451 tmpmap = malloc(sizeof(*tmpmap));
4452 if (!tmpmap) {
4453 free(tmplist);
4454 goto on_error;
4455 }
4456
4457 memset(tmpmap, 0, sizeof(*tmpmap));
4458 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4459 tmplist->elem = tmpmap;
4460
4461 lxc_list_add_tail(idmap, tmplist);
4462
4463 map = cur->elem;
4464
4465 if (map->idtype == ID_TYPE_UID)
4466 if (euid >= map->hostid && euid < map->hostid + map->range)
4467 host_uid_map = map;
4468
4469 if (map->idtype == ID_TYPE_GID)
4470 if (egid >= map->hostid && egid < map->hostid + map->range)
4471 host_gid_map = map;
4472
4473 if (map->nsid != 0)
4474 continue;
4475
4476 if (map->idtype == ID_TYPE_UID)
4477 if (container_root_uid == NULL)
4478 container_root_uid = map;
4479
4480 if (map->idtype == ID_TYPE_GID)
4481 if (container_root_gid == NULL)
4482 container_root_gid = map;
4483 }
4484
4485 if (!container_root_uid || !container_root_gid) {
4486 ERROR("No mapping for container root found");
4487 goto on_error;
4488 }
4489
4490 /* Check whether the {g,u}id of the user has a mapping. */
4491 if (!host_uid_map)
c4333195 4492 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
415a8851
CB
4493 else
4494 host_uid_map = container_root_uid;
4495
4496 if (!host_gid_map)
c4333195 4497 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
415a8851
CB
4498 else
4499 host_gid_map = container_root_gid;
4500
4501 if (!host_uid_map) {
4502 DEBUG("Failed to find mapping for uid %d", euid);
4503 goto on_error;
4504 }
4505
4506 if (!host_gid_map) {
4507 DEBUG("Failed to find mapping for gid %d", egid);
4508 goto on_error;
4509 }
4510
4511 if (host_uid_map && (host_uid_map != container_root_uid)) {
4512 /* Add container root to the map. */
4513 tmplist = malloc(sizeof(*tmplist));
4514 if (!tmplist)
4515 goto on_error;
4516 lxc_list_add_elem(tmplist, host_uid_map);
4517 lxc_list_add_tail(idmap, tmplist);
4518 }
4519 /* idmap will now keep track of that memory. */
4520 host_uid_map = NULL;
4521
4522 if (host_gid_map && (host_gid_map != container_root_gid)) {
4523 tmplist = malloc(sizeof(*tmplist));
4524 if (!tmplist)
4525 goto on_error;
4526 lxc_list_add_elem(tmplist, host_gid_map);
4527 lxc_list_add_tail(idmap, tmplist);
4528 }
4529 /* idmap will now keep track of that memory. */
4530 host_gid_map = NULL;
4531
4532 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4533 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
0fd73091 4534 lxc_list_for_each (cur, idmap) {
415a8851
CB
4535 map = cur->elem;
4536 TRACE("establishing %cid mapping for \"%d\" in new "
4537 "user namespace: nsuid %lu - hostid %lu - range "
4538 "%lu",
4539 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4540 map->nsid, map->hostid, map->range);
4541 }
4542 }
4543
4544 /* Set up {g,u}id mapping for user namespace of child process. */
4545 ret = lxc_map_ids(idmap, pid);
4546 if (ret < 0) {
0fd73091 4547 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
415a8851
CB
4548 goto on_error;
4549 }
4550
4551 /* Tell child to proceed. */
489f39be 4552 if (lxc_write_nointr(p[1], &c, 1) != 1) {
0fd73091 4553 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
415a8851
CB
4554 goto on_error;
4555 }
4556
686dd5d1 4557on_error:
ee1b16bc
TA
4558 if (p[0] != -1)
4559 close(p[0]);
4560 close(p[1]);
4561
415a8851 4562 /* Wait for child to finish. */
686dd5d1
CB
4563 if (pid > 0)
4564 ret = wait_for_pid(pid);
415a8851 4565
80758b4b 4566 if (idmap) {
415a8851 4567 lxc_free_idmap(idmap);
80758b4b
DJ
4568 free(idmap);
4569 }
4570
415a8851
CB
4571 if (host_uid_map && (host_uid_map != container_root_uid))
4572 free(host_uid_map);
4573 if (host_gid_map && (host_gid_map != container_root_gid))
4574 free(host_gid_map);
4575
415a8851
CB
4576 return ret;
4577}
4578
/* Return a newly allocated copy of the login name belonging to the effective
 * uid, or NULL if the lookup or an allocation fails. The caller frees the
 * result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd pwent;
	struct passwd *result = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long size_hint;
	int err;

	/* sysconf() returns -1 when it cannot suggest a buffer size. */
	size_hint = sysconf(_SC_GETPW_R_SIZE_MAX);
	scratch_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getpwuid_r(geteuid(), &pwent, scratch, scratch_len, &result);
	if (result) {
		/* pw_name points into scratch, so copy before releasing it. */
		name = strdup(pwent.pw_name);
		free(scratch);
		return name;
	}

	/* A zero return with no result means "no such entry". */
	if (err == 0)
		WARN("Could not find matched password record.");

	ERROR("Failed to get password record - %u", geteuid());
	free(scratch);

	return NULL;
}
4612
/* Return a newly allocated copy of the name of the group belonging to the
 * effective gid, or NULL if the lookup or an allocation fails. The caller
 * frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group grent;
	struct group *result = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long size_hint;
	int err;

	/* sysconf() returns -1 when it cannot suggest a buffer size. */
	size_hint = sysconf(_SC_GETGR_R_SIZE_MAX);
	scratch_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getgrgid_r(getegid(), &grent, scratch, scratch_len, &result);
	if (result) {
		/* gr_name points into scratch, so copy before releasing it. */
		name = strdup(grent.gr_name);
		free(scratch);
		return name;
	}

	/* A zero return with no result means "no such entry". */
	if (err == 0)
		WARN("Could not find matched group record");

	ERROR("Failed to get group record - %u", getegid());
	free(scratch);

	return NULL;
}
4646
a96a8e8c 4647/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4648void suggest_default_idmap(void)
4649{
0fd73091 4650 char *uname, *gname;
97e9cfa0
SH
4651 FILE *f;
4652 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
97e9cfa0 4653 size_t len = 0;
0fd73091 4654 char *line = NULL;
97e9cfa0 4655
0fd73091
CB
4656 uname = getuname();
4657 if (!uname)
97e9cfa0
SH
4658 return;
4659
0fd73091
CB
4660 gname = getgname();
4661 if (!gname) {
97e9cfa0
SH
4662 free(uname);
4663 return;
4664 }
4665
4666 f = fopen(subuidfile, "r");
4667 if (!f) {
4668 ERROR("Your system is not configured with subuids");
4669 free(gname);
4670 free(uname);
4671 return;
4672 }
0fd73091 4673
97e9cfa0 4674 while (getline(&line, &len, f) != -1) {
0fd73091 4675 char *p, *p2;
b7930180 4676 size_t no_newline = 0;
0fd73091
CB
4677
4678 p = strchr(line, ':');
97e9cfa0
SH
4679 if (*line == '#')
4680 continue;
4681 if (!p)
4682 continue;
4683 *p = '\0';
4684 p++;
0fd73091 4685
97e9cfa0
SH
4686 if (strcmp(line, uname))
4687 continue;
0fd73091 4688
97e9cfa0
SH
4689 p2 = strchr(p, ':');
4690 if (!p2)
4691 continue;
4692 *p2 = '\0';
4693 p2++;
4694 if (!*p2)
4695 continue;
b7930180
CB
4696 no_newline = strcspn(p2, "\n");
4697 p2[no_newline] = '\0';
4698
b7b2fde4 4699 if (lxc_safe_uint(p, &uid) < 0)
0fd73091 4700 WARN("Could not parse UID");
b7b2fde4 4701 if (lxc_safe_uint(p2, &urange) < 0)
0fd73091 4702 WARN("Could not parse UID range");
97e9cfa0
SH
4703 }
4704 fclose(f);
4705
6be7389a 4706 f = fopen(subgidfile, "r");
97e9cfa0
SH
4707 if (!f) {
4708 ERROR("Your system is not configured with subgids");
4709 free(gname);
4710 free(uname);
4711 return;
4712 }
0fd73091 4713
97e9cfa0 4714 while (getline(&line, &len, f) != -1) {
0fd73091 4715 char *p, *p2;
b7930180 4716 size_t no_newline = 0;
0fd73091
CB
4717
4718 p = strchr(line, ':');
97e9cfa0
SH
4719 if (*line == '#')
4720 continue;
4721 if (!p)
4722 continue;
4723 *p = '\0';
4724 p++;
0fd73091 4725
97e9cfa0
SH
4726 if (strcmp(line, uname))
4727 continue;
0fd73091 4728
97e9cfa0
SH
4729 p2 = strchr(p, ':');
4730 if (!p2)
4731 continue;
4732 *p2 = '\0';
4733 p2++;
4734 if (!*p2)
4735 continue;
b7930180
CB
4736 no_newline = strcspn(p2, "\n");
4737 p2[no_newline] = '\0';
4738
b7b2fde4 4739 if (lxc_safe_uint(p, &gid) < 0)
0fd73091 4740 WARN("Could not parse GID");
b7b2fde4 4741 if (lxc_safe_uint(p2, &grange) < 0)
0fd73091 4742 WARN("Could not parse GID range");
97e9cfa0
SH
4743 }
4744 fclose(f);
4745
f10fad2f 4746 free(line);
97e9cfa0
SH
4747
4748 if (!urange || !grange) {
4749 ERROR("You do not have subuids or subgids allocated");
4750 ERROR("Unprivileged containers require subuids and subgids");
fbd4a4d1 4751 free(uname);
1e7cd2f7 4752 free(gname);
97e9cfa0
SH
4753 return;
4754 }
4755
4756 ERROR("You must either run as root, or define uid mappings");
4757 ERROR("To pass uid mappings to lxc-create, you could create");
4758 ERROR("~/.config/lxc/default.conf:");
4759 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4760 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4761 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0
SH
4762
4763 free(gname);
4764 free(uname);
4765}
aaf26830 4766
a7307747
SH
4767static void free_cgroup_settings(struct lxc_list *result)
4768{
4769 struct lxc_list *iterator, *next;
4770
0fd73091 4771 lxc_list_for_each_safe (iterator, result, next) {
a7307747
SH
4772 lxc_list_del(iterator);
4773 free(iterator);
4774 }
4775 free(result);
4776}
4777
0fd73091 4778/* Return the list of cgroup_settings sorted according to the following rules
aaf26830
KT
4779 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4780 */
0fd73091 4781struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
aaf26830
KT
4782{
4783 struct lxc_list *result;
aaf26830 4784 struct lxc_cgroup *cg = NULL;
0fd73091 4785 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
aaf26830
KT
4786
4787 result = malloc(sizeof(*result));
0fd73091 4788 if (!result)
fac7c663 4789 return NULL;
aaf26830
KT
4790 lxc_list_init(result);
4791
0fd73091
CB
4792 /* Iterate over the cgroup settings and copy them to the output list. */
4793 lxc_list_for_each (it, cgroup_settings) {
aaf26830 4794 item = malloc(sizeof(*item));
fac7c663 4795 if (!item) {
a7307747 4796 free_cgroup_settings(result);
fac7c663
KT
4797 return NULL;
4798 }
0fd73091 4799
aaf26830
KT
4800 item->elem = it->elem;
4801 cg = it->elem;
4802 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4803 /* Store the memsw_limit location */
4804 memsw_limit = item;
0fd73091
CB
4805 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4806 memsw_limit != NULL) {
4807 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4808 * before lxc.cgroup.memory.limit_in_bytes, swap these
4809 * two items */
aaf26830
KT
4810 item->elem = memsw_limit->elem;
4811 memsw_limit->elem = it->elem;
4812 }
4813 lxc_list_add_tail(result, item);
4814 }
4815
4816 return result;
a7307747 4817}