]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
network: pass info in env if hook version is 1
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
e8bd4e43 71#include "af_unix.h"
8f3e280e
CB
72#include "caps.h" /* for lxc_caps_last_cap() */
73#include "cgroup.h"
1b09f2c0 74#include "conf.h"
1ed6ba91 75#include "confile_utils.h"
8f3e280e 76#include "error.h"
1b09f2c0 77#include "log.h"
025ed0f3 78#include "lxclock.h"
8f3e280e 79#include "lxcseccomp.h"
4355ab5f 80#include "namespace.h"
8f3e280e
CB
81#include "network.h"
82#include "parse.h"
732375f5 83#include "ringbuf.h"
28d832c4
CB
84#include "storage.h"
85#include "storage/aufs.h"
86#include "storage/overlay.h"
8f3e280e 87#include "utils.h"
fe4de9a6 88#include "lsm/lsm.h"
d0a36f2c 89
e37dda71 90#if HAVE_LIBCAP
495d2046
SG
91#include <sys/capability.h>
92#endif
93
6ff05e18
SG
94#if HAVE_SYS_PERSONALITY_H
95#include <sys/personality.h>
96#endif
97
edaf8b1b
SG
98#if IS_BIONIC
99#include <../include/lxcmntent.h>
100#else
101#include <mntent.h>
102#endif
103
f48b5fd8
FF
104#if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
105#include <../include/prlimit.h>
106#endif
107
36eb9bde 108lxc_log_define(lxc_conf, lxc);
e5bda9ee 109
/*
 * pivot_root() fallback for C libraries that do not provide the wrapper.
 *
 * When HAVE_PIVOT_ROOT is set the libc symbol is only declared here.
 * Otherwise the raw system call is issued if its number is known at build
 * time; without a syscall number the call fails with errno set to ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
124
ecec0126
SG
125#ifndef MS_PRIVATE
126#define MS_PRIVATE (1<<18)
127#endif
128
8912711c
CB
129#ifndef MS_LAZYTIME
130#define MS_LAZYTIME (1<<25)
131#endif
132
2b9ae35a
CB
133char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
134 "autodev", "start", "stop",
08dd2805
SH
135 "post-stop", "clone", "destroy",
136 "start-host"};
72d0e1cb 137
/* One row of the mount option keyword table. */
struct mount_opt {
	/* Option keyword as written in a mount option string, e.g. "ro". */
	char *name;
	/*
	 * Set for keywords whose flag names the opposite setting ("rw" carries
	 * MS_RDONLY). NOTE(review): presumably tells the parser to clear the
	 * flag instead of setting it - confirm against the option parser.
	 */
	int clear;
	/* MS_* bit(s) associated with the keyword. */
	int flag;
};
143
/* One row of the capability name table. */
struct caps_opt {
	/* Lower-case capability name without the "CAP_" prefix. */
	char *name;
	/* Matching CAP_* constant. */
	int value;
};
148
/* One row of the resource limit name table. */
struct limit_opt {
	/* Lower-case limit name without the "RLIMIT_" prefix. */
	char *name;
	/* Matching RLIMIT_* constant. */
	int value;
};
153
/*
 * Configuration of the container that the API call in progress is working
 * on. It is consulted by the error reporting calls. The variable is
 * thread-local whenever the toolchain supports TLS (HAVE_TLS).
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
164
998ac676 165static struct mount_opt mount_opt[] = {
470b359b
CB
166 { "async", 1, MS_SYNCHRONOUS },
167 { "atime", 1, MS_NOATIME },
168 { "bind", 0, MS_BIND },
88d413d5 169 { "defaults", 0, 0 },
88d413d5 170 { "dev", 1, MS_NODEV },
470b359b 171 { "diratime", 1, MS_NODIRATIME },
88d413d5 172 { "dirsync", 0, MS_DIRSYNC },
470b359b 173 { "exec", 1, MS_NOEXEC },
8912711c 174 { "lazytime", 0, MS_LAZYTIME },
88d413d5 175 { "mand", 0, MS_MANDLOCK },
88d413d5 176 { "noatime", 0, MS_NOATIME },
470b359b 177 { "nodev", 0, MS_NODEV },
88d413d5 178 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
179 { "noexec", 0, MS_NOEXEC },
180 { "nomand", 1, MS_MANDLOCK },
181 { "norelatime", 1, MS_RELATIME },
182 { "nostrictatime", 1, MS_STRICTATIME },
183 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
184 { "rbind", 0, MS_BIND|MS_REC },
185 { "relatime", 0, MS_RELATIME },
470b359b
CB
186 { "remount", 0, MS_REMOUNT },
187 { "ro", 0, MS_RDONLY },
188 { "rw", 1, MS_RDONLY },
88d413d5 189 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
190 { "suid", 1, MS_NOSUID },
191 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 192 { NULL, 0, 0 },
998ac676
RT
193};
194
e37dda71 195#if HAVE_LIBCAP
81810dd1 196static struct caps_opt caps_opt[] = {
a6afdde9 197 { "chown", CAP_CHOWN },
1e11be34
DL
198 { "dac_override", CAP_DAC_OVERRIDE },
199 { "dac_read_search", CAP_DAC_READ_SEARCH },
200 { "fowner", CAP_FOWNER },
201 { "fsetid", CAP_FSETID },
81810dd1
DL
202 { "kill", CAP_KILL },
203 { "setgid", CAP_SETGID },
204 { "setuid", CAP_SETUID },
205 { "setpcap", CAP_SETPCAP },
206 { "linux_immutable", CAP_LINUX_IMMUTABLE },
207 { "net_bind_service", CAP_NET_BIND_SERVICE },
208 { "net_broadcast", CAP_NET_BROADCAST },
209 { "net_admin", CAP_NET_ADMIN },
210 { "net_raw", CAP_NET_RAW },
211 { "ipc_lock", CAP_IPC_LOCK },
212 { "ipc_owner", CAP_IPC_OWNER },
213 { "sys_module", CAP_SYS_MODULE },
214 { "sys_rawio", CAP_SYS_RAWIO },
215 { "sys_chroot", CAP_SYS_CHROOT },
216 { "sys_ptrace", CAP_SYS_PTRACE },
217 { "sys_pacct", CAP_SYS_PACCT },
218 { "sys_admin", CAP_SYS_ADMIN },
219 { "sys_boot", CAP_SYS_BOOT },
220 { "sys_nice", CAP_SYS_NICE },
221 { "sys_resource", CAP_SYS_RESOURCE },
222 { "sys_time", CAP_SYS_TIME },
223 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
224 { "mknod", CAP_MKNOD },
225 { "lease", CAP_LEASE },
57b837e2
CB
226#ifdef CAP_AUDIT_READ
227 { "audit_read", CAP_AUDIT_READ },
228#endif
9527e566 229#ifdef CAP_AUDIT_WRITE
81810dd1 230 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
231#endif
232#ifdef CAP_AUDIT_CONTROL
81810dd1 233 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 234#endif
81810dd1
DL
235 { "setfcap", CAP_SETFCAP },
236 { "mac_override", CAP_MAC_OVERRIDE },
237 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
238#ifdef CAP_SYSLOG
239 { "syslog", CAP_SYSLOG },
240#endif
241#ifdef CAP_WAKE_ALARM
242 { "wake_alarm", CAP_WAKE_ALARM },
243#endif
2b54359b
CB
244#ifdef CAP_BLOCK_SUSPEND
245 { "block_suspend", CAP_BLOCK_SUSPEND },
246#endif
81810dd1 247};
495d2046
SG
248#else
249static struct caps_opt caps_opt[] = {};
250#endif
81810dd1 251
c6d09e15
WB
252static struct limit_opt limit_opt[] = {
253#ifdef RLIMIT_AS
254 { "as", RLIMIT_AS },
255#endif
256#ifdef RLIMIT_CORE
257 { "core", RLIMIT_CORE },
258#endif
259#ifdef RLIMIT_CPU
260 { "cpu", RLIMIT_CPU },
261#endif
262#ifdef RLIMIT_DATA
263 { "data", RLIMIT_DATA },
264#endif
265#ifdef RLIMIT_FSIZE
266 { "fsize", RLIMIT_FSIZE },
267#endif
268#ifdef RLIMIT_LOCKS
269 { "locks", RLIMIT_LOCKS },
270#endif
271#ifdef RLIMIT_MEMLOCK
272 { "memlock", RLIMIT_MEMLOCK },
273#endif
274#ifdef RLIMIT_MSGQUEUE
275 { "msgqueue", RLIMIT_MSGQUEUE },
276#endif
277#ifdef RLIMIT_NICE
278 { "nice", RLIMIT_NICE },
279#endif
280#ifdef RLIMIT_NOFILE
281 { "nofile", RLIMIT_NOFILE },
282#endif
283#ifdef RLIMIT_NPROC
284 { "nproc", RLIMIT_NPROC },
285#endif
286#ifdef RLIMIT_RSS
287 { "rss", RLIMIT_RSS },
288#endif
289#ifdef RLIMIT_RTPRIO
290 { "rtprio", RLIMIT_RTPRIO },
291#endif
292#ifdef RLIMIT_RTTIME
293 { "rttime", RLIMIT_RTTIME },
294#endif
295#ifdef RLIMIT_SIGPENDING
296 { "sigpending", RLIMIT_SIGPENDING },
297#endif
298#ifdef RLIMIT_STACK
299 { "stack", RLIMIT_STACK },
300#endif
301};
302
91c3830e
SH
303static int run_buffer(char *buffer)
304{
ebec9176 305 struct lxc_popen_FILE *f;
91c3830e 306 char *output;
8e7da691 307 int ret;
91c3830e 308
ebec9176 309 f = lxc_popen(buffer);
91c3830e 310 if (!f) {
3f60c2f7 311 SYSERROR("Failed to popen() %s", buffer);
91c3830e
SH
312 return -1;
313 }
314
315 output = malloc(LXC_LOG_BUFFER_SIZE);
316 if (!output) {
3f60c2f7 317 ERROR("Failed to allocate memory for %s", buffer);
ebec9176 318 lxc_pclose(f);
91c3830e
SH
319 return -1;
320 }
321
062b72c6 322 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
3f60c2f7 323 DEBUG("Script %s with output: %s", buffer, output);
91c3830e
SH
324
325 free(output);
326
ebec9176 327 ret = lxc_pclose(f);
8e7da691 328 if (ret == -1) {
3f60c2f7 329 SYSERROR("Script exited with error");
91c3830e 330 return -1;
8e7da691 331 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
3f60c2f7 332 ERROR("Script exited with status %d", WEXITSTATUS(ret));
8e7da691
DE
333 return -1;
334 } else if (WIFSIGNALED(ret)) {
3f60c2f7 335 ERROR("Script terminated by signal %d", WTERMSIG(ret));
8e7da691 336 return -1;
91c3830e
SH
337 }
338
339 return 0;
340}
341
/* Export one hook environment variable and log the outcome. */
static int run_script_setenv(const char *key, const char *value)
{
	if (setenv(key, value, 1) < 0) {
		SYSERROR("Failed to set environment variable: %s=%s", key, value);
		return -1;
	}

	TRACE("Set environment variable: %s=%s", key, value);
	return 0;
}

/*
 * Export the network related hook variables taken from @argsin:
 * argsin[0] is the network type, argsin[1] the parent device and, for veth,
 * argsin[2] the peer device. Missing trailing entries are exported as "".
 *
 * Returns 0 on success, -EINVAL when no network type was passed and -1 when
 * the environment could not be modified.
 */
static int run_script_export_net_env(char **argsin)
{
	const char *type, *parent, *peer;

	if (!argsin || !argsin[0])
		return -EINVAL;

	type = argsin[0];
	if (run_script_setenv("LXC_NET_TYPE", type) < 0)
		return -1;

	parent = argsin[1] ? argsin[1] : "";

	if (strcmp(type, "veth") == 0) {
		/* argsin[2] only exists when argsin[1] did not already
		 * terminate the vector.
		 */
		peer = (argsin[1] && argsin[2]) ? argsin[2] : "";
		if (run_script_setenv("LXC_NET_PEER", peer) < 0)
			return -1;
	} else if (strcmp(type, "macvlan") != 0 && strcmp(type, "phys") != 0) {
		/* Other network types have nothing more to export. */
		return 0;
	}

	return run_script_setenv("LXC_NET_PARENT", parent);
}

/*
 * Build the command line for a hook script and run it via run_buffer().
 *
 * @name         container name
 * @hook_version 0 passes @name, @section and @hookname as the first three
 *               positional arguments of the script; any other value passes
 *               them through the environment instead (LXC_HOOK_TYPE,
 *               LXC_HOOK_SECTION and, for the "net" section, LXC_NET_TYPE,
 *               LXC_NET_PARENT and LXC_NET_PEER)
 * @section      configuration section the hook belongs to
 * @script       script to execute
 * @hookname     name of the hook
 * @argsin       NULL-terminated vector of extra arguments, may be NULL
 *
 * The extra arguments are appended to the command line for both versions.
 *
 * Returns the result of run_buffer() on success, -EFBIG when the command
 * line would exceed INT_MAX bytes, -EINVAL when a "net" hook was requested
 * without a network type and -1 on any other failure.
 */
int run_script_argv(const char *name, unsigned int hook_version,
		    const char *section, const char *script,
		    const char *hookname, char **argsin)
{
	int buf_pos, i, ret;
	int fret = -1;
	char *buffer;
	size_t size = 0;

	if (hook_version == 0)
		INFO("Executing script \"%s\" for container \"%s\", config "
		     "section \"%s\"", script, name, section);
	else
		INFO("Executing script \"%s\" for container \"%s\"", script, name);

	/* Every extra argument is preceded by a blank. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	/* "exec " (sizeof counts the blank), the script and the final NUL. */
	size += sizeof("exec");
	size += strlen(script);
	size++;

	/* The legacy arguments must be accounted for before allocating. */
	if (hook_version == 0) {
		size += strlen(section) + 1;
		size += strlen(name) + 1;
		size += strlen(hookname) + 1;
	}

	if (size > INT_MAX)
		return -EFBIG;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory for script \"%s\"", script);
		return -1;
	}

	if (hook_version == 0) {
		buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
		if (buf_pos < 0 || (size_t)buf_pos >= size) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto out;
		}
	} else {
		buf_pos = snprintf(buffer, size, "exec %s", script);
		if (buf_pos < 0 || (size_t)buf_pos >= size) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto out;
		}

		if (run_script_setenv("LXC_HOOK_TYPE", hookname) < 0)
			goto out;

		if (run_script_setenv("LXC_HOOK_SECTION", section) < 0)
			goto out;

		if (strcmp(section, "net") == 0) {
			ret = run_script_export_net_env(argsin);
			if (ret < 0) {
				fret = ret;
				goto out;
			}
		}
	}

	for (i = 0; argsin && argsin[i]; i++) {
		size_t len = size - buf_pos;

		ret = snprintf(buffer + buf_pos, len, " %s", argsin[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto out;
		}
		buf_pos += ret;
	}

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
475
/*
 * Build "exec <script> <name> <section> [args...]" and run it via
 * run_buffer().
 *
 * The variable arguments are C strings and must be terminated by a NULL
 * pointer; each one is appended to the command line, separated by a blank.
 *
 * Returns the result of run_buffer(), or -1 when the command line would
 * exceed INT_MAX bytes, memory could not be allocated or the command line
 * could not be formatted.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Every extra argument is preceded by a blank. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen("exec");
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating blanks plus the terminating NUL. */
	size += 4;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || (size_t)rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() needs its va_end(), also on error. */
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
527
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep it
 * open for the duration of the container run, to prevent the container from
 * marking the underlying fs read-only on shutdown. The file is unlinked
 * immediately so that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, path cannot be
 *           resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || *rootfs == '\0')
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, sizeof(absrootfspin), "%s/lxc.hold", absrootfs);
	/* Reject output errors as well as truncation. */
	if (ret < 0 || (size_t)ret >= sizeof(absrootfspin))
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return -1;

	/* The open descriptor keeps the file alive; the name is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
570
/*
 * When a remount is requested, carry over the nosuid, nodev, read-only and
 * noexec settings that are currently in force on the mount.
 *
 * @s is queried with statvfs(); when it is NULL @d is used instead. @flags
 * is returned unchanged when it does not contain MS_REMOUNT, when neither
 * path is given, when statvfs() fails or when the build lacks statvfs().
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	struct statvfs sb;
	const char *path;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	path = s ? s : d;
	if (!path || statvfs(path, &sb) < 0)
		return flags;

	return flags | (sb.f_flag & (MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC));
#endif
}
607
4fb3cba5 608static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 609{
368bbc02 610 int r;
80e80c40 611 int i;
b06b8511
CS
612 static struct {
613 int match_mask;
614 int match_flag;
615 const char *source;
616 const char *destination;
617 const char *fstype;
618 unsigned long flags;
619 const char *options;
620 } default_mounts[] = {
621 /* Read-only bind-mounting... In older kernels, doing that required
622 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
623 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
624 * kernel 2.6.26 onwards. However, this apparently does not work on
625 * kernel 3.8. Unfortunately, on that very same kernel, doing the
626 * same trick as above doesn't seem to work either, there one needs
627 * to ALSO specify MS_BIND for the remount, otherwise the entire
628 * fs is remounted read-only or the mount fails because it's busy...
629 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
630 * 2.6.32...
368bbc02 631 */
f24a52d5 632 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
633 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
634 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
635 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
636 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 637 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
638 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
639 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
640 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
641 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
642 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
643 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
644 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
645 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
646 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
647 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
648 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
649 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 650 };
368bbc02 651
b06b8511
CS
652 for (i = 0; default_mounts[i].match_mask; i++) {
653 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
654 char *source = NULL;
655 char *destination = NULL;
656 int saved_errno;
e2a7e8dc 657 unsigned long mflags;
b06b8511
CS
658
659 if (default_mounts[i].source) {
660 /* will act like strdup if %r is not present */
8ede5f4c 661 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
662 if (!source) {
663 SYSERROR("memory allocation error");
664 return -1;
665 }
666 }
cc4fd506
SH
667 if (!default_mounts[i].destination) {
668 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 669 free(source);
cc4fd506
SH
670 return -1;
671 }
672 /* will act like strdup if %r is not present */
673 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
674 if (!destination) {
675 saved_errno = errno;
676 SYSERROR("memory allocation error");
677 free(source);
678 errno = saved_errno;
679 return -1;
b06b8511 680 }
e2a7e8dc
SH
681 mflags = add_required_remount_flags(source, destination,
682 default_mounts[i].flags);
592fd47a 683 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 684 saved_errno = errno;
b88ff9a0
SG
685 if (r < 0 && errno == ENOENT) {
686 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
687 r = 0;
688 }
689 else if (r < 0)
e2a7e8dc 690 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 691
b06b8511
CS
692 free(source);
693 free(destination);
694 if (r < 0) {
b06b8511
CS
695 errno = saved_errno;
696 return -1;
697 }
368bbc02 698 }
368bbc02
CS
699 }
700
b06b8511 701 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
702 int cg_flags;
703
704 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
705 /* If the type of cgroup mount was not specified, it depends on the
706 * container's capabilities as to what makes sense: if we have
707 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
708 * anyway, so we may as well default to read-write; then the admin
709 * will not be given a false sense of security. (And if they really
710 * want mixed r/o r/w, then they can explicitly specify :mixed.)
711 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
712 * :mixed, because then the container can't remount it read-write. */
713 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
714 int has_sys_admin = 0;
b0ee5983
CB
715
716 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 717 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 718 else
0769b82a 719 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
720
721 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 722 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 723 else
0769b82a 724 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
725 }
726
8ede5f4c 727 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 728 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 729 return -1;
368bbc02
CS
730 }
731 }
732
368bbc02 733 return 0;
368bbc02
CS
734}
735
/*
 * Set the hostname to utsname->nodename.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 when
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);

	return 0;
}
750
/* A symlink to create under /dev: /dev/<name> -> <oldpath>. */
struct dev_symlinks {
	const char *oldpath;	/* link target */
	const char *name;	/* link name relative to /dev */
};

/* Standard stream links that point into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
762
ed8704d0 763static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
69aa6655
DE
764{
765 char path[MAXPATHLEN];
766 int ret,i;
09227be2 767 struct stat s;
69aa6655
DE
768
769
770 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
771 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 772 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
773 if (ret < 0 || ret >= MAXPATHLEN)
774 return -1;
09227be2
MW
775
776 /*
777 * Stat the path first. If we don't get an error
778 * accept it as is and don't try to create it
779 */
780 if (!stat(path, &s)) {
781 continue;
782 }
783
69aa6655 784 ret = symlink(d->oldpath, path);
09227be2 785
69aa6655 786 if (ret && errno != EEXIST) {
09227be2
MW
787 if ( errno == EROFS ) {
788 WARN("Warning: Read Only file system while creating %s", path);
789 } else {
790 SYSERROR("Error creating %s", path);
791 return -1;
792 }
69aa6655
DE
793 }
794 }
795 return 0;
796}
797
/*
 * Append @name to the space-separated list of ptys that is handed to
 * systemd.
 *
 * The first call (*pp == NULL) allocates the string "container_ttys=<name>";
 * later calls grow it by " <name>". Returns false when memory cannot be
 * allocated, in which case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	char *grown;

	if (*pp != NULL) {
		/* Room for the separating blank and the terminating NUL. */
		grown = realloc(*pp, strlen(*pp) + strlen(name) + 2);
		if (grown == NULL)
			return false;

		strcat(grown, " ");
		strcat(grown, name);
		*pp = grown;
		return true;
	}

	grown = malloc(strlen(name) + strlen("container_ttys=") + 1);
	if (grown == NULL)
		return false;

	sprintf(grown, "container_ttys=%s", name);
	*pp = grown;
	return true;
}
818
2187efd3 819static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 820{
9e1045e3 821 int i, ret;
393903d1
SH
822 const struct lxc_tty_info *tty_info = &conf->tty_info;
823 char *ttydir = conf->ttydir;
7c6ef2a2 824 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 825
e8bd4e43 826 if (!conf->rootfs.path)
bc9bd0e3
DL
827 return 0;
828
b0a33c1e 829 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 830 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
831
e8bd4e43 832 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 833 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 834 return -1;
9e1045e3 835
7c6ef2a2
SH
836 if (ttydir) {
837 /* create dev/lxc/tty%d" */
9e1045e3
CB
838 ret = snprintf(lxcpath, sizeof(lxcpath),
839 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 840 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 841 return -1;
9e1045e3 842
7c6ef2a2 843 ret = creat(lxcpath, 0660);
9e1045e3 844 if (ret < 0 && errno != EEXIST) {
73363c61 845 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
846 return -1;
847 }
4d44e274
SH
848 if (ret >= 0)
849 close(ret);
9e1045e3 850
7c6ef2a2 851 ret = unlink(path);
9e1045e3 852 if (ret < 0 && errno != ENOENT) {
73363c61 853 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
854 return -1;
855 }
b0a33c1e 856
9e1045e3
CB
857 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
858 if (ret < 0) {
73363c61 859 WARN("Failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
860 pty_info->name, path);
861 continue;
862 }
9e1045e3
CB
863 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
864 path);
13954cce 865
9e1045e3
CB
866 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
867 ttydir, i + 1);
73363c61 868 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 869 return -1;
9e1045e3 870
7c6ef2a2 871 ret = symlink(lxcpath, path);
9e1045e3 872 if (ret < 0) {
73363c61 873 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 874 path, lxcpath);
7c6ef2a2
SH
875 return -1;
876 }
877 } else {
9e1045e3
CB
878 /* If we populated /dev, then we need to create
879 * /dev/ttyN
880 */
881 ret = access(path, F_OK);
882 if (ret < 0) {
c6883f38 883 ret = creat(path, 0660);
9e1045e3 884 if (ret < 0) {
73363c61 885 SYSERROR("Failed to create \"%s\"", path);
c6883f38 886 /* this isn't fatal, continue */
025ed0f3 887 } else {
c6883f38 888 close(ret);
025ed0f3 889 }
c6883f38 890 }
9e1045e3
CB
891
892 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
893 if (ret < 0) {
73363c61 894 SYSERROR("Failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
895 continue;
896 }
9e1045e3 897
73363c61 898 DEBUG("Bind mounted \"%s\" onto \"%s\"", pty_info->name,
9e1045e3 899 path);
393903d1 900 }
9e1045e3 901
e8bd4e43 902 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
903 ERROR("Error setting up container_ttys string");
904 return -1;
b0a33c1e 905 }
906 }
907
73363c61 908 INFO("Finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 909 return 0;
910}
911
2187efd3
CB
912int lxc_allocate_ttys(const char *name, struct lxc_conf *conf)
913{
914 struct lxc_tty_info *tty_info = &conf->tty_info;
915 int i, ret;
916
917 /* no tty in the configuration */
918 if (!conf->tty)
919 return 0;
920
921 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
922 if (!tty_info->pty_info) {
923 SYSERROR("failed to allocate struct *pty_info");
924 return -ENOMEM;
925 }
926
927 for (i = 0; i < conf->tty; i++) {
928 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
929
930 process_lock();
931 ret = openpty(&pty_info->master, &pty_info->slave,
932 pty_info->name, NULL, NULL);
933 process_unlock();
934 if (ret) {
935 SYSERROR("failed to create pty device number %d", i);
936 tty_info->nbtty = i;
937 lxc_delete_tty(tty_info);
938 return -ENOTTY;
939 }
940
941 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
942 pty_info->name, pty_info->master, pty_info->slave);
943
944 /* Prevent leaking the file descriptors to the container */
945 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
946 if (ret < 0)
947 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
948 "pty device \"%s\": %s",
949 pty_info->master, pty_info->name, strerror(errno));
950
951 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
952 if (ret < 0)
953 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
954 "pty device \"%s\": %s",
955 pty_info->slave, pty_info->name, strerror(errno));
956
957 pty_info->busy = 0;
958 }
959
960 tty_info->nbtty = conf->tty;
961
962 INFO("finished allocating %d pts devices", conf->tty);
963 return 0;
964}
965
966void lxc_delete_tty(struct lxc_tty_info *tty_info)
967{
968 int i;
969
970 for (i = 0; i < tty_info->nbtty; i++) {
971 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
972
973 close(pty_info->master);
974 close(pty_info->slave);
975 }
976
977 free(tty_info->pty_info);
978 tty_info->pty_info = NULL;
979 tty_info->nbtty = 0;
980}
981
982static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
983{
984 int i;
985 struct lxc_conf *conf = handler->conf;
986 struct lxc_tty_info *tty_info = &conf->tty_info;
987 int sock = handler->data_sock[0];
988 int ret = -1;
989
990 if (!conf->tty)
991 return 0;
992
993 for (i = 0; i < conf->tty; i++) {
994 int ttyfds[2];
995 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
996
997 ttyfds[0] = pty_info->master;
998 ttyfds[1] = pty_info->slave;
999
1000 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1001 if (ret < 0)
1002 break;
1003
1004 TRACE("Send pty \"%s\" with master fd %d and slave fd %d to "
1005 "parent", pty_info->name, pty_info->master, pty_info->slave);
1006 }
1007
1008 if (ret < 0)
1009 ERROR("Failed to send %d ttys to parent: %s", conf->tty,
1010 strerror(errno));
1011 else
1012 TRACE("Sent %d ttys to parent", conf->tty);
1013
1014 return ret;
1015}
1016
1017static int lxc_create_ttys(struct lxc_handler *handler)
1018{
1019 int ret = -1;
1020 struct lxc_conf *conf = handler->conf;
1021
1022 ret = lxc_allocate_ttys(handler->name, conf);
1023 if (ret < 0) {
1024 ERROR("Failed to allocate ttys");
1025 goto on_error;
1026 }
1027
1028 ret = lxc_send_ttys_to_parent(handler);
1029 if (ret < 0) {
1030 ERROR("Failed to send ttys to parent");
1031 goto on_error;
1032 }
1033
1034 if (!conf->is_execute) {
1035 ret = lxc_setup_ttys(conf);
1036 if (ret < 0) {
1037 ERROR("Failed to setup ttys");
1038 goto on_error;
1039 }
1040 }
1041
1042 if (conf->pty_names) {
1043 ret = setenv("container_ttys", conf->pty_names, 1);
1044 if (ret < 0)
1045 SYSERROR("Failed to set \"container_ttys=%s\"", conf->pty_names);
1046 }
1047
1048 ret = 0;
1049
1050on_error:
1051 lxc_delete_tty(&conf->tty_info);
1052
1053 return ret;
1054}
1055
59bb8698 1056static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1057{
2d489f9e 1058 int oldroot = -1, newroot = -1;
bf601689 1059
2d489f9e
SH
1060 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1061 if (oldroot < 0) {
1062 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
1063 return -1;
1064 }
2d489f9e
SH
1065 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1066 if (newroot < 0) {
1067 SYSERROR("Error opening new-/ for fchdir");
1068 goto fail;
c08556c6 1069 }
bf601689 1070
cc6f6dd7 1071 /* change into new root fs */
2d489f9e 1072 if (fchdir(newroot)) {
cc6f6dd7 1073 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 1074 goto fail;
cc6f6dd7
DL
1075 }
1076
cc6f6dd7 1077 /* pivot_root into our new root fs */
2d489f9e 1078 if (pivot_root(".", ".")) {
cc6f6dd7 1079 SYSERROR("pivot_root syscall failed");
2d489f9e 1080 goto fail;
bf601689 1081 }
cc6f6dd7 1082
2d489f9e
SH
1083 /*
1084 * at this point the old-root is mounted on top of our new-root
1085 * To unmounted it we must not be chdir'd into it, so escape back
1086 * to old-root
1087 */
1088 if (fchdir(oldroot) < 0) {
1089 SYSERROR("Error entering oldroot");
1090 goto fail;
1091 }
7981ea46 1092 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1093 SYSERROR("Error detaching old root");
1094 goto fail;
cc6f6dd7
DL
1095 }
1096
2d489f9e
SH
1097 if (fchdir(newroot) < 0) {
1098 SYSERROR("Error re-entering newroot");
1099 goto fail;
1100 }
cc6f6dd7 1101
2d489f9e
SH
1102 close(oldroot);
1103 close(newroot);
bf601689 1104
2d489f9e 1105 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1106
bf601689 1107 return 0;
2d489f9e
SH
1108
1109fail:
1110 if (oldroot != -1)
1111 close(oldroot);
1112 if (newroot != -1)
1113 close(newroot);
1114 return -1;
bf601689
MH
1115}
1116
7133b912
CB
1117/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1118 * error, log it but don't fail yet.
91c3830e 1119 */
7133b912
CB
1120static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1121 const char *lxcpath)
91c3830e
SH
1122{
1123 int ret;
87da4ec3
SH
1124 size_t clen;
1125 char *path;
91c3830e 1126
7133b912 1127 INFO("Preparing \"/dev\"");
bc6928ff 1128
14221cbb 1129 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1130 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1131 path = alloca(clen);
bc6928ff 1132
ec50007f 1133 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1134 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1135 return -1;
bc6928ff 1136
87da4ec3 1137 if (!dir_exists(path)) {
7133b912
CB
1138 WARN("\"/dev\" directory does not exist. Proceeding without "
1139 "autodev being set up");
87da4ec3 1140 return 0;
bc6928ff 1141 }
87da4ec3 1142
1ec0e8e3 1143 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1144 rootfs->path ? rootfs->mount : NULL);
1145 if (ret < 0) {
1146 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1ec0e8e3 1147 return -1;
91c3830e 1148 }
7133b912 1149 INFO("Mounted tmpfs on \"%s\"", path);
87da4ec3 1150
ec50007f 1151 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
7133b912 1152 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1153 return -1;
87da4ec3 1154
7133b912 1155 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1156 * If not, then create it and exit if that fails...
1157 */
87da4ec3 1158 if (!dir_exists(path)) {
bc6928ff 1159 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
7133b912
CB
1160 if (ret < 0) {
1161 SYSERROR("Failed to create directory \"%s\"", path);
bc6928ff
MW
1162 return -1;
1163 }
91c3830e
SH
1164 }
1165
7133b912 1166 INFO("Prepared \"/dev\"");
91c3830e
SH
1167 return 0;
1168}
1169
/* One device node to be provided below the container's "/dev". */
struct lxc_devs {
	const char *name;	/* node name, appended to "<rootfs>/dev/" */
	mode_t mode;		/* mode argument handed to mknod() */
	int maj, min;		/* major/minor numbers handed to makedev() */
};
1176
74a3920a 1177static const struct lxc_devs lxc_devs[] = {
06749971
CB
1178 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1179 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1180 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1181 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1182 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1183 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
c6883f38
SH
1184};
1185
27245ff7 1186static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1187{
1188 int ret;
c6883f38
SH
1189 char path[MAXPATHLEN];
1190 int i;
3a32201c 1191 mode_t cmask;
c6883f38 1192
3999be0a
CB
1193 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1194 rootfs->path ? rootfs->mount : "");
1195 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1196 return -1;
91c3830e 1197
0bbf8572
CB
1198 /* ignore, just don't try to fill in */
1199 if (!dir_exists(path))
9cb4d183
SH
1200 return 0;
1201
3999be0a
CB
1202 INFO("Populating \"/dev\"");
1203
3a32201c 1204 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1205 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1206 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1207
3999be0a
CB
1208 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1209 rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1210 if (ret < 0 || ret >= MAXPATHLEN)
1211 return -1;
0bbf8572 1212
c6883f38 1213 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1214 if (ret < 0) {
9cb4d183 1215 FILE *pathfile;
3999be0a 1216 char hostpath[MAXPATHLEN];
9cb4d183 1217
0bbf8572
CB
1218 if (errno == EEXIST) {
1219 DEBUG("\"%s\" device already existed", path);
1220 continue;
1221 }
1222
1223 /* Unprivileged containers cannot create devices, so
1224 * bind mount the device from the host.
1225 */
9cb4d183
SH
1226 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1227 if (ret < 0 || ret >= MAXPATHLEN)
1228 return -1;
3999be0a 1229
9cb4d183
SH
1230 pathfile = fopen(path, "wb");
1231 if (!pathfile) {
3999be0a 1232 SYSERROR("Failed to create file \"%s\"", path);
9cb4d183
SH
1233 return -1;
1234 }
1235 fclose(pathfile);
3999be0a
CB
1236
1237 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1238 rootfs->path ? rootfs->mount : NULL);
1239 if (ret < 0) {
1240 SYSERROR("Failed to bind mount \"%s\" from "
1241 "host into container",
1242 d->name);
9cb4d183
SH
1243 return -1;
1244 }
3999be0a
CB
1245 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1246 path);
0bbf8572 1247 } else {
3999be0a 1248 DEBUG("Created device node \"%s\"", path);
c6883f38
SH
1249 }
1250 }
3a32201c 1251 umask(cmask);
c6883f38 1252
3999be0a 1253 INFO("Populated \"/dev\"");
c6883f38
SH
1254 return 0;
1255}
1256
9aa76a17 1257static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1258{
9aa76a17 1259 int ret;
10bc1861 1260 struct lxc_storage *bdev;
91c3e281 1261 const struct lxc_rootfs *rootfs;
cc28d0b0 1262
91c3e281 1263 rootfs = &conf->rootfs;
a0f379bf 1264 if (!rootfs->path) {
91c3e281
CB
1265 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1266 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1267 return -1;
1268 }
c69bd12f 1269 return 0;
a0f379bf 1270 }
0ad19a3f 1271
12297168 1272 if (access(rootfs->mount, F_OK)) {
91c3e281 1273 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1274 rootfs->mount);
b1789442
DL
1275 return -1;
1276 }
1277
8a388ed4 1278 bdev = storage_init(conf);
9aa76a17
CB
1279 if (!bdev) {
1280 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1281 rootfs->path, rootfs->mount,
1282 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1283 return -1;
9be53773 1284 }
9aa76a17
CB
1285
1286 ret = bdev->ops->mount(bdev);
10bc1861 1287 storage_put(bdev);
9aa76a17 1288 if (ret < 0) {
91c3e281
CB
1289 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1290 rootfs->path, rootfs->mount,
1291 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1292 return -1;
1293 }
0ad19a3f 1294
91c3e281
CB
1295 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1296 rootfs->path, rootfs->mount,
1297 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1298
ac778708
DL
1299 return 0;
1300}
1301
91e93c71
AV
1302int prepare_ramfs_root(char *root)
1303{
eab15c1e 1304 char buf[LXC_LINELEN], *p;
91e93c71
AV
1305 char nroot[PATH_MAX];
1306 FILE *f;
1307 int i;
1308 char *p2;
1309
1310 if (realpath(root, nroot) == NULL)
39c7b795 1311 return -errno;
91e93c71
AV
1312
1313 if (chdir("/") == -1)
39c7b795 1314 return -errno;
91e93c71
AV
1315
1316 /*
1317 * We could use here MS_MOVE, but in userns this mount is
1318 * locked and can't be moved.
1319 */
39c7b795 1320 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1321 SYSERROR("Failed to move %s into /", root);
39c7b795 1322 return -errno;
91e93c71
AV
1323 }
1324
39c7b795 1325 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1326 SYSERROR("Failed to make . rprivate");
39c7b795 1327 return -errno;
91e93c71
AV
1328 }
1329
1330 /*
1331 * The following code cleans up inhereted mounts which are not
1332 * required for CT.
1333 *
1334 * The mountinfo file shows not all mounts, if a few points have been
1335 * unmounted between read operations from the mountinfo. So we need to
1336 * read mountinfo a few times.
1337 *
1338 * This loop can be skipped if a container uses unserns, because all
1339 * inherited mounts are locked and we should live with all this trash.
1340 */
1341 while (1) {
1342 int progress = 0;
1343
1344 f = fopen("./proc/self/mountinfo", "r");
1345 if (!f) {
1346 SYSERROR("Unable to open /proc/self/mountinfo");
1347 return -1;
1348 }
eab15c1e 1349 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1350 for (p = buf, i=0; p && i < 4; i++)
1351 p = strchr(p+1, ' ');
1352 if (!p)
1353 continue;
1354 p2 = strchr(p+1, ' ');
1355 if (!p2)
1356 continue;
1357
1358 *p2 = '\0';
1359 *p = '.';
1360
1361 if (strcmp(p + 1, "/") == 0)
1362 continue;
1363 if (strcmp(p + 1, "/proc") == 0)
1364 continue;
1365
1366 if (umount2(p, MNT_DETACH) == 0)
1367 progress++;
1368 }
1369 fclose(f);
1370 if (!progress)
1371 break;
1372 }
1373
8bea9fae
PR
1374 /* This also can be skipped if a container uses unserns */
1375 umount2("./proc", MNT_DETACH);
91e93c71
AV
1376
1377 /* It is weird, but chdir("..") moves us in a new root */
1378 if (chdir("..") == -1) {
1379 SYSERROR("Unable to change working directory");
1380 return -1;
1381 }
1382
1383 if (chroot(".") == -1) {
1384 SYSERROR("Unable to chroot");
1385 return -1;
1386 }
1387
1388 return 0;
1389}
1390
74a3920a 1391static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1392{
39c7b795
CB
1393 if (!rootfs->path) {
1394 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1395 return 0;
39c7b795 1396 }
ac778708 1397
91e93c71 1398 if (detect_ramfs_rootfs()) {
39c7b795
CB
1399 DEBUG("detected that container is on ramfs");
1400 if (prepare_ramfs_root(rootfs->mount)) {
1401 ERROR("failed to prepare minimal ramfs root");
91e93c71 1402 return -1;
39c7b795
CB
1403 }
1404
1405 DEBUG("prepared ramfs root for container");
1406 return 0;
1407 }
1408
1409 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1410 ERROR("failed to pivot root");
25368b52 1411 return -1;
c69bd12f
DL
1412 }
1413
39c7b795 1414 DEBUG("finished pivot root");
25368b52 1415 return 0;
0ad19a3f 1416}
1417
/* Mount a new devpts instance limited to @num_pts ptys on "/dev/pts" and make
 * "/dev/ptmx" refer to its ptmx node.
 *
 * "/dev/ptmx" is preferably a bind mount of "/dev/pts/ptmx"; if the bind
 * mount fails, a symlink is created instead.
 *
 * Nothing is done when @num_pts is 0. Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *base_opts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char mntopts[256];
	int ret;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	ret = snprintf(mntopts, sizeof(mntopts), "%s,max=%d", base_opts,
		       num_pts);
	if (ret < 0 || (size_t)ret >= sizeof(mntopts))
		return -1;

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mount point exists. */
	ret = mkdir("/dev/pts", 0755);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	ret = mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, mntopts);
	if (ret < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", mntopts);

	/* Drop whatever currently sits at /dev/ptmx. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file serves as target for the bind mount. */
	ret = open("/dev/ptmx", O_CREAT, 0666);
	if (ret < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(ret);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
	if (ret == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal yet: fall back to a symlink below. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to go before the symlink can be created. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1508
/* Apply the execution domain @persona to the calling process.
 *
 * A @persona of -1 means "leave unchanged". When the build does not have
 * HAVE_SYS_PERSONALITY_H set, this function does nothing.
 *
 * Returns 0 on success (or when nothing was done), -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1525
3d7d929a
CB
1526static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1527 const struct lxc_console *console)
6e590161 1528{
63376d7d 1529 char path[MAXPATHLEN];
0728ebf4 1530 int ret, fd;
52e35957 1531
8b1b1210
CB
1532 if (console->path && !strcmp(console->path, "none"))
1533 return 0;
1534
7c6ef2a2 1535 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1536 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1537 return -1;
52e35957 1538
8b1b1210
CB
1539 /* When we are asked to setup a console we remove any previous
1540 * /dev/console bind-mounts.
1541 */
a7ba3c7f
CB
1542 if (file_exists(path)) {
1543 ret = lxc_unstack_mountpoint(path, false);
1544 if (ret < 0) {
8b1b1210 1545 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1546 return -ret;
1547 } else {
1548 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1549 }
953fe44f 1550
a7ba3c7f
CB
1551 ret = unlink(path);
1552 if (ret < 0) {
1553 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1554 return -errno;
1555 }
8b1b1210
CB
1556 }
1557
1558 /* For unprivileged containers autodev or automounts will already have
1559 * taken care of creating /dev/console.
1560 */
0728ebf4
TA
1561 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1562 if (fd < 0) {
1563 if (errno != EEXIST) {
1564 SYSERROR("failed to create console");
3d7d929a 1565 return -errno;
0728ebf4
TA
1566 }
1567 } else {
1568 close(fd);
52e35957
DL
1569 }
1570
0728ebf4 1571 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1572 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1573 return -errno;
63376d7d 1574 }
13954cce 1575
3d7d929a 1576 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1577 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1578 return -1;
1579 }
1580
3d7d929a 1581 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1582 return 0;
1583}
1584
3d7d929a
CB
1585static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1586 const struct lxc_console *console,
1587 char *ttydir)
7c6ef2a2 1588{
7c6ef2a2 1589 int ret;
3d7d929a 1590 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1591
1592 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1593 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1594 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1595 return -1;
3d7d929a 1596
7c6ef2a2
SH
1597 ret = mkdir(path, 0755);
1598 if (ret && errno != EEXIST) {
959aee9c 1599 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1600 return -errno;
7c6ef2a2 1601 }
4742cd9a 1602 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1603
3d7d929a
CB
1604 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1605 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1606 return -1;
1607
7c6ef2a2 1608 ret = creat(lxcpath, 0660);
3d7d929a 1609 if (ret == -1 && errno != EEXIST) {
959aee9c 1610 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1611 return -errno;
7c6ef2a2 1612 }
4d44e274
SH
1613 if (ret >= 0)
1614 close(ret);
7c6ef2a2 1615
2a12fefd
CB
1616 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1617 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1618 return -1;
2a12fefd
CB
1619
1620 /* When we are asked to setup a console we remove any previous
1621 * /dev/console bind-mounts.
1622 */
1623 if (console->path && !strcmp(console->path, "none")) {
1624 struct stat st;
1625 ret = stat(path, &st);
1626 if (ret < 0) {
1627 if (errno == ENOENT)
1628 return 0;
1629 SYSERROR("failed stat() \"%s\"", path);
1630 return -errno;
1631 }
1632
1633 /* /dev/console must be character device with major number 5 and
1634 * minor number 1. If not, give benefit of the doubt and assume
1635 * the user has mounted something else right there on purpose.
1636 */
1637 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1638 return 0;
1639
1640 /* In case the user requested a bind-mount for /dev/console and
1641 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1642 * /dev/<ttydir/console.
1643 * Note, we only move the uppermost mount and clear all other
1644 * mounts underneath for safety.
1645 * If it is a character device created via mknod() we simply
1646 * rename it.
2a12fefd
CB
1647 */
1648 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1649 if (ret < 0) {
1650 if (errno != EINVAL) {
1651 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1652 return -errno;
1653 }
1654 /* path was not a mountpoint */
1655 ret = rename(path, lxcpath);
1656 if (ret < 0) {
1657 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1658 return -errno;
1659 }
1660 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1661 } else {
1662 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1663 }
a7ba3c7f
CB
1664
1665 /* Clear all remaining bind-mounts. */
1666 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1667 if (ret < 0) {
a7ba3c7f
CB
1668 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1669 return -ret;
1670 } else {
1671 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1672 }
1673 } else {
1674 if (file_exists(path)) {
1675 ret = lxc_unstack_mountpoint(path, false);
1676 if (ret < 0) {
2a12fefd 1677 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1678 return -ret;
1679 } else {
1680 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1681 }
2a12fefd
CB
1682 }
1683
1684 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1685 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1686 return -1;
1687 }
1688 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1689 }
1690
2a12fefd 1691 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1692 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1693 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1694 return -1;
3d7d929a 1695
2a12fefd
CB
1696 ret = unlink(path);
1697 if (ret && errno != ENOENT) {
1698 SYSERROR("error unlinking %s", path);
1699 return -errno;
1700 }
1701
7c6ef2a2 1702 ret = symlink(lxcpath, path);
3d7d929a
CB
1703 if (ret < 0) {
1704 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1705 return -1;
1706 }
1707
3d7d929a 1708 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1709 return 0;
1710}
1711
3d7d929a
CB
1712static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1713 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1714{
3d7d929a
CB
1715 /* We don't have a rootfs, /dev/console will be shared. */
1716 if (!rootfs->path) {
1717 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1718 return 0;
3d7d929a
CB
1719 }
1720
7c6ef2a2 1721 if (!ttydir)
3d7d929a 1722 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1723
3d7d929a 1724 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1725}
1726
998ac676
RT
1727static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1728{
1729 struct mount_opt *mo;
1730
1731 /* If opt is found in mount_opt, set or clear flags.
1732 * Otherwise append it to data. */
1733
1734 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1735 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1736 if (mo->clear)
1737 *flags &= ~mo->flag;
1738 else
1739 *flags |= mo->flag;
1740 return;
1741 }
1742 }
1743
1744 if (strlen(*data))
1745 strcat(*data, ",");
1746 strcat(*data, opt);
1747}
1748
/* Split the comma-separated option string @mntopts into mount flags and
 * remaining data options.
 *
 * *@mntflags receives the flags. *@mntdata receives a newly allocated string
 * with the options that are not flags, or NULL if there are none; the caller
 * frees it. With @mntopts == NULL both outputs are cleared and 0 is
 * returned.
 *
 * Returns 0 on success, -1 if memory allocation fails.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data options can never be longer than the input itself. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1787
/* Terminate @word in place at its first space or tab (or leave it unchanged
 * if it contains neither).
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1794
/*
 * Skip @nfields space- or tab-terminated fields in @src and return a pointer
 * to what follows. Each separator character counts on its own. If the string
 * ends first, a pointer to its terminating '\0' is returned.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		skipped++;
	}

	return cur;
}
1812
/* Mount @fsname on @target and, for bind mounts and remount requests, follow
 * up with a remount so that the requested flags take effect.
 *
 * The first mount is done through safe_mount() with MS_REMOUNT masked out.
 * When built with HAVE_STATVFS, the flags @fsname is currently mounted with
 * (nosuid, nodev unless @dev is set, ro, noexec) are added for the remount.
 * A plain bind mount that needs none of those extra flags skips the remount.
 *
 * With @optional set, mount failures are logged and 0 is returned.
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	unsigned long rqd_flags = 0;
	int ret;

	ret = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
			     fsname ? fsname : "(null)", target, strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"",
			 fsname ? fsname : "(null)", target);
		return -1;
	}

	/* Only bind mounts and explicit remounts need the second step. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto mounted;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
	      "options", fsname ? fsname : "(none)", target ? target : "(none)");

	if (mountflags & MS_RDONLY)
		rqd_flags |= MS_RDONLY;

#ifdef HAVE_STATVFS
	{
		struct statvfs sb;

		if (fsname && statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if ((sb.f_flag & MS_NODEV) && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", fsname, sb.f_flag, required_flags);

			/* A bind mount request whose flags already cover
			 * everything that is required needs no remount.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) && rqd_flags == 0) {
				DEBUG("Mountflags already were %lu, "
				      "skipping remount", mountflags);
				goto mounted;
			}

			mountflags |= required_flags;
		}
	}
#endif

	ret = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
	if (ret < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" "
			     "(optional): %s",
			     fsname ? fsname : "(null)", target,
			     strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"",
			 fsname ? fsname : "(null)", target);
		return -1;
	}

mounted:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
	      fsname ? fsname : "(null)", target, fstype);

	return 0;
}
1905
/* Strip the first occurrence of "create=dir", "create=file" and "optional"
 * from mntent->mnt_opts, editing the string in place.
 *
 * An option followed by a comma is removed together with that comma. An
 * option with nothing after it is cut off where it starts, which leaves a
 * preceding comma in the string.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t n;

	for (n = 0; n < sizeof(culled) / sizeof(culled[0]); n++) {
		char *hit, *next;

		hit = strstr(mntent->mnt_opts, culled[n]);
		if (!hit)
			continue;

		next = strchr(hit, ',');
		if (next)
			/* Pull the remainder (including '\0') forward. */
			memmove(hit, next + 1, strlen(next + 1) + 1);
		else
			/* Last option in the string: just chop it off. */
			*hit = '\0';
	}
}
1929
/* Create the mount target @path for @mntent where the entry asks for it.
 *
 * - "overlay*" and "aufs*" filesystem types first get their directories
 *   created through ovl_mkdir() / aufs_mkdir().
 * - "create=dir" creates @path as a directory (including parents).
 * - "create=file" creates @path as an empty file, including its parent
 *   directories, unless @path already exists.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;

	if (!strncmp(mntent->mnt_type, "overlay", 7))
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (!strncmp(mntent->mnt_type, "aufs", 4))
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		int fd;
		char *copy, *parent;

		/* dirname() may modify its argument, so hand it a copy. */
		copy = strdup(path);
		if (!copy)
			return -1;

		parent = dirname(copy);

		ret = mkdir_p(parent, 0755);
		if (ret < 0 && errno != EEXIST) {
			/* Report the directory that actually failed, and do
			 * so before free() gets a chance to touch errno.
			 */
			SYSERROR("Failed to create directory \"%s\"", parent);
			free(copy);
			return -1;
		}
		free(copy);

		fd = open(path, O_CREAT, 0644);
		if (fd < 0)
			return -1;
		close(fd);
	}

	return 0;
}
1978
ec50007f
CB
1979/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1980 * without a rootfs. */
db4aba38 1981static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
1982 const char *path,
1983 const struct lxc_rootfs *rootfs,
1984 const char *lxc_name,
1985 const char *lxc_path)
4d5b72a1 1986{
d8b712bc 1987 int ret;
4d5b72a1
NC
1988 unsigned long mntflags;
1989 char *mntdata;
d8b712bc 1990 bool dev, optional;
ec50007f 1991 char *rootfs_path = NULL;
d8b712bc
CB
1992
1993 optional = hasmntopt(mntent, "optional") != NULL;
1994 dev = hasmntopt(mntent, "dev") != NULL;
1995
ec50007f
CB
1996 if (rootfs && rootfs->path)
1997 rootfs_path = rootfs->mount;
1998
d8b712bc
CB
1999 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2000 lxc_path);
2001 if (ret < 0) {
2002 if (optional)
2003 return 0;
608e3567 2004
d8b712bc
CB
2005 return -1;
2006 }
4e4ca161
SH
2007 cull_mntent_opt(mntent);
2008
d8b712bc
CB
2009 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2010 if (ret < 0)
a17b1e65 2011 return -1;
a17b1e65 2012
6e46cc0d 2013 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 2014 mntdata, optional, dev, rootfs_path);
68c152ef 2015
911324ef 2016 free(mntdata);
911324ef
DL
2017 return ret;
2018}
2019
/* Mount @mntent for a container without a rootfs.
 *
 * The mount target is treated as an absolute path starting at / on the host;
 * a leading '/' is added when mntent->mnt_dir lacks one.
 *
 * Returns -1 if the path does not fit, otherwise the result of
 * mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *slash = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int ret;

	ret = snprintf(path, sizeof(path), "%s%s", slash, mntent->mnt_dir);
	if (ret < 0 || ret >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2037
4e4ca161 2038static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2039 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2040 const char *lxc_name,
2041 const char *lxc_path)
911324ef 2042{
bdd2b34c 2043 int offset;
013bd428 2044 char *aux;
67e571de 2045 const char *lxcpath;
bdd2b34c
CB
2046 char path[MAXPATHLEN];
2047 int ret = 0;
0ad19a3f 2048
593e8478 2049 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2050 if (!lxcpath)
2a59a681 2051 return -1;
2a59a681 2052
bdd2b34c
CB
2053 /* If rootfs->path is a blockdev path, allow container fstab to use
2054 * <lxcpath>/<name>/rootfs" as the target prefix.
2055 */
2056 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2057 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
2058 goto skipvarlib;
2059
2060 aux = strstr(mntent->mnt_dir, path);
2061 if (aux) {
2062 offset = strlen(path);
2063 goto skipabs;
2064 }
2065
2066skipvarlib:
013bd428
DL
2067 aux = strstr(mntent->mnt_dir, rootfs->path);
2068 if (!aux) {
bdd2b34c 2069 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2070 return ret;
013bd428 2071 }
80a881b2
SH
2072 offset = strlen(rootfs->path);
2073
2074skipabs:
bdd2b34c
CB
2075 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2076 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 2077 return -1;
a17b1e65 2078
0a2dddd4 2079 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2080}
d330fe7b 2081
4e4ca161 2082static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2083 const struct lxc_rootfs *rootfs,
2084 const char *lxc_name,
2085 const char *lxc_path)
911324ef
DL
2086{
2087 char path[MAXPATHLEN];
911324ef 2088 int ret;
d330fe7b 2089
34cfffb3 2090 /* relative to root mount point */
6e46cc0d 2091 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2092 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2093 ERROR("path name too long");
2094 return -1;
2095 }
911324ef 2096
0a2dddd4 2097 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2098}
2099
06749971
CB
2100/* This logs a NOTICE() when a user specifies mounts that would conflict with
2101 * devices liblxc sets up automatically.
2102 */
2103static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2104 const char *dest)
2105{
2106 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2107 bool needs_warning = false;
2108
2109 clean_mnt_fsname = lxc_deslashify(src);
2110 if (!clean_mnt_fsname)
2111 return;
2112
2113 clean_mnt_dir = lxc_deslashify(dest);
2114 if (!clean_mnt_dir) {
2115 free(clean_mnt_fsname);
2116 return;
2117 }
2118
2119 tmp = clean_mnt_dir;
2120 if (*tmp == '/')
2121 tmp++;
2122
2123 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2124 free(clean_mnt_dir);
2125 free(clean_mnt_fsname);
2126 return;
2127 }
2128
2129 if (!conf->autodev && !conf->pts && !conf->tty &&
2130 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2131 free(clean_mnt_dir);
2132 free(clean_mnt_fsname);
2133 return;
2134 }
2135
2136 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2137 needs_warning = true;
2138 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2139 needs_warning = true;
2140 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2141 needs_warning = true;
2142 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2143 needs_warning = true;
2144 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2145 needs_warning = true;
2146 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2147 needs_warning = true;
2148 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2149 needs_warning = true;
2150 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2151 needs_warning = true;
2152 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2153 needs_warning = true;
2154 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2155 needs_warning = true;
2156 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2157 needs_warning = true;
2158
2159 if (needs_warning)
2160 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2161 "automatic device setup under \"/dev\"",
2162 clean_mnt_fsname, clean_mnt_dir);
2163
2164 free(clean_mnt_dir);
2165 free(clean_mnt_fsname);
2166}
2167
2168static int mount_file_entries(const struct lxc_conf *conf,
2169 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2170 const char *lxc_name, const char *lxc_path)
911324ef 2171{
aaf901be
AM
2172 struct mntent mntent;
2173 char buf[4096];
911324ef 2174 int ret = -1;
e76b8764 2175
aaf901be 2176 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
06749971
CB
2177 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2178
1ae3c19f
CB
2179 if (!rootfs->path)
2180 ret = mount_entry_on_systemfs(&mntent);
2181 else if (mntent.mnt_dir[0] != '/')
2182 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2183 lxc_name, lxc_path);
2184 else
2185 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2186 lxc_name, lxc_path);
2187 if (ret < 0)
2188 return -1;
0ad19a3f 2189 }
2190 ret = 0;
cd54d859 2191
1ae3c19f 2192 INFO("Set up mount entries");
e7938e9e
MN
2193 return ret;
2194}
2195
06749971
CB
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do and yields 0. Returns -1 if the
 * file cannot be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fp;
	int rc;

	if (!fstab)
		return 0;

	fp = setmntent(fstab, "r");
	if (!fp) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, fp, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fp);
	return rc;
}
2219
5ef5c9a3 2220FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2221{
5ef5c9a3 2222 int ret;
e7938e9e 2223 char *mount_entry;
5ef5c9a3 2224 struct lxc_list *iterator;
6bd04140 2225 FILE *f;
5ef5c9a3
CB
2226 int fd = -1;
2227
2228 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2229 if (fd < 0) {
2230 if (errno != ENOSYS)
2231 return NULL;
6bd04140
CB
2232 f = tmpfile();
2233 TRACE("Created temporary mount file");
5ef5c9a3 2234 } else {
6bd04140
CB
2235 f = fdopen(fd, "r+");
2236 TRACE("Created anonymous mount file");
5ef5c9a3 2237 }
e7938e9e 2238
6bd04140
CB
2239 if (!f) {
2240 SYSERROR("Could not create mount file");
5ef5c9a3
CB
2241 if (fd != -1)
2242 close(fd);
9fc7f8c0 2243 return NULL;
e7938e9e
MN
2244 }
2245
2246 lxc_list_for_each(iterator, mount) {
2247 mount_entry = iterator->elem;
6bd04140 2248 ret = fprintf(f, "%s\n", mount_entry);
5ef5c9a3 2249 if (ret < strlen(mount_entry))
6bd04140 2250 WARN("Could not write mount entry to mount file");
5ef5c9a3
CB
2251 }
2252
6bd04140
CB
2253 ret = fseek(f, 0, SEEK_SET);
2254 if (ret < 0) {
2255 SYSERROR("Failed to seek mount file");
2256 fclose(f);
5ef5c9a3 2257 return NULL;
e7938e9e
MN
2258 }
2259
6bd04140 2260 return f;
9fc7f8c0
TA
2261}
2262
06749971
CB
/* Mount all entries of the list @mount.
 *
 * The entries are first written to a temporary stream by
 * make_anonymous_mount_file() and then processed by mount_file_entries().
 * Returns -1 if the stream cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *fp = make_anonymous_mount_file(mount);

	if (!fp)
		return -1;

	rc = mount_file_entries(conf, rootfs, fp, lxc_name, lxc_path);
	fclose(fp);

	return rc;
}
2280
bab88e68
CS
2281static int parse_cap(const char *cap)
2282{
2283 char *ptr = NULL;
84760c11 2284 size_t i;
2285 int capid = -1;
bab88e68 2286
7035407c
DE
2287 if (!strcmp(cap, "none"))
2288 return -2;
2289
bab88e68
CS
2290 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2291
2292 if (strcmp(cap, caps_opt[i].name))
2293 continue;
2294
2295 capid = caps_opt[i].value;
2296 break;
2297 }
2298
2299 if (capid < 0) {
2300 /* try to see if it's numeric, so the user may specify
2301 * capabilities that the running kernel knows about but
2302 * we don't */
2303 errno = 0;
2304 capid = strtol(cap, &ptr, 10);
2305 if (!ptr || *ptr != '\0' || errno != 0)
2306 /* not a valid number */
2307 capid = -1;
2308 else if (capid > lxc_caps_last_cap())
2309 /* we have a number but it's not a valid
2310 * capability */
2311 capid = -1;
2312 }
2313
2314 return capid;
2315}
2316
0769b82a
CS
2317int in_caplist(int cap, struct lxc_list *caps)
2318{
2319 struct lxc_list *iterator;
2320 int capid;
2321
2322 lxc_list_for_each(iterator, caps) {
2323 capid = parse_cap(iterator->elem);
2324 if (capid == cap)
2325 return 1;
2326 }
2327
2328 return 0;
2329}
2330
81810dd1
DL
2331static int setup_caps(struct lxc_list *caps)
2332{
2333 struct lxc_list *iterator;
2334 char *drop_entry;
bab88e68 2335 int capid;
81810dd1
DL
2336
2337 lxc_list_for_each(iterator, caps) {
2338
2339 drop_entry = iterator->elem;
2340
bab88e68 2341 capid = parse_cap(drop_entry);
d55bc1ad 2342
81810dd1 2343 if (capid < 0) {
1e11be34
DL
2344 ERROR("unknown capability %s", drop_entry);
2345 return -1;
81810dd1
DL
2346 }
2347
2348 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2349
2350 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2351 SYSERROR("failed to remove %s capability", drop_entry);
2352 return -1;
2353 }
81810dd1
DL
2354
2355 }
2356
1fb86a7c
SH
2357 DEBUG("capabilities have been setup");
2358
2359 return 0;
2360}
2361
2362static int dropcaps_except(struct lxc_list *caps)
2363{
2364 struct lxc_list *iterator;
2365 char *keep_entry;
1fb86a7c
SH
2366 int i, capid;
2367 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2368 INFO("found %d capabilities", numcaps);
1fb86a7c 2369
2caf9a97
SH
2370 if (numcaps <= 0 || numcaps > 200)
2371 return -1;
2372
1a0e70ac 2373 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2374 int *caplist = alloca(numcaps * sizeof(int));
2375 memset(caplist, 0, numcaps * sizeof(int));
2376
2377 lxc_list_for_each(iterator, caps) {
2378
2379 keep_entry = iterator->elem;
2380
bab88e68 2381 capid = parse_cap(keep_entry);
1fb86a7c 2382
7035407c
DE
2383 if (capid == -2)
2384 continue;
2385
1fb86a7c
SH
2386 if (capid < 0) {
2387 ERROR("unknown capability %s", keep_entry);
2388 return -1;
2389 }
2390
8255688a 2391 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2392
2393 caplist[capid] = 1;
2394 }
2395 for (i=0; i<numcaps; i++) {
2396 if (caplist[i])
2397 continue;
2398 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2399 SYSERROR("failed to remove capability %d", i);
2400 return -1;
2401 }
1fb86a7c
SH
2402 }
2403
2404 DEBUG("capabilities have been setup");
81810dd1
DL
2405
2406 return 0;
2407}
2408
c6d09e15
WB
2409static int parse_resource(const char *res) {
2410 size_t i;
2411 int resid = -1;
2412
2413 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2414 if (strcmp(res, limit_opt[i].name) == 0)
2415 return limit_opt[i].value;
2416 }
2417
2418 /* try to see if it's numeric, so the user may specify
2419 * resources that the running kernel knows about but
2420 * we don't */
2421 if (lxc_safe_int(res, &resid) == 0)
2422 return resid;
2423 return -1;
2424}
2425
2426int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2427 struct lxc_list *it;
2428 struct lxc_limit *lim;
2429 int resid;
2430
2431 lxc_list_for_each(it, limits) {
2432 lim = it->elem;
2433
2434 resid = parse_resource(lim->resource);
2435 if (resid < 0) {
2436 ERROR("unknown resource %s", lim->resource);
2437 return -1;
2438 }
2439
f48b5fd8 2440#if HAVE_PRLIMIT || HAVE_PRLIMIT64
c6d09e15
WB
2441 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2442 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2443 return -1;
2444 }
f48b5fd8
FF
2445#else
2446 ERROR("Cannot set limit %s as prlimit is missing", lim->resource);
2447 return -1;
2448#endif
c6d09e15
WB
2449 }
2450 return 0;
2451}
2452
7edd0540
L
2453int setup_sysctl_parameters(struct lxc_list *sysctls)
2454{
2455 struct lxc_list *it;
2456 struct lxc_sysctl *elem;
2457 char *tmp = NULL;
2458 char filename[MAXPATHLEN] = {0};
2459 int ret = 0;
2460
2461 lxc_list_for_each(it, sysctls) {
2462 elem = it->elem;
2463 tmp = lxc_string_replace(".", "/", elem->key);
2464 if (!tmp) {
2465 ERROR("Failed to replace key %s", elem->key);
2466 return -1;
2467 }
2468
2469 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2470 free(tmp);
2471 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2472 ERROR("Error setting up sysctl parameters path");
2473 return -1;
2474 }
2475
2476 ret = lxc_write_to_file(filename, elem->value, strlen(elem->value), false);
2477 if (ret < 0) {
2478 ERROR("Failed to setup sysctl parameters %s to %s", elem->key, elem->value);
2479 return -1;
2480 }
2481 }
2482 return 0;
2483}
2484
61d7a733
YT
2485int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2486{
2487 struct lxc_list *it;
2488 struct lxc_proc *elem;
2489 char *tmp = NULL;
2490 char filename[MAXPATHLEN] = {0};
2491 int ret = 0;
2492
2493 lxc_list_for_each(it, procs) {
2494 elem = it->elem;
2495 tmp = lxc_string_replace(".", "/", elem->filename);
2496 if (!tmp) {
2497 ERROR("Failed to replace key %s", elem->filename);
2498 return -1;
2499 }
2500
2501 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2502 free(tmp);
2503 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2504 ERROR("Error setting up proc filesystem path");
2505 return -1;
2506 }
2507
2508 ret = lxc_write_to_file(filename, elem->value, strlen(elem->value), false);
2509 if (ret < 0) {
2510 ERROR("Failed to setup proc filesystem %s to %s", elem->filename, elem->value);
2511 return -1;
2512 }
2513 }
2514 return 0;
2515}
2516
ae9242c8
SH
2517static char *default_rootfs_mount = LXCROOTFSMOUNT;
2518
7b379ab3 2519struct lxc_conf *lxc_conf_init(void)
089cd8b8 2520{
7b379ab3 2521 struct lxc_conf *new;
26ddeedd 2522 int i;
7b379ab3 2523
13277ec4 2524 new = malloc(sizeof(*new));
7b379ab3 2525 if (!new) {
13277ec4 2526 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2527 return NULL;
2528 }
2529 memset(new, 0, sizeof(*new));
2530
4b73005c 2531 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2532 new->personality = -1;
124fa0a8 2533 new->autodev = 1;
3a784510
CB
2534 new->console.buffer_log_file = NULL;
2535 new->console.buffer_log_file_fd = -1;
2536 new->console.buffer_size = 0;
596a818d
DE
2537 new->console.log_path = NULL;
2538 new->console.log_fd = -1;
28a4b0e5 2539 new->console.path = NULL;
63376d7d 2540 new->console.peer = -1;
b5159817
DE
2541 new->console.peerpty.busy = -1;
2542 new->console.peerpty.master = -1;
2543 new->console.peerpty.slave = -1;
63376d7d
DL
2544 new->console.master = -1;
2545 new->console.slave = -1;
2546 new->console.name[0] = '\0';
732375f5 2547 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
d2e30e99 2548 new->maincmd_fd = -1;
76a26f55 2549 new->nbd_idx = -1;
54c30e29 2550 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2551 if (!new->rootfs.mount) {
13277ec4 2552 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2553 free(new);
2554 return NULL;
2555 }
858377e4 2556 new->logfd = -1;
7b379ab3
MN
2557 lxc_list_init(&new->cgroup);
2558 lxc_list_init(&new->network);
2559 lxc_list_init(&new->mount_list);
81810dd1 2560 lxc_list_init(&new->caps);
1fb86a7c 2561 lxc_list_init(&new->keepcaps);
f6d3e3e4 2562 lxc_list_init(&new->id_map);
f979ac15 2563 lxc_list_init(&new->includes);
4184c3e1 2564 lxc_list_init(&new->aliens);
7c661726 2565 lxc_list_init(&new->environment);
c6d09e15 2566 lxc_list_init(&new->limits);
7edd0540 2567 lxc_list_init(&new->sysctls);
61d7a733 2568 lxc_list_init(&new->procs);
44ae0fb6 2569 new->hooks_version = 0;
28d9e29e 2570 for (i = 0; i < NUM_LXC_HOOKS; i++)
26ddeedd 2571 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2572 lxc_list_init(&new->groups);
d39b10eb 2573 lxc_list_init(&new->state_clients);
fe4de9a6
DE
2574 new->lsm_aa_profile = NULL;
2575 new->lsm_se_context = NULL;
5112cd70 2576 new->tmp_umount_proc = 0;
7b379ab3 2577
72bb04e4
PT
2578 /* if running in a new user namespace, init and COMMAND
2579 * default to running as UID/GID 0 when using lxc-execute */
2580 new->init_uid = 0;
2581 new->init_gid = 0;
43654d34 2582 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
28d9e29e 2583 memset(&new->inherit_ns, 0, sizeof(char *) * LXC_NS_MAX);
72bb04e4 2584
7b379ab3 2585 return new;
089cd8b8
DL
2586}
2587
344c9d81 2588int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
251d0d2a 2589 size_t buf_size)
f6d3e3e4 2590{
29053180
CB
2591 char path[MAXPATHLEN];
2592 int fd, ret;
f6d3e3e4 2593
29053180
CB
2594 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2595 idtype == ID_TYPE_UID ? 'u' : 'g');
2596 if (ret < 0 || ret >= MAXPATHLEN) {
2597 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
2598 return -E2BIG;
2599 }
29053180
CB
2600
2601 fd = open(path, O_WRONLY);
2602 if (fd < 0) {
2603 SYSERROR("failed to open \"%s\"", path);
2604 return -1;
f6d3e3e4 2605 }
29053180
CB
2606
2607 errno = 0;
2608 ret = lxc_write_nointr(fd, buf, buf_size);
2609 if (ret != buf_size) {
2610 SYSERROR("failed to write %cid mapping to \"%s\"",
2611 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2612 close(fd);
2613 return -1;
2614 }
2615 close(fd);
2616
2617 return 0;
f6d3e3e4
SH
2618}
2619
6e50e704
CB
2620/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2621 *
2622 * @return 1 if functional binary was found
2623 * @return 0 if binary exists but is lacking privilege
2624 * @return -ENOENT if binary does not exist
2625 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2626 *
2627 */
df6a2945
CB
2628static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2629{
2630 char *path;
2631 int ret;
2632 struct stat st;
2633 int fret = 0;
2634
6e50e704
CB
2635 if (cap != CAP_SETUID && cap != CAP_SETGID)
2636 return -EINVAL;
2637
df6a2945
CB
2638 path = on_path(binary, NULL);
2639 if (!path)
2640 return -ENOENT;
2641
2642 ret = stat(path, &st);
2643 if (ret < 0) {
2644 fret = -errno;
2645 goto cleanup;
2646 }
2647
2648 /* Check if the binary is setuid. */
2649 if (st.st_mode & S_ISUID) {
2650 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2651 fret = 1;
2652 goto cleanup;
2653 }
2654
69924fff 2655 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2656 /* Check if it has the CAP_SETUID capability. */
2657 if ((cap & CAP_SETUID) &&
2658 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2659 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2660 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2661 "and CAP_PERMITTED sets.", path);
2662 fret = 1;
2663 goto cleanup;
2664 }
2665
2666 /* Check if it has the CAP_SETGID capability. */
2667 if ((cap & CAP_SETGID) &&
2668 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2669 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2670 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2671 "and CAP_PERMITTED sets.", path);
2672 fret = 1;
2673 goto cleanup;
2674 }
d6018f88 2675 #else
69924fff
CB
2676 /* If we cannot check for file capabilities we need to give the benefit
2677 * of the doubt. Otherwise we might fail even though all the necessary
2678 * file capabilities are set.
2679 */
d6018f88
CB
2680 DEBUG("Cannot check for file capabilites as full capability support is "
2681 "missing. Manual intervention needed.");
2682 fret = 1;
df6a2945
CB
2683 #endif
2684
2685cleanup:
2686 free(path);
2687 return fret;
2688}
2689
986ef930
CB
/* Execute the command line passed in @args through "/bin/sh -c".
 *
 * Only returns (with -1) if execl() fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);
	return -1;
}
2695
f6d3e3e4
SH
2696int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2697{
f6d3e3e4 2698 struct id_map *map;
4bc3b759 2699 struct lxc_list *iterator;
251d0d2a 2700 enum idtype type;
986ef930 2701 char u_or_g;
4bc3b759 2702 char *pos;
99d43365 2703 int fill, left;
986ef930
CB
2704 char cmd_output[MAXPATHLEN];
2705 /* strlen("new@idmap") = 9
2706 * +
2707 * strlen(" ") = 1
2708 * +
2709 * LXC_NUMSTRLEN64
2710 * +
2711 * strlen(" ") = 1
2712 *
2713 * We add some additional space to make sure that we really have
2714 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2715 */
2716 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2717 int ret = 0, uidmap = 0, gidmap = 0;
2718 bool use_shadow = false, had_entry = false;
df6a2945
CB
2719
2720 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2721 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2722 * will protected it by preventing another user from being handed the
2723 * range by shadow.
2724 */
df6a2945 2725 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2726 if (uidmap == -ENOENT)
2727 WARN("newuidmap binary is missing");
2728 else if (!uidmap)
2729 WARN("newuidmap is lacking necessary privileges");
2730
df6a2945 2731 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2732 if (gidmap == -ENOENT)
2733 WARN("newgidmap binary is missing");
2734 else if (!gidmap)
2735 WARN("newgidmap is lacking necessary privileges");
2736
df6a2945
CB
2737 if (uidmap > 0 && gidmap > 0) {
2738 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 2739 use_shadow = true;
df6a2945 2740 } else {
99d43365
CB
2741 /* In case unprivileged users run application containers via
2742 * execute() or a start*() there are valid cases where they may
2743 * only want to map their own {g,u}id. Let's not block them from
2744 * doing so by requiring geteuid() == 0.
2745 */
2746 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2747 "write directly with euid %d.", geteuid());
0e6e3a41 2748 }
251d0d2a 2749
986ef930
CB
2750 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2751 type++, u_or_g = 'g') {
2752 pos = mapbuf;
2753
0e6e3a41 2754 if (use_shadow)
986ef930 2755 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2756
cf3ef16d 2757 lxc_list_for_each(iterator, idmap) {
251d0d2a 2758 map = iterator->elem;
cf3ef16d
SH
2759 if (map->idtype != type)
2760 continue;
2761
4bc3b759
CB
2762 had_entry = true;
2763
986ef930 2764 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2765 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2766 use_shadow ? " " : "", map->nsid,
2767 map->hostid, map->range,
0e6e3a41 2768 use_shadow ? "" : "\n");
a427e268
CB
2769 if (fill <= 0 || fill >= left) {
2770 /* The kernel only takes <= 4k for writes to
2771 * /proc/<pid>/{g,u}id_map
2772 */
2773 SYSERROR("Too many %cid mappings defined", u_or_g);
2774 return -1;
2775 }
4bc3b759 2776
cf3ef16d 2777 pos += fill;
251d0d2a 2778 }
cf3ef16d 2779 if (!had_entry)
4f7521b4 2780 continue;
cf3ef16d 2781
986ef930
CB
2782 /* Try to catch the ouput of new{g,u}idmap to make debugging
2783 * easier.
2784 */
2785 if (use_shadow) {
2786 ret = run_command(cmd_output, sizeof(cmd_output),
2787 lxc_map_ids_exec_wrapper,
2788 (void *)mapbuf);
2789 if (ret < 0) {
54fbbeb5
CB
2790 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2791 u_or_g, cmd_output, mapbuf);
986ef930
CB
2792 return -1;
2793 }
54fbbeb5 2794 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2795 } else {
986ef930 2796 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5 2797 if (ret < 0) {
da0f9977 2798 ERROR("Failed to write mapping: %s", mapbuf);
986ef930 2799 return -1;
54fbbeb5
CB
2800 }
2801 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2802 }
986ef930
CB
2803
2804 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2805 }
251d0d2a 2806
986ef930 2807 return 0;
f6d3e3e4
SH
2808}
2809
cf3ef16d 2810/*
7b50c609
TS
2811 * return the host uid/gid to which the container root is mapped in
2812 * *val.
0b3a6504 2813 * Return true if id was found, false otherwise.
cf3ef16d 2814 */
2a9a80cb 2815bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 2816 unsigned long *val)
cf3ef16d
SH
2817{
2818 struct lxc_list *it;
2819 struct id_map *map;
2820
2821 lxc_list_for_each(it, &conf->id_map) {
2822 map = it->elem;
7b50c609 2823 if (map->idtype != idtype)
cf3ef16d
SH
2824 continue;
2825 if (map->nsid != 0)
2826 continue;
2a9a80cb
SH
2827 *val = map->hostid;
2828 return true;
cf3ef16d 2829 }
2a9a80cb 2830 return false;
cf3ef16d
SH
2831}
2832
2133f58c 2833int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2834{
2835 struct lxc_list *it;
2836 struct id_map *map;
2837 lxc_list_for_each(it, &conf->id_map) {
2838 map = it->elem;
2133f58c 2839 if (map->idtype != idtype)
cf3ef16d
SH
2840 continue;
2841 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2842 return (id - map->hostid) + map->nsid;
cf3ef16d 2843 }
57d116ab 2844 return -1;
cf3ef16d
SH
2845}
2846
339efad9 2847int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2848{
2849 struct lxc_list *it;
2850 struct id_map *map;
2133f58c 2851 unsigned int freeid = 0;
cf3ef16d
SH
2852again:
2853 lxc_list_for_each(it, &conf->id_map) {
2854 map = it->elem;
2133f58c 2855 if (map->idtype != idtype)
cf3ef16d
SH
2856 continue;
2857 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2858 freeid = map->nsid + map->range;
2859 goto again;
2860 }
2861 }
2862 return freeid;
2863}
2864
f4f52cb5
CB
/* Execute "lxc-usernsexec" with the argument vector passed in @args.
 *
 * Only returns (with -1) if execvp() fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
2870
f6d3e3e4 2871/*
7b50c609
TS
2872 * chown_mapped_root: for an unprivileged user with uid/gid X to
2873 * chown a dir to subuid/subgid Y, he needs to run chown as root
2874 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2875 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2876 * root is privileged with respect to hostuid/hostgid X, allowing
2877 * him to do the chown.
f6d3e3e4 2878 */
41dc7155 2879int chown_mapped_root(const char *path, struct lxc_conf *conf)
f6d3e3e4 2880{
f4f52cb5 2881 uid_t rootuid, rootgid;
2a9a80cb 2882 unsigned long val;
f4f52cb5
CB
2883 int hostuid, hostgid, ret;
2884 struct stat sb;
2885 char map1[100], map2[100], map3[100], map4[100], map5[100];
2886 char ugid[100];
41dc7155 2887 const char *args1[] = {"lxc-usernsexec",
f4f52cb5
CB
2888 "-m", map1,
2889 "-m", map2,
2890 "-m", map3,
2891 "-m", map5,
2892 "--", "chown", ugid, path,
2893 NULL};
41dc7155 2894 const char *args2[] = {"lxc-usernsexec",
f4f52cb5
CB
2895 "-m", map1,
2896 "-m", map2,
2897 "-m", map3,
2898 "-m", map4,
2899 "-m", map5,
2900 "--", "chown", ugid, path,
2901 NULL};
2902 char cmd_output[MAXPATHLEN];
2903
2904 hostuid = geteuid();
2905 hostgid = getegid();
f6d3e3e4 2906
2a9a80cb 2907 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 2908 ERROR("No uid mapping for container root");
c4d10a05 2909 return -1;
f6d3e3e4 2910 }
f4f52cb5 2911 rootuid = (uid_t)val;
7b50c609 2912 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 2913 ERROR("No gid mapping for container root");
7b50c609
TS
2914 return -1;
2915 }
f4f52cb5 2916 rootgid = (gid_t)val;
2a9a80cb 2917
f4f52cb5 2918 if (hostuid == 0) {
7b50c609 2919 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
2920 ERROR("Error chowning %s", path);
2921 return -1;
2922 }
2923 return 0;
2924 }
f3d7e4ca 2925
f4f52cb5 2926 if (rootuid == hostuid) {
1a0e70ac 2927 /* nothing to do */
b103ceac 2928 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
2929 return 0;
2930 }
2931
bbdbf8f0 2932 /* save the current gid of "path" */
f4f52cb5
CB
2933 if (stat(path, &sb) < 0) {
2934 ERROR("Error stat %s", path);
f6d3e3e4
SH
2935 return -1;
2936 }
7b50c609 2937
bbdbf8f0
CB
2938 /* Update the path argument in case this was overlayfs. */
2939 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2940 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2941
f4f52cb5
CB
2942 /*
2943 * A file has to be group-owned by a gid mapped into the
2944 * container, or the container won't be privileged over it.
2945 */
2946 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2947 if (sb.st_uid == hostuid &&
2948 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2949 chown(path, -1, hostgid) < 0) {
2950 ERROR("Failed chgrping %s", path);
2951 return -1;
2952 }
f6d3e3e4 2953
1a0e70ac 2954 /* "u:0:rootuid:1" */
f4f52cb5
CB
2955 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2956 if (ret < 0 || ret >= 100) {
2957 ERROR("Error uid printing map string");
2958 return -1;
2959 }
7b50c609 2960
1a0e70ac 2961 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
2962 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2963 if (ret < 0 || ret >= 100) {
2964 ERROR("Error uid printing map string");
2965 return -1;
2966 }
c4d10a05 2967
1a0e70ac 2968 /* "g:0:rootgid:1" */
f4f52cb5
CB
2969 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2970 if (ret < 0 || ret >= 100) {
2971 ERROR("Error gid printing map string");
2972 return -1;
2973 }
98e5ba51 2974
1a0e70ac 2975 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
2976 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2977 rootgid + (gid_t)sb.st_gid);
2978 if (ret < 0 || ret >= 100) {
2979 ERROR("Error gid printing map string");
2980 return -1;
2981 }
c4d10a05 2982
1a0e70ac 2983 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
2984 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2985 if (ret < 0 || ret >= 100) {
2986 ERROR("Error gid printing map string");
2987 return -1;
2988 }
7b50c609 2989
1a0e70ac 2990 /* "0:pathgid" (chown) */
f4f52cb5
CB
2991 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2992 if (ret < 0 || ret >= 100) {
2993 ERROR("Error owner printing format string for chown");
2994 return -1;
2995 }
7b50c609 2996
f4f52cb5
CB
2997 if (hostgid == sb.st_gid)
2998 ret = run_command(cmd_output, sizeof(cmd_output),
2999 chown_mapped_root_exec_wrapper,
3000 (void *)args1);
3001 else
3002 ret = run_command(cmd_output, sizeof(cmd_output),
3003 chown_mapped_root_exec_wrapper,
3004 (void *)args2);
3005 if (ret < 0)
3006 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3007
f4f52cb5 3008 return ret;
f6d3e3e4
SH
3009}
3010
54117de5 3011int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3012{
c4d10a05 3013 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3014 return 0;
c4d10a05 3015
54117de5
CB
3016 if (!strcmp(c->console.name, ""))
3017 return 0;
3018
3019 if (chown_mapped_root(c->console.name, c) < 0) {
3020 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
3021 return -1;
3022 }
3023
54117de5
CB
3024 TRACE("chowned console \"%s\"", c->console.name);
3025
f6d3e3e4
SH
3026 return 0;
3027}
3028
943144d9
CB
3029/* NOTE: Must not be called from inside the container namespace! */
3030int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3031{
3032 int mounted;
3033
943144d9 3034 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3035 if (mounted == -1) {
943144d9 3036 SYSERROR("failed to mount /proc in the container");
01958b1f 3037 /* continue only if there is no rootfs */
943144d9 3038 if (conf->rootfs.path)
01958b1f 3039 return -1;
5112cd70 3040 } else if (mounted == 1) {
943144d9 3041 conf->tmp_umount_proc = 1;
5112cd70 3042 }
943144d9 3043
5112cd70
SH
3044 return 0;
3045}
3046
3047void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3048{
3049 if (lxc_conf->tmp_umount_proc == 1) {
3050 umount("/proc");
3051 lxc_conf->tmp_umount_proc = 0;
3052 }
3053}
3054
/* Walk /proc/self/mountinfo and change every mount whose options contain
 * "shared" to a slave mount.
 *
 * Failures are logged but never abort container startup.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *fp;

	fp = fopen("/proc/self/mountinfo", "r");
	if (!fp) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, fp) != -1) {
		char *mntpoint, *opts;

		/* NOTE(review): presumably get_field() returns a pointer to
		 * the n-th whitespace-separated field of its argument -
		 * confirm against its definition.
		 */
		mntpoint = get_field(line, 4);
		if (!mntpoint)
			continue;

		opts = get_field(mntpoint, 2);
		if (!opts)
			continue;

		/* opts is located via mntpoint, so it has to be found and
		 * inspected before mntpoint itself gets terminated below.
		 */
		null_endofword(opts);
		if (!strstr(opts, "shared"))
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(fp);
	free(line);
}
3088
2322903b
SH
3089void lxc_execute_bind_init(struct lxc_conf *conf)
3090{
3091 int ret;
9d9c111c
SH
3092 char path[PATH_MAX], destpath[PATH_MAX], *p;
3093
3094 /* If init exists in the container, don't bind mount a static one */
3095 p = choose_init(conf->rootfs.mount);
3096 if (p) {
3097 free(p);
3098 return;
3099 }
2322903b
SH
3100
3101 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3102 if (ret < 0 || ret >= PATH_MAX) {
3103 WARN("Path name too long searching for lxc.init.static");
3104 return;
3105 }
3106
3107 if (!file_exists(path)) {
3108 INFO("%s does not exist on host", path);
3109 return;
3110 }
3111
3112 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3113 if (ret < 0 || ret >= PATH_MAX) {
3114 WARN("Path name too long for container's lxc.init.static");
3115 return;
3116 }
3117
3118 if (!file_exists(destpath)) {
3119 FILE * pathfile = fopen(destpath, "wb");
3120 if (!pathfile) {
3121 SYSERROR("Failed to create mount target '%s'", destpath);
3122 return;
3123 }
3124 fclose(pathfile);
3125 }
3126
592fd47a 3127 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3128 if (ret < 0)
3129 SYSERROR("Failed to bind lxc.init.static into container");
3130 INFO("lxc.init.static bound into container at %s", path);
3131}
3132
35120d9c
SH
3133/*
3134 * This does the work of remounting / if it is shared, calling the
3135 * container pre-mount hooks, and mounting the rootfs.
3136 */
3137int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3138{
35120d9c
SH
3139 if (conf->rootfs_setup) {
3140 /*
3141 * rootfs was set up in another namespace. bind-mount it
3142 * to give us a mount in our own ns so we can pivot_root to it
3143 */
3144 const char *path = conf->rootfs.mount;
3145 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3146 ERROR("Failed to bind-mount container / onto itself");
145832ba 3147 return -1;
35120d9c 3148 }
145832ba 3149 return 0;
35120d9c 3150 }
d4ef7c50 3151
e995d7a2
SH
3152 remount_all_slave();
3153
14a7b0f9 3154 if (run_lxc_hooks(name, "pre-mount", conf, NULL)) {
35120d9c
SH
3155 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3156 return -1;
3157 }
3158
9aa76a17 3159 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
3160 ERROR("failed to setup rootfs for '%s'", name);
3161 return -1;
3162 }
3163
3164 conf->rootfs_setup = true;
3165 return 0;
3166}
3167
1c1c7051
SH
3168static bool verify_start_hooks(struct lxc_conf *conf)
3169{
3170 struct lxc_list *it;
3171 char path[MAXPATHLEN];
3172 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3173 char *hookname = it->elem;
3174 struct stat st;
3175 int ret;
3176
3177 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 3178 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
3179 if (ret < 0 || ret >= MAXPATHLEN)
3180 return false;
3181 ret = stat(path, &st);
3182 if (ret) {
7b6753e7 3183 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
3184 hookname);
3185 return false;
3186 }
6a0c909a 3187 return true;
1c1c7051
SH
3188 }
3189
3190 return true;
3191}
3192
3b988b33 3193int lxc_setup(struct lxc_handler *handler)
35120d9c 3194{
2187efd3 3195 int ret;
35120d9c
SH
3196 const char *name = handler->name;
3197 struct lxc_conf *lxc_conf = handler->conf;
3198 const char *lxcpath = handler->lxcpath;
35120d9c
SH
3199
3200 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3201 ERROR("Error setting up rootfs mount after spawn");
3202 return -1;
3203 }
3204
28d9e29e 3205 if (handler->nsfd[LXC_NS_UTS] == -1) {
6c544cb3
MM
3206 if (setup_utsname(lxc_conf->utsname)) {
3207 ERROR("failed to setup the utsname for '%s'", name);
3208 return -1;
3209 }
0ad19a3f 3210 }
3211
811ef482 3212 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
36eb9bde 3213 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 3214 return -1;
0ad19a3f 3215 }
3216
790255cf
CB
3217 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3218 ERROR("Failed to network device names and ifindices to parent");
3219 return -1;
3220 }
3221
bc6928ff 3222 if (lxc_conf->autodev > 0) {
14221cbb 3223 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 3224 ERROR("failed to mount /dev in the container");
c6883f38
SH
3225 return -1;
3226 }
3227 }
3228
368bbc02
CS
3229 /* do automatic mounts (mainly /proc and /sys), but exclude
3230 * those that need to wait until other stuff has finished
3231 */
4fb3cba5 3232 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3233 ERROR("failed to setup the automatic mounts for '%s'", name);
3234 return -1;
3235 }
3236
06749971 3237 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 3238 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 3239 return -1;
576f946d 3240 }
3241
06749971 3242 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
3243 ERROR("failed to setup the mount entries for '%s'", name);
3244 return -1;
3245 }
3246
7b6753e7 3247 /* Make sure any start hooks are in the container */
1c1c7051
SH
3248 if (!verify_start_hooks(lxc_conf))
3249 return -1;
3250
2322903b
SH
3251 if (lxc_conf->is_execute)
3252 lxc_execute_bind_init(lxc_conf);
3253
368bbc02
CS
3254 /* now mount only cgroup, if wanted;
3255 * before, /sys could not have been mounted
3256 * (is either mounted automatically or via fstab entries)
3257 */
4fb3cba5 3258 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3259 ERROR("failed to setup the automatic mounts for '%s'", name);
3260 return -1;
3261 }
3262
14a7b0f9 3263 if (run_lxc_hooks(name, "mount", lxc_conf, NULL)) {
773fb9ca
SH
3264 ERROR("failed to run mount hooks for container '%s'.", name);
3265 return -1;
3266 }
3267
bc6928ff 3268 if (lxc_conf->autodev > 0) {
14a7b0f9 3269 if (run_lxc_hooks(name, "autodev", lxc_conf, NULL)) {
f7bee6c6
MW
3270 ERROR("failed to run autodev hooks for container '%s'.", name);
3271 return -1;
3272 }
06749971 3273
27245ff7 3274 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
3275 ERROR("failed to populate /dev in the container");
3276 return -1;
3277 }
3278 }
368bbc02 3279
ed8704d0
CB
3280 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
3281 lxc_conf->ttydir);
3282 if (ret < 0) {
3283 ERROR("Failed to setup console");
95b5ffaf 3284 return -1;
6e590161 3285 }
3286
ed8704d0
CB
3287 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3288 if (ret < 0) {
3289 ERROR("Failed to setup /dev symlinks");
69aa6655
DE
3290 return -1;
3291 }
3292
5112cd70 3293 /* mount /proc if it's not already there */
943144d9 3294 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 3295 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 3296 return -1;
e075f5d9 3297 }
e075f5d9 3298
ac778708 3299 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 3300 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 3301 return -1;
ed502555 3302 }
3303
70761e5e 3304 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 3305 ERROR("failed to setup the new pts instance");
95b5ffaf 3306 return -1;
3c26f34e 3307 }
3308
2187efd3
CB
3309 ret = lxc_create_ttys(handler);
3310 if (ret < 0)
e8bd4e43 3311 return -1;
e8bd4e43 3312
cccc74b5
DL
3313 if (setup_personality(lxc_conf->personality)) {
3314 ERROR("failed to setup personality");
3315 return -1;
3316 }
3317
7edd0540
L
3318 /* set sysctl value to a path under /proc/sys as determined from the key.
3319 * For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
3320 */
3321 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3322 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
3323 if (ret < 0)
3324 return -1;
3325 }
3326
97a8f74f
SG
3327 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3328 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 3329 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
3330 return -1;
3331 }
97a8f74f
SG
3332 if (dropcaps_except(&lxc_conf->keepcaps)) {
3333 ERROR("failed to keep requested caps");
3334 return -1;
3335 }
3336 } else if (setup_caps(&lxc_conf->caps)) {
3337 ERROR("failed to drop capabilities");
3338 return -1;
81810dd1
DL
3339 }
3340
f4152036 3341 NOTICE("Container \"%s\" is set up", name);
cd54d859 3342
0ad19a3f 3343 return 0;
3344}
26ddeedd 3345
3f60c2f7 3346int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
14a7b0f9 3347 char *argv[])
26ddeedd 3348{
26ddeedd 3349 struct lxc_list *it;
3f60c2f7 3350 int which = -1;
26ddeedd 3351
3f60c2f7 3352 if (strcmp(hookname, "pre-start") == 0)
26ddeedd 3353 which = LXCHOOK_PRESTART;
3f60c2f7 3354 else if (strcmp(hookname, "start-host") == 0)
08dd2805 3355 which = LXCHOOK_START_HOST;
3f60c2f7 3356 else if (strcmp(hookname, "pre-mount") == 0)
5ea6163a 3357 which = LXCHOOK_PREMOUNT;
3f60c2f7 3358 else if (strcmp(hookname, "mount") == 0)
26ddeedd 3359 which = LXCHOOK_MOUNT;
3f60c2f7 3360 else if (strcmp(hookname, "autodev") == 0)
f7bee6c6 3361 which = LXCHOOK_AUTODEV;
3f60c2f7 3362 else if (strcmp(hookname, "start") == 0)
26ddeedd 3363 which = LXCHOOK_START;
3f60c2f7 3364 else if (strcmp(hookname, "stop") == 0)
52492063 3365 which = LXCHOOK_STOP;
3f60c2f7 3366 else if (strcmp(hookname, "post-stop") == 0)
26ddeedd 3367 which = LXCHOOK_POSTSTOP;
3f60c2f7 3368 else if (strcmp(hookname, "clone") == 0)
148e91f5 3369 which = LXCHOOK_CLONE;
3f60c2f7 3370 else if (strcmp(hookname, "destroy") == 0)
37cf711b 3371 which = LXCHOOK_DESTROY;
26ddeedd
SH
3372 else
3373 return -1;
3f60c2f7 3374
26ddeedd
SH
3375 lxc_list_for_each(it, &conf->hooks[which]) {
3376 int ret;
3f60c2f7
CB
3377 char *hook = it->elem;
3378
3379 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
14a7b0f9 3380 hookname, argv);
3f60c2f7
CB
3381 if (ret < 0)
3382 return -1;
26ddeedd 3383 }
3f60c2f7 3384
26ddeedd
SH
3385 return 0;
3386}
72d0e1cb 3387
72d0e1cb
SG
3388int lxc_clear_config_caps(struct lxc_conf *c)
3389{
1a0e70ac 3390 struct lxc_list *it, *next;
72d0e1cb 3391
9ebb03ad 3392 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
3393 lxc_list_del(it);
3394 free(it->elem);
3395 free(it);
3396 }
3397 return 0;
3398}
3399
74a3920a 3400static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
3401 struct lxc_list *it, *next;
3402
4355ab5f 3403 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3404 lxc_list_del(it);
3405 free(it->elem);
3406 free(it);
3407 }
3408 return 0;
3409}
3410
4355ab5f
SH
3411int lxc_clear_idmaps(struct lxc_conf *c)
3412{
3413 return lxc_free_idmap(&c->id_map);
3414}
3415
1fb86a7c
SH
3416int lxc_clear_config_keepcaps(struct lxc_conf *c)
3417{
3418 struct lxc_list *it,*next;
3419
3420 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3421 lxc_list_del(it);
3422 free(it->elem);
3423 free(it);
3424 }
3425 return 0;
3426}
3427
12a50cc6 3428int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 3429{
9ebb03ad 3430 struct lxc_list *it,*next;
72d0e1cb 3431 bool all = false;
a6390f01 3432 const char *k = NULL;
72d0e1cb
SG
3433
3434 if (strcmp(key, "lxc.cgroup") == 0)
3435 all = true;
e409b214
CB
3436 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.") - 1) == 0)
3437 k = key + sizeof("lxc.cgroup.") - 1;
a6390f01
WB
3438 else
3439 return -1;
72d0e1cb 3440
9ebb03ad 3441 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
3442 struct lxc_cgroup *cg = it->elem;
3443 if (!all && strcmp(cg->subsystem, k) != 0)
3444 continue;
3445 lxc_list_del(it);
3446 free(cg->subsystem);
3447 free(cg->value);
3448 free(cg);
3449 free(it);
3450 }
e409b214 3451
72d0e1cb
SG
3452 return 0;
3453}
3454
c6d09e15
WB
3455int lxc_clear_limits(struct lxc_conf *c, const char *key)
3456{
3457 struct lxc_list *it, *next;
3458 bool all = false;
3459 const char *k = NULL;
3460
b668653c 3461 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
c6d09e15 3462 all = true;
b668653c
CB
3463 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.") - 1) == 0)
3464 k = key + sizeof("lxc.limit.") - 1;
3465 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.") - 1) == 0)
3466 k = key + sizeof("lxc.prlimit.") - 1;
c6d09e15
WB
3467 else
3468 return -1;
3469
3470 lxc_list_for_each_safe(it, &c->limits, next) {
3471 struct lxc_limit *lim = it->elem;
3472 if (!all && strcmp(lim->resource, k) != 0)
3473 continue;
3474 lxc_list_del(it);
3475 free(lim->resource);
3476 free(lim);
3477 free(it);
3478 }
b668653c 3479
c6d09e15
WB
3480 return 0;
3481}
3482
7edd0540
L
3483int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3484{
3485 struct lxc_list *it, *next;
3486 bool all = false;
3487 const char *k = NULL;
3488
3489 if (strcmp(key, "lxc.sysctl") == 0)
3490 all = true;
3491 else if (strncmp(key, "lxc.sysctl.", sizeof("lxc.sysctl.") - 1) == 0)
3492 k = key + sizeof("lxc.sysctl.") - 1;
3493 else
3494 return -1;
3495
3496 lxc_list_for_each_safe(it, &c->sysctls, next) {
3497 struct lxc_sysctl *elem = it->elem;
3498 if (!all && strcmp(elem->key, k) != 0)
3499 continue;
3500 lxc_list_del(it);
3501 free(elem->key);
3502 free(elem->value);
3503 free(elem);
3504 free(it);
3505 }
3506 return 0;
3507}
3508
61d7a733
YT
3509int lxc_clear_procs(struct lxc_conf *c, const char *key)
3510{
3511 struct lxc_list *it,*next;
3512 bool all = false;
3513 const char *k = NULL;
3514
3515 if (strcmp(key, "lxc.proc") == 0)
3516 all = true;
3517 else if (strncmp(key, "lxc.proc.", sizeof("lxc.proc.") - 1) == 0)
3518 k = key + sizeof("lxc.proc.") - 1;
3519 else
3520 return -1;
3521
3522 lxc_list_for_each_safe(it, &c->procs, next) {
3523 struct lxc_proc *proc = it->elem;
3524 if (!all && strcmp(proc->filename, k) != 0)
3525 continue;
3526 lxc_list_del(it);
3527 free(proc->filename);
3528 free(proc->value);
3529 free(proc);
3530 free(it);
3531 }
3532
3533 return 0;
3534}
3535
ee1e7aa0
SG
3536int lxc_clear_groups(struct lxc_conf *c)
3537{
3538 struct lxc_list *it,*next;
3539
3540 lxc_list_for_each_safe(it, &c->groups, next) {
3541 lxc_list_del(it);
3542 free(it->elem);
3543 free(it);
3544 }
3545 return 0;
3546}
3547
ab799c0b
SG
3548int lxc_clear_environment(struct lxc_conf *c)
3549{
3550 struct lxc_list *it,*next;
3551
3552 lxc_list_for_each_safe(it, &c->environment, next) {
3553 lxc_list_del(it);
3554 free(it->elem);
3555 free(it);
3556 }
3557 return 0;
3558}
3559
72d0e1cb
SG
3560int lxc_clear_mount_entries(struct lxc_conf *c)
3561{
9ebb03ad 3562 struct lxc_list *it,*next;
72d0e1cb 3563
9ebb03ad 3564 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
3565 lxc_list_del(it);
3566 free(it->elem);
3567 free(it);
3568 }
3569 return 0;
3570}
3571
b099e9e9
SH
3572int lxc_clear_automounts(struct lxc_conf *c)
3573{
3574 c->auto_mounts = 0;
3575 return 0;
3576}
3577
12a50cc6 3578int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3579{
9ebb03ad 3580 struct lxc_list *it,*next;
17ed13a3 3581 bool all = false, done = false;
a6390f01 3582 const char *k = NULL;
72d0e1cb
SG
3583 int i;
3584
17ed13a3
SH
3585 if (strcmp(key, "lxc.hook") == 0)
3586 all = true;
a6390f01
WB
3587 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3588 k = key + sizeof("lxc.hook.")-1;
3589 else
3590 return -1;
17ed13a3 3591
72d0e1cb 3592 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 3593 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 3594 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
3595 lxc_list_del(it);
3596 free(it->elem);
3597 free(it);
3598 }
3599 done = true;
72d0e1cb
SG
3600 }
3601 }
17ed13a3
SH
3602
3603 if (!done) {
3604 ERROR("Invalid hook key: %s", key);
3605 return -1;
3606 }
72d0e1cb
SG
3607 return 0;
3608}
8eb5694b 3609
4184c3e1
SH
3610static inline void lxc_clear_aliens(struct lxc_conf *conf)
3611{
3612 struct lxc_list *it,*next;
3613
3614 lxc_list_for_each_safe(it, &conf->aliens, next) {
3615 lxc_list_del(it);
3616 free(it->elem);
3617 free(it);
3618 }
3619}
3620
c7b15d1e 3621void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
3622{
3623 struct lxc_list *it,*next;
3624
3625 lxc_list_for_each_safe(it, &conf->includes, next) {
3626 lxc_list_del(it);
3627 free(it->elem);
3628 free(it);
3629 }
3630}
3631
8eb5694b
SH
3632void lxc_conf_free(struct lxc_conf *conf)
3633{
3634 if (!conf)
3635 return;
858377e4
SH
3636 if (current_config == conf)
3637 current_config = NULL;
3a784510 3638 free(conf->console.buffer_log_file);
f10fad2f
ME
3639 free(conf->console.log_path);
3640 free(conf->console.path);
28f3b1cd 3641 if (conf->console.buffer_size > 0 && conf->console.ringbuf.addr)
732375f5 3642 lxc_ringbuf_release(&conf->console.ringbuf);
f10fad2f 3643 free(conf->rootfs.mount);
b3b8c97f 3644 free(conf->rootfs.bdev_type);
f10fad2f
ME
3645 free(conf->rootfs.options);
3646 free(conf->rootfs.path);
f10fad2f 3647 free(conf->logfile);
858377e4
SH
3648 if (conf->logfd != -1)
3649 close(conf->logfd);
f10fad2f
ME
3650 free(conf->utsname);
3651 free(conf->ttydir);
3652 free(conf->fstab);
3653 free(conf->rcfile);
5cda27c1 3654 free(conf->execute_cmd);
f10fad2f 3655 free(conf->init_cmd);
3c491553 3656 free(conf->init_cwd);
6b0d5538 3657 free(conf->unexpanded_config);
393903d1 3658 free(conf->pty_names);
76d0127f 3659 free(conf->syslog);
c302b476 3660 lxc_free_networks(&conf->network);
f10fad2f
ME
3661 free(conf->lsm_aa_profile);
3662 free(conf->lsm_se_context);
769872f9 3663 lxc_seccomp_free(conf);
8eb5694b 3664 lxc_clear_config_caps(conf);
1fb86a7c 3665 lxc_clear_config_keepcaps(conf);
8eb5694b 3666 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 3667 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3668 lxc_clear_mount_entries(conf);
27c27d73 3669 lxc_clear_idmaps(conf);
ee1e7aa0 3670 lxc_clear_groups(conf);
f979ac15 3671 lxc_clear_includes(conf);
761d81ca 3672 lxc_clear_aliens(conf);
ab799c0b 3673 lxc_clear_environment(conf);
240d4b74 3674 lxc_clear_limits(conf, "lxc.prlimit");
7edd0540 3675 lxc_clear_sysctls(conf, "lxc.sysctl");
61d7a733 3676 lxc_clear_procs(conf, "lxc.proc");
43654d34
CB
3677 free(conf->cgroup_meta.dir);
3678 free(conf->cgroup_meta.controllers);
8eb5694b
SH
3679 free(conf);
3680}
4355ab5f
SH
3681
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function to run once the id mapping has been established. */
	int (*fn)(void *);
	/* Name of @fn for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Synchronization pipe: p[0] is read by the child, p[1] written by the parent. */
	int p[2];
};
3688
3689static int run_userns_fn(void *data)
3690{
3691 struct userns_fn_data *d = data;
3692 char c;
4355ab5f 3693
f8aa4bf3 3694 /* Close write end of the pipe. */
4355ab5f 3695 close(d->p[1]);
f8aa4bf3
CB
3696
3697 /* Wait for parent to finish establishing a new mapping in the user
3698 * namespace we are executing in.
3699 */
4355ab5f
SH
3700 if (read(d->p[0], &c, 1) != 1)
3701 return -1;
f8aa4bf3
CB
3702
3703 /* Close read end of the pipe. */
4355ab5f 3704 close(d->p[0]);
f8aa4bf3 3705
c9b7c33e
CB
3706 if (d->fn_name)
3707 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 3708 /* Call function to run. */
4355ab5f
SH
3709 return d->fn(d->arg);
3710}
3711
339efad9 3712static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
3713 enum idtype idtype)
3714{
3715 struct lxc_list *it;
3716 struct id_map *map;
3717 struct id_map *retmap = NULL;
3718
3719 lxc_list_for_each(it, &conf->id_map) {
3720 map = it->elem;
3721 if (map->idtype != idtype)
3722 continue;
3723
3724 if (id >= map->hostid && id < map->hostid + map->range) {
3725 retmap = map;
3726 break;
3727 }
3728 }
3729
3730 if (!retmap)
3731 return NULL;
3732
3733 retmap = malloc(sizeof(*retmap));
3734 if (!retmap)
3735 return NULL;
3736
3737 memcpy(retmap, map, sizeof(*retmap));
3738 return retmap;
3739}
3740
4355ab5f 3741/*
f8aa4bf3
CB
3742 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3743 * existing one or establish a new one.
4355ab5f 3744 */
28a2d9e7 3745static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 3746{
28a2d9e7 3747 int hostid_mapped;
f8aa4bf3 3748 struct id_map *entry = NULL;
f8aa4bf3 3749
28a2d9e7
CB
3750 /* Reuse existing mapping. */
3751 entry = mapped_hostid_entry(conf, id, type);
3752 if (entry)
3753 return entry;
f8aa4bf3 3754
28a2d9e7
CB
3755 /* Find new mapping. */
3756 hostid_mapped = find_unmapped_nsid(conf, type);
3757 if (hostid_mapped < 0) {
3758 DEBUG("failed to find free mapping for id %d", id);
3759 return NULL;
f8aa4bf3 3760 }
f8aa4bf3 3761
28a2d9e7
CB
3762 entry = malloc(sizeof(*entry));
3763 if (!entry)
3764 return NULL;
4355ab5f 3765
28a2d9e7
CB
3766 entry->idtype = type;
3767 entry->nsid = hostid_mapped;
3768 entry->hostid = (unsigned long)id;
3769 entry->range = 1;
4355ab5f 3770
28a2d9e7 3771 return entry;
4355ab5f
SH
3772}
3773
f8aa4bf3
CB
3774/* Run a function in a new user namespace.
3775 * The caller's euid/egid will be mapped if it is not already.
3776 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3777 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3778 * This means we require only to establish a mapping from:
3779 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3780 * - the container root -> some sub{g,u}id
3781 * The former we add, if the user did not specifiy a mapping. The latter we
3782 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3783 * there to start the container in the first place.
4355ab5f 3784 */
c9b7c33e
CB
3785int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3786 const char *fn_name)
4355ab5f 3787{
f8aa4bf3
CB
3788 pid_t pid;
3789 uid_t euid, egid;
4355ab5f 3790 struct userns_fn_data d;
4355ab5f 3791 int p[2];
f8aa4bf3
CB
3792 struct lxc_list *it;
3793 struct id_map *map;
3794 char c = '1';
686dd5d1 3795 int ret = -1, status = -1;
f8aa4bf3 3796 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
3797 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3798 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 3799
4355ab5f 3800 ret = pipe(p);
4355ab5f
SH
3801 if (ret < 0) {
3802 SYSERROR("opening pipe");
3803 return -1;
3804 }
3805 d.fn = fn;
c9b7c33e 3806 d.fn_name = fn_name;
4355ab5f
SH
3807 d.arg = data;
3808 d.p[0] = p[0];
3809 d.p[1] = p[1];
f8aa4bf3
CB
3810
3811 /* Clone child in new user namespace. */
4355ab5f 3812 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
3813 if (pid < 0) {
3814 ERROR("failed to clone child process in new user namespace");
3815 goto on_error;
3816 }
3817
4355ab5f 3818 close(p[0]);
4355ab5f
SH
3819 p[0] = -1;
3820
954b7d9b
CB
3821 euid = geteuid();
3822 egid = getegid();
3823
f8aa4bf3
CB
3824 /* Find container root. */
3825 lxc_list_for_each(it, &conf->id_map) {
3826 map = it->elem;
3827
3828 if (map->nsid != 0)
3829 continue;
3830
3831 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3832 container_root_uid = malloc(sizeof(*container_root_uid));
3833 if (!container_root_uid)
3834 goto on_error;
3835 container_root_uid->idtype = map->idtype;
3836 container_root_uid->hostid = map->hostid;
3837 container_root_uid->nsid = 0;
3838 container_root_uid->range = map->range;
954b7d9b
CB
3839
3840 /* Check if container root mapping contains a mapping
3841 * for user's uid.
3842 */
3843 if (euid >= map->hostid && euid < map->hostid + map->range)
3844 host_uid_map = container_root_uid;
f8aa4bf3
CB
3845 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3846 container_root_gid = malloc(sizeof(*container_root_gid));
3847 if (!container_root_gid)
3848 goto on_error;
3849 container_root_gid->idtype = map->idtype;
3850 container_root_gid->hostid = map->hostid;
3851 container_root_gid->nsid = 0;
3852 container_root_gid->range = map->range;
954b7d9b
CB
3853
3854 /* Check if container root mapping contains a mapping
3855 * for user's gid.
3856 */
3857 if (egid >= map->hostid && egid < map->hostid + map->range)
3858 host_gid_map = container_root_gid;
f8aa4bf3
CB
3859 }
3860
3861 /* Found container root. */
3862 if (container_root_uid && container_root_gid)
3863 break;
3864 }
3865
3866 /* This is actually checked earlier but it can't hurt. */
3867 if (!container_root_uid || !container_root_gid) {
3868 ERROR("no mapping for container root found");
3869 goto on_error;
3870 }
3871
3872 /* Check whether the {g,u}id of the user has a mapping. */
954b7d9b 3873 if (!host_uid_map)
28a2d9e7
CB
3874 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3875
954b7d9b 3876 if (!host_gid_map)
28a2d9e7
CB
3877 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3878
3879 if (!host_uid_map) {
3880 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
3881 goto on_error;
3882 }
3883
28a2d9e7
CB
3884 if (!host_gid_map) {
3885 DEBUG("failed to find mapping for gid %d", egid);
3886 goto on_error;
3887 }
3888
3889 /* Allocate new {g,u}id map list. */
3890 idmap = malloc(sizeof(*idmap));
3891 if (!idmap)
3892 goto on_error;
3893 lxc_list_init(idmap);
3894
f8aa4bf3
CB
3895 /* Add container root to the map. */
3896 tmplist = malloc(sizeof(*tmplist));
3897 if (!tmplist)
3898 goto on_error;
3899 lxc_list_add_elem(tmplist, container_root_uid);
3900 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3901
1d90e064 3902 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
3903 /* idmap will now keep track of that memory. */
3904 container_root_uid = NULL;
3905
3906 /* Add container root to the map. */
3907 tmplist = malloc(sizeof(*tmplist));
3908 if (!tmplist)
3909 goto on_error;
3910 lxc_list_add_elem(tmplist, host_uid_map);
3911 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3912 }
1d90e064
CB
3913 /* idmap will now keep track of that memory. */
3914 container_root_uid = NULL;
3915 /* idmap will now keep track of that memory. */
3916 host_uid_map = NULL;
f8aa4bf3
CB
3917
3918 tmplist = malloc(sizeof(*tmplist));
3919 if (!tmplist)
3920 goto on_error;
3921 lxc_list_add_elem(tmplist, container_root_gid);
3922 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3923
1d90e064 3924 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
3925 /* idmap will now keep track of that memory. */
3926 container_root_gid = NULL;
3927
3928 tmplist = malloc(sizeof(*tmplist));
3929 if (!tmplist)
3930 goto on_error;
3931 lxc_list_add_elem(tmplist, host_gid_map);
3932 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3933 }
1d90e064
CB
3934 /* idmap will now keep track of that memory. */
3935 container_root_gid = NULL;
3936 /* idmap will now keep track of that memory. */
3937 host_gid_map = NULL;
f8aa4bf3 3938
4b73005c
CB
3939 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3940 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
3941 lxc_list_for_each(it, idmap) {
3942 map = it->elem;
3943 TRACE("establishing %cid mapping for \"%d\" in new "
3944 "user namespace: nsuid %lu - hostid %lu - range "
3945 "%lu",
3946 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3947 map->nsid, map->hostid, map->range);
3948 }
4355ab5f
SH
3949 }
3950
f8aa4bf3 3951 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 3952 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
3953 if (ret < 0) {
3954 ERROR("error setting up {g,u}id mappings for child process "
415a8851 3955 "\"%d\"", pid);
f8aa4bf3 3956 goto on_error;
4355ab5f
SH
3957 }
3958
f8aa4bf3 3959 /* Tell child to proceed. */
4355ab5f 3960 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
3961 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3962 goto on_error;
4355ab5f
SH
3963 }
3964
686dd5d1 3965on_error:
f8aa4bf3 3966 /* Wait for child to finish. */
686dd5d1
CB
3967 if (pid > 0)
3968 status = wait_for_pid(pid);
3139aead 3969
1d90e064
CB
3970 if (idmap)
3971 lxc_free_idmap(idmap);
3972 if (container_root_uid)
3973 free(container_root_uid);
3974 if (container_root_gid)
3975 free(container_root_gid);
3976 if (host_uid_map && (host_uid_map != container_root_uid))
3977 free(host_uid_map);
3978 if (host_gid_map && (host_gid_map != container_root_gid))
3979 free(host_gid_map);
3139aead 3980
4355ab5f
SH
3981 if (p[0] != -1)
3982 close(p[0]);
3983 close(p[1]);
f8aa4bf3 3984
686dd5d1
CB
3985 if (status < 0)
3986 ret = -1;
3987
f8aa4bf3 3988 return ret;
4355ab5f 3989}
97e9cfa0 3990
415a8851
CB
3991int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
3992 const char *fn_name)
3993{
3994 pid_t pid;
3995 uid_t euid, egid;
3996 struct userns_fn_data d;
3997 int p[2];
3998 struct id_map *map;
3999 struct lxc_list *cur;
4000 char c = '1';
4001 int ret = -1;
4002 struct lxc_list *idmap = NULL, *tmplist = NULL;
4003 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4004 *host_uid_map = NULL, *host_gid_map = NULL;
4005
4006 ret = pipe(p);
4007 if (ret < 0) {
4008 SYSERROR("opening pipe");
4009 return -1;
4010 }
4011 d.fn = fn;
4012 d.fn_name = fn_name;
4013 d.arg = data;
4014 d.p[0] = p[0];
4015 d.p[1] = p[1];
4016
4017 /* Clone child in new user namespace. */
4018 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4019 if (pid < 0) {
4020 ERROR("failed to clone child process in new user namespace");
4021 goto on_error;
4022 }
4023
4024 close(p[0]);
4025 p[0] = -1;
4026
4027 euid = geteuid();
4028 egid = getegid();
4029
4030 /* Allocate new {g,u}id map list. */
4031 idmap = malloc(sizeof(*idmap));
4032 if (!idmap)
4033 goto on_error;
4034 lxc_list_init(idmap);
4035
4036 /* Find container root. */
4037 lxc_list_for_each(cur, &conf->id_map) {
4038 struct id_map *tmpmap;
4039
4040 tmplist = malloc(sizeof(*tmplist));
4041 if (!tmplist)
4042 goto on_error;
4043
4044 tmpmap = malloc(sizeof(*tmpmap));
4045 if (!tmpmap) {
4046 free(tmplist);
4047 goto on_error;
4048 }
4049
4050 memset(tmpmap, 0, sizeof(*tmpmap));
4051 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4052 tmplist->elem = tmpmap;
4053
4054 lxc_list_add_tail(idmap, tmplist);
4055
4056 map = cur->elem;
4057
4058 if (map->idtype == ID_TYPE_UID)
4059 if (euid >= map->hostid && euid < map->hostid + map->range)
4060 host_uid_map = map;
4061
4062 if (map->idtype == ID_TYPE_GID)
4063 if (egid >= map->hostid && egid < map->hostid + map->range)
4064 host_gid_map = map;
4065
4066 if (map->nsid != 0)
4067 continue;
4068
4069 if (map->idtype == ID_TYPE_UID)
4070 if (container_root_uid == NULL)
4071 container_root_uid = map;
4072
4073 if (map->idtype == ID_TYPE_GID)
4074 if (container_root_gid == NULL)
4075 container_root_gid = map;
4076 }
4077
4078 if (!container_root_uid || !container_root_gid) {
4079 ERROR("No mapping for container root found");
4080 goto on_error;
4081 }
4082
4083 /* Check whether the {g,u}id of the user has a mapping. */
4084 if (!host_uid_map)
4085 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4086 else
4087 host_uid_map = container_root_uid;
4088
4089 if (!host_gid_map)
4090 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4091 else
4092 host_gid_map = container_root_gid;
4093
4094 if (!host_uid_map) {
4095 DEBUG("Failed to find mapping for uid %d", euid);
4096 goto on_error;
4097 }
4098
4099 if (!host_gid_map) {
4100 DEBUG("Failed to find mapping for gid %d", egid);
4101 goto on_error;
4102 }
4103
4104 if (host_uid_map && (host_uid_map != container_root_uid)) {
4105 /* Add container root to the map. */
4106 tmplist = malloc(sizeof(*tmplist));
4107 if (!tmplist)
4108 goto on_error;
4109 lxc_list_add_elem(tmplist, host_uid_map);
4110 lxc_list_add_tail(idmap, tmplist);
4111 }
4112 /* idmap will now keep track of that memory. */
4113 host_uid_map = NULL;
4114
4115 if (host_gid_map && (host_gid_map != container_root_gid)) {
4116 tmplist = malloc(sizeof(*tmplist));
4117 if (!tmplist)
4118 goto on_error;
4119 lxc_list_add_elem(tmplist, host_gid_map);
4120 lxc_list_add_tail(idmap, tmplist);
4121 }
4122 /* idmap will now keep track of that memory. */
4123 host_gid_map = NULL;
4124
4125 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4126 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4127 lxc_list_for_each(cur, idmap) {
4128 map = cur->elem;
4129 TRACE("establishing %cid mapping for \"%d\" in new "
4130 "user namespace: nsuid %lu - hostid %lu - range "
4131 "%lu",
4132 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4133 map->nsid, map->hostid, map->range);
4134 }
4135 }
4136
4137 /* Set up {g,u}id mapping for user namespace of child process. */
4138 ret = lxc_map_ids(idmap, pid);
4139 if (ret < 0) {
4140 ERROR("error setting up {g,u}id mappings for child process "
4141 "\"%d\"", pid);
4142 goto on_error;
4143 }
4144
4145 /* Tell child to proceed. */
4146 if (write(p[1], &c, 1) != 1) {
4147 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4148 goto on_error;
4149 }
4150
686dd5d1 4151on_error:
415a8851 4152 /* Wait for child to finish. */
686dd5d1
CB
4153 if (pid > 0)
4154 ret = wait_for_pid(pid);
415a8851 4155
415a8851
CB
4156 if (idmap)
4157 lxc_free_idmap(idmap);
4158 if (host_uid_map && (host_uid_map != container_root_uid))
4159 free(host_uid_map);
4160 if (host_gid_map && (host_gid_map != container_root_gid))
4161 free(host_gid_map);
4162
4163 if (p[0] != -1)
4164 close(p[0]);
4165 close(p[1]);
4166
4167 return ret;
4168}
4169
/*
 * Return the login name belonging to the effective uid of the calling process.
 *
 * The name is handed back as a strdup()ed copy that the caller has to free().
 * NULL is returned when getpwuid() finds no entry or when the copy cannot be
 * allocated.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4181
/*
 * Return the group name belonging to the effective gid of the calling process.
 *
 * The name is handed back as a strdup()ed copy that the caller has to free().
 * NULL is returned when getgrgid() finds no entry or when the copy cannot be
 * allocated.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4193
a96a8e8c 4194/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4195void suggest_default_idmap(void)
4196{
4197 FILE *f;
4198 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4199 char *line = NULL;
4200 char *uname, *gname;
4201 size_t len = 0;
4202
4203 if (!(uname = getuname()))
4204 return;
4205
4206 if (!(gname = getgname())) {
4207 free(uname);
4208 return;
4209 }
4210
4211 f = fopen(subuidfile, "r");
4212 if (!f) {
4213 ERROR("Your system is not configured with subuids");
4214 free(gname);
4215 free(uname);
4216 return;
4217 }
4218 while (getline(&line, &len, f) != -1) {
b7930180 4219 size_t no_newline = 0;
97e9cfa0
SH
4220 char *p = strchr(line, ':'), *p2;
4221 if (*line == '#')
4222 continue;
4223 if (!p)
4224 continue;
4225 *p = '\0';
4226 p++;
4227 if (strcmp(line, uname))
4228 continue;
4229 p2 = strchr(p, ':');
4230 if (!p2)
4231 continue;
4232 *p2 = '\0';
4233 p2++;
4234 if (!*p2)
4235 continue;
b7930180
CB
4236 no_newline = strcspn(p2, "\n");
4237 p2[no_newline] = '\0';
4238
b7b2fde4
CB
4239 if (lxc_safe_uint(p, &uid) < 0)
4240 WARN("Could not parse UID.");
4241 if (lxc_safe_uint(p2, &urange) < 0)
4242 WARN("Could not parse UID range.");
97e9cfa0
SH
4243 }
4244 fclose(f);
4245
6be7389a 4246 f = fopen(subgidfile, "r");
97e9cfa0
SH
4247 if (!f) {
4248 ERROR("Your system is not configured with subgids");
4249 free(gname);
4250 free(uname);
4251 return;
4252 }
4253 while (getline(&line, &len, f) != -1) {
b7930180 4254 size_t no_newline = 0;
97e9cfa0
SH
4255 char *p = strchr(line, ':'), *p2;
4256 if (*line == '#')
4257 continue;
4258 if (!p)
4259 continue;
4260 *p = '\0';
4261 p++;
4262 if (strcmp(line, uname))
4263 continue;
4264 p2 = strchr(p, ':');
4265 if (!p2)
4266 continue;
4267 *p2 = '\0';
4268 p2++;
4269 if (!*p2)
4270 continue;
b7930180
CB
4271 no_newline = strcspn(p2, "\n");
4272 p2[no_newline] = '\0';
4273
b7b2fde4
CB
4274 if (lxc_safe_uint(p, &gid) < 0)
4275 WARN("Could not parse GID.");
4276 if (lxc_safe_uint(p2, &grange) < 0)
4277 WARN("Could not parse GID range.");
97e9cfa0
SH
4278 }
4279 fclose(f);
4280
f10fad2f 4281 free(line);
97e9cfa0
SH
4282
4283 if (!urange || !grange) {
4284 ERROR("You do not have subuids or subgids allocated");
4285 ERROR("Unprivileged containers require subuids and subgids");
4286 return;
4287 }
4288
4289 ERROR("You must either run as root, or define uid mappings");
4290 ERROR("To pass uid mappings to lxc-create, you could create");
4291 ERROR("~/.config/lxc/default.conf:");
4292 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
bdcbb6b3
CB
4293 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4294 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
97e9cfa0
SH
4295
4296 free(gname);
4297 free(uname);
4298}
aaf26830 4299
a7307747
SH
4300static void free_cgroup_settings(struct lxc_list *result)
4301{
4302 struct lxc_list *iterator, *next;
4303
4304 lxc_list_for_each_safe(iterator, result, next) {
4305 lxc_list_del(iterator);
4306 free(iterator);
4307 }
4308 free(result);
4309}
4310
aaf26830
KT
4311/*
4312 * Return the list of cgroup_settings sorted according to the following rules
4313 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4314 */
4315struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4316{
4317 struct lxc_list *result;
4318 struct lxc_list *memsw_limit = NULL;
4319 struct lxc_list *it = NULL;
4320 struct lxc_cgroup *cg = NULL;
4321 struct lxc_list *item = NULL;
4322
4323 result = malloc(sizeof(*result));
fac7c663
KT
4324 if (!result) {
4325 ERROR("failed to allocate memory to sort cgroup settings");
4326 return NULL;
4327 }
aaf26830
KT
4328 lxc_list_init(result);
4329
4330 /*Iterate over the cgroup settings and copy them to the output list*/
4331 lxc_list_for_each(it, cgroup_settings) {
4332 item = malloc(sizeof(*item));
fac7c663
KT
4333 if (!item) {
4334 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 4335 free_cgroup_settings(result);
fac7c663
KT
4336 return NULL;
4337 }
aaf26830
KT
4338 item->elem = it->elem;
4339 cg = it->elem;
4340 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4341 /* Store the memsw_limit location */
4342 memsw_limit = item;
4343 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 4344 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
4345 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4346 item->elem = memsw_limit->elem;
4347 memsw_limit->elem = it->elem;
4348 }
4349 lxc_list_add_tail(result, item);
4350 }
4351
4352 return result;
a7307747 4353}