]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
conf: fix tty creation
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "caps.h" /* for lxc_caps_last_cap() */
77#include "cgroup.h"
1b09f2c0 78#include "conf.h"
1ed6ba91 79#include "confile_utils.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
025ed0f3 82#include "lxclock.h"
8f3e280e 83#include "lxcseccomp.h"
4355ab5f 84#include "namespace.h"
8f3e280e
CB
85#include "network.h"
86#include "parse.h"
28d832c4
CB
87#include "storage.h"
88#include "storage/aufs.h"
89#include "storage/overlay.h"
8f3e280e 90#include "utils.h"
fe4de9a6 91#include "lsm/lsm.h"
d0a36f2c 92
e37dda71 93#if HAVE_LIBCAP
495d2046
SG
94#include <sys/capability.h>
95#endif
96
6ff05e18
SG
97#if HAVE_SYS_PERSONALITY_H
98#include <sys/personality.h>
99#endif
100
edaf8b1b
SG
101#if IS_BIONIC
102#include <../include/lxcmntent.h>
a04f5407
CB
103#ifndef HAVE_PRLIMIT
104#include <../include/prlimit.h>
105#endif
edaf8b1b
SG
106#else
107#include <mntent.h>
108#endif
109
36eb9bde 110lxc_log_define(lxc_conf, lxc);
e5bda9ee 111
e37dda71 112#if HAVE_LIBCAP
b09094da
MN
113#ifndef CAP_SETFCAP
114#define CAP_SETFCAP 31
115#endif
116
117#ifndef CAP_MAC_OVERRIDE
118#define CAP_MAC_OVERRIDE 32
119#endif
120
121#ifndef CAP_MAC_ADMIN
122#define CAP_MAC_ADMIN 33
123#endif
495d2046 124#endif
b09094da
MN
125
126#ifndef PR_CAPBSET_DROP
127#define PR_CAPBSET_DROP 24
128#endif
129
9818cae4
SG
130#ifndef LO_FLAGS_AUTOCLEAR
131#define LO_FLAGS_AUTOCLEAR 4
132#endif
133
bc5b27d6
DK
134#ifndef CAP_SETUID
135#define CAP_SETUID 7
136#endif
137
138#ifndef CAP_SETGID
139#define CAP_SETGID 6
140#endif
141
0769b82a
CS
142/* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144#ifndef CAP_SYS_ADMIN
145#define CAP_SYS_ADMIN 21
146#endif
147
2d76d1d7
SG
/*
 * pivot_root() is not wrapped by every C library.  When the build did not
 * detect it (HAVE_PIVOT_ROOT unset) provide a thin wrapper around the raw
 * syscall; without a known syscall number the wrapper fails with ENOSYS.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif
162
/*
 * Fallback sethostname() for C libraries that lack it: forward to the raw
 * syscall, or fail with ENOSYS when no syscall number is known.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
ecec0126
SG
176#ifndef MS_PRIVATE
177#define MS_PRIVATE (1<<18)
178#endif
179
8912711c
CB
180#ifndef MS_LAZYTIME
181#define MS_LAZYTIME (1<<25)
182#endif
183
5ef5c9a3
CB
184/* memfd_create() */
185#ifndef MFD_CLOEXEC
186#define MFD_CLOEXEC 0x0001U
187#endif
188
189#ifndef MFD_ALLOW_SEALING
190#define MFD_ALLOW_SEALING 0x0002U
191#endif
192
/*
 * Fallback memfd_create() for C libraries that do not provide one.  If the
 * kernel headers did not define __NR_memfd_create, a per-architecture
 * syscall number is supplied below; when none is known the wrapper fails
 * with ENOSYS.
 */
#ifndef HAVE_MEMFD_CREATE
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	/* Unknown architecture: no syscall number to call. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
236
2b9ae35a
CB
237char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
72d0e1cb 240
998ac676
RT
/* One textual mount option and the mount(2) flag it maps to. */
struct mount_opt {
	/* Option keyword as written in a mount option string. */
	char *name;
	/* NOTE(review): presumably non-zero when the keyword clears "flag"
	 * rather than setting it (e.g. "rw" vs MS_RDONLY) - confirm in the
	 * option parser. */
	int clear;
	/* MS_* flag bit(s) associated with the keyword. */
	int flag;
};
246
81810dd1
DL
/* Maps a capability keyword to its numeric CAP_* value. */
struct caps_opt {
	/* Lower-case capability name without the "CAP_" prefix. */
	char *name;
	/* Corresponding CAP_* constant. */
	int value;
};
251
c6d09e15
WB
/* Maps a resource-limit keyword to its RLIMIT_* value. */
struct limit_opt {
	/* Lower-case limit name without the "RLIMIT_" prefix. */
	char *name;
	/* Corresponding RLIMIT_* constant. */
	int value;
};
256
858377e4
SH
/*
 * Configuration of the container the running API call is operating on.
 * The error-reporting calls consult it.  Declared thread-local whenever the
 * build has TLS support (HAVE_TLS).
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
267
0769b82a
CS
/*
 * Forward declaration: the definition lives further down the file and is
 * needed earlier; declaring it here avoids reordering the whole file.
 */
static int in_caplist(int cap, struct lxc_list *caps);
270
998ac676 271static struct mount_opt mount_opt[] = {
470b359b
CB
272 { "async", 1, MS_SYNCHRONOUS },
273 { "atime", 1, MS_NOATIME },
274 { "bind", 0, MS_BIND },
88d413d5 275 { "defaults", 0, 0 },
88d413d5 276 { "dev", 1, MS_NODEV },
470b359b 277 { "diratime", 1, MS_NODIRATIME },
88d413d5 278 { "dirsync", 0, MS_DIRSYNC },
470b359b 279 { "exec", 1, MS_NOEXEC },
8912711c 280 { "lazytime", 0, MS_LAZYTIME },
88d413d5 281 { "mand", 0, MS_MANDLOCK },
88d413d5 282 { "noatime", 0, MS_NOATIME },
470b359b 283 { "nodev", 0, MS_NODEV },
88d413d5 284 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
285 { "noexec", 0, MS_NOEXEC },
286 { "nomand", 1, MS_MANDLOCK },
287 { "norelatime", 1, MS_RELATIME },
288 { "nostrictatime", 1, MS_STRICTATIME },
289 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
290 { "rbind", 0, MS_BIND|MS_REC },
291 { "relatime", 0, MS_RELATIME },
470b359b
CB
292 { "remount", 0, MS_REMOUNT },
293 { "ro", 0, MS_RDONLY },
294 { "rw", 1, MS_RDONLY },
88d413d5 295 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
296 { "suid", 1, MS_NOSUID },
297 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 298 { NULL, 0, 0 },
998ac676
RT
299};
300
e37dda71 301#if HAVE_LIBCAP
81810dd1 302static struct caps_opt caps_opt[] = {
a6afdde9 303 { "chown", CAP_CHOWN },
1e11be34
DL
304 { "dac_override", CAP_DAC_OVERRIDE },
305 { "dac_read_search", CAP_DAC_READ_SEARCH },
306 { "fowner", CAP_FOWNER },
307 { "fsetid", CAP_FSETID },
81810dd1
DL
308 { "kill", CAP_KILL },
309 { "setgid", CAP_SETGID },
310 { "setuid", CAP_SETUID },
311 { "setpcap", CAP_SETPCAP },
312 { "linux_immutable", CAP_LINUX_IMMUTABLE },
313 { "net_bind_service", CAP_NET_BIND_SERVICE },
314 { "net_broadcast", CAP_NET_BROADCAST },
315 { "net_admin", CAP_NET_ADMIN },
316 { "net_raw", CAP_NET_RAW },
317 { "ipc_lock", CAP_IPC_LOCK },
318 { "ipc_owner", CAP_IPC_OWNER },
319 { "sys_module", CAP_SYS_MODULE },
320 { "sys_rawio", CAP_SYS_RAWIO },
321 { "sys_chroot", CAP_SYS_CHROOT },
322 { "sys_ptrace", CAP_SYS_PTRACE },
323 { "sys_pacct", CAP_SYS_PACCT },
324 { "sys_admin", CAP_SYS_ADMIN },
325 { "sys_boot", CAP_SYS_BOOT },
326 { "sys_nice", CAP_SYS_NICE },
327 { "sys_resource", CAP_SYS_RESOURCE },
328 { "sys_time", CAP_SYS_TIME },
329 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
330 { "mknod", CAP_MKNOD },
331 { "lease", CAP_LEASE },
57b837e2
CB
332#ifdef CAP_AUDIT_READ
333 { "audit_read", CAP_AUDIT_READ },
334#endif
9527e566 335#ifdef CAP_AUDIT_WRITE
81810dd1 336 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
337#endif
338#ifdef CAP_AUDIT_CONTROL
81810dd1 339 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 340#endif
81810dd1
DL
341 { "setfcap", CAP_SETFCAP },
342 { "mac_override", CAP_MAC_OVERRIDE },
343 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
344#ifdef CAP_SYSLOG
345 { "syslog", CAP_SYSLOG },
346#endif
347#ifdef CAP_WAKE_ALARM
348 { "wake_alarm", CAP_WAKE_ALARM },
349#endif
2b54359b
CB
350#ifdef CAP_BLOCK_SUSPEND
351 { "block_suspend", CAP_BLOCK_SUSPEND },
352#endif
81810dd1 353};
495d2046
SG
354#else
355static struct caps_opt caps_opt[] = {};
356#endif
81810dd1 357
c6d09e15
WB
358static struct limit_opt limit_opt[] = {
359#ifdef RLIMIT_AS
360 { "as", RLIMIT_AS },
361#endif
362#ifdef RLIMIT_CORE
363 { "core", RLIMIT_CORE },
364#endif
365#ifdef RLIMIT_CPU
366 { "cpu", RLIMIT_CPU },
367#endif
368#ifdef RLIMIT_DATA
369 { "data", RLIMIT_DATA },
370#endif
371#ifdef RLIMIT_FSIZE
372 { "fsize", RLIMIT_FSIZE },
373#endif
374#ifdef RLIMIT_LOCKS
375 { "locks", RLIMIT_LOCKS },
376#endif
377#ifdef RLIMIT_MEMLOCK
378 { "memlock", RLIMIT_MEMLOCK },
379#endif
380#ifdef RLIMIT_MSGQUEUE
381 { "msgqueue", RLIMIT_MSGQUEUE },
382#endif
383#ifdef RLIMIT_NICE
384 { "nice", RLIMIT_NICE },
385#endif
386#ifdef RLIMIT_NOFILE
387 { "nofile", RLIMIT_NOFILE },
388#endif
389#ifdef RLIMIT_NPROC
390 { "nproc", RLIMIT_NPROC },
391#endif
392#ifdef RLIMIT_RSS
393 { "rss", RLIMIT_RSS },
394#endif
395#ifdef RLIMIT_RTPRIO
396 { "rtprio", RLIMIT_RTPRIO },
397#endif
398#ifdef RLIMIT_RTTIME
399 { "rttime", RLIMIT_RTTIME },
400#endif
401#ifdef RLIMIT_SIGPENDING
402 { "sigpending", RLIMIT_SIGPENDING },
403#endif
404#ifdef RLIMIT_STACK
405 { "stack", RLIMIT_STACK },
406#endif
407};
408
91c3830e
SH
409static int run_buffer(char *buffer)
410{
ebec9176 411 struct lxc_popen_FILE *f;
91c3830e 412 char *output;
8e7da691 413 int ret;
91c3830e 414
ebec9176 415 f = lxc_popen(buffer);
91c3830e 416 if (!f) {
062b72c6 417 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
418 return -1;
419 }
420
421 output = malloc(LXC_LOG_BUFFER_SIZE);
422 if (!output) {
062b72c6 423 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 424 lxc_pclose(f);
91c3830e
SH
425 return -1;
426 }
427
062b72c6
CB
428 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
429 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
430
431 free(output);
432
ebec9176 433 ret = lxc_pclose(f);
8e7da691 434 if (ret == -1) {
062b72c6 435 SYSERROR("Script exited with error.");
91c3830e 436 return -1;
8e7da691 437 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 438 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
439 return -1;
440 } else if (WIFSIGNALED(ret)) {
062b72c6 441 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 442 return -1;
91c3830e
SH
443 }
444
445 return 0;
446}
447
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * run it through run_buffer().
 *
 * argsin may be NULL; otherwise it is a NULL-terminated array of extra
 * arguments appended to the command line.  lxcpath is accepted but not used
 * here.  Returns run_buffer()'s result, or -1 if the command line is too
 * long or could not be assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	char **arg;
	char *cmd;
	int used;
	size_t cmd_len;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Four words, three separating blanks and the terminating NUL. */
	cmd_len = strlen(script) + strlen(name) + strlen(section) +
		  strlen(hook) + 4;

	/* Each extra argument is preceded by one blank. */
	if (argsin)
		for (arg = argsin; *arg; arg++)
			cmd_len += strlen(*arg) + 1;

	if (cmd_len > INT_MAX)
		return -1;

	cmd = alloca(cmd_len);
	if (!cmd) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(cmd, cmd_len, "%s %s %s %s", script, name, section,
			hook);
	if (used < 0 || (size_t)used >= cmd_len) {
		ERROR("Script name too long.");
		return -1;
	}

	if (argsin) {
		for (arg = argsin; *arg; arg++) {
			int room = cmd_len - used;
			int written;

			written = snprintf(cmd + used, room, " %s", *arg);
			if (written < 0 || written >= room) {
				ERROR("Script args too long.");
				return -1;
			}
			used += written;
		}
	}

	return run_buffer(cmd);
}
498
/*
 * Build the command line "<script> <name> <section> [args...]" and run it
 * through run_buffer().
 *
 * The variadic arguments are extra "char *" words appended to the command
 * line; the list must be terminated by a NULL pointer.
 *
 * Returns run_buffer()'s result, or -1 if the command line is too long or
 * could not be assembled.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: each extra argument needs its length plus one blank. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	/* Three fixed words, two separating blanks and the terminating NUL. */
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = alloca(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() must be paired with va_end(). */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
549
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep it
 * open for the duration of the container run, to prevent the container from
 * marking the underlying fs readonly on shutdown. The file is unlinked
 * immediately so that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path cannot
 *           be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Treat both an output error and truncation as failure. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open descriptor is what pins the fs; the name is not needed. */
	(void)unlink(absrootfspin);

	return fd;
}
592
e2a7e8dc
SH
/*
 * When a remount is requested, make sure the restrictive flags already in
 * effect on the mount (nosuid, nodev, ro, noexec) are carried over.
 *
 * s is the path inspected with statvfs(); when s is NULL, d is used instead.
 * flags is returned unchanged when MS_REMOUNT is not set, when neither path
 * is given, when statvfs() fails, or when the build lacks statvfs().
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *path = s ? s : d;
	struct statvfs info;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	if (path == NULL)
		return flags;

	if (statvfs(path, &info) < 0)
		return flags;

	/* Keep whichever of the restrictive bits the mount currently has. */
	return flags | (info.f_flag & sticky);
#else
	return flags;
#endif
}
629
4fb3cba5 630static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 631{
368bbc02 632 int r;
80e80c40 633 int i;
b06b8511
CS
634 static struct {
635 int match_mask;
636 int match_flag;
637 const char *source;
638 const char *destination;
639 const char *fstype;
640 unsigned long flags;
641 const char *options;
642 } default_mounts[] = {
643 /* Read-only bind-mounting... In older kernels, doing that required
644 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
645 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
646 * kernel 2.6.26 onwards. However, this apparently does not work on
647 * kernel 3.8. Unfortunately, on that very same kernel, doing the
648 * same trick as above doesn't seem to work either, there one needs
649 * to ALSO specify MS_BIND for the remount, otherwise the entire
650 * fs is remounted read-only or the mount fails because it's busy...
651 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
652 * 2.6.32...
368bbc02 653 */
f24a52d5 654 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
655 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
656 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
657 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
658 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 659 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
660 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
661 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
662 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
663 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
664 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
665 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
666 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
667 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
668 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
669 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
670 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
671 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 672 };
368bbc02 673
b06b8511
CS
674 for (i = 0; default_mounts[i].match_mask; i++) {
675 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
676 char *source = NULL;
677 char *destination = NULL;
678 int saved_errno;
e2a7e8dc 679 unsigned long mflags;
b06b8511
CS
680
681 if (default_mounts[i].source) {
682 /* will act like strdup if %r is not present */
8ede5f4c 683 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
684 if (!source) {
685 SYSERROR("memory allocation error");
686 return -1;
687 }
688 }
cc4fd506
SH
689 if (!default_mounts[i].destination) {
690 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 691 free(source);
cc4fd506
SH
692 return -1;
693 }
694 /* will act like strdup if %r is not present */
695 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
696 if (!destination) {
697 saved_errno = errno;
698 SYSERROR("memory allocation error");
699 free(source);
700 errno = saved_errno;
701 return -1;
b06b8511 702 }
e2a7e8dc
SH
703 mflags = add_required_remount_flags(source, destination,
704 default_mounts[i].flags);
592fd47a 705 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 706 saved_errno = errno;
b88ff9a0
SG
707 if (r < 0 && errno == ENOENT) {
708 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
709 r = 0;
710 }
711 else if (r < 0)
e2a7e8dc 712 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 713
b06b8511
CS
714 free(source);
715 free(destination);
716 if (r < 0) {
b06b8511
CS
717 errno = saved_errno;
718 return -1;
719 }
368bbc02 720 }
368bbc02
CS
721 }
722
b06b8511 723 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
724 int cg_flags;
725
726 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
727 /* If the type of cgroup mount was not specified, it depends on the
728 * container's capabilities as to what makes sense: if we have
729 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
730 * anyway, so we may as well default to read-write; then the admin
731 * will not be given a false sense of security. (And if they really
732 * want mixed r/o r/w, then they can explicitly specify :mixed.)
733 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
734 * :mixed, because then the container can't remount it read-write. */
735 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
736 int has_sys_admin = 0;
b0ee5983
CB
737
738 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 739 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 740 else
0769b82a 741 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
742
743 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 744 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 745 else
0769b82a 746 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
747 }
748
8ede5f4c 749 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 750 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 751 return -1;
368bbc02
CS
752 }
753 }
754
368bbc02 755 return 0;
368bbc02
CS
756}
757
/*
 * Set the hostname from utsname->nodename.
 *
 * A NULL utsname means no hostname was configured and is not an error.
 * Returns 0 on success and -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);

	return 0;
}
772
69aa6655
DE
/* Describes one symlink to create below the container's /dev. */
struct dev_symlinks {
	/* Target the symlink points at. */
	const char *oldpath;
	/* Name of the symlink inside /dev. */
	const char *name;
};
777
778static const struct dev_symlinks dev_symlinks[] = {
779 {"/proc/self/fd", "fd"},
780 {"/proc/self/fd/0", "stdin"},
781 {"/proc/self/fd/1", "stdout"},
782 {"/proc/self/fd/2", "stderr"},
783};
784
785static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
786{
787 char path[MAXPATHLEN];
788 int ret,i;
09227be2 789 struct stat s;
69aa6655
DE
790
791
792 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
793 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 794 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
795 if (ret < 0 || ret >= MAXPATHLEN)
796 return -1;
09227be2
MW
797
798 /*
799 * Stat the path first. If we don't get an error
800 * accept it as is and don't try to create it
801 */
802 if (!stat(path, &s)) {
803 continue;
804 }
805
69aa6655 806 ret = symlink(d->oldpath, path);
09227be2 807
69aa6655 808 if (ret && errno != EEXIST) {
09227be2
MW
809 if ( errno == EROFS ) {
810 WARN("Warning: Read Only file system while creating %s", path);
811 } else {
812 SYSERROR("Error creating %s", path);
813 return -1;
814 }
69aa6655
DE
815 }
816 }
817 return 0;
818}
819
/*
 * Append one pty name to the space-separated list handed to systemd.
 *
 * On the first call (*pp == NULL) the list is allocated and initialised to
 * "container_ttys=<name>"; later calls grow it by " <name>".  Returns false
 * if memory cannot be allocated, in which case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t list_len;
	char *list = *pp;

	if (list == NULL) {
		/* prefix + name + terminating NUL */
		list = malloc(prefix_len + name_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	/* existing list + blank + name + terminating NUL */
	list_len = strlen(list);
	list = realloc(list, list_len + name_len + 2);
	if (list == NULL)
		return false;

	list[list_len] = ' ';
	memcpy(list + list_len + 1, name, name_len + 1);
	*pp = list;
	return true;
}
840
2187efd3 841static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 842{
9e1045e3 843 int i, ret;
393903d1
SH
844 const struct lxc_tty_info *tty_info = &conf->tty_info;
845 char *ttydir = conf->ttydir;
7c6ef2a2 846 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 847
e8bd4e43 848 if (!conf->rootfs.path)
bc9bd0e3
DL
849 return 0;
850
b0a33c1e 851 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 852 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
853
e8bd4e43 854 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 855 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 856 return -1;
9e1045e3 857
7c6ef2a2
SH
858 if (ttydir) {
859 /* create dev/lxc/tty%d" */
9e1045e3
CB
860 ret = snprintf(lxcpath, sizeof(lxcpath),
861 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 862 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 863 return -1;
9e1045e3 864
7c6ef2a2 865 ret = creat(lxcpath, 0660);
9e1045e3 866 if (ret < 0 && errno != EEXIST) {
73363c61 867 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
868 return -1;
869 }
4d44e274
SH
870 if (ret >= 0)
871 close(ret);
9e1045e3 872
7c6ef2a2 873 ret = unlink(path);
9e1045e3 874 if (ret < 0 && errno != ENOENT) {
73363c61 875 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
876 return -1;
877 }
b0a33c1e 878
9e1045e3
CB
879 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
880 if (ret < 0) {
73363c61 881 WARN("Failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
882 pty_info->name, path);
883 continue;
884 }
9e1045e3
CB
885 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
886 path);
13954cce 887
9e1045e3
CB
888 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
889 ttydir, i + 1);
73363c61 890 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 891 return -1;
9e1045e3 892
7c6ef2a2 893 ret = symlink(lxcpath, path);
9e1045e3 894 if (ret < 0) {
73363c61 895 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 896 path, lxcpath);
7c6ef2a2
SH
897 return -1;
898 }
899 } else {
9e1045e3
CB
900 /* If we populated /dev, then we need to create
901 * /dev/ttyN
902 */
903 ret = access(path, F_OK);
904 if (ret < 0) {
c6883f38 905 ret = creat(path, 0660);
9e1045e3 906 if (ret < 0) {
73363c61 907 SYSERROR("Failed to create \"%s\"", path);
c6883f38 908 /* this isn't fatal, continue */
025ed0f3 909 } else {
c6883f38 910 close(ret);
025ed0f3 911 }
c6883f38 912 }
9e1045e3
CB
913
914 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
915 if (ret < 0) {
73363c61 916 SYSERROR("Failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
917 continue;
918 }
9e1045e3 919
73363c61 920 DEBUG("Bind mounted \"%s\" onto \"%s\"", pty_info->name,
9e1045e3 921 path);
393903d1 922 }
9e1045e3 923
e8bd4e43 924 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
925 ERROR("Error setting up container_ttys string");
926 return -1;
b0a33c1e 927 }
928 }
929
73363c61 930 INFO("Finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 931 return 0;
932}
933
2187efd3
CB
934int lxc_allocate_ttys(const char *name, struct lxc_conf *conf)
935{
936 struct lxc_tty_info *tty_info = &conf->tty_info;
937 int i, ret;
938
939 /* no tty in the configuration */
940 if (!conf->tty)
941 return 0;
942
943 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
944 if (!tty_info->pty_info) {
945 SYSERROR("failed to allocate struct *pty_info");
946 return -ENOMEM;
947 }
948
949 for (i = 0; i < conf->tty; i++) {
950 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
951
952 process_lock();
953 ret = openpty(&pty_info->master, &pty_info->slave,
954 pty_info->name, NULL, NULL);
955 process_unlock();
956 if (ret) {
957 SYSERROR("failed to create pty device number %d", i);
958 tty_info->nbtty = i;
959 lxc_delete_tty(tty_info);
960 return -ENOTTY;
961 }
962
963 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
964 pty_info->name, pty_info->master, pty_info->slave);
965
966 /* Prevent leaking the file descriptors to the container */
967 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
968 if (ret < 0)
969 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
970 "pty device \"%s\": %s",
971 pty_info->master, pty_info->name, strerror(errno));
972
973 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
974 if (ret < 0)
975 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
976 "pty device \"%s\": %s",
977 pty_info->slave, pty_info->name, strerror(errno));
978
979 pty_info->busy = 0;
980 }
981
982 tty_info->nbtty = conf->tty;
983
984 INFO("finished allocating %d pts devices", conf->tty);
985 return 0;
986}
987
988void lxc_delete_tty(struct lxc_tty_info *tty_info)
989{
990 int i;
991
992 for (i = 0; i < tty_info->nbtty; i++) {
993 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
994
995 close(pty_info->master);
996 close(pty_info->slave);
997 }
998
999 free(tty_info->pty_info);
1000 tty_info->pty_info = NULL;
1001 tty_info->nbtty = 0;
1002}
1003
1004static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1005{
1006 int i;
1007 struct lxc_conf *conf = handler->conf;
1008 struct lxc_tty_info *tty_info = &conf->tty_info;
1009 int sock = handler->data_sock[0];
1010 int ret = -1;
1011
1012 if (!conf->tty)
1013 return 0;
1014
1015 for (i = 0; i < conf->tty; i++) {
1016 int ttyfds[2];
1017 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
1018
1019 ttyfds[0] = pty_info->master;
1020 ttyfds[1] = pty_info->slave;
1021
1022 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1023 if (ret < 0)
1024 break;
1025
1026 TRACE("Send pty \"%s\" with master fd %d and slave fd %d to "
1027 "parent", pty_info->name, pty_info->master, pty_info->slave);
1028 }
1029
1030 if (ret < 0)
1031 ERROR("Failed to send %d ttys to parent: %s", conf->tty,
1032 strerror(errno));
1033 else
1034 TRACE("Sent %d ttys to parent", conf->tty);
1035
1036 return ret;
1037}
1038
1039static int lxc_create_ttys(struct lxc_handler *handler)
1040{
1041 int ret = -1;
1042 struct lxc_conf *conf = handler->conf;
1043
1044 ret = lxc_allocate_ttys(handler->name, conf);
1045 if (ret < 0) {
1046 ERROR("Failed to allocate ttys");
1047 goto on_error;
1048 }
1049
1050 ret = lxc_send_ttys_to_parent(handler);
1051 if (ret < 0) {
1052 ERROR("Failed to send ttys to parent");
1053 goto on_error;
1054 }
1055
1056 if (!conf->is_execute) {
1057 ret = lxc_setup_ttys(conf);
1058 if (ret < 0) {
1059 ERROR("Failed to setup ttys");
1060 goto on_error;
1061 }
1062 }
1063
1064 if (conf->pty_names) {
1065 ret = setenv("container_ttys", conf->pty_names, 1);
1066 if (ret < 0)
1067 SYSERROR("Failed to set \"container_ttys=%s\"", conf->pty_names);
1068 }
1069
1070 ret = 0;
1071
1072on_error:
1073 lxc_delete_tty(&conf->tty_info);
1074
1075 return ret;
1076}
1077
59bb8698 1078static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1079{
2d489f9e 1080 int oldroot = -1, newroot = -1;
bf601689 1081
2d489f9e
SH
1082 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1083 if (oldroot < 0) {
1084 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
1085 return -1;
1086 }
2d489f9e
SH
1087 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1088 if (newroot < 0) {
1089 SYSERROR("Error opening new-/ for fchdir");
1090 goto fail;
c08556c6 1091 }
bf601689 1092
cc6f6dd7 1093 /* change into new root fs */
2d489f9e 1094 if (fchdir(newroot)) {
cc6f6dd7 1095 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 1096 goto fail;
cc6f6dd7
DL
1097 }
1098
cc6f6dd7 1099 /* pivot_root into our new root fs */
2d489f9e 1100 if (pivot_root(".", ".")) {
cc6f6dd7 1101 SYSERROR("pivot_root syscall failed");
2d489f9e 1102 goto fail;
bf601689 1103 }
cc6f6dd7 1104
2d489f9e
SH
1105 /*
1106 * at this point the old-root is mounted on top of our new-root
1107 * To unmounted it we must not be chdir'd into it, so escape back
1108 * to old-root
1109 */
1110 if (fchdir(oldroot) < 0) {
1111 SYSERROR("Error entering oldroot");
1112 goto fail;
1113 }
7981ea46 1114 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1115 SYSERROR("Error detaching old root");
1116 goto fail;
cc6f6dd7
DL
1117 }
1118
2d489f9e
SH
1119 if (fchdir(newroot) < 0) {
1120 SYSERROR("Error re-entering newroot");
1121 goto fail;
1122 }
cc6f6dd7 1123
2d489f9e
SH
1124 close(oldroot);
1125 close(newroot);
bf601689 1126
2d489f9e 1127 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1128
bf601689 1129 return 0;
2d489f9e
SH
1130
1131fail:
1132 if (oldroot != -1)
1133 close(oldroot);
1134 if (newroot != -1)
1135 close(newroot);
1136 return -1;
bf601689
MH
1137}
1138
7133b912
CB
1139/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1140 * error, log it but don't fail yet.
91c3830e 1141 */
7133b912
CB
1142static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1143 const char *lxcpath)
91c3830e
SH
1144{
1145 int ret;
87da4ec3
SH
1146 size_t clen;
1147 char *path;
91c3830e 1148
7133b912 1149 INFO("Preparing \"/dev\"");
bc6928ff 1150
14221cbb 1151 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1152 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1153 path = alloca(clen);
bc6928ff 1154
ec50007f 1155 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1156 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1157 return -1;
bc6928ff 1158
87da4ec3 1159 if (!dir_exists(path)) {
7133b912
CB
1160 WARN("\"/dev\" directory does not exist. Proceeding without "
1161 "autodev being set up");
87da4ec3 1162 return 0;
bc6928ff 1163 }
87da4ec3 1164
1ec0e8e3 1165 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1166 rootfs->path ? rootfs->mount : NULL);
1167 if (ret < 0) {
1168 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1ec0e8e3 1169 return -1;
91c3830e 1170 }
7133b912 1171 INFO("Mounted tmpfs on \"%s\"", path);
87da4ec3 1172
ec50007f 1173 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
7133b912 1174 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1175 return -1;
87da4ec3 1176
7133b912 1177 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1178 * If not, then create it and exit if that fails...
1179 */
87da4ec3 1180 if (!dir_exists(path)) {
bc6928ff 1181 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
7133b912
CB
1182 if (ret < 0) {
1183 SYSERROR("Failed to create directory \"%s\"", path);
bc6928ff
MW
1184 return -1;
1185 }
91c3830e
SH
1186 }
1187
7133b912 1188 INFO("Prepared \"/dev\"");
91c3830e
SH
1189 return 0;
1190}
1191
/* Description of one device node created under the container's /dev. */
struct lxc_devs {
	const char *name;	/* node name relative to /dev */
	mode_t mode;		/* file type and permission bits for mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* Character devices that lxc_fill_autodev() provides in every container. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1207
27245ff7 1208static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1209{
1210 int ret;
c6883f38
SH
1211 char path[MAXPATHLEN];
1212 int i;
3a32201c 1213 mode_t cmask;
c6883f38 1214
3999be0a
CB
1215 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1216 rootfs->path ? rootfs->mount : "");
1217 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1218 return -1;
91c3830e 1219
0bbf8572
CB
1220 /* ignore, just don't try to fill in */
1221 if (!dir_exists(path))
9cb4d183
SH
1222 return 0;
1223
3999be0a
CB
1224 INFO("Populating \"/dev\"");
1225
3a32201c 1226 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1227 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1228 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1229
3999be0a
CB
1230 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1231 rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1232 if (ret < 0 || ret >= MAXPATHLEN)
1233 return -1;
0bbf8572 1234
c6883f38 1235 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1236 if (ret < 0) {
9cb4d183 1237 FILE *pathfile;
3999be0a 1238 char hostpath[MAXPATHLEN];
9cb4d183 1239
0bbf8572
CB
1240 if (errno == EEXIST) {
1241 DEBUG("\"%s\" device already existed", path);
1242 continue;
1243 }
1244
1245 /* Unprivileged containers cannot create devices, so
1246 * bind mount the device from the host.
1247 */
9cb4d183
SH
1248 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1249 if (ret < 0 || ret >= MAXPATHLEN)
1250 return -1;
3999be0a 1251
9cb4d183
SH
1252 pathfile = fopen(path, "wb");
1253 if (!pathfile) {
3999be0a 1254 SYSERROR("Failed to create file \"%s\"", path);
9cb4d183
SH
1255 return -1;
1256 }
1257 fclose(pathfile);
3999be0a
CB
1258
1259 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1260 rootfs->path ? rootfs->mount : NULL);
1261 if (ret < 0) {
1262 SYSERROR("Failed to bind mount \"%s\" from "
1263 "host into container",
1264 d->name);
9cb4d183
SH
1265 return -1;
1266 }
3999be0a
CB
1267 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1268 path);
0bbf8572 1269 } else {
3999be0a 1270 DEBUG("Created device node \"%s\"", path);
c6883f38
SH
1271 }
1272 }
3a32201c 1273 umask(cmask);
c6883f38 1274
3999be0a 1275 INFO("Populated \"/dev\"");
c6883f38
SH
1276 return 0;
1277}
1278
9aa76a17 1279static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1280{
9aa76a17 1281 int ret;
10bc1861 1282 struct lxc_storage *bdev;
91c3e281 1283 const struct lxc_rootfs *rootfs;
cc28d0b0 1284
91c3e281 1285 rootfs = &conf->rootfs;
a0f379bf 1286 if (!rootfs->path) {
91c3e281
CB
1287 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1288 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1289 return -1;
1290 }
c69bd12f 1291 return 0;
a0f379bf 1292 }
0ad19a3f 1293
12297168 1294 if (access(rootfs->mount, F_OK)) {
91c3e281 1295 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1296 rootfs->mount);
b1789442
DL
1297 return -1;
1298 }
1299
10bc1861 1300 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1301 if (!bdev) {
1302 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1303 rootfs->path, rootfs->mount,
1304 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1305 return -1;
9be53773 1306 }
9aa76a17
CB
1307
1308 ret = bdev->ops->mount(bdev);
10bc1861 1309 storage_put(bdev);
9aa76a17 1310 if (ret < 0) {
91c3e281
CB
1311 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1312 rootfs->path, rootfs->mount,
1313 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1314 return -1;
1315 }
0ad19a3f 1316
91c3e281
CB
1317 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1318 rootfs->path, rootfs->mount,
1319 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1320
ac778708
DL
1321 return 0;
1322}
1323
91e93c71
AV
1324int prepare_ramfs_root(char *root)
1325{
eab15c1e 1326 char buf[LXC_LINELEN], *p;
91e93c71
AV
1327 char nroot[PATH_MAX];
1328 FILE *f;
1329 int i;
1330 char *p2;
1331
1332 if (realpath(root, nroot) == NULL)
39c7b795 1333 return -errno;
91e93c71
AV
1334
1335 if (chdir("/") == -1)
39c7b795 1336 return -errno;
91e93c71
AV
1337
1338 /*
1339 * We could use here MS_MOVE, but in userns this mount is
1340 * locked and can't be moved.
1341 */
39c7b795 1342 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1343 SYSERROR("Failed to move %s into /", root);
39c7b795 1344 return -errno;
91e93c71
AV
1345 }
1346
39c7b795 1347 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1348 SYSERROR("Failed to make . rprivate");
39c7b795 1349 return -errno;
91e93c71
AV
1350 }
1351
1352 /*
1353 * The following code cleans up inhereted mounts which are not
1354 * required for CT.
1355 *
1356 * The mountinfo file shows not all mounts, if a few points have been
1357 * unmounted between read operations from the mountinfo. So we need to
1358 * read mountinfo a few times.
1359 *
1360 * This loop can be skipped if a container uses unserns, because all
1361 * inherited mounts are locked and we should live with all this trash.
1362 */
1363 while (1) {
1364 int progress = 0;
1365
1366 f = fopen("./proc/self/mountinfo", "r");
1367 if (!f) {
1368 SYSERROR("Unable to open /proc/self/mountinfo");
1369 return -1;
1370 }
eab15c1e 1371 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1372 for (p = buf, i=0; p && i < 4; i++)
1373 p = strchr(p+1, ' ');
1374 if (!p)
1375 continue;
1376 p2 = strchr(p+1, ' ');
1377 if (!p2)
1378 continue;
1379
1380 *p2 = '\0';
1381 *p = '.';
1382
1383 if (strcmp(p + 1, "/") == 0)
1384 continue;
1385 if (strcmp(p + 1, "/proc") == 0)
1386 continue;
1387
1388 if (umount2(p, MNT_DETACH) == 0)
1389 progress++;
1390 }
1391 fclose(f);
1392 if (!progress)
1393 break;
1394 }
1395
8bea9fae
PR
1396 /* This also can be skipped if a container uses unserns */
1397 umount2("./proc", MNT_DETACH);
91e93c71
AV
1398
1399 /* It is weird, but chdir("..") moves us in a new root */
1400 if (chdir("..") == -1) {
1401 SYSERROR("Unable to change working directory");
1402 return -1;
1403 }
1404
1405 if (chroot(".") == -1) {
1406 SYSERROR("Unable to chroot");
1407 return -1;
1408 }
1409
1410 return 0;
1411}
1412
74a3920a 1413static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1414{
39c7b795
CB
1415 if (!rootfs->path) {
1416 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1417 return 0;
39c7b795 1418 }
ac778708 1419
91e93c71 1420 if (detect_ramfs_rootfs()) {
39c7b795
CB
1421 DEBUG("detected that container is on ramfs");
1422 if (prepare_ramfs_root(rootfs->mount)) {
1423 ERROR("failed to prepare minimal ramfs root");
91e93c71 1424 return -1;
39c7b795
CB
1425 }
1426
1427 DEBUG("prepared ramfs root for container");
1428 return 0;
1429 }
1430
1431 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1432 ERROR("failed to pivot root");
25368b52 1433 return -1;
c69bd12f
DL
1434 }
1435
39c7b795 1436 DEBUG("finished pivot root");
25368b52 1437 return 0;
0ad19a3f 1438}
1439
/*
 * Mount a fresh devpts instance on /dev/pts limited to @num_pts ptys and make
 * /dev/ptmx refer to its ptmx node.
 *
 * /dev/ptmx is preferably a bind mount of /dev/pts/ptmx; if the bind mount
 * fails a symlink is created instead.
 *
 * Returns 0 on success (or when @num_pts is 0), -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *base_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char mntopts[256];
	int rc, fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	rc = snprintf(mntopts, sizeof(mntopts), "%s,max=%d", base_mntopts,
		      num_pts);
	if (rc < 0 || (size_t)rc >= sizeof(mntopts))
		return -1;

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", mntopts);

	/* Drop whatever currently sits at /dev/ptmx. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file serves as the bind mount target. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: fall through and try a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to go before the symlink can be created. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1530
/*
 * Apply the execution domain @persona via personality(). A value of -1 means
 * "leave unchanged". When HAVE_SYS_PERSONALITY_H is not set the function is
 * a no-op.
 *
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1547
3d7d929a
CB
1548static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1549 const struct lxc_console *console)
6e590161 1550{
63376d7d 1551 char path[MAXPATHLEN];
0728ebf4 1552 int ret, fd;
52e35957 1553
8b1b1210
CB
1554 if (console->path && !strcmp(console->path, "none"))
1555 return 0;
1556
7c6ef2a2 1557 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1558 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1559 return -1;
52e35957 1560
8b1b1210
CB
1561 /* When we are asked to setup a console we remove any previous
1562 * /dev/console bind-mounts.
1563 */
a7ba3c7f
CB
1564 if (file_exists(path)) {
1565 ret = lxc_unstack_mountpoint(path, false);
1566 if (ret < 0) {
8b1b1210 1567 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1568 return -ret;
1569 } else {
1570 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1571 }
953fe44f 1572
a7ba3c7f
CB
1573 ret = unlink(path);
1574 if (ret < 0) {
1575 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1576 return -errno;
1577 }
8b1b1210
CB
1578 }
1579
1580 /* For unprivileged containers autodev or automounts will already have
1581 * taken care of creating /dev/console.
1582 */
0728ebf4
TA
1583 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1584 if (fd < 0) {
1585 if (errno != EEXIST) {
1586 SYSERROR("failed to create console");
3d7d929a 1587 return -errno;
0728ebf4
TA
1588 }
1589 } else {
1590 close(fd);
52e35957
DL
1591 }
1592
0728ebf4 1593 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1594 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1595 return -errno;
63376d7d 1596 }
13954cce 1597
3d7d929a 1598 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1599 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1600 return -1;
1601 }
1602
3d7d929a 1603 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1604 return 0;
1605}
1606
3d7d929a
CB
1607static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1608 const struct lxc_console *console,
1609 char *ttydir)
7c6ef2a2 1610{
7c6ef2a2 1611 int ret;
3d7d929a 1612 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1613
1614 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1615 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1616 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1617 return -1;
3d7d929a 1618
7c6ef2a2
SH
1619 ret = mkdir(path, 0755);
1620 if (ret && errno != EEXIST) {
959aee9c 1621 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1622 return -errno;
7c6ef2a2 1623 }
4742cd9a 1624 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1625
3d7d929a
CB
1626 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1627 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1628 return -1;
1629
7c6ef2a2 1630 ret = creat(lxcpath, 0660);
3d7d929a 1631 if (ret == -1 && errno != EEXIST) {
959aee9c 1632 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1633 return -errno;
7c6ef2a2 1634 }
4d44e274
SH
1635 if (ret >= 0)
1636 close(ret);
7c6ef2a2 1637
2a12fefd
CB
1638 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1639 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1640 return -1;
2a12fefd
CB
1641
1642 /* When we are asked to setup a console we remove any previous
1643 * /dev/console bind-mounts.
1644 */
1645 if (console->path && !strcmp(console->path, "none")) {
1646 struct stat st;
1647 ret = stat(path, &st);
1648 if (ret < 0) {
1649 if (errno == ENOENT)
1650 return 0;
1651 SYSERROR("failed stat() \"%s\"", path);
1652 return -errno;
1653 }
1654
1655 /* /dev/console must be character device with major number 5 and
1656 * minor number 1. If not, give benefit of the doubt and assume
1657 * the user has mounted something else right there on purpose.
1658 */
1659 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1660 return 0;
1661
1662 /* In case the user requested a bind-mount for /dev/console and
1663 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1664 * /dev/<ttydir/console.
1665 * Note, we only move the uppermost mount and clear all other
1666 * mounts underneath for safety.
1667 * If it is a character device created via mknod() we simply
1668 * rename it.
2a12fefd
CB
1669 */
1670 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1671 if (ret < 0) {
1672 if (errno != EINVAL) {
1673 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1674 return -errno;
1675 }
1676 /* path was not a mountpoint */
1677 ret = rename(path, lxcpath);
1678 if (ret < 0) {
1679 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1680 return -errno;
1681 }
1682 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1683 } else {
1684 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1685 }
a7ba3c7f
CB
1686
1687 /* Clear all remaining bind-mounts. */
1688 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1689 if (ret < 0) {
a7ba3c7f
CB
1690 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1691 return -ret;
1692 } else {
1693 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1694 }
1695 } else {
1696 if (file_exists(path)) {
1697 ret = lxc_unstack_mountpoint(path, false);
1698 if (ret < 0) {
2a12fefd 1699 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1700 return -ret;
1701 } else {
1702 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1703 }
2a12fefd
CB
1704 }
1705
1706 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1707 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1708 return -1;
1709 }
1710 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1711 }
1712
2a12fefd 1713 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1714 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1715 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1716 return -1;
3d7d929a 1717
2a12fefd
CB
1718 ret = unlink(path);
1719 if (ret && errno != ENOENT) {
1720 SYSERROR("error unlinking %s", path);
1721 return -errno;
1722 }
1723
7c6ef2a2 1724 ret = symlink(lxcpath, path);
3d7d929a
CB
1725 if (ret < 0) {
1726 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1727 return -1;
1728 }
1729
3d7d929a 1730 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1731 return 0;
1732}
1733
3d7d929a
CB
1734static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1735 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1736{
3d7d929a
CB
1737 /* We don't have a rootfs, /dev/console will be shared. */
1738 if (!rootfs->path) {
1739 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1740 return 0;
3d7d929a
CB
1741 }
1742
7c6ef2a2 1743 if (!ttydir)
3d7d929a 1744 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1745
3d7d929a 1746 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1747}
1748
998ac676
RT
1749static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1750{
1751 struct mount_opt *mo;
1752
1753 /* If opt is found in mount_opt, set or clear flags.
1754 * Otherwise append it to data. */
1755
1756 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1757 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1758 if (mo->clear)
1759 *flags &= ~mo->flag;
1760 else
1761 *flags |= mo->flag;
1762 return;
1763 }
1764 }
1765
1766 if (strlen(*data))
1767 strcat(*data, ",");
1768 strcat(*data, opt);
1769}
1770
/*
 * Split the comma-separated option string @mntopts into mount flags and a
 * filesystem data string.
 *
 * On success *@mntflags holds the accumulated flags and *@mntdata is either
 * NULL (no data options) or a malloc()ed string the caller must free.
 *
 * Returns 0 on success, -1 if memory allocation fails.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *tok;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so tokenize a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the input itself. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1809
/* Terminate @word at its first space or tab (or at its existing end). */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1816
/*
 * Skip @nfields whitespace-terminated fields in @src and return a pointer to
 * what follows. Exactly one separator (space or tab) is consumed per field;
 * if the string ends first, a pointer to its terminating NUL is returned.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1834
/*
 * Mount @fsname on @target and, for bind or remount requests, follow up with
 * a remount so that the requested flags take effect.
 *
 * @optional: when non-zero, a failing mount is logged and treated as success
 * @dev:      when non-zero, MS_NODEV is not carried over from the source
 * @rootfs:   passed through to safe_mount()
 *
 * Returns 0 on success (or on an ignored optional failure), -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	unsigned long rqd_flags = 0;
	int rc;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	/* The initial mount is always made without MS_REMOUNT. */
	rc = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			rootfs);
	if (rc < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
		return -1;
	}

	/* Only bind mounts and explicit remounts need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto out;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
	      "options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	if (mountflags & MS_RDONLY)
		rqd_flags |= MS_RDONLY;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		/* Carry over the restrictive flags of the source mount. */
		required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
		if ((sb.f_flag & MS_NODEV) && !dev)
			required_flags |= MS_NODEV;

		DEBUG("Flags for \"%s\" were %lu, required extra flags "
		      "are %lu", fsname, sb.f_flag, required_flags);

		/* A plain bind mount whose flags already contain everything
		 * that is required does not need to be remounted.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("Mountflags already were %lu, "
			      "skipping remount", mountflags);
			goto out;
		}

		mountflags |= required_flags;
	}
#endif

	rc = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
	if (rc < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" "
			     "(optional): %s", fsname, target,
			     strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
		return -1;
	}

out:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1925
/* Strip the lxc-specific options "create=dir", "create=file" and "optional"
 * from mntent->mnt_opts in place (first occurrence of each).
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir", "create=file", "optional", NULL
	};
	const char *const *name;

	for (name = culled; *name; name++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, *name);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma) {
			/* pull the remaining options over the culled one */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		} else {
			/* it was the last option: cut the string here */
			*hit = '\0';
		}
	}
}
1949
/*
 * Create the mount target @path for @mntent when asked to.
 *
 * For "overlay*" and "aufs*" filesystem types the matching *_mkdir() helper
 * is called first. "create=dir" creates @path as a directory; "create=file"
 * creates the parent directories and an empty file if @path is missing.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	char *path_copy, *parent;
	int rc = 0;
	int fd;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0)
		rc = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (strncmp(mntent->mnt_type, "aufs", 4) == 0)
		rc = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (rc < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		rc = mkdir_p(path, 0755);
		if (rc < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	/* Nothing more to do unless a missing file was requested. */
	if (!hasmntopt(mntent, "create=file") || access(path, F_OK) == 0)
		return 0;

	/* dirname() may modify its argument, so work on a copy. */
	path_copy = strdup(path);
	if (!path_copy)
		return -1;

	parent = dirname(path_copy);

	rc = mkdir_p(parent, 0755);
	free(path_copy);
	if (rc < 0 && errno != EEXIST) {
		SYSERROR("Failed to create directory \"%s\"", path);
		return -1;
	}

	fd = open(path, O_CREAT, 0644);
	if (fd < 0)
		return -1;
	close(fd);

	return 0;
}
1998
ec50007f
CB
1999/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2000 * without a rootfs. */
db4aba38 2001static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2002 const char *path,
2003 const struct lxc_rootfs *rootfs,
2004 const char *lxc_name,
2005 const char *lxc_path)
4d5b72a1 2006{
d8b712bc 2007 int ret;
4d5b72a1
NC
2008 unsigned long mntflags;
2009 char *mntdata;
d8b712bc 2010 bool dev, optional;
ec50007f 2011 char *rootfs_path = NULL;
d8b712bc
CB
2012
2013 optional = hasmntopt(mntent, "optional") != NULL;
2014 dev = hasmntopt(mntent, "dev") != NULL;
2015
ec50007f
CB
2016 if (rootfs && rootfs->path)
2017 rootfs_path = rootfs->mount;
2018
d8b712bc
CB
2019 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2020 lxc_path);
2021 if (ret < 0) {
2022 if (optional)
2023 return 0;
608e3567 2024
d8b712bc
CB
2025 return -1;
2026 }
4e4ca161
SH
2027 cull_mntent_opt(mntent);
2028
d8b712bc
CB
2029 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2030 if (ret < 0)
a17b1e65 2031 return -1;
a17b1e65 2032
6e46cc0d 2033 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 2034 mntdata, optional, dev, rootfs_path);
68c152ef 2035
911324ef 2036 free(mntdata);
911324ef
DL
2037 return ret;
2038}
2039
/*
 * Mount @mntent for a container that has no rootfs of its own.
 *
 * In that case every mount target is treated as an absolute path on the
 * host, so a leading '/' is added when mnt_dir lacks one.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char target[MAXPATHLEN];
	int len;

	if (mntent->mnt_dir[0] == '/')
		len = snprintf(target, sizeof(target), "%s", mntent->mnt_dir);
	else
		len = snprintf(target, sizeof(target), "/%s", mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target))
		return -1;

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
2057
4e4ca161 2058static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2059 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2060 const char *lxc_name,
2061 const char *lxc_path)
911324ef 2062{
bdd2b34c 2063 int offset;
013bd428 2064 char *aux;
67e571de 2065 const char *lxcpath;
bdd2b34c
CB
2066 char path[MAXPATHLEN];
2067 int ret = 0;
0ad19a3f 2068
593e8478 2069 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2070 if (!lxcpath)
2a59a681 2071 return -1;
2a59a681 2072
bdd2b34c
CB
2073 /* If rootfs->path is a blockdev path, allow container fstab to use
2074 * <lxcpath>/<name>/rootfs" as the target prefix.
2075 */
2076 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2077 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
2078 goto skipvarlib;
2079
2080 aux = strstr(mntent->mnt_dir, path);
2081 if (aux) {
2082 offset = strlen(path);
2083 goto skipabs;
2084 }
2085
2086skipvarlib:
013bd428
DL
2087 aux = strstr(mntent->mnt_dir, rootfs->path);
2088 if (!aux) {
bdd2b34c 2089 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2090 return ret;
013bd428 2091 }
80a881b2
SH
2092 offset = strlen(rootfs->path);
2093
2094skipabs:
bdd2b34c
CB
2095 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2096 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 2097 return -1;
a17b1e65 2098
0a2dddd4 2099 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2100}
d330fe7b 2101
4e4ca161 2102static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2103 const struct lxc_rootfs *rootfs,
2104 const char *lxc_name,
2105 const char *lxc_path)
911324ef
DL
2106{
2107 char path[MAXPATHLEN];
911324ef 2108 int ret;
d330fe7b 2109
34cfffb3 2110 /* relative to root mount point */
6e46cc0d 2111 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2112 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2113 ERROR("path name too long");
2114 return -1;
2115 }
911324ef 2116
0a2dddd4 2117 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2118}
2119
06749971
CB
2120/* This logs a NOTICE() when a user specifies mounts that would conflict with
2121 * devices liblxc sets up automatically.
2122 */
2123static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2124 const char *dest)
2125{
2126 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2127 bool needs_warning = false;
2128
2129 clean_mnt_fsname = lxc_deslashify(src);
2130 if (!clean_mnt_fsname)
2131 return;
2132
2133 clean_mnt_dir = lxc_deslashify(dest);
2134 if (!clean_mnt_dir) {
2135 free(clean_mnt_fsname);
2136 return;
2137 }
2138
2139 tmp = clean_mnt_dir;
2140 if (*tmp == '/')
2141 tmp++;
2142
2143 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2144 free(clean_mnt_dir);
2145 free(clean_mnt_fsname);
2146 return;
2147 }
2148
2149 if (!conf->autodev && !conf->pts && !conf->tty &&
2150 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2151 free(clean_mnt_dir);
2152 free(clean_mnt_fsname);
2153 return;
2154 }
2155
2156 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2157 needs_warning = true;
2158 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2159 needs_warning = true;
2160 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2161 needs_warning = true;
2162 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2163 needs_warning = true;
2164 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2165 needs_warning = true;
2166 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2167 needs_warning = true;
2168 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2169 needs_warning = true;
2170 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2171 needs_warning = true;
2172 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2173 needs_warning = true;
2174 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2175 needs_warning = true;
2176 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2177 needs_warning = true;
2178
2179 if (needs_warning)
2180 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2181 "automatic device setup under \"/dev\"",
2182 clean_mnt_fsname, clean_mnt_dir);
2183
2184 free(clean_mnt_dir);
2185 free(clean_mnt_fsname);
2186}
2187
2188static int mount_file_entries(const struct lxc_conf *conf,
2189 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2190 const char *lxc_name, const char *lxc_path)
911324ef 2191{
aaf901be
AM
2192 struct mntent mntent;
2193 char buf[4096];
911324ef 2194 int ret = -1;
e76b8764 2195
aaf901be 2196 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
06749971
CB
2197 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2198
1ae3c19f
CB
2199 if (!rootfs->path)
2200 ret = mount_entry_on_systemfs(&mntent);
2201 else if (mntent.mnt_dir[0] != '/')
2202 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2203 lxc_name, lxc_path);
2204 else
2205 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2206 lxc_name, lxc_path);
2207 if (ret < 0)
2208 return -1;
0ad19a3f 2209 }
2210 ret = 0;
cd54d859 2211
1ae3c19f 2212 INFO("Set up mount entries");
e7938e9e
MN
2213 return ret;
2214}
2215
06749971
CB
/* Mount all entries listed in the fstab-format file "fstab".
 *
 * A NULL fstab means there is nothing to do. Returns 0 on success (or when no
 * fstab was given), -1 if the file cannot be opened, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *table;
	int err;

	if (fstab == NULL)
		return 0;

	table = setmntent(fstab, "r");
	if (table == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	err = mount_file_entries(conf, rootfs, table, lxc_name, lxc_path);
	if (err < 0)
		ERROR("Failed to set up mount entries");

	/* The stream is closed on both the success and the error path. */
	endmntent(table);
	return err;
}
2239
5ef5c9a3 2240FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2241{
5ef5c9a3 2242 int ret;
e7938e9e 2243 char *mount_entry;
5ef5c9a3 2244 struct lxc_list *iterator;
6bd04140 2245 FILE *f;
5ef5c9a3
CB
2246 int fd = -1;
2247
2248 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2249 if (fd < 0) {
2250 if (errno != ENOSYS)
2251 return NULL;
6bd04140
CB
2252 f = tmpfile();
2253 TRACE("Created temporary mount file");
5ef5c9a3 2254 } else {
6bd04140
CB
2255 f = fdopen(fd, "r+");
2256 TRACE("Created anonymous mount file");
5ef5c9a3 2257 }
e7938e9e 2258
6bd04140
CB
2259 if (!f) {
2260 SYSERROR("Could not create mount file");
5ef5c9a3
CB
2261 if (fd != -1)
2262 close(fd);
9fc7f8c0 2263 return NULL;
e7938e9e
MN
2264 }
2265
2266 lxc_list_for_each(iterator, mount) {
2267 mount_entry = iterator->elem;
6bd04140 2268 ret = fprintf(f, "%s\n", mount_entry);
5ef5c9a3 2269 if (ret < strlen(mount_entry))
6bd04140 2270 WARN("Could not write mount entry to mount file");
5ef5c9a3
CB
2271 }
2272
6bd04140
CB
2273 ret = fseek(f, 0, SEEK_SET);
2274 if (ret < 0) {
2275 SYSERROR("Failed to seek mount file");
2276 fclose(f);
5ef5c9a3 2277 return NULL;
e7938e9e
MN
2278 }
2279
6bd04140 2280 return f;
9fc7f8c0
TA
2281}
2282
06749971
CB
/* Mount all entries of the in-memory "mount" list.
 *
 * The list is first written to an anonymous mount file which is then processed
 * by mount_file_entries(). Returns -1 if that file cannot be created,
 * otherwise the result of mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int err;
	FILE *table = make_anonymous_mount_file(mount);

	if (table == NULL)
		return -1;

	err = mount_file_entries(conf, rootfs, table, lxc_name, lxc_path);
	fclose(table);

	return err;
}
2300
bab88e68
CS
2301static int parse_cap(const char *cap)
2302{
2303 char *ptr = NULL;
84760c11 2304 size_t i;
2305 int capid = -1;
bab88e68 2306
7035407c
DE
2307 if (!strcmp(cap, "none"))
2308 return -2;
2309
bab88e68
CS
2310 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2311
2312 if (strcmp(cap, caps_opt[i].name))
2313 continue;
2314
2315 capid = caps_opt[i].value;
2316 break;
2317 }
2318
2319 if (capid < 0) {
2320 /* try to see if it's numeric, so the user may specify
2321 * capabilities that the running kernel knows about but
2322 * we don't */
2323 errno = 0;
2324 capid = strtol(cap, &ptr, 10);
2325 if (!ptr || *ptr != '\0' || errno != 0)
2326 /* not a valid number */
2327 capid = -1;
2328 else if (capid > lxc_caps_last_cap())
2329 /* we have a number but it's not a valid
2330 * capability */
2331 capid = -1;
2332 }
2333
2334 return capid;
2335}
2336
0769b82a
CS
2337int in_caplist(int cap, struct lxc_list *caps)
2338{
2339 struct lxc_list *iterator;
2340 int capid;
2341
2342 lxc_list_for_each(iterator, caps) {
2343 capid = parse_cap(iterator->elem);
2344 if (capid == cap)
2345 return 1;
2346 }
2347
2348 return 0;
2349}
2350
81810dd1
DL
2351static int setup_caps(struct lxc_list *caps)
2352{
2353 struct lxc_list *iterator;
2354 char *drop_entry;
bab88e68 2355 int capid;
81810dd1
DL
2356
2357 lxc_list_for_each(iterator, caps) {
2358
2359 drop_entry = iterator->elem;
2360
bab88e68 2361 capid = parse_cap(drop_entry);
d55bc1ad 2362
81810dd1 2363 if (capid < 0) {
1e11be34
DL
2364 ERROR("unknown capability %s", drop_entry);
2365 return -1;
81810dd1
DL
2366 }
2367
2368 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2369
2370 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2371 SYSERROR("failed to remove %s capability", drop_entry);
2372 return -1;
2373 }
81810dd1
DL
2374
2375 }
2376
1fb86a7c
SH
2377 DEBUG("capabilities have been setup");
2378
2379 return 0;
2380}
2381
2382static int dropcaps_except(struct lxc_list *caps)
2383{
2384 struct lxc_list *iterator;
2385 char *keep_entry;
1fb86a7c
SH
2386 int i, capid;
2387 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2388 INFO("found %d capabilities", numcaps);
1fb86a7c 2389
2caf9a97
SH
2390 if (numcaps <= 0 || numcaps > 200)
2391 return -1;
2392
1a0e70ac 2393 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2394 int *caplist = alloca(numcaps * sizeof(int));
2395 memset(caplist, 0, numcaps * sizeof(int));
2396
2397 lxc_list_for_each(iterator, caps) {
2398
2399 keep_entry = iterator->elem;
2400
bab88e68 2401 capid = parse_cap(keep_entry);
1fb86a7c 2402
7035407c
DE
2403 if (capid == -2)
2404 continue;
2405
1fb86a7c
SH
2406 if (capid < 0) {
2407 ERROR("unknown capability %s", keep_entry);
2408 return -1;
2409 }
2410
8255688a 2411 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2412
2413 caplist[capid] = 1;
2414 }
2415 for (i=0; i<numcaps; i++) {
2416 if (caplist[i])
2417 continue;
2418 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2419 SYSERROR("failed to remove capability %d", i);
2420 return -1;
2421 }
1fb86a7c
SH
2422 }
2423
2424 DEBUG("capabilities have been setup");
81810dd1
DL
2425
2426 return 0;
2427}
2428
c6d09e15
WB
2429static int parse_resource(const char *res) {
2430 size_t i;
2431 int resid = -1;
2432
2433 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2434 if (strcmp(res, limit_opt[i].name) == 0)
2435 return limit_opt[i].value;
2436 }
2437
2438 /* try to see if it's numeric, so the user may specify
2439 * resources that the running kernel knows about but
2440 * we don't */
2441 if (lxc_safe_int(res, &resid) == 0)
2442 return resid;
2443 return -1;
2444}
2445
2446int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2447 struct lxc_list *it;
2448 struct lxc_limit *lim;
2449 int resid;
2450
2451 lxc_list_for_each(it, limits) {
2452 lim = it->elem;
2453
2454 resid = parse_resource(lim->resource);
2455 if (resid < 0) {
2456 ERROR("unknown resource %s", lim->resource);
2457 return -1;
2458 }
2459
2460 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2461 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2462 return -1;
2463 }
2464 }
2465 return 0;
2466}
2467
ae9242c8
SH
2468static char *default_rootfs_mount = LXCROOTFSMOUNT;
2469
7b379ab3 2470struct lxc_conf *lxc_conf_init(void)
089cd8b8 2471{
7b379ab3 2472 struct lxc_conf *new;
26ddeedd 2473 int i;
7b379ab3 2474
13277ec4 2475 new = malloc(sizeof(*new));
7b379ab3 2476 if (!new) {
13277ec4 2477 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2478 return NULL;
2479 }
2480 memset(new, 0, sizeof(*new));
2481
4b73005c 2482 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2483 new->personality = -1;
124fa0a8 2484 new->autodev = 1;
596a818d
DE
2485 new->console.log_path = NULL;
2486 new->console.log_fd = -1;
28a4b0e5 2487 new->console.path = NULL;
63376d7d 2488 new->console.peer = -1;
b5159817
DE
2489 new->console.peerpty.busy = -1;
2490 new->console.peerpty.master = -1;
2491 new->console.peerpty.slave = -1;
63376d7d
DL
2492 new->console.master = -1;
2493 new->console.slave = -1;
2494 new->console.name[0] = '\0';
d2e30e99 2495 new->maincmd_fd = -1;
76a26f55 2496 new->nbd_idx = -1;
54c30e29 2497 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2498 if (!new->rootfs.mount) {
13277ec4 2499 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2500 free(new);
2501 return NULL;
2502 }
858377e4 2503 new->logfd = -1;
7b379ab3
MN
2504 lxc_list_init(&new->cgroup);
2505 lxc_list_init(&new->network);
2506 lxc_list_init(&new->mount_list);
81810dd1 2507 lxc_list_init(&new->caps);
1fb86a7c 2508 lxc_list_init(&new->keepcaps);
f6d3e3e4 2509 lxc_list_init(&new->id_map);
f979ac15 2510 lxc_list_init(&new->includes);
4184c3e1 2511 lxc_list_init(&new->aliens);
7c661726 2512 lxc_list_init(&new->environment);
c6d09e15 2513 lxc_list_init(&new->limits);
26ddeedd
SH
2514 for (i=0; i<NUM_LXC_HOOKS; i++)
2515 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2516 lxc_list_init(&new->groups);
fe4de9a6
DE
2517 new->lsm_aa_profile = NULL;
2518 new->lsm_se_context = NULL;
5112cd70 2519 new->tmp_umount_proc = 0;
7b379ab3 2520
9f30a190
MM
2521 for (i = 0; i < LXC_NS_MAX; i++)
2522 new->inherit_ns_fd[i] = -1;
2523
72bb04e4
PT
2524 /* if running in a new user namespace, init and COMMAND
2525 * default to running as UID/GID 0 when using lxc-execute */
2526 new->init_uid = 0;
2527 new->init_gid = 0;
43654d34 2528 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
72bb04e4 2529
7b379ab3 2530 return new;
089cd8b8
DL
2531}
2532
251d0d2a
DE
2533static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2534 size_t buf_size)
f6d3e3e4 2535{
29053180
CB
2536 char path[MAXPATHLEN];
2537 int fd, ret;
f6d3e3e4 2538
29053180
CB
2539 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2540 idtype == ID_TYPE_UID ? 'u' : 'g');
2541 if (ret < 0 || ret >= MAXPATHLEN) {
2542 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
2543 return -E2BIG;
2544 }
29053180
CB
2545
2546 fd = open(path, O_WRONLY);
2547 if (fd < 0) {
2548 SYSERROR("failed to open \"%s\"", path);
2549 return -1;
f6d3e3e4 2550 }
29053180
CB
2551
2552 errno = 0;
2553 ret = lxc_write_nointr(fd, buf, buf_size);
2554 if (ret != buf_size) {
2555 SYSERROR("failed to write %cid mapping to \"%s\"",
2556 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2557 close(fd);
2558 return -1;
2559 }
2560 close(fd);
2561
2562 return 0;
f6d3e3e4
SH
2563}
2564
6e50e704
CB
2565/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2566 *
2567 * @return 1 if functional binary was found
2568 * @return 0 if binary exists but is lacking privilege
2569 * @return -ENOENT if binary does not exist
2570 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2571 *
2572 */
df6a2945
CB
2573static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2574{
2575 char *path;
2576 int ret;
2577 struct stat st;
2578 int fret = 0;
2579
6e50e704
CB
2580 if (cap != CAP_SETUID && cap != CAP_SETGID)
2581 return -EINVAL;
2582
df6a2945
CB
2583 path = on_path(binary, NULL);
2584 if (!path)
2585 return -ENOENT;
2586
2587 ret = stat(path, &st);
2588 if (ret < 0) {
2589 fret = -errno;
2590 goto cleanup;
2591 }
2592
2593 /* Check if the binary is setuid. */
2594 if (st.st_mode & S_ISUID) {
2595 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2596 fret = 1;
2597 goto cleanup;
2598 }
2599
69924fff 2600 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2601 /* Check if it has the CAP_SETUID capability. */
2602 if ((cap & CAP_SETUID) &&
2603 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2604 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2605 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2606 "and CAP_PERMITTED sets.", path);
2607 fret = 1;
2608 goto cleanup;
2609 }
2610
2611 /* Check if it has the CAP_SETGID capability. */
2612 if ((cap & CAP_SETGID) &&
2613 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2614 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2615 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2616 "and CAP_PERMITTED sets.", path);
2617 fret = 1;
2618 goto cleanup;
2619 }
d6018f88 2620 #else
69924fff
CB
2621 /* If we cannot check for file capabilities we need to give the benefit
2622 * of the doubt. Otherwise we might fail even though all the necessary
2623 * file capabilities are set.
2624 */
d6018f88
CB
2625 DEBUG("Cannot check for file capabilites as full capability support is "
2626 "missing. Manual intervention needed.");
2627 fret = 1;
df6a2945
CB
2628 #endif
2629
2630cleanup:
2631 free(path);
2632 return fret;
2633}
2634
986ef930
CB
/* Run the command line passed in "args" (a C string) through "/bin/sh -c".
 *
 * Only returns, with -1, if the exec itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
2640
f6d3e3e4
SH
2641int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2642{
f6d3e3e4 2643 struct id_map *map;
4bc3b759 2644 struct lxc_list *iterator;
251d0d2a 2645 enum idtype type;
986ef930 2646 char u_or_g;
4bc3b759 2647 char *pos;
99d43365 2648 int fill, left;
986ef930
CB
2649 char cmd_output[MAXPATHLEN];
2650 /* strlen("new@idmap") = 9
2651 * +
2652 * strlen(" ") = 1
2653 * +
2654 * LXC_NUMSTRLEN64
2655 * +
2656 * strlen(" ") = 1
2657 *
2658 * We add some additional space to make sure that we really have
2659 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2660 */
2661 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2662 int ret = 0, uidmap = 0, gidmap = 0;
2663 bool use_shadow = false, had_entry = false;
df6a2945
CB
2664
2665 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2666 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2667 * will protected it by preventing another user from being handed the
2668 * range by shadow.
2669 */
df6a2945 2670 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2671 if (uidmap == -ENOENT)
2672 WARN("newuidmap binary is missing");
2673 else if (!uidmap)
2674 WARN("newuidmap is lacking necessary privileges");
2675
df6a2945 2676 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2677 if (gidmap == -ENOENT)
2678 WARN("newgidmap binary is missing");
2679 else if (!gidmap)
2680 WARN("newgidmap is lacking necessary privileges");
2681
df6a2945
CB
2682 if (uidmap > 0 && gidmap > 0) {
2683 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 2684 use_shadow = true;
df6a2945 2685 } else {
99d43365
CB
2686 /* In case unprivileged users run application containers via
2687 * execute() or a start*() there are valid cases where they may
2688 * only want to map their own {g,u}id. Let's not block them from
2689 * doing so by requiring geteuid() == 0.
2690 */
2691 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2692 "write directly with euid %d.", geteuid());
0e6e3a41 2693 }
251d0d2a 2694
986ef930
CB
2695 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2696 type++, u_or_g = 'g') {
2697 pos = mapbuf;
2698
0e6e3a41 2699 if (use_shadow)
986ef930 2700 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2701
cf3ef16d 2702 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
2703 /* The kernel only takes <= 4k for writes to
2704 * /proc/<nr>/[ug]id_map
2705 */
251d0d2a 2706 map = iterator->elem;
cf3ef16d
SH
2707 if (map->idtype != type)
2708 continue;
2709
4bc3b759
CB
2710 had_entry = true;
2711
986ef930 2712 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2713 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2714 use_shadow ? " " : "", map->nsid,
2715 map->hostid, map->range,
0e6e3a41 2716 use_shadow ? "" : "\n");
cf3ef16d 2717 if (fill <= 0 || fill >= left)
4bc3b759
CB
2718 SYSERROR("Too many {g,u}id mappings defined.");
2719
cf3ef16d 2720 pos += fill;
251d0d2a 2721 }
cf3ef16d 2722 if (!had_entry)
4f7521b4 2723 continue;
cf3ef16d 2724
986ef930
CB
2725 /* Try to catch the ouput of new{g,u}idmap to make debugging
2726 * easier.
2727 */
2728 if (use_shadow) {
2729 ret = run_command(cmd_output, sizeof(cmd_output),
2730 lxc_map_ids_exec_wrapper,
2731 (void *)mapbuf);
2732 if (ret < 0) {
54fbbeb5
CB
2733 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2734 u_or_g, cmd_output, mapbuf);
986ef930
CB
2735 return -1;
2736 }
54fbbeb5 2737 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2738 } else {
986ef930 2739 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5
CB
2740 if (ret < 0) {
2741 ERROR("Failed to write mapping \"%s\": %s",
2742 cmd_output, mapbuf);
986ef930 2743 return -1;
54fbbeb5
CB
2744 }
2745 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2746 }
986ef930
CB
2747
2748 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2749 }
251d0d2a 2750
986ef930 2751 return 0;
f6d3e3e4
SH
2752}
2753
cf3ef16d 2754/*
7b50c609
TS
2755 * return the host uid/gid to which the container root is mapped in
2756 * *val.
0b3a6504 2757 * Return true if id was found, false otherwise.
cf3ef16d 2758 */
2a9a80cb 2759bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 2760 unsigned long *val)
cf3ef16d
SH
2761{
2762 struct lxc_list *it;
2763 struct id_map *map;
2764
2765 lxc_list_for_each(it, &conf->id_map) {
2766 map = it->elem;
7b50c609 2767 if (map->idtype != idtype)
cf3ef16d
SH
2768 continue;
2769 if (map->nsid != 0)
2770 continue;
2a9a80cb
SH
2771 *val = map->hostid;
2772 return true;
cf3ef16d 2773 }
2a9a80cb 2774 return false;
cf3ef16d
SH
2775}
2776
2133f58c 2777int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2778{
2779 struct lxc_list *it;
2780 struct id_map *map;
2781 lxc_list_for_each(it, &conf->id_map) {
2782 map = it->elem;
2133f58c 2783 if (map->idtype != idtype)
cf3ef16d
SH
2784 continue;
2785 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2786 return (id - map->hostid) + map->nsid;
cf3ef16d 2787 }
57d116ab 2788 return -1;
cf3ef16d
SH
2789}
2790
339efad9 2791int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2792{
2793 struct lxc_list *it;
2794 struct id_map *map;
2133f58c 2795 unsigned int freeid = 0;
cf3ef16d
SH
2796again:
2797 lxc_list_for_each(it, &conf->id_map) {
2798 map = it->elem;
2133f58c 2799 if (map->idtype != idtype)
cf3ef16d
SH
2800 continue;
2801 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2802 freeid = map->nsid + map->range;
2803 goto again;
2804 }
2805 }
2806 return freeid;
2807}
2808
f4f52cb5
CB
/* Exec "lxc-usernsexec" (searched on the path) with "args" as its argument
 * vector.
 *
 * Only returns, with -1, if the exec itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
2814
f6d3e3e4 2815/*
7b50c609
TS
2816 * chown_mapped_root: for an unprivileged user with uid/gid X to
2817 * chown a dir to subuid/subgid Y, he needs to run chown as root
2818 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2819 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2820 * root is privileged with respect to hostuid/hostgid X, allowing
2821 * him to do the chown.
f6d3e3e4 2822 */
c4d10a05 2823int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 2824{
f4f52cb5 2825 uid_t rootuid, rootgid;
2a9a80cb 2826 unsigned long val;
f4f52cb5
CB
2827 int hostuid, hostgid, ret;
2828 struct stat sb;
2829 char map1[100], map2[100], map3[100], map4[100], map5[100];
2830 char ugid[100];
2831 char *args1[] = {"lxc-usernsexec",
2832 "-m", map1,
2833 "-m", map2,
2834 "-m", map3,
2835 "-m", map5,
2836 "--", "chown", ugid, path,
2837 NULL};
2838 char *args2[] = {"lxc-usernsexec",
2839 "-m", map1,
2840 "-m", map2,
2841 "-m", map3,
2842 "-m", map4,
2843 "-m", map5,
2844 "--", "chown", ugid, path,
2845 NULL};
2846 char cmd_output[MAXPATHLEN];
2847
2848 hostuid = geteuid();
2849 hostgid = getegid();
f6d3e3e4 2850
2a9a80cb 2851 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 2852 ERROR("No uid mapping for container root");
c4d10a05 2853 return -1;
f6d3e3e4 2854 }
f4f52cb5 2855 rootuid = (uid_t)val;
7b50c609 2856 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 2857 ERROR("No gid mapping for container root");
7b50c609
TS
2858 return -1;
2859 }
f4f52cb5 2860 rootgid = (gid_t)val;
2a9a80cb 2861
f4f52cb5 2862 if (hostuid == 0) {
7b50c609 2863 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
2864 ERROR("Error chowning %s", path);
2865 return -1;
2866 }
2867 return 0;
2868 }
f3d7e4ca 2869
f4f52cb5 2870 if (rootuid == hostuid) {
1a0e70ac 2871 /* nothing to do */
b103ceac 2872 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
2873 return 0;
2874 }
2875
bbdbf8f0 2876 /* save the current gid of "path" */
f4f52cb5
CB
2877 if (stat(path, &sb) < 0) {
2878 ERROR("Error stat %s", path);
f6d3e3e4
SH
2879 return -1;
2880 }
7b50c609 2881
bbdbf8f0
CB
2882 /* Update the path argument in case this was overlayfs. */
2883 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2884 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2885
f4f52cb5
CB
2886 /*
2887 * A file has to be group-owned by a gid mapped into the
2888 * container, or the container won't be privileged over it.
2889 */
2890 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2891 if (sb.st_uid == hostuid &&
2892 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2893 chown(path, -1, hostgid) < 0) {
2894 ERROR("Failed chgrping %s", path);
2895 return -1;
2896 }
f6d3e3e4 2897
1a0e70ac 2898 /* "u:0:rootuid:1" */
f4f52cb5
CB
2899 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2900 if (ret < 0 || ret >= 100) {
2901 ERROR("Error uid printing map string");
2902 return -1;
2903 }
7b50c609 2904
1a0e70ac 2905 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
2906 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2907 if (ret < 0 || ret >= 100) {
2908 ERROR("Error uid printing map string");
2909 return -1;
2910 }
c4d10a05 2911
1a0e70ac 2912 /* "g:0:rootgid:1" */
f4f52cb5
CB
2913 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2914 if (ret < 0 || ret >= 100) {
2915 ERROR("Error gid printing map string");
2916 return -1;
2917 }
98e5ba51 2918
1a0e70ac 2919 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
2920 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2921 rootgid + (gid_t)sb.st_gid);
2922 if (ret < 0 || ret >= 100) {
2923 ERROR("Error gid printing map string");
2924 return -1;
2925 }
c4d10a05 2926
1a0e70ac 2927 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
2928 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2929 if (ret < 0 || ret >= 100) {
2930 ERROR("Error gid printing map string");
2931 return -1;
2932 }
7b50c609 2933
1a0e70ac 2934 /* "0:pathgid" (chown) */
f4f52cb5
CB
2935 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2936 if (ret < 0 || ret >= 100) {
2937 ERROR("Error owner printing format string for chown");
2938 return -1;
2939 }
7b50c609 2940
f4f52cb5
CB
2941 if (hostgid == sb.st_gid)
2942 ret = run_command(cmd_output, sizeof(cmd_output),
2943 chown_mapped_root_exec_wrapper,
2944 (void *)args1);
2945 else
2946 ret = run_command(cmd_output, sizeof(cmd_output),
2947 chown_mapped_root_exec_wrapper,
2948 (void *)args2);
2949 if (ret < 0)
2950 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 2951
f4f52cb5 2952 return ret;
f6d3e3e4
SH
2953}
2954
54117de5 2955int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 2956{
c4d10a05 2957 if (lxc_list_empty(&c->id_map))
f6d3e3e4 2958 return 0;
c4d10a05 2959
54117de5
CB
2960 if (!strcmp(c->console.name, ""))
2961 return 0;
2962
2963 if (chown_mapped_root(c->console.name, c) < 0) {
2964 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
2965 return -1;
2966 }
2967
54117de5
CB
2968 TRACE("chowned console \"%s\"", c->console.name);
2969
f6d3e3e4
SH
2970 return 0;
2971}
2972
943144d9
CB
2973/* NOTE: Must not be called from inside the container namespace! */
2974int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
2975{
2976 int mounted;
2977
943144d9 2978 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 2979 if (mounted == -1) {
943144d9 2980 SYSERROR("failed to mount /proc in the container");
01958b1f 2981 /* continue only if there is no rootfs */
943144d9 2982 if (conf->rootfs.path)
01958b1f 2983 return -1;
5112cd70 2984 } else if (mounted == 1) {
943144d9 2985 conf->tmp_umount_proc = 1;
5112cd70 2986 }
943144d9 2987
5112cd70
SH
2988 return 0;
2989}
2990
2991void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2992{
2993 if (lxc_conf->tmp_umount_proc == 1) {
2994 umount("/proc");
2995 lxc_conf->tmp_umount_proc = 0;
2996 }
2997}
2998
/* Walk /proc/self/mountinfo and change any shared entries to slave.
 *
 * Failures are logged but never fatal: container startup continues.
 */
void remount_all_slave(void)
{
	FILE *mountinfo;
	char *buf = NULL;
	size_t cap = 0;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &cap, mountinfo) != -1) {
		char *mountpoint, *propagation;

		/* The mount point is field 4 of the line; the field examined
		 * for "shared" is two fields after it.
		 */
		mountpoint = get_field(buf, 4);
		if (mountpoint == NULL)
			continue;

		propagation = get_field(mountpoint, 2);
		if (propagation == NULL)
			continue;

		null_endofword(propagation);
		if (strstr(propagation, "shared") == NULL)
			continue;

		null_endofword(mountpoint);
		if (mount(NULL, mountpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mountpoint);
			ERROR("Continuing...");
		}
	}

	free(buf);
	fclose(mountinfo);
}
3032
2322903b
SH
3033void lxc_execute_bind_init(struct lxc_conf *conf)
3034{
3035 int ret;
9d9c111c
SH
3036 char path[PATH_MAX], destpath[PATH_MAX], *p;
3037
3038 /* If init exists in the container, don't bind mount a static one */
3039 p = choose_init(conf->rootfs.mount);
3040 if (p) {
3041 free(p);
3042 return;
3043 }
2322903b
SH
3044
3045 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3046 if (ret < 0 || ret >= PATH_MAX) {
3047 WARN("Path name too long searching for lxc.init.static");
3048 return;
3049 }
3050
3051 if (!file_exists(path)) {
3052 INFO("%s does not exist on host", path);
3053 return;
3054 }
3055
3056 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3057 if (ret < 0 || ret >= PATH_MAX) {
3058 WARN("Path name too long for container's lxc.init.static");
3059 return;
3060 }
3061
3062 if (!file_exists(destpath)) {
3063 FILE * pathfile = fopen(destpath, "wb");
3064 if (!pathfile) {
3065 SYSERROR("Failed to create mount target '%s'", destpath);
3066 return;
3067 }
3068 fclose(pathfile);
3069 }
3070
592fd47a 3071 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3072 if (ret < 0)
3073 SYSERROR("Failed to bind lxc.init.static into container");
3074 INFO("lxc.init.static bound into container at %s", path);
3075}
3076
35120d9c
SH
3077/*
3078 * This does the work of remounting / if it is shared, calling the
3079 * container pre-mount hooks, and mounting the rootfs.
3080 */
3081int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3082{
35120d9c
SH
3083 if (conf->rootfs_setup) {
3084 /*
3085 * rootfs was set up in another namespace. bind-mount it
3086 * to give us a mount in our own ns so we can pivot_root to it
3087 */
3088 const char *path = conf->rootfs.mount;
3089 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3090 ERROR("Failed to bind-mount container / onto itself");
145832ba 3091 return -1;
35120d9c 3092 }
145832ba 3093 return 0;
35120d9c 3094 }
d4ef7c50 3095
e995d7a2
SH
3096 remount_all_slave();
3097
35120d9c
SH
3098 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3099 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3100 return -1;
3101 }
3102
9aa76a17 3103 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
3104 ERROR("failed to setup rootfs for '%s'", name);
3105 return -1;
3106 }
3107
3108 conf->rootfs_setup = true;
3109 return 0;
3110}
3111
1c1c7051
SH
3112static bool verify_start_hooks(struct lxc_conf *conf)
3113{
3114 struct lxc_list *it;
3115 char path[MAXPATHLEN];
3116 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3117 char *hookname = it->elem;
3118 struct stat st;
3119 int ret;
3120
3121 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 3122 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
3123 if (ret < 0 || ret >= MAXPATHLEN)
3124 return false;
3125 ret = stat(path, &st);
3126 if (ret) {
7b6753e7 3127 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
3128 hookname);
3129 return false;
3130 }
6a0c909a 3131 return true;
1c1c7051
SH
3132 }
3133
3134 return true;
3135}
3136
35120d9c
SH
3137int lxc_setup(struct lxc_handler *handler)
3138{
2187efd3 3139 int ret;
35120d9c
SH
3140 const char *name = handler->name;
3141 struct lxc_conf *lxc_conf = handler->conf;
3142 const char *lxcpath = handler->lxcpath;
35120d9c
SH
3143
3144 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3145 ERROR("Error setting up rootfs mount after spawn");
3146 return -1;
3147 }
3148
6c544cb3
MM
3149 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3150 if (setup_utsname(lxc_conf->utsname)) {
3151 ERROR("failed to setup the utsname for '%s'", name);
3152 return -1;
3153 }
0ad19a3f 3154 }
3155
811ef482 3156 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
36eb9bde 3157 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 3158 return -1;
0ad19a3f 3159 }
3160
790255cf
CB
3161 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3162 ERROR("Failed to network device names and ifindices to parent");
3163 return -1;
3164 }
3165
bc6928ff 3166 if (lxc_conf->autodev > 0) {
14221cbb 3167 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 3168 ERROR("failed to mount /dev in the container");
c6883f38
SH
3169 return -1;
3170 }
3171 }
3172
368bbc02
CS
3173 /* do automatic mounts (mainly /proc and /sys), but exclude
3174 * those that need to wait until other stuff has finished
3175 */
4fb3cba5 3176 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3177 ERROR("failed to setup the automatic mounts for '%s'", name);
3178 return -1;
3179 }
3180
06749971 3181 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 3182 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 3183 return -1;
576f946d 3184 }
3185
06749971 3186 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
3187 ERROR("failed to setup the mount entries for '%s'", name);
3188 return -1;
3189 }
3190
7b6753e7 3191 /* Make sure any start hooks are in the container */
1c1c7051
SH
3192 if (!verify_start_hooks(lxc_conf))
3193 return -1;
3194
2322903b
SH
3195 if (lxc_conf->is_execute)
3196 lxc_execute_bind_init(lxc_conf);
3197
368bbc02
CS
3198 /* now mount only cgroup, if wanted;
3199 * before, /sys could not have been mounted
3200 * (is either mounted automatically or via fstab entries)
3201 */
4fb3cba5 3202 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3203 ERROR("failed to setup the automatic mounts for '%s'", name);
3204 return -1;
3205 }
3206
283678ed 3207 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
3208 ERROR("failed to run mount hooks for container '%s'.", name);
3209 return -1;
3210 }
3211
bc6928ff 3212 if (lxc_conf->autodev > 0) {
283678ed 3213 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
3214 ERROR("failed to run autodev hooks for container '%s'.", name);
3215 return -1;
3216 }
06749971 3217
27245ff7 3218 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
3219 ERROR("failed to populate /dev in the container");
3220 return -1;
3221 }
3222 }
368bbc02 3223
3d7d929a 3224 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 3225 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 3226 return -1;
6e590161 3227 }
3228
69aa6655
DE
3229 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
3230 ERROR("failed to setup /dev symlinks for '%s'", name);
3231 return -1;
3232 }
3233
5112cd70 3234 /* mount /proc if it's not already there */
943144d9 3235 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 3236 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 3237 return -1;
e075f5d9 3238 }
e075f5d9 3239
ac778708 3240 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 3241 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 3242 return -1;
ed502555 3243 }
3244
70761e5e 3245 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 3246 ERROR("failed to setup the new pts instance");
95b5ffaf 3247 return -1;
3c26f34e 3248 }
3249
2187efd3
CB
3250 ret = lxc_create_ttys(handler);
3251 if (ret < 0)
e8bd4e43 3252 return -1;
e8bd4e43 3253
cccc74b5
DL
3254 if (setup_personality(lxc_conf->personality)) {
3255 ERROR("failed to setup personality");
3256 return -1;
3257 }
3258
97a8f74f
SG
3259 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3260 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 3261 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
3262 return -1;
3263 }
97a8f74f
SG
3264 if (dropcaps_except(&lxc_conf->keepcaps)) {
3265 ERROR("failed to keep requested caps");
3266 return -1;
3267 }
3268 } else if (setup_caps(&lxc_conf->caps)) {
3269 ERROR("failed to drop capabilities");
3270 return -1;
81810dd1
DL
3271 }
3272
f4152036 3273 NOTICE("Container \"%s\" is set up", name);
cd54d859 3274
0ad19a3f 3275 return 0;
3276}
26ddeedd 3277
283678ed
SH
3278int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3279 const char *lxcpath, char *argv[])
26ddeedd
SH
3280{
3281 int which = -1;
3282 struct lxc_list *it;
3283
3284 if (strcmp(hook, "pre-start") == 0)
3285 which = LXCHOOK_PRESTART;
5ea6163a
SH
3286 else if (strcmp(hook, "pre-mount") == 0)
3287 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
3288 else if (strcmp(hook, "mount") == 0)
3289 which = LXCHOOK_MOUNT;
f7bee6c6
MW
3290 else if (strcmp(hook, "autodev") == 0)
3291 which = LXCHOOK_AUTODEV;
26ddeedd
SH
3292 else if (strcmp(hook, "start") == 0)
3293 which = LXCHOOK_START;
52492063
WB
3294 else if (strcmp(hook, "stop") == 0)
3295 which = LXCHOOK_STOP;
26ddeedd
SH
3296 else if (strcmp(hook, "post-stop") == 0)
3297 which = LXCHOOK_POSTSTOP;
148e91f5
SH
3298 else if (strcmp(hook, "clone") == 0)
3299 which = LXCHOOK_CLONE;
37cf711b
SY
3300 else if (strcmp(hook, "destroy") == 0)
3301 which = LXCHOOK_DESTROY;
26ddeedd
SH
3302 else
3303 return -1;
3304 lxc_list_for_each(it, &conf->hooks[which]) {
3305 int ret;
3306 char *hookname = it->elem;
283678ed 3307 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
3308 if (ret)
3309 return ret;
3310 }
3311 return 0;
3312}
72d0e1cb 3313
72d0e1cb
SG
3314int lxc_clear_config_caps(struct lxc_conf *c)
3315{
1a0e70ac 3316 struct lxc_list *it, *next;
72d0e1cb 3317
9ebb03ad 3318 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
3319 lxc_list_del(it);
3320 free(it->elem);
3321 free(it);
3322 }
3323 return 0;
3324}
3325
74a3920a 3326static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
3327 struct lxc_list *it, *next;
3328
4355ab5f 3329 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3330 lxc_list_del(it);
3331 free(it->elem);
3332 free(it);
3333 }
3334 return 0;
3335}
3336
4355ab5f
SH
3337int lxc_clear_idmaps(struct lxc_conf *c)
3338{
3339 return lxc_free_idmap(&c->id_map);
3340}
3341
1fb86a7c
SH
3342int lxc_clear_config_keepcaps(struct lxc_conf *c)
3343{
3344 struct lxc_list *it,*next;
3345
3346 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3347 lxc_list_del(it);
3348 free(it->elem);
3349 free(it);
3350 }
3351 return 0;
3352}
3353
12a50cc6 3354int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 3355{
9ebb03ad 3356 struct lxc_list *it,*next;
72d0e1cb 3357 bool all = false;
a6390f01 3358 const char *k = NULL;
72d0e1cb
SG
3359
3360 if (strcmp(key, "lxc.cgroup") == 0)
3361 all = true;
a6390f01
WB
3362 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
3363 k = key + sizeof("lxc.cgroup.")-1;
3364 else
3365 return -1;
72d0e1cb 3366
9ebb03ad 3367 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
3368 struct lxc_cgroup *cg = it->elem;
3369 if (!all && strcmp(cg->subsystem, k) != 0)
3370 continue;
3371 lxc_list_del(it);
3372 free(cg->subsystem);
3373 free(cg->value);
3374 free(cg);
3375 free(it);
3376 }
3377 return 0;
3378}
3379
c6d09e15
WB
3380int lxc_clear_limits(struct lxc_conf *c, const char *key)
3381{
3382 struct lxc_list *it, *next;
3383 bool all = false;
3384 const char *k = NULL;
3385
240d4b74 3386 if (strcmp(key, "lxc.limit") == 0
3387 || strcmp(key, "lxc.prlimit"))
c6d09e15
WB
3388 all = true;
3389 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
3390 k = key + sizeof("lxc.limit.")-1;
240d4b74 3391 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
3392 k = key + sizeof("lxc.prlimit.")-1;
c6d09e15
WB
3393 else
3394 return -1;
3395
3396 lxc_list_for_each_safe(it, &c->limits, next) {
3397 struct lxc_limit *lim = it->elem;
3398 if (!all && strcmp(lim->resource, k) != 0)
3399 continue;
3400 lxc_list_del(it);
3401 free(lim->resource);
3402 free(lim);
3403 free(it);
3404 }
3405 return 0;
3406}
3407
ee1e7aa0
SG
3408int lxc_clear_groups(struct lxc_conf *c)
3409{
3410 struct lxc_list *it,*next;
3411
3412 lxc_list_for_each_safe(it, &c->groups, next) {
3413 lxc_list_del(it);
3414 free(it->elem);
3415 free(it);
3416 }
3417 return 0;
3418}
3419
ab799c0b
SG
3420int lxc_clear_environment(struct lxc_conf *c)
3421{
3422 struct lxc_list *it,*next;
3423
3424 lxc_list_for_each_safe(it, &c->environment, next) {
3425 lxc_list_del(it);
3426 free(it->elem);
3427 free(it);
3428 }
3429 return 0;
3430}
3431
72d0e1cb
SG
3432int lxc_clear_mount_entries(struct lxc_conf *c)
3433{
9ebb03ad 3434 struct lxc_list *it,*next;
72d0e1cb 3435
9ebb03ad 3436 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
3437 lxc_list_del(it);
3438 free(it->elem);
3439 free(it);
3440 }
3441 return 0;
3442}
3443
b099e9e9
SH
3444int lxc_clear_automounts(struct lxc_conf *c)
3445{
3446 c->auto_mounts = 0;
3447 return 0;
3448}
3449
12a50cc6 3450int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3451{
9ebb03ad 3452 struct lxc_list *it,*next;
17ed13a3 3453 bool all = false, done = false;
a6390f01 3454 const char *k = NULL;
72d0e1cb
SG
3455 int i;
3456
17ed13a3
SH
3457 if (strcmp(key, "lxc.hook") == 0)
3458 all = true;
a6390f01
WB
3459 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3460 k = key + sizeof("lxc.hook.")-1;
3461 else
3462 return -1;
17ed13a3 3463
72d0e1cb 3464 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 3465 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 3466 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
3467 lxc_list_del(it);
3468 free(it->elem);
3469 free(it);
3470 }
3471 done = true;
72d0e1cb
SG
3472 }
3473 }
17ed13a3
SH
3474
3475 if (!done) {
3476 ERROR("Invalid hook key: %s", key);
3477 return -1;
3478 }
72d0e1cb
SG
3479 return 0;
3480}
8eb5694b 3481
4184c3e1
SH
3482static inline void lxc_clear_aliens(struct lxc_conf *conf)
3483{
3484 struct lxc_list *it,*next;
3485
3486 lxc_list_for_each_safe(it, &conf->aliens, next) {
3487 lxc_list_del(it);
3488 free(it->elem);
3489 free(it);
3490 }
3491}
3492
c7b15d1e 3493void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
3494{
3495 struct lxc_list *it,*next;
3496
3497 lxc_list_for_each_safe(it, &conf->includes, next) {
3498 lxc_list_del(it);
3499 free(it->elem);
3500 free(it);
3501 }
3502}
3503
8eb5694b
SH
3504void lxc_conf_free(struct lxc_conf *conf)
3505{
3506 if (!conf)
3507 return;
858377e4
SH
3508 if (current_config == conf)
3509 current_config = NULL;
f10fad2f
ME
3510 free(conf->console.log_path);
3511 free(conf->console.path);
3512 free(conf->rootfs.mount);
b3b8c97f 3513 free(conf->rootfs.bdev_type);
f10fad2f
ME
3514 free(conf->rootfs.options);
3515 free(conf->rootfs.path);
f10fad2f 3516 free(conf->logfile);
858377e4
SH
3517 if (conf->logfd != -1)
3518 close(conf->logfd);
f10fad2f
ME
3519 free(conf->utsname);
3520 free(conf->ttydir);
3521 free(conf->fstab);
3522 free(conf->rcfile);
3523 free(conf->init_cmd);
6b0d5538 3524 free(conf->unexpanded_config);
393903d1 3525 free(conf->pty_names);
76d0127f 3526 free(conf->syslog);
c302b476 3527 lxc_free_networks(&conf->network);
f10fad2f
ME
3528 free(conf->lsm_aa_profile);
3529 free(conf->lsm_se_context);
769872f9 3530 lxc_seccomp_free(conf);
8eb5694b 3531 lxc_clear_config_caps(conf);
1fb86a7c 3532 lxc_clear_config_keepcaps(conf);
8eb5694b 3533 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 3534 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3535 lxc_clear_mount_entries(conf);
27c27d73 3536 lxc_clear_idmaps(conf);
ee1e7aa0 3537 lxc_clear_groups(conf);
f979ac15 3538 lxc_clear_includes(conf);
761d81ca 3539 lxc_clear_aliens(conf);
ab799c0b 3540 lxc_clear_environment(conf);
240d4b74 3541 lxc_clear_limits(conf, "lxc.prlimit");
43654d34
CB
3542 free(conf->cgroup_meta.dir);
3543 free(conf->cgroup_meta.controllers);
8eb5694b
SH
3544 free(conf);
3545}
4355ab5f
SH
3546
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function the child runs once it has been told to proceed. */
	int (*fn)(void *);
	/* Optional name of fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to fn. */
	void *arg;
	/* Synchronization pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
3553
3554static int run_userns_fn(void *data)
3555{
3556 struct userns_fn_data *d = data;
3557 char c;
4355ab5f 3558
f8aa4bf3 3559 /* Close write end of the pipe. */
4355ab5f 3560 close(d->p[1]);
f8aa4bf3
CB
3561
3562 /* Wait for parent to finish establishing a new mapping in the user
3563 * namespace we are executing in.
3564 */
4355ab5f
SH
3565 if (read(d->p[0], &c, 1) != 1)
3566 return -1;
f8aa4bf3
CB
3567
3568 /* Close read end of the pipe. */
4355ab5f 3569 close(d->p[0]);
f8aa4bf3 3570
c9b7c33e
CB
3571 if (d->fn_name)
3572 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 3573 /* Call function to run. */
4355ab5f
SH
3574 return d->fn(d->arg);
3575}
3576
339efad9 3577static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
3578 enum idtype idtype)
3579{
3580 struct lxc_list *it;
3581 struct id_map *map;
3582 struct id_map *retmap = NULL;
3583
3584 lxc_list_for_each(it, &conf->id_map) {
3585 map = it->elem;
3586 if (map->idtype != idtype)
3587 continue;
3588
3589 if (id >= map->hostid && id < map->hostid + map->range) {
3590 retmap = map;
3591 break;
3592 }
3593 }
3594
3595 if (!retmap)
3596 return NULL;
3597
3598 retmap = malloc(sizeof(*retmap));
3599 if (!retmap)
3600 return NULL;
3601
3602 memcpy(retmap, map, sizeof(*retmap));
3603 return retmap;
3604}
3605
4355ab5f 3606/*
f8aa4bf3
CB
3607 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3608 * existing one or establish a new one.
4355ab5f 3609 */
28a2d9e7 3610static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 3611{
28a2d9e7 3612 int hostid_mapped;
f8aa4bf3 3613 struct id_map *entry = NULL;
f8aa4bf3 3614
28a2d9e7
CB
3615 /* Reuse existing mapping. */
3616 entry = mapped_hostid_entry(conf, id, type);
3617 if (entry)
3618 return entry;
f8aa4bf3 3619
28a2d9e7
CB
3620 /* Find new mapping. */
3621 hostid_mapped = find_unmapped_nsid(conf, type);
3622 if (hostid_mapped < 0) {
3623 DEBUG("failed to find free mapping for id %d", id);
3624 return NULL;
f8aa4bf3 3625 }
f8aa4bf3 3626
28a2d9e7
CB
3627 entry = malloc(sizeof(*entry));
3628 if (!entry)
3629 return NULL;
4355ab5f 3630
28a2d9e7
CB
3631 entry->idtype = type;
3632 entry->nsid = hostid_mapped;
3633 entry->hostid = (unsigned long)id;
3634 entry->range = 1;
4355ab5f 3635
28a2d9e7 3636 return entry;
4355ab5f
SH
3637}
3638
f8aa4bf3
CB
3639/* Run a function in a new user namespace.
3640 * The caller's euid/egid will be mapped if it is not already.
3641 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3642 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3643 * This means we require only to establish a mapping from:
3644 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3645 * - the container root -> some sub{g,u}id
3646 * The former we add, if the user did not specifiy a mapping. The latter we
3647 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3648 * there to start the container in the first place.
4355ab5f 3649 */
c9b7c33e
CB
3650int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3651 const char *fn_name)
4355ab5f 3652{
f8aa4bf3
CB
3653 pid_t pid;
3654 uid_t euid, egid;
4355ab5f 3655 struct userns_fn_data d;
4355ab5f 3656 int p[2];
f8aa4bf3
CB
3657 struct lxc_list *it;
3658 struct id_map *map;
3659 char c = '1';
3660 int ret = -1;
3661 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
3662 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3663 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 3664
4355ab5f 3665 ret = pipe(p);
4355ab5f
SH
3666 if (ret < 0) {
3667 SYSERROR("opening pipe");
3668 return -1;
3669 }
3670 d.fn = fn;
c9b7c33e 3671 d.fn_name = fn_name;
4355ab5f
SH
3672 d.arg = data;
3673 d.p[0] = p[0];
3674 d.p[1] = p[1];
f8aa4bf3
CB
3675
3676 /* Clone child in new user namespace. */
4355ab5f 3677 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
3678 if (pid < 0) {
3679 ERROR("failed to clone child process in new user namespace");
3680 goto on_error;
3681 }
3682
4355ab5f 3683 close(p[0]);
4355ab5f
SH
3684 p[0] = -1;
3685
f8aa4bf3
CB
3686 /* Find container root. */
3687 lxc_list_for_each(it, &conf->id_map) {
3688 map = it->elem;
3689
3690 if (map->nsid != 0)
3691 continue;
3692
3693 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3694 container_root_uid = malloc(sizeof(*container_root_uid));
3695 if (!container_root_uid)
3696 goto on_error;
3697 container_root_uid->idtype = map->idtype;
3698 container_root_uid->hostid = map->hostid;
3699 container_root_uid->nsid = 0;
3700 container_root_uid->range = map->range;
3701 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3702 container_root_gid = malloc(sizeof(*container_root_gid));
3703 if (!container_root_gid)
3704 goto on_error;
3705 container_root_gid->idtype = map->idtype;
3706 container_root_gid->hostid = map->hostid;
3707 container_root_gid->nsid = 0;
3708 container_root_gid->range = map->range;
3709 }
3710
3711 /* Found container root. */
3712 if (container_root_uid && container_root_gid)
3713 break;
3714 }
3715
3716 /* This is actually checked earlier but it can't hurt. */
3717 if (!container_root_uid || !container_root_gid) {
3718 ERROR("no mapping for container root found");
3719 goto on_error;
3720 }
3721
1d90e064
CB
3722 host_uid_map = container_root_uid;
3723 host_gid_map = container_root_gid;
3724
f8aa4bf3
CB
3725 /* Check whether the {g,u}id of the user has a mapping. */
3726 euid = geteuid();
3727 egid = getegid();
1d90e064 3728 if (euid != container_root_uid->hostid)
28a2d9e7
CB
3729 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3730
1d90e064 3731 if (egid != container_root_gid->hostid)
28a2d9e7
CB
3732 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3733
3734 if (!host_uid_map) {
3735 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
3736 goto on_error;
3737 }
3738
28a2d9e7
CB
3739 if (!host_gid_map) {
3740 DEBUG("failed to find mapping for gid %d", egid);
3741 goto on_error;
3742 }
3743
3744 /* Allocate new {g,u}id map list. */
3745 idmap = malloc(sizeof(*idmap));
3746 if (!idmap)
3747 goto on_error;
3748 lxc_list_init(idmap);
3749
f8aa4bf3
CB
3750 /* Add container root to the map. */
3751 tmplist = malloc(sizeof(*tmplist));
3752 if (!tmplist)
3753 goto on_error;
3754 lxc_list_add_elem(tmplist, container_root_uid);
3755 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3756
1d90e064 3757 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
3758 /* idmap will now keep track of that memory. */
3759 container_root_uid = NULL;
3760
3761 /* Add container root to the map. */
3762 tmplist = malloc(sizeof(*tmplist));
3763 if (!tmplist)
3764 goto on_error;
3765 lxc_list_add_elem(tmplist, host_uid_map);
3766 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3767 }
1d90e064
CB
3768 /* idmap will now keep track of that memory. */
3769 container_root_uid = NULL;
3770 /* idmap will now keep track of that memory. */
3771 host_uid_map = NULL;
f8aa4bf3
CB
3772
3773 tmplist = malloc(sizeof(*tmplist));
3774 if (!tmplist)
3775 goto on_error;
3776 lxc_list_add_elem(tmplist, container_root_gid);
3777 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3778
1d90e064 3779 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
3780 /* idmap will now keep track of that memory. */
3781 container_root_gid = NULL;
3782
3783 tmplist = malloc(sizeof(*tmplist));
3784 if (!tmplist)
3785 goto on_error;
3786 lxc_list_add_elem(tmplist, host_gid_map);
3787 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3788 }
1d90e064
CB
3789 /* idmap will now keep track of that memory. */
3790 container_root_gid = NULL;
3791 /* idmap will now keep track of that memory. */
3792 host_gid_map = NULL;
f8aa4bf3 3793
4b73005c
CB
3794 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3795 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
3796 lxc_list_for_each(it, idmap) {
3797 map = it->elem;
3798 TRACE("establishing %cid mapping for \"%d\" in new "
3799 "user namespace: nsuid %lu - hostid %lu - range "
3800 "%lu",
3801 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3802 map->nsid, map->hostid, map->range);
3803 }
4355ab5f
SH
3804 }
3805
f8aa4bf3 3806 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 3807 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
3808 if (ret < 0) {
3809 ERROR("error setting up {g,u}id mappings for child process "
3810 "\"%d\"",
3811 pid);
3812 goto on_error;
4355ab5f
SH
3813 }
3814
f8aa4bf3 3815 /* Tell child to proceed. */
4355ab5f 3816 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
3817 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3818 goto on_error;
4355ab5f
SH
3819 }
3820
f8aa4bf3 3821 /* Wait for child to finish. */
3139aead
SG
3822 ret = wait_for_pid(pid);
3823
f8aa4bf3 3824on_error:
1d90e064
CB
3825 if (idmap)
3826 lxc_free_idmap(idmap);
3827 if (container_root_uid)
3828 free(container_root_uid);
3829 if (container_root_gid)
3830 free(container_root_gid);
3831 if (host_uid_map && (host_uid_map != container_root_uid))
3832 free(host_uid_map);
3833 if (host_gid_map && (host_gid_map != container_root_gid))
3834 free(host_gid_map);
3139aead 3835
4355ab5f
SH
3836 if (p[0] != -1)
3837 close(p[0]);
3838 close(p[1]);
f8aa4bf3
CB
3839
3840 return ret;
4355ab5f 3841}
97e9cfa0 3842
/*
 * Return a malloc()ed copy of the login name for the effective uid, or NULL
 * when the lookup (or the copy) fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
3854
/*
 * Return a malloc()ed copy of the group name for the effective gid, or NULL
 * when the lookup (or the copy) fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
3866
a96a8e8c 3867/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
3868void suggest_default_idmap(void)
3869{
3870 FILE *f;
3871 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
3872 char *line = NULL;
3873 char *uname, *gname;
3874 size_t len = 0;
3875
3876 if (!(uname = getuname()))
3877 return;
3878
3879 if (!(gname = getgname())) {
3880 free(uname);
3881 return;
3882 }
3883
3884 f = fopen(subuidfile, "r");
3885 if (!f) {
3886 ERROR("Your system is not configured with subuids");
3887 free(gname);
3888 free(uname);
3889 return;
3890 }
3891 while (getline(&line, &len, f) != -1) {
b7930180 3892 size_t no_newline = 0;
97e9cfa0
SH
3893 char *p = strchr(line, ':'), *p2;
3894 if (*line == '#')
3895 continue;
3896 if (!p)
3897 continue;
3898 *p = '\0';
3899 p++;
3900 if (strcmp(line, uname))
3901 continue;
3902 p2 = strchr(p, ':');
3903 if (!p2)
3904 continue;
3905 *p2 = '\0';
3906 p2++;
3907 if (!*p2)
3908 continue;
b7930180
CB
3909 no_newline = strcspn(p2, "\n");
3910 p2[no_newline] = '\0';
3911
b7b2fde4
CB
3912 if (lxc_safe_uint(p, &uid) < 0)
3913 WARN("Could not parse UID.");
3914 if (lxc_safe_uint(p2, &urange) < 0)
3915 WARN("Could not parse UID range.");
97e9cfa0
SH
3916 }
3917 fclose(f);
3918
6be7389a 3919 f = fopen(subgidfile, "r");
97e9cfa0
SH
3920 if (!f) {
3921 ERROR("Your system is not configured with subgids");
3922 free(gname);
3923 free(uname);
3924 return;
3925 }
3926 while (getline(&line, &len, f) != -1) {
b7930180 3927 size_t no_newline = 0;
97e9cfa0
SH
3928 char *p = strchr(line, ':'), *p2;
3929 if (*line == '#')
3930 continue;
3931 if (!p)
3932 continue;
3933 *p = '\0';
3934 p++;
3935 if (strcmp(line, uname))
3936 continue;
3937 p2 = strchr(p, ':');
3938 if (!p2)
3939 continue;
3940 *p2 = '\0';
3941 p2++;
3942 if (!*p2)
3943 continue;
b7930180
CB
3944 no_newline = strcspn(p2, "\n");
3945 p2[no_newline] = '\0';
3946
b7b2fde4
CB
3947 if (lxc_safe_uint(p, &gid) < 0)
3948 WARN("Could not parse GID.");
3949 if (lxc_safe_uint(p2, &grange) < 0)
3950 WARN("Could not parse GID range.");
97e9cfa0
SH
3951 }
3952 fclose(f);
3953
f10fad2f 3954 free(line);
97e9cfa0
SH
3955
3956 if (!urange || !grange) {
3957 ERROR("You do not have subuids or subgids allocated");
3958 ERROR("Unprivileged containers require subuids and subgids");
3959 return;
3960 }
3961
3962 ERROR("You must either run as root, or define uid mappings");
3963 ERROR("To pass uid mappings to lxc-create, you could create");
3964 ERROR("~/.config/lxc/default.conf:");
3965 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
3966 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
3967 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
3968
3969 free(gname);
3970 free(uname);
3971}
aaf26830 3972
a7307747
SH
3973static void free_cgroup_settings(struct lxc_list *result)
3974{
3975 struct lxc_list *iterator, *next;
3976
3977 lxc_list_for_each_safe(iterator, result, next) {
3978 lxc_list_del(iterator);
3979 free(iterator);
3980 }
3981 free(result);
3982}
3983
aaf26830
KT
3984/*
3985 * Return the list of cgroup_settings sorted according to the following rules
3986 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
3987 */
3988struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
3989{
3990 struct lxc_list *result;
3991 struct lxc_list *memsw_limit = NULL;
3992 struct lxc_list *it = NULL;
3993 struct lxc_cgroup *cg = NULL;
3994 struct lxc_list *item = NULL;
3995
3996 result = malloc(sizeof(*result));
fac7c663
KT
3997 if (!result) {
3998 ERROR("failed to allocate memory to sort cgroup settings");
3999 return NULL;
4000 }
aaf26830
KT
4001 lxc_list_init(result);
4002
4003 /*Iterate over the cgroup settings and copy them to the output list*/
4004 lxc_list_for_each(it, cgroup_settings) {
4005 item = malloc(sizeof(*item));
fac7c663
KT
4006 if (!item) {
4007 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 4008 free_cgroup_settings(result);
fac7c663
KT
4009 return NULL;
4010 }
aaf26830
KT
4011 item->elem = it->elem;
4012 cg = it->elem;
4013 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4014 /* Store the memsw_limit location */
4015 memsw_limit = item;
4016 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 4017 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
4018 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4019 item->elem = memsw_limit->elem;
4020 memsw_limit->elem = it->elem;
4021 }
4022 lxc_list_add_tail(result, item);
4023 }
4024
4025 return result;
a7307747 4026}