]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
conf: do not log uninitialized memory
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "caps.h" /* for lxc_caps_last_cap() */
77#include "cgroup.h"
1b09f2c0 78#include "conf.h"
1ed6ba91 79#include "confile_utils.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
025ed0f3 82#include "lxclock.h"
8f3e280e 83#include "lxcseccomp.h"
4355ab5f 84#include "namespace.h"
8f3e280e
CB
85#include "network.h"
86#include "parse.h"
28d832c4
CB
87#include "storage.h"
88#include "storage/aufs.h"
89#include "storage/overlay.h"
8f3e280e 90#include "utils.h"
fe4de9a6 91#include "lsm/lsm.h"
d0a36f2c 92
e37dda71 93#if HAVE_LIBCAP
495d2046
SG
94#include <sys/capability.h>
95#endif
96
6ff05e18
SG
97#if HAVE_SYS_PERSONALITY_H
98#include <sys/personality.h>
99#endif
100
edaf8b1b
SG
101#if IS_BIONIC
102#include <../include/lxcmntent.h>
a04f5407
CB
103#ifndef HAVE_PRLIMIT
104#include <../include/prlimit.h>
105#endif
edaf8b1b
SG
106#else
107#include <mntent.h>
108#endif
109
36eb9bde 110lxc_log_define(lxc_conf, lxc);
e5bda9ee 111
e37dda71 112#if HAVE_LIBCAP
b09094da
MN
113#ifndef CAP_SETFCAP
114#define CAP_SETFCAP 31
115#endif
116
117#ifndef CAP_MAC_OVERRIDE
118#define CAP_MAC_OVERRIDE 32
119#endif
120
121#ifndef CAP_MAC_ADMIN
122#define CAP_MAC_ADMIN 33
123#endif
495d2046 124#endif
b09094da
MN
125
126#ifndef PR_CAPBSET_DROP
127#define PR_CAPBSET_DROP 24
128#endif
129
9818cae4
SG
130#ifndef LO_FLAGS_AUTOCLEAR
131#define LO_FLAGS_AUTOCLEAR 4
132#endif
133
bc5b27d6
DK
134#ifndef CAP_SETUID
135#define CAP_SETUID 7
136#endif
137
138#ifndef CAP_SETGID
139#define CAP_SETGID 6
140#endif
141
0769b82a
CS
142/* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144#ifndef CAP_SYS_ADMIN
145#define CAP_SYS_ADMIN 21
146#endif
147
2d76d1d7
SG
148/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/*
 * Fallback for C libraries without pivot_root(): issue the raw syscall when
 * its number is known, otherwise fail with ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif
162
163/* Define sethostname() if missing from the C library */
#ifndef HAVE_SETHOSTNAME
/*
 * Fallback for C libraries without sethostname(): issue the raw syscall when
 * its number is known, otherwise fail with ENOSYS.
 */
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
ecec0126
SG
176#ifndef MS_PRIVATE
177#define MS_PRIVATE (1<<18)
178#endif
179
8912711c
CB
180#ifndef MS_LAZYTIME
181#define MS_LAZYTIME (1<<25)
182#endif
183
5ef5c9a3
CB
184/* memfd_create() */
185#ifndef MFD_CLOEXEC
186#define MFD_CLOEXEC 0x0001U
187#endif
188
189#ifndef MFD_ALLOW_SEALING
190#define MFD_ALLOW_SEALING 0x0002U
191#endif
192
#ifndef HAVE_MEMFD_CREATE

/*
 * If the kernel headers do not provide __NR_memfd_create, fall back to the
 * per-architecture syscall numbers listed below.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#elif _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#elif _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/*
 * Fallback for C libraries without memfd_create(): issue the raw syscall, or
 * fail with ENOSYS when no syscall number is known for this architecture.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
236
2b9ae35a
CB
237char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
72d0e1cb 240
998ac676
RT
241struct mount_opt {
242 char *name;
243 int clear;
244 int flag;
245};
246
81810dd1
DL
247struct caps_opt {
248 char *name;
249 int value;
250};
251
c6d09e15
WB
252struct limit_opt {
253 char *name;
254 int value;
255};
256
858377e4
SH
257/*
258 * The lxc_conf of the container currently being worked on in an
259 * API call
260 * This is used in the error calls
261 */
262#ifdef HAVE_TLS
263__thread struct lxc_conf *current_config;
264#else
265struct lxc_conf *current_config;
266#endif
267
0769b82a
CS
268/* Declare this here, since we don't want to reshuffle the whole file. */
269static int in_caplist(int cap, struct lxc_list *caps);
270
998ac676 271static struct mount_opt mount_opt[] = {
470b359b
CB
272 { "async", 1, MS_SYNCHRONOUS },
273 { "atime", 1, MS_NOATIME },
274 { "bind", 0, MS_BIND },
88d413d5 275 { "defaults", 0, 0 },
88d413d5 276 { "dev", 1, MS_NODEV },
470b359b 277 { "diratime", 1, MS_NODIRATIME },
88d413d5 278 { "dirsync", 0, MS_DIRSYNC },
470b359b 279 { "exec", 1, MS_NOEXEC },
8912711c 280 { "lazytime", 0, MS_LAZYTIME },
88d413d5 281 { "mand", 0, MS_MANDLOCK },
88d413d5 282 { "noatime", 0, MS_NOATIME },
470b359b 283 { "nodev", 0, MS_NODEV },
88d413d5 284 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
285 { "noexec", 0, MS_NOEXEC },
286 { "nomand", 1, MS_MANDLOCK },
287 { "norelatime", 1, MS_RELATIME },
288 { "nostrictatime", 1, MS_STRICTATIME },
289 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
290 { "rbind", 0, MS_BIND|MS_REC },
291 { "relatime", 0, MS_RELATIME },
470b359b
CB
292 { "remount", 0, MS_REMOUNT },
293 { "ro", 0, MS_RDONLY },
294 { "rw", 1, MS_RDONLY },
88d413d5 295 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
296 { "suid", 1, MS_NOSUID },
297 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 298 { NULL, 0, 0 },
998ac676
RT
299};
300
e37dda71 301#if HAVE_LIBCAP
81810dd1 302static struct caps_opt caps_opt[] = {
a6afdde9 303 { "chown", CAP_CHOWN },
1e11be34
DL
304 { "dac_override", CAP_DAC_OVERRIDE },
305 { "dac_read_search", CAP_DAC_READ_SEARCH },
306 { "fowner", CAP_FOWNER },
307 { "fsetid", CAP_FSETID },
81810dd1
DL
308 { "kill", CAP_KILL },
309 { "setgid", CAP_SETGID },
310 { "setuid", CAP_SETUID },
311 { "setpcap", CAP_SETPCAP },
312 { "linux_immutable", CAP_LINUX_IMMUTABLE },
313 { "net_bind_service", CAP_NET_BIND_SERVICE },
314 { "net_broadcast", CAP_NET_BROADCAST },
315 { "net_admin", CAP_NET_ADMIN },
316 { "net_raw", CAP_NET_RAW },
317 { "ipc_lock", CAP_IPC_LOCK },
318 { "ipc_owner", CAP_IPC_OWNER },
319 { "sys_module", CAP_SYS_MODULE },
320 { "sys_rawio", CAP_SYS_RAWIO },
321 { "sys_chroot", CAP_SYS_CHROOT },
322 { "sys_ptrace", CAP_SYS_PTRACE },
323 { "sys_pacct", CAP_SYS_PACCT },
324 { "sys_admin", CAP_SYS_ADMIN },
325 { "sys_boot", CAP_SYS_BOOT },
326 { "sys_nice", CAP_SYS_NICE },
327 { "sys_resource", CAP_SYS_RESOURCE },
328 { "sys_time", CAP_SYS_TIME },
329 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
330 { "mknod", CAP_MKNOD },
331 { "lease", CAP_LEASE },
57b837e2
CB
332#ifdef CAP_AUDIT_READ
333 { "audit_read", CAP_AUDIT_READ },
334#endif
9527e566 335#ifdef CAP_AUDIT_WRITE
81810dd1 336 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
337#endif
338#ifdef CAP_AUDIT_CONTROL
81810dd1 339 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 340#endif
81810dd1
DL
341 { "setfcap", CAP_SETFCAP },
342 { "mac_override", CAP_MAC_OVERRIDE },
343 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
344#ifdef CAP_SYSLOG
345 { "syslog", CAP_SYSLOG },
346#endif
347#ifdef CAP_WAKE_ALARM
348 { "wake_alarm", CAP_WAKE_ALARM },
349#endif
2b54359b
CB
350#ifdef CAP_BLOCK_SUSPEND
351 { "block_suspend", CAP_BLOCK_SUSPEND },
352#endif
81810dd1 353};
495d2046
SG
354#else
355static struct caps_opt caps_opt[] = {};
356#endif
81810dd1 357
c6d09e15
WB
358static struct limit_opt limit_opt[] = {
359#ifdef RLIMIT_AS
360 { "as", RLIMIT_AS },
361#endif
362#ifdef RLIMIT_CORE
363 { "core", RLIMIT_CORE },
364#endif
365#ifdef RLIMIT_CPU
366 { "cpu", RLIMIT_CPU },
367#endif
368#ifdef RLIMIT_DATA
369 { "data", RLIMIT_DATA },
370#endif
371#ifdef RLIMIT_FSIZE
372 { "fsize", RLIMIT_FSIZE },
373#endif
374#ifdef RLIMIT_LOCKS
375 { "locks", RLIMIT_LOCKS },
376#endif
377#ifdef RLIMIT_MEMLOCK
378 { "memlock", RLIMIT_MEMLOCK },
379#endif
380#ifdef RLIMIT_MSGQUEUE
381 { "msgqueue", RLIMIT_MSGQUEUE },
382#endif
383#ifdef RLIMIT_NICE
384 { "nice", RLIMIT_NICE },
385#endif
386#ifdef RLIMIT_NOFILE
387 { "nofile", RLIMIT_NOFILE },
388#endif
389#ifdef RLIMIT_NPROC
390 { "nproc", RLIMIT_NPROC },
391#endif
392#ifdef RLIMIT_RSS
393 { "rss", RLIMIT_RSS },
394#endif
395#ifdef RLIMIT_RTPRIO
396 { "rtprio", RLIMIT_RTPRIO },
397#endif
398#ifdef RLIMIT_RTTIME
399 { "rttime", RLIMIT_RTTIME },
400#endif
401#ifdef RLIMIT_SIGPENDING
402 { "sigpending", RLIMIT_SIGPENDING },
403#endif
404#ifdef RLIMIT_STACK
405 { "stack", RLIMIT_STACK },
406#endif
407};
408
91c3830e
SH
409static int run_buffer(char *buffer)
410{
ebec9176 411 struct lxc_popen_FILE *f;
91c3830e 412 char *output;
8e7da691 413 int ret;
91c3830e 414
ebec9176 415 f = lxc_popen(buffer);
91c3830e 416 if (!f) {
062b72c6 417 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
418 return -1;
419 }
420
421 output = malloc(LXC_LOG_BUFFER_SIZE);
422 if (!output) {
062b72c6 423 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 424 lxc_pclose(f);
91c3830e
SH
425 return -1;
426 }
427
062b72c6
CB
428 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
429 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
430
431 free(output);
432
ebec9176 433 ret = lxc_pclose(f);
8e7da691 434 if (ret == -1) {
062b72c6 435 SYSERROR("Script exited with error.");
91c3830e 436 return -1;
8e7da691 437 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 438 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
439 return -1;
440 } else if (WIFSIGNALED(ret)) {
062b72c6 441 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 442 return -1;
91c3830e
SH
443 }
444
445 return 0;
446}
447
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it through run_buffer().
 *
 * @name:    container name, passed as the first script argument
 * @section: config section, passed as the second script argument
 * @script:  path of the script to run
 * @hook:    hook name, passed as the third script argument
 * @lxcpath: currently unused by this function
 * @argsin:  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * The command line is assembled in a heap buffer: its length is controlled by
 * the caller-supplied strings, so placing it on the stack with alloca() could
 * overflow the stack.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	int fret = -1;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Each extra argument is preceded by one space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Separators between the fixed parts plus the terminating NUL. */
	size += 3;

	/* snprintf() reports lengths as int; refuse anything larger. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			goto out;
		}
		ret += rc;
	}

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
498
/*
 * Build the command line "<script> <name> <section> [args...]" and execute it
 * through run_buffer().
 *
 * @name:    container name, passed as the first script argument
 * @section: config section, passed as the second script argument
 * @script:  path of the script to run
 * @...:     extra "char *" arguments, terminated by a NULL pointer
 *
 * The command line is assembled in a heap buffer: its length is controlled by
 * the caller-supplied strings, so placing it on the stack with alloca() could
 * overflow the stack. Every va_start() is paired with a va_end() on all
 * paths, including the error path inside the argument loop.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * built.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the variadic arguments: compute the space needed. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separators between the fixed parts plus the terminating NUL. */
	size += 3;

	/* snprintf() reports lengths as int; refuse anything larger. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	/* Second pass: append every extra argument, separated by a space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
549
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name is left behind in the rootfs.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path cannot
 *           be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Reject both output errors and truncation. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd keeps the file alive; the name itself is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
592
e2a7e8dc
SH
/*
 * When a remount is requested, make sure the restrictive flags already in
 * effect on the mount (nosuid, nodev, read-only, noexec) are carried over.
 *
 * @s:     mount source; used as the path to inspect when non-NULL
 * @d:     mount destination; inspected instead when @s is NULL
 * @flags: requested mount flags
 *
 * Returns @flags unchanged if MS_REMOUNT is not set, if no path is available,
 * if statvfs() fails, or if statvfs support is not compiled in; otherwise
 * @flags with the restrictive flags found on the path OR'ed in.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *path = s ? s : d;
	struct statvfs vfs;
	size_t k;

	if (!(flags & MS_REMOUNT) || !path)
		return flags;

	if (statvfs(path, &vfs) < 0)
		return flags;

	for (k = 0; k < sizeof(sticky) / sizeof(sticky[0]); k++) {
		if (vfs.f_flag & sticky[k])
			flags |= sticky[k];
	}

	return flags;
#else
	return flags;
#endif
}
629
4fb3cba5 630static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 631{
368bbc02 632 int r;
80e80c40 633 int i;
b06b8511
CS
634 static struct {
635 int match_mask;
636 int match_flag;
637 const char *source;
638 const char *destination;
639 const char *fstype;
640 unsigned long flags;
641 const char *options;
642 } default_mounts[] = {
643 /* Read-only bind-mounting... In older kernels, doing that required
644 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
645 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
646 * kernel 2.6.26 onwards. However, this apparently does not work on
647 * kernel 3.8. Unfortunately, on that very same kernel, doing the
648 * same trick as above doesn't seem to work either, there one needs
649 * to ALSO specify MS_BIND for the remount, otherwise the entire
650 * fs is remounted read-only or the mount fails because it's busy...
651 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
652 * 2.6.32...
368bbc02 653 */
f24a52d5 654 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
655 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
656 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
657 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
658 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 659 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
660 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
661 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
662 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
663 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
664 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
665 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
666 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
667 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
668 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
669 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
670 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
671 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 672 };
368bbc02 673
b06b8511
CS
674 for (i = 0; default_mounts[i].match_mask; i++) {
675 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
676 char *source = NULL;
677 char *destination = NULL;
678 int saved_errno;
e2a7e8dc 679 unsigned long mflags;
b06b8511
CS
680
681 if (default_mounts[i].source) {
682 /* will act like strdup if %r is not present */
8ede5f4c 683 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
684 if (!source) {
685 SYSERROR("memory allocation error");
686 return -1;
687 }
688 }
cc4fd506
SH
689 if (!default_mounts[i].destination) {
690 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 691 free(source);
cc4fd506
SH
692 return -1;
693 }
694 /* will act like strdup if %r is not present */
695 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
696 if (!destination) {
697 saved_errno = errno;
698 SYSERROR("memory allocation error");
699 free(source);
700 errno = saved_errno;
701 return -1;
b06b8511 702 }
e2a7e8dc
SH
703 mflags = add_required_remount_flags(source, destination,
704 default_mounts[i].flags);
592fd47a 705 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 706 saved_errno = errno;
b88ff9a0
SG
707 if (r < 0 && errno == ENOENT) {
708 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
709 r = 0;
710 }
711 else if (r < 0)
e2a7e8dc 712 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 713
b06b8511
CS
714 free(source);
715 free(destination);
716 if (r < 0) {
b06b8511
CS
717 errno = saved_errno;
718 return -1;
719 }
368bbc02 720 }
368bbc02
CS
721 }
722
b06b8511 723 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
724 int cg_flags;
725
726 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
727 /* If the type of cgroup mount was not specified, it depends on the
728 * container's capabilities as to what makes sense: if we have
729 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
730 * anyway, so we may as well default to read-write; then the admin
731 * will not be given a false sense of security. (And if they really
732 * want mixed r/o r/w, then they can explicitly specify :mixed.)
733 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
734 * :mixed, because then the container can't remount it read-write. */
735 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
736 int has_sys_admin = 0;
b0ee5983
CB
737
738 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 739 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 740 else
0769b82a 741 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
742
743 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 744 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 745 else
0769b82a 746 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
747 }
748
8ede5f4c 749 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 750 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 751 return -1;
368bbc02
CS
752 }
753 }
754
368bbc02 755 return 0;
368bbc02
CS
756}
757
/*
 * Set the hostname to the nodename stored in @utsname.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 if
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);

	return 0;
}
772
69aa6655
DE
773struct dev_symlinks {
774 const char *oldpath;
775 const char *name;
776};
777
778static const struct dev_symlinks dev_symlinks[] = {
779 {"/proc/self/fd", "fd"},
780 {"/proc/self/fd/0", "stdin"},
781 {"/proc/self/fd/1", "stdout"},
782 {"/proc/self/fd/2", "stderr"},
783};
784
785static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
786{
787 char path[MAXPATHLEN];
788 int ret,i;
09227be2 789 struct stat s;
69aa6655
DE
790
791
792 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
793 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 794 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
795 if (ret < 0 || ret >= MAXPATHLEN)
796 return -1;
09227be2
MW
797
798 /*
799 * Stat the path first. If we don't get an error
800 * accept it as is and don't try to create it
801 */
802 if (!stat(path, &s)) {
803 continue;
804 }
805
69aa6655 806 ret = symlink(d->oldpath, path);
09227be2 807
69aa6655 808 if (ret && errno != EEXIST) {
09227be2
MW
809 if ( errno == EROFS ) {
810 WARN("Warning: Read Only file system while creating %s", path);
811 } else {
812 SYSERROR("Error creating %s", path);
813 return -1;
814 }
69aa6655
DE
815 }
816 }
817 return 0;
818}
819
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * If *pp is NULL a new string "container_ttys=<name>" is allocated; otherwise
 * " <name>" is appended to the existing string, which is grown with
 * realloc().
 *
 * Returns true on success, false on allocation failure (in which case *pp is
 * left as it was).
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *list;

	if (*pp == NULL) {
		list = malloc(name_len + prefix_len + 1);
		if (!list)
			return false;

		memcpy(list, prefix, prefix_len);
		/* Copies the terminating NUL of name as well. */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);
	/* One extra byte for the separator, one for the terminating NUL. */
	list = realloc(*pp, cur_len + name_len + 2);
	if (!list)
		return false;

	list[cur_len] = ' ';
	memcpy(list + cur_len + 1, name, name_len + 1);
	*pp = list;
	return true;
}
840
2187efd3 841static int lxc_setup_ttys(struct lxc_conf *conf)
393903d1 842{
9e1045e3 843 int i, ret;
393903d1
SH
844 const struct lxc_tty_info *tty_info = &conf->tty_info;
845 char *ttydir = conf->ttydir;
7c6ef2a2 846 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 847
e8bd4e43 848 if (!conf->rootfs.path)
bc9bd0e3
DL
849 return 0;
850
b0a33c1e 851 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 852 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
853
e8bd4e43 854 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
73363c61 855 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 856 return -1;
9e1045e3 857
7c6ef2a2
SH
858 if (ttydir) {
859 /* create dev/lxc/tty%d" */
9e1045e3
CB
860 ret = snprintf(lxcpath, sizeof(lxcpath),
861 "/dev/%s/tty%d", ttydir, i + 1);
73363c61 862 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 863 return -1;
9e1045e3 864
7c6ef2a2 865 ret = creat(lxcpath, 0660);
9e1045e3 866 if (ret < 0 && errno != EEXIST) {
73363c61 867 SYSERROR("Failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
868 return -1;
869 }
4d44e274
SH
870 if (ret >= 0)
871 close(ret);
9e1045e3 872
7c6ef2a2 873 ret = unlink(path);
9e1045e3 874 if (ret < 0 && errno != ENOENT) {
73363c61 875 SYSERROR("Failed to unlink \"%s\"", path);
7c6ef2a2
SH
876 return -1;
877 }
b0a33c1e 878
9e1045e3
CB
879 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
880 if (ret < 0) {
73363c61 881 WARN("Failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
882 pty_info->name, path);
883 continue;
884 }
9e1045e3
CB
885 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
886 path);
13954cce 887
9e1045e3
CB
888 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
889 ttydir, i + 1);
73363c61 890 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 891 return -1;
9e1045e3 892
7c6ef2a2 893 ret = symlink(lxcpath, path);
9e1045e3 894 if (ret < 0) {
73363c61 895 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
9e1045e3 896 path, lxcpath);
7c6ef2a2
SH
897 return -1;
898 }
899 } else {
9e1045e3
CB
900 /* If we populated /dev, then we need to create
901 * /dev/ttyN
902 */
903 ret = access(path, F_OK);
904 if (ret < 0) {
c6883f38 905 ret = creat(path, 0660);
9e1045e3 906 if (ret < 0) {
73363c61 907 SYSERROR("Failed to create \"%s\"", path);
c6883f38 908 /* this isn't fatal, continue */
025ed0f3 909 } else {
c6883f38 910 close(ret);
025ed0f3 911 }
c6883f38 912 }
9e1045e3
CB
913
914 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
915 if (ret < 0) {
73363c61 916 SYSERROR("Failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
917 continue;
918 }
9e1045e3 919
73363c61 920 DEBUG("Bind mounted \"%s\" onto \"%s\"", pty_info->name,
9e1045e3 921 path);
393903d1 922 }
9e1045e3 923
e8bd4e43 924 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
925 ERROR("Error setting up container_ttys string");
926 return -1;
b0a33c1e 927 }
928 }
929
73363c61 930 INFO("Finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 931 return 0;
932}
933
2187efd3
CB
934int lxc_allocate_ttys(const char *name, struct lxc_conf *conf)
935{
936 struct lxc_tty_info *tty_info = &conf->tty_info;
937 int i, ret;
938
939 /* no tty in the configuration */
940 if (!conf->tty)
941 return 0;
942
943 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
944 if (!tty_info->pty_info) {
945 SYSERROR("failed to allocate struct *pty_info");
946 return -ENOMEM;
947 }
948
949 for (i = 0; i < conf->tty; i++) {
950 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
951
952 process_lock();
953 ret = openpty(&pty_info->master, &pty_info->slave,
954 pty_info->name, NULL, NULL);
955 process_unlock();
956 if (ret) {
957 SYSERROR("failed to create pty device number %d", i);
958 tty_info->nbtty = i;
959 lxc_delete_tty(tty_info);
960 return -ENOTTY;
961 }
962
963 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
964 pty_info->name, pty_info->master, pty_info->slave);
965
966 /* Prevent leaking the file descriptors to the container */
967 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
968 if (ret < 0)
969 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
970 "pty device \"%s\": %s",
971 pty_info->master, pty_info->name, strerror(errno));
972
973 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
974 if (ret < 0)
975 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
976 "pty device \"%s\": %s",
977 pty_info->slave, pty_info->name, strerror(errno));
978
979 pty_info->busy = 0;
980 }
981
982 tty_info->nbtty = conf->tty;
983
984 INFO("finished allocating %d pts devices", conf->tty);
985 return 0;
986}
987
988void lxc_delete_tty(struct lxc_tty_info *tty_info)
989{
990 int i;
991
992 for (i = 0; i < tty_info->nbtty; i++) {
993 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
994
995 close(pty_info->master);
996 close(pty_info->slave);
997 }
998
999 free(tty_info->pty_info);
1000 tty_info->pty_info = NULL;
1001 tty_info->nbtty = 0;
1002}
1003
1004static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1005{
1006 int i;
1007 struct lxc_conf *conf = handler->conf;
1008 struct lxc_tty_info *tty_info = &conf->tty_info;
1009 int sock = handler->data_sock[0];
1010 int ret = -1;
1011
1012 if (!conf->tty)
1013 return 0;
1014
1015 for (i = 0; i < conf->tty; i++) {
1016 int ttyfds[2];
1017 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
1018
1019 ttyfds[0] = pty_info->master;
1020 ttyfds[1] = pty_info->slave;
1021
1022 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1023 if (ret < 0)
1024 break;
1025
1026 TRACE("Send pty \"%s\" with master fd %d and slave fd %d to "
1027 "parent", pty_info->name, pty_info->master, pty_info->slave);
1028 }
1029
1030 if (ret < 0)
1031 ERROR("Failed to send %d ttys to parent: %s", conf->tty,
1032 strerror(errno));
1033 else
1034 TRACE("Sent %d ttys to parent", conf->tty);
1035
1036 return ret;
1037}
1038
1039static int lxc_create_ttys(struct lxc_handler *handler)
1040{
1041 int ret = -1;
1042 struct lxc_conf *conf = handler->conf;
1043
1044 ret = lxc_allocate_ttys(handler->name, conf);
1045 if (ret < 0) {
1046 ERROR("Failed to allocate ttys");
1047 goto on_error;
1048 }
1049
1050 ret = lxc_send_ttys_to_parent(handler);
1051 if (ret < 0) {
1052 ERROR("Failed to send ttys to parent");
1053 goto on_error;
1054 }
1055
1056 if (!conf->is_execute) {
1057 ret = lxc_setup_ttys(conf);
1058 if (ret < 0) {
1059 ERROR("Failed to setup ttys");
1060 goto on_error;
1061 }
1062 }
1063
1064 if (conf->pty_names) {
1065 ret = setenv("container_ttys", conf->pty_names, 1);
1066 if (ret < 0)
1067 SYSERROR("Failed to set \"container_ttys=%s\"", conf->pty_names);
1068 }
1069
1070 ret = 0;
1071
1072on_error:
1073 lxc_delete_tty(&conf->tty_info);
1074
1075 return ret;
1076}
1077
59bb8698 1078static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1079{
2d489f9e 1080 int oldroot = -1, newroot = -1;
bf601689 1081
2d489f9e
SH
1082 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1083 if (oldroot < 0) {
1084 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
1085 return -1;
1086 }
2d489f9e
SH
1087 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1088 if (newroot < 0) {
1089 SYSERROR("Error opening new-/ for fchdir");
1090 goto fail;
c08556c6 1091 }
bf601689 1092
cc6f6dd7 1093 /* change into new root fs */
2d489f9e 1094 if (fchdir(newroot)) {
cc6f6dd7 1095 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 1096 goto fail;
cc6f6dd7
DL
1097 }
1098
cc6f6dd7 1099 /* pivot_root into our new root fs */
2d489f9e 1100 if (pivot_root(".", ".")) {
cc6f6dd7 1101 SYSERROR("pivot_root syscall failed");
2d489f9e 1102 goto fail;
bf601689 1103 }
cc6f6dd7 1104
2d489f9e
SH
1105 /*
1106 * at this point the old-root is mounted on top of our new-root
1107 * To unmounted it we must not be chdir'd into it, so escape back
1108 * to old-root
1109 */
1110 if (fchdir(oldroot) < 0) {
1111 SYSERROR("Error entering oldroot");
1112 goto fail;
1113 }
7981ea46 1114 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1115 SYSERROR("Error detaching old root");
1116 goto fail;
cc6f6dd7
DL
1117 }
1118
2d489f9e
SH
1119 if (fchdir(newroot) < 0) {
1120 SYSERROR("Error re-entering newroot");
1121 goto fail;
1122 }
cc6f6dd7 1123
2d489f9e
SH
1124 close(oldroot);
1125 close(newroot);
bf601689 1126
2d489f9e 1127 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1128
bf601689 1129 return 0;
2d489f9e
SH
1130
1131fail:
1132 if (oldroot != -1)
1133 close(oldroot);
1134 if (newroot != -1)
1135 close(newroot);
1136 return -1;
bf601689
MH
1137}
1138
7133b912
CB
1139/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1140 * error, log it but don't fail yet.
91c3830e 1141 */
7133b912
CB
1142static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1143 const char *lxcpath)
91c3830e
SH
1144{
1145 int ret;
87da4ec3
SH
1146 size_t clen;
1147 char *path;
91c3830e 1148
7133b912 1149 INFO("Preparing \"/dev\"");
bc6928ff 1150
14221cbb 1151 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1152 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1153 path = alloca(clen);
bc6928ff 1154
ec50007f 1155 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1156 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1157 return -1;
bc6928ff 1158
87da4ec3 1159 if (!dir_exists(path)) {
7133b912
CB
1160 WARN("\"/dev\" directory does not exist. Proceeding without "
1161 "autodev being set up");
87da4ec3 1162 return 0;
bc6928ff 1163 }
87da4ec3 1164
1ec0e8e3 1165 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1166 rootfs->path ? rootfs->mount : NULL);
1167 if (ret < 0) {
1168 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1ec0e8e3 1169 return -1;
91c3830e 1170 }
7133b912 1171 INFO("Mounted tmpfs on \"%s\"", path);
87da4ec3 1172
ec50007f 1173 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
7133b912 1174 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1175 return -1;
87da4ec3 1176
7133b912 1177 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1178 * If not, then create it and exit if that fails...
1179 */
87da4ec3 1180 if (!dir_exists(path)) {
bc6928ff 1181 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
7133b912
CB
1182 if (ret < 0) {
1183 SYSERROR("Failed to create directory \"%s\"", path);
bc6928ff
MW
1184 return -1;
1185 }
91c3830e
SH
1186 }
1187
7133b912 1188 INFO("Prepared \"/dev\"");
91c3830e
SH
1189 return 0;
1190}
1191
/* One device node to be provided in the container's "/dev". */
struct lxc_devs {
	const char *name;	/* file name below "/dev" */
	mode_t mode;		/* file type and permission bits */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* Character devices that are created when "/dev" is populated. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1207
27245ff7 1208static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1209{
1210 int ret;
c6883f38
SH
1211 char path[MAXPATHLEN];
1212 int i;
3a32201c 1213 mode_t cmask;
c6883f38 1214
3999be0a
CB
1215 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1216 rootfs->path ? rootfs->mount : "");
1217 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1218 return -1;
91c3830e 1219
0bbf8572
CB
1220 /* ignore, just don't try to fill in */
1221 if (!dir_exists(path))
9cb4d183
SH
1222 return 0;
1223
3999be0a
CB
1224 INFO("Populating \"/dev\"");
1225
3a32201c 1226 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1227 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1228 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1229
3999be0a
CB
1230 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1231 rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1232 if (ret < 0 || ret >= MAXPATHLEN)
1233 return -1;
0bbf8572 1234
c6883f38 1235 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1236 if (ret < 0) {
9cb4d183 1237 FILE *pathfile;
3999be0a 1238 char hostpath[MAXPATHLEN];
9cb4d183 1239
0bbf8572
CB
1240 if (errno == EEXIST) {
1241 DEBUG("\"%s\" device already existed", path);
1242 continue;
1243 }
1244
1245 /* Unprivileged containers cannot create devices, so
1246 * bind mount the device from the host.
1247 */
9cb4d183
SH
1248 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1249 if (ret < 0 || ret >= MAXPATHLEN)
1250 return -1;
3999be0a 1251
9cb4d183
SH
1252 pathfile = fopen(path, "wb");
1253 if (!pathfile) {
3999be0a 1254 SYSERROR("Failed to create file \"%s\"", path);
9cb4d183
SH
1255 return -1;
1256 }
1257 fclose(pathfile);
3999be0a
CB
1258
1259 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1260 rootfs->path ? rootfs->mount : NULL);
1261 if (ret < 0) {
1262 SYSERROR("Failed to bind mount \"%s\" from "
1263 "host into container",
1264 d->name);
9cb4d183
SH
1265 return -1;
1266 }
3999be0a
CB
1267 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1268 path);
0bbf8572 1269 } else {
3999be0a 1270 DEBUG("Created device node \"%s\"", path);
c6883f38
SH
1271 }
1272 }
3a32201c 1273 umask(cmask);
c6883f38 1274
3999be0a 1275 INFO("Populated \"/dev\"");
c6883f38
SH
1276 return 0;
1277}
1278
9aa76a17 1279static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1280{
9aa76a17 1281 int ret;
10bc1861 1282 struct lxc_storage *bdev;
91c3e281 1283 const struct lxc_rootfs *rootfs;
cc28d0b0 1284
91c3e281 1285 rootfs = &conf->rootfs;
a0f379bf 1286 if (!rootfs->path) {
91c3e281
CB
1287 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1288 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1289 return -1;
1290 }
c69bd12f 1291 return 0;
a0f379bf 1292 }
0ad19a3f 1293
12297168 1294 if (access(rootfs->mount, F_OK)) {
91c3e281 1295 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1296 rootfs->mount);
b1789442
DL
1297 return -1;
1298 }
1299
10bc1861 1300 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1301 if (!bdev) {
1302 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1303 rootfs->path, rootfs->mount,
1304 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1305 return -1;
9be53773 1306 }
9aa76a17
CB
1307
1308 ret = bdev->ops->mount(bdev);
10bc1861 1309 storage_put(bdev);
9aa76a17 1310 if (ret < 0) {
91c3e281
CB
1311 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1312 rootfs->path, rootfs->mount,
1313 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1314 return -1;
1315 }
0ad19a3f 1316
91c3e281
CB
1317 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1318 rootfs->path, rootfs->mount,
1319 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1320
ac778708
DL
1321 return 0;
1322}
1323
91e93c71
AV
1324int prepare_ramfs_root(char *root)
1325{
eab15c1e 1326 char buf[LXC_LINELEN], *p;
91e93c71
AV
1327 char nroot[PATH_MAX];
1328 FILE *f;
1329 int i;
1330 char *p2;
1331
1332 if (realpath(root, nroot) == NULL)
39c7b795 1333 return -errno;
91e93c71
AV
1334
1335 if (chdir("/") == -1)
39c7b795 1336 return -errno;
91e93c71
AV
1337
1338 /*
1339 * We could use here MS_MOVE, but in userns this mount is
1340 * locked and can't be moved.
1341 */
39c7b795 1342 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1343 SYSERROR("Failed to move %s into /", root);
39c7b795 1344 return -errno;
91e93c71
AV
1345 }
1346
39c7b795 1347 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1348 SYSERROR("Failed to make . rprivate");
39c7b795 1349 return -errno;
91e93c71
AV
1350 }
1351
1352 /*
1353 * The following code cleans up inhereted mounts which are not
1354 * required for CT.
1355 *
1356 * The mountinfo file shows not all mounts, if a few points have been
1357 * unmounted between read operations from the mountinfo. So we need to
1358 * read mountinfo a few times.
1359 *
1360 * This loop can be skipped if a container uses unserns, because all
1361 * inherited mounts are locked and we should live with all this trash.
1362 */
1363 while (1) {
1364 int progress = 0;
1365
1366 f = fopen("./proc/self/mountinfo", "r");
1367 if (!f) {
1368 SYSERROR("Unable to open /proc/self/mountinfo");
1369 return -1;
1370 }
eab15c1e 1371 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1372 for (p = buf, i=0; p && i < 4; i++)
1373 p = strchr(p+1, ' ');
1374 if (!p)
1375 continue;
1376 p2 = strchr(p+1, ' ');
1377 if (!p2)
1378 continue;
1379
1380 *p2 = '\0';
1381 *p = '.';
1382
1383 if (strcmp(p + 1, "/") == 0)
1384 continue;
1385 if (strcmp(p + 1, "/proc") == 0)
1386 continue;
1387
1388 if (umount2(p, MNT_DETACH) == 0)
1389 progress++;
1390 }
1391 fclose(f);
1392 if (!progress)
1393 break;
1394 }
1395
8bea9fae
PR
1396 /* This also can be skipped if a container uses unserns */
1397 umount2("./proc", MNT_DETACH);
91e93c71
AV
1398
1399 /* It is weird, but chdir("..") moves us in a new root */
1400 if (chdir("..") == -1) {
1401 SYSERROR("Unable to change working directory");
1402 return -1;
1403 }
1404
1405 if (chroot(".") == -1) {
1406 SYSERROR("Unable to chroot");
1407 return -1;
1408 }
1409
1410 return 0;
1411}
1412
74a3920a 1413static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1414{
39c7b795
CB
1415 if (!rootfs->path) {
1416 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1417 return 0;
39c7b795 1418 }
ac778708 1419
91e93c71 1420 if (detect_ramfs_rootfs()) {
39c7b795
CB
1421 DEBUG("detected that container is on ramfs");
1422 if (prepare_ramfs_root(rootfs->mount)) {
1423 ERROR("failed to prepare minimal ramfs root");
91e93c71 1424 return -1;
39c7b795
CB
1425 }
1426
1427 DEBUG("prepared ramfs root for container");
1428 return 0;
1429 }
1430
1431 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1432 ERROR("failed to pivot root");
25368b52 1433 return -1;
c69bd12f
DL
1434 }
1435
39c7b795 1436 DEBUG("finished pivot root");
25368b52 1437 return 0;
0ad19a3f 1438}
1439
/* Mount a new devpts instance limited to @num_pts ptys on "/dev/pts" and
 * make "/dev/ptmx" refer to its ptmx node.
 *
 * "/dev/ptmx" is preferably a bind mount of "/dev/pts/ptmx"; when the bind
 * mount fails a symlink is created instead. Nothing is done when @num_pts
 * is 0.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *default_devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char devpts_mntopts[256];
	int len, fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	len = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		       default_devpts_mntopts, num_pts);
	if (len < 0 || (size_t)len >= sizeof(devpts_mntopts))
		return -1;

	/* Unmount old devpts instance. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create mountpoint for devpts instance. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount new devpts instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Remove any pre-existing /dev/ptmx file. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Fall through and try to create a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* Remove the dummy /dev/ptmx file we created above. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1530
/* Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build lacks
 * <sys/personality.h> support the function does nothing.
 *
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
	#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
	#endif

	return 0;
}
1547
3d7d929a
CB
1548static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1549 const struct lxc_console *console)
6e590161 1550{
63376d7d 1551 char path[MAXPATHLEN];
0728ebf4 1552 int ret, fd;
52e35957 1553
8b1b1210
CB
1554 if (console->path && !strcmp(console->path, "none"))
1555 return 0;
1556
7c6ef2a2 1557 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1558 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1559 return -1;
52e35957 1560
8b1b1210
CB
1561 /* When we are asked to setup a console we remove any previous
1562 * /dev/console bind-mounts.
1563 */
a7ba3c7f
CB
1564 if (file_exists(path)) {
1565 ret = lxc_unstack_mountpoint(path, false);
1566 if (ret < 0) {
8b1b1210 1567 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1568 return -ret;
1569 } else {
1570 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1571 }
953fe44f 1572
a7ba3c7f
CB
1573 ret = unlink(path);
1574 if (ret < 0) {
1575 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1576 return -errno;
1577 }
8b1b1210
CB
1578 }
1579
1580 /* For unprivileged containers autodev or automounts will already have
1581 * taken care of creating /dev/console.
1582 */
0728ebf4
TA
1583 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1584 if (fd < 0) {
1585 if (errno != EEXIST) {
1586 SYSERROR("failed to create console");
3d7d929a 1587 return -errno;
0728ebf4
TA
1588 }
1589 } else {
1590 close(fd);
52e35957
DL
1591 }
1592
0728ebf4 1593 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1594 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1595 return -errno;
63376d7d 1596 }
13954cce 1597
3d7d929a 1598 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1599 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1600 return -1;
1601 }
1602
3d7d929a 1603 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1604 return 0;
1605}
1606
3d7d929a
CB
1607static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1608 const struct lxc_console *console,
1609 char *ttydir)
7c6ef2a2 1610{
7c6ef2a2 1611 int ret;
3d7d929a 1612 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1613
1614 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1615 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1616 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1617 return -1;
3d7d929a 1618
7c6ef2a2
SH
1619 ret = mkdir(path, 0755);
1620 if (ret && errno != EEXIST) {
959aee9c 1621 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1622 return -errno;
7c6ef2a2 1623 }
4742cd9a 1624 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1625
3d7d929a
CB
1626 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1627 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1628 return -1;
1629
7c6ef2a2 1630 ret = creat(lxcpath, 0660);
3d7d929a 1631 if (ret == -1 && errno != EEXIST) {
959aee9c 1632 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1633 return -errno;
7c6ef2a2 1634 }
4d44e274
SH
1635 if (ret >= 0)
1636 close(ret);
7c6ef2a2 1637
2a12fefd
CB
1638 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1639 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1640 return -1;
2a12fefd
CB
1641
1642 /* When we are asked to setup a console we remove any previous
1643 * /dev/console bind-mounts.
1644 */
1645 if (console->path && !strcmp(console->path, "none")) {
1646 struct stat st;
1647 ret = stat(path, &st);
1648 if (ret < 0) {
1649 if (errno == ENOENT)
1650 return 0;
1651 SYSERROR("failed stat() \"%s\"", path);
1652 return -errno;
1653 }
1654
1655 /* /dev/console must be character device with major number 5 and
1656 * minor number 1. If not, give benefit of the doubt and assume
1657 * the user has mounted something else right there on purpose.
1658 */
1659 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1660 return 0;
1661
1662 /* In case the user requested a bind-mount for /dev/console and
1663 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1664 * /dev/<ttydir/console.
1665 * Note, we only move the uppermost mount and clear all other
1666 * mounts underneath for safety.
1667 * If it is a character device created via mknod() we simply
1668 * rename it.
2a12fefd
CB
1669 */
1670 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1671 if (ret < 0) {
1672 if (errno != EINVAL) {
1673 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1674 return -errno;
1675 }
1676 /* path was not a mountpoint */
1677 ret = rename(path, lxcpath);
1678 if (ret < 0) {
1679 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1680 return -errno;
1681 }
1682 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1683 } else {
1684 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1685 }
a7ba3c7f
CB
1686
1687 /* Clear all remaining bind-mounts. */
1688 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1689 if (ret < 0) {
a7ba3c7f
CB
1690 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1691 return -ret;
1692 } else {
1693 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1694 }
1695 } else {
1696 if (file_exists(path)) {
1697 ret = lxc_unstack_mountpoint(path, false);
1698 if (ret < 0) {
2a12fefd 1699 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1700 return -ret;
1701 } else {
1702 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1703 }
2a12fefd
CB
1704 }
1705
1706 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1707 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1708 return -1;
1709 }
1710 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1711 }
1712
2a12fefd 1713 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1714 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1715 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1716 return -1;
3d7d929a 1717
2a12fefd
CB
1718 ret = unlink(path);
1719 if (ret && errno != ENOENT) {
1720 SYSERROR("error unlinking %s", path);
1721 return -errno;
1722 }
1723
7c6ef2a2 1724 ret = symlink(lxcpath, path);
3d7d929a
CB
1725 if (ret < 0) {
1726 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1727 return -1;
1728 }
1729
3d7d929a 1730 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1731 return 0;
1732}
1733
3d7d929a
CB
1734static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1735 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1736{
3d7d929a
CB
1737 /* We don't have a rootfs, /dev/console will be shared. */
1738 if (!rootfs->path) {
1739 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1740 return 0;
3d7d929a
CB
1741 }
1742
7c6ef2a2 1743 if (!ttydir)
3d7d929a 1744 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1745
3d7d929a 1746 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1747}
1748
998ac676
RT
1749static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1750{
1751 struct mount_opt *mo;
1752
1753 /* If opt is found in mount_opt, set or clear flags.
1754 * Otherwise append it to data. */
1755
1756 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1757 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1758 if (mo->clear)
1759 *flags &= ~mo->flag;
1760 else
1761 *flags |= mo->flag;
1762 return;
1763 }
1764 }
1765
1766 if (strlen(*data))
1767 strcat(*data, ",");
1768 strcat(*data, opt);
1769}
1770
/* Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * On success *@mntflags holds the accumulated flags and *@mntdata either a
 * newly allocated string with the remaining options (to be freed by the
 * caller) or NULL when there are none. A NULL @mntopts yields flags 0 and
 * NULL data.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *tok;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the option string. */
	extra = malloc(strlen(copy) + 1);
	if (extra == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] == '\0')
		free(extra);
	else
		*mntdata = extra;

	free(copy);

	return 0;
}
1809
/* Terminate @word in place at its first space or tab. A string without
 * either is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1816
/*
 * Skip @nfields whitespace-terminated fields in @src and return a pointer to
 * what follows. Exactly one separator (space or tab) is consumed per field.
 * If the string ends first, a pointer to its terminating NUL is returned.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining;

	for (remaining = nfields; remaining > 0; remaining--) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1834
/* Mount @fsname on @target and, for bind or remount requests, follow up
 * with a remount so that the requested flags take effect.
 *
 * When statvfs() support is compiled in, the nosuid/nodev/ro/noexec flags
 * reported for @fsname are carried over into the remount (nodev only when
 * @dev is 0). A plain bind mount that needs no additional flags skips the
 * remount.
 *
 * A failed mount is reported as success (0) when @optional is set.
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	unsigned long rdonly_req;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
		       rootfs) < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	/* Only bind and remount requests need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto mounted;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rdonly_req = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long inherit = MS_NOSUID | MS_RDONLY | MS_NOEXEC;
		unsigned long required_flags;

		/* nodev is only inherited if the entry did not ask for "dev". */
		if (!dev)
			inherit |= MS_NODEV;

		required_flags = rdonly_req | (sb.f_flag & inherit);

		DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
		      fsname, sb.f_flag, required_flags);

		/* If this was a bind mount request, and required_flags does
		 * not have any flags which are not already in mountflags,
		 * then skip the remount.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rdonly_req == 0) {
			DEBUG("Mountflags already were %lu, skipping remount",
			      mountflags);
			goto mounted;
		}

		mountflags |= required_flags;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

mounted:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1925
/* Strip the first occurrence of each of "create=dir", "create=file" and
 * "optional" from mntent->mnt_opts, editing the string in place.
 *
 * An option followed by a comma is removed together with that comma; an
 * option at the end of the string is simply cut off, which leaves any
 * preceding comma behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = { "create=dir", "create=file", "optional" };
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* Shift the tail (including its NUL) over the option. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1949
/* Create the mount target @path if the mount entry asks for it.
 *
 * For "overlay" and "aufs" entries the filesystem specific mkdir helper is
 * run first. "create=dir" creates @path as a directory (with parents);
 * "create=file" creates the parent directories and an empty file at @path
 * unless @path already exists.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;

	if (!strncmp(mntent->mnt_type, "overlay", 7))
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (!strncmp(mntent->mnt_type, "aufs", 4))
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		int fd;
		char *copy, *parent;

		/* dirname() may modify its argument, so work on a copy. */
		copy = strdup(path);
		if (!copy)
			return -1;

		parent = dirname(copy);

		ret = mkdir_p(parent, 0755);
		if (ret < 0 && errno != EEXIST) {
			/* Inspect errno and report the directory that failed
			 * before the copy is released: free() is not
			 * guaranteed to preserve errno everywhere, and
			 * "parent" may point into "copy".
			 */
			SYSERROR("Failed to create directory \"%s\"", parent);
			free(copy);
			return -1;
		}
		free(copy);

		fd = open(path, O_CREAT, 0644);
		if (fd < 0) {
			SYSERROR("Failed to create file \"%s\"", path);
			return -1;
		}
		close(fd);
	}

	return 0;
}
1998
ec50007f
CB
1999/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2000 * without a rootfs. */
db4aba38 2001static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
2002 const char *path,
2003 const struct lxc_rootfs *rootfs,
2004 const char *lxc_name,
2005 const char *lxc_path)
4d5b72a1 2006{
d8b712bc 2007 int ret;
4d5b72a1
NC
2008 unsigned long mntflags;
2009 char *mntdata;
d8b712bc 2010 bool dev, optional;
ec50007f 2011 char *rootfs_path = NULL;
d8b712bc
CB
2012
2013 optional = hasmntopt(mntent, "optional") != NULL;
2014 dev = hasmntopt(mntent, "dev") != NULL;
2015
ec50007f
CB
2016 if (rootfs && rootfs->path)
2017 rootfs_path = rootfs->mount;
2018
d8b712bc
CB
2019 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2020 lxc_path);
2021 if (ret < 0) {
2022 if (optional)
2023 return 0;
608e3567 2024
d8b712bc
CB
2025 return -1;
2026 }
4e4ca161
SH
2027 cull_mntent_opt(mntent);
2028
d8b712bc
CB
2029 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2030 if (ret < 0)
a17b1e65 2031 return -1;
a17b1e65 2032
6e46cc0d 2033 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 2034 mntdata, optional, dev, rootfs_path);
68c152ef 2035
911324ef 2036 free(mntdata);
911324ef
DL
2037 return ret;
2038}
2039
/* Mount an entry for a container that has no rootfs.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to
 * the mount directory when it is missing.
 *
 * Returns -1 if the path does not fit, otherwise the result of
 * mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	int len;

	if (mntent->mnt_dir[0] == '/')
		len = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
	else
		len = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);

	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2057
4e4ca161 2058static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2059 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2060 const char *lxc_name,
2061 const char *lxc_path)
911324ef 2062{
bdd2b34c 2063 int offset;
013bd428 2064 char *aux;
67e571de 2065 const char *lxcpath;
bdd2b34c
CB
2066 char path[MAXPATHLEN];
2067 int ret = 0;
0ad19a3f 2068
593e8478 2069 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 2070 if (!lxcpath)
2a59a681 2071 return -1;
2a59a681 2072
bdd2b34c
CB
2073 /* If rootfs->path is a blockdev path, allow container fstab to use
2074 * <lxcpath>/<name>/rootfs" as the target prefix.
2075 */
2076 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2077 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
2078 goto skipvarlib;
2079
2080 aux = strstr(mntent->mnt_dir, path);
2081 if (aux) {
2082 offset = strlen(path);
2083 goto skipabs;
2084 }
2085
2086skipvarlib:
013bd428
DL
2087 aux = strstr(mntent->mnt_dir, rootfs->path);
2088 if (!aux) {
bdd2b34c 2089 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 2090 return ret;
013bd428 2091 }
80a881b2
SH
2092 offset = strlen(rootfs->path);
2093
2094skipabs:
bdd2b34c
CB
2095 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2096 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 2097 return -1;
a17b1e65 2098
0a2dddd4 2099 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2100}
d330fe7b 2101
4e4ca161 2102static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2103 const struct lxc_rootfs *rootfs,
2104 const char *lxc_name,
2105 const char *lxc_path)
911324ef
DL
2106{
2107 char path[MAXPATHLEN];
911324ef 2108 int ret;
d330fe7b 2109
34cfffb3 2110 /* relative to root mount point */
6e46cc0d 2111 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2112 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2113 ERROR("path name too long");
2114 return -1;
2115 }
911324ef 2116
0a2dddd4 2117 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2118}
2119
06749971
CB
2120/* This logs a NOTICE() when a user specifies mounts that would conflict with
2121 * devices liblxc sets up automatically.
2122 */
2123static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2124 const char *dest)
2125{
2126 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2127 bool needs_warning = false;
2128
2129 clean_mnt_fsname = lxc_deslashify(src);
2130 if (!clean_mnt_fsname)
2131 return;
2132
2133 clean_mnt_dir = lxc_deslashify(dest);
2134 if (!clean_mnt_dir) {
2135 free(clean_mnt_fsname);
2136 return;
2137 }
2138
2139 tmp = clean_mnt_dir;
2140 if (*tmp == '/')
2141 tmp++;
2142
2143 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2144 free(clean_mnt_dir);
2145 free(clean_mnt_fsname);
2146 return;
2147 }
2148
2149 if (!conf->autodev && !conf->pts && !conf->tty &&
2150 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2151 free(clean_mnt_dir);
2152 free(clean_mnt_fsname);
2153 return;
2154 }
2155
2156 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2157 needs_warning = true;
2158 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2159 needs_warning = true;
2160 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2161 needs_warning = true;
2162 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2163 needs_warning = true;
2164 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2165 needs_warning = true;
2166 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2167 needs_warning = true;
2168 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2169 needs_warning = true;
2170 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2171 needs_warning = true;
2172 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2173 needs_warning = true;
2174 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2175 needs_warning = true;
2176 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2177 needs_warning = true;
2178
2179 if (needs_warning)
2180 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2181 "automatic device setup under \"/dev\"",
2182 clean_mnt_fsname, clean_mnt_dir);
2183
2184 free(clean_mnt_dir);
2185 free(clean_mnt_fsname);
2186}
2187
2188static int mount_file_entries(const struct lxc_conf *conf,
2189 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2190 const char *lxc_name, const char *lxc_path)
911324ef 2191{
aaf901be
AM
2192 struct mntent mntent;
2193 char buf[4096];
911324ef 2194 int ret = -1;
e76b8764 2195
aaf901be 2196 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
06749971
CB
2197 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2198
1ae3c19f
CB
2199 if (!rootfs->path)
2200 ret = mount_entry_on_systemfs(&mntent);
2201 else if (mntent.mnt_dir[0] != '/')
2202 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2203 lxc_name, lxc_path);
2204 else
2205 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2206 lxc_name, lxc_path);
2207 if (ret < 0)
2208 return -1;
0ad19a3f 2209 }
2210 ret = 0;
cd54d859 2211
1ae3c19f 2212 INFO("Set up mount entries");
e7938e9e
MN
2213 return ret;
2214}
2215
06749971
CB
/* Mount all entries listed in the fstab-style file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success, -1 if
 * the file cannot be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fp;
	int rc;

	if (fstab == NULL)
		return 0;

	fp = setmntent(fstab, "r");
	if (fp == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, fp, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fp);
	return rc;
}
2239
5ef5c9a3 2240FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2241{
5ef5c9a3 2242 int ret;
e7938e9e 2243 char *mount_entry;
5ef5c9a3 2244 struct lxc_list *iterator;
6bd04140 2245 FILE *f;
5ef5c9a3
CB
2246 int fd = -1;
2247
2248 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2249 if (fd < 0) {
2250 if (errno != ENOSYS)
2251 return NULL;
6bd04140
CB
2252 f = tmpfile();
2253 TRACE("Created temporary mount file");
5ef5c9a3 2254 } else {
6bd04140
CB
2255 f = fdopen(fd, "r+");
2256 TRACE("Created anonymous mount file");
5ef5c9a3 2257 }
e7938e9e 2258
6bd04140
CB
2259 if (!f) {
2260 SYSERROR("Could not create mount file");
5ef5c9a3
CB
2261 if (fd != -1)
2262 close(fd);
9fc7f8c0 2263 return NULL;
e7938e9e
MN
2264 }
2265
2266 lxc_list_for_each(iterator, mount) {
2267 mount_entry = iterator->elem;
6bd04140 2268 ret = fprintf(f, "%s\n", mount_entry);
5ef5c9a3 2269 if (ret < strlen(mount_entry))
6bd04140 2270 WARN("Could not write mount entry to mount file");
5ef5c9a3
CB
2271 }
2272
6bd04140
CB
2273 ret = fseek(f, 0, SEEK_SET);
2274 if (ret < 0) {
2275 SYSERROR("Failed to seek mount file");
2276 fclose(f);
5ef5c9a3 2277 return NULL;
e7938e9e
MN
2278 }
2279
6bd04140 2280 return f;
9fc7f8c0
TA
2281}
2282
06749971
CB
/* Mount all entries of the in-memory @mount list by first dumping them into
 * an anonymous mount file and then processing that file.
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *fp = make_anonymous_mount_file(mount);

	if (fp == NULL)
		return -1;

	rc = mount_file_entries(conf, rootfs, fp, lxc_name, lxc_path);
	fclose(fp);

	return rc;
}
2300
bab88e68
CS
2301static int parse_cap(const char *cap)
2302{
2303 char *ptr = NULL;
84760c11 2304 size_t i;
2305 int capid = -1;
bab88e68 2306
7035407c
DE
2307 if (!strcmp(cap, "none"))
2308 return -2;
2309
bab88e68
CS
2310 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2311
2312 if (strcmp(cap, caps_opt[i].name))
2313 continue;
2314
2315 capid = caps_opt[i].value;
2316 break;
2317 }
2318
2319 if (capid < 0) {
2320 /* try to see if it's numeric, so the user may specify
2321 * capabilities that the running kernel knows about but
2322 * we don't */
2323 errno = 0;
2324 capid = strtol(cap, &ptr, 10);
2325 if (!ptr || *ptr != '\0' || errno != 0)
2326 /* not a valid number */
2327 capid = -1;
2328 else if (capid > lxc_caps_last_cap())
2329 /* we have a number but it's not a valid
2330 * capability */
2331 capid = -1;
2332 }
2333
2334 return capid;
2335}
2336
0769b82a
CS
2337int in_caplist(int cap, struct lxc_list *caps)
2338{
2339 struct lxc_list *iterator;
2340 int capid;
2341
2342 lxc_list_for_each(iterator, caps) {
2343 capid = parse_cap(iterator->elem);
2344 if (capid == cap)
2345 return 1;
2346 }
2347
2348 return 0;
2349}
2350
81810dd1
DL
2351static int setup_caps(struct lxc_list *caps)
2352{
2353 struct lxc_list *iterator;
2354 char *drop_entry;
bab88e68 2355 int capid;
81810dd1
DL
2356
2357 lxc_list_for_each(iterator, caps) {
2358
2359 drop_entry = iterator->elem;
2360
bab88e68 2361 capid = parse_cap(drop_entry);
d55bc1ad 2362
81810dd1 2363 if (capid < 0) {
1e11be34
DL
2364 ERROR("unknown capability %s", drop_entry);
2365 return -1;
81810dd1
DL
2366 }
2367
2368 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2369
2370 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2371 SYSERROR("failed to remove %s capability", drop_entry);
2372 return -1;
2373 }
81810dd1
DL
2374
2375 }
2376
1fb86a7c
SH
2377 DEBUG("capabilities have been setup");
2378
2379 return 0;
2380}
2381
2382static int dropcaps_except(struct lxc_list *caps)
2383{
2384 struct lxc_list *iterator;
2385 char *keep_entry;
1fb86a7c
SH
2386 int i, capid;
2387 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2388 INFO("found %d capabilities", numcaps);
1fb86a7c 2389
2caf9a97
SH
2390 if (numcaps <= 0 || numcaps > 200)
2391 return -1;
2392
1a0e70ac 2393 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2394 int *caplist = alloca(numcaps * sizeof(int));
2395 memset(caplist, 0, numcaps * sizeof(int));
2396
2397 lxc_list_for_each(iterator, caps) {
2398
2399 keep_entry = iterator->elem;
2400
bab88e68 2401 capid = parse_cap(keep_entry);
1fb86a7c 2402
7035407c
DE
2403 if (capid == -2)
2404 continue;
2405
1fb86a7c
SH
2406 if (capid < 0) {
2407 ERROR("unknown capability %s", keep_entry);
2408 return -1;
2409 }
2410
8255688a 2411 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2412
2413 caplist[capid] = 1;
2414 }
2415 for (i=0; i<numcaps; i++) {
2416 if (caplist[i])
2417 continue;
2418 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2419 SYSERROR("failed to remove capability %d", i);
2420 return -1;
2421 }
1fb86a7c
SH
2422 }
2423
2424 DEBUG("capabilities have been setup");
81810dd1
DL
2425
2426 return 0;
2427}
2428
c6d09e15
WB
2429static int parse_resource(const char *res) {
2430 size_t i;
2431 int resid = -1;
2432
2433 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2434 if (strcmp(res, limit_opt[i].name) == 0)
2435 return limit_opt[i].value;
2436 }
2437
2438 /* try to see if it's numeric, so the user may specify
2439 * resources that the running kernel knows about but
2440 * we don't */
2441 if (lxc_safe_int(res, &resid) == 0)
2442 return resid;
2443 return -1;
2444}
2445
2446int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2447 struct lxc_list *it;
2448 struct lxc_limit *lim;
2449 int resid;
2450
2451 lxc_list_for_each(it, limits) {
2452 lim = it->elem;
2453
2454 resid = parse_resource(lim->resource);
2455 if (resid < 0) {
2456 ERROR("unknown resource %s", lim->resource);
2457 return -1;
2458 }
2459
2460 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2461 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2462 return -1;
2463 }
2464 }
2465 return 0;
2466}
2467
ae9242c8
SH
2468static char *default_rootfs_mount = LXCROOTFSMOUNT;
2469
7b379ab3 2470struct lxc_conf *lxc_conf_init(void)
089cd8b8 2471{
7b379ab3 2472 struct lxc_conf *new;
26ddeedd 2473 int i;
7b379ab3 2474
13277ec4 2475 new = malloc(sizeof(*new));
7b379ab3 2476 if (!new) {
13277ec4 2477 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2478 return NULL;
2479 }
2480 memset(new, 0, sizeof(*new));
2481
4b73005c 2482 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2483 new->personality = -1;
124fa0a8 2484 new->autodev = 1;
596a818d
DE
2485 new->console.log_path = NULL;
2486 new->console.log_fd = -1;
28a4b0e5 2487 new->console.path = NULL;
63376d7d 2488 new->console.peer = -1;
b5159817
DE
2489 new->console.peerpty.busy = -1;
2490 new->console.peerpty.master = -1;
2491 new->console.peerpty.slave = -1;
63376d7d
DL
2492 new->console.master = -1;
2493 new->console.slave = -1;
2494 new->console.name[0] = '\0';
d2e30e99 2495 new->maincmd_fd = -1;
76a26f55 2496 new->nbd_idx = -1;
54c30e29 2497 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2498 if (!new->rootfs.mount) {
13277ec4 2499 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2500 free(new);
2501 return NULL;
2502 }
858377e4 2503 new->logfd = -1;
7b379ab3
MN
2504 lxc_list_init(&new->cgroup);
2505 lxc_list_init(&new->network);
2506 lxc_list_init(&new->mount_list);
81810dd1 2507 lxc_list_init(&new->caps);
1fb86a7c 2508 lxc_list_init(&new->keepcaps);
f6d3e3e4 2509 lxc_list_init(&new->id_map);
f979ac15 2510 lxc_list_init(&new->includes);
4184c3e1 2511 lxc_list_init(&new->aliens);
7c661726 2512 lxc_list_init(&new->environment);
c6d09e15 2513 lxc_list_init(&new->limits);
26ddeedd
SH
2514 for (i=0; i<NUM_LXC_HOOKS; i++)
2515 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2516 lxc_list_init(&new->groups);
fe4de9a6
DE
2517 new->lsm_aa_profile = NULL;
2518 new->lsm_se_context = NULL;
5112cd70 2519 new->tmp_umount_proc = 0;
7b379ab3 2520
9f30a190
MM
2521 for (i = 0; i < LXC_NS_MAX; i++)
2522 new->inherit_ns_fd[i] = -1;
2523
72bb04e4
PT
2524 /* if running in a new user namespace, init and COMMAND
2525 * default to running as UID/GID 0 when using lxc-execute */
2526 new->init_uid = 0;
2527 new->init_gid = 0;
43654d34 2528 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
72bb04e4 2529
7b379ab3 2530 return new;
089cd8b8
DL
2531}
2532
251d0d2a
DE
2533static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2534 size_t buf_size)
f6d3e3e4 2535{
29053180
CB
2536 char path[MAXPATHLEN];
2537 int fd, ret;
f6d3e3e4 2538
29053180
CB
2539 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2540 idtype == ID_TYPE_UID ? 'u' : 'g');
2541 if (ret < 0 || ret >= MAXPATHLEN) {
2542 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
2543 return -E2BIG;
2544 }
29053180
CB
2545
2546 fd = open(path, O_WRONLY);
2547 if (fd < 0) {
2548 SYSERROR("failed to open \"%s\"", path);
2549 return -1;
f6d3e3e4 2550 }
29053180
CB
2551
2552 errno = 0;
2553 ret = lxc_write_nointr(fd, buf, buf_size);
2554 if (ret != buf_size) {
2555 SYSERROR("failed to write %cid mapping to \"%s\"",
2556 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2557 close(fd);
2558 return -1;
2559 }
2560 close(fd);
2561
2562 return 0;
f6d3e3e4
SH
2563}
2564
6e50e704
CB
2565/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2566 *
2567 * @return 1 if functional binary was found
2568 * @return 0 if binary exists but is lacking privilege
2569 * @return -ENOENT if binary does not exist
2570 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2571 *
2572 */
df6a2945
CB
2573static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2574{
2575 char *path;
2576 int ret;
2577 struct stat st;
2578 int fret = 0;
2579
6e50e704
CB
2580 if (cap != CAP_SETUID && cap != CAP_SETGID)
2581 return -EINVAL;
2582
df6a2945
CB
2583 path = on_path(binary, NULL);
2584 if (!path)
2585 return -ENOENT;
2586
2587 ret = stat(path, &st);
2588 if (ret < 0) {
2589 fret = -errno;
2590 goto cleanup;
2591 }
2592
2593 /* Check if the binary is setuid. */
2594 if (st.st_mode & S_ISUID) {
2595 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2596 fret = 1;
2597 goto cleanup;
2598 }
2599
69924fff 2600 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2601 /* Check if it has the CAP_SETUID capability. */
2602 if ((cap & CAP_SETUID) &&
2603 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2604 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2605 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2606 "and CAP_PERMITTED sets.", path);
2607 fret = 1;
2608 goto cleanup;
2609 }
2610
2611 /* Check if it has the CAP_SETGID capability. */
2612 if ((cap & CAP_SETGID) &&
2613 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2614 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2615 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2616 "and CAP_PERMITTED sets.", path);
2617 fret = 1;
2618 goto cleanup;
2619 }
d6018f88 2620 #else
69924fff
CB
2621 /* If we cannot check for file capabilities we need to give the benefit
2622 * of the doubt. Otherwise we might fail even though all the necessary
2623 * file capabilities are set.
2624 */
d6018f88
CB
2625 DEBUG("Cannot check for file capabilites as full capability support is "
2626 "missing. Manual intervention needed.");
2627 fret = 1;
df6a2945
CB
2628 #endif
2629
2630cleanup:
2631 free(path);
2632 return fret;
2633}
2634
986ef930
CB
/* Callback that runs the command line passed in @args through "/bin/sh -c".
 * It only returns (with -1) if the exec itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
2640
f6d3e3e4
SH
2641int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2642{
f6d3e3e4 2643 struct id_map *map;
4bc3b759 2644 struct lxc_list *iterator;
251d0d2a 2645 enum idtype type;
986ef930 2646 char u_or_g;
4bc3b759 2647 char *pos;
99d43365 2648 int fill, left;
986ef930
CB
2649 char cmd_output[MAXPATHLEN];
2650 /* strlen("new@idmap") = 9
2651 * +
2652 * strlen(" ") = 1
2653 * +
2654 * LXC_NUMSTRLEN64
2655 * +
2656 * strlen(" ") = 1
2657 *
2658 * We add some additional space to make sure that we really have
2659 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2660 */
2661 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2662 int ret = 0, uidmap = 0, gidmap = 0;
2663 bool use_shadow = false, had_entry = false;
df6a2945
CB
2664
2665 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2666 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2667 * will protected it by preventing another user from being handed the
2668 * range by shadow.
2669 */
df6a2945 2670 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2671 if (uidmap == -ENOENT)
2672 WARN("newuidmap binary is missing");
2673 else if (!uidmap)
2674 WARN("newuidmap is lacking necessary privileges");
2675
df6a2945 2676 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2677 if (gidmap == -ENOENT)
2678 WARN("newgidmap binary is missing");
2679 else if (!gidmap)
2680 WARN("newgidmap is lacking necessary privileges");
2681
df6a2945
CB
2682 if (uidmap > 0 && gidmap > 0) {
2683 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 2684 use_shadow = true;
df6a2945 2685 } else {
99d43365
CB
2686 /* In case unprivileged users run application containers via
2687 * execute() or a start*() there are valid cases where they may
2688 * only want to map their own {g,u}id. Let's not block them from
2689 * doing so by requiring geteuid() == 0.
2690 */
2691 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2692 "write directly with euid %d.", geteuid());
0e6e3a41 2693 }
251d0d2a 2694
986ef930
CB
2695 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2696 type++, u_or_g = 'g') {
2697 pos = mapbuf;
2698
0e6e3a41 2699 if (use_shadow)
986ef930 2700 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2701
cf3ef16d 2702 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
2703 /* The kernel only takes <= 4k for writes to
2704 * /proc/<nr>/[ug]id_map
2705 */
251d0d2a 2706 map = iterator->elem;
cf3ef16d
SH
2707 if (map->idtype != type)
2708 continue;
2709
4bc3b759
CB
2710 had_entry = true;
2711
986ef930 2712 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2713 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2714 use_shadow ? " " : "", map->nsid,
2715 map->hostid, map->range,
0e6e3a41 2716 use_shadow ? "" : "\n");
cf3ef16d 2717 if (fill <= 0 || fill >= left)
4bc3b759
CB
2718 SYSERROR("Too many {g,u}id mappings defined.");
2719
cf3ef16d 2720 pos += fill;
251d0d2a 2721 }
cf3ef16d 2722 if (!had_entry)
4f7521b4 2723 continue;
cf3ef16d 2724
986ef930
CB
2725 /* Try to catch the ouput of new{g,u}idmap to make debugging
2726 * easier.
2727 */
2728 if (use_shadow) {
2729 ret = run_command(cmd_output, sizeof(cmd_output),
2730 lxc_map_ids_exec_wrapper,
2731 (void *)mapbuf);
2732 if (ret < 0) {
54fbbeb5
CB
2733 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2734 u_or_g, cmd_output, mapbuf);
986ef930
CB
2735 return -1;
2736 }
54fbbeb5 2737 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
d1838f34 2738 } else {
986ef930 2739 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
54fbbeb5 2740 if (ret < 0) {
da0f9977 2741 ERROR("Failed to write mapping: %s", mapbuf);
986ef930 2742 return -1;
54fbbeb5
CB
2743 }
2744 TRACE("Wrote mapping \"%s\"", mapbuf);
d1838f34 2745 }
986ef930
CB
2746
2747 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2748 }
251d0d2a 2749
986ef930 2750 return 0;
f6d3e3e4
SH
2751}
2752
cf3ef16d 2753/*
7b50c609
TS
2754 * return the host uid/gid to which the container root is mapped in
2755 * *val.
0b3a6504 2756 * Return true if id was found, false otherwise.
cf3ef16d 2757 */
2a9a80cb 2758bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 2759 unsigned long *val)
cf3ef16d
SH
2760{
2761 struct lxc_list *it;
2762 struct id_map *map;
2763
2764 lxc_list_for_each(it, &conf->id_map) {
2765 map = it->elem;
7b50c609 2766 if (map->idtype != idtype)
cf3ef16d
SH
2767 continue;
2768 if (map->nsid != 0)
2769 continue;
2a9a80cb
SH
2770 *val = map->hostid;
2771 return true;
cf3ef16d 2772 }
2a9a80cb 2773 return false;
cf3ef16d
SH
2774}
2775
2133f58c 2776int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2777{
2778 struct lxc_list *it;
2779 struct id_map *map;
2780 lxc_list_for_each(it, &conf->id_map) {
2781 map = it->elem;
2133f58c 2782 if (map->idtype != idtype)
cf3ef16d
SH
2783 continue;
2784 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2785 return (id - map->hostid) + map->nsid;
cf3ef16d 2786 }
57d116ab 2787 return -1;
cf3ef16d
SH
2788}
2789
339efad9 2790int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2791{
2792 struct lxc_list *it;
2793 struct id_map *map;
2133f58c 2794 unsigned int freeid = 0;
cf3ef16d
SH
2795again:
2796 lxc_list_for_each(it, &conf->id_map) {
2797 map = it->elem;
2133f58c 2798 if (map->idtype != idtype)
cf3ef16d
SH
2799 continue;
2800 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2801 freeid = map->nsid + map->range;
2802 goto again;
2803 }
2804 }
2805 return freeid;
2806}
2807
f4f52cb5
CB
/* Callback that executes "lxc-usernsexec" with @args as its argument vector.
 * It only returns (with -1) if the exec itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
2813
f6d3e3e4 2814/*
7b50c609
TS
2815 * chown_mapped_root: for an unprivileged user with uid/gid X to
2816 * chown a dir to subuid/subgid Y, he needs to run chown as root
2817 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2818 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2819 * root is privileged with respect to hostuid/hostgid X, allowing
2820 * him to do the chown.
f6d3e3e4 2821 */
c4d10a05 2822int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 2823{
f4f52cb5 2824 uid_t rootuid, rootgid;
2a9a80cb 2825 unsigned long val;
f4f52cb5
CB
2826 int hostuid, hostgid, ret;
2827 struct stat sb;
2828 char map1[100], map2[100], map3[100], map4[100], map5[100];
2829 char ugid[100];
2830 char *args1[] = {"lxc-usernsexec",
2831 "-m", map1,
2832 "-m", map2,
2833 "-m", map3,
2834 "-m", map5,
2835 "--", "chown", ugid, path,
2836 NULL};
2837 char *args2[] = {"lxc-usernsexec",
2838 "-m", map1,
2839 "-m", map2,
2840 "-m", map3,
2841 "-m", map4,
2842 "-m", map5,
2843 "--", "chown", ugid, path,
2844 NULL};
2845 char cmd_output[MAXPATHLEN];
2846
2847 hostuid = geteuid();
2848 hostgid = getegid();
f6d3e3e4 2849
2a9a80cb 2850 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 2851 ERROR("No uid mapping for container root");
c4d10a05 2852 return -1;
f6d3e3e4 2853 }
f4f52cb5 2854 rootuid = (uid_t)val;
7b50c609 2855 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 2856 ERROR("No gid mapping for container root");
7b50c609
TS
2857 return -1;
2858 }
f4f52cb5 2859 rootgid = (gid_t)val;
2a9a80cb 2860
f4f52cb5 2861 if (hostuid == 0) {
7b50c609 2862 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
2863 ERROR("Error chowning %s", path);
2864 return -1;
2865 }
2866 return 0;
2867 }
f3d7e4ca 2868
f4f52cb5 2869 if (rootuid == hostuid) {
1a0e70ac 2870 /* nothing to do */
b103ceac 2871 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
2872 return 0;
2873 }
2874
bbdbf8f0 2875 /* save the current gid of "path" */
f4f52cb5
CB
2876 if (stat(path, &sb) < 0) {
2877 ERROR("Error stat %s", path);
f6d3e3e4
SH
2878 return -1;
2879 }
7b50c609 2880
bbdbf8f0
CB
2881 /* Update the path argument in case this was overlayfs. */
2882 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2883 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2884
f4f52cb5
CB
2885 /*
2886 * A file has to be group-owned by a gid mapped into the
2887 * container, or the container won't be privileged over it.
2888 */
2889 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2890 if (sb.st_uid == hostuid &&
2891 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2892 chown(path, -1, hostgid) < 0) {
2893 ERROR("Failed chgrping %s", path);
2894 return -1;
2895 }
f6d3e3e4 2896
1a0e70ac 2897 /* "u:0:rootuid:1" */
f4f52cb5
CB
2898 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2899 if (ret < 0 || ret >= 100) {
2900 ERROR("Error uid printing map string");
2901 return -1;
2902 }
7b50c609 2903
1a0e70ac 2904 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
2905 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2906 if (ret < 0 || ret >= 100) {
2907 ERROR("Error uid printing map string");
2908 return -1;
2909 }
c4d10a05 2910
1a0e70ac 2911 /* "g:0:rootgid:1" */
f4f52cb5
CB
2912 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2913 if (ret < 0 || ret >= 100) {
2914 ERROR("Error gid printing map string");
2915 return -1;
2916 }
98e5ba51 2917
1a0e70ac 2918 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
2919 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2920 rootgid + (gid_t)sb.st_gid);
2921 if (ret < 0 || ret >= 100) {
2922 ERROR("Error gid printing map string");
2923 return -1;
2924 }
c4d10a05 2925
1a0e70ac 2926 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
2927 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2928 if (ret < 0 || ret >= 100) {
2929 ERROR("Error gid printing map string");
2930 return -1;
2931 }
7b50c609 2932
1a0e70ac 2933 /* "0:pathgid" (chown) */
f4f52cb5
CB
2934 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2935 if (ret < 0 || ret >= 100) {
2936 ERROR("Error owner printing format string for chown");
2937 return -1;
2938 }
7b50c609 2939
f4f52cb5
CB
2940 if (hostgid == sb.st_gid)
2941 ret = run_command(cmd_output, sizeof(cmd_output),
2942 chown_mapped_root_exec_wrapper,
2943 (void *)args1);
2944 else
2945 ret = run_command(cmd_output, sizeof(cmd_output),
2946 chown_mapped_root_exec_wrapper,
2947 (void *)args2);
2948 if (ret < 0)
2949 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 2950
f4f52cb5 2951 return ret;
f6d3e3e4
SH
2952}
2953
54117de5 2954int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 2955{
c4d10a05 2956 if (lxc_list_empty(&c->id_map))
f6d3e3e4 2957 return 0;
c4d10a05 2958
54117de5
CB
2959 if (!strcmp(c->console.name, ""))
2960 return 0;
2961
2962 if (chown_mapped_root(c->console.name, c) < 0) {
2963 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
2964 return -1;
2965 }
2966
54117de5
CB
2967 TRACE("chowned console \"%s\"", c->console.name);
2968
f6d3e3e4
SH
2969 return 0;
2970}
2971
943144d9
CB
2972/* NOTE: Must not be called from inside the container namespace! */
2973int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
2974{
2975 int mounted;
2976
943144d9 2977 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 2978 if (mounted == -1) {
943144d9 2979 SYSERROR("failed to mount /proc in the container");
01958b1f 2980 /* continue only if there is no rootfs */
943144d9 2981 if (conf->rootfs.path)
01958b1f 2982 return -1;
5112cd70 2983 } else if (mounted == 1) {
943144d9 2984 conf->tmp_umount_proc = 1;
5112cd70 2985 }
943144d9 2986
5112cd70
SH
2987 return 0;
2988}
2989
2990void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2991{
2992 if (lxc_conf->tmp_umount_proc == 1) {
2993 umount("/proc");
2994 lxc_conf->tmp_umount_proc = 0;
2995 }
2996}
2997
/* Walk /proc/self/mountinfo and remount every entry whose optional fields
 * contain "shared" with MS_SLAVE.
 *
 * Failures are logged but never abort container startup.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, mountinfo) != -1) {
		char *mnt_target, *mnt_opts;

		mnt_target = get_field(line, 4);
		if (mnt_target == NULL)
			continue;

		mnt_opts = get_field(mnt_target, 2);
		if (mnt_opts == NULL)
			continue;

		/* Terminate opts first; target is only cut once it is known
		 * that the entry is shared.
		 */
		null_endofword(mnt_opts);
		if (strstr(mnt_opts, "shared") == NULL)
			continue;

		null_endofword(mnt_target);
		if (mount(NULL, mnt_target, NULL, MS_SLAVE, NULL) == 0)
			continue;

		SYSERROR("Failed to make %s rslave", mnt_target);
		ERROR("Continuing...");
	}

	fclose(mountinfo);
	free(line);
}
3031
2322903b
SH
3032void lxc_execute_bind_init(struct lxc_conf *conf)
3033{
3034 int ret;
9d9c111c
SH
3035 char path[PATH_MAX], destpath[PATH_MAX], *p;
3036
3037 /* If init exists in the container, don't bind mount a static one */
3038 p = choose_init(conf->rootfs.mount);
3039 if (p) {
3040 free(p);
3041 return;
3042 }
2322903b
SH
3043
3044 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3045 if (ret < 0 || ret >= PATH_MAX) {
3046 WARN("Path name too long searching for lxc.init.static");
3047 return;
3048 }
3049
3050 if (!file_exists(path)) {
3051 INFO("%s does not exist on host", path);
3052 return;
3053 }
3054
3055 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3056 if (ret < 0 || ret >= PATH_MAX) {
3057 WARN("Path name too long for container's lxc.init.static");
3058 return;
3059 }
3060
3061 if (!file_exists(destpath)) {
3062 FILE * pathfile = fopen(destpath, "wb");
3063 if (!pathfile) {
3064 SYSERROR("Failed to create mount target '%s'", destpath);
3065 return;
3066 }
3067 fclose(pathfile);
3068 }
3069
592fd47a 3070 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3071 if (ret < 0)
3072 SYSERROR("Failed to bind lxc.init.static into container");
3073 INFO("lxc.init.static bound into container at %s", path);
3074}
3075
35120d9c
SH
3076/*
3077 * This does the work of remounting / if it is shared, calling the
3078 * container pre-mount hooks, and mounting the rootfs.
3079 */
3080int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3081{
35120d9c
SH
3082 if (conf->rootfs_setup) {
3083 /*
3084 * rootfs was set up in another namespace. bind-mount it
3085 * to give us a mount in our own ns so we can pivot_root to it
3086 */
3087 const char *path = conf->rootfs.mount;
3088 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3089 ERROR("Failed to bind-mount container / onto itself");
145832ba 3090 return -1;
35120d9c 3091 }
145832ba 3092 return 0;
35120d9c 3093 }
d4ef7c50 3094
e995d7a2
SH
3095 remount_all_slave();
3096
35120d9c
SH
3097 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3098 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3099 return -1;
3100 }
3101
9aa76a17 3102 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
3103 ERROR("failed to setup rootfs for '%s'", name);
3104 return -1;
3105 }
3106
3107 conf->rootfs_setup = true;
3108 return 0;
3109}
3110
1c1c7051
SH
3111static bool verify_start_hooks(struct lxc_conf *conf)
3112{
3113 struct lxc_list *it;
3114 char path[MAXPATHLEN];
3115 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3116 char *hookname = it->elem;
3117 struct stat st;
3118 int ret;
3119
3120 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 3121 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
3122 if (ret < 0 || ret >= MAXPATHLEN)
3123 return false;
3124 ret = stat(path, &st);
3125 if (ret) {
7b6753e7 3126 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
3127 hookname);
3128 return false;
3129 }
6a0c909a 3130 return true;
1c1c7051
SH
3131 }
3132
3133 return true;
3134}
3135
35120d9c
SH
3136int lxc_setup(struct lxc_handler *handler)
3137{
2187efd3 3138 int ret;
35120d9c
SH
3139 const char *name = handler->name;
3140 struct lxc_conf *lxc_conf = handler->conf;
3141 const char *lxcpath = handler->lxcpath;
35120d9c
SH
3142
3143 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3144 ERROR("Error setting up rootfs mount after spawn");
3145 return -1;
3146 }
3147
6c544cb3
MM
3148 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3149 if (setup_utsname(lxc_conf->utsname)) {
3150 ERROR("failed to setup the utsname for '%s'", name);
3151 return -1;
3152 }
0ad19a3f 3153 }
3154
811ef482 3155 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
36eb9bde 3156 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 3157 return -1;
0ad19a3f 3158 }
3159
790255cf
CB
3160 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3161 ERROR("Failed to network device names and ifindices to parent");
3162 return -1;
3163 }
3164
bc6928ff 3165 if (lxc_conf->autodev > 0) {
14221cbb 3166 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 3167 ERROR("failed to mount /dev in the container");
c6883f38
SH
3168 return -1;
3169 }
3170 }
3171
368bbc02
CS
3172 /* do automatic mounts (mainly /proc and /sys), but exclude
3173 * those that need to wait until other stuff has finished
3174 */
4fb3cba5 3175 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3176 ERROR("failed to setup the automatic mounts for '%s'", name);
3177 return -1;
3178 }
3179
06749971 3180 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 3181 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 3182 return -1;
576f946d 3183 }
3184
06749971 3185 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
3186 ERROR("failed to setup the mount entries for '%s'", name);
3187 return -1;
3188 }
3189
7b6753e7 3190 /* Make sure any start hooks are in the container */
1c1c7051
SH
3191 if (!verify_start_hooks(lxc_conf))
3192 return -1;
3193
2322903b
SH
3194 if (lxc_conf->is_execute)
3195 lxc_execute_bind_init(lxc_conf);
3196
368bbc02
CS
3197 /* now mount only cgroup, if wanted;
3198 * before, /sys could not have been mounted
3199 * (is either mounted automatically or via fstab entries)
3200 */
4fb3cba5 3201 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3202 ERROR("failed to setup the automatic mounts for '%s'", name);
3203 return -1;
3204 }
3205
283678ed 3206 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
3207 ERROR("failed to run mount hooks for container '%s'.", name);
3208 return -1;
3209 }
3210
bc6928ff 3211 if (lxc_conf->autodev > 0) {
283678ed 3212 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
3213 ERROR("failed to run autodev hooks for container '%s'.", name);
3214 return -1;
3215 }
06749971 3216
27245ff7 3217 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
3218 ERROR("failed to populate /dev in the container");
3219 return -1;
3220 }
3221 }
368bbc02 3222
3d7d929a 3223 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 3224 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 3225 return -1;
6e590161 3226 }
3227
69aa6655
DE
3228 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
3229 ERROR("failed to setup /dev symlinks for '%s'", name);
3230 return -1;
3231 }
3232
5112cd70 3233 /* mount /proc if it's not already there */
943144d9 3234 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 3235 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 3236 return -1;
e075f5d9 3237 }
e075f5d9 3238
ac778708 3239 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 3240 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 3241 return -1;
ed502555 3242 }
3243
70761e5e 3244 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 3245 ERROR("failed to setup the new pts instance");
95b5ffaf 3246 return -1;
3c26f34e 3247 }
3248
2187efd3
CB
3249 ret = lxc_create_ttys(handler);
3250 if (ret < 0)
e8bd4e43 3251 return -1;
e8bd4e43 3252
cccc74b5
DL
3253 if (setup_personality(lxc_conf->personality)) {
3254 ERROR("failed to setup personality");
3255 return -1;
3256 }
3257
97a8f74f
SG
3258 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3259 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 3260 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
3261 return -1;
3262 }
97a8f74f
SG
3263 if (dropcaps_except(&lxc_conf->keepcaps)) {
3264 ERROR("failed to keep requested caps");
3265 return -1;
3266 }
3267 } else if (setup_caps(&lxc_conf->caps)) {
3268 ERROR("failed to drop capabilities");
3269 return -1;
81810dd1
DL
3270 }
3271
f4152036 3272 NOTICE("Container \"%s\" is set up", name);
cd54d859 3273
0ad19a3f 3274 return 0;
3275}
26ddeedd 3276
283678ed
SH
3277int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3278 const char *lxcpath, char *argv[])
26ddeedd
SH
3279{
3280 int which = -1;
3281 struct lxc_list *it;
3282
3283 if (strcmp(hook, "pre-start") == 0)
3284 which = LXCHOOK_PRESTART;
5ea6163a
SH
3285 else if (strcmp(hook, "pre-mount") == 0)
3286 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
3287 else if (strcmp(hook, "mount") == 0)
3288 which = LXCHOOK_MOUNT;
f7bee6c6
MW
3289 else if (strcmp(hook, "autodev") == 0)
3290 which = LXCHOOK_AUTODEV;
26ddeedd
SH
3291 else if (strcmp(hook, "start") == 0)
3292 which = LXCHOOK_START;
52492063
WB
3293 else if (strcmp(hook, "stop") == 0)
3294 which = LXCHOOK_STOP;
26ddeedd
SH
3295 else if (strcmp(hook, "post-stop") == 0)
3296 which = LXCHOOK_POSTSTOP;
148e91f5
SH
3297 else if (strcmp(hook, "clone") == 0)
3298 which = LXCHOOK_CLONE;
37cf711b
SY
3299 else if (strcmp(hook, "destroy") == 0)
3300 which = LXCHOOK_DESTROY;
26ddeedd
SH
3301 else
3302 return -1;
3303 lxc_list_for_each(it, &conf->hooks[which]) {
3304 int ret;
3305 char *hookname = it->elem;
283678ed 3306 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
3307 if (ret)
3308 return ret;
3309 }
3310 return 0;
3311}
72d0e1cb 3312
72d0e1cb
SG
3313int lxc_clear_config_caps(struct lxc_conf *c)
3314{
1a0e70ac 3315 struct lxc_list *it, *next;
72d0e1cb 3316
9ebb03ad 3317 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
3318 lxc_list_del(it);
3319 free(it->elem);
3320 free(it);
3321 }
3322 return 0;
3323}
3324
74a3920a 3325static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
3326 struct lxc_list *it, *next;
3327
4355ab5f 3328 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3329 lxc_list_del(it);
3330 free(it->elem);
3331 free(it);
3332 }
3333 return 0;
3334}
3335
4355ab5f
SH
3336int lxc_clear_idmaps(struct lxc_conf *c)
3337{
3338 return lxc_free_idmap(&c->id_map);
3339}
3340
1fb86a7c
SH
3341int lxc_clear_config_keepcaps(struct lxc_conf *c)
3342{
3343 struct lxc_list *it,*next;
3344
3345 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3346 lxc_list_del(it);
3347 free(it->elem);
3348 free(it);
3349 }
3350 return 0;
3351}
3352
12a50cc6 3353int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 3354{
9ebb03ad 3355 struct lxc_list *it,*next;
72d0e1cb 3356 bool all = false;
a6390f01 3357 const char *k = NULL;
72d0e1cb
SG
3358
3359 if (strcmp(key, "lxc.cgroup") == 0)
3360 all = true;
a6390f01
WB
3361 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
3362 k = key + sizeof("lxc.cgroup.")-1;
3363 else
3364 return -1;
72d0e1cb 3365
9ebb03ad 3366 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
3367 struct lxc_cgroup *cg = it->elem;
3368 if (!all && strcmp(cg->subsystem, k) != 0)
3369 continue;
3370 lxc_list_del(it);
3371 free(cg->subsystem);
3372 free(cg->value);
3373 free(cg);
3374 free(it);
3375 }
3376 return 0;
3377}
3378
c6d09e15
WB
3379int lxc_clear_limits(struct lxc_conf *c, const char *key)
3380{
3381 struct lxc_list *it, *next;
3382 bool all = false;
3383 const char *k = NULL;
3384
240d4b74 3385 if (strcmp(key, "lxc.limit") == 0
3386 || strcmp(key, "lxc.prlimit"))
c6d09e15
WB
3387 all = true;
3388 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
3389 k = key + sizeof("lxc.limit.")-1;
240d4b74 3390 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
3391 k = key + sizeof("lxc.prlimit.")-1;
c6d09e15
WB
3392 else
3393 return -1;
3394
3395 lxc_list_for_each_safe(it, &c->limits, next) {
3396 struct lxc_limit *lim = it->elem;
3397 if (!all && strcmp(lim->resource, k) != 0)
3398 continue;
3399 lxc_list_del(it);
3400 free(lim->resource);
3401 free(lim);
3402 free(it);
3403 }
3404 return 0;
3405}
3406
ee1e7aa0
SG
3407int lxc_clear_groups(struct lxc_conf *c)
3408{
3409 struct lxc_list *it,*next;
3410
3411 lxc_list_for_each_safe(it, &c->groups, next) {
3412 lxc_list_del(it);
3413 free(it->elem);
3414 free(it);
3415 }
3416 return 0;
3417}
3418
ab799c0b
SG
3419int lxc_clear_environment(struct lxc_conf *c)
3420{
3421 struct lxc_list *it,*next;
3422
3423 lxc_list_for_each_safe(it, &c->environment, next) {
3424 lxc_list_del(it);
3425 free(it->elem);
3426 free(it);
3427 }
3428 return 0;
3429}
3430
72d0e1cb
SG
3431int lxc_clear_mount_entries(struct lxc_conf *c)
3432{
9ebb03ad 3433 struct lxc_list *it,*next;
72d0e1cb 3434
9ebb03ad 3435 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
3436 lxc_list_del(it);
3437 free(it->elem);
3438 free(it);
3439 }
3440 return 0;
3441}
3442
b099e9e9
SH
3443int lxc_clear_automounts(struct lxc_conf *c)
3444{
3445 c->auto_mounts = 0;
3446 return 0;
3447}
3448
12a50cc6 3449int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3450{
9ebb03ad 3451 struct lxc_list *it,*next;
17ed13a3 3452 bool all = false, done = false;
a6390f01 3453 const char *k = NULL;
72d0e1cb
SG
3454 int i;
3455
17ed13a3
SH
3456 if (strcmp(key, "lxc.hook") == 0)
3457 all = true;
a6390f01
WB
3458 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3459 k = key + sizeof("lxc.hook.")-1;
3460 else
3461 return -1;
17ed13a3 3462
72d0e1cb 3463 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 3464 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 3465 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
3466 lxc_list_del(it);
3467 free(it->elem);
3468 free(it);
3469 }
3470 done = true;
72d0e1cb
SG
3471 }
3472 }
17ed13a3
SH
3473
3474 if (!done) {
3475 ERROR("Invalid hook key: %s", key);
3476 return -1;
3477 }
72d0e1cb
SG
3478 return 0;
3479}
8eb5694b 3480
4184c3e1
SH
3481static inline void lxc_clear_aliens(struct lxc_conf *conf)
3482{
3483 struct lxc_list *it,*next;
3484
3485 lxc_list_for_each_safe(it, &conf->aliens, next) {
3486 lxc_list_del(it);
3487 free(it->elem);
3488 free(it);
3489 }
3490}
3491
c7b15d1e 3492void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
3493{
3494 struct lxc_list *it,*next;
3495
3496 lxc_list_for_each_safe(it, &conf->includes, next) {
3497 lxc_list_del(it);
3498 free(it->elem);
3499 free(it);
3500 }
3501}
3502
8eb5694b
SH
3503void lxc_conf_free(struct lxc_conf *conf)
3504{
3505 if (!conf)
3506 return;
858377e4
SH
3507 if (current_config == conf)
3508 current_config = NULL;
f10fad2f
ME
3509 free(conf->console.log_path);
3510 free(conf->console.path);
3511 free(conf->rootfs.mount);
b3b8c97f 3512 free(conf->rootfs.bdev_type);
f10fad2f
ME
3513 free(conf->rootfs.options);
3514 free(conf->rootfs.path);
f10fad2f 3515 free(conf->logfile);
858377e4
SH
3516 if (conf->logfd != -1)
3517 close(conf->logfd);
f10fad2f
ME
3518 free(conf->utsname);
3519 free(conf->ttydir);
3520 free(conf->fstab);
3521 free(conf->rcfile);
3522 free(conf->init_cmd);
6b0d5538 3523 free(conf->unexpanded_config);
393903d1 3524 free(conf->pty_names);
76d0127f 3525 free(conf->syslog);
c302b476 3526 lxc_free_networks(&conf->network);
f10fad2f
ME
3527 free(conf->lsm_aa_profile);
3528 free(conf->lsm_se_context);
769872f9 3529 lxc_seccomp_free(conf);
8eb5694b 3530 lxc_clear_config_caps(conf);
1fb86a7c 3531 lxc_clear_config_keepcaps(conf);
8eb5694b 3532 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 3533 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3534 lxc_clear_mount_entries(conf);
27c27d73 3535 lxc_clear_idmaps(conf);
ee1e7aa0 3536 lxc_clear_groups(conf);
f979ac15 3537 lxc_clear_includes(conf);
761d81ca 3538 lxc_clear_aliens(conf);
ab799c0b 3539 lxc_clear_environment(conf);
240d4b74 3540 lxc_clear_limits(conf, "lxc.prlimit");
43654d34
CB
3541 free(conf->cgroup_meta.dir);
3542 free(conf->cgroup_meta.controllers);
8eb5694b
SH
3543 free(conf);
3544}
4355ab5f
SH
3545
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function the child runs once it has been told to proceed. */
	int (*fn)(void *);
	/* Optional name of @fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Sync pipe: the child reads one byte from p[0] and closes p[1]. */
	int p[2];
};
3552
3553static int run_userns_fn(void *data)
3554{
3555 struct userns_fn_data *d = data;
3556 char c;
4355ab5f 3557
f8aa4bf3 3558 /* Close write end of the pipe. */
4355ab5f 3559 close(d->p[1]);
f8aa4bf3
CB
3560
3561 /* Wait for parent to finish establishing a new mapping in the user
3562 * namespace we are executing in.
3563 */
4355ab5f
SH
3564 if (read(d->p[0], &c, 1) != 1)
3565 return -1;
f8aa4bf3
CB
3566
3567 /* Close read end of the pipe. */
4355ab5f 3568 close(d->p[0]);
f8aa4bf3 3569
c9b7c33e
CB
3570 if (d->fn_name)
3571 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 3572 /* Call function to run. */
4355ab5f
SH
3573 return d->fn(d->arg);
3574}
3575
339efad9 3576static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
3577 enum idtype idtype)
3578{
3579 struct lxc_list *it;
3580 struct id_map *map;
3581 struct id_map *retmap = NULL;
3582
3583 lxc_list_for_each(it, &conf->id_map) {
3584 map = it->elem;
3585 if (map->idtype != idtype)
3586 continue;
3587
3588 if (id >= map->hostid && id < map->hostid + map->range) {
3589 retmap = map;
3590 break;
3591 }
3592 }
3593
3594 if (!retmap)
3595 return NULL;
3596
3597 retmap = malloc(sizeof(*retmap));
3598 if (!retmap)
3599 return NULL;
3600
3601 memcpy(retmap, map, sizeof(*retmap));
3602 return retmap;
3603}
3604
4355ab5f 3605/*
f8aa4bf3
CB
3606 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3607 * existing one or establish a new one.
4355ab5f 3608 */
28a2d9e7 3609static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 3610{
28a2d9e7 3611 int hostid_mapped;
f8aa4bf3 3612 struct id_map *entry = NULL;
f8aa4bf3 3613
28a2d9e7
CB
3614 /* Reuse existing mapping. */
3615 entry = mapped_hostid_entry(conf, id, type);
3616 if (entry)
3617 return entry;
f8aa4bf3 3618
28a2d9e7
CB
3619 /* Find new mapping. */
3620 hostid_mapped = find_unmapped_nsid(conf, type);
3621 if (hostid_mapped < 0) {
3622 DEBUG("failed to find free mapping for id %d", id);
3623 return NULL;
f8aa4bf3 3624 }
f8aa4bf3 3625
28a2d9e7
CB
3626 entry = malloc(sizeof(*entry));
3627 if (!entry)
3628 return NULL;
4355ab5f 3629
28a2d9e7
CB
3630 entry->idtype = type;
3631 entry->nsid = hostid_mapped;
3632 entry->hostid = (unsigned long)id;
3633 entry->range = 1;
4355ab5f 3634
28a2d9e7 3635 return entry;
4355ab5f
SH
3636}
3637
f8aa4bf3
CB
3638/* Run a function in a new user namespace.
3639 * The caller's euid/egid will be mapped if it is not already.
3640 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3641 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3642 * This means we require only to establish a mapping from:
3643 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3644 * - the container root -> some sub{g,u}id
3645 * The former we add, if the user did not specifiy a mapping. The latter we
3646 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3647 * there to start the container in the first place.
4355ab5f 3648 */
c9b7c33e
CB
3649int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3650 const char *fn_name)
4355ab5f 3651{
f8aa4bf3
CB
3652 pid_t pid;
3653 uid_t euid, egid;
4355ab5f 3654 struct userns_fn_data d;
4355ab5f 3655 int p[2];
f8aa4bf3
CB
3656 struct lxc_list *it;
3657 struct id_map *map;
3658 char c = '1';
3659 int ret = -1;
3660 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
3661 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3662 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 3663
4355ab5f 3664 ret = pipe(p);
4355ab5f
SH
3665 if (ret < 0) {
3666 SYSERROR("opening pipe");
3667 return -1;
3668 }
3669 d.fn = fn;
c9b7c33e 3670 d.fn_name = fn_name;
4355ab5f
SH
3671 d.arg = data;
3672 d.p[0] = p[0];
3673 d.p[1] = p[1];
f8aa4bf3
CB
3674
3675 /* Clone child in new user namespace. */
4355ab5f 3676 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
3677 if (pid < 0) {
3678 ERROR("failed to clone child process in new user namespace");
3679 goto on_error;
3680 }
3681
4355ab5f 3682 close(p[0]);
4355ab5f
SH
3683 p[0] = -1;
3684
f8aa4bf3
CB
3685 /* Find container root. */
3686 lxc_list_for_each(it, &conf->id_map) {
3687 map = it->elem;
3688
3689 if (map->nsid != 0)
3690 continue;
3691
3692 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3693 container_root_uid = malloc(sizeof(*container_root_uid));
3694 if (!container_root_uid)
3695 goto on_error;
3696 container_root_uid->idtype = map->idtype;
3697 container_root_uid->hostid = map->hostid;
3698 container_root_uid->nsid = 0;
3699 container_root_uid->range = map->range;
3700 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3701 container_root_gid = malloc(sizeof(*container_root_gid));
3702 if (!container_root_gid)
3703 goto on_error;
3704 container_root_gid->idtype = map->idtype;
3705 container_root_gid->hostid = map->hostid;
3706 container_root_gid->nsid = 0;
3707 container_root_gid->range = map->range;
3708 }
3709
3710 /* Found container root. */
3711 if (container_root_uid && container_root_gid)
3712 break;
3713 }
3714
3715 /* This is actually checked earlier but it can't hurt. */
3716 if (!container_root_uid || !container_root_gid) {
3717 ERROR("no mapping for container root found");
3718 goto on_error;
3719 }
3720
1d90e064
CB
3721 host_uid_map = container_root_uid;
3722 host_gid_map = container_root_gid;
3723
f8aa4bf3
CB
3724 /* Check whether the {g,u}id of the user has a mapping. */
3725 euid = geteuid();
3726 egid = getegid();
1d90e064 3727 if (euid != container_root_uid->hostid)
28a2d9e7
CB
3728 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3729
1d90e064 3730 if (egid != container_root_gid->hostid)
28a2d9e7
CB
3731 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3732
3733 if (!host_uid_map) {
3734 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
3735 goto on_error;
3736 }
3737
28a2d9e7
CB
3738 if (!host_gid_map) {
3739 DEBUG("failed to find mapping for gid %d", egid);
3740 goto on_error;
3741 }
3742
3743 /* Allocate new {g,u}id map list. */
3744 idmap = malloc(sizeof(*idmap));
3745 if (!idmap)
3746 goto on_error;
3747 lxc_list_init(idmap);
3748
f8aa4bf3
CB
3749 /* Add container root to the map. */
3750 tmplist = malloc(sizeof(*tmplist));
3751 if (!tmplist)
3752 goto on_error;
3753 lxc_list_add_elem(tmplist, container_root_uid);
3754 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3755
1d90e064 3756 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
3757 /* idmap will now keep track of that memory. */
3758 container_root_uid = NULL;
3759
3760 /* Add container root to the map. */
3761 tmplist = malloc(sizeof(*tmplist));
3762 if (!tmplist)
3763 goto on_error;
3764 lxc_list_add_elem(tmplist, host_uid_map);
3765 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3766 }
1d90e064
CB
3767 /* idmap will now keep track of that memory. */
3768 container_root_uid = NULL;
3769 /* idmap will now keep track of that memory. */
3770 host_uid_map = NULL;
f8aa4bf3
CB
3771
3772 tmplist = malloc(sizeof(*tmplist));
3773 if (!tmplist)
3774 goto on_error;
3775 lxc_list_add_elem(tmplist, container_root_gid);
3776 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3777
1d90e064 3778 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
3779 /* idmap will now keep track of that memory. */
3780 container_root_gid = NULL;
3781
3782 tmplist = malloc(sizeof(*tmplist));
3783 if (!tmplist)
3784 goto on_error;
3785 lxc_list_add_elem(tmplist, host_gid_map);
3786 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3787 }
1d90e064
CB
3788 /* idmap will now keep track of that memory. */
3789 container_root_gid = NULL;
3790 /* idmap will now keep track of that memory. */
3791 host_gid_map = NULL;
f8aa4bf3 3792
4b73005c
CB
3793 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3794 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
3795 lxc_list_for_each(it, idmap) {
3796 map = it->elem;
3797 TRACE("establishing %cid mapping for \"%d\" in new "
3798 "user namespace: nsuid %lu - hostid %lu - range "
3799 "%lu",
3800 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3801 map->nsid, map->hostid, map->range);
3802 }
4355ab5f
SH
3803 }
3804
f8aa4bf3 3805 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 3806 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
3807 if (ret < 0) {
3808 ERROR("error setting up {g,u}id mappings for child process "
3809 "\"%d\"",
3810 pid);
3811 goto on_error;
4355ab5f
SH
3812 }
3813
f8aa4bf3 3814 /* Tell child to proceed. */
4355ab5f 3815 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
3816 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3817 goto on_error;
4355ab5f
SH
3818 }
3819
f8aa4bf3 3820 /* Wait for child to finish. */
3139aead
SG
3821 ret = wait_for_pid(pid);
3822
f8aa4bf3 3823on_error:
1d90e064
CB
3824 if (idmap)
3825 lxc_free_idmap(idmap);
3826 if (container_root_uid)
3827 free(container_root_uid);
3828 if (container_root_gid)
3829 free(container_root_gid);
3830 if (host_uid_map && (host_uid_map != container_root_uid))
3831 free(host_uid_map);
3832 if (host_gid_map && (host_gid_map != container_root_gid))
3833 free(host_gid_map);
3139aead 3834
4355ab5f
SH
3835 if (p[0] != -1)
3836 close(p[0]);
3837 close(p[1]);
f8aa4bf3
CB
3838
3839 return ret;
4355ab5f 3840}
97e9cfa0 3841
/*
 * Return a heap-allocated copy of the login name belonging to the effective
 * uid, or NULL when the lookup or the allocation fails. The caller frees it.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
3853
/*
 * Return a heap-allocated copy of the group name belonging to the effective
 * gid, or NULL when the lookup or the allocation fails. The caller frees it.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
3865
a96a8e8c 3866/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
3867void suggest_default_idmap(void)
3868{
3869 FILE *f;
3870 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
3871 char *line = NULL;
3872 char *uname, *gname;
3873 size_t len = 0;
3874
3875 if (!(uname = getuname()))
3876 return;
3877
3878 if (!(gname = getgname())) {
3879 free(uname);
3880 return;
3881 }
3882
3883 f = fopen(subuidfile, "r");
3884 if (!f) {
3885 ERROR("Your system is not configured with subuids");
3886 free(gname);
3887 free(uname);
3888 return;
3889 }
3890 while (getline(&line, &len, f) != -1) {
b7930180 3891 size_t no_newline = 0;
97e9cfa0
SH
3892 char *p = strchr(line, ':'), *p2;
3893 if (*line == '#')
3894 continue;
3895 if (!p)
3896 continue;
3897 *p = '\0';
3898 p++;
3899 if (strcmp(line, uname))
3900 continue;
3901 p2 = strchr(p, ':');
3902 if (!p2)
3903 continue;
3904 *p2 = '\0';
3905 p2++;
3906 if (!*p2)
3907 continue;
b7930180
CB
3908 no_newline = strcspn(p2, "\n");
3909 p2[no_newline] = '\0';
3910
b7b2fde4
CB
3911 if (lxc_safe_uint(p, &uid) < 0)
3912 WARN("Could not parse UID.");
3913 if (lxc_safe_uint(p2, &urange) < 0)
3914 WARN("Could not parse UID range.");
97e9cfa0
SH
3915 }
3916 fclose(f);
3917
6be7389a 3918 f = fopen(subgidfile, "r");
97e9cfa0
SH
3919 if (!f) {
3920 ERROR("Your system is not configured with subgids");
3921 free(gname);
3922 free(uname);
3923 return;
3924 }
3925 while (getline(&line, &len, f) != -1) {
b7930180 3926 size_t no_newline = 0;
97e9cfa0
SH
3927 char *p = strchr(line, ':'), *p2;
3928 if (*line == '#')
3929 continue;
3930 if (!p)
3931 continue;
3932 *p = '\0';
3933 p++;
3934 if (strcmp(line, uname))
3935 continue;
3936 p2 = strchr(p, ':');
3937 if (!p2)
3938 continue;
3939 *p2 = '\0';
3940 p2++;
3941 if (!*p2)
3942 continue;
b7930180
CB
3943 no_newline = strcspn(p2, "\n");
3944 p2[no_newline] = '\0';
3945
b7b2fde4
CB
3946 if (lxc_safe_uint(p, &gid) < 0)
3947 WARN("Could not parse GID.");
3948 if (lxc_safe_uint(p2, &grange) < 0)
3949 WARN("Could not parse GID range.");
97e9cfa0
SH
3950 }
3951 fclose(f);
3952
f10fad2f 3953 free(line);
97e9cfa0
SH
3954
3955 if (!urange || !grange) {
3956 ERROR("You do not have subuids or subgids allocated");
3957 ERROR("Unprivileged containers require subuids and subgids");
3958 return;
3959 }
3960
3961 ERROR("You must either run as root, or define uid mappings");
3962 ERROR("To pass uid mappings to lxc-create, you could create");
3963 ERROR("~/.config/lxc/default.conf:");
3964 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
3965 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
3966 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
3967
3968 free(gname);
3969 free(uname);
3970}
aaf26830 3971
a7307747
SH
3972static void free_cgroup_settings(struct lxc_list *result)
3973{
3974 struct lxc_list *iterator, *next;
3975
3976 lxc_list_for_each_safe(iterator, result, next) {
3977 lxc_list_del(iterator);
3978 free(iterator);
3979 }
3980 free(result);
3981}
3982
aaf26830
KT
3983/*
3984 * Return the list of cgroup_settings sorted according to the following rules
3985 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
3986 */
3987struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
3988{
3989 struct lxc_list *result;
3990 struct lxc_list *memsw_limit = NULL;
3991 struct lxc_list *it = NULL;
3992 struct lxc_cgroup *cg = NULL;
3993 struct lxc_list *item = NULL;
3994
3995 result = malloc(sizeof(*result));
fac7c663
KT
3996 if (!result) {
3997 ERROR("failed to allocate memory to sort cgroup settings");
3998 return NULL;
3999 }
aaf26830
KT
4000 lxc_list_init(result);
4001
4002 /*Iterate over the cgroup settings and copy them to the output list*/
4003 lxc_list_for_each(it, cgroup_settings) {
4004 item = malloc(sizeof(*item));
fac7c663
KT
4005 if (!item) {
4006 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 4007 free_cgroup_settings(result);
fac7c663
KT
4008 return NULL;
4009 }
aaf26830
KT
4010 item->elem = it->elem;
4011 cg = it->elem;
4012 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4013 /* Store the memsw_limit location */
4014 memsw_limit = item;
4015 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 4016 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
4017 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4018 item->elem = memsw_limit->elem;
4019 memsw_limit->elem = it->elem;
4020 }
4021 lxc_list_add_tail(result, item);
4022 }
4023
4024 return result;
a7307747 4025}