]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
Merge pull request #1784 from brauner/2017-09-05/document_handler_fields
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "caps.h" /* for lxc_caps_last_cap() */
77#include "cgroup.h"
1b09f2c0 78#include "conf.h"
1ed6ba91 79#include "confile_utils.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
025ed0f3 82#include "lxclock.h"
8f3e280e 83#include "lxcseccomp.h"
4355ab5f 84#include "namespace.h"
8f3e280e
CB
85#include "network.h"
86#include "parse.h"
28d832c4
CB
87#include "storage.h"
88#include "storage/aufs.h"
89#include "storage/overlay.h"
8f3e280e 90#include "utils.h"
fe4de9a6 91#include "lsm/lsm.h"
d0a36f2c 92
e37dda71 93#if HAVE_LIBCAP
495d2046
SG
94#include <sys/capability.h>
95#endif
96
6ff05e18
SG
97#if HAVE_SYS_PERSONALITY_H
98#include <sys/personality.h>
99#endif
100
edaf8b1b
SG
101#if IS_BIONIC
102#include <../include/lxcmntent.h>
a04f5407
CB
103#ifndef HAVE_PRLIMIT
104#include <../include/prlimit.h>
105#endif
edaf8b1b
SG
106#else
107#include <mntent.h>
108#endif
109
36eb9bde 110lxc_log_define(lxc_conf, lxc);
e5bda9ee 111
e37dda71 112#if HAVE_LIBCAP
b09094da
MN
113#ifndef CAP_SETFCAP
114#define CAP_SETFCAP 31
115#endif
116
117#ifndef CAP_MAC_OVERRIDE
118#define CAP_MAC_OVERRIDE 32
119#endif
120
121#ifndef CAP_MAC_ADMIN
122#define CAP_MAC_ADMIN 33
123#endif
495d2046 124#endif
b09094da
MN
125
126#ifndef PR_CAPBSET_DROP
127#define PR_CAPBSET_DROP 24
128#endif
129
9818cae4
SG
130#ifndef LO_FLAGS_AUTOCLEAR
131#define LO_FLAGS_AUTOCLEAR 4
132#endif
133
bc5b27d6
DK
134#ifndef CAP_SETUID
135#define CAP_SETUID 7
136#endif
137
138#ifndef CAP_SETGID
139#define CAP_SETGID 6
140#endif
141
0769b82a
CS
142/* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144#ifndef CAP_SYS_ADMIN
145#define CAP_SYS_ADMIN 21
146#endif
147
2d76d1d7
SG
/* Fallback for C libraries that do not export pivot_root(): issue the raw
 * system call when its number is known, otherwise fail with ENOSYS.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available at build time. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif
162
/* Fallback for C libraries that do not export sethostname(): issue the raw
 * system call when its number is known, otherwise fail with ENOSYS.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	/* No syscall number available at build time. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
ecec0126
SG
176#ifndef MS_PRIVATE
177#define MS_PRIVATE (1<<18)
178#endif
179
8912711c
CB
180#ifndef MS_LAZYTIME
181#define MS_LAZYTIME (1<<25)
182#endif
183
5ef5c9a3
CB
184/* memfd_create() */
185#ifndef MFD_CLOEXEC
186#define MFD_CLOEXEC 0x0001U
187#endif
188
189#ifndef MFD_ALLOW_SEALING
190#define MFD_ALLOW_SEALING 0x0002U
191#endif
192
#ifndef HAVE_MEMFD_CREATE

/* Per-architecture syscall numbers, used only when the system headers do not
 * provide __NR_memfd_create themselves.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/* Fallback memfd_create() for C libraries that lack the wrapper. Fails with
 * ENOSYS when no syscall number is known for the target architecture.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}

#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
236
2b9ae35a
CB
237char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
72d0e1cb 240
998ac676
RT
/* One entry of the mount option name -> mount(2) flag table. */
struct mount_opt {
	char *name; /* option keyword, e.g. "ro"; NULL terminates the table */
	int clear;  /* NOTE(review): presumably non-zero means the option
		     * clears `flag` instead of setting it (e.g. "rw" vs
		     * MS_RDONLY) - confirm against the option parser */
	int flag;   /* MS_* bit(s) the option refers to */
};
246
81810dd1
DL
/* One entry of the capability name -> CAP_* number table. */
struct caps_opt {
	char *name; /* lower-case capability name without the "CAP_" prefix */
	int value;  /* matching CAP_* constant */
};
251
c6d09e15
WB
/* One entry of the resource limit name -> RLIMIT_* number table. */
struct limit_opt {
	char *name; /* lower-case limit name without the "RLIMIT_" prefix */
	int value;  /* matching RLIMIT_* constant */
};
256
858377e4
SH
/*
 * The lxc_conf of the container an API call is currently working on.
 * It is consulted by the error-reporting calls. When thread-local storage
 * is available (HAVE_TLS) every thread gets its own pointer.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
267
0769b82a
CS
268/* Declare this here, since we don't want to reshuffle the whole file. */
269static int in_caplist(int cap, struct lxc_list *caps);
270
998ac676 271static struct mount_opt mount_opt[] = {
470b359b
CB
272 { "async", 1, MS_SYNCHRONOUS },
273 { "atime", 1, MS_NOATIME },
274 { "bind", 0, MS_BIND },
88d413d5 275 { "defaults", 0, 0 },
88d413d5 276 { "dev", 1, MS_NODEV },
470b359b 277 { "diratime", 1, MS_NODIRATIME },
88d413d5 278 { "dirsync", 0, MS_DIRSYNC },
470b359b 279 { "exec", 1, MS_NOEXEC },
8912711c 280 { "lazytime", 0, MS_LAZYTIME },
88d413d5 281 { "mand", 0, MS_MANDLOCK },
88d413d5 282 { "noatime", 0, MS_NOATIME },
470b359b 283 { "nodev", 0, MS_NODEV },
88d413d5 284 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
285 { "noexec", 0, MS_NOEXEC },
286 { "nomand", 1, MS_MANDLOCK },
287 { "norelatime", 1, MS_RELATIME },
288 { "nostrictatime", 1, MS_STRICTATIME },
289 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
290 { "rbind", 0, MS_BIND|MS_REC },
291 { "relatime", 0, MS_RELATIME },
470b359b
CB
292 { "remount", 0, MS_REMOUNT },
293 { "ro", 0, MS_RDONLY },
294 { "rw", 1, MS_RDONLY },
88d413d5 295 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
296 { "suid", 1, MS_NOSUID },
297 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 298 { NULL, 0, 0 },
998ac676
RT
299};
300
e37dda71 301#if HAVE_LIBCAP
81810dd1 302static struct caps_opt caps_opt[] = {
a6afdde9 303 { "chown", CAP_CHOWN },
1e11be34
DL
304 { "dac_override", CAP_DAC_OVERRIDE },
305 { "dac_read_search", CAP_DAC_READ_SEARCH },
306 { "fowner", CAP_FOWNER },
307 { "fsetid", CAP_FSETID },
81810dd1
DL
308 { "kill", CAP_KILL },
309 { "setgid", CAP_SETGID },
310 { "setuid", CAP_SETUID },
311 { "setpcap", CAP_SETPCAP },
312 { "linux_immutable", CAP_LINUX_IMMUTABLE },
313 { "net_bind_service", CAP_NET_BIND_SERVICE },
314 { "net_broadcast", CAP_NET_BROADCAST },
315 { "net_admin", CAP_NET_ADMIN },
316 { "net_raw", CAP_NET_RAW },
317 { "ipc_lock", CAP_IPC_LOCK },
318 { "ipc_owner", CAP_IPC_OWNER },
319 { "sys_module", CAP_SYS_MODULE },
320 { "sys_rawio", CAP_SYS_RAWIO },
321 { "sys_chroot", CAP_SYS_CHROOT },
322 { "sys_ptrace", CAP_SYS_PTRACE },
323 { "sys_pacct", CAP_SYS_PACCT },
324 { "sys_admin", CAP_SYS_ADMIN },
325 { "sys_boot", CAP_SYS_BOOT },
326 { "sys_nice", CAP_SYS_NICE },
327 { "sys_resource", CAP_SYS_RESOURCE },
328 { "sys_time", CAP_SYS_TIME },
329 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
330 { "mknod", CAP_MKNOD },
331 { "lease", CAP_LEASE },
57b837e2
CB
332#ifdef CAP_AUDIT_READ
333 { "audit_read", CAP_AUDIT_READ },
334#endif
9527e566 335#ifdef CAP_AUDIT_WRITE
81810dd1 336 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
337#endif
338#ifdef CAP_AUDIT_CONTROL
81810dd1 339 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 340#endif
81810dd1
DL
341 { "setfcap", CAP_SETFCAP },
342 { "mac_override", CAP_MAC_OVERRIDE },
343 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
344#ifdef CAP_SYSLOG
345 { "syslog", CAP_SYSLOG },
346#endif
347#ifdef CAP_WAKE_ALARM
348 { "wake_alarm", CAP_WAKE_ALARM },
349#endif
2b54359b
CB
350#ifdef CAP_BLOCK_SUSPEND
351 { "block_suspend", CAP_BLOCK_SUSPEND },
352#endif
81810dd1 353};
495d2046
SG
354#else
355static struct caps_opt caps_opt[] = {};
356#endif
81810dd1 357
c6d09e15
WB
358static struct limit_opt limit_opt[] = {
359#ifdef RLIMIT_AS
360 { "as", RLIMIT_AS },
361#endif
362#ifdef RLIMIT_CORE
363 { "core", RLIMIT_CORE },
364#endif
365#ifdef RLIMIT_CPU
366 { "cpu", RLIMIT_CPU },
367#endif
368#ifdef RLIMIT_DATA
369 { "data", RLIMIT_DATA },
370#endif
371#ifdef RLIMIT_FSIZE
372 { "fsize", RLIMIT_FSIZE },
373#endif
374#ifdef RLIMIT_LOCKS
375 { "locks", RLIMIT_LOCKS },
376#endif
377#ifdef RLIMIT_MEMLOCK
378 { "memlock", RLIMIT_MEMLOCK },
379#endif
380#ifdef RLIMIT_MSGQUEUE
381 { "msgqueue", RLIMIT_MSGQUEUE },
382#endif
383#ifdef RLIMIT_NICE
384 { "nice", RLIMIT_NICE },
385#endif
386#ifdef RLIMIT_NOFILE
387 { "nofile", RLIMIT_NOFILE },
388#endif
389#ifdef RLIMIT_NPROC
390 { "nproc", RLIMIT_NPROC },
391#endif
392#ifdef RLIMIT_RSS
393 { "rss", RLIMIT_RSS },
394#endif
395#ifdef RLIMIT_RTPRIO
396 { "rtprio", RLIMIT_RTPRIO },
397#endif
398#ifdef RLIMIT_RTTIME
399 { "rttime", RLIMIT_RTTIME },
400#endif
401#ifdef RLIMIT_SIGPENDING
402 { "sigpending", RLIMIT_SIGPENDING },
403#endif
404#ifdef RLIMIT_STACK
405 { "stack", RLIMIT_STACK },
406#endif
407};
408
91c3830e
SH
409static int run_buffer(char *buffer)
410{
ebec9176 411 struct lxc_popen_FILE *f;
91c3830e 412 char *output;
8e7da691 413 int ret;
91c3830e 414
ebec9176 415 f = lxc_popen(buffer);
91c3830e 416 if (!f) {
062b72c6 417 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
418 return -1;
419 }
420
421 output = malloc(LXC_LOG_BUFFER_SIZE);
422 if (!output) {
062b72c6 423 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 424 lxc_pclose(f);
91c3830e
SH
425 return -1;
426 }
427
062b72c6
CB
428 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
429 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
430
431 free(output);
432
ebec9176 433 ret = lxc_pclose(f);
8e7da691 434 if (ret == -1) {
062b72c6 435 SYSERROR("Script exited with error.");
91c3830e 436 return -1;
8e7da691 437 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 438 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
439 return -1;
440 } else if (WIFSIGNALED(ret)) {
062b72c6 441 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 442 return -1;
91c3830e
SH
443 }
444
445 return 0;
446}
447
/* Build the command line "<script> <name> <section> <hook> [args...]" and
 * run it through run_buffer().
 *
 * @name    container name, passed as first script argument
 * @section config section, passed as second script argument
 * @script  path of the script to execute
 * @hook    hook name, passed as third script argument
 * @lxcpath unused here
 * @argsin  optional NULL-terminated list of extra arguments (may be NULL)
 *
 * Returns the result of run_buffer(), or -1 when the command line cannot be
 * built.
 *
 * The command line lives on the heap rather than in an alloca() buffer: its
 * size depends on caller-supplied strings and is only bounded by INT_MAX,
 * and alloca() cannot report failure.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Every extra argument is preceded by one space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	/* The "+ 1" accounts for the terminating '\0'. */
	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces in "%s %s %s %s". */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
498
/* Build the command line "<script> <name> <section> [args...]" and run it
 * through run_buffer().
 *
 * @name    container name, passed as first script argument
 * @section config section, passed as second script argument
 * @script  path of the script to execute
 * @...     extra "char *" arguments; the list MUST end with a NULL pointer
 *
 * Returns the result of run_buffer(), or -1 when the command line cannot be
 * built.
 *
 * Every va_start() is paired with a va_end() on all paths, including the
 * error return inside the argument loop. The command line lives on the heap
 * rather than in an alloca() buffer because its size depends on
 * caller-supplied strings and alloca() cannot report failure.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: every extra argument is preceded by one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces in "%s %s %s" plus the terminating '\0'. */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
549
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, path cannot be
 *           resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || rootfs[0] == '\0')
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* snprintf() can fail (negative) as well as truncate (>= size). */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open descriptor keeps the fs busy; the name is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
592
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the nosuid/nodev/ro/noexec
 * restrictions already in effect on the mount so they are not dropped.
 *
 * @s     mount source; when NULL, @d is inspected instead
 * @d     mount destination
 * @flags requested mount flags
 *
 * Returns @flags unchanged when MS_REMOUNT is not set, when both paths are
 * NULL, when statvfs() fails, or when built without HAVE_STATVFS. Otherwise
 * returns @flags with the restrictive bits found on the filesystem added.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long inherited[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target = s ? s : d;
	struct statvfs sb;
	size_t idx;

	if (!(flags & MS_REMOUNT) || !target)
		return flags;

	if (statvfs(target, &sb) < 0)
		return flags;

	for (idx = 0; idx < sizeof(inherited) / sizeof(inherited[0]); idx++) {
		if (sb.f_flag & inherited[idx])
			flags |= inherited[idx];
	}

	return flags;
#else
	return flags;
#endif
}
629
4fb3cba5 630static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 631{
368bbc02 632 int r;
80e80c40 633 int i;
b06b8511
CS
634 static struct {
635 int match_mask;
636 int match_flag;
637 const char *source;
638 const char *destination;
639 const char *fstype;
640 unsigned long flags;
641 const char *options;
642 } default_mounts[] = {
643 /* Read-only bind-mounting... In older kernels, doing that required
644 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
645 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
646 * kernel 2.6.26 onwards. However, this apparently does not work on
647 * kernel 3.8. Unfortunately, on that very same kernel, doing the
648 * same trick as above doesn't seem to work either, there one needs
649 * to ALSO specify MS_BIND for the remount, otherwise the entire
650 * fs is remounted read-only or the mount fails because it's busy...
651 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
652 * 2.6.32...
368bbc02 653 */
f24a52d5 654 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
655 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
656 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
657 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
658 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 659 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
660 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
661 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
662 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
663 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
664 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
665 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
666 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
667 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
668 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
669 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
670 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
671 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 672 };
368bbc02 673
b06b8511
CS
674 for (i = 0; default_mounts[i].match_mask; i++) {
675 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
676 char *source = NULL;
677 char *destination = NULL;
678 int saved_errno;
e2a7e8dc 679 unsigned long mflags;
b06b8511
CS
680
681 if (default_mounts[i].source) {
682 /* will act like strdup if %r is not present */
8ede5f4c 683 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
684 if (!source) {
685 SYSERROR("memory allocation error");
686 return -1;
687 }
688 }
cc4fd506
SH
689 if (!default_mounts[i].destination) {
690 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 691 free(source);
cc4fd506
SH
692 return -1;
693 }
694 /* will act like strdup if %r is not present */
695 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
696 if (!destination) {
697 saved_errno = errno;
698 SYSERROR("memory allocation error");
699 free(source);
700 errno = saved_errno;
701 return -1;
b06b8511 702 }
e2a7e8dc
SH
703 mflags = add_required_remount_flags(source, destination,
704 default_mounts[i].flags);
592fd47a 705 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 706 saved_errno = errno;
b88ff9a0
SG
707 if (r < 0 && errno == ENOENT) {
708 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
709 r = 0;
710 }
711 else if (r < 0)
e2a7e8dc 712 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 713
b06b8511
CS
714 free(source);
715 free(destination);
716 if (r < 0) {
b06b8511
CS
717 errno = saved_errno;
718 return -1;
719 }
368bbc02 720 }
368bbc02
CS
721 }
722
b06b8511 723 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
724 int cg_flags;
725
726 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
727 /* If the type of cgroup mount was not specified, it depends on the
728 * container's capabilities as to what makes sense: if we have
729 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
730 * anyway, so we may as well default to read-write; then the admin
731 * will not be given a false sense of security. (And if they really
732 * want mixed r/o r/w, then they can explicitly specify :mixed.)
733 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
734 * :mixed, because then the container can't remount it read-write. */
735 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
736 int has_sys_admin = 0;
b0ee5983
CB
737
738 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 739 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 740 else
0769b82a 741 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
742
743 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 744 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 745 else
0769b82a 746 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
747 }
748
8ede5f4c 749 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 750 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 751 return -1;
368bbc02
CS
752 }
753 }
754
368bbc02 755 return 0;
368bbc02
CS
756}
757
/* Set the hostname to utsname->nodename.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 when
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
772
69aa6655
DE
/* A symlink to create under the container's /dev. */
struct dev_symlinks {
	const char *oldpath; /* link target */
	const char *name;    /* link name, relative to /dev */
};

/* Standard /dev symlinks pointing into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
784
785static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
786{
787 char path[MAXPATHLEN];
788 int ret,i;
09227be2 789 struct stat s;
69aa6655
DE
790
791
792 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
793 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 794 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
795 if (ret < 0 || ret >= MAXPATHLEN)
796 return -1;
09227be2
MW
797
798 /*
799 * Stat the path first. If we don't get an error
800 * accept it as is and don't try to create it
801 */
802 if (!stat(path, &s)) {
803 continue;
804 }
805
69aa6655 806 ret = symlink(d->oldpath, path);
09227be2 807
69aa6655 808 if (ret && errno != EEXIST) {
09227be2
MW
809 if ( errno == EROFS ) {
810 WARN("Warning: Read Only file system while creating %s", path);
811 } else {
812 SYSERROR("Error creating %s", path);
813 return -1;
814 }
69aa6655
DE
815 }
816 }
817 return 0;
818}
819
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) a new string "container_ttys=<name>" is
 * allocated; later calls grow the string and append " <name>". On
 * allocation failure false is returned and *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (*pp == NULL) {
		/* prefix + name + terminating '\0' */
		char *fresh = malloc(prefix_len + name_len + 1);

		if (fresh == NULL)
			return false;

		memcpy(fresh, prefix, prefix_len);
		memcpy(fresh + prefix_len, name, name_len + 1);
		*pp = fresh;
		return true;
	}

	cur_len = strlen(*pp);

	/* existing text + ' ' + name + terminating '\0' */
	grown = realloc(*pp, cur_len + name_len + 2);
	if (grown == NULL)
		return false;

	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);
	*pp = grown;
	return true;
}
842
9e1045e3 843static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 844{
9e1045e3 845 int i, ret;
393903d1
SH
846 const struct lxc_tty_info *tty_info = &conf->tty_info;
847 char *ttydir = conf->ttydir;
7c6ef2a2 848 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 849
e8bd4e43 850 if (!conf->rootfs.path)
bc9bd0e3
DL
851 return 0;
852
b0a33c1e 853 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 854 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
855
e8bd4e43 856 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 857 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
858 ERROR("pathname too long for ttys");
859 return -1;
860 }
9e1045e3 861
7c6ef2a2
SH
862 if (ttydir) {
863 /* create dev/lxc/tty%d" */
9e1045e3
CB
864 ret = snprintf(lxcpath, sizeof(lxcpath),
865 "/dev/%s/tty%d", ttydir, i + 1);
866 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
867 ERROR("pathname too long for ttys");
868 return -1;
869 }
9e1045e3 870
7c6ef2a2 871 ret = creat(lxcpath, 0660);
9e1045e3
CB
872 if (ret < 0 && errno != EEXIST) {
873 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
874 return -1;
875 }
4d44e274
SH
876 if (ret >= 0)
877 close(ret);
9e1045e3 878
7c6ef2a2 879 ret = unlink(path);
9e1045e3
CB
880 if (ret < 0 && errno != ENOENT) {
881 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
882 return -1;
883 }
b0a33c1e 884
9e1045e3
CB
885 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
886 if (ret < 0) {
887 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
888 pty_info->name, path);
889 continue;
890 }
9e1045e3
CB
891 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
892 path);
13954cce 893
9e1045e3
CB
894 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
895 ttydir, i + 1);
896 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
897 ERROR("tty pathname too long");
898 return -1;
899 }
9e1045e3 900
7c6ef2a2 901 ret = symlink(lxcpath, path);
9e1045e3
CB
902 if (ret < 0) {
903 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
904 path, lxcpath);
7c6ef2a2
SH
905 return -1;
906 }
907 } else {
9e1045e3
CB
908 /* If we populated /dev, then we need to create
909 * /dev/ttyN
910 */
911 ret = access(path, F_OK);
912 if (ret < 0) {
c6883f38 913 ret = creat(path, 0660);
9e1045e3
CB
914 if (ret < 0) {
915 SYSERROR("failed to create \"%s\"", path);
c6883f38 916 /* this isn't fatal, continue */
025ed0f3 917 } else {
c6883f38 918 close(ret);
025ed0f3 919 }
c6883f38 920 }
9e1045e3
CB
921
922 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
923 if (ret < 0) {
e8bd4e43 924 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
925 continue;
926 }
9e1045e3
CB
927
928 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
929 path);
393903d1 930 }
9e1045e3 931
e8bd4e43 932 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
933 ERROR("Error setting up container_ttys string");
934 return -1;
b0a33c1e 935 }
936 }
937
9e1045e3 938 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 939 return 0;
940}
941
/* Switch the root filesystem to @rootfs with pivot_root(".", ".") and
 * detach the old root.
 *
 * Directory descriptors for the old and the new root are opened up front so
 * that fchdir() can move between them after the pivot.
 *
 * Returns 0 on success, -1 on error.
 */
static int setup_rootfs_pivot_root(const char *rootfs)
{
	int ret = -1;
	int oldroot;
	int newroot = -1;

	oldroot = open("/", O_DIRECTORY | O_RDONLY);
	if (oldroot < 0) {
		SYSERROR("Error opening old-/ for fchdir");
		return -1;
	}

	newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
	if (newroot < 0) {
		SYSERROR("Error opening new-/ for fchdir");
		goto out;
	}

	/* change into new root fs */
	if (fchdir(newroot)) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		goto out;
	}

	/* pivot_root into our new root fs */
	if (pivot_root(".", ".")) {
		SYSERROR("pivot_root syscall failed");
		goto out;
	}

	/*
	 * At this point the old root is mounted on top of our new root.
	 * To unmount it we must not be chdir'd into it, so escape back
	 * to the old root.
	 */
	if (fchdir(oldroot) < 0) {
		SYSERROR("Error entering oldroot");
		goto out;
	}

	if (umount2(".", MNT_DETACH) < 0) {
		SYSERROR("Error detaching old root");
		goto out;
	}

	if (fchdir(newroot) < 0) {
		SYSERROR("Error re-entering newroot");
		goto out;
	}

	ret = 0;

out:
	close(oldroot);
	if (newroot != -1)
		close(newroot);

	if (ret == 0)
		DEBUG("pivot_root syscall to '%s' successful", rootfs);

	return ret;
}
1002
7133b912
CB
1003/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1004 * error, log it but don't fail yet.
91c3830e 1005 */
7133b912
CB
1006static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1007 const char *lxcpath)
91c3830e
SH
1008{
1009 int ret;
87da4ec3
SH
1010 size_t clen;
1011 char *path;
91c3830e 1012
7133b912 1013 INFO("Preparing \"/dev\"");
bc6928ff 1014
14221cbb 1015 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1016 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1017 path = alloca(clen);
bc6928ff 1018
ec50007f 1019 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1020 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1021 return -1;
bc6928ff 1022
87da4ec3 1023 if (!dir_exists(path)) {
7133b912
CB
1024 WARN("\"/dev\" directory does not exist. Proceeding without "
1025 "autodev being set up");
87da4ec3 1026 return 0;
bc6928ff 1027 }
87da4ec3 1028
1ec0e8e3 1029 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1030 rootfs->path ? rootfs->mount : NULL);
1031 if (ret < 0) {
1032 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1ec0e8e3 1033 return -1;
91c3830e 1034 }
7133b912 1035 INFO("Mounted tmpfs on \"%s\"", path);
87da4ec3 1036
ec50007f 1037 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
7133b912 1038 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1039 return -1;
87da4ec3 1040
7133b912 1041 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1042 * If not, then create it and exit if that fails...
1043 */
87da4ec3 1044 if (!dir_exists(path)) {
bc6928ff 1045 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
7133b912
CB
1046 if (ret < 0) {
1047 SYSERROR("Failed to create directory \"%s\"", path);
bc6928ff
MW
1048 return -1;
1049 }
91c3830e
SH
1050 }
1051
7133b912 1052 INFO("Prepared \"/dev\"");
91c3830e
SH
1053 return 0;
1054}
1055
/* Description of one device node that is created below "/dev". */
struct lxc_devs {
	const char *name; /* node name relative to "/dev" */
	mode_t mode;      /* file type and permission bits handed to mknod() */
	int maj;          /* device major number */
	int min;          /* device minor number */
};

/* Character devices that lxc_fill_autodev() creates (or bind mounts). */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1071
27245ff7 1072static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1073{
1074 int ret;
c6883f38
SH
1075 char path[MAXPATHLEN];
1076 int i;
3a32201c 1077 mode_t cmask;
c6883f38 1078
3999be0a
CB
1079 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1080 rootfs->path ? rootfs->mount : "");
1081 if (ret < 0 || ret >= MAXPATHLEN)
c6883f38 1082 return -1;
91c3830e 1083
0bbf8572
CB
1084 /* ignore, just don't try to fill in */
1085 if (!dir_exists(path))
9cb4d183
SH
1086 return 0;
1087
3999be0a
CB
1088 INFO("Populating \"/dev\"");
1089
3a32201c 1090 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1091 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1092 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1093
3999be0a
CB
1094 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1095 rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1096 if (ret < 0 || ret >= MAXPATHLEN)
1097 return -1;
0bbf8572 1098
c6883f38 1099 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1100 if (ret < 0) {
9cb4d183 1101 FILE *pathfile;
3999be0a 1102 char hostpath[MAXPATHLEN];
9cb4d183 1103
0bbf8572
CB
1104 if (errno == EEXIST) {
1105 DEBUG("\"%s\" device already existed", path);
1106 continue;
1107 }
1108
1109 /* Unprivileged containers cannot create devices, so
1110 * bind mount the device from the host.
1111 */
9cb4d183
SH
1112 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1113 if (ret < 0 || ret >= MAXPATHLEN)
1114 return -1;
3999be0a 1115
9cb4d183
SH
1116 pathfile = fopen(path, "wb");
1117 if (!pathfile) {
3999be0a 1118 SYSERROR("Failed to create file \"%s\"", path);
9cb4d183
SH
1119 return -1;
1120 }
1121 fclose(pathfile);
3999be0a
CB
1122
1123 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1124 rootfs->path ? rootfs->mount : NULL);
1125 if (ret < 0) {
1126 SYSERROR("Failed to bind mount \"%s\" from "
1127 "host into container",
1128 d->name);
9cb4d183
SH
1129 return -1;
1130 }
3999be0a
CB
1131 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1132 path);
0bbf8572 1133 } else {
3999be0a 1134 DEBUG("Created device node \"%s\"", path);
c6883f38
SH
1135 }
1136 }
3a32201c 1137 umask(cmask);
c6883f38 1138
3999be0a 1139 INFO("Populated \"/dev\"");
c6883f38
SH
1140 return 0;
1141}
1142
9aa76a17 1143static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1144{
9aa76a17 1145 int ret;
10bc1861 1146 struct lxc_storage *bdev;
91c3e281 1147 const struct lxc_rootfs *rootfs;
cc28d0b0 1148
91c3e281 1149 rootfs = &conf->rootfs;
a0f379bf 1150 if (!rootfs->path) {
91c3e281
CB
1151 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1152 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1153 return -1;
1154 }
c69bd12f 1155 return 0;
a0f379bf 1156 }
0ad19a3f 1157
12297168 1158 if (access(rootfs->mount, F_OK)) {
91c3e281 1159 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1160 rootfs->mount);
b1789442
DL
1161 return -1;
1162 }
1163
10bc1861 1164 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1165 if (!bdev) {
1166 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1167 rootfs->path, rootfs->mount,
1168 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1169 return -1;
9be53773 1170 }
9aa76a17
CB
1171
1172 ret = bdev->ops->mount(bdev);
10bc1861 1173 storage_put(bdev);
9aa76a17 1174 if (ret < 0) {
91c3e281
CB
1175 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1176 rootfs->path, rootfs->mount,
1177 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1178 return -1;
1179 }
0ad19a3f 1180
91c3e281
CB
1181 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1182 rootfs->path, rootfs->mount,
1183 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1184
ac778708
DL
1185 return 0;
1186}
1187
91e93c71
AV
1188int prepare_ramfs_root(char *root)
1189{
eab15c1e 1190 char buf[LXC_LINELEN], *p;
91e93c71
AV
1191 char nroot[PATH_MAX];
1192 FILE *f;
1193 int i;
1194 char *p2;
1195
1196 if (realpath(root, nroot) == NULL)
39c7b795 1197 return -errno;
91e93c71
AV
1198
1199 if (chdir("/") == -1)
39c7b795 1200 return -errno;
91e93c71
AV
1201
1202 /*
1203 * We could use here MS_MOVE, but in userns this mount is
1204 * locked and can't be moved.
1205 */
39c7b795 1206 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1207 SYSERROR("Failed to move %s into /", root);
39c7b795 1208 return -errno;
91e93c71
AV
1209 }
1210
39c7b795 1211 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1212 SYSERROR("Failed to make . rprivate");
39c7b795 1213 return -errno;
91e93c71
AV
1214 }
1215
1216 /*
1217 * The following code cleans up inhereted mounts which are not
1218 * required for CT.
1219 *
1220 * The mountinfo file shows not all mounts, if a few points have been
1221 * unmounted between read operations from the mountinfo. So we need to
1222 * read mountinfo a few times.
1223 *
1224 * This loop can be skipped if a container uses unserns, because all
1225 * inherited mounts are locked and we should live with all this trash.
1226 */
1227 while (1) {
1228 int progress = 0;
1229
1230 f = fopen("./proc/self/mountinfo", "r");
1231 if (!f) {
1232 SYSERROR("Unable to open /proc/self/mountinfo");
1233 return -1;
1234 }
eab15c1e 1235 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1236 for (p = buf, i=0; p && i < 4; i++)
1237 p = strchr(p+1, ' ');
1238 if (!p)
1239 continue;
1240 p2 = strchr(p+1, ' ');
1241 if (!p2)
1242 continue;
1243
1244 *p2 = '\0';
1245 *p = '.';
1246
1247 if (strcmp(p + 1, "/") == 0)
1248 continue;
1249 if (strcmp(p + 1, "/proc") == 0)
1250 continue;
1251
1252 if (umount2(p, MNT_DETACH) == 0)
1253 progress++;
1254 }
1255 fclose(f);
1256 if (!progress)
1257 break;
1258 }
1259
8bea9fae
PR
1260 /* This also can be skipped if a container uses unserns */
1261 umount2("./proc", MNT_DETACH);
91e93c71
AV
1262
1263 /* It is weird, but chdir("..") moves us in a new root */
1264 if (chdir("..") == -1) {
1265 SYSERROR("Unable to change working directory");
1266 return -1;
1267 }
1268
1269 if (chroot(".") == -1) {
1270 SYSERROR("Unable to chroot");
1271 return -1;
1272 }
1273
1274 return 0;
1275}
1276
74a3920a 1277static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1278{
39c7b795
CB
1279 if (!rootfs->path) {
1280 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1281 return 0;
39c7b795 1282 }
ac778708 1283
91e93c71 1284 if (detect_ramfs_rootfs()) {
39c7b795
CB
1285 DEBUG("detected that container is on ramfs");
1286 if (prepare_ramfs_root(rootfs->mount)) {
1287 ERROR("failed to prepare minimal ramfs root");
91e93c71 1288 return -1;
39c7b795
CB
1289 }
1290
1291 DEBUG("prepared ramfs root for container");
1292 return 0;
1293 }
1294
1295 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1296 ERROR("failed to pivot root");
25368b52 1297 return -1;
c69bd12f
DL
1298 }
1299
39c7b795 1300 DEBUG("finished pivot root");
25368b52 1301 return 0;
0ad19a3f 1302}
1303
/* Mount a new devpts instance on "/dev/pts" limited to @num_pts ptys and
 * make "/dev/ptmx" refer to "/dev/pts/ptmx".
 *
 * "/dev/ptmx" is first set up as a bind mount of "/dev/pts/ptmx"; if the
 * bind mount fails a symlink is created instead.
 *
 * Returns 0 on success (or when @num_pts is 0 and nothing is done), -1 on
 * failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	int fd, len;
	char devpts_mntopts[256];

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	len = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		       "newinstance,ptmxmode=0666,mode=0620,gid=5", num_pts);
	if (len < 0 || (size_t)len >= sizeof(devpts_mntopts))
		return -1;

	/* Unmount old devpts instance. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create mountpoint for devpts instance. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount new devpts instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Remove any pre-existing /dev/ptmx file. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Fall through and try to create a symlink. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* Remove the dummy /dev/ptmx file we created above. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1394
/* Apply the execution domain @persona via personality().
 *
 * A value of -1 means "leave the personality alone". When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H is not set) this is a no-op.
 *
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1411
3d7d929a
CB
1412static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1413 const struct lxc_console *console)
6e590161 1414{
63376d7d 1415 char path[MAXPATHLEN];
0728ebf4 1416 int ret, fd;
52e35957 1417
8b1b1210
CB
1418 if (console->path && !strcmp(console->path, "none"))
1419 return 0;
1420
7c6ef2a2 1421 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1422 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1423 return -1;
52e35957 1424
8b1b1210
CB
1425 /* When we are asked to setup a console we remove any previous
1426 * /dev/console bind-mounts.
1427 */
a7ba3c7f
CB
1428 if (file_exists(path)) {
1429 ret = lxc_unstack_mountpoint(path, false);
1430 if (ret < 0) {
8b1b1210 1431 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1432 return -ret;
1433 } else {
1434 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1435 }
953fe44f 1436
a7ba3c7f
CB
1437 ret = unlink(path);
1438 if (ret < 0) {
1439 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1440 return -errno;
1441 }
8b1b1210
CB
1442 }
1443
1444 /* For unprivileged containers autodev or automounts will already have
1445 * taken care of creating /dev/console.
1446 */
0728ebf4
TA
1447 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1448 if (fd < 0) {
1449 if (errno != EEXIST) {
1450 SYSERROR("failed to create console");
3d7d929a 1451 return -errno;
0728ebf4
TA
1452 }
1453 } else {
1454 close(fd);
52e35957
DL
1455 }
1456
0728ebf4 1457 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1458 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1459 return -errno;
63376d7d 1460 }
13954cce 1461
3d7d929a 1462 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1463 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1464 return -1;
1465 }
1466
3d7d929a 1467 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1468 return 0;
1469}
1470
3d7d929a
CB
1471static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1472 const struct lxc_console *console,
1473 char *ttydir)
7c6ef2a2 1474{
7c6ef2a2 1475 int ret;
3d7d929a 1476 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1477
1478 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1479 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1480 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1481 return -1;
3d7d929a 1482
7c6ef2a2
SH
1483 ret = mkdir(path, 0755);
1484 if (ret && errno != EEXIST) {
959aee9c 1485 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1486 return -errno;
7c6ef2a2 1487 }
4742cd9a 1488 DEBUG("Created directory for console and tty devices at \"%s\"", path);
7c6ef2a2 1489
3d7d929a
CB
1490 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1491 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1492 return -1;
1493
7c6ef2a2 1494 ret = creat(lxcpath, 0660);
3d7d929a 1495 if (ret == -1 && errno != EEXIST) {
959aee9c 1496 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1497 return -errno;
7c6ef2a2 1498 }
4d44e274
SH
1499 if (ret >= 0)
1500 close(ret);
7c6ef2a2 1501
2a12fefd
CB
1502 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1503 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1504 return -1;
2a12fefd
CB
1505
1506 /* When we are asked to setup a console we remove any previous
1507 * /dev/console bind-mounts.
1508 */
1509 if (console->path && !strcmp(console->path, "none")) {
1510 struct stat st;
1511 ret = stat(path, &st);
1512 if (ret < 0) {
1513 if (errno == ENOENT)
1514 return 0;
1515 SYSERROR("failed stat() \"%s\"", path);
1516 return -errno;
1517 }
1518
1519 /* /dev/console must be character device with major number 5 and
1520 * minor number 1. If not, give benefit of the doubt and assume
1521 * the user has mounted something else right there on purpose.
1522 */
1523 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1524 return 0;
1525
1526 /* In case the user requested a bind-mount for /dev/console and
1527 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1528 * /dev/<ttydir/console.
1529 * Note, we only move the uppermost mount and clear all other
1530 * mounts underneath for safety.
1531 * If it is a character device created via mknod() we simply
1532 * rename it.
2a12fefd
CB
1533 */
1534 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1535 if (ret < 0) {
1536 if (errno != EINVAL) {
1537 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1538 return -errno;
1539 }
1540 /* path was not a mountpoint */
1541 ret = rename(path, lxcpath);
1542 if (ret < 0) {
1543 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1544 return -errno;
1545 }
1546 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1547 } else {
1548 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1549 }
a7ba3c7f
CB
1550
1551 /* Clear all remaining bind-mounts. */
1552 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1553 if (ret < 0) {
a7ba3c7f
CB
1554 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1555 return -ret;
1556 } else {
1557 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1558 }
1559 } else {
1560 if (file_exists(path)) {
1561 ret = lxc_unstack_mountpoint(path, false);
1562 if (ret < 0) {
2a12fefd 1563 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1564 return -ret;
1565 } else {
1566 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1567 }
2a12fefd
CB
1568 }
1569
1570 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1571 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1572 return -1;
1573 }
1574 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1575 }
1576
2a12fefd 1577 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1578 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1579 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1580 return -1;
3d7d929a 1581
2a12fefd
CB
1582 ret = unlink(path);
1583 if (ret && errno != ENOENT) {
1584 SYSERROR("error unlinking %s", path);
1585 return -errno;
1586 }
1587
7c6ef2a2 1588 ret = symlink(lxcpath, path);
3d7d929a
CB
1589 if (ret < 0) {
1590 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1591 return -1;
1592 }
1593
3d7d929a 1594 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1595 return 0;
1596}
1597
3d7d929a
CB
1598static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1599 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1600{
3d7d929a
CB
1601 /* We don't have a rootfs, /dev/console will be shared. */
1602 if (!rootfs->path) {
1603 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1604 return 0;
3d7d929a
CB
1605 }
1606
7c6ef2a2 1607 if (!ttydir)
3d7d929a 1608 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1609
3d7d929a 1610 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1611}
1612
998ac676
RT
1613static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1614{
1615 struct mount_opt *mo;
1616
1617 /* If opt is found in mount_opt, set or clear flags.
1618 * Otherwise append it to data. */
1619
1620 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1621 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1622 if (mo->clear)
1623 *flags &= ~mo->flag;
1624 else
1625 *flags |= mo->flag;
1626 return;
1627 }
1628 }
1629
1630 if (strlen(*data))
1631 strcat(*data, ",");
1632 strcat(*data, opt);
1633}
1634
/* Split the comma separated option string @mntopts into mount flags and
 * filesystem specific data.
 *
 * *@mntflags receives the flags collected by parse_mntopt(). *@mntdata
 * receives a newly allocated string holding the remaining options, or NULL
 * when there are none; the caller frees it.
 *
 * Returns 0 on success (including a NULL @mntopts), -1 on allocation failure.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *token;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (extra == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	token = strtok_r(copy, ",", &state);
	while (token != NULL) {
		parse_mntopt(token, mntflags, &extra);
		token = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1673
/* Terminate @word in place at its first space or tab (if any). */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1680
/*
 * Skip @nfields space or tab separated fields in @src.
 *
 * Every single space or tab counts as one separator. Returns a pointer just
 * past the last separator skipped, or a pointer to the terminating '\0' if
 * the string ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		skipped++;
	}

	return cur;
}
1698
/* Perform one mount and, for bind or remount requests, a follow-up remount.
 *
 * The first mount goes through safe_mount() with MS_REMOUNT masked out. If
 * @mountflags contains MS_REMOUNT or MS_BIND the target is then remounted
 * with MS_REMOUNT added. When built with HAVE_STATVFS and statvfs() on
 * @fsname succeeds, the nosuid/nodev/ro/noexec flags it reports are added
 * to the remount (nodev only when @dev is 0), and a plain bind mount skips
 * the remount if that adds nothing new.
 *
 * Returns 0 on success, or on failure when @optional is set; -1 otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	unsigned long rdonly_req;

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
		       rootfs) < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
		return -1;
	}

	/* Only bind mounts and remounts need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto mounted;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
	      "options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rdonly_req = mountflags & MS_RDONLY;

#ifdef HAVE_STATVFS
	{
		struct statvfs sb;

		if (statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rdonly_req;

			/* Carry over the restrictions of the source. */
			required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
			if (!dev)
				required_flags |= sb.f_flag & MS_NODEV;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", fsname, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) &&
			    rdonly_req == 0) {
				DEBUG("Mountflags already were %lu, "
				      "skipping remount", mountflags);
				goto mounted;
			}

			mountflags |= required_flags;
		}
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" "
			     "(optional): %s", fsname, target,
			     strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
		return -1;
	}

mounted:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1789
/* Remove "optional", "create=dir", and "create=file" from mntopt.
 *
 * mntent->mnt_opts is rewritten in place as a comma separated list without
 * those options. Only whole options are removed: an option that merely
 * contains one of the names as a substring is kept untouched. Every
 * occurrence is removed, empty list elements are dropped, and no leading or
 * trailing comma is left behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = { "create=dir", "create=file", "optional" };
	char *rd, *wr;

	if (!mntent->mnt_opts)
		return;

	/* rd walks the original options, wr trails behind writing the ones
	 * that are kept; wr never overtakes rd.
	 */
	rd = wr = mntent->mnt_opts;
	while (*rd) {
		size_t i;
		size_t len = strcspn(rd, ",");
		int drop = (len == 0);

		for (i = 0; !drop && i < sizeof(culled) / sizeof(culled[0]); i++)
			drop = strlen(culled[i]) == len &&
			       strncmp(rd, culled[i], len) == 0;

		if (!drop) {
			if (wr != mntent->mnt_opts)
				*wr++ = ',';
			memmove(wr, rd, len);
			wr += len;
		}

		rd += len;
		if (*rd == ',')
			rd++;
	}
	*wr = '\0';
}
1813
/* Create the mount target @path if the mount entry asks for it.
 *
 * For "overlay" and "aufs" entries the respective *_mkdir() helper runs
 * first. With the "create=dir" option @path is created as a directory. With
 * "create=file", and only if @path is not accessible yet, its parent
 * directory and an empty file at @path are created.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;

	if (!strncmp(mntent->mnt_type, "overlay", 7))
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (!strncmp(mntent->mnt_type, "aufs", 4))
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		int fd;
		char *p1, *p2;

		/* dirname() may modify its argument, so hand it a copy. */
		p1 = strdup(path);
		if (!p1)
			return -1;

		p2 = dirname(p1);

		/* Inspect errno and log the parent directory before the
		 * copy is released: p2 may point into p1, and the check
		 * should not depend on free() leaving errno alone.
		 */
		ret = mkdir_p(p2, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", p2);
			free(p1);
			return -1;
		}
		free(p1);

		fd = open(path, O_CREAT, 0644);
		if (fd < 0) {
			SYSERROR("Failed to create file \"%s\"", path);
			return -1;
		}
		close(fd);
	}

	return 0;
}
1862
ec50007f
CB
1863/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1864 * without a rootfs. */
db4aba38 1865static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
1866 const char *path,
1867 const struct lxc_rootfs *rootfs,
1868 const char *lxc_name,
1869 const char *lxc_path)
4d5b72a1 1870{
d8b712bc 1871 int ret;
4d5b72a1
NC
1872 unsigned long mntflags;
1873 char *mntdata;
d8b712bc 1874 bool dev, optional;
ec50007f 1875 char *rootfs_path = NULL;
d8b712bc
CB
1876
1877 optional = hasmntopt(mntent, "optional") != NULL;
1878 dev = hasmntopt(mntent, "dev") != NULL;
1879
ec50007f
CB
1880 if (rootfs && rootfs->path)
1881 rootfs_path = rootfs->mount;
1882
d8b712bc
CB
1883 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1884 lxc_path);
1885 if (ret < 0) {
1886 if (optional)
1887 return 0;
608e3567 1888
d8b712bc
CB
1889 return -1;
1890 }
4e4ca161
SH
1891 cull_mntent_opt(mntent);
1892
d8b712bc
CB
1893 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1894 if (ret < 0)
a17b1e65 1895 return -1;
a17b1e65 1896
6e46cc0d 1897 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1898 mntdata, optional, dev, rootfs_path);
68c152ef 1899
911324ef 1900 free(mntdata);
911324ef
DL
1901 return ret;
1902}
1903
/* Mount an entry for a container that has no rootfs.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to
 * the mount directory when it is missing.
 *
 * Returns -1 if the path does not fit, else the result of
 * mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1921
4e4ca161 1922static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 1923 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
1924 const char *lxc_name,
1925 const char *lxc_path)
911324ef 1926{
bdd2b34c 1927 int offset;
013bd428 1928 char *aux;
67e571de 1929 const char *lxcpath;
bdd2b34c
CB
1930 char path[MAXPATHLEN];
1931 int ret = 0;
0ad19a3f 1932
593e8478 1933 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 1934 if (!lxcpath)
2a59a681 1935 return -1;
2a59a681 1936
bdd2b34c
CB
1937 /* If rootfs->path is a blockdev path, allow container fstab to use
1938 * <lxcpath>/<name>/rootfs" as the target prefix.
1939 */
1940 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1941 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
1942 goto skipvarlib;
1943
1944 aux = strstr(mntent->mnt_dir, path);
1945 if (aux) {
1946 offset = strlen(path);
1947 goto skipabs;
1948 }
1949
1950skipvarlib:
013bd428
DL
1951 aux = strstr(mntent->mnt_dir, rootfs->path);
1952 if (!aux) {
bdd2b34c 1953 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 1954 return ret;
013bd428 1955 }
80a881b2
SH
1956 offset = strlen(rootfs->path);
1957
1958skipabs:
bdd2b34c
CB
1959 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
1960 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 1961 return -1;
a17b1e65 1962
0a2dddd4 1963 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 1964}
d330fe7b 1965
4e4ca161 1966static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
1967 const struct lxc_rootfs *rootfs,
1968 const char *lxc_name,
1969 const char *lxc_path)
911324ef
DL
1970{
1971 char path[MAXPATHLEN];
911324ef 1972 int ret;
d330fe7b 1973
34cfffb3 1974 /* relative to root mount point */
6e46cc0d 1975 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 1976 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
1977 ERROR("path name too long");
1978 return -1;
1979 }
911324ef 1980
0a2dddd4 1981 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
1982}
1983
06749971
CB
1984/* This logs a NOTICE() when a user specifies mounts that would conflict with
1985 * devices liblxc sets up automatically.
1986 */
1987static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
1988 const char *dest)
1989{
1990 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
1991 bool needs_warning = false;
1992
1993 clean_mnt_fsname = lxc_deslashify(src);
1994 if (!clean_mnt_fsname)
1995 return;
1996
1997 clean_mnt_dir = lxc_deslashify(dest);
1998 if (!clean_mnt_dir) {
1999 free(clean_mnt_fsname);
2000 return;
2001 }
2002
2003 tmp = clean_mnt_dir;
2004 if (*tmp == '/')
2005 tmp++;
2006
2007 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2008 free(clean_mnt_dir);
2009 free(clean_mnt_fsname);
2010 return;
2011 }
2012
2013 if (!conf->autodev && !conf->pts && !conf->tty &&
2014 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2015 free(clean_mnt_dir);
2016 free(clean_mnt_fsname);
2017 return;
2018 }
2019
2020 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2021 needs_warning = true;
2022 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2023 needs_warning = true;
2024 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2025 needs_warning = true;
2026 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2027 needs_warning = true;
2028 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2029 needs_warning = true;
2030 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2031 needs_warning = true;
2032 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2033 needs_warning = true;
2034 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2035 needs_warning = true;
2036 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2037 needs_warning = true;
2038 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2039 needs_warning = true;
2040 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2041 needs_warning = true;
2042
2043 if (needs_warning)
2044 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2045 "automatic device setup under \"/dev\"",
2046 clean_mnt_fsname, clean_mnt_dir);
2047
2048 free(clean_mnt_dir);
2049 free(clean_mnt_fsname);
2050}
2051
2052static int mount_file_entries(const struct lxc_conf *conf,
2053 const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2054 const char *lxc_name, const char *lxc_path)
911324ef 2055{
aaf901be
AM
2056 struct mntent mntent;
2057 char buf[4096];
911324ef 2058 int ret = -1;
e76b8764 2059
aaf901be 2060 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
06749971
CB
2061 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2062
1ae3c19f
CB
2063 if (!rootfs->path)
2064 ret = mount_entry_on_systemfs(&mntent);
2065 else if (mntent.mnt_dir[0] != '/')
2066 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2067 lxc_name, lxc_path);
2068 else
2069 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2070 lxc_name, lxc_path);
2071 if (ret < 0)
2072 return -1;
0ad19a3f 2073 }
2074 ret = 0;
cd54d859 2075
1ae3c19f 2076 INFO("Set up mount entries");
e7938e9e
MN
2077 return ret;
2078}
2079
06749971
CB
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do and counts as success.
 * Returns the result of mount_file_entries(), or -1 if @fstab cannot be
 * opened.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int rc;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	/* Streams opened with setmntent() are closed with endmntent(). */
	endmntent(entries);

	return rc;
}
2103
5ef5c9a3 2104FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2105{
5ef5c9a3 2106 int ret;
e7938e9e 2107 char *mount_entry;
5ef5c9a3 2108 struct lxc_list *iterator;
6bd04140 2109 FILE *f;
5ef5c9a3
CB
2110 int fd = -1;
2111
2112 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2113 if (fd < 0) {
2114 if (errno != ENOSYS)
2115 return NULL;
6bd04140
CB
2116 f = tmpfile();
2117 TRACE("Created temporary mount file");
5ef5c9a3 2118 } else {
6bd04140
CB
2119 f = fdopen(fd, "r+");
2120 TRACE("Created anonymous mount file");
5ef5c9a3 2121 }
e7938e9e 2122
6bd04140
CB
2123 if (!f) {
2124 SYSERROR("Could not create mount file");
5ef5c9a3
CB
2125 if (fd != -1)
2126 close(fd);
9fc7f8c0 2127 return NULL;
e7938e9e
MN
2128 }
2129
2130 lxc_list_for_each(iterator, mount) {
2131 mount_entry = iterator->elem;
6bd04140 2132 ret = fprintf(f, "%s\n", mount_entry);
5ef5c9a3 2133 if (ret < strlen(mount_entry))
6bd04140 2134 WARN("Could not write mount entry to mount file");
5ef5c9a3
CB
2135 }
2136
6bd04140
CB
2137 ret = fseek(f, 0, SEEK_SET);
2138 if (ret < 0) {
2139 SYSERROR("Failed to seek mount file");
2140 fclose(f);
5ef5c9a3 2141 return NULL;
e7938e9e
MN
2142 }
2143
6bd04140 2144 return f;
9fc7f8c0
TA
2145}
2146
06749971
CB
/* Mount the entries of the list @mount by first dumping them into an
 * anonymous fstab-style file and then processing that file.
 *
 * Returns the result of mount_file_entries(), or -1 if the anonymous file
 * could not be created.
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2164
bab88e68
CS
2165static int parse_cap(const char *cap)
2166{
2167 char *ptr = NULL;
84760c11 2168 size_t i;
2169 int capid = -1;
bab88e68 2170
7035407c
DE
2171 if (!strcmp(cap, "none"))
2172 return -2;
2173
bab88e68
CS
2174 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2175
2176 if (strcmp(cap, caps_opt[i].name))
2177 continue;
2178
2179 capid = caps_opt[i].value;
2180 break;
2181 }
2182
2183 if (capid < 0) {
2184 /* try to see if it's numeric, so the user may specify
2185 * capabilities that the running kernel knows about but
2186 * we don't */
2187 errno = 0;
2188 capid = strtol(cap, &ptr, 10);
2189 if (!ptr || *ptr != '\0' || errno != 0)
2190 /* not a valid number */
2191 capid = -1;
2192 else if (capid > lxc_caps_last_cap())
2193 /* we have a number but it's not a valid
2194 * capability */
2195 capid = -1;
2196 }
2197
2198 return capid;
2199}
2200
0769b82a
CS
2201int in_caplist(int cap, struct lxc_list *caps)
2202{
2203 struct lxc_list *iterator;
2204 int capid;
2205
2206 lxc_list_for_each(iterator, caps) {
2207 capid = parse_cap(iterator->elem);
2208 if (capid == cap)
2209 return 1;
2210 }
2211
2212 return 0;
2213}
2214
81810dd1
DL
2215static int setup_caps(struct lxc_list *caps)
2216{
2217 struct lxc_list *iterator;
2218 char *drop_entry;
bab88e68 2219 int capid;
81810dd1
DL
2220
2221 lxc_list_for_each(iterator, caps) {
2222
2223 drop_entry = iterator->elem;
2224
bab88e68 2225 capid = parse_cap(drop_entry);
d55bc1ad 2226
81810dd1 2227 if (capid < 0) {
1e11be34
DL
2228 ERROR("unknown capability %s", drop_entry);
2229 return -1;
81810dd1
DL
2230 }
2231
2232 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2233
2234 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2235 SYSERROR("failed to remove %s capability", drop_entry);
2236 return -1;
2237 }
81810dd1
DL
2238
2239 }
2240
1fb86a7c
SH
2241 DEBUG("capabilities have been setup");
2242
2243 return 0;
2244}
2245
2246static int dropcaps_except(struct lxc_list *caps)
2247{
2248 struct lxc_list *iterator;
2249 char *keep_entry;
1fb86a7c
SH
2250 int i, capid;
2251 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2252 INFO("found %d capabilities", numcaps);
1fb86a7c 2253
2caf9a97
SH
2254 if (numcaps <= 0 || numcaps > 200)
2255 return -1;
2256
1a0e70ac 2257 /* caplist[i] is 1 if we keep capability i */
1fb86a7c
SH
2258 int *caplist = alloca(numcaps * sizeof(int));
2259 memset(caplist, 0, numcaps * sizeof(int));
2260
2261 lxc_list_for_each(iterator, caps) {
2262
2263 keep_entry = iterator->elem;
2264
bab88e68 2265 capid = parse_cap(keep_entry);
1fb86a7c 2266
7035407c
DE
2267 if (capid == -2)
2268 continue;
2269
1fb86a7c
SH
2270 if (capid < 0) {
2271 ERROR("unknown capability %s", keep_entry);
2272 return -1;
2273 }
2274
8255688a 2275 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2276
2277 caplist[capid] = 1;
2278 }
2279 for (i=0; i<numcaps; i++) {
2280 if (caplist[i])
2281 continue;
2282 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2283 SYSERROR("failed to remove capability %d", i);
2284 return -1;
2285 }
1fb86a7c
SH
2286 }
2287
2288 DEBUG("capabilities have been setup");
81810dd1
DL
2289
2290 return 0;
2291}
2292
c6d09e15
WB
2293static int parse_resource(const char *res) {
2294 size_t i;
2295 int resid = -1;
2296
2297 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2298 if (strcmp(res, limit_opt[i].name) == 0)
2299 return limit_opt[i].value;
2300 }
2301
2302 /* try to see if it's numeric, so the user may specify
2303 * resources that the running kernel knows about but
2304 * we don't */
2305 if (lxc_safe_int(res, &resid) == 0)
2306 return resid;
2307 return -1;
2308}
2309
2310int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2311 struct lxc_list *it;
2312 struct lxc_limit *lim;
2313 int resid;
2314
2315 lxc_list_for_each(it, limits) {
2316 lim = it->elem;
2317
2318 resid = parse_resource(lim->resource);
2319 if (resid < 0) {
2320 ERROR("unknown resource %s", lim->resource);
2321 return -1;
2322 }
2323
2324 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2325 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2326 return -1;
2327 }
2328 }
2329 return 0;
2330}
2331
ae9242c8
SH
2332static char *default_rootfs_mount = LXCROOTFSMOUNT;
2333
7b379ab3 2334struct lxc_conf *lxc_conf_init(void)
089cd8b8 2335{
7b379ab3 2336 struct lxc_conf *new;
26ddeedd 2337 int i;
7b379ab3 2338
13277ec4 2339 new = malloc(sizeof(*new));
7b379ab3 2340 if (!new) {
13277ec4 2341 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2342 return NULL;
2343 }
2344 memset(new, 0, sizeof(*new));
2345
4b73005c 2346 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2347 new->personality = -1;
124fa0a8 2348 new->autodev = 1;
596a818d
DE
2349 new->console.log_path = NULL;
2350 new->console.log_fd = -1;
28a4b0e5 2351 new->console.path = NULL;
63376d7d 2352 new->console.peer = -1;
b5159817
DE
2353 new->console.peerpty.busy = -1;
2354 new->console.peerpty.master = -1;
2355 new->console.peerpty.slave = -1;
63376d7d
DL
2356 new->console.master = -1;
2357 new->console.slave = -1;
2358 new->console.name[0] = '\0';
d2e30e99 2359 new->maincmd_fd = -1;
76a26f55 2360 new->nbd_idx = -1;
54c30e29 2361 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2362 if (!new->rootfs.mount) {
13277ec4 2363 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2364 free(new);
2365 return NULL;
2366 }
858377e4 2367 new->logfd = -1;
7b379ab3
MN
2368 lxc_list_init(&new->cgroup);
2369 lxc_list_init(&new->network);
2370 lxc_list_init(&new->mount_list);
81810dd1 2371 lxc_list_init(&new->caps);
1fb86a7c 2372 lxc_list_init(&new->keepcaps);
f6d3e3e4 2373 lxc_list_init(&new->id_map);
f979ac15 2374 lxc_list_init(&new->includes);
4184c3e1 2375 lxc_list_init(&new->aliens);
7c661726 2376 lxc_list_init(&new->environment);
c6d09e15 2377 lxc_list_init(&new->limits);
26ddeedd
SH
2378 for (i=0; i<NUM_LXC_HOOKS; i++)
2379 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2380 lxc_list_init(&new->groups);
fe4de9a6
DE
2381 new->lsm_aa_profile = NULL;
2382 new->lsm_se_context = NULL;
5112cd70 2383 new->tmp_umount_proc = 0;
7b379ab3 2384
9f30a190
MM
2385 for (i = 0; i < LXC_NS_MAX; i++)
2386 new->inherit_ns_fd[i] = -1;
2387
72bb04e4
PT
2388 /* if running in a new user namespace, init and COMMAND
2389 * default to running as UID/GID 0 when using lxc-execute */
2390 new->init_uid = 0;
2391 new->init_gid = 0;
43654d34 2392 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
72bb04e4 2393
7b379ab3 2394 return new;
089cd8b8
DL
2395}
2396
251d0d2a
DE
2397static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2398 size_t buf_size)
f6d3e3e4 2399{
29053180
CB
2400 char path[MAXPATHLEN];
2401 int fd, ret;
f6d3e3e4 2402
29053180
CB
2403 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2404 idtype == ID_TYPE_UID ? 'u' : 'g');
2405 if (ret < 0 || ret >= MAXPATHLEN) {
2406 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
2407 return -E2BIG;
2408 }
29053180
CB
2409
2410 fd = open(path, O_WRONLY);
2411 if (fd < 0) {
2412 SYSERROR("failed to open \"%s\"", path);
2413 return -1;
f6d3e3e4 2414 }
29053180
CB
2415
2416 errno = 0;
2417 ret = lxc_write_nointr(fd, buf, buf_size);
2418 if (ret != buf_size) {
2419 SYSERROR("failed to write %cid mapping to \"%s\"",
2420 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2421 close(fd);
2422 return -1;
2423 }
2424 close(fd);
2425
2426 return 0;
f6d3e3e4
SH
2427}
2428
6e50e704
CB
2429/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2430 *
2431 * @return 1 if functional binary was found
2432 * @return 0 if binary exists but is lacking privilege
2433 * @return -ENOENT if binary does not exist
2434 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2435 *
2436 */
df6a2945
CB
2437static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2438{
2439 char *path;
2440 int ret;
2441 struct stat st;
2442 int fret = 0;
2443
6e50e704
CB
2444 if (cap != CAP_SETUID && cap != CAP_SETGID)
2445 return -EINVAL;
2446
df6a2945
CB
2447 path = on_path(binary, NULL);
2448 if (!path)
2449 return -ENOENT;
2450
2451 ret = stat(path, &st);
2452 if (ret < 0) {
2453 fret = -errno;
2454 goto cleanup;
2455 }
2456
2457 /* Check if the binary is setuid. */
2458 if (st.st_mode & S_ISUID) {
2459 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2460 fret = 1;
2461 goto cleanup;
2462 }
2463
69924fff 2464 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
2465 /* Check if it has the CAP_SETUID capability. */
2466 if ((cap & CAP_SETUID) &&
2467 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2468 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2469 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2470 "and CAP_PERMITTED sets.", path);
2471 fret = 1;
2472 goto cleanup;
2473 }
2474
2475 /* Check if it has the CAP_SETGID capability. */
2476 if ((cap & CAP_SETGID) &&
2477 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2478 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2479 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2480 "and CAP_PERMITTED sets.", path);
2481 fret = 1;
2482 goto cleanup;
2483 }
d6018f88 2484 #else
69924fff
CB
2485 /* If we cannot check for file capabilities we need to give the benefit
2486 * of the doubt. Otherwise we might fail even though all the necessary
2487 * file capabilities are set.
2488 */
d6018f88
CB
2489 DEBUG("Cannot check for file capabilites as full capability support is "
2490 "missing. Manual intervention needed.");
2491 fret = 1;
df6a2945
CB
2492 #endif
2493
2494cleanup:
2495 free(path);
2496 return fret;
2497}
2498
986ef930
CB
/* Execute the command line passed in @args through "/bin/sh -c".
 *
 * execl() does not return on success, so reaching the return statement
 * means the exec failed; -1 is returned in that case.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
2504
f6d3e3e4
SH
2505int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2506{
f6d3e3e4 2507 struct id_map *map;
4bc3b759 2508 struct lxc_list *iterator;
251d0d2a 2509 enum idtype type;
986ef930 2510 char u_or_g;
4bc3b759 2511 char *pos;
99d43365 2512 int fill, left;
986ef930
CB
2513 char cmd_output[MAXPATHLEN];
2514 /* strlen("new@idmap") = 9
2515 * +
2516 * strlen(" ") = 1
2517 * +
2518 * LXC_NUMSTRLEN64
2519 * +
2520 * strlen(" ") = 1
2521 *
2522 * We add some additional space to make sure that we really have
2523 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2524 */
2525 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2526 int ret = 0, uidmap = 0, gidmap = 0;
2527 bool use_shadow = false, had_entry = false;
df6a2945
CB
2528
2529 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2530 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
2531 * will protected it by preventing another user from being handed the
2532 * range by shadow.
2533 */
df6a2945 2534 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
2535 if (uidmap == -ENOENT)
2536 WARN("newuidmap binary is missing");
2537 else if (!uidmap)
2538 WARN("newuidmap is lacking necessary privileges");
2539
df6a2945 2540 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
2541 if (gidmap == -ENOENT)
2542 WARN("newgidmap binary is missing");
2543 else if (!gidmap)
2544 WARN("newgidmap is lacking necessary privileges");
2545
df6a2945
CB
2546 if (uidmap > 0 && gidmap > 0) {
2547 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 2548 use_shadow = true;
df6a2945 2549 } else {
99d43365
CB
2550 /* In case unprivileged users run application containers via
2551 * execute() or a start*() there are valid cases where they may
2552 * only want to map their own {g,u}id. Let's not block them from
2553 * doing so by requiring geteuid() == 0.
2554 */
2555 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2556 "write directly with euid %d.", geteuid());
0e6e3a41 2557 }
251d0d2a 2558
986ef930
CB
2559 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2560 type++, u_or_g = 'g') {
2561 pos = mapbuf;
2562
0e6e3a41 2563 if (use_shadow)
986ef930 2564 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 2565
cf3ef16d 2566 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
2567 /* The kernel only takes <= 4k for writes to
2568 * /proc/<nr>/[ug]id_map
2569 */
251d0d2a 2570 map = iterator->elem;
cf3ef16d
SH
2571 if (map->idtype != type)
2572 continue;
2573
4bc3b759
CB
2574 had_entry = true;
2575
986ef930 2576 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 2577 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
2578 use_shadow ? " " : "", map->nsid,
2579 map->hostid, map->range,
0e6e3a41 2580 use_shadow ? "" : "\n");
cf3ef16d 2581 if (fill <= 0 || fill >= left)
4bc3b759
CB
2582 SYSERROR("Too many {g,u}id mappings defined.");
2583
cf3ef16d 2584 pos += fill;
251d0d2a 2585 }
cf3ef16d 2586 if (!had_entry)
4f7521b4 2587 continue;
cf3ef16d 2588
986ef930
CB
2589 /* Try to catch the ouput of new{g,u}idmap to make debugging
2590 * easier.
2591 */
2592 if (use_shadow) {
2593 ret = run_command(cmd_output, sizeof(cmd_output),
2594 lxc_map_ids_exec_wrapper,
2595 (void *)mapbuf);
2596 if (ret < 0) {
2597 ERROR("new%cidmap failed to write mapping: %s",
2598 u_or_g, cmd_output);
2599 return -1;
2600 }
d1838f34 2601 } else {
986ef930
CB
2602 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
2603 if (ret < 0)
2604 return -1;
d1838f34 2605 }
986ef930
CB
2606
2607 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 2608 }
251d0d2a 2609
986ef930 2610 return 0;
f6d3e3e4
SH
2611}
2612
cf3ef16d 2613/*
7b50c609
TS
2614 * return the host uid/gid to which the container root is mapped in
2615 * *val.
0b3a6504 2616 * Return true if id was found, false otherwise.
cf3ef16d 2617 */
2a9a80cb 2618bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 2619 unsigned long *val)
cf3ef16d
SH
2620{
2621 struct lxc_list *it;
2622 struct id_map *map;
2623
2624 lxc_list_for_each(it, &conf->id_map) {
2625 map = it->elem;
7b50c609 2626 if (map->idtype != idtype)
cf3ef16d
SH
2627 continue;
2628 if (map->nsid != 0)
2629 continue;
2a9a80cb
SH
2630 *val = map->hostid;
2631 return true;
cf3ef16d 2632 }
2a9a80cb 2633 return false;
cf3ef16d
SH
2634}
2635
2133f58c 2636int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2637{
2638 struct lxc_list *it;
2639 struct id_map *map;
2640 lxc_list_for_each(it, &conf->id_map) {
2641 map = it->elem;
2133f58c 2642 if (map->idtype != idtype)
cf3ef16d
SH
2643 continue;
2644 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 2645 return (id - map->hostid) + map->nsid;
cf3ef16d 2646 }
57d116ab 2647 return -1;
cf3ef16d
SH
2648}
2649
339efad9 2650int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
2651{
2652 struct lxc_list *it;
2653 struct id_map *map;
2133f58c 2654 unsigned int freeid = 0;
cf3ef16d
SH
2655again:
2656 lxc_list_for_each(it, &conf->id_map) {
2657 map = it->elem;
2133f58c 2658 if (map->idtype != idtype)
cf3ef16d
SH
2659 continue;
2660 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2661 freeid = map->nsid + map->range;
2662 goto again;
2663 }
2664 }
2665 return freeid;
2666}
2667
5e4a62bf 2668int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 2669{
5e4a62bf 2670 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 2671 int i, ret;
b0a33c1e 2672
5e4a62bf
DL
2673 /* no tty in the configuration */
2674 if (!conf->tty)
b0a33c1e 2675 return 0;
2676
9e1045e3 2677 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 2678 if (!tty_info->pty_info) {
9e1045e3
CB
2679 SYSERROR("failed to allocate struct *pty_info");
2680 return -ENOMEM;
b0a33c1e 2681 }
2682
985d15b1 2683 for (i = 0; i < conf->tty; i++) {
b0a33c1e 2684 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
2685
025ed0f3
SH
2686 process_lock();
2687 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 2688 pty_info->name, NULL, NULL);
025ed0f3
SH
2689 process_unlock();
2690 if (ret) {
9e1045e3 2691 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
2692 tty_info->nbtty = i;
2693 lxc_delete_tty(tty_info);
9e1045e3 2694 return -ENOTTY;
b0a33c1e 2695 }
2696
9e1045e3 2697 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
2698 pty_info->name, pty_info->master, pty_info->slave);
2699
3ec1648d 2700 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
2701 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
2702 if (ret < 0)
2703 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
2704 "pty device \"%s\": %s",
2705 pty_info->master, pty_info->name, strerror(errno));
2706
2707 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
2708 if (ret < 0)
2709 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
2710 "pty device \"%s\": %s",
2711 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 2712
b0a33c1e 2713 pty_info->busy = 0;
2714 }
2715
985d15b1 2716 tty_info->nbtty = conf->tty;
1ac470c0 2717
9e1045e3 2718 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 2719 return 0;
b0a33c1e 2720}
2721
2722void lxc_delete_tty(struct lxc_tty_info *tty_info)
2723{
2724 int i;
2725
2726 for (i = 0; i < tty_info->nbtty; i++) {
2727 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
2728
2729 close(pty_info->master);
2730 close(pty_info->slave);
2731 }
2732
2733 free(tty_info->pty_info);
e00c0242 2734 tty_info->pty_info = NULL;
b0a33c1e 2735 tty_info->nbtty = 0;
2736}
2737
f4f52cb5
CB
2738
/* Execute "lxc-usernsexec" with the NULL-terminated argument vector passed
 * in @args.
 *
 * execvp() does not return on success, so reaching the return statement
 * means the exec failed; -1 is returned in that case.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
2744
f6d3e3e4 2745/*
7b50c609
TS
2746 * chown_mapped_root: for an unprivileged user with uid/gid X to
2747 * chown a dir to subuid/subgid Y, he needs to run chown as root
2748 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2749 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2750 * root is privileged with respect to hostuid/hostgid X, allowing
2751 * him to do the chown.
f6d3e3e4 2752 */
c4d10a05 2753int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 2754{
f4f52cb5 2755 uid_t rootuid, rootgid;
2a9a80cb 2756 unsigned long val;
f4f52cb5
CB
2757 int hostuid, hostgid, ret;
2758 struct stat sb;
2759 char map1[100], map2[100], map3[100], map4[100], map5[100];
2760 char ugid[100];
2761 char *args1[] = {"lxc-usernsexec",
2762 "-m", map1,
2763 "-m", map2,
2764 "-m", map3,
2765 "-m", map5,
2766 "--", "chown", ugid, path,
2767 NULL};
2768 char *args2[] = {"lxc-usernsexec",
2769 "-m", map1,
2770 "-m", map2,
2771 "-m", map3,
2772 "-m", map4,
2773 "-m", map5,
2774 "--", "chown", ugid, path,
2775 NULL};
2776 char cmd_output[MAXPATHLEN];
2777
2778 hostuid = geteuid();
2779 hostgid = getegid();
f6d3e3e4 2780
2a9a80cb 2781 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 2782 ERROR("No uid mapping for container root");
c4d10a05 2783 return -1;
f6d3e3e4 2784 }
f4f52cb5 2785 rootuid = (uid_t)val;
7b50c609 2786 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 2787 ERROR("No gid mapping for container root");
7b50c609
TS
2788 return -1;
2789 }
f4f52cb5 2790 rootgid = (gid_t)val;
2a9a80cb 2791
f4f52cb5 2792 if (hostuid == 0) {
7b50c609 2793 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
2794 ERROR("Error chowning %s", path);
2795 return -1;
2796 }
2797 return 0;
2798 }
f3d7e4ca 2799
f4f52cb5 2800 if (rootuid == hostuid) {
1a0e70ac 2801 /* nothing to do */
b103ceac 2802 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
2803 return 0;
2804 }
2805
bbdbf8f0 2806 /* save the current gid of "path" */
f4f52cb5
CB
2807 if (stat(path, &sb) < 0) {
2808 ERROR("Error stat %s", path);
f6d3e3e4
SH
2809 return -1;
2810 }
7b50c609 2811
bbdbf8f0
CB
2812 /* Update the path argument in case this was overlayfs. */
2813 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2814 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2815
f4f52cb5
CB
2816 /*
2817 * A file has to be group-owned by a gid mapped into the
2818 * container, or the container won't be privileged over it.
2819 */
2820 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2821 if (sb.st_uid == hostuid &&
2822 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2823 chown(path, -1, hostgid) < 0) {
2824 ERROR("Failed chgrping %s", path);
2825 return -1;
2826 }
f6d3e3e4 2827
1a0e70ac 2828 /* "u:0:rootuid:1" */
f4f52cb5
CB
2829 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2830 if (ret < 0 || ret >= 100) {
2831 ERROR("Error uid printing map string");
2832 return -1;
2833 }
7b50c609 2834
1a0e70ac 2835 /* "u:hostuid:hostuid:1" */
f4f52cb5
CB
2836 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2837 if (ret < 0 || ret >= 100) {
2838 ERROR("Error uid printing map string");
2839 return -1;
2840 }
c4d10a05 2841
1a0e70ac 2842 /* "g:0:rootgid:1" */
f4f52cb5
CB
2843 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2844 if (ret < 0 || ret >= 100) {
2845 ERROR("Error gid printing map string");
2846 return -1;
2847 }
98e5ba51 2848
1a0e70ac 2849 /* "g:pathgid:rootgid+pathgid:1" */
f4f52cb5
CB
2850 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2851 rootgid + (gid_t)sb.st_gid);
2852 if (ret < 0 || ret >= 100) {
2853 ERROR("Error gid printing map string");
2854 return -1;
2855 }
c4d10a05 2856
1a0e70ac 2857 /* "g:hostgid:hostgid:1" */
f4f52cb5
CB
2858 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2859 if (ret < 0 || ret >= 100) {
2860 ERROR("Error gid printing map string");
2861 return -1;
2862 }
7b50c609 2863
1a0e70ac 2864 /* "0:pathgid" (chown) */
f4f52cb5
CB
2865 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2866 if (ret < 0 || ret >= 100) {
2867 ERROR("Error owner printing format string for chown");
2868 return -1;
2869 }
7b50c609 2870
f4f52cb5
CB
2871 if (hostgid == sb.st_gid)
2872 ret = run_command(cmd_output, sizeof(cmd_output),
2873 chown_mapped_root_exec_wrapper,
2874 (void *)args1);
2875 else
2876 ret = run_command(cmd_output, sizeof(cmd_output),
2877 chown_mapped_root_exec_wrapper,
2878 (void *)args2);
2879 if (ret < 0)
2880 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 2881
f4f52cb5 2882 return ret;
f6d3e3e4
SH
2883}
2884
54117de5 2885int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 2886{
c4d10a05 2887 if (lxc_list_empty(&c->id_map))
f6d3e3e4 2888 return 0;
c4d10a05 2889
54117de5
CB
2890 if (!strcmp(c->console.name, ""))
2891 return 0;
2892
2893 if (chown_mapped_root(c->console.name, c) < 0) {
2894 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
2895 return -1;
2896 }
2897
54117de5
CB
2898 TRACE("chowned console \"%s\"", c->console.name);
2899
f6d3e3e4
SH
2900 return 0;
2901}
2902
943144d9
CB
2903/* NOTE: Must not be called from inside the container namespace! */
2904int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
2905{
2906 int mounted;
2907
943144d9 2908 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 2909 if (mounted == -1) {
943144d9 2910 SYSERROR("failed to mount /proc in the container");
01958b1f 2911 /* continue only if there is no rootfs */
943144d9 2912 if (conf->rootfs.path)
01958b1f 2913 return -1;
5112cd70 2914 } else if (mounted == 1) {
943144d9 2915 conf->tmp_umount_proc = 1;
5112cd70 2916 }
943144d9 2917
5112cd70
SH
2918 return 0;
2919}
2920
2921void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2922{
2923 if (lxc_conf->tmp_umount_proc == 1) {
2924 umount("/proc");
2925 lxc_conf->tmp_umount_proc = 0;
2926 }
2927}
2928
/* Walk /proc/self/mountinfo and apply MS_SLAVE to every mount whose option
 * field contains "shared".
 *
 * Failures are logged and otherwise ignored so that container startup can
 * continue.
 */
void remount_all_slave(void)
{
	char *buf = NULL;
	size_t cap = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &cap, mountinfo) != -1) {
		char *target, *opts;

		target = get_field(buf, 4);
		if (target == NULL)
			continue;

		/* The options are looked up relative to the target field. */
		opts = get_field(target, 2);
		if (opts == NULL)
			continue;

		null_endofword(opts);
		if (strstr(opts, "shared") == NULL)
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
2962
2322903b
SH
2963void lxc_execute_bind_init(struct lxc_conf *conf)
2964{
2965 int ret;
9d9c111c
SH
2966 char path[PATH_MAX], destpath[PATH_MAX], *p;
2967
2968 /* If init exists in the container, don't bind mount a static one */
2969 p = choose_init(conf->rootfs.mount);
2970 if (p) {
2971 free(p);
2972 return;
2973 }
2322903b
SH
2974
2975 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
2976 if (ret < 0 || ret >= PATH_MAX) {
2977 WARN("Path name too long searching for lxc.init.static");
2978 return;
2979 }
2980
2981 if (!file_exists(path)) {
2982 INFO("%s does not exist on host", path);
2983 return;
2984 }
2985
2986 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
2987 if (ret < 0 || ret >= PATH_MAX) {
2988 WARN("Path name too long for container's lxc.init.static");
2989 return;
2990 }
2991
2992 if (!file_exists(destpath)) {
2993 FILE * pathfile = fopen(destpath, "wb");
2994 if (!pathfile) {
2995 SYSERROR("Failed to create mount target '%s'", destpath);
2996 return;
2997 }
2998 fclose(pathfile);
2999 }
3000
592fd47a 3001 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3002 if (ret < 0)
3003 SYSERROR("Failed to bind lxc.init.static into container");
3004 INFO("lxc.init.static bound into container at %s", path);
3005}
3006
35120d9c
SH
3007/*
3008 * This does the work of remounting / if it is shared, calling the
3009 * container pre-mount hooks, and mounting the rootfs.
3010 */
3011int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3012{
35120d9c
SH
3013 if (conf->rootfs_setup) {
3014 /*
3015 * rootfs was set up in another namespace. bind-mount it
3016 * to give us a mount in our own ns so we can pivot_root to it
3017 */
3018 const char *path = conf->rootfs.mount;
3019 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3020 ERROR("Failed to bind-mount container / onto itself");
145832ba 3021 return -1;
35120d9c 3022 }
145832ba 3023 return 0;
35120d9c 3024 }
d4ef7c50 3025
e995d7a2
SH
3026 remount_all_slave();
3027
35120d9c
SH
3028 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3029 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3030 return -1;
3031 }
3032
9aa76a17 3033 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
3034 ERROR("failed to setup rootfs for '%s'", name);
3035 return -1;
3036 }
3037
3038 conf->rootfs_setup = true;
3039 return 0;
3040}
3041
1c1c7051
SH
3042static bool verify_start_hooks(struct lxc_conf *conf)
3043{
3044 struct lxc_list *it;
3045 char path[MAXPATHLEN];
3046 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3047 char *hookname = it->elem;
3048 struct stat st;
3049 int ret;
3050
3051 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 3052 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
3053 if (ret < 0 || ret >= MAXPATHLEN)
3054 return false;
3055 ret = stat(path, &st);
3056 if (ret) {
7b6753e7 3057 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
3058 hookname);
3059 return false;
3060 }
6a0c909a 3061 return true;
1c1c7051
SH
3062 }
3063
3064 return true;
3065}
3066
ae467c54 3067static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 3068{
ae467c54 3069 int i;
e8bd4e43 3070 struct lxc_conf *conf = handler->conf;
672c1e58 3071 struct lxc_tty_info *tty_info = &conf->tty_info;
c6012571 3072 int sock = handler->data_sock[0];
ae467c54 3073 int ret = -1;
e8bd4e43 3074
1f9bbd23
CB
3075 if (!conf->tty)
3076 return 0;
3077
672c1e58
CB
3078 for (i = 0; i < conf->tty; i++) {
3079 int ttyfds[2];
3080 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
ae467c54 3081
672c1e58
CB
3082 ttyfds[0] = pty_info->master;
3083 ttyfds[1] = pty_info->slave;
3084
3085 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
3086 if (ret < 0)
3087 break;
3088
3089 TRACE("Send pty \"%s\" with master fd %d and slave fd %d to "
3090 "parent", pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
3091 }
3092
ae467c54 3093 if (ret < 0)
672c1e58 3094 ERROR("Failed to send %d ttys to parent: %s", conf->tty,
ae467c54
CB
3095 strerror(errno));
3096 else
672c1e58 3097 TRACE("Sent %d ttys to parent", conf->tty);
ae467c54 3098
672c1e58 3099 lxc_delete_tty(tty_info);
ae467c54
CB
3100
3101 return ret;
e8bd4e43
SH
3102}
3103
35120d9c
SH
3104int lxc_setup(struct lxc_handler *handler)
3105{
3106 const char *name = handler->name;
3107 struct lxc_conf *lxc_conf = handler->conf;
3108 const char *lxcpath = handler->lxcpath;
35120d9c
SH
3109
3110 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3111 ERROR("Error setting up rootfs mount after spawn");
3112 return -1;
3113 }
3114
6c544cb3
MM
3115 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3116 if (setup_utsname(lxc_conf->utsname)) {
3117 ERROR("failed to setup the utsname for '%s'", name);
3118 return -1;
3119 }
0ad19a3f 3120 }
3121
811ef482 3122 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
36eb9bde 3123 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 3124 return -1;
0ad19a3f 3125 }
3126
790255cf
CB
3127 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3128 ERROR("Failed to network device names and ifindices to parent");
3129 return -1;
3130 }
3131
bc6928ff 3132 if (lxc_conf->autodev > 0) {
14221cbb 3133 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 3134 ERROR("failed to mount /dev in the container");
c6883f38
SH
3135 return -1;
3136 }
3137 }
3138
368bbc02
CS
3139 /* do automatic mounts (mainly /proc and /sys), but exclude
3140 * those that need to wait until other stuff has finished
3141 */
4fb3cba5 3142 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3143 ERROR("failed to setup the automatic mounts for '%s'", name);
3144 return -1;
3145 }
3146
06749971 3147 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 3148 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 3149 return -1;
576f946d 3150 }
3151
06749971 3152 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
3153 ERROR("failed to setup the mount entries for '%s'", name);
3154 return -1;
3155 }
3156
7b6753e7 3157 /* Make sure any start hooks are in the container */
1c1c7051
SH
3158 if (!verify_start_hooks(lxc_conf))
3159 return -1;
3160
2322903b
SH
3161 if (lxc_conf->is_execute)
3162 lxc_execute_bind_init(lxc_conf);
3163
368bbc02
CS
3164 /* now mount only cgroup, if wanted;
3165 * before, /sys could not have been mounted
3166 * (is either mounted automatically or via fstab entries)
3167 */
4fb3cba5 3168 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
3169 ERROR("failed to setup the automatic mounts for '%s'", name);
3170 return -1;
3171 }
3172
283678ed 3173 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
3174 ERROR("failed to run mount hooks for container '%s'.", name);
3175 return -1;
3176 }
3177
bc6928ff 3178 if (lxc_conf->autodev > 0) {
283678ed 3179 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
3180 ERROR("failed to run autodev hooks for container '%s'.", name);
3181 return -1;
3182 }
06749971 3183
27245ff7 3184 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
3185 ERROR("failed to populate /dev in the container");
3186 return -1;
3187 }
3188 }
368bbc02 3189
3d7d929a 3190 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 3191 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 3192 return -1;
6e590161 3193 }
3194
69aa6655
DE
3195 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
3196 ERROR("failed to setup /dev symlinks for '%s'", name);
3197 return -1;
3198 }
3199
5112cd70 3200 /* mount /proc if it's not already there */
943144d9 3201 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 3202 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 3203 return -1;
e075f5d9 3204 }
e075f5d9 3205
ac778708 3206 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 3207 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 3208 return -1;
ed502555 3209 }
3210
70761e5e 3211 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 3212 ERROR("failed to setup the new pts instance");
95b5ffaf 3213 return -1;
3c26f34e 3214 }
3215
e8bd4e43
SH
3216 if (lxc_create_tty(name, lxc_conf)) {
3217 ERROR("failed to create the ttys");
3218 return -1;
3219 }
3220
ae467c54 3221 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
3222 ERROR("failure sending console info to parent");
3223 return -1;
3224 }
3225
9e1045e3 3226 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
3227 ERROR("failed to setup the ttys for '%s'", name);
3228 return -1;
3229 }
3230
3231 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
3232 SYSERROR("failed to set environment variable for container ptys");
3233
3234
cccc74b5
DL
3235 if (setup_personality(lxc_conf->personality)) {
3236 ERROR("failed to setup personality");
3237 return -1;
3238 }
3239
97a8f74f
SG
3240 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3241 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 3242 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
3243 return -1;
3244 }
97a8f74f
SG
3245 if (dropcaps_except(&lxc_conf->keepcaps)) {
3246 ERROR("failed to keep requested caps");
3247 return -1;
3248 }
3249 } else if (setup_caps(&lxc_conf->caps)) {
3250 ERROR("failed to drop capabilities");
3251 return -1;
81810dd1
DL
3252 }
3253
f4152036 3254 NOTICE("Container \"%s\" is set up", name);
cd54d859 3255
0ad19a3f 3256 return 0;
3257}
26ddeedd 3258
283678ed
SH
3259int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3260 const char *lxcpath, char *argv[])
26ddeedd
SH
3261{
3262 int which = -1;
3263 struct lxc_list *it;
3264
3265 if (strcmp(hook, "pre-start") == 0)
3266 which = LXCHOOK_PRESTART;
5ea6163a
SH
3267 else if (strcmp(hook, "pre-mount") == 0)
3268 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
3269 else if (strcmp(hook, "mount") == 0)
3270 which = LXCHOOK_MOUNT;
f7bee6c6
MW
3271 else if (strcmp(hook, "autodev") == 0)
3272 which = LXCHOOK_AUTODEV;
26ddeedd
SH
3273 else if (strcmp(hook, "start") == 0)
3274 which = LXCHOOK_START;
52492063
WB
3275 else if (strcmp(hook, "stop") == 0)
3276 which = LXCHOOK_STOP;
26ddeedd
SH
3277 else if (strcmp(hook, "post-stop") == 0)
3278 which = LXCHOOK_POSTSTOP;
148e91f5
SH
3279 else if (strcmp(hook, "clone") == 0)
3280 which = LXCHOOK_CLONE;
37cf711b
SY
3281 else if (strcmp(hook, "destroy") == 0)
3282 which = LXCHOOK_DESTROY;
26ddeedd
SH
3283 else
3284 return -1;
3285 lxc_list_for_each(it, &conf->hooks[which]) {
3286 int ret;
3287 char *hookname = it->elem;
283678ed 3288 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
3289 if (ret)
3290 return ret;
3291 }
3292 return 0;
3293}
72d0e1cb 3294
72d0e1cb
SG
3295int lxc_clear_config_caps(struct lxc_conf *c)
3296{
1a0e70ac 3297 struct lxc_list *it, *next;
72d0e1cb 3298
9ebb03ad 3299 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
3300 lxc_list_del(it);
3301 free(it->elem);
3302 free(it);
3303 }
3304 return 0;
3305}
3306
74a3920a 3307static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
3308 struct lxc_list *it, *next;
3309
4355ab5f 3310 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
3311 lxc_list_del(it);
3312 free(it->elem);
3313 free(it);
3314 }
3315 return 0;
3316}
3317
4355ab5f
SH
3318int lxc_clear_idmaps(struct lxc_conf *c)
3319{
3320 return lxc_free_idmap(&c->id_map);
3321}
3322
1fb86a7c
SH
3323int lxc_clear_config_keepcaps(struct lxc_conf *c)
3324{
3325 struct lxc_list *it,*next;
3326
3327 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3328 lxc_list_del(it);
3329 free(it->elem);
3330 free(it);
3331 }
3332 return 0;
3333}
3334
12a50cc6 3335int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 3336{
9ebb03ad 3337 struct lxc_list *it,*next;
72d0e1cb 3338 bool all = false;
a6390f01 3339 const char *k = NULL;
72d0e1cb
SG
3340
3341 if (strcmp(key, "lxc.cgroup") == 0)
3342 all = true;
a6390f01
WB
3343 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
3344 k = key + sizeof("lxc.cgroup.")-1;
3345 else
3346 return -1;
72d0e1cb 3347
9ebb03ad 3348 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
3349 struct lxc_cgroup *cg = it->elem;
3350 if (!all && strcmp(cg->subsystem, k) != 0)
3351 continue;
3352 lxc_list_del(it);
3353 free(cg->subsystem);
3354 free(cg->value);
3355 free(cg);
3356 free(it);
3357 }
3358 return 0;
3359}
3360
c6d09e15
WB
3361int lxc_clear_limits(struct lxc_conf *c, const char *key)
3362{
3363 struct lxc_list *it, *next;
3364 bool all = false;
3365 const char *k = NULL;
3366
240d4b74 3367 if (strcmp(key, "lxc.limit") == 0
3368 || strcmp(key, "lxc.prlimit"))
c6d09e15
WB
3369 all = true;
3370 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
3371 k = key + sizeof("lxc.limit.")-1;
240d4b74 3372 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
3373 k = key + sizeof("lxc.prlimit.")-1;
c6d09e15
WB
3374 else
3375 return -1;
3376
3377 lxc_list_for_each_safe(it, &c->limits, next) {
3378 struct lxc_limit *lim = it->elem;
3379 if (!all && strcmp(lim->resource, k) != 0)
3380 continue;
3381 lxc_list_del(it);
3382 free(lim->resource);
3383 free(lim);
3384 free(it);
3385 }
3386 return 0;
3387}
3388
ee1e7aa0
SG
3389int lxc_clear_groups(struct lxc_conf *c)
3390{
3391 struct lxc_list *it,*next;
3392
3393 lxc_list_for_each_safe(it, &c->groups, next) {
3394 lxc_list_del(it);
3395 free(it->elem);
3396 free(it);
3397 }
3398 return 0;
3399}
3400
ab799c0b
SG
3401int lxc_clear_environment(struct lxc_conf *c)
3402{
3403 struct lxc_list *it,*next;
3404
3405 lxc_list_for_each_safe(it, &c->environment, next) {
3406 lxc_list_del(it);
3407 free(it->elem);
3408 free(it);
3409 }
3410 return 0;
3411}
3412
72d0e1cb
SG
3413int lxc_clear_mount_entries(struct lxc_conf *c)
3414{
9ebb03ad 3415 struct lxc_list *it,*next;
72d0e1cb 3416
9ebb03ad 3417 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
3418 lxc_list_del(it);
3419 free(it->elem);
3420 free(it);
3421 }
3422 return 0;
3423}
3424
b099e9e9
SH
3425int lxc_clear_automounts(struct lxc_conf *c)
3426{
3427 c->auto_mounts = 0;
3428 return 0;
3429}
3430
12a50cc6 3431int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 3432{
9ebb03ad 3433 struct lxc_list *it,*next;
17ed13a3 3434 bool all = false, done = false;
a6390f01 3435 const char *k = NULL;
72d0e1cb
SG
3436 int i;
3437
17ed13a3
SH
3438 if (strcmp(key, "lxc.hook") == 0)
3439 all = true;
a6390f01
WB
3440 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3441 k = key + sizeof("lxc.hook.")-1;
3442 else
3443 return -1;
17ed13a3 3444
72d0e1cb 3445 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 3446 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 3447 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
3448 lxc_list_del(it);
3449 free(it->elem);
3450 free(it);
3451 }
3452 done = true;
72d0e1cb
SG
3453 }
3454 }
17ed13a3
SH
3455
3456 if (!done) {
3457 ERROR("Invalid hook key: %s", key);
3458 return -1;
3459 }
72d0e1cb
SG
3460 return 0;
3461}
8eb5694b 3462
4184c3e1
SH
3463static inline void lxc_clear_aliens(struct lxc_conf *conf)
3464{
3465 struct lxc_list *it,*next;
3466
3467 lxc_list_for_each_safe(it, &conf->aliens, next) {
3468 lxc_list_del(it);
3469 free(it->elem);
3470 free(it);
3471 }
3472}
3473
c7b15d1e 3474void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
3475{
3476 struct lxc_list *it,*next;
3477
3478 lxc_list_for_each_safe(it, &conf->includes, next) {
3479 lxc_list_del(it);
3480 free(it->elem);
3481 free(it);
3482 }
3483}
3484
8eb5694b
SH
3485void lxc_conf_free(struct lxc_conf *conf)
3486{
3487 if (!conf)
3488 return;
858377e4
SH
3489 if (current_config == conf)
3490 current_config = NULL;
f10fad2f
ME
3491 free(conf->console.log_path);
3492 free(conf->console.path);
3493 free(conf->rootfs.mount);
b3b8c97f 3494 free(conf->rootfs.bdev_type);
f10fad2f
ME
3495 free(conf->rootfs.options);
3496 free(conf->rootfs.path);
f10fad2f 3497 free(conf->logfile);
858377e4
SH
3498 if (conf->logfd != -1)
3499 close(conf->logfd);
f10fad2f
ME
3500 free(conf->utsname);
3501 free(conf->ttydir);
3502 free(conf->fstab);
3503 free(conf->rcfile);
3504 free(conf->init_cmd);
6b0d5538 3505 free(conf->unexpanded_config);
393903d1 3506 free(conf->pty_names);
76d0127f 3507 free(conf->syslog);
c302b476 3508 lxc_free_networks(&conf->network);
f10fad2f
ME
3509 free(conf->lsm_aa_profile);
3510 free(conf->lsm_se_context);
769872f9 3511 lxc_seccomp_free(conf);
8eb5694b 3512 lxc_clear_config_caps(conf);
1fb86a7c 3513 lxc_clear_config_keepcaps(conf);
8eb5694b 3514 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 3515 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 3516 lxc_clear_mount_entries(conf);
27c27d73 3517 lxc_clear_idmaps(conf);
ee1e7aa0 3518 lxc_clear_groups(conf);
f979ac15 3519 lxc_clear_includes(conf);
761d81ca 3520 lxc_clear_aliens(conf);
ab799c0b 3521 lxc_clear_environment(conf);
240d4b74 3522 lxc_clear_limits(conf, "lxc.prlimit");
43654d34
CB
3523 free(conf->cgroup_meta.dir);
3524 free(conf->cgroup_meta.controllers);
8eb5694b
SH
3525 free(conf);
3526}
4355ab5f
SH
3527
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	int (*fn)(void *);	/* function to call in the child */
	const char *fn_name;	/* optional name of fn, used for tracing */
	void *arg;		/* argument passed to fn */
	int p[2];		/* pipe: [0] read end, [1] write end */
};
3534
3535static int run_userns_fn(void *data)
3536{
3537 struct userns_fn_data *d = data;
3538 char c;
4355ab5f 3539
f8aa4bf3 3540 /* Close write end of the pipe. */
4355ab5f 3541 close(d->p[1]);
f8aa4bf3
CB
3542
3543 /* Wait for parent to finish establishing a new mapping in the user
3544 * namespace we are executing in.
3545 */
4355ab5f
SH
3546 if (read(d->p[0], &c, 1) != 1)
3547 return -1;
f8aa4bf3
CB
3548
3549 /* Close read end of the pipe. */
4355ab5f 3550 close(d->p[0]);
f8aa4bf3 3551
c9b7c33e
CB
3552 if (d->fn_name)
3553 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 3554 /* Call function to run. */
4355ab5f
SH
3555 return d->fn(d->arg);
3556}
3557
339efad9 3558static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
3559 enum idtype idtype)
3560{
3561 struct lxc_list *it;
3562 struct id_map *map;
3563 struct id_map *retmap = NULL;
3564
3565 lxc_list_for_each(it, &conf->id_map) {
3566 map = it->elem;
3567 if (map->idtype != idtype)
3568 continue;
3569
3570 if (id >= map->hostid && id < map->hostid + map->range) {
3571 retmap = map;
3572 break;
3573 }
3574 }
3575
3576 if (!retmap)
3577 return NULL;
3578
3579 retmap = malloc(sizeof(*retmap));
3580 if (!retmap)
3581 return NULL;
3582
3583 memcpy(retmap, map, sizeof(*retmap));
3584 return retmap;
3585}
3586
4355ab5f 3587/*
f8aa4bf3
CB
3588 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3589 * existing one or establish a new one.
4355ab5f 3590 */
28a2d9e7 3591static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 3592{
28a2d9e7 3593 int hostid_mapped;
f8aa4bf3 3594 struct id_map *entry = NULL;
f8aa4bf3 3595
28a2d9e7
CB
3596 /* Reuse existing mapping. */
3597 entry = mapped_hostid_entry(conf, id, type);
3598 if (entry)
3599 return entry;
f8aa4bf3 3600
28a2d9e7
CB
3601 /* Find new mapping. */
3602 hostid_mapped = find_unmapped_nsid(conf, type);
3603 if (hostid_mapped < 0) {
3604 DEBUG("failed to find free mapping for id %d", id);
3605 return NULL;
f8aa4bf3 3606 }
f8aa4bf3 3607
28a2d9e7
CB
3608 entry = malloc(sizeof(*entry));
3609 if (!entry)
3610 return NULL;
4355ab5f 3611
28a2d9e7
CB
3612 entry->idtype = type;
3613 entry->nsid = hostid_mapped;
3614 entry->hostid = (unsigned long)id;
3615 entry->range = 1;
4355ab5f 3616
28a2d9e7 3617 return entry;
4355ab5f
SH
3618}
3619
f8aa4bf3
CB
3620/* Run a function in a new user namespace.
3621 * The caller's euid/egid will be mapped if it is not already.
3622 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3623 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3624 * This means we require only to establish a mapping from:
3625 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3626 * - the container root -> some sub{g,u}id
3627 * The former we add, if the user did not specifiy a mapping. The latter we
3628 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3629 * there to start the container in the first place.
4355ab5f 3630 */
c9b7c33e
CB
3631int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3632 const char *fn_name)
4355ab5f 3633{
f8aa4bf3
CB
3634 pid_t pid;
3635 uid_t euid, egid;
4355ab5f 3636 struct userns_fn_data d;
4355ab5f 3637 int p[2];
f8aa4bf3
CB
3638 struct lxc_list *it;
3639 struct id_map *map;
3640 char c = '1';
3641 int ret = -1;
3642 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
3643 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3644 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 3645
4355ab5f 3646 ret = pipe(p);
4355ab5f
SH
3647 if (ret < 0) {
3648 SYSERROR("opening pipe");
3649 return -1;
3650 }
3651 d.fn = fn;
c9b7c33e 3652 d.fn_name = fn_name;
4355ab5f
SH
3653 d.arg = data;
3654 d.p[0] = p[0];
3655 d.p[1] = p[1];
f8aa4bf3
CB
3656
3657 /* Clone child in new user namespace. */
4355ab5f 3658 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
3659 if (pid < 0) {
3660 ERROR("failed to clone child process in new user namespace");
3661 goto on_error;
3662 }
3663
4355ab5f 3664 close(p[0]);
4355ab5f
SH
3665 p[0] = -1;
3666
f8aa4bf3
CB
3667 /* Find container root. */
3668 lxc_list_for_each(it, &conf->id_map) {
3669 map = it->elem;
3670
3671 if (map->nsid != 0)
3672 continue;
3673
3674 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3675 container_root_uid = malloc(sizeof(*container_root_uid));
3676 if (!container_root_uid)
3677 goto on_error;
3678 container_root_uid->idtype = map->idtype;
3679 container_root_uid->hostid = map->hostid;
3680 container_root_uid->nsid = 0;
3681 container_root_uid->range = map->range;
3682 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3683 container_root_gid = malloc(sizeof(*container_root_gid));
3684 if (!container_root_gid)
3685 goto on_error;
3686 container_root_gid->idtype = map->idtype;
3687 container_root_gid->hostid = map->hostid;
3688 container_root_gid->nsid = 0;
3689 container_root_gid->range = map->range;
3690 }
3691
3692 /* Found container root. */
3693 if (container_root_uid && container_root_gid)
3694 break;
3695 }
3696
3697 /* This is actually checked earlier but it can't hurt. */
3698 if (!container_root_uid || !container_root_gid) {
3699 ERROR("no mapping for container root found");
3700 goto on_error;
3701 }
3702
1d90e064
CB
3703 host_uid_map = container_root_uid;
3704 host_gid_map = container_root_gid;
3705
f8aa4bf3
CB
3706 /* Check whether the {g,u}id of the user has a mapping. */
3707 euid = geteuid();
3708 egid = getegid();
1d90e064 3709 if (euid != container_root_uid->hostid)
28a2d9e7
CB
3710 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3711
1d90e064 3712 if (egid != container_root_gid->hostid)
28a2d9e7
CB
3713 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3714
3715 if (!host_uid_map) {
3716 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
3717 goto on_error;
3718 }
3719
28a2d9e7
CB
3720 if (!host_gid_map) {
3721 DEBUG("failed to find mapping for gid %d", egid);
3722 goto on_error;
3723 }
3724
3725 /* Allocate new {g,u}id map list. */
3726 idmap = malloc(sizeof(*idmap));
3727 if (!idmap)
3728 goto on_error;
3729 lxc_list_init(idmap);
3730
f8aa4bf3
CB
3731 /* Add container root to the map. */
3732 tmplist = malloc(sizeof(*tmplist));
3733 if (!tmplist)
3734 goto on_error;
3735 lxc_list_add_elem(tmplist, container_root_uid);
3736 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3737
1d90e064 3738 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
3739 /* idmap will now keep track of that memory. */
3740 container_root_uid = NULL;
3741
3742 /* Add container root to the map. */
3743 tmplist = malloc(sizeof(*tmplist));
3744 if (!tmplist)
3745 goto on_error;
3746 lxc_list_add_elem(tmplist, host_uid_map);
3747 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3748 }
1d90e064
CB
3749 /* idmap will now keep track of that memory. */
3750 container_root_uid = NULL;
3751 /* idmap will now keep track of that memory. */
3752 host_uid_map = NULL;
f8aa4bf3
CB
3753
3754 tmplist = malloc(sizeof(*tmplist));
3755 if (!tmplist)
3756 goto on_error;
3757 lxc_list_add_elem(tmplist, container_root_gid);
3758 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3759
1d90e064 3760 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
3761 /* idmap will now keep track of that memory. */
3762 container_root_gid = NULL;
3763
3764 tmplist = malloc(sizeof(*tmplist));
3765 if (!tmplist)
3766 goto on_error;
3767 lxc_list_add_elem(tmplist, host_gid_map);
3768 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 3769 }
1d90e064
CB
3770 /* idmap will now keep track of that memory. */
3771 container_root_gid = NULL;
3772 /* idmap will now keep track of that memory. */
3773 host_gid_map = NULL;
f8aa4bf3 3774
4b73005c
CB
3775 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3776 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
3777 lxc_list_for_each(it, idmap) {
3778 map = it->elem;
3779 TRACE("establishing %cid mapping for \"%d\" in new "
3780 "user namespace: nsuid %lu - hostid %lu - range "
3781 "%lu",
3782 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3783 map->nsid, map->hostid, map->range);
3784 }
4355ab5f
SH
3785 }
3786
f8aa4bf3 3787 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 3788 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
3789 if (ret < 0) {
3790 ERROR("error setting up {g,u}id mappings for child process "
3791 "\"%d\"",
3792 pid);
3793 goto on_error;
4355ab5f
SH
3794 }
3795
f8aa4bf3 3796 /* Tell child to proceed. */
4355ab5f 3797 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
3798 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3799 goto on_error;
4355ab5f
SH
3800 }
3801
f8aa4bf3 3802 /* Wait for child to finish. */
3139aead
SG
3803 ret = wait_for_pid(pid);
3804
f8aa4bf3 3805on_error:
1d90e064
CB
3806 if (idmap)
3807 lxc_free_idmap(idmap);
3808 if (container_root_uid)
3809 free(container_root_uid);
3810 if (container_root_gid)
3811 free(container_root_gid);
3812 if (host_uid_map && (host_uid_map != container_root_uid))
3813 free(host_uid_map);
3814 if (host_gid_map && (host_gid_map != container_root_gid))
3815 free(host_gid_map);
3139aead 3816
4355ab5f
SH
3817 if (p[0] != -1)
3818 close(p[0]);
3819 close(p[1]);
f8aa4bf3
CB
3820
3821 return ret;
4355ab5f 3822}
97e9cfa0 3823
/*
 * Return a newly allocated copy of the login name belonging to the
 * effective uid, or NULL if it cannot be looked up or copied. The caller
 * frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
3835
/*
 * Return a newly allocated copy of the group name belonging to the
 * effective gid, or NULL if it cannot be looked up or copied. The caller
 * frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
3847
a96a8e8c 3848/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
3849void suggest_default_idmap(void)
3850{
3851 FILE *f;
3852 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
3853 char *line = NULL;
3854 char *uname, *gname;
3855 size_t len = 0;
3856
3857 if (!(uname = getuname()))
3858 return;
3859
3860 if (!(gname = getgname())) {
3861 free(uname);
3862 return;
3863 }
3864
3865 f = fopen(subuidfile, "r");
3866 if (!f) {
3867 ERROR("Your system is not configured with subuids");
3868 free(gname);
3869 free(uname);
3870 return;
3871 }
3872 while (getline(&line, &len, f) != -1) {
b7930180 3873 size_t no_newline = 0;
97e9cfa0
SH
3874 char *p = strchr(line, ':'), *p2;
3875 if (*line == '#')
3876 continue;
3877 if (!p)
3878 continue;
3879 *p = '\0';
3880 p++;
3881 if (strcmp(line, uname))
3882 continue;
3883 p2 = strchr(p, ':');
3884 if (!p2)
3885 continue;
3886 *p2 = '\0';
3887 p2++;
3888 if (!*p2)
3889 continue;
b7930180
CB
3890 no_newline = strcspn(p2, "\n");
3891 p2[no_newline] = '\0';
3892
b7b2fde4
CB
3893 if (lxc_safe_uint(p, &uid) < 0)
3894 WARN("Could not parse UID.");
3895 if (lxc_safe_uint(p2, &urange) < 0)
3896 WARN("Could not parse UID range.");
97e9cfa0
SH
3897 }
3898 fclose(f);
3899
6be7389a 3900 f = fopen(subgidfile, "r");
97e9cfa0
SH
3901 if (!f) {
3902 ERROR("Your system is not configured with subgids");
3903 free(gname);
3904 free(uname);
3905 return;
3906 }
3907 while (getline(&line, &len, f) != -1) {
b7930180 3908 size_t no_newline = 0;
97e9cfa0
SH
3909 char *p = strchr(line, ':'), *p2;
3910 if (*line == '#')
3911 continue;
3912 if (!p)
3913 continue;
3914 *p = '\0';
3915 p++;
3916 if (strcmp(line, uname))
3917 continue;
3918 p2 = strchr(p, ':');
3919 if (!p2)
3920 continue;
3921 *p2 = '\0';
3922 p2++;
3923 if (!*p2)
3924 continue;
b7930180
CB
3925 no_newline = strcspn(p2, "\n");
3926 p2[no_newline] = '\0';
3927
b7b2fde4
CB
3928 if (lxc_safe_uint(p, &gid) < 0)
3929 WARN("Could not parse GID.");
3930 if (lxc_safe_uint(p2, &grange) < 0)
3931 WARN("Could not parse GID range.");
97e9cfa0
SH
3932 }
3933 fclose(f);
3934
f10fad2f 3935 free(line);
97e9cfa0
SH
3936
3937 if (!urange || !grange) {
3938 ERROR("You do not have subuids or subgids allocated");
3939 ERROR("Unprivileged containers require subuids and subgids");
3940 return;
3941 }
3942
3943 ERROR("You must either run as root, or define uid mappings");
3944 ERROR("To pass uid mappings to lxc-create, you could create");
3945 ERROR("~/.config/lxc/default.conf:");
3946 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
3947 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
3948 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
3949
3950 free(gname);
3951 free(uname);
3952}
aaf26830 3953
a7307747
SH
3954static void free_cgroup_settings(struct lxc_list *result)
3955{
3956 struct lxc_list *iterator, *next;
3957
3958 lxc_list_for_each_safe(iterator, result, next) {
3959 lxc_list_del(iterator);
3960 free(iterator);
3961 }
3962 free(result);
3963}
3964
aaf26830
KT
3965/*
3966 * Return the list of cgroup_settings sorted according to the following rules
3967 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
3968 */
3969struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
3970{
3971 struct lxc_list *result;
3972 struct lxc_list *memsw_limit = NULL;
3973 struct lxc_list *it = NULL;
3974 struct lxc_cgroup *cg = NULL;
3975 struct lxc_list *item = NULL;
3976
3977 result = malloc(sizeof(*result));
fac7c663
KT
3978 if (!result) {
3979 ERROR("failed to allocate memory to sort cgroup settings");
3980 return NULL;
3981 }
aaf26830
KT
3982 lxc_list_init(result);
3983
3984 /*Iterate over the cgroup settings and copy them to the output list*/
3985 lxc_list_for_each(it, cgroup_settings) {
3986 item = malloc(sizeof(*item));
fac7c663
KT
3987 if (!item) {
3988 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 3989 free_cgroup_settings(result);
fac7c663
KT
3990 return NULL;
3991 }
aaf26830
KT
3992 item->elem = it->elem;
3993 cg = it->elem;
3994 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
3995 /* Store the memsw_limit location */
3996 memsw_limit = item;
3997 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 3998 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
3999 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4000 item->elem = memsw_limit->elem;
4001 memsw_limit->elem = it->elem;
4002 }
4003 lxc_list_add_tail(result, item);
4004 }
4005
4006 return result;
a7307747 4007}