]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
tests: add unit tests for idmap parser
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
d8e48992 82#include "lxcaufs.h"
025ed0f3 83#include "lxclock.h"
8f3e280e
CB
84#include "lxcoverlay.h"
85#include "lxcseccomp.h"
4355ab5f 86#include "namespace.h"
8f3e280e
CB
87#include "network.h"
88#include "parse.h"
89#include "utils.h"
fe4de9a6 90#include "lsm/lsm.h"
d0a36f2c 91
e37dda71 92#if HAVE_LIBCAP
495d2046
SG
93#include <sys/capability.h>
94#endif
95
6ff05e18
SG
96#if HAVE_SYS_PERSONALITY_H
97#include <sys/personality.h>
98#endif
99
edaf8b1b
SG
100#if IS_BIONIC
101#include <../include/lxcmntent.h>
a04f5407
CB
102#ifndef HAVE_PRLIMIT
103#include <../include/prlimit.h>
104#endif
edaf8b1b
SG
105#else
106#include <mntent.h>
107#endif
108
36eb9bde 109lxc_log_define(lxc_conf, lxc);
e5bda9ee 110
e37dda71 111#if HAVE_LIBCAP
b09094da
MN
112#ifndef CAP_SETFCAP
113#define CAP_SETFCAP 31
114#endif
115
116#ifndef CAP_MAC_OVERRIDE
117#define CAP_MAC_OVERRIDE 32
118#endif
119
120#ifndef CAP_MAC_ADMIN
121#define CAP_MAC_ADMIN 33
122#endif
495d2046 123#endif
b09094da
MN
124
125#ifndef PR_CAPBSET_DROP
126#define PR_CAPBSET_DROP 24
127#endif
128
9818cae4
SG
129#ifndef LO_FLAGS_AUTOCLEAR
130#define LO_FLAGS_AUTOCLEAR 4
131#endif
132
bc5b27d6
DK
133#ifndef CAP_SETUID
134#define CAP_SETUID 7
135#endif
136
137#ifndef CAP_SETGID
138#define CAP_SETGID 6
139#endif
140
0769b82a
CS
141/* needed for cgroup automount checks, regardless of whether we
142 * have included linux/capability.h or not */
143#ifndef CAP_SYS_ADMIN
144#define CAP_SYS_ADMIN 21
145#endif
146
2d76d1d7
SG
/*
 * pivot_root(): use the C library's symbol when the build found one,
 * otherwise fall back to issuing the raw system call ourselves.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number known at build time: report "not implemented". */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
161
/*
 * sethostname(): when the C library does not provide it, supply a local
 * replacement that issues the raw system call.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char * name, size_t len)
{
#ifndef __NR_sethostname
	/* No syscall number known at build time: report "not implemented". */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
174
72f919c4
SG
175/* Define __S_ISTYPE if missing from the C library */
176#ifndef __S_ISTYPE
177#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
178#endif
179
ecec0126
SG
180#ifndef MS_PRIVATE
181#define MS_PRIVATE (1<<18)
182#endif
183
8912711c
CB
184#ifndef MS_LAZYTIME
185#define MS_LAZYTIME (1<<25)
186#endif
187
5ef5c9a3
CB
188/* memfd_create() */
189#ifndef MFD_CLOEXEC
190#define MFD_CLOEXEC 0x0001U
191#endif
192
193#ifndef MFD_ALLOW_SEALING
194#define MFD_ALLOW_SEALING 0x0002U
195#endif
196
/*
 * memfd_create(): use the C library's symbol when the build found one,
 * otherwise issue the system call directly.
 */
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

/*
 * Fallback syscall numbers, used only when the system headers do not
 * already define __NR_memfd_create for this architecture.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	/* Unknown architecture and no header definition: not implemented. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#endif
240
72d0e1cb 241char *lxchook_names[NUM_LXC_HOOKS] = {
52492063 242 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 243
a589434e 244typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 245
998ac676
RT
246struct mount_opt {
247 char *name;
248 int clear;
249 int flag;
250};
251
81810dd1
DL
252struct caps_opt {
253 char *name;
254 int value;
255};
256
c6d09e15
WB
257struct limit_opt {
258 char *name;
259 int value;
260};
261
858377e4
SH
262/*
263 * The lxc_conf of the container currently being worked on in an
264 * API call
265 * This is used in the error calls
266 */
267#ifdef HAVE_TLS
268__thread struct lxc_conf *current_config;
269#else
270struct lxc_conf *current_config;
271#endif
272
0769b82a
CS
273/* Declare this here, since we don't want to reshuffle the whole file. */
274static int in_caplist(int cap, struct lxc_list *caps);
275
a589434e
JN
276static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
277static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
278static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
279static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
280static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
281static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
282
283static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
284 [LXC_NET_VETH] = instantiate_veth,
285 [LXC_NET_MACVLAN] = instantiate_macvlan,
286 [LXC_NET_VLAN] = instantiate_vlan,
287 [LXC_NET_PHYS] = instantiate_phys,
288 [LXC_NET_EMPTY] = instantiate_empty,
289 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 290};
291
74a2b586
JK
292static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
293static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
294static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
295static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
296static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 297static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 298
a589434e 299static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
300 [LXC_NET_VETH] = shutdown_veth,
301 [LXC_NET_MACVLAN] = shutdown_macvlan,
302 [LXC_NET_VLAN] = shutdown_vlan,
303 [LXC_NET_PHYS] = shutdown_phys,
304 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 305 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
306};
307
998ac676 308static struct mount_opt mount_opt[] = {
470b359b
CB
309 { "async", 1, MS_SYNCHRONOUS },
310 { "atime", 1, MS_NOATIME },
311 { "bind", 0, MS_BIND },
88d413d5 312 { "defaults", 0, 0 },
88d413d5 313 { "dev", 1, MS_NODEV },
470b359b 314 { "diratime", 1, MS_NODIRATIME },
88d413d5 315 { "dirsync", 0, MS_DIRSYNC },
470b359b 316 { "exec", 1, MS_NOEXEC },
8912711c 317 { "lazytime", 0, MS_LAZYTIME },
88d413d5 318 { "mand", 0, MS_MANDLOCK },
88d413d5 319 { "noatime", 0, MS_NOATIME },
470b359b 320 { "nodev", 0, MS_NODEV },
88d413d5 321 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
322 { "noexec", 0, MS_NOEXEC },
323 { "nomand", 1, MS_MANDLOCK },
324 { "norelatime", 1, MS_RELATIME },
325 { "nostrictatime", 1, MS_STRICTATIME },
326 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
327 { "rbind", 0, MS_BIND|MS_REC },
328 { "relatime", 0, MS_RELATIME },
470b359b
CB
329 { "remount", 0, MS_REMOUNT },
330 { "ro", 0, MS_RDONLY },
331 { "rw", 1, MS_RDONLY },
88d413d5 332 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
333 { "suid", 1, MS_NOSUID },
334 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 335 { NULL, 0, 0 },
998ac676
RT
336};
337
e37dda71 338#if HAVE_LIBCAP
81810dd1 339static struct caps_opt caps_opt[] = {
a6afdde9 340 { "chown", CAP_CHOWN },
1e11be34
DL
341 { "dac_override", CAP_DAC_OVERRIDE },
342 { "dac_read_search", CAP_DAC_READ_SEARCH },
343 { "fowner", CAP_FOWNER },
344 { "fsetid", CAP_FSETID },
81810dd1
DL
345 { "kill", CAP_KILL },
346 { "setgid", CAP_SETGID },
347 { "setuid", CAP_SETUID },
348 { "setpcap", CAP_SETPCAP },
349 { "linux_immutable", CAP_LINUX_IMMUTABLE },
350 { "net_bind_service", CAP_NET_BIND_SERVICE },
351 { "net_broadcast", CAP_NET_BROADCAST },
352 { "net_admin", CAP_NET_ADMIN },
353 { "net_raw", CAP_NET_RAW },
354 { "ipc_lock", CAP_IPC_LOCK },
355 { "ipc_owner", CAP_IPC_OWNER },
356 { "sys_module", CAP_SYS_MODULE },
357 { "sys_rawio", CAP_SYS_RAWIO },
358 { "sys_chroot", CAP_SYS_CHROOT },
359 { "sys_ptrace", CAP_SYS_PTRACE },
360 { "sys_pacct", CAP_SYS_PACCT },
361 { "sys_admin", CAP_SYS_ADMIN },
362 { "sys_boot", CAP_SYS_BOOT },
363 { "sys_nice", CAP_SYS_NICE },
364 { "sys_resource", CAP_SYS_RESOURCE },
365 { "sys_time", CAP_SYS_TIME },
366 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
367 { "mknod", CAP_MKNOD },
368 { "lease", CAP_LEASE },
57b837e2
CB
369#ifdef CAP_AUDIT_READ
370 { "audit_read", CAP_AUDIT_READ },
371#endif
9527e566 372#ifdef CAP_AUDIT_WRITE
81810dd1 373 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
374#endif
375#ifdef CAP_AUDIT_CONTROL
81810dd1 376 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 377#endif
81810dd1
DL
378 { "setfcap", CAP_SETFCAP },
379 { "mac_override", CAP_MAC_OVERRIDE },
380 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
381#ifdef CAP_SYSLOG
382 { "syslog", CAP_SYSLOG },
383#endif
384#ifdef CAP_WAKE_ALARM
385 { "wake_alarm", CAP_WAKE_ALARM },
386#endif
2b54359b
CB
387#ifdef CAP_BLOCK_SUSPEND
388 { "block_suspend", CAP_BLOCK_SUSPEND },
389#endif
81810dd1 390};
495d2046
SG
391#else
392static struct caps_opt caps_opt[] = {};
393#endif
81810dd1 394
c6d09e15
WB
395static struct limit_opt limit_opt[] = {
396#ifdef RLIMIT_AS
397 { "as", RLIMIT_AS },
398#endif
399#ifdef RLIMIT_CORE
400 { "core", RLIMIT_CORE },
401#endif
402#ifdef RLIMIT_CPU
403 { "cpu", RLIMIT_CPU },
404#endif
405#ifdef RLIMIT_DATA
406 { "data", RLIMIT_DATA },
407#endif
408#ifdef RLIMIT_FSIZE
409 { "fsize", RLIMIT_FSIZE },
410#endif
411#ifdef RLIMIT_LOCKS
412 { "locks", RLIMIT_LOCKS },
413#endif
414#ifdef RLIMIT_MEMLOCK
415 { "memlock", RLIMIT_MEMLOCK },
416#endif
417#ifdef RLIMIT_MSGQUEUE
418 { "msgqueue", RLIMIT_MSGQUEUE },
419#endif
420#ifdef RLIMIT_NICE
421 { "nice", RLIMIT_NICE },
422#endif
423#ifdef RLIMIT_NOFILE
424 { "nofile", RLIMIT_NOFILE },
425#endif
426#ifdef RLIMIT_NPROC
427 { "nproc", RLIMIT_NPROC },
428#endif
429#ifdef RLIMIT_RSS
430 { "rss", RLIMIT_RSS },
431#endif
432#ifdef RLIMIT_RTPRIO
433 { "rtprio", RLIMIT_RTPRIO },
434#endif
435#ifdef RLIMIT_RTTIME
436 { "rttime", RLIMIT_RTTIME },
437#endif
438#ifdef RLIMIT_SIGPENDING
439 { "sigpending", RLIMIT_SIGPENDING },
440#endif
441#ifdef RLIMIT_STACK
442 { "stack", RLIMIT_STACK },
443#endif
444};
445
91c3830e
SH
446static int run_buffer(char *buffer)
447{
ebec9176 448 struct lxc_popen_FILE *f;
91c3830e 449 char *output;
8e7da691 450 int ret;
91c3830e 451
ebec9176 452 f = lxc_popen(buffer);
91c3830e 453 if (!f) {
062b72c6 454 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
455 return -1;
456 }
457
458 output = malloc(LXC_LOG_BUFFER_SIZE);
459 if (!output) {
062b72c6 460 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 461 lxc_pclose(f);
91c3830e
SH
462 return -1;
463 }
464
062b72c6
CB
465 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
466 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
467
468 free(output);
469
ebec9176 470 ret = lxc_pclose(f);
8e7da691 471 if (ret == -1) {
062b72c6 472 SYSERROR("Script exited with error.");
91c3830e 473 return -1;
8e7da691 474 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 475 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
476 return -1;
477 } else if (WIFSIGNALED(ret)) {
062b72c6 478 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 479 return -1;
91c3830e
SH
480 }
481
482 return 0;
483}
484
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it via run_buffer().
 *
 * @argsin may be NULL; otherwise it is a NULL-terminated array of extra
 * arguments appended in order. @lxcpath is accepted but not used here.
 *
 * Returns run_buffer()'s result, or -1 if the command line cannot be built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	char **arg;
	char *cmdline;
	int used;
	size_t total;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Four fixed words, their three separators and the trailing NUL ... */
	total = strlen(script) + strlen(name) + strlen(section) + 3;
	total += strlen(hook) + 1;

	/* ... plus one separator per optional argument. */
	if (argsin)
		for (arg = argsin; *arg; arg++)
			total += strlen(*arg) + 1;

	/* snprintf() reports lengths as int, so refuse anything larger. */
	if (total > INT_MAX)
		return -1;

	cmdline = alloca(total);
	if (!cmdline) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(cmdline, total, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= total) {
		ERROR("Script name too long.");
		return -1;
	}

	if (argsin) {
		for (arg = argsin; *arg; arg++) {
			int room = total - used;
			int added = snprintf(cmdline + used, room, " %s", *arg);

			if (added < 0 || added >= room) {
				ERROR("Script args too long.");
				return -1;
			}
			used += added;
		}
	}

	return run_buffer(cmdline);
}
535
062b72c6
CB
/*
 * Build the command line "<script> <name> <section> [args...]" and execute
 * it via run_buffer().
 *
 * The variadic tail is a list of "char *" arguments terminated by a NULL
 * pointer; each one is appended to the command line separated by a space.
 *
 * Returns run_buffer()'s result, or -1 if the command line cannot be built.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the arguments: one separator plus the text each. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	/* Three fixed words, their two separators and the trailing NUL. */
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	/* snprintf() reports lengths as int, so refuse anything larger. */
	if (size > INT_MAX)
		return -1;

	buffer = alloca(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		return -1;
	}

	/* Second pass: append every argument behind what is already there. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() must be paired with va_end()
			 * before the function returns.
			 */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
587
/*
 * Mount a directory rootfs: recursively bind-mount @rootfs onto @target,
 * adding whatever flags and data string parse_mntopts() derives from
 * @options.
 *
 * Returns mount()'s result, or -1 if the options cannot be parsed. When
 * mount() fails, errno is left as mount() set it.
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	unsigned long mntflags = 0;
	/* Initialised so the error path below never frees an indeterminate
	 * pointer, whether or not parse_mntopts() stored to it before failing.
	 */
	char *mntdata = NULL;
	int ret, saved_errno;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);

	/* Keep mount()'s errno intact across free() for the caller's report. */
	saved_errno = errno;
	free(mntdata);
	errno = saved_errno;

	return ret;
}
605
c6868a1f 606static int lxc_mount_rootfs_file(const char *rootfs, const char *target,
d435aae1 607 const char *options)
78ae2fcc 608{
c6868a1f 609 int ret, loopfd;
a6afdde9 610 char path[MAXPATHLEN];
78ae2fcc 611
c6868a1f
CB
612 loopfd = lxc_prepare_loop_dev(rootfs, path, LO_FLAGS_AUTOCLEAR);
613 if (loopfd < 0)
78ae2fcc 614 return -1;
c6868a1f 615 DEBUG("prepared loop device \"%s\"", path);
a6afdde9 616
c6868a1f
CB
617 ret = mount_unknown_fs(path, target, options);
618 close(loopfd);
a6afdde9 619
c6868a1f 620 DEBUG("mounted rootfs \"%s\" on loop device \"%s\" via loop device \"%s\"", rootfs, target, path);
a6afdde9
DL
621
622 return ret;
78ae2fcc 623}
624
a17b1e65
SG
/*
 * Mount a block-device rootfs: hand @rootfs straight to mount_unknown_fs()
 * and pass its result back unchanged.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int rc;

	rc = mount_unknown_fs(rootfs, target, options);
	return rc;
}
630
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking
 * the underlying fs read-only on shutdown. The file is unlinked
 * immediately so that no name is left behind in the rootfs.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path
 *           cannot be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it; the caller owns that fd.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Reject both an output error (negative) and a truncated path. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open descriptor is what pins the filesystem, not the name. */
	(void)unlink(absrootfspin);
	return fd;
}
673
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the nosuid / nodev / read-only /
 * noexec bits that statvfs() reports for the existing mount, so that the
 * remount keeps them.
 *
 * The path inspected is @s, or @d when @s is NULL. @flags is returned
 * unchanged when it does not contain MS_REMOUNT, when both paths are NULL,
 * when statvfs() fails, or when the build has no statvfs() support.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *path = s ? s : d;
	struct statvfs vfs;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	if (path == NULL)
		return flags;

	if (statvfs(path, &vfs) < 0)
		return flags;

	/* NOTE(review): f_flag is tested against MS_* rather than ST_*
	 * constants, exactly as before; this relies on the two sets sharing
	 * values on the target platform.
	 */
	return flags | (vfs.f_flag & sticky);
#endif
}
710
4fb3cba5 711static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 712{
368bbc02 713 int r;
80e80c40 714 int i;
b06b8511
CS
715 static struct {
716 int match_mask;
717 int match_flag;
718 const char *source;
719 const char *destination;
720 const char *fstype;
721 unsigned long flags;
722 const char *options;
723 } default_mounts[] = {
724 /* Read-only bind-mounting... In older kernels, doing that required
725 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
726 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
727 * kernel 2.6.26 onwards. However, this apparently does not work on
728 * kernel 3.8. Unfortunately, on that very same kernel, doing the
729 * same trick as above doesn't seem to work either, there one needs
730 * to ALSO specify MS_BIND for the remount, otherwise the entire
731 * fs is remounted read-only or the mount fails because it's busy...
732 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
733 * 2.6.32...
368bbc02 734 */
f24a52d5 735 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
736 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
737 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
738 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
739 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 740 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
741 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
742 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
743 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
744 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
745 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
746 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
747 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
748 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
749 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
750 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
751 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
752 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 753 };
368bbc02 754
b06b8511
CS
755 for (i = 0; default_mounts[i].match_mask; i++) {
756 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
757 char *source = NULL;
758 char *destination = NULL;
759 int saved_errno;
e2a7e8dc 760 unsigned long mflags;
b06b8511
CS
761
762 if (default_mounts[i].source) {
763 /* will act like strdup if %r is not present */
8ede5f4c 764 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
765 if (!source) {
766 SYSERROR("memory allocation error");
767 return -1;
768 }
769 }
cc4fd506
SH
770 if (!default_mounts[i].destination) {
771 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 772 free(source);
cc4fd506
SH
773 return -1;
774 }
775 /* will act like strdup if %r is not present */
776 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
777 if (!destination) {
778 saved_errno = errno;
779 SYSERROR("memory allocation error");
780 free(source);
781 errno = saved_errno;
782 return -1;
b06b8511 783 }
e2a7e8dc
SH
784 mflags = add_required_remount_flags(source, destination,
785 default_mounts[i].flags);
592fd47a 786 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 787 saved_errno = errno;
b88ff9a0
SG
788 if (r < 0 && errno == ENOENT) {
789 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
790 r = 0;
791 }
792 else if (r < 0)
e2a7e8dc 793 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 794
b06b8511
CS
795 free(source);
796 free(destination);
797 if (r < 0) {
b06b8511
CS
798 errno = saved_errno;
799 return -1;
800 }
368bbc02 801 }
368bbc02
CS
802 }
803
b06b8511 804 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
805 int cg_flags;
806
807 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
808 /* If the type of cgroup mount was not specified, it depends on the
809 * container's capabilities as to what makes sense: if we have
810 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
811 * anyway, so we may as well default to read-write; then the admin
812 * will not be given a false sense of security. (And if they really
813 * want mixed r/o r/w, then they can explicitly specify :mixed.)
814 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
815 * :mixed, because then the container can't remount it read-write. */
816 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
817 int has_sys_admin = 0;
818 if (!lxc_list_empty(&conf->keepcaps)) {
819 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
820 } else {
821 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
822 }
823 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC) {
824 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
825 } else {
826 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
827 }
828 }
829
8ede5f4c 830 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 831 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 832 return -1;
368bbc02
CS
833 }
834 }
835
368bbc02 836 return 0;
368bbc02
CS
837}
838
a17b1e65 839static int mount_rootfs(const char *rootfs, const char *target, const char *options)
0ad19a3f 840{
b09ef133 841 char absrootfs[MAXPATHLEN];
78ae2fcc 842 struct stat s;
a6afdde9 843 int i;
78ae2fcc 844
a17b1e65 845 typedef int (*rootfs_cb)(const char *, const char *, const char *);
78ae2fcc 846
847 struct rootfs_type {
848 int type;
849 rootfs_cb cb;
850 } rtfs_type[] = {
2656d231
DL
851 { S_IFDIR, mount_rootfs_dir },
852 { S_IFBLK, mount_rootfs_block },
c6868a1f 853 { S_IFREG, lxc_mount_rootfs_file },
78ae2fcc 854 };
0ad19a3f 855
4c8ab83b 856 if (!realpath(rootfs, absrootfs)) {
91c3e281 857 SYSERROR("Failed to get real path for \"%s\".", rootfs);
4c8ab83b 858 return -1;
859 }
b09ef133 860
b09ef133 861 if (access(absrootfs, F_OK)) {
d26582c1 862 SYSERROR("The rootfs \"%s\" is not accessible.", absrootfs);
b09ef133 863 return -1;
864 }
865
78ae2fcc 866 if (stat(absrootfs, &s)) {
91c3e281 867 SYSERROR("Failed to stat the rootfs \"%s\".", absrootfs);
9b0f0477 868 return -1;
869 }
870
78ae2fcc 871 for (i = 0; i < sizeof(rtfs_type)/sizeof(rtfs_type[0]); i++) {
78ae2fcc 872 if (!__S_ISTYPE(s.st_mode, rtfs_type[i].type))
873 continue;
9b0f0477 874
a17b1e65 875 return rtfs_type[i].cb(absrootfs, target, options);
78ae2fcc 876 }
9b0f0477 877
91c3e281 878 ERROR("Unsupported rootfs type for rootfs \"%s\".", absrootfs);
78ae2fcc 879 return -1;
0ad19a3f 880}
881
/*
 * Set the hostname to @utsname->nodename.
 *
 * A NULL @utsname means there is nothing to configure and counts as
 * success. Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;

	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
896
69aa6655
DE
897struct dev_symlinks {
898 const char *oldpath;
899 const char *name;
900};
901
902static const struct dev_symlinks dev_symlinks[] = {
903 {"/proc/self/fd", "fd"},
904 {"/proc/self/fd/0", "stdin"},
905 {"/proc/self/fd/1", "stdout"},
906 {"/proc/self/fd/2", "stderr"},
907};
908
909static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
910{
911 char path[MAXPATHLEN];
912 int ret,i;
09227be2 913 struct stat s;
69aa6655
DE
914
915
916 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
917 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 918 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
919 if (ret < 0 || ret >= MAXPATHLEN)
920 return -1;
09227be2
MW
921
922 /*
923 * Stat the path first. If we don't get an error
924 * accept it as is and don't try to create it
925 */
926 if (!stat(path, &s)) {
927 continue;
928 }
929
69aa6655 930 ret = symlink(d->oldpath, path);
09227be2 931
69aa6655 932 if (ret && errno != EEXIST) {
09227be2
MW
933 if ( errno == EROFS ) {
934 WARN("Warning: Read Only file system while creating %s", path);
935 } else {
936 SYSERROR("Error creating %s", path);
937 return -1;
938 }
69aa6655
DE
939 }
940 }
941 return 0;
942}
943
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) the list is created as
 * "container_ttys=<name>"; on later calls " <name>" is appended to it.
 * *pp is heap-allocated and may be moved by the reallocation.
 *
 * Returns true on success, false if memory could not be allocated (in
 * which case *pp is left as it was).
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *list;

	if (*pp == NULL) {
		list = malloc(name_len + prefix_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		/* name_len + 1 also copies the terminating NUL. */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);

	/* Room for the old text, one space, the new name and the NUL. */
	list = realloc(*pp, cur_len + name_len + 2);
	if (list == NULL)
		return false;

	list[cur_len] = ' ';
	memcpy(list + cur_len + 1, name, name_len + 1);
	*pp = list;
	return true;
}
966
9e1045e3 967static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 968{
9e1045e3 969 int i, ret;
393903d1
SH
970 const struct lxc_tty_info *tty_info = &conf->tty_info;
971 char *ttydir = conf->ttydir;
7c6ef2a2 972 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 973
e8bd4e43 974 if (!conf->rootfs.path)
bc9bd0e3
DL
975 return 0;
976
b0a33c1e 977 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 978 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
979
e8bd4e43 980 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 981 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
982 ERROR("pathname too long for ttys");
983 return -1;
984 }
9e1045e3 985
7c6ef2a2
SH
986 if (ttydir) {
987 /* create dev/lxc/tty%d" */
9e1045e3
CB
988 ret = snprintf(lxcpath, sizeof(lxcpath),
989 "/dev/%s/tty%d", ttydir, i + 1);
990 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
991 ERROR("pathname too long for ttys");
992 return -1;
993 }
9e1045e3 994
7c6ef2a2 995 ret = creat(lxcpath, 0660);
9e1045e3
CB
996 if (ret < 0 && errno != EEXIST) {
997 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
998 return -1;
999 }
4d44e274
SH
1000 if (ret >= 0)
1001 close(ret);
9e1045e3 1002
7c6ef2a2 1003 ret = unlink(path);
9e1045e3
CB
1004 if (ret < 0 && errno != ENOENT) {
1005 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
1006 return -1;
1007 }
b0a33c1e 1008
9e1045e3
CB
1009 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
1010 if (ret < 0) {
1011 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
1012 pty_info->name, path);
1013 continue;
1014 }
9e1045e3
CB
1015 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
1016 path);
13954cce 1017
9e1045e3
CB
1018 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
1019 ttydir, i + 1);
1020 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
1021 ERROR("tty pathname too long");
1022 return -1;
1023 }
9e1045e3 1024
7c6ef2a2 1025 ret = symlink(lxcpath, path);
9e1045e3
CB
1026 if (ret < 0) {
1027 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
1028 path, lxcpath);
7c6ef2a2
SH
1029 return -1;
1030 }
1031 } else {
9e1045e3
CB
1032 /* If we populated /dev, then we need to create
1033 * /dev/ttyN
1034 */
1035 ret = access(path, F_OK);
1036 if (ret < 0) {
c6883f38 1037 ret = creat(path, 0660);
9e1045e3
CB
1038 if (ret < 0) {
1039 SYSERROR("failed to create \"%s\"", path);
c6883f38 1040 /* this isn't fatal, continue */
025ed0f3 1041 } else {
c6883f38 1042 close(ret);
025ed0f3 1043 }
c6883f38 1044 }
9e1045e3
CB
1045
1046 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
1047 if (ret < 0) {
e8bd4e43 1048 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
1049 continue;
1050 }
9e1045e3
CB
1051
1052 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
1053 path);
393903d1 1054 }
9e1045e3 1055
e8bd4e43 1056 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
1057 ERROR("Error setting up container_ttys string");
1058 return -1;
b0a33c1e 1059 }
1060 }
1061
9e1045e3 1062 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 1063 return 0;
1064}
1065
59bb8698 1066static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1067{
2d489f9e 1068 int oldroot = -1, newroot = -1;
bf601689 1069
2d489f9e
SH
1070 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1071 if (oldroot < 0) {
1072 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
1073 return -1;
1074 }
2d489f9e
SH
1075 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1076 if (newroot < 0) {
1077 SYSERROR("Error opening new-/ for fchdir");
1078 goto fail;
c08556c6 1079 }
bf601689 1080
cc6f6dd7 1081 /* change into new root fs */
2d489f9e 1082 if (fchdir(newroot)) {
cc6f6dd7 1083 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 1084 goto fail;
cc6f6dd7
DL
1085 }
1086
cc6f6dd7 1087 /* pivot_root into our new root fs */
2d489f9e 1088 if (pivot_root(".", ".")) {
cc6f6dd7 1089 SYSERROR("pivot_root syscall failed");
2d489f9e 1090 goto fail;
bf601689 1091 }
cc6f6dd7 1092
2d489f9e
SH
1093 /*
1094 * at this point the old-root is mounted on top of our new-root
1095 * To unmounted it we must not be chdir'd into it, so escape back
1096 * to old-root
1097 */
1098 if (fchdir(oldroot) < 0) {
1099 SYSERROR("Error entering oldroot");
1100 goto fail;
1101 }
7981ea46 1102 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1103 SYSERROR("Error detaching old root");
1104 goto fail;
cc6f6dd7
DL
1105 }
1106
2d489f9e
SH
1107 if (fchdir(newroot) < 0) {
1108 SYSERROR("Error re-entering newroot");
1109 goto fail;
1110 }
cc6f6dd7 1111
2d489f9e
SH
1112 close(oldroot);
1113 close(newroot);
bf601689 1114
2d489f9e 1115 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1116
bf601689 1117 return 0;
2d489f9e
SH
1118
1119fail:
1120 if (oldroot != -1)
1121 close(oldroot);
1122 if (newroot != -1)
1123 close(newroot);
1124 return -1;
bf601689
MH
1125}
1126
bc6928ff 1127/*
87da4ec3
SH
1128 * Just create a path for /dev under $lxcpath/$name and in rootfs
1129 * If we hit an error, log it but don't fail yet.
91c3830e 1130 */
14221cbb 1131static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1132{
1133 int ret;
87da4ec3
SH
1134 size_t clen;
1135 char *path;
91c3830e 1136
14221cbb 1137 INFO("Mounting container /dev");
bc6928ff 1138
14221cbb 1139 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1140 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1141 path = alloca(clen);
bc6928ff 1142
ec50007f 1143 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1144 if (ret < 0 || ret >= clen)
91c3830e 1145 return -1;
bc6928ff 1146
87da4ec3 1147 if (!dir_exists(path)) {
14221cbb 1148 WARN("No /dev in container.");
87da4ec3
SH
1149 WARN("Proceeding without autodev setup");
1150 return 0;
bc6928ff 1151 }
87da4ec3 1152
1ec0e8e3 1153 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1154 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1155 if (ret != 0) {
87da4ec3 1156 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1157 return -1;
91c3830e 1158 }
87da4ec3
SH
1159
1160 INFO("Mounted tmpfs onto %s", path);
1161
ec50007f 1162 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1163 if (ret < 0 || ret >= clen)
91c3830e 1164 return -1;
87da4ec3 1165
bc6928ff
MW
1166 /*
1167 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1168 * If not, then create it and exit if that fails...
1169 */
87da4ec3 1170 if (!dir_exists(path)) {
bc6928ff
MW
1171 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1172 if (ret) {
1173 SYSERROR("Failed to create /dev/pts in container");
1174 return -1;
1175 }
91c3830e
SH
1176 }
1177
14221cbb 1178 INFO("Mounted container /dev");
91c3830e
SH
1179 return 0;
1180}
1181
/* One device node to be provided under the container's /dev. */
struct lxc_devs {
	const char *name;	/* node name, relative to /dev */
	mode_t mode;		/* file type and permission bits for mknod() */
	int maj, min;		/* device major and minor number */
};
1188
74a3920a 1189static const struct lxc_devs lxc_devs[] = {
c6883f38
SH
1190 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1191 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1192 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1193 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1194 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1195 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
c6883f38
SH
1196};
1197
27245ff7 1198static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1199{
1200 int ret;
c6883f38
SH
1201 char path[MAXPATHLEN];
1202 int i;
3a32201c 1203 mode_t cmask;
c6883f38 1204
ec50007f 1205 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1206 if (ret < 0 || ret >= MAXPATHLEN) {
1207 ERROR("Error calculating container /dev location");
c6883f38 1208 return -1;
f7bee6c6 1209 }
91c3830e 1210
0bbf8572
CB
1211 /* ignore, just don't try to fill in */
1212 if (!dir_exists(path))
9cb4d183
SH
1213 return 0;
1214
0bbf8572 1215 INFO("populating container /dev");
3a32201c 1216 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1217 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1218 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1219
ec50007f 1220 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1221 if (ret < 0 || ret >= MAXPATHLEN)
1222 return -1;
0bbf8572 1223
c6883f38 1224 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1225 if (ret < 0) {
9cb4d183
SH
1226 char hostpath[MAXPATHLEN];
1227 FILE *pathfile;
1228
0bbf8572
CB
1229 if (errno == EEXIST) {
1230 DEBUG("\"%s\" device already existed", path);
1231 continue;
1232 }
1233
1234 /* Unprivileged containers cannot create devices, so
1235 * bind mount the device from the host.
1236 */
9cb4d183
SH
1237 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1238 if (ret < 0 || ret >= MAXPATHLEN)
1239 return -1;
1240 pathfile = fopen(path, "wb");
1241 if (!pathfile) {
1242 SYSERROR("Failed to create device mount target '%s'", path);
1243 return -1;
1244 }
1245 fclose(pathfile);
0bbf8572
CB
1246 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1247 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1248 return -1;
1249 }
0bbf8572
CB
1250 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1251 } else {
1252 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1253 }
1254 }
3a32201c 1255 umask(cmask);
c6883f38 1256
0bbf8572 1257 INFO("populated container /dev");
c6883f38
SH
1258 return 0;
1259}
1260
cc28d0b0 1261static int setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1262{
91c3e281
CB
1263 struct bdev *bdev;
1264 const struct lxc_rootfs *rootfs;
cc28d0b0 1265
91c3e281 1266 rootfs = &conf->rootfs;
a0f379bf 1267 if (!rootfs->path) {
91c3e281
CB
1268 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1269 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1270 return -1;
1271 }
c69bd12f 1272 return 0;
a0f379bf 1273 }
0ad19a3f 1274
12297168 1275 if (access(rootfs->mount, F_OK)) {
91c3e281 1276 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1277 rootfs->mount);
b1789442
DL
1278 return -1;
1279 }
1280
91c3e281
CB
1281 /* First try mounting rootfs using a bdev. */
1282 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1283 if (bdev && !bdev->ops->mount(bdev)) {
59d66af2 1284 bdev_put(bdev);
91c3e281
CB
1285 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1286 rootfs->path, rootfs->mount,
1287 rootfs->options ? rootfs->options : "(null)");
9be53773
SH
1288 return 0;
1289 }
59d66af2
SH
1290 if (bdev)
1291 bdev_put(bdev);
a17b1e65 1292 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
91c3e281
CB
1293 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1294 rootfs->path, rootfs->mount,
1295 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1296 return -1;
1297 }
0ad19a3f 1298
91c3e281
CB
1299 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1300 rootfs->path, rootfs->mount,
1301 rootfs->options ? rootfs->options : "(null)");
ac778708
DL
1302 return 0;
1303}
1304
91e93c71
AV
1305int prepare_ramfs_root(char *root)
1306{
eab15c1e 1307 char buf[LXC_LINELEN], *p;
91e93c71
AV
1308 char nroot[PATH_MAX];
1309 FILE *f;
1310 int i;
1311 char *p2;
1312
1313 if (realpath(root, nroot) == NULL)
39c7b795 1314 return -errno;
91e93c71
AV
1315
1316 if (chdir("/") == -1)
39c7b795 1317 return -errno;
91e93c71
AV
1318
1319 /*
1320 * We could use here MS_MOVE, but in userns this mount is
1321 * locked and can't be moved.
1322 */
39c7b795 1323 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1324 SYSERROR("Failed to move %s into /", root);
39c7b795 1325 return -errno;
91e93c71
AV
1326 }
1327
39c7b795 1328 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1329 SYSERROR("Failed to make . rprivate");
39c7b795 1330 return -errno;
91e93c71
AV
1331 }
1332
1333 /*
1334 * The following code cleans up inhereted mounts which are not
1335 * required for CT.
1336 *
1337 * The mountinfo file shows not all mounts, if a few points have been
1338 * unmounted between read operations from the mountinfo. So we need to
1339 * read mountinfo a few times.
1340 *
1341 * This loop can be skipped if a container uses unserns, because all
1342 * inherited mounts are locked and we should live with all this trash.
1343 */
1344 while (1) {
1345 int progress = 0;
1346
1347 f = fopen("./proc/self/mountinfo", "r");
1348 if (!f) {
1349 SYSERROR("Unable to open /proc/self/mountinfo");
1350 return -1;
1351 }
eab15c1e 1352 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1353 for (p = buf, i=0; p && i < 4; i++)
1354 p = strchr(p+1, ' ');
1355 if (!p)
1356 continue;
1357 p2 = strchr(p+1, ' ');
1358 if (!p2)
1359 continue;
1360
1361 *p2 = '\0';
1362 *p = '.';
1363
1364 if (strcmp(p + 1, "/") == 0)
1365 continue;
1366 if (strcmp(p + 1, "/proc") == 0)
1367 continue;
1368
1369 if (umount2(p, MNT_DETACH) == 0)
1370 progress++;
1371 }
1372 fclose(f);
1373 if (!progress)
1374 break;
1375 }
1376
8bea9fae
PR
1377 /* This also can be skipped if a container uses unserns */
1378 umount2("./proc", MNT_DETACH);
91e93c71
AV
1379
1380 /* It is weird, but chdir("..") moves us in a new root */
1381 if (chdir("..") == -1) {
1382 SYSERROR("Unable to change working directory");
1383 return -1;
1384 }
1385
1386 if (chroot(".") == -1) {
1387 SYSERROR("Unable to chroot");
1388 return -1;
1389 }
1390
1391 return 0;
1392}
1393
74a3920a 1394static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1395{
39c7b795
CB
1396 if (!rootfs->path) {
1397 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1398 return 0;
39c7b795 1399 }
ac778708 1400
91e93c71 1401 if (detect_ramfs_rootfs()) {
39c7b795
CB
1402 DEBUG("detected that container is on ramfs");
1403 if (prepare_ramfs_root(rootfs->mount)) {
1404 ERROR("failed to prepare minimal ramfs root");
91e93c71 1405 return -1;
39c7b795
CB
1406 }
1407
1408 DEBUG("prepared ramfs root for container");
1409 return 0;
1410 }
1411
1412 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1413 ERROR("failed to pivot root");
25368b52 1414 return -1;
c69bd12f
DL
1415 }
1416
39c7b795 1417 DEBUG("finished pivot root");
25368b52 1418 return 0;
0ad19a3f 1419}
1420
/*
 * Give the container its own devpts instance on /dev/pts and make /dev/ptmx
 * refer to it. /dev/pts/ptmx is bind mounted onto /dev/ptmx; if that fails,
 * /dev/ptmx becomes a symlink to /dev/pts/ptmx instead.
 *
 * Nothing is done when @num_pts is 0. Returns 0 on success, -1 on error.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (!num_pts) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* A /dev/ptmx left over from before has to go. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file is the target for the bind mount below. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal yet: try a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above is in the way of the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1505
cccc74b5
DL
/*
 * Apply the execution domain @persona via personality(). A value of -1 means
 * "leave unchanged". The function does nothing when the build lacks
 * <sys/personality.h>.
 *
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1522
3d7d929a
CB
1523static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1524 const struct lxc_console *console)
6e590161 1525{
63376d7d 1526 char path[MAXPATHLEN];
0728ebf4 1527 int ret, fd;
52e35957 1528
8b1b1210
CB
1529 if (console->path && !strcmp(console->path, "none"))
1530 return 0;
1531
7c6ef2a2 1532 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1533 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1534 return -1;
52e35957 1535
8b1b1210
CB
1536 /* When we are asked to setup a console we remove any previous
1537 * /dev/console bind-mounts.
1538 */
a7ba3c7f
CB
1539 if (file_exists(path)) {
1540 ret = lxc_unstack_mountpoint(path, false);
1541 if (ret < 0) {
8b1b1210 1542 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1543 return -ret;
1544 } else {
1545 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1546 }
1547 ret = unlink(path);
1548 if (ret < 0) {
1549 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1550 return -errno;
1551 }
8b1b1210
CB
1552 }
1553
1554 /* For unprivileged containers autodev or automounts will already have
1555 * taken care of creating /dev/console.
1556 */
0728ebf4
TA
1557 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1558 if (fd < 0) {
1559 if (errno != EEXIST) {
1560 SYSERROR("failed to create console");
3d7d929a 1561 return -errno;
0728ebf4
TA
1562 }
1563 } else {
1564 close(fd);
52e35957
DL
1565 }
1566
0728ebf4 1567 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1568 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1569 return -errno;
63376d7d 1570 }
13954cce 1571
3d7d929a 1572 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1573 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1574 return -1;
1575 }
1576
3d7d929a 1577 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1578 return 0;
1579}
1580
3d7d929a
CB
1581static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1582 const struct lxc_console *console,
1583 char *ttydir)
7c6ef2a2 1584{
7c6ef2a2 1585 int ret;
3d7d929a 1586 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1587
1588 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1589 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1590 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1591 return -1;
3d7d929a 1592
7c6ef2a2
SH
1593 ret = mkdir(path, 0755);
1594 if (ret && errno != EEXIST) {
959aee9c 1595 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1596 return -errno;
7c6ef2a2 1597 }
3d7d929a 1598 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1599
3d7d929a
CB
1600 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1601 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1602 return -1;
1603
7c6ef2a2 1604 ret = creat(lxcpath, 0660);
3d7d929a 1605 if (ret == -1 && errno != EEXIST) {
959aee9c 1606 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1607 return -errno;
7c6ef2a2 1608 }
4d44e274
SH
1609 if (ret >= 0)
1610 close(ret);
7c6ef2a2 1611
2a12fefd
CB
1612 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1613 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1614 return -1;
2a12fefd
CB
1615
1616 /* When we are asked to setup a console we remove any previous
1617 * /dev/console bind-mounts.
1618 */
1619 if (console->path && !strcmp(console->path, "none")) {
1620 struct stat st;
1621 ret = stat(path, &st);
1622 if (ret < 0) {
1623 if (errno == ENOENT)
1624 return 0;
1625 SYSERROR("failed stat() \"%s\"", path);
1626 return -errno;
1627 }
1628
1629 /* /dev/console must be character device with major number 5 and
1630 * minor number 1. If not, give benefit of the doubt and assume
1631 * the user has mounted something else right there on purpose.
1632 */
1633 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1634 return 0;
1635
1636 /* In case the user requested a bind-mount for /dev/console and
1637 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1638 * /dev/<ttydir/console.
1639 * Note, we only move the uppermost mount and clear all other
1640 * mounts underneath for safety.
1641 * If it is a character device created via mknod() we simply
1642 * rename it.
2a12fefd
CB
1643 */
1644 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1645 if (ret < 0) {
1646 if (errno != EINVAL) {
1647 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1648 return -errno;
1649 }
1650 /* path was not a mountpoint */
1651 ret = rename(path, lxcpath);
1652 if (ret < 0) {
1653 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1654 return -errno;
1655 }
1656 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1657 } else {
1658 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1659 }
a7ba3c7f
CB
1660
1661 /* Clear all remaining bind-mounts. */
1662 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1663 if (ret < 0) {
a7ba3c7f
CB
1664 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1665 return -ret;
1666 } else {
1667 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1668 }
1669 } else {
1670 if (file_exists(path)) {
1671 ret = lxc_unstack_mountpoint(path, false);
1672 if (ret < 0) {
2a12fefd 1673 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1674 return -ret;
1675 } else {
1676 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1677 }
2a12fefd
CB
1678 }
1679
1680 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1681 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1682 return -1;
1683 }
1684 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1685 }
1686
2a12fefd 1687 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1688 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1689 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1690 return -1;
3d7d929a 1691
2a12fefd
CB
1692 ret = unlink(path);
1693 if (ret && errno != ENOENT) {
1694 SYSERROR("error unlinking %s", path);
1695 return -errno;
1696 }
1697
7c6ef2a2 1698 ret = symlink(lxcpath, path);
3d7d929a
CB
1699 if (ret < 0) {
1700 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1701 return -1;
1702 }
1703
3d7d929a 1704 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1705 return 0;
1706}
1707
3d7d929a
CB
1708static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1709 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1710{
3d7d929a
CB
1711 /* We don't have a rootfs, /dev/console will be shared. */
1712 if (!rootfs->path) {
1713 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1714 return 0;
3d7d929a
CB
1715 }
1716
7c6ef2a2 1717 if (!ttydir)
3d7d929a 1718 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1719
3d7d929a 1720 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1721}
1722
1bd051a6
SH
1723static int setup_kmsg(const struct lxc_rootfs *rootfs,
1724 const struct lxc_console *console)
1725{
1726 char kpath[MAXPATHLEN];
1727 int ret;
1728
222fea5a
DE
1729 if (!rootfs->path)
1730 return 0;
1bd051a6
SH
1731 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1732 if (ret < 0 || ret >= sizeof(kpath))
1733 return -1;
1734
1735 ret = unlink(kpath);
1736 if (ret && errno != ENOENT) {
959aee9c 1737 SYSERROR("error unlinking %s", kpath);
1bd051a6
SH
1738 return -1;
1739 }
1740
1741 ret = symlink("console", kpath);
1742 if (ret) {
1743 SYSERROR("failed to create symlink for kmsg");
1744 return -1;
1745 }
1746
1747 return 0;
1748}
1749
998ac676
RT
1750static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1751{
1752 struct mount_opt *mo;
1753
1754 /* If opt is found in mount_opt, set or clear flags.
1755 * Otherwise append it to data. */
1756
1757 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1758 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1759 if (mo->clear)
1760 *flags &= ~mo->flag;
1761 else
1762 *flags |= mo->flag;
1763 return;
1764 }
1765 }
1766
1767 if (strlen(*data))
1768 strcat(*data, ",");
1769 strcat(*data, opt);
1770}
1771
/*
 * Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags receives the flags. *@mntdata receives a newly allocated string
 * with the remaining options, or NULL if there are none; the caller frees it.
 * A NULL @mntopts yields no flags and no data.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *data;
	char *token, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* the data string can never be longer than the option string */
	data = malloc(strlen(copy) + 1);
	if (!data) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	*data = 0;

	token = strtok_r(copy, ",", &state);
	while (token != NULL) {
		parse_mntopt(token, mntflags, &data);
		token = strtok_r(NULL, ",", &state);
	}

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(copy);

	return 0;
}
1810
6fd5e769
SH
/*
 * Terminate @word in place at its first space or tab. A string without
 * either is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1817
/*
 * Return a pointer into @src just past @nfields fields. A field ends at a
 * space or tab and exactly one separator is skipped after each field. If the
 * string ends first, the returned pointer refers to the terminating NUL.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		skipped++;
	}

	return cur;
}
1835
911324ef
DL
/*
 * Mount @fsname on @target and, for bind or remount requests, remount
 * afterwards so that the requested flags take effect.
 *
 * The first mount is done through safe_mount() without MS_REMOUNT. When
 * statvfs() is available, the nosuid/nodev/ro/noexec flags found on @fsname
 * are added to the remount flags (nodev only if @dev is 0). A plain bind
 * mount whose flags already cover everything skips the remount.
 *
 * With @optional set, mount failures are logged and reported as success.
 * Returns 0 on success, -1 on error.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif
	unsigned long rqd_flags;

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and remounts need the second pass. */
	if (!(mountflags & MS_REMOUNT) && !(mountflags & MS_BIND))
		goto out;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rqd_flags = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		if (sb.f_flag & MS_NOSUID)
			required_flags |= MS_NOSUID;
		if ((sb.f_flag & MS_NODEV) && !dev)
			required_flags |= MS_NODEV;
		if (sb.f_flag & MS_RDONLY)
			required_flags |= MS_RDONLY;
		if (sb.f_flag & MS_NOEXEC)
			required_flags |= MS_NOEXEC;
		DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

		/*
		 * For a bind mount request where required_flags adds nothing
		 * beyond what mountflags already has, the remount is
		 * unnecessary.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("mountflags already was %lu, skipping remount",
			      mountflags);
			goto out;
		}

		mountflags |= required_flags;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

out:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1912
4e4ca161
SH
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntent->mnt_opts.
 *
 * The option string is edited in place. Only whole, comma-delimited options
 * are removed, every occurrence is removed, and the separating comma goes
 * with the option so the result never has a leading or trailing comma that
 * was not there before.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		NULL
	};
	char *opts = mntent->mnt_opts;
	int i;

	for (i = 0; culled[i]; i++) {
		size_t len = strlen(culled[i]);
		char *p = opts;

		while ((p = strstr(p, culled[i])) != NULL) {
			bool starts_option = (p == opts) || (p[-1] == ',');
			bool ends_option = (p[len] == ',') || (p[len] == '\0');

			/* part of some other option: leave it alone */
			if (!starts_option || !ends_option) {
				p++;
				continue;
			}

			if (p[len] == ',') {
				/* pull the rest over the option and its comma */
				memmove(p, p + len + 1, strlen(p + len + 1) + 1);
			} else {
				/* last option: drop it with the comma before it */
				if (p != opts)
					p--;
				*p = '\0';
			}
		}
	}
}
1937
/*
 * Create the mount target @path if the mount entry asks for it.
 *
 * "overlay" and "aufs" entries first get their directories created through
 * ovl_mkdir() / aufs_mkdir(). With the "create=dir" option @path is created
 * as a directory; with "create=file", when @path does not exist yet, its
 * parent directory and an empty file are created.
 *
 * Returns 0 on success, -1 on error.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		char *pathcopy;
		FILE *pathfile;

		/* dirname() may modify its argument, so give it a copy */
		pathcopy = strdup(path);
		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}

		/* dirname() may return a pointer that is not the start of
		 * the buffer (or not inside it at all), so the copy itself
		 * is what has to be freed.
		 */
		if (mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1978
ec50007f
CB
1979/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1980 * without a rootfs. */
db4aba38 1981static inline int mount_entry_on_generic(struct mntent *mntent,
0a2dddd4
CB
1982 const char* path, const struct lxc_rootfs *rootfs,
1983 const char *lxc_name, const char *lxc_path)
4d5b72a1
NC
1984{
1985 unsigned long mntflags;
1986 char *mntdata;
1987 int ret;
1988 bool optional = hasmntopt(mntent, "optional") != NULL;
ae7a770e 1989 bool dev = hasmntopt(mntent, "dev") != NULL;
4d5b72a1 1990
ec50007f
CB
1991 char *rootfs_path = NULL;
1992 if (rootfs && rootfs->path)
1993 rootfs_path = rootfs->mount;
1994
0a2dddd4 1995 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
34cfffb3 1996
608e3567
SH
1997 if (ret < 0)
1998 return optional ? 0 : -1;
1999
4e4ca161
SH
2000 cull_mntent_opt(mntent);
2001
a17b1e65
SG
2002 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
2003 free(mntdata);
2004 return -1;
2005 }
2006
6e46cc0d 2007 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 2008 mntdata, optional, dev, rootfs_path);
68c152ef 2009
911324ef 2010 free(mntdata);
911324ef
DL
2011 return ret;
2012}
2013
db4aba38
NC
/* Mount an entry for a container that has no rootfs of its own.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to
 * the mount directory when it is missing.
 *
 * Returns -1 when the resulting path does not fit into MAXPATHLEN bytes,
 * otherwise the result of mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char target[MAXPATHLEN];
	const char *prefix = "";
	int len;

	if (mntent->mnt_dir[0] != '/')
		prefix = "/";

	len = snprintf(target, sizeof(target), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
2033
4e4ca161 2034static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2035 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2036 const char *lxc_name,
2037 const char *lxc_path)
911324ef 2038{
013bd428 2039 char *aux;
59760f5d 2040 char path[MAXPATHLEN];
80a881b2 2041 int r, ret = 0, offset;
67e571de 2042 const char *lxcpath;
0ad19a3f 2043
593e8478 2044 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
2045 if (!lxcpath) {
2046 ERROR("Out of memory");
2047 return -1;
2048 }
2049
80a881b2 2050 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
2051 * use $lxcpath/CN/rootfs as the target prefix */
2052 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
2053 if (r < 0 || r >= MAXPATHLEN)
2054 goto skipvarlib;
2055
2056 aux = strstr(mntent->mnt_dir, path);
2057 if (aux) {
2058 offset = strlen(path);
2059 goto skipabs;
2060 }
2061
2062skipvarlib:
013bd428
DL
2063 aux = strstr(mntent->mnt_dir, rootfs->path);
2064 if (!aux) {
2065 WARN("ignoring mount point '%s'", mntent->mnt_dir);
db4aba38 2066 return ret;
013bd428 2067 }
80a881b2
SH
2068 offset = strlen(rootfs->path);
2069
2070skipabs:
013bd428 2071
9ba8130c 2072 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
2073 aux + offset);
2074 if (r < 0 || r >= MAXPATHLEN) {
2075 WARN("pathnme too long for '%s'", mntent->mnt_dir);
a17b1e65
SG
2076 return -1;
2077 }
2078
0a2dddd4 2079 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2080}
d330fe7b 2081
4e4ca161 2082static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2083 const struct lxc_rootfs *rootfs,
2084 const char *lxc_name,
2085 const char *lxc_path)
911324ef
DL
2086{
2087 char path[MAXPATHLEN];
911324ef 2088 int ret;
d330fe7b 2089
34cfffb3 2090 /* relative to root mount point */
6e46cc0d 2091 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2092 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2093 ERROR("path name too long");
2094 return -1;
2095 }
911324ef 2096
0a2dddd4 2097 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2098}
2099
80a881b2 2100static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
0a2dddd4 2101 const char *lxc_name, const char *lxc_path)
911324ef 2102{
aaf901be
AM
2103 struct mntent mntent;
2104 char buf[4096];
911324ef 2105 int ret = -1;
e76b8764 2106
aaf901be 2107 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 2108
911324ef 2109 if (!rootfs->path) {
aaf901be 2110 if (mount_entry_on_systemfs(&mntent))
e76b8764 2111 goto out;
911324ef 2112 continue;
e76b8764
CDC
2113 }
2114
911324ef 2115 /* We have a separate root, mounts are relative to it */
aaf901be 2116 if (mntent.mnt_dir[0] != '/') {
0a2dddd4 2117 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef
DL
2118 goto out;
2119 continue;
2120 }
cd54d859 2121
0a2dddd4 2122 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef 2123 goto out;
0ad19a3f 2124 }
cd54d859 2125
0ad19a3f 2126 ret = 0;
cd54d859
DL
2127
2128 INFO("mount points have been setup");
0ad19a3f 2129out:
e7938e9e
MN
2130 return ret;
2131}
2132
/* Mount all entries listed in the fstab-style file @fstab.
 *
 * A NULL @fstab is not an error and yields 0. Returns -1 when the file
 * cannot be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *stream;
	int rc;

	if (fstab == NULL)
		return 0;

	stream = setmntent(fstab, "r");
	if (stream == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, stream, lxc_name, lxc_path);
	endmntent(stream);

	return rc;
}
2153
5ef5c9a3 2154FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2155{
5ef5c9a3 2156 int ret;
e7938e9e 2157 char *mount_entry;
5ef5c9a3
CB
2158 struct lxc_list *iterator;
2159 FILE *file;
2160 int fd = -1;
2161
2162 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2163 if (fd < 0) {
2164 if (errno != ENOSYS)
2165 return NULL;
2166 file = tmpfile();
2167 } else {
2168 file = fdopen(fd, "r+");
2169 }
e7938e9e 2170
e7938e9e 2171 if (!file) {
fad6ef95 2172 int saved_errno = errno;
5ef5c9a3
CB
2173 if (fd != -1)
2174 close(fd);
fad6ef95 2175 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
9fc7f8c0 2176 return NULL;
e7938e9e
MN
2177 }
2178
2179 lxc_list_for_each(iterator, mount) {
2180 mount_entry = iterator->elem;
5ef5c9a3
CB
2181 ret = fprintf(file, "%s\n", mount_entry);
2182 if (ret < strlen(mount_entry))
2183 WARN("Could not write mount entry to anonymous mount file.");
2184 }
2185
2186 if (fseek(file, 0, SEEK_SET) < 0) {
2187 fclose(file);
2188 return NULL;
e7938e9e
MN
2189 }
2190
9fc7f8c0
TA
2191 return file;
2192}
2193
5ef5c9a3
CB
/* Mount all entries held in the @mount list.
 *
 * The list is first written to an anonymous file so that it can be
 * processed by mount_file_entries().
 *
 * Returns -1 when that file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *stream = make_anonymous_mount_file(mount);

	if (stream == NULL)
		return -1;

	rc = mount_file_entries(rootfs, stream, lxc_name, lxc_path);
	fclose(stream);

	return rc;
}
2210
bab88e68
CS
2211static int parse_cap(const char *cap)
2212{
2213 char *ptr = NULL;
84760c11 2214 size_t i;
2215 int capid = -1;
bab88e68 2216
7035407c
DE
2217 if (!strcmp(cap, "none"))
2218 return -2;
2219
bab88e68
CS
2220 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2221
2222 if (strcmp(cap, caps_opt[i].name))
2223 continue;
2224
2225 capid = caps_opt[i].value;
2226 break;
2227 }
2228
2229 if (capid < 0) {
2230 /* try to see if it's numeric, so the user may specify
2231 * capabilities that the running kernel knows about but
2232 * we don't */
2233 errno = 0;
2234 capid = strtol(cap, &ptr, 10);
2235 if (!ptr || *ptr != '\0' || errno != 0)
2236 /* not a valid number */
2237 capid = -1;
2238 else if (capid > lxc_caps_last_cap())
2239 /* we have a number but it's not a valid
2240 * capability */
2241 capid = -1;
2242 }
2243
2244 return capid;
2245}
2246
0769b82a
CS
2247int in_caplist(int cap, struct lxc_list *caps)
2248{
2249 struct lxc_list *iterator;
2250 int capid;
2251
2252 lxc_list_for_each(iterator, caps) {
2253 capid = parse_cap(iterator->elem);
2254 if (capid == cap)
2255 return 1;
2256 }
2257
2258 return 0;
2259}
2260
81810dd1
DL
2261static int setup_caps(struct lxc_list *caps)
2262{
2263 struct lxc_list *iterator;
2264 char *drop_entry;
bab88e68 2265 int capid;
81810dd1
DL
2266
2267 lxc_list_for_each(iterator, caps) {
2268
2269 drop_entry = iterator->elem;
2270
bab88e68 2271 capid = parse_cap(drop_entry);
d55bc1ad 2272
81810dd1 2273 if (capid < 0) {
1e11be34
DL
2274 ERROR("unknown capability %s", drop_entry);
2275 return -1;
81810dd1
DL
2276 }
2277
2278 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2279
2280 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2281 SYSERROR("failed to remove %s capability", drop_entry);
2282 return -1;
2283 }
81810dd1
DL
2284
2285 }
2286
1fb86a7c
SH
2287 DEBUG("capabilities have been setup");
2288
2289 return 0;
2290}
2291
2292static int dropcaps_except(struct lxc_list *caps)
2293{
2294 struct lxc_list *iterator;
2295 char *keep_entry;
1fb86a7c
SH
2296 int i, capid;
2297 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2298 INFO("found %d capabilities", numcaps);
1fb86a7c 2299
2caf9a97
SH
2300 if (numcaps <= 0 || numcaps > 200)
2301 return -1;
2302
1fb86a7c
SH
2303 // caplist[i] is 1 if we keep capability i
2304 int *caplist = alloca(numcaps * sizeof(int));
2305 memset(caplist, 0, numcaps * sizeof(int));
2306
2307 lxc_list_for_each(iterator, caps) {
2308
2309 keep_entry = iterator->elem;
2310
bab88e68 2311 capid = parse_cap(keep_entry);
1fb86a7c 2312
7035407c
DE
2313 if (capid == -2)
2314 continue;
2315
1fb86a7c
SH
2316 if (capid < 0) {
2317 ERROR("unknown capability %s", keep_entry);
2318 return -1;
2319 }
2320
8255688a 2321 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2322
2323 caplist[capid] = 1;
2324 }
2325 for (i=0; i<numcaps; i++) {
2326 if (caplist[i])
2327 continue;
2328 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2329 SYSERROR("failed to remove capability %d", i);
2330 return -1;
2331 }
1fb86a7c
SH
2332 }
2333
2334 DEBUG("capabilities have been setup");
81810dd1
DL
2335
2336 return 0;
2337}
2338
0ad19a3f 2339static int setup_hw_addr(char *hwaddr, const char *ifname)
2340{
2341 struct sockaddr sockaddr;
2342 struct ifreq ifr;
fad6ef95 2343 int ret, fd, saved_errno;
0ad19a3f 2344
3cfc0f3a
MN
2345 ret = lxc_convert_mac(hwaddr, &sockaddr);
2346 if (ret) {
2347 ERROR("mac address '%s' conversion failed : %s",
2348 hwaddr, strerror(-ret));
0ad19a3f 2349 return -1;
2350 }
2351
2352 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2353 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2354 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2355
2356 fd = socket(AF_INET, SOCK_DGRAM, 0);
2357 if (fd < 0) {
3ab87b66 2358 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2359 return -1;
2360 }
2361
2362 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2363 saved_errno = errno;
0ad19a3f 2364 close(fd);
2365 if (ret)
fad6ef95 2366 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2367
5da6aa8c 2368 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2369
0ad19a3f 2370 return ret;
2371}
2372
82d5ae15 2373static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2374{
82d5ae15
DL
2375 struct lxc_list *iterator;
2376 struct lxc_inetdev *inetdev;
3cfc0f3a 2377 int err;
0ad19a3f 2378
82d5ae15
DL
2379 lxc_list_for_each(iterator, ip) {
2380
2381 inetdev = iterator->elem;
2382
0093bb8c
DL
2383 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2384 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2385 if (err) {
2386 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2387 ifindex, strerror(-err));
82d5ae15
DL
2388 return -1;
2389 }
2390 }
2391
2392 return 0;
0ad19a3f 2393}
2394
82d5ae15 2395static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2396{
82d5ae15 2397 struct lxc_list *iterator;
7fa9074f 2398 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2399 int err;
0ad19a3f 2400
82d5ae15
DL
2401 lxc_list_for_each(iterator, ip) {
2402
2403 inet6dev = iterator->elem;
2404
b3df193c 2405 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2406 &inet6dev->mcast, &inet6dev->acast,
2407 inet6dev->prefix);
3cfc0f3a
MN
2408 if (err) {
2409 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2410 ifindex, strerror(-err));
82d5ae15 2411 return -1;
3cfc0f3a 2412 }
82d5ae15
DL
2413 }
2414
2415 return 0;
0ad19a3f 2416}
2417
82d5ae15 2418static int setup_netdev(struct lxc_netdev *netdev)
0ad19a3f 2419{
0ad19a3f 2420 char ifname[IFNAMSIZ];
0ad19a3f 2421 char *current_ifname = ifname;
3cfc0f3a 2422 int err;
0ad19a3f 2423
82d5ae15
DL
2424 /* empty network namespace */
2425 if (!netdev->ifindex) {
b0efbac4 2426 if (netdev->flags & IFF_UP) {
d472214b 2427 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2428 if (err) {
2429 ERROR("failed to set the loopback up : %s",
2430 strerror(-err));
82d5ae15
DL
2431 return -1;
2432 }
82d5ae15 2433 }
40790553
SH
2434 if (netdev->type != LXC_NET_VETH)
2435 return 0;
2436 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2437 }
13954cce 2438
b466dc33 2439 /* get the new ifindex in case of physical netdev */
40790553 2440 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2441 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2442 ERROR("failed to get ifindex for %s",
2443 netdev->link);
2444 return -1;
2445 }
40790553 2446 }
b466dc33 2447
82d5ae15
DL
2448 /* retrieve the name of the interface */
2449 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2450 ERROR("no interface corresponding to index '%d'",
82d5ae15 2451 netdev->ifindex);
0ad19a3f 2452 return -1;
2453 }
13954cce 2454
018ef520 2455 /* default: let the system to choose one interface name */
9d083402 2456 if (!netdev->name)
fb6d9b2f
DL
2457 netdev->name = netdev->type == LXC_NET_PHYS ?
2458 netdev->link : "eth%d";
018ef520 2459
82d5ae15 2460 /* rename the interface name */
40790553
SH
2461 if (strcmp(ifname, netdev->name) != 0) {
2462 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2463 if (err) {
2464 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2465 strerror(-err));
2466 return -1;
2467 }
018ef520
DL
2468 }
2469
2470 /* Re-read the name of the interface because its name has changed
2471 * and would be automatically allocated by the system
2472 */
82d5ae15 2473 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2474 ERROR("no interface corresponding to index '%d'",
82d5ae15 2475 netdev->ifindex);
018ef520 2476 return -1;
0ad19a3f 2477 }
2478
82d5ae15
DL
2479 /* set a mac address */
2480 if (netdev->hwaddr) {
2481 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2482 ERROR("failed to setup hw address for '%s'",
82d5ae15 2483 current_ifname);
0ad19a3f 2484 return -1;
2485 }
2486 }
2487
82d5ae15
DL
2488 /* setup ipv4 addresses on the interface */
2489 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2490 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2491 ifname);
2492 return -1;
2493 }
2494
82d5ae15
DL
2495 /* setup ipv6 addresses on the interface */
2496 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2497 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2498 ifname);
2499 return -1;
2500 }
2501
82d5ae15 2502 /* set the network device up */
b0efbac4 2503 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2504 int err;
2505
d472214b 2506 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2507 if (err) {
2508 ERROR("failed to set '%s' up : %s", current_ifname,
2509 strerror(-err));
0ad19a3f 2510 return -1;
2511 }
2512
2513 /* the network is up, make the loopback up too */
d472214b 2514 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2515 if (err) {
2516 ERROR("failed to set the loopback up : %s",
2517 strerror(-err));
0ad19a3f 2518 return -1;
2519 }
2520 }
2521
f8fee0e2
MK
2522 /* We can only set up the default routes after bringing
2523 * up the interface, sine bringing up the interface adds
2524 * the link-local routes and we can't add a default
2525 * route if the gateway is not reachable. */
2526
2527 /* setup ipv4 gateway on the interface */
2528 if (netdev->ipv4_gateway) {
2529 if (!(netdev->flags & IFF_UP)) {
2530 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2531 return -1;
2532 }
2533
2534 if (lxc_list_empty(&netdev->ipv4)) {
2535 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2536 return -1;
2537 }
2538
2539 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2540 if (err) {
fc739df5
SG
2541 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2542 if (err) {
2543 ERROR("failed to add ipv4 dest for '%s': %s",
2544 ifname, strerror(-err));
2545 }
2546
2547 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2548 if (err) {
2549 ERROR("failed to setup ipv4 gateway for '%s': %s",
2550 ifname, strerror(-err));
2551 if (netdev->ipv4_gateway_auto) {
2552 char buf[INET_ADDRSTRLEN];
2553 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2554 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2555 }
2556 return -1;
19a26f82 2557 }
f8fee0e2
MK
2558 }
2559 }
2560
2561 /* setup ipv6 gateway on the interface */
2562 if (netdev->ipv6_gateway) {
2563 if (!(netdev->flags & IFF_UP)) {
2564 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2565 return -1;
2566 }
2567
2568 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2569 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2570 return -1;
2571 }
2572
2573 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2574 if (err) {
fc739df5
SG
2575 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2576 if (err) {
2577 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2578 ifname, strerror(-err));
19a26f82 2579 }
fc739df5
SG
2580
2581 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2582 if (err) {
2583 ERROR("failed to setup ipv6 gateway for '%s': %s",
2584 ifname, strerror(-err));
2585 if (netdev->ipv6_gateway_auto) {
2586 char buf[INET6_ADDRSTRLEN];
2587 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2588 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2589 }
2590 return -1;
2591 }
f8fee0e2
MK
2592 }
2593 }
2594
cd54d859
DL
2595 DEBUG("'%s' has been setup", current_ifname);
2596
0ad19a3f 2597 return 0;
2598}
2599
5f4535a3 2600static int setup_network(struct lxc_list *network)
0ad19a3f 2601{
82d5ae15 2602 struct lxc_list *iterator;
82d5ae15 2603 struct lxc_netdev *netdev;
0ad19a3f 2604
5f4535a3 2605 lxc_list_for_each(iterator, network) {
cd54d859 2606
5f4535a3 2607 netdev = iterator->elem;
82d5ae15
DL
2608
2609 if (setup_netdev(netdev)) {
2610 ERROR("failed to setup netdev");
2611 return -1;
2612 }
2613 }
cd54d859 2614
5f4535a3
DL
2615 if (!lxc_list_empty(network))
2616 INFO("network has been setup");
cd54d859
DL
2617
2618 return 0;
0ad19a3f 2619}
2620
c6d09e15
WB
2621static int parse_resource(const char *res) {
2622 size_t i;
2623 int resid = -1;
2624
2625 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2626 if (strcmp(res, limit_opt[i].name) == 0)
2627 return limit_opt[i].value;
2628 }
2629
2630 /* try to see if it's numeric, so the user may specify
2631 * resources that the running kernel knows about but
2632 * we don't */
2633 if (lxc_safe_int(res, &resid) == 0)
2634 return resid;
2635 return -1;
2636}
2637
2638int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2639 struct lxc_list *it;
2640 struct lxc_limit *lim;
2641 int resid;
2642
2643 lxc_list_for_each(it, limits) {
2644 lim = it->elem;
2645
2646 resid = parse_resource(lim->resource);
2647 if (resid < 0) {
2648 ERROR("unknown resource %s", lim->resource);
2649 return -1;
2650 }
2651
2652 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2653 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2654 return -1;
2655 }
2656 }
2657 return 0;
2658}
2659
2af6bd1b 2660/* try to move physical nics to the init netns */
5610055a 2661void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2662{
64d2fcb5 2663 int i, oldfd;
4ec31c52 2664 char ifname[IFNAMSIZ];
2af6bd1b 2665
5610055a 2666 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2667 return;
2668
64d2fcb5 2669 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2670
64d2fcb5
CB
2671 oldfd = lxc_preserve_ns(getpid(), "net");
2672 if (oldfd < 0) {
2673 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2674 return;
2675 }
64d2fcb5 2676
2af6bd1b
SH
2677 if (setns(netnsfd, 0) != 0) {
2678 SYSERROR("Failed to enter container netns to reset nics");
2679 close(oldfd);
2680 return;
2681 }
2682 for (i=0; i<conf->num_savednics; i++) {
2683 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2684 /* retrieve the name of the interface */
2685 if (!if_indextoname(s->ifindex, ifname)) {
2686 WARN("no interface corresponding to index '%d'", s->ifindex);
2687 continue;
2688 }
5610055a 2689 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2690 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2691 free(s->orig_name);
2af6bd1b 2692 }
5610055a
WB
2693 conf->num_savednics = 0;
2694
2af6bd1b
SH
2695 if (setns(oldfd, 0) != 0)
2696 SYSERROR("Failed to re-enter monitor's netns");
2697 close(oldfd);
2698}
2699
ae9242c8
SH
2700static char *default_rootfs_mount = LXCROOTFSMOUNT;
2701
7b379ab3 2702struct lxc_conf *lxc_conf_init(void)
089cd8b8 2703{
7b379ab3 2704 struct lxc_conf *new;
26ddeedd 2705 int i;
7b379ab3
MN
2706
2707 new = malloc(sizeof(*new));
2708 if (!new) {
2709 ERROR("lxc_conf_init : %m");
2710 return NULL;
2711 }
2712 memset(new, 0, sizeof(*new));
2713
b40a606e 2714 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
cccc74b5 2715 new->personality = -1;
124fa0a8 2716 new->autodev = 1;
596a818d
DE
2717 new->console.log_path = NULL;
2718 new->console.log_fd = -1;
28a4b0e5 2719 new->console.path = NULL;
63376d7d 2720 new->console.peer = -1;
b5159817
DE
2721 new->console.peerpty.busy = -1;
2722 new->console.peerpty.master = -1;
2723 new->console.peerpty.slave = -1;
63376d7d
DL
2724 new->console.master = -1;
2725 new->console.slave = -1;
2726 new->console.name[0] = '\0';
d2e30e99 2727 new->maincmd_fd = -1;
76a26f55 2728 new->nbd_idx = -1;
54c30e29 2729 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048
SH
2730 if (!new->rootfs.mount) {
2731 ERROR("lxc_conf_init : %m");
2732 free(new);
2733 return NULL;
2734 }
d89de239 2735 new->kmsg = 0;
858377e4 2736 new->logfd = -1;
7b379ab3
MN
2737 lxc_list_init(&new->cgroup);
2738 lxc_list_init(&new->network);
2739 lxc_list_init(&new->mount_list);
81810dd1 2740 lxc_list_init(&new->caps);
1fb86a7c 2741 lxc_list_init(&new->keepcaps);
f6d3e3e4 2742 lxc_list_init(&new->id_map);
f979ac15 2743 lxc_list_init(&new->includes);
4184c3e1 2744 lxc_list_init(&new->aliens);
7c661726 2745 lxc_list_init(&new->environment);
c6d09e15 2746 lxc_list_init(&new->limits);
26ddeedd
SH
2747 for (i=0; i<NUM_LXC_HOOKS; i++)
2748 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2749 lxc_list_init(&new->groups);
fe4de9a6
DE
2750 new->lsm_aa_profile = NULL;
2751 new->lsm_se_context = NULL;
5112cd70 2752 new->tmp_umount_proc = 0;
7b379ab3 2753
9f30a190
MM
2754 for (i = 0; i < LXC_NS_MAX; i++)
2755 new->inherit_ns_fd[i] = -1;
2756
72bb04e4
PT
2757 /* if running in a new user namespace, init and COMMAND
2758 * default to running as UID/GID 0 when using lxc-execute */
2759 new->init_uid = 0;
2760 new->init_gid = 0;
2761
7b379ab3 2762 return new;
089cd8b8
DL
2763}
2764
a589434e 2765static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2766{
8634bc19 2767 char veth1buf[IFNAMSIZ], *veth1;
0e391e57 2768 char veth2buf[IFNAMSIZ], *veth2;
b7b2fde4
CB
2769 int bridge_index, err;
2770 unsigned int mtu = 0;
13954cce 2771
8bee8851 2772 if (netdev->priv.veth_attr.pair) {
e892973e 2773 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2774 if (handler->conf->reboot)
2775 lxc_netdev_delete_by_name(veth1);
2776 } else {
9ba8130c
SH
2777 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2778 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2779 ERROR("veth1 name too long");
2780 return -1;
2781 }
a0265685 2782 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2783 if (!veth1) {
2784 ERROR("failed to allocate a temporary name");
2785 return -1;
2786 }
74a2b586
JK
2787 /* store away for deconf */
2788 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2789 }
82d5ae15 2790
0e391e57 2791 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2792 veth2 = lxc_mkifname(veth2buf);
ad40563e 2793 if (!veth2) {
82d5ae15 2794 ERROR("failed to allocate a temporary name");
ad40563e 2795 goto out_delete;
0ad19a3f 2796 }
2797
3cfc0f3a
MN
2798 err = lxc_veth_create(veth1, veth2);
2799 if (err) {
2e2d6a7b 2800 ERROR("failed to create veth pair (%s and %s): %s", veth1, veth2,
3cfc0f3a 2801 strerror(-err));
ad40563e 2802 goto out_delete;
0ad19a3f 2803 }
13954cce 2804
49684c0b
CS
2805 /* changing the high byte of the mac address to 0xfe, the bridge interface
2806 * will always keep the host's mac address and not take the mac address
2807 * of a container */
2808 err = setup_private_host_hw_addr(veth1);
2809 if (err) {
2e2d6a7b 2810 ERROR("failed to change mac address of host interface '%s': %s",
49684c0b
CS
2811 veth1, strerror(-err));
2812 goto out_delete;
2813 }
2814
af651aa9
SN
2815 netdev->ifindex = if_nametoindex(veth2);
2816 if (!netdev->ifindex) {
2817 ERROR("failed to retrieve the index for %s", veth2);
2818 goto out_delete;
2819 }
2820
82d5ae15 2821 if (netdev->mtu) {
b7b2fde4
CB
2822 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2823 WARN("Failed to parse mtu from.");
2824 else
2825 INFO("Retrieved mtu %d", mtu);
e54864d3 2826 } else if (netdev->link) {
e9280f65 2827 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2828 if (bridge_index) {
2829 mtu = netdev_get_mtu(bridge_index);
2830 INFO("Retrieved mtu %d from %s", mtu, netdev->link);
2831 } else {
2832 mtu = netdev_get_mtu(netdev->ifindex);
2833 INFO("Retrieved mtu %d from %s", mtu, veth2);
2834 }
e54864d3
NC
2835 }
2836
2837 if (mtu) {
2838 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2839 if (!err)
e54864d3 2840 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2841 if (err) {
e54864d3
NC
2842 ERROR("failed to set mtu '%i' for veth pair (%s and %s): %s",
2843 mtu, veth1, veth2, strerror(-err));
eb14c10a 2844 goto out_delete;
75d09f83
DL
2845 }
2846 }
2847
3cfc0f3a 2848 if (netdev->link) {
c43cbc04 2849 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2850 if (err) {
2e2d6a7b 2851 ERROR("failed to attach '%s' to the bridge '%s': %s",
3cfc0f3a
MN
2852 veth1, netdev->link, strerror(-err));
2853 goto out_delete;
2854 }
738d0deb 2855 INFO("Attached '%s': to the bridge '%s': ", veth1, netdev->link);
eb14c10a
DL
2856 }
2857
d472214b 2858 err = lxc_netdev_up(veth1);
6e35af2e
DL
2859 if (err) {
2860 ERROR("failed to set %s up : %s", veth1, strerror(-err));
2861 goto out_delete;
0ad19a3f 2862 }
2863
e3b4c4c4 2864 if (netdev->upscript) {
751d9dcd
DL
2865 err = run_script(handler->name, "net", netdev->upscript, "up",
2866 "veth", veth1, (char*) NULL);
2867 if (err)
e3b4c4c4 2868 goto out_delete;
e3b4c4c4
ST
2869 }
2870
a589434e 2871 DEBUG("instantiated veth '%s/%s', index is '%d'",
82d5ae15
DL
2872 veth1, veth2, netdev->ifindex);
2873
6ab9ab6d 2874 return 0;
eb14c10a
DL
2875
2876out_delete:
b316d209
CB
2877 if (netdev->ifindex != 0)
2878 lxc_netdev_delete_by_name(veth1);
f10fad2f 2879 if (!netdev->priv.veth_attr.pair)
ad40563e 2880 free(veth1);
f10fad2f 2881 free(veth2);
6ab9ab6d 2882 return -1;
13954cce 2883}
d957ae2d 2884
74a2b586
JK
2885static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2886{
2887 char *veth1;
2888 int err;
2889
2890 if (netdev->priv.veth_attr.pair)
2891 veth1 = netdev->priv.veth_attr.pair;
2892 else
2893 veth1 = netdev->priv.veth_attr.veth1;
2894
2895 if (netdev->downscript) {
2896 err = run_script(handler->name, "net", netdev->downscript,
2897 "down", "veth", veth1, (char*) NULL);
2898 if (err)
2899 return -1;
2900 }
2901 return 0;
2902}
2903
a589434e 2904static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2905{
0e391e57 2906 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2907 int err;
d957ae2d
MT
2908
2909 if (!netdev->link) {
2910 ERROR("no link specified for macvlan netdev");
2911 return -1;
2912 }
13954cce 2913
9ba8130c
SH
2914 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2915 if (err >= sizeof(peerbuf))
2916 return -1;
82d5ae15 2917
a0265685 2918 peer = lxc_mkifname(peerbuf);
ad40563e 2919 if (!peer) {
82d5ae15
DL
2920 ERROR("failed to make a temporary name");
2921 return -1;
0ad19a3f 2922 }
2923
3cfc0f3a
MN
2924 err = lxc_macvlan_create(netdev->link, peer,
2925 netdev->priv.macvlan_attr.mode);
2926 if (err) {
2927 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2928 peer, netdev->link, strerror(-err));
ad40563e 2929 goto out;
0ad19a3f 2930 }
2931
82d5ae15
DL
2932 netdev->ifindex = if_nametoindex(peer);
2933 if (!netdev->ifindex) {
36eb9bde 2934 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2935 goto out;
22ebac19 2936 }
2937
e3b4c4c4 2938 if (netdev->upscript) {
751d9dcd
DL
2939 err = run_script(handler->name, "net", netdev->upscript, "up",
2940 "macvlan", netdev->link, (char*) NULL);
2941 if (err)
ad40563e 2942 goto out;
e3b4c4c4
ST
2943 }
2944
a589434e 2945 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2946 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2947
d957ae2d 2948 return 0;
ad40563e
ÇO
2949out:
2950 lxc_netdev_delete_by_name(peer);
2951 free(peer);
2952 return -1;
0ad19a3f 2953}
2954
74a2b586
JK
2955static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2956{
2957 int err;
2958
2959 if (netdev->downscript) {
2960 err = run_script(handler->name, "net", netdev->downscript,
2961 "down", "macvlan", netdev->link,
2962 (char*) NULL);
2963 if (err)
2964 return -1;
2965 }
2966 return 0;
2967}
2968
a589434e
JN
2969/* XXX: merge with instantiate_macvlan */
2970static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2971{
2972 char peer[IFNAMSIZ];
3cfc0f3a 2973 int err;
82f58d03 2974 static uint16_t vlan_cntr = 0;
b7b2fde4 2975 unsigned int mtu = 0;
26c39028
JHS
2976
2977 if (!netdev->link) {
2978 ERROR("no link specified for vlan netdev");
2979 return -1;
2980 }
2981
82f58d03 2982 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2983 if (err >= sizeof(peer)) {
2984 ERROR("peer name too long");
2985 return -1;
2986 }
26c39028 2987
3cfc0f3a
MN
2988 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2989 if (err) {
2990 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2991 peer, netdev->link, strerror(-err));
26c39028
JHS
2992 return -1;
2993 }
2994
2995 netdev->ifindex = if_nametoindex(peer);
2996 if (!netdev->ifindex) {
2997 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2998 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2999 return -1;
3000 }
3001
a589434e 3002 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 3003 netdev->ifindex);
b4fb7de1 3004 if (netdev->mtu) {
b7b2fde4
CB
3005 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
3006 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3007 netdev->ifindex, netdev->name);
3008 return -1;
3009 }
3010 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
3011 if (err) {
3012 ERROR("failed to set mtu '%s' for %s : %s",
3013 netdev->mtu, peer, strerror(-err));
3014 lxc_netdev_delete_by_name(peer);
3015 return -1;
3016 }
3017 }
e892973e 3018
26c39028
JHS
3019 return 0;
3020}
3021
74a2b586
JK
/* Shutdown hook for vlan devices: does nothing and always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3026
a589434e 3027static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3028{
6168e99f
DL
3029 if (!netdev->link) {
3030 ERROR("no link specified for the physical interface");
3031 return -1;
3032 }
3033
9d083402 3034 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 3035 if (!netdev->ifindex) {
9d083402 3036 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 3037 return -1;
3038 }
3039
e3b4c4c4
ST
3040 if (netdev->upscript) {
3041 int err;
751d9dcd
DL
3042 err = run_script(handler->name, "net", netdev->upscript,
3043 "up", "phys", netdev->link, (char*) NULL);
3044 if (err)
e3b4c4c4 3045 return -1;
e3b4c4c4
ST
3046 }
3047
82d5ae15 3048 return 0;
0ad19a3f 3049}
3050
74a2b586
JK
3051static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3052{
3053 int err;
3054
3055 if (netdev->downscript) {
3056 err = run_script(handler->name, "net", netdev->downscript,
3057 "down", "phys", netdev->link, (char*) NULL);
3058 if (err)
3059 return -1;
3060 }
3061 return 0;
3062}
3063
a589434e 3064static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
3065{
3066 netdev->ifindex = 0;
3067 return 0;
3068}
3069
a589434e 3070static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3071{
82d5ae15 3072 netdev->ifindex = 0;
e3b4c4c4
ST
3073 if (netdev->upscript) {
3074 int err;
751d9dcd
DL
3075 err = run_script(handler->name, "net", netdev->upscript,
3076 "up", "empty", (char*) NULL);
3077 if (err)
e3b4c4c4 3078 return -1;
e3b4c4c4 3079 }
82d5ae15 3080 return 0;
0ad19a3f 3081}
3082
74a2b586
JK
3083static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3084{
3085 int err;
3086
3087 if (netdev->downscript) {
3088 err = run_script(handler->name, "net", netdev->downscript,
3089 "down", "empty", (char*) NULL);
3090 if (err)
3091 return -1;
3092 }
3093 return 0;
3094}
3095
26b797f3
SH
/* Shutdown counterpart for the "none" network type: performs no work and
 * always returns 0.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3100
3101int lxc_requests_empty_network(struct lxc_handler *handler)
3102{
3103 struct lxc_list *network = &handler->conf->network;
3104 struct lxc_list *iterator;
3105 struct lxc_netdev *netdev;
3106 bool found_none = false, found_nic = false;
3107
3108 if (lxc_list_empty(network))
3109 return 0;
3110
3111 lxc_list_for_each(iterator, network) {
3112
3113 netdev = iterator->elem;
3114
3115 if (netdev->type == LXC_NET_NONE)
3116 found_none = true;
3117 else
3118 found_nic = true;
3119 }
3120 if (found_none && !found_nic)
3121 return 1;
3122 return 0;
3123}
3124
e3b4c4c4 3125int lxc_create_network(struct lxc_handler *handler)
0ad19a3f 3126{
e3b4c4c4 3127 struct lxc_list *network = &handler->conf->network;
82d5ae15 3128 struct lxc_list *iterator;
82d5ae15 3129 struct lxc_netdev *netdev;
cbef6c52
SH
3130 int am_root = (getuid() == 0);
3131
3132 if (!am_root)
3133 return 0;
0ad19a3f 3134
5f4535a3 3135 lxc_list_for_each(iterator, network) {
0ad19a3f 3136
5f4535a3 3137 netdev = iterator->elem;
13954cce 3138
24654103 3139 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
82d5ae15 3140 ERROR("invalid network configuration type '%d'",
5f4535a3 3141 netdev->type);
82d5ae15
DL
3142 return -1;
3143 }
0ad19a3f 3144
e3b4c4c4 3145 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3146 ERROR("failed to create netdev");
3147 return -1;
3148 }
e3b4c4c4 3149
0ad19a3f 3150 }
3151
3152 return 0;
3153}
3154
358daf49 3155bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3156{
e97946ae 3157 int ret;
74a2b586 3158 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3159 struct lxc_list *iterator;
3160 struct lxc_netdev *netdev;
358daf49 3161 bool deleted_all = true;
7fef7a06
DL
3162
3163 lxc_list_for_each(iterator, network) {
3164 netdev = iterator->elem;
d472214b 3165
74a2b586 3166 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3167 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3168 WARN("Failed to rename interface with index %d "
3169 "to its initial name \"%s\".",
3170 netdev->ifindex, netdev->link);
d472214b 3171 continue;
d8f8e352 3172 }
d472214b 3173
74a2b586 3174 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3175 WARN("Failed to destroy netdev");
74a2b586
JK
3176 }
3177
d8f8e352
DL
3178 /* Recent kernel remove the virtual interfaces when the network
3179 * namespace is destroyed but in case we did not moved the
3180 * interface to the network namespace, we have to destroy it
3181 */
e97946ae
CB
3182 if (netdev->ifindex != 0) {
3183 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3184 if (-ret == ENODEV) {
3185 INFO("Interface \"%s\" with index %d already "
3186 "deleted or existing in different network "
3187 "namespace.",
3188 netdev->name ? netdev->name : "(null)",
3189 netdev->ifindex);
3190 } else if (ret < 0) {
3191 deleted_all = false;
3192 WARN("Failed to remove interface \"%s\" with "
3193 "index %d: %s.",
3194 netdev->name ? netdev->name : "(null)",
3195 netdev->ifindex, strerror(-ret));
3196 } else {
3197 INFO("Removed interface \"%s\" with index %d.",
3198 netdev->name ? netdev->name : "(null)",
3199 netdev->ifindex);
3200 }
e97946ae
CB
3201 }
3202
3203 /* Explicitly delete host veth device to prevent lingering
3204 * devices. We had issues in LXD around this.
3205 */
b316d209 3206 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3207 char *hostveth;
3208 if (netdev->priv.veth_attr.pair) {
e97946ae 3209 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3210 ret = lxc_netdev_delete_by_name(hostveth);
3211 if (ret < 0) {
3212 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3213 } else {
3214 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3215 }
3216 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3217 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3218 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3219 if (ret < 0) {
3220 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3221 } else {
3222 INFO("Removed interface \"%s\" from host.", hostveth);
3223 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3224 }
e97946ae
CB
3225 }
3226 }
7fef7a06 3227 }
358daf49
CB
3228
3229 return deleted_all;
7fef7a06
DL
3230}
3231
45e854dc
SG
3232#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3233
fe1f672f 3234/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3235#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3236static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3237 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3238{
3239 pid_t child;
a7242d9a
ÇO
3240 int bytes, pipefd[2];
3241 char *token, *saveptr = NULL;
fe1f672f 3242 char buffer[MAX_BUFFER_SIZE];
091045f8 3243 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3244
3245 if (netdev->type != LXC_NET_VETH) {
3246 ERROR("nic type %d not support for unprivileged use",
091045f8 3247 netdev->type);
cbef6c52
SH
3248 return -1;
3249 }
3250
091045f8 3251 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3252 SYSERROR("pipe failed");
3253 return -1;
3254 }
3255
091045f8
CB
3256 child = fork();
3257 if (child < 0) {
cbef6c52 3258 SYSERROR("fork");
a7242d9a
ÇO
3259 close(pipefd[0]);
3260 close(pipefd[1]);
3261 return -1;
3262 }
3263
3264 if (child == 0) { // child
091045f8
CB
3265 /* Call lxc-user-nic pid type bridge. */
3266 int ret;
3267 char pidstr[LXC_NUMSTRLEN64];
3268
3269 close(pipefd[0]); /* Close the read-end of the pipe. */
3270
3271 /* Redirect stdout to write-end of the pipe. */
3272 ret = dup2(pipefd[1], STDOUT_FILENO);
3273 close(pipefd[1]); /* Close the write-end of the pipe. */
3274 if (ret < 0) {
3275 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3276 exit(EXIT_FAILURE);
3277 }
a7242d9a 3278
091045f8 3279 if (netdev->link)
cff7b5eb 3280 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3281 else
cff7b5eb 3282 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3283
3284 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3285 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3286 exit(EXIT_FAILURE);
3287 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3288
3289 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3290 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3291 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3292 pidstr, "veth", netdev_link, netdev->name, NULL);
3293
3294 SYSERROR("Failed to exec lxc-user-nic.");
3295 exit(EXIT_FAILURE);
a7242d9a
ÇO
3296 }
3297
3298 /* close the write-end of the pipe */
3299 close(pipefd[1]);
3300
fe1f672f 3301 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3302 if (bytes < 0)
3303 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3304 buffer[bytes - 1] = '\0';
3305
3306 if (wait_for_pid(child) != 0) {
3307 close(pipefd[0]);
cbef6c52
SH
3308 return -1;
3309 }
3310
a7242d9a
ÇO
3311 /* close the read-end of the pipe */
3312 close(pipefd[0]);
cbef6c52 3313
a7242d9a
ÇO
3314 /* fill netdev->name field */
3315 token = strtok_r(buffer, ":", &saveptr);
3316 if (!token)
3317 return -1;
091045f8
CB
3318
3319 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3320 if (!netdev->name) {
091045f8 3321 SYSERROR("Failed to allocate memory.");
658979c5
SH
3322 return -1;
3323 }
091045f8 3324 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3325 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3326
3327 /* fill netdev->veth_attr.pair field */
3328 token = strtok_r(NULL, ":", &saveptr);
3329 if (!token)
3330 return -1;
091045f8 3331
a7242d9a 3332 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3333 if (!netdev->priv.veth_attr.pair) {
091045f8 3334 ERROR("Failed to allocate memory.");
658979c5
SH
3335 return -1;
3336 }
45e854dc 3337
a7242d9a 3338 return 0;
cbef6c52
SH
3339}
3340
c43cbc04
SH
3341int lxc_assign_network(const char *lxcpath, char *lxcname,
3342 struct lxc_list *network, pid_t pid)
0ad19a3f 3343{
82d5ae15 3344 struct lxc_list *iterator;
82d5ae15 3345 struct lxc_netdev *netdev;
f2e206ff 3346 char ifname[IFNAMSIZ];
cbef6c52 3347 int am_root = (getuid() == 0);
3cfc0f3a 3348 int err;
0ad19a3f 3349
5f4535a3 3350 lxc_list_for_each(iterator, network) {
82d5ae15 3351
5f4535a3 3352 netdev = iterator->elem;
82d5ae15 3353
fbb16259 3354 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3355 if (netdev->mtu)
3356 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3357 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3358 return -1;
658979c5
SH
3359 // lxc-user-nic has moved the nic to the new ns.
3360 // unpriv_assign_nic() fills in netdev->name.
3361 // netdev->ifindex will be filed in at setup_netdev.
cbef6c52
SH
3362 continue;
3363 }
236087a6 3364
fbb16259
SH
3365 /* empty network namespace, nothing to move */
3366 if (!netdev->ifindex)
3367 continue;
3368
f2e206ff 3369 /* retrieve the name of the interface */
3370 if (!if_indextoname(netdev->ifindex, ifname)) {
3371 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3372 return -1;
3373 }
3374
3375 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3376 if (err) {
3377 ERROR("failed to move '%s' to the container : %s",
3378 netdev->link, strerror(-err));
82d5ae15
DL
3379 return -1;
3380 }
3381
198cbbaa 3382 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3383 }
3384
3385 return 0;
3386}
3387
251d0d2a
DE
3388static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3389 size_t buf_size)
f6d3e3e4 3390{
29053180
CB
3391 char path[MAXPATHLEN];
3392 int fd, ret;
f6d3e3e4 3393
29053180
CB
3394 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3395 idtype == ID_TYPE_UID ? 'u' : 'g');
3396 if (ret < 0 || ret >= MAXPATHLEN) {
3397 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3398 return -E2BIG;
3399 }
29053180
CB
3400
3401 fd = open(path, O_WRONLY);
3402 if (fd < 0) {
3403 SYSERROR("failed to open \"%s\"", path);
3404 return -1;
f6d3e3e4 3405 }
29053180
CB
3406
3407 errno = 0;
3408 ret = lxc_write_nointr(fd, buf, buf_size);
3409 if (ret != buf_size) {
3410 SYSERROR("failed to write %cid mapping to \"%s\"",
3411 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3412 close(fd);
3413 return -1;
3414 }
3415 close(fd);
3416
3417 return 0;
f6d3e3e4
SH
3418}
3419
df6a2945
CB
3420/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both. */
3421static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3422{
3423 char *path;
3424 int ret;
3425 struct stat st;
3426 int fret = 0;
3427
3428 path = on_path(binary, NULL);
3429 if (!path)
3430 return -ENOENT;
3431
3432 ret = stat(path, &st);
3433 if (ret < 0) {
3434 fret = -errno;
3435 goto cleanup;
3436 }
3437
3438 /* Check if the binary is setuid. */
3439 if (st.st_mode & S_ISUID) {
3440 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3441 fret = 1;
3442 goto cleanup;
3443 }
3444
69924fff 3445 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3446 /* Check if it has the CAP_SETUID capability. */
3447 if ((cap & CAP_SETUID) &&
3448 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3449 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3450 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3451 "and CAP_PERMITTED sets.", path);
3452 fret = 1;
3453 goto cleanup;
3454 }
3455
3456 /* Check if it has the CAP_SETGID capability. */
3457 if ((cap & CAP_SETGID) &&
3458 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3459 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3460 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3461 "and CAP_PERMITTED sets.", path);
3462 fret = 1;
3463 goto cleanup;
3464 }
d6018f88 3465 #else
69924fff
CB
3466 /* If we cannot check for file capabilities we need to give the benefit
3467 * of the doubt. Otherwise we might fail even though all the necessary
3468 * file capabilities are set.
3469 */
d6018f88
CB
3470 DEBUG("Cannot check for file capabilites as full capability support is "
3471 "missing. Manual intervention needed.");
3472 fret = 1;
df6a2945
CB
3473 #endif
3474
3475cleanup:
3476 free(path);
3477 return fret;
3478}
3479
986ef930
CB
/* Replace the current process image with "/bin/sh -c <args>", where @args is
 * the command line string. Returns -1 only if execl() itself returns.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
3485
f6d3e3e4
SH
3486int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3487{
f6d3e3e4 3488 struct id_map *map;
4bc3b759 3489 struct lxc_list *iterator;
251d0d2a 3490 enum idtype type;
986ef930 3491 char u_or_g;
4bc3b759 3492 char *pos;
99d43365 3493 int fill, left;
986ef930
CB
3494 char cmd_output[MAXPATHLEN];
3495 /* strlen("new@idmap") = 9
3496 * +
3497 * strlen(" ") = 1
3498 * +
3499 * LXC_NUMSTRLEN64
3500 * +
3501 * strlen(" ") = 1
3502 *
3503 * We add some additional space to make sure that we really have
3504 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3505 */
3506 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3507 int ret = 0, uidmap = 0, gidmap = 0;
3508 bool use_shadow = false, had_entry = false;
df6a2945
CB
3509
3510 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3511 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3512 * will protected it by preventing another user from being handed the
3513 * range by shadow.
3514 */
df6a2945
CB
3515 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3516 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3517 if (uidmap > 0 && gidmap > 0) {
3518 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3519 use_shadow = true;
df6a2945 3520 } else {
99d43365
CB
3521 /* In case unprivileged users run application containers via
3522 * execute() or a start*() there are valid cases where they may
3523 * only want to map their own {g,u}id. Let's not block them from
3524 * doing so by requiring geteuid() == 0.
3525 */
3526 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3527 "write directly with euid %d.", geteuid());
0e6e3a41 3528 }
251d0d2a 3529
986ef930
CB
3530 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3531 type++, u_or_g = 'g') {
3532 pos = mapbuf;
3533
0e6e3a41 3534 if (use_shadow)
986ef930 3535 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3536
cf3ef16d 3537 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3538 /* The kernel only takes <= 4k for writes to
3539 * /proc/<nr>/[ug]id_map
3540 */
251d0d2a 3541 map = iterator->elem;
cf3ef16d
SH
3542 if (map->idtype != type)
3543 continue;
3544
4bc3b759
CB
3545 had_entry = true;
3546
986ef930 3547 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3548 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3549 use_shadow ? " " : "", map->nsid,
3550 map->hostid, map->range,
0e6e3a41 3551 use_shadow ? "" : "\n");
cf3ef16d 3552 if (fill <= 0 || fill >= left)
4bc3b759
CB
3553 SYSERROR("Too many {g,u}id mappings defined.");
3554
cf3ef16d 3555 pos += fill;
251d0d2a 3556 }
cf3ef16d 3557 if (!had_entry)
4f7521b4 3558 continue;
cf3ef16d 3559
986ef930
CB
3560 /* Try to catch the ouput of new{g,u}idmap to make debugging
3561 * easier.
3562 */
3563 if (use_shadow) {
3564 ret = run_command(cmd_output, sizeof(cmd_output),
3565 lxc_map_ids_exec_wrapper,
3566 (void *)mapbuf);
3567 if (ret < 0) {
3568 ERROR("new%cidmap failed to write mapping: %s",
3569 u_or_g, cmd_output);
3570 return -1;
3571 }
d1838f34 3572 } else {
986ef930
CB
3573 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3574 if (ret < 0)
3575 return -1;
d1838f34 3576 }
986ef930
CB
3577
3578 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3579 }
251d0d2a 3580
986ef930 3581 return 0;
f6d3e3e4
SH
3582}
3583
cf3ef16d 3584/*
7b50c609
TS
3585 * return the host uid/gid to which the container root is mapped in
3586 * *val.
0b3a6504 3587 * Return true if id was found, false otherwise.
cf3ef16d 3588 */
2a9a80cb 3589bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3590 unsigned long *val)
cf3ef16d
SH
3591{
3592 struct lxc_list *it;
3593 struct id_map *map;
3594
3595 lxc_list_for_each(it, &conf->id_map) {
3596 map = it->elem;
7b50c609 3597 if (map->idtype != idtype)
cf3ef16d
SH
3598 continue;
3599 if (map->nsid != 0)
3600 continue;
2a9a80cb
SH
3601 *val = map->hostid;
3602 return true;
cf3ef16d 3603 }
2a9a80cb 3604 return false;
cf3ef16d
SH
3605}
3606
2133f58c 3607int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3608{
3609 struct lxc_list *it;
3610 struct id_map *map;
3611 lxc_list_for_each(it, &conf->id_map) {
3612 map = it->elem;
2133f58c 3613 if (map->idtype != idtype)
cf3ef16d
SH
3614 continue;
3615 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3616 return (id - map->hostid) + map->nsid;
cf3ef16d 3617 }
57d116ab 3618 return -1;
cf3ef16d
SH
3619}
3620
2133f58c 3621int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3622{
3623 struct lxc_list *it;
3624 struct id_map *map;
2133f58c 3625 unsigned int freeid = 0;
cf3ef16d
SH
3626again:
3627 lxc_list_for_each(it, &conf->id_map) {
3628 map = it->elem;
2133f58c 3629 if (map->idtype != idtype)
cf3ef16d
SH
3630 continue;
3631 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3632 freeid = map->nsid + map->range;
3633 goto again;
3634 }
3635 }
3636 return freeid;
3637}
3638
19a26f82
MK
3639int lxc_find_gateway_addresses(struct lxc_handler *handler)
3640{
3641 struct lxc_list *network = &handler->conf->network;
3642 struct lxc_list *iterator;
3643 struct lxc_netdev *netdev;
3644 int link_index;
3645
3646 lxc_list_for_each(iterator, network) {
3647 netdev = iterator->elem;
3648
3649 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3650 continue;
3651
3652 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3653 ERROR("gateway = auto only supported for "
3654 "veth and macvlan");
3655 return -1;
3656 }
3657
3658 if (!netdev->link) {
3659 ERROR("gateway = auto needs a link interface");
3660 return -1;
3661 }
3662
3663 link_index = if_nametoindex(netdev->link);
3664 if (!link_index)
3665 return -EINVAL;
3666
3667 if (netdev->ipv4_gateway_auto) {
3668 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3669 ERROR("failed to automatically find ipv4 gateway "
3670 "address from link interface '%s'", netdev->link);
3671 return -1;
3672 }
3673 }
3674
3675 if (netdev->ipv6_gateway_auto) {
3676 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3677 ERROR("failed to automatically find ipv6 gateway "
3678 "address from link interface '%s'", netdev->link);
3679 return -1;
3680 }
3681 }
3682 }
3683
3684 return 0;
3685}
3686
5e4a62bf 3687int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3688{
5e4a62bf 3689 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3690 int i, ret;
b0a33c1e 3691
5e4a62bf
DL
3692 /* no tty in the configuration */
3693 if (!conf->tty)
b0a33c1e 3694 return 0;
3695
9e1045e3 3696 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3697 if (!tty_info->pty_info) {
9e1045e3
CB
3698 SYSERROR("failed to allocate struct *pty_info");
3699 return -ENOMEM;
b0a33c1e 3700 }
3701
985d15b1 3702 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3703 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3704
025ed0f3
SH
3705 process_lock();
3706 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3707 pty_info->name, NULL, NULL);
025ed0f3
SH
3708 process_unlock();
3709 if (ret) {
9e1045e3 3710 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3711 tty_info->nbtty = i;
3712 lxc_delete_tty(tty_info);
9e1045e3 3713 return -ENOTTY;
b0a33c1e 3714 }
3715
9e1045e3 3716 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3717 pty_info->name, pty_info->master, pty_info->slave);
3718
3ec1648d 3719 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3720 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3721 if (ret < 0)
3722 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3723 "pty device \"%s\": %s",
3724 pty_info->master, pty_info->name, strerror(errno));
3725
3726 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3727 if (ret < 0)
3728 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3729 "pty device \"%s\": %s",
3730 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3731
b0a33c1e 3732 pty_info->busy = 0;
3733 }
3734
985d15b1 3735 tty_info->nbtty = conf->tty;
1ac470c0 3736
9e1045e3 3737 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3738 return 0;
b0a33c1e 3739}
3740
3741void lxc_delete_tty(struct lxc_tty_info *tty_info)
3742{
3743 int i;
3744
3745 for (i = 0; i < tty_info->nbtty; i++) {
3746 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3747
3748 close(pty_info->master);
3749 close(pty_info->slave);
3750 }
3751
3752 free(tty_info->pty_info);
e00c0242 3753 tty_info->pty_info = NULL;
b0a33c1e 3754 tty_info->nbtty = 0;
3755}
3756
f4f52cb5
CB
3757
/* Exec "lxc-usernsexec" with @args as its argument vector. Returns -1 only if
 * execvp() itself returns.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	(void)execvp("lxc-usernsexec", args);

	return -1;
}
3763
f6d3e3e4 3764/*
7b50c609
TS
3765 * chown_mapped_root: for an unprivileged user with uid/gid X to
3766 * chown a dir to subuid/subgid Y, he needs to run chown as root
3767 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3768 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3769 * root is privileged with respect to hostuid/hostgid X, allowing
3770 * him to do the chown.
f6d3e3e4 3771 */
c4d10a05 3772int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3773{
f4f52cb5 3774 uid_t rootuid, rootgid;
2a9a80cb 3775 unsigned long val;
a7ef8753 3776 char *chownpath = path;
f4f52cb5
CB
3777 int hostuid, hostgid, ret;
3778 struct stat sb;
3779 char map1[100], map2[100], map3[100], map4[100], map5[100];
3780 char ugid[100];
3781 char *args1[] = {"lxc-usernsexec",
3782 "-m", map1,
3783 "-m", map2,
3784 "-m", map3,
3785 "-m", map5,
3786 "--", "chown", ugid, path,
3787 NULL};
3788 char *args2[] = {"lxc-usernsexec",
3789 "-m", map1,
3790 "-m", map2,
3791 "-m", map3,
3792 "-m", map4,
3793 "-m", map5,
3794 "--", "chown", ugid, path,
3795 NULL};
3796 char cmd_output[MAXPATHLEN];
3797
3798 hostuid = geteuid();
3799 hostgid = getegid();
f6d3e3e4 3800
2a9a80cb 3801 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3802 ERROR("No uid mapping for container root");
c4d10a05 3803 return -1;
f6d3e3e4 3804 }
f4f52cb5 3805 rootuid = (uid_t)val;
7b50c609 3806 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3807 ERROR("No gid mapping for container root");
7b50c609
TS
3808 return -1;
3809 }
f4f52cb5 3810 rootgid = (gid_t)val;
2a9a80cb 3811
a7ef8753 3812 /*
f4f52cb5 3813 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3814 */
1f92162d 3815 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3816 chownpath = strchr(path, ':');
3817 if (!chownpath) {
3818 ERROR("Bad overlay path: %s", path);
3819 return -1;
3820 }
f4f52cb5 3821 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3822 if (!chownpath) {
3823 ERROR("Bad overlay path: %s", path);
3824 return -1;
3825 }
3826 chownpath++;
3827 }
3828 path = chownpath;
f4f52cb5 3829 if (hostuid == 0) {
7b50c609 3830 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3831 ERROR("Error chowning %s", path);
3832 return -1;
3833 }
3834 return 0;
3835 }
f3d7e4ca 3836
f4f52cb5 3837 if (rootuid == hostuid) {
f3d7e4ca
SH
3838 // nothing to do
3839 INFO("%s: container root is our uid; no need to chown" ,__func__);
3840 return 0;
3841 }
3842
f4f52cb5
CB
3843 // save the current gid of "path"
3844 if (stat(path, &sb) < 0) {
3845 ERROR("Error stat %s", path);
f6d3e3e4
SH
3846 return -1;
3847 }
7b50c609 3848
f4f52cb5
CB
3849 /*
3850 * A file has to be group-owned by a gid mapped into the
3851 * container, or the container won't be privileged over it.
3852 */
3853 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3854 if (sb.st_uid == hostuid &&
3855 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3856 chown(path, -1, hostgid) < 0) {
3857 ERROR("Failed chgrping %s", path);
3858 return -1;
3859 }
f6d3e3e4 3860
f4f52cb5
CB
3861 // "u:0:rootuid:1"
3862 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3863 if (ret < 0 || ret >= 100) {
3864 ERROR("Error uid printing map string");
3865 return -1;
3866 }
7b50c609 3867
f4f52cb5
CB
3868 // "u:hostuid:hostuid:1"
3869 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3870 if (ret < 0 || ret >= 100) {
3871 ERROR("Error uid printing map string");
3872 return -1;
3873 }
c4d10a05 3874
f4f52cb5
CB
3875 // "g:0:rootgid:1"
3876 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3877 if (ret < 0 || ret >= 100) {
3878 ERROR("Error gid printing map string");
3879 return -1;
3880 }
98e5ba51 3881
f4f52cb5
CB
3882 // "g:pathgid:rootgid+pathgid:1"
3883 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3884 rootgid + (gid_t)sb.st_gid);
3885 if (ret < 0 || ret >= 100) {
3886 ERROR("Error gid printing map string");
3887 return -1;
3888 }
c4d10a05 3889
f4f52cb5
CB
3890 // "g:hostgid:hostgid:1"
3891 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3892 if (ret < 0 || ret >= 100) {
3893 ERROR("Error gid printing map string");
3894 return -1;
3895 }
7b50c609 3896
f4f52cb5
CB
3897 // "0:pathgid" (chown)
3898 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3899 if (ret < 0 || ret >= 100) {
3900 ERROR("Error owner printing format string for chown");
3901 return -1;
3902 }
7b50c609 3903
f4f52cb5
CB
3904 if (hostgid == sb.st_gid)
3905 ret = run_command(cmd_output, sizeof(cmd_output),
3906 chown_mapped_root_exec_wrapper,
3907 (void *)args1);
3908 else
3909 ret = run_command(cmd_output, sizeof(cmd_output),
3910 chown_mapped_root_exec_wrapper,
3911 (void *)args2);
3912 if (ret < 0)
3913 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3914
f4f52cb5 3915 return ret;
f6d3e3e4
SH
3916}
3917
c4d10a05 3918int ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3919{
c4d10a05 3920 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3921 return 0;
c4d10a05 3922
29b10e4f 3923 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
c4d10a05
SH
3924 ERROR("Failed to chown %s", c->console.name);
3925 return -1;
3926 }
3927
f6d3e3e4
SH
3928 return 0;
3929}
3930
943144d9
CB
3931/* NOTE: Must not be called from inside the container namespace! */
3932int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3933{
3934 int mounted;
3935
943144d9 3936 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3937 if (mounted == -1) {
943144d9 3938 SYSERROR("failed to mount /proc in the container");
01958b1f 3939 /* continue only if there is no rootfs */
943144d9 3940 if (conf->rootfs.path)
01958b1f 3941 return -1;
5112cd70 3942 } else if (mounted == 1) {
943144d9 3943 conf->tmp_umount_proc = 1;
5112cd70 3944 }
943144d9 3945
5112cd70
SH
3946 return 0;
3947}
3948
3949void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3950{
3951 if (lxc_conf->tmp_umount_proc == 1) {
3952 umount("/proc");
3953 lxc_conf->tmp_umount_proc = 0;
3954 }
3955}
3956
/* Walk /proc/self/mountinfo and remount with MS_SLAVE every entry whose
 * options field contains "shared".
 *
 * Failures are logged and otherwise ignored; container startup continues.
 */
void remount_all_slave(void)
{
	FILE *mountinfo;
	char *buf = NULL;
	size_t cap = 0;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &cap, mountinfo) != -1) {
		char *mnt, *flags;

		/* get_field(line, 4) is used as the mount target. */
		mnt = get_field(buf, 4);
		if (!mnt)
			continue;

		/* The options are looked up relative to the target field. */
		flags = get_field(mnt, 2);
		if (!flags)
			continue;

		null_endofword(flags);
		if (strstr(flags, "shared") == NULL)
			continue;

		null_endofword(mnt);
		if (mount(NULL, mnt, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", mnt);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3990
2322903b
SH
3991void lxc_execute_bind_init(struct lxc_conf *conf)
3992{
3993 int ret;
9d9c111c
SH
3994 char path[PATH_MAX], destpath[PATH_MAX], *p;
3995
3996 /* If init exists in the container, don't bind mount a static one */
3997 p = choose_init(conf->rootfs.mount);
3998 if (p) {
3999 free(p);
4000 return;
4001 }
2322903b
SH
4002
4003 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4004 if (ret < 0 || ret >= PATH_MAX) {
4005 WARN("Path name too long searching for lxc.init.static");
4006 return;
4007 }
4008
4009 if (!file_exists(path)) {
4010 INFO("%s does not exist on host", path);
4011 return;
4012 }
4013
4014 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4015 if (ret < 0 || ret >= PATH_MAX) {
4016 WARN("Path name too long for container's lxc.init.static");
4017 return;
4018 }
4019
4020 if (!file_exists(destpath)) {
4021 FILE * pathfile = fopen(destpath, "wb");
4022 if (!pathfile) {
4023 SYSERROR("Failed to create mount target '%s'", destpath);
4024 return;
4025 }
4026 fclose(pathfile);
4027 }
4028
592fd47a 4029 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
4030 if (ret < 0)
4031 SYSERROR("Failed to bind lxc.init.static into container");
4032 INFO("lxc.init.static bound into container at %s", path);
4033}
4034
35120d9c
SH
4035/*
4036 * This does the work of remounting / if it is shared, calling the
4037 * container pre-mount hooks, and mounting the rootfs.
4038 */
4039int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 4040{
35120d9c
SH
4041 if (conf->rootfs_setup) {
4042 /*
4043 * rootfs was set up in another namespace. bind-mount it
4044 * to give us a mount in our own ns so we can pivot_root to it
4045 */
4046 const char *path = conf->rootfs.mount;
4047 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4048 ERROR("Failed to bind-mount container / onto itself");
145832ba 4049 return -1;
35120d9c 4050 }
145832ba 4051 return 0;
35120d9c 4052 }
d4ef7c50 4053
e995d7a2
SH
4054 remount_all_slave();
4055
35120d9c
SH
4056 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4057 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4058 return -1;
4059 }
4060
4061 if (setup_rootfs(conf)) {
4062 ERROR("failed to setup rootfs for '%s'", name);
4063 return -1;
4064 }
4065
4066 conf->rootfs_setup = true;
4067 return 0;
4068}
4069
1c1c7051
SH
4070static bool verify_start_hooks(struct lxc_conf *conf)
4071{
4072 struct lxc_list *it;
4073 char path[MAXPATHLEN];
4074 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4075 char *hookname = it->elem;
4076 struct stat st;
4077 int ret;
4078
4079 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4080 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4081 if (ret < 0 || ret >= MAXPATHLEN)
4082 return false;
4083 ret = stat(path, &st);
4084 if (ret) {
7b6753e7 4085 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4086 hookname);
4087 return false;
4088 }
6a0c909a 4089 return true;
1c1c7051
SH
4090 }
4091
4092 return true;
4093}
4094
e8bd4e43
SH
/*
 * Pass the file descriptor @fd over the unix socket @sock without any
 * accompanying payload. Returns 0 on success, -1 (after logging) on failure.
 */
static int send_fd(int sock, int fd)
{
	if (lxc_abstract_unix_send_fd(sock, fd, NULL, 0) < 0) {
		SYSERROR("Error sending tty fd to parent");
		return -1;
	}

	return 0;
}
4107
4108static int send_ttys_to_parent(struct lxc_handler *handler)
4109{
4110 struct lxc_conf *conf = handler->conf;
4111 const struct lxc_tty_info *tty_info = &conf->tty_info;
4112 int i;
4113 int sock = handler->ttysock[0];
4114
4115 for (i = 0; i < tty_info->nbtty; i++) {
4116 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
4117 if (send_fd(sock, pty_info->slave) < 0)
4118 goto bad;
4119 close(pty_info->slave);
4120 pty_info->slave = -1;
4121 if (send_fd(sock, pty_info->master) < 0)
4122 goto bad;
4123 close(pty_info->master);
4124 pty_info->master = -1;
4125 }
4126
4127 close(handler->ttysock[0]);
4128 close(handler->ttysock[1]);
4129
4130 return 0;
4131
4132bad:
4133 ERROR("Error writing tty fd to parent");
4134 return -1;
4135}
4136
35120d9c
SH
4137int lxc_setup(struct lxc_handler *handler)
4138{
4139 const char *name = handler->name;
4140 struct lxc_conf *lxc_conf = handler->conf;
4141 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4142
4143 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4144 ERROR("Error setting up rootfs mount after spawn");
4145 return -1;
4146 }
4147
6c544cb3
MM
4148 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4149 if (setup_utsname(lxc_conf->utsname)) {
4150 ERROR("failed to setup the utsname for '%s'", name);
4151 return -1;
4152 }
0ad19a3f 4153 }
4154
5f4535a3 4155 if (setup_network(&lxc_conf->network)) {
36eb9bde 4156 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4157 return -1;
0ad19a3f 4158 }
4159
bc6928ff 4160 if (lxc_conf->autodev > 0) {
14221cbb 4161 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4162 ERROR("failed to mount /dev in the container");
c6883f38
SH
4163 return -1;
4164 }
4165 }
4166
368bbc02
CS
4167 /* do automatic mounts (mainly /proc and /sys), but exclude
4168 * those that need to wait until other stuff has finished
4169 */
4fb3cba5 4170 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4171 ERROR("failed to setup the automatic mounts for '%s'", name);
4172 return -1;
4173 }
4174
0a2dddd4 4175 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4176 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4177 return -1;
576f946d 4178 }
4179
0a2dddd4 4180 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4181 ERROR("failed to setup the mount entries for '%s'", name);
4182 return -1;
4183 }
4184
7b6753e7 4185 /* Make sure any start hooks are in the container */
1c1c7051
SH
4186 if (!verify_start_hooks(lxc_conf))
4187 return -1;
4188
2322903b
SH
4189 if (lxc_conf->is_execute)
4190 lxc_execute_bind_init(lxc_conf);
4191
368bbc02
CS
4192 /* now mount only cgroup, if wanted;
4193 * before, /sys could not have been mounted
4194 * (is either mounted automatically or via fstab entries)
4195 */
4fb3cba5 4196 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4197 ERROR("failed to setup the automatic mounts for '%s'", name);
4198 return -1;
4199 }
4200
283678ed 4201 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4202 ERROR("failed to run mount hooks for container '%s'.", name);
4203 return -1;
4204 }
4205
bc6928ff 4206 if (lxc_conf->autodev > 0) {
283678ed 4207 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4208 ERROR("failed to run autodev hooks for container '%s'.", name);
4209 return -1;
4210 }
27245ff7 4211 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4212 ERROR("failed to populate /dev in the container");
4213 return -1;
4214 }
4215 }
368bbc02 4216
3d7d929a 4217 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4218 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4219 return -1;
6e590161 4220 }
4221
7e0e1d94
AV
4222 if (lxc_conf->kmsg) {
4223 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4224 ERROR("failed to setup kmsg for '%s'", name);
4225 }
1bd051a6 4226
69aa6655
DE
4227 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4228 ERROR("failed to setup /dev symlinks for '%s'", name);
4229 return -1;
4230 }
4231
5112cd70 4232 /* mount /proc if it's not already there */
943144d9 4233 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4234 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4235 return -1;
e075f5d9 4236 }
e075f5d9 4237
ac778708 4238 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4239 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4240 return -1;
ed502555 4241 }
4242
70761e5e 4243 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4244 ERROR("failed to setup the new pts instance");
95b5ffaf 4245 return -1;
3c26f34e 4246 }
4247
e8bd4e43
SH
4248 if (lxc_create_tty(name, lxc_conf)) {
4249 ERROR("failed to create the ttys");
4250 return -1;
4251 }
4252
4253 if (send_ttys_to_parent(handler) < 0) {
4254 ERROR("failure sending console info to parent");
4255 return -1;
4256 }
4257
9e1045e3 4258 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4259 ERROR("failed to setup the ttys for '%s'", name);
4260 return -1;
4261 }
4262
4263 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4264 SYSERROR("failed to set environment variable for container ptys");
4265
4266
cccc74b5
DL
4267 if (setup_personality(lxc_conf->personality)) {
4268 ERROR("failed to setup personality");
4269 return -1;
4270 }
4271
97a8f74f
SG
4272 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4273 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4274 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4275 return -1;
4276 }
97a8f74f
SG
4277 if (dropcaps_except(&lxc_conf->keepcaps)) {
4278 ERROR("failed to keep requested caps");
4279 return -1;
4280 }
4281 } else if (setup_caps(&lxc_conf->caps)) {
4282 ERROR("failed to drop capabilities");
4283 return -1;
81810dd1
DL
4284 }
4285
cd54d859
DL
4286 NOTICE("'%s' is setup.", name);
4287
0ad19a3f 4288 return 0;
4289}
26ddeedd 4290
283678ed
SH
4291int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4292 const char *lxcpath, char *argv[])
26ddeedd
SH
4293{
4294 int which = -1;
4295 struct lxc_list *it;
4296
4297 if (strcmp(hook, "pre-start") == 0)
4298 which = LXCHOOK_PRESTART;
5ea6163a
SH
4299 else if (strcmp(hook, "pre-mount") == 0)
4300 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4301 else if (strcmp(hook, "mount") == 0)
4302 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4303 else if (strcmp(hook, "autodev") == 0)
4304 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4305 else if (strcmp(hook, "start") == 0)
4306 which = LXCHOOK_START;
52492063
WB
4307 else if (strcmp(hook, "stop") == 0)
4308 which = LXCHOOK_STOP;
26ddeedd
SH
4309 else if (strcmp(hook, "post-stop") == 0)
4310 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4311 else if (strcmp(hook, "clone") == 0)
4312 which = LXCHOOK_CLONE;
37cf711b
SY
4313 else if (strcmp(hook, "destroy") == 0)
4314 which = LXCHOOK_DESTROY;
26ddeedd
SH
4315 else
4316 return -1;
4317 lxc_list_for_each(it, &conf->hooks[which]) {
4318 int ret;
4319 char *hookname = it->elem;
283678ed 4320 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4321 if (ret)
4322 return ret;
4323 }
4324 return 0;
4325}
72d0e1cb 4326
427b3a21 4327static void lxc_remove_nic(struct lxc_list *it)
72d0e1cb
SG
4328{
4329 struct lxc_netdev *netdev = it->elem;
9ebb03ad 4330 struct lxc_list *it2,*next;
72d0e1cb
SG
4331
4332 lxc_list_del(it);
4333
f10fad2f
ME
4334 free(netdev->link);
4335 free(netdev->name);
4336 if (netdev->type == LXC_NET_VETH)
c9bb9a85 4337 free(netdev->priv.veth_attr.pair);
f10fad2f
ME
4338 free(netdev->upscript);
4339 free(netdev->hwaddr);
4340 free(netdev->mtu);
4341 free(netdev->ipv4_gateway);
4342 free(netdev->ipv6_gateway);
9ebb03ad 4343 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4344 lxc_list_del(it2);
4345 free(it2->elem);
4346 free(it2);
4347 }
9ebb03ad 4348 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4349 lxc_list_del(it2);
4350 free(it2->elem);
4351 free(it2);
4352 }
d95db067 4353 free(netdev);
72d0e1cb
SG
4354 free(it);
4355}
4356
4357/* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
12a50cc6 4358int lxc_clear_nic(struct lxc_conf *c, const char *key)
72d0e1cb
SG
4359{
4360 char *p1;
4361 int ret, idx, i;
4362 struct lxc_list *it;
4363 struct lxc_netdev *netdev;
4364
46cd2845 4365 p1 = strchr(key, '.');
72d0e1cb
SG
4366 if (!p1 || *(p1+1) == '\0')
4367 p1 = NULL;
4368
4369 ret = sscanf(key, "%d", &idx);
4370 if (ret != 1) return -1;
4371 if (idx < 0)
4372 return -1;
4373
4374 i = 0;
4375 lxc_list_for_each(it, &c->network) {
4376 if (i == idx)
4377 break;
4378 i++;
4379 }
4380 if (i < idx) // we don't have that many nics defined
4381 return -1;
4382
4383 if (!it || !it->elem)
4384 return -1;
4385
4386 netdev = it->elem;
4387
4388 if (!p1) {
4389 lxc_remove_nic(it);
52d21d40 4390 } else if (strcmp(p1, ".ipv4") == 0) {
9ebb03ad
DE
4391 struct lxc_list *it2,*next;
4392 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4393 lxc_list_del(it2);
4394 free(it2->elem);
4395 free(it2);
4396 }
52d21d40 4397 } else if (strcmp(p1, ".ipv6") == 0) {
9ebb03ad
DE
4398 struct lxc_list *it2,*next;
4399 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4400 lxc_list_del(it2);
4401 free(it2->elem);
4402 free(it2);
4403 }
72d0e1cb
SG
4404 }
4405 else return -1;
4406
4407 return 0;
4408}
4409
4410int lxc_clear_config_network(struct lxc_conf *c)
4411{
9ebb03ad
DE
4412 struct lxc_list *it,*next;
4413 lxc_list_for_each_safe(it, &c->network, next) {
72d0e1cb
SG
4414 lxc_remove_nic(it);
4415 }
4416 return 0;
4417}
4418
4419int lxc_clear_config_caps(struct lxc_conf *c)
4420{
9ebb03ad 4421 struct lxc_list *it,*next;
72d0e1cb 4422
9ebb03ad 4423 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4424 lxc_list_del(it);
4425 free(it->elem);
4426 free(it);
4427 }
4428 return 0;
4429}
4430
74a3920a 4431static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4432 struct lxc_list *it, *next;
4433
4355ab5f 4434 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4435 lxc_list_del(it);
4436 free(it->elem);
4437 free(it);
4438 }
4439 return 0;
4440}
4441
4355ab5f
SH
4442int lxc_clear_idmaps(struct lxc_conf *c)
4443{
4444 return lxc_free_idmap(&c->id_map);
4445}
4446
1fb86a7c
SH
4447int lxc_clear_config_keepcaps(struct lxc_conf *c)
4448{
4449 struct lxc_list *it,*next;
4450
4451 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4452 lxc_list_del(it);
4453 free(it->elem);
4454 free(it);
4455 }
4456 return 0;
4457}
4458
12a50cc6 4459int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4460{
9ebb03ad 4461 struct lxc_list *it,*next;
72d0e1cb 4462 bool all = false;
a6390f01 4463 const char *k = NULL;
72d0e1cb
SG
4464
4465 if (strcmp(key, "lxc.cgroup") == 0)
4466 all = true;
a6390f01
WB
4467 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4468 k = key + sizeof("lxc.cgroup.")-1;
4469 else
4470 return -1;
72d0e1cb 4471
9ebb03ad 4472 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4473 struct lxc_cgroup *cg = it->elem;
4474 if (!all && strcmp(cg->subsystem, k) != 0)
4475 continue;
4476 lxc_list_del(it);
4477 free(cg->subsystem);
4478 free(cg->value);
4479 free(cg);
4480 free(it);
4481 }
4482 return 0;
4483}
4484
c6d09e15
WB
4485int lxc_clear_limits(struct lxc_conf *c, const char *key)
4486{
4487 struct lxc_list *it, *next;
4488 bool all = false;
4489 const char *k = NULL;
4490
4491 if (strcmp(key, "lxc.limit") == 0)
4492 all = true;
4493 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4494 k = key + sizeof("lxc.limit.")-1;
4495 else
4496 return -1;
4497
4498 lxc_list_for_each_safe(it, &c->limits, next) {
4499 struct lxc_limit *lim = it->elem;
4500 if (!all && strcmp(lim->resource, k) != 0)
4501 continue;
4502 lxc_list_del(it);
4503 free(lim->resource);
4504 free(lim);
4505 free(it);
4506 }
4507 return 0;
4508}
4509
ee1e7aa0
SG
4510int lxc_clear_groups(struct lxc_conf *c)
4511{
4512 struct lxc_list *it,*next;
4513
4514 lxc_list_for_each_safe(it, &c->groups, next) {
4515 lxc_list_del(it);
4516 free(it->elem);
4517 free(it);
4518 }
4519 return 0;
4520}
4521
ab799c0b
SG
4522int lxc_clear_environment(struct lxc_conf *c)
4523{
4524 struct lxc_list *it,*next;
4525
4526 lxc_list_for_each_safe(it, &c->environment, next) {
4527 lxc_list_del(it);
4528 free(it->elem);
4529 free(it);
4530 }
4531 return 0;
4532}
4533
4534
72d0e1cb
SG
4535int lxc_clear_mount_entries(struct lxc_conf *c)
4536{
9ebb03ad 4537 struct lxc_list *it,*next;
72d0e1cb 4538
9ebb03ad 4539 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4540 lxc_list_del(it);
4541 free(it->elem);
4542 free(it);
4543 }
4544 return 0;
4545}
4546
b099e9e9
SH
4547int lxc_clear_automounts(struct lxc_conf *c)
4548{
4549 c->auto_mounts = 0;
4550 return 0;
4551}
4552
12a50cc6 4553int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4554{
9ebb03ad 4555 struct lxc_list *it,*next;
17ed13a3 4556 bool all = false, done = false;
a6390f01 4557 const char *k = NULL;
72d0e1cb
SG
4558 int i;
4559
17ed13a3
SH
4560 if (strcmp(key, "lxc.hook") == 0)
4561 all = true;
a6390f01
WB
4562 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4563 k = key + sizeof("lxc.hook.")-1;
4564 else
4565 return -1;
17ed13a3 4566
72d0e1cb 4567 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4568 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4569 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4570 lxc_list_del(it);
4571 free(it->elem);
4572 free(it);
4573 }
4574 done = true;
72d0e1cb
SG
4575 }
4576 }
17ed13a3
SH
4577
4578 if (!done) {
4579 ERROR("Invalid hook key: %s", key);
4580 return -1;
4581 }
72d0e1cb
SG
4582 return 0;
4583}
8eb5694b 4584
74a3920a 4585static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4586{
4587 int i;
4588
0cf45501 4589 if (!conf->saved_nics)
7b35f3d6
SH
4590 return;
4591 for (i=0; i < conf->num_savednics; i++)
4592 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4593 free(conf->saved_nics);
4594}
4595
4184c3e1
SH
4596static inline void lxc_clear_aliens(struct lxc_conf *conf)
4597{
4598 struct lxc_list *it,*next;
4599
4600 lxc_list_for_each_safe(it, &conf->aliens, next) {
4601 lxc_list_del(it);
4602 free(it->elem);
4603 free(it);
4604 }
4605}
4606
c7b15d1e 4607void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4608{
4609 struct lxc_list *it,*next;
4610
4611 lxc_list_for_each_safe(it, &conf->includes, next) {
4612 lxc_list_del(it);
4613 free(it->elem);
4614 free(it);
4615 }
4616}
4617
8eb5694b
SH
4618void lxc_conf_free(struct lxc_conf *conf)
4619{
4620 if (!conf)
4621 return;
858377e4
SH
4622 if (current_config == conf)
4623 current_config = NULL;
f10fad2f
ME
4624 free(conf->console.log_path);
4625 free(conf->console.path);
4626 free(conf->rootfs.mount);
b3b8c97f 4627 free(conf->rootfs.bdev_type);
f10fad2f
ME
4628 free(conf->rootfs.options);
4629 free(conf->rootfs.path);
f10fad2f 4630 free(conf->logfile);
858377e4
SH
4631 if (conf->logfd != -1)
4632 close(conf->logfd);
f10fad2f
ME
4633 free(conf->utsname);
4634 free(conf->ttydir);
4635 free(conf->fstab);
4636 free(conf->rcfile);
4637 free(conf->init_cmd);
6b0d5538 4638 free(conf->unexpanded_config);
393903d1 4639 free(conf->pty_names);
76d0127f 4640 free(conf->syslog);
8eb5694b 4641 lxc_clear_config_network(conf);
f10fad2f
ME
4642 free(conf->lsm_aa_profile);
4643 free(conf->lsm_se_context);
769872f9 4644 lxc_seccomp_free(conf);
8eb5694b 4645 lxc_clear_config_caps(conf);
1fb86a7c 4646 lxc_clear_config_keepcaps(conf);
8eb5694b 4647 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4648 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4649 lxc_clear_mount_entries(conf);
7b35f3d6 4650 lxc_clear_saved_nics(conf);
27c27d73 4651 lxc_clear_idmaps(conf);
ee1e7aa0 4652 lxc_clear_groups(conf);
f979ac15 4653 lxc_clear_includes(conf);
761d81ca 4654 lxc_clear_aliens(conf);
ab799c0b 4655 lxc_clear_environment(conf);
c6d09e15 4656 lxc_clear_limits(conf, "lxc.limit");
8eb5694b
SH
4657 free(conf);
4658}
4355ab5f
SH
4659
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function the child calls once it has been told to proceed. */
	int (*fn)(void *);
	/* Opaque argument passed to fn. */
	void *arg;
	/* Pipe: p[0] is read by the child, p[1] is closed by the child. */
	int p[2];
};
4665
4666static int run_userns_fn(void *data)
4667{
4668 struct userns_fn_data *d = data;
4669 char c;
4355ab5f 4670
f8aa4bf3 4671 /* Close write end of the pipe. */
4355ab5f 4672 close(d->p[1]);
f8aa4bf3
CB
4673
4674 /* Wait for parent to finish establishing a new mapping in the user
4675 * namespace we are executing in.
4676 */
4355ab5f
SH
4677 if (read(d->p[0], &c, 1) != 1)
4678 return -1;
f8aa4bf3
CB
4679
4680 /* Close read end of the pipe. */
4355ab5f 4681 close(d->p[0]);
f8aa4bf3
CB
4682
4683 /* Call function to run. */
4355ab5f
SH
4684 return d->fn(d->arg);
4685}
4686
f8aa4bf3
CB
4687static struct id_map *mapped_hostid_entry(unsigned id, struct lxc_conf *conf,
4688 enum idtype idtype)
4689{
4690 struct lxc_list *it;
4691 struct id_map *map;
4692 struct id_map *retmap = NULL;
4693
4694 lxc_list_for_each(it, &conf->id_map) {
4695 map = it->elem;
4696 if (map->idtype != idtype)
4697 continue;
4698
4699 if (id >= map->hostid && id < map->hostid + map->range) {
4700 retmap = map;
4701 break;
4702 }
4703 }
4704
4705 if (!retmap)
4706 return NULL;
4707
4708 retmap = malloc(sizeof(*retmap));
4709 if (!retmap)
4710 return NULL;
4711
4712 memcpy(retmap, map, sizeof(*retmap));
4713 return retmap;
4714}
4715
4355ab5f 4716/*
f8aa4bf3
CB
4717 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4718 * existing one or establish a new one.
4355ab5f 4719 */
f8aa4bf3
CB
4720static struct lxc_list *idmap_add_id(struct lxc_conf *conf, uid_t uid,
4721 gid_t gid)
4355ab5f 4722{
f8aa4bf3
CB
4723 int hostuid_mapped, hostgid_mapped;
4724 struct id_map *hostuid_idmap, *hostgid_idmap;
4725 struct id_map *entry = NULL;
4726 struct lxc_list *new = NULL;
4727 struct lxc_list *tmp = NULL;
4728
4729 hostuid_idmap = mapped_hostid_entry(uid, conf, ID_TYPE_UID);
4730 hostgid_idmap = mapped_hostid_entry(gid, conf, ID_TYPE_GID);
4355ab5f 4731
f8aa4bf3 4732 /* Allocate new {g,u}id map list. */
3ec1648d 4733 new = malloc(sizeof(*new));
f8aa4bf3
CB
4734 if (!new)
4735 goto on_error;
3ec1648d
SH
4736 lxc_list_init(new);
4737
f8aa4bf3
CB
4738 tmp = malloc(sizeof(*tmp));
4739 if (!tmp)
4740 goto on_error;
4741 entry = hostuid_idmap;
4742 if (!hostuid_idmap) {
8b227008
TS
4743 hostuid_mapped = find_unmapped_nsuid(conf, ID_TYPE_UID);
4744 if (hostuid_mapped < 0)
f8aa4bf3
CB
4745 goto on_error;
4746
4355ab5f 4747 entry = malloc(sizeof(*entry));
f8aa4bf3
CB
4748 if (!entry)
4749 goto on_error;
4750
3ec1648d 4751 tmp->elem = entry;
4355ab5f 4752 entry->idtype = ID_TYPE_UID;
8b227008 4753 entry->nsid = hostuid_mapped;
f8aa4bf3 4754 entry->hostid = (unsigned long)uid;
8b227008 4755 entry->range = 1;
f8aa4bf3
CB
4756 DEBUG("adding uid mapping: nsid %lu hostid %lu range %lu",
4757 entry->nsid, entry->hostid, entry->range);
4758 }
4759 lxc_list_add_tail(new, tmp);
4760 entry = NULL;
4761 tmp = NULL;
4762
4763 tmp = malloc(sizeof(*tmp));
4764 if (!tmp)
4765 goto on_error;
4766 entry = hostgid_idmap;
4767 if (!hostgid_idmap) {
8b227008
TS
4768 hostgid_mapped = find_unmapped_nsuid(conf, ID_TYPE_GID);
4769 if (hostgid_mapped < 0)
f8aa4bf3
CB
4770 goto on_error;
4771
8b227008 4772 entry = malloc(sizeof(*entry));
f8aa4bf3
CB
4773 if (!entry)
4774 goto on_error;
4775
8b227008
TS
4776 tmp->elem = entry;
4777 entry->idtype = ID_TYPE_GID;
4778 entry->nsid = hostgid_mapped;
f8aa4bf3 4779 entry->hostid = (unsigned long)gid;
4355ab5f 4780 entry->range = 1;
f8aa4bf3
CB
4781 DEBUG("adding gid mapping: nsid %lu hostid %lu range %lu",
4782 entry->nsid, entry->hostid, entry->range);
4355ab5f 4783 }
f8aa4bf3 4784 lxc_list_add_tail(new, tmp);
4355ab5f
SH
4785
4786 return new;
4787
f8aa4bf3
CB
4788on_error:
4789 ERROR("failed to allocate memory for new id map");
908fde6a
SH
4790 if (new)
4791 lxc_free_idmap(new);
c30ac545 4792 free(new);
f8aa4bf3
CB
4793 free(tmp);
4794 if (entry)
4795 free(entry);
4355ab5f
SH
4796 return NULL;
4797}
4798
f8aa4bf3
CB
4799/* Run a function in a new user namespace.
4800 * The caller's euid/egid will be mapped if it is not already.
4801 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4802 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4803 * This means we require only to establish a mapping from:
4804 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4805 * - the container root -> some sub{g,u}id
4806 * The former we add, if the user did not specifiy a mapping. The latter we
4807 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4808 * there to start the container in the first place.
4355ab5f
SH
4809 */
4810int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data)
4811{
f8aa4bf3
CB
4812 pid_t pid;
4813 uid_t euid, egid;
4355ab5f 4814 struct userns_fn_data d;
4355ab5f 4815 int p[2];
f8aa4bf3
CB
4816 struct lxc_list *it;
4817 struct id_map *map;
4818 char c = '1';
4819 int ret = -1;
4820 struct lxc_list *idmap = NULL, *tmplist = NULL;
4821 struct id_map *container_root_uid = NULL, *container_root_gid = NULL;
4355ab5f 4822
4355ab5f 4823 ret = pipe(p);
4355ab5f
SH
4824 if (ret < 0) {
4825 SYSERROR("opening pipe");
4826 return -1;
4827 }
4828 d.fn = fn;
4829 d.arg = data;
4830 d.p[0] = p[0];
4831 d.p[1] = p[1];
f8aa4bf3
CB
4832
4833 /* Clone child in new user namespace. */
4355ab5f 4834 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4835 if (pid < 0) {
4836 ERROR("failed to clone child process in new user namespace");
4837 goto on_error;
4838 }
4839
4355ab5f 4840 close(p[0]);
4355ab5f
SH
4841 p[0] = -1;
4842
f8aa4bf3
CB
4843 /* Find container root. */
4844 lxc_list_for_each(it, &conf->id_map) {
4845 map = it->elem;
4846
4847 if (map->nsid != 0)
4848 continue;
4849
4850 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4851 container_root_uid = malloc(sizeof(*container_root_uid));
4852 if (!container_root_uid)
4853 goto on_error;
4854 container_root_uid->idtype = map->idtype;
4855 container_root_uid->hostid = map->hostid;
4856 container_root_uid->nsid = 0;
4857 container_root_uid->range = map->range;
4858 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4859 container_root_gid = malloc(sizeof(*container_root_gid));
4860 if (!container_root_gid)
4861 goto on_error;
4862 container_root_gid->idtype = map->idtype;
4863 container_root_gid->hostid = map->hostid;
4864 container_root_gid->nsid = 0;
4865 container_root_gid->range = map->range;
4866 }
4867
4868 /* Found container root. */
4869 if (container_root_uid && container_root_gid)
4870 break;
4871 }
4872
4873 /* This is actually checked earlier but it can't hurt. */
4874 if (!container_root_uid || !container_root_gid) {
4875 ERROR("no mapping for container root found");
4876 goto on_error;
4877 }
4878
4879 /* Check whether the {g,u}id of the user has a mapping. */
4880 euid = geteuid();
4881 egid = getegid();
4882 idmap = idmap_add_id(conf, euid, egid);
4883 if (!idmap) {
4884 ERROR("failed to prepare id mapping for uid %d and gid %d",
4885 euid, egid);
4886 goto on_error;
4887 }
4888
4889 /* Add container root to the map. */
4890 tmplist = malloc(sizeof(*tmplist));
4891 if (!tmplist)
4892 goto on_error;
4893 lxc_list_add_elem(tmplist, container_root_uid);
4894 lxc_list_add_tail(idmap, tmplist);
4895 /* idmap will now keep track of that memory. */
4896 container_root_uid = NULL;
4897
4898 tmplist = malloc(sizeof(*tmplist));
4899 if (!tmplist)
4900 goto on_error;
4901 lxc_list_add_elem(tmplist, container_root_gid);
4902 lxc_list_add_tail(idmap, tmplist);
4903 /* idmap will now keep track of that memory. */
4904 container_root_gid = NULL;
4905
77803ee7
CB
4906 if (lxc_log_get_level() == LXC_LOG_PRIORITY_TRACE ||
4907 conf->loglevel == LXC_LOG_PRIORITY_TRACE) {
f8aa4bf3
CB
4908 lxc_list_for_each(it, idmap) {
4909 map = it->elem;
4910 TRACE("establishing %cid mapping for \"%d\" in new "
4911 "user namespace: nsuid %lu - hostid %lu - range "
4912 "%lu",
4913 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4914 map->nsid, map->hostid, map->range);
4915 }
4355ab5f
SH
4916 }
4917
f8aa4bf3 4918 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4919 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4920 if (ret < 0) {
4921 ERROR("error setting up {g,u}id mappings for child process "
4922 "\"%d\"",
4923 pid);
4924 goto on_error;
4355ab5f
SH
4925 }
4926
f8aa4bf3 4927 /* Tell child to proceed. */
4355ab5f 4928 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4929 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4930 goto on_error;
4355ab5f
SH
4931 }
4932
f8aa4bf3 4933 /* Wait for child to finish. */
3139aead
SG
4934 ret = wait_for_pid(pid);
4935
f8aa4bf3
CB
4936on_error:
4937 lxc_free_idmap(idmap);
4938 free(container_root_uid);
4939 free(container_root_gid);
3139aead 4940
4355ab5f
SH
4941 if (p[0] != -1)
4942 close(p[0]);
4943 close(p[1]);
f8aa4bf3
CB
4944
4945 return ret;
4355ab5f 4946}
97e9cfa0 4947
/*
 * Look up the passwd entry of the effective uid and return a heap-allocated
 * copy of its user name, or NULL when the lookup (or the copy) fails.
 * The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4959
/*
 * Look up the group entry of the effective gid and return a heap-allocated
 * copy of its group name, or NULL when the lookup (or the copy) fails.
 * The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4971
a96a8e8c 4972/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4973void suggest_default_idmap(void)
4974{
4975 FILE *f;
4976 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4977 char *line = NULL;
4978 char *uname, *gname;
4979 size_t len = 0;
4980
4981 if (!(uname = getuname()))
4982 return;
4983
4984 if (!(gname = getgname())) {
4985 free(uname);
4986 return;
4987 }
4988
4989 f = fopen(subuidfile, "r");
4990 if (!f) {
4991 ERROR("Your system is not configured with subuids");
4992 free(gname);
4993 free(uname);
4994 return;
4995 }
4996 while (getline(&line, &len, f) != -1) {
b7930180 4997 size_t no_newline = 0;
97e9cfa0
SH
4998 char *p = strchr(line, ':'), *p2;
4999 if (*line == '#')
5000 continue;
5001 if (!p)
5002 continue;
5003 *p = '\0';
5004 p++;
5005 if (strcmp(line, uname))
5006 continue;
5007 p2 = strchr(p, ':');
5008 if (!p2)
5009 continue;
5010 *p2 = '\0';
5011 p2++;
5012 if (!*p2)
5013 continue;
b7930180
CB
5014 no_newline = strcspn(p2, "\n");
5015 p2[no_newline] = '\0';
5016
b7b2fde4
CB
5017 if (lxc_safe_uint(p, &uid) < 0)
5018 WARN("Could not parse UID.");
5019 if (lxc_safe_uint(p2, &urange) < 0)
5020 WARN("Could not parse UID range.");
97e9cfa0
SH
5021 }
5022 fclose(f);
5023
6be7389a 5024 f = fopen(subgidfile, "r");
97e9cfa0
SH
5025 if (!f) {
5026 ERROR("Your system is not configured with subgids");
5027 free(gname);
5028 free(uname);
5029 return;
5030 }
5031 while (getline(&line, &len, f) != -1) {
b7930180 5032 size_t no_newline = 0;
97e9cfa0
SH
5033 char *p = strchr(line, ':'), *p2;
5034 if (*line == '#')
5035 continue;
5036 if (!p)
5037 continue;
5038 *p = '\0';
5039 p++;
5040 if (strcmp(line, uname))
5041 continue;
5042 p2 = strchr(p, ':');
5043 if (!p2)
5044 continue;
5045 *p2 = '\0';
5046 p2++;
5047 if (!*p2)
5048 continue;
b7930180
CB
5049 no_newline = strcspn(p2, "\n");
5050 p2[no_newline] = '\0';
5051
b7b2fde4
CB
5052 if (lxc_safe_uint(p, &gid) < 0)
5053 WARN("Could not parse GID.");
5054 if (lxc_safe_uint(p2, &grange) < 0)
5055 WARN("Could not parse GID range.");
97e9cfa0
SH
5056 }
5057 fclose(f);
5058
f10fad2f 5059 free(line);
97e9cfa0
SH
5060
5061 if (!urange || !grange) {
5062 ERROR("You do not have subuids or subgids allocated");
5063 ERROR("Unprivileged containers require subuids and subgids");
5064 return;
5065 }
5066
5067 ERROR("You must either run as root, or define uid mappings");
5068 ERROR("To pass uid mappings to lxc-create, you could create");
5069 ERROR("~/.config/lxc/default.conf:");
5070 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5071 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5072 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5073
5074 free(gname);
5075 free(uname);
5076}
aaf26830 5077
a7307747
SH
5078static void free_cgroup_settings(struct lxc_list *result)
5079{
5080 struct lxc_list *iterator, *next;
5081
5082 lxc_list_for_each_safe(iterator, result, next) {
5083 lxc_list_del(iterator);
5084 free(iterator);
5085 }
5086 free(result);
5087}
5088
aaf26830
KT
5089/*
5090 * Return the list of cgroup_settings sorted according to the following rules
5091 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5092 */
5093struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5094{
5095 struct lxc_list *result;
5096 struct lxc_list *memsw_limit = NULL;
5097 struct lxc_list *it = NULL;
5098 struct lxc_cgroup *cg = NULL;
5099 struct lxc_list *item = NULL;
5100
5101 result = malloc(sizeof(*result));
fac7c663
KT
5102 if (!result) {
5103 ERROR("failed to allocate memory to sort cgroup settings");
5104 return NULL;
5105 }
aaf26830
KT
5106 lxc_list_init(result);
5107
5108 /*Iterate over the cgroup settings and copy them to the output list*/
5109 lxc_list_for_each(it, cgroup_settings) {
5110 item = malloc(sizeof(*item));
fac7c663
KT
5111 if (!item) {
5112 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 5113 free_cgroup_settings(result);
fac7c663
KT
5114 return NULL;
5115 }
aaf26830
KT
5116 item->elem = it->elem;
5117 cg = it->elem;
5118 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5119 /* Store the memsw_limit location */
5120 memsw_limit = item;
5121 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 5122 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
5123 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5124 item->elem = memsw_limit->elem;
5125 memsw_limit->elem = it->elem;
5126 }
5127 lxc_list_add_tail(result, item);
5128 }
5129
5130 return result;
a7307747 5131}