]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
bdev: record output from mkfs.*
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
d8e48992 82#include "lxcaufs.h"
025ed0f3 83#include "lxclock.h"
8f3e280e
CB
84#include "lxcoverlay.h"
85#include "lxcseccomp.h"
4355ab5f 86#include "namespace.h"
8f3e280e
CB
87#include "network.h"
88#include "parse.h"
89#include "utils.h"
fe4de9a6 90#include "lsm/lsm.h"
d0a36f2c 91
e37dda71 92#if HAVE_LIBCAP
495d2046
SG
93#include <sys/capability.h>
94#endif
95
6ff05e18
SG
96#if HAVE_SYS_PERSONALITY_H
97#include <sys/personality.h>
98#endif
99
edaf8b1b
SG
100#if IS_BIONIC
101#include <../include/lxcmntent.h>
a04f5407
CB
102#ifndef HAVE_PRLIMIT
103#include <../include/prlimit.h>
104#endif
edaf8b1b
SG
105#else
106#include <mntent.h>
107#endif
108
36eb9bde 109lxc_log_define(lxc_conf, lxc);
e5bda9ee 110
e37dda71 111#if HAVE_LIBCAP
b09094da
MN
112#ifndef CAP_SETFCAP
113#define CAP_SETFCAP 31
114#endif
115
116#ifndef CAP_MAC_OVERRIDE
117#define CAP_MAC_OVERRIDE 32
118#endif
119
120#ifndef CAP_MAC_ADMIN
121#define CAP_MAC_ADMIN 33
122#endif
495d2046 123#endif
b09094da
MN
124
125#ifndef PR_CAPBSET_DROP
126#define PR_CAPBSET_DROP 24
127#endif
128
9818cae4
SG
129#ifndef LO_FLAGS_AUTOCLEAR
130#define LO_FLAGS_AUTOCLEAR 4
131#endif
132
bc5b27d6
DK
133#ifndef CAP_SETUID
134#define CAP_SETUID 7
135#endif
136
137#ifndef CAP_SETGID
138#define CAP_SETGID 6
139#endif
140
0769b82a
CS
141/* needed for cgroup automount checks, regardless of whether we
142 * have included linux/capability.h or not */
143#ifndef CAP_SYS_ADMIN
144#define CAP_SYS_ADMIN 21
145#endif
146
2d76d1d7
SG
/*
 * Fallback for C libraries that do not provide pivot_root(): issue the raw
 * system call when its number is known at build time, otherwise fail with
 * errno set to ENOSYS. When the C library has it, only declare it.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* no syscall number available on this platform */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif
161
/*
 * Fallback for C libraries that do not provide sethostname(): issue the raw
 * system call when its number is known at build time, otherwise fail with
 * errno set to ENOSYS.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	/* no syscall number available on this platform */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
174
72f919c4
SG
175/* Define __S_ISTYPE if missing from the C library */
176#ifndef __S_ISTYPE
177#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
178#endif
179
ecec0126
SG
180#ifndef MS_PRIVATE
181#define MS_PRIVATE (1<<18)
182#endif
183
8912711c
CB
184#ifndef MS_LAZYTIME
185#define MS_LAZYTIME (1<<25)
186#endif
187
5ef5c9a3
CB
188/* memfd_create() */
189#ifndef MFD_CLOEXEC
190#define MFD_CLOEXEC 0x0001U
191#endif
192
193#ifndef MFD_ALLOW_SEALING
194#define MFD_ALLOW_SEALING 0x0002U
195#endif
196
#ifndef HAVE_MEMFD_CREATE
/*
 * Syscall numbers for memfd_create() on architectures whose headers do not
 * define __NR_memfd_create yet. Architectures not listed here leave the
 * macro undefined.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/*
 * Fallback for C libraries without a memfd_create() wrapper: call the raw
 * system call, or fail with ENOSYS when no syscall number is known.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
240
/*
 * Names of the container hooks. The array has NUM_LXC_HOOKS slots (the
 * constant is defined outside this file view); presumably the order matches
 * the hook id enumeration - confirm against conf.h before reordering.
 */
char *lxchook_names[NUM_LXC_HOOKS] = {
	"pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 243
/*
 * Signature shared by the per-network-type callbacks stored in the
 * netdev_conf and netdev_deconf tables.
 */
typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 245
998ac676
RT
/* One entry of the mount option keyword table (see mount_opt[]). */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" or "nosuid" */
	int clear;	/* 1 for keywords that negate `flag` (e.g. "rw" for MS_RDONLY), else 0 */
	int flag;	/* MS_* mount flag bits the keyword refers to */
};
251
81810dd1
DL
/* Maps a capability keyword to its CAP_* number (see caps_opt[]). */
struct caps_opt {
	char *name;	/* keyword without the "CAP_" prefix, lower case */
	int value;	/* matching CAP_* constant */
};
256
c6d09e15
WB
/* Maps a resource limit keyword to its RLIMIT_* number (see limit_opt[]). */
struct limit_opt {
	char *name;	/* keyword without the "RLIMIT_" prefix, lower case */
	int value;	/* matching RLIMIT_* constant */
};
261
858377e4
SH
/*
 * The lxc_conf of the container currently being worked on in an API call.
 * This is used in the error calls. The variable is thread-local when
 * HAVE_TLS is defined and a plain global otherwise.
 */
#ifdef HAVE_TLS
__thread struct lxc_conf *current_config;
#else
struct lxc_conf *current_config;
#endif
272
0769b82a
CS
273/* Declare this here, since we don't want to reshuffle the whole file. */
274static int in_caplist(int cap, struct lxc_list *caps);
275
a589434e
JN
/* Forward declarations of the per-type network setup handlers. */
static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);

/*
 * Setup handler for each LXC_NET_* type, indexed by the type constant.
 * Slots without an explicit initializer are NULL.
 */
static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = instantiate_veth,
	[LXC_NET_MACVLAN] = instantiate_macvlan,
	[LXC_NET_VLAN] = instantiate_vlan,
	[LXC_NET_PHYS] = instantiate_phys,
	[LXC_NET_EMPTY] = instantiate_empty,
	[LXC_NET_NONE] = instantiate_none,
};
291
74a2b586
JK
/* Forward declarations of the per-type network teardown handlers. */
static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);

/*
 * Teardown handler for each LXC_NET_* type, indexed by the type constant.
 * It reuses the instantiate_cb function type because both kinds of handler
 * take the same arguments. Slots without an explicit initializer are NULL.
 */
static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = shutdown_veth,
	[LXC_NET_MACVLAN] = shutdown_macvlan,
	[LXC_NET_VLAN] = shutdown_vlan,
	[LXC_NET_PHYS] = shutdown_phys,
	[LXC_NET_EMPTY] = shutdown_empty,
	[LXC_NET_NONE] = shutdown_none,
};
307
998ac676 308static struct mount_opt mount_opt[] = {
470b359b
CB
309 { "async", 1, MS_SYNCHRONOUS },
310 { "atime", 1, MS_NOATIME },
311 { "bind", 0, MS_BIND },
88d413d5 312 { "defaults", 0, 0 },
88d413d5 313 { "dev", 1, MS_NODEV },
470b359b 314 { "diratime", 1, MS_NODIRATIME },
88d413d5 315 { "dirsync", 0, MS_DIRSYNC },
470b359b 316 { "exec", 1, MS_NOEXEC },
8912711c 317 { "lazytime", 0, MS_LAZYTIME },
88d413d5 318 { "mand", 0, MS_MANDLOCK },
88d413d5 319 { "noatime", 0, MS_NOATIME },
470b359b 320 { "nodev", 0, MS_NODEV },
88d413d5 321 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
322 { "noexec", 0, MS_NOEXEC },
323 { "nomand", 1, MS_MANDLOCK },
324 { "norelatime", 1, MS_RELATIME },
325 { "nostrictatime", 1, MS_STRICTATIME },
326 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
327 { "rbind", 0, MS_BIND|MS_REC },
328 { "relatime", 0, MS_RELATIME },
470b359b
CB
329 { "remount", 0, MS_REMOUNT },
330 { "ro", 0, MS_RDONLY },
331 { "rw", 1, MS_RDONLY },
88d413d5 332 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
333 { "suid", 1, MS_NOSUID },
334 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 335 { NULL, 0, 0 },
998ac676
RT
336};
337
e37dda71 338#if HAVE_LIBCAP
81810dd1 339static struct caps_opt caps_opt[] = {
a6afdde9 340 { "chown", CAP_CHOWN },
1e11be34
DL
341 { "dac_override", CAP_DAC_OVERRIDE },
342 { "dac_read_search", CAP_DAC_READ_SEARCH },
343 { "fowner", CAP_FOWNER },
344 { "fsetid", CAP_FSETID },
81810dd1
DL
345 { "kill", CAP_KILL },
346 { "setgid", CAP_SETGID },
347 { "setuid", CAP_SETUID },
348 { "setpcap", CAP_SETPCAP },
349 { "linux_immutable", CAP_LINUX_IMMUTABLE },
350 { "net_bind_service", CAP_NET_BIND_SERVICE },
351 { "net_broadcast", CAP_NET_BROADCAST },
352 { "net_admin", CAP_NET_ADMIN },
353 { "net_raw", CAP_NET_RAW },
354 { "ipc_lock", CAP_IPC_LOCK },
355 { "ipc_owner", CAP_IPC_OWNER },
356 { "sys_module", CAP_SYS_MODULE },
357 { "sys_rawio", CAP_SYS_RAWIO },
358 { "sys_chroot", CAP_SYS_CHROOT },
359 { "sys_ptrace", CAP_SYS_PTRACE },
360 { "sys_pacct", CAP_SYS_PACCT },
361 { "sys_admin", CAP_SYS_ADMIN },
362 { "sys_boot", CAP_SYS_BOOT },
363 { "sys_nice", CAP_SYS_NICE },
364 { "sys_resource", CAP_SYS_RESOURCE },
365 { "sys_time", CAP_SYS_TIME },
366 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
367 { "mknod", CAP_MKNOD },
368 { "lease", CAP_LEASE },
57b837e2
CB
369#ifdef CAP_AUDIT_READ
370 { "audit_read", CAP_AUDIT_READ },
371#endif
9527e566 372#ifdef CAP_AUDIT_WRITE
81810dd1 373 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
374#endif
375#ifdef CAP_AUDIT_CONTROL
81810dd1 376 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 377#endif
81810dd1
DL
378 { "setfcap", CAP_SETFCAP },
379 { "mac_override", CAP_MAC_OVERRIDE },
380 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
381#ifdef CAP_SYSLOG
382 { "syslog", CAP_SYSLOG },
383#endif
384#ifdef CAP_WAKE_ALARM
385 { "wake_alarm", CAP_WAKE_ALARM },
386#endif
2b54359b
CB
387#ifdef CAP_BLOCK_SUSPEND
388 { "block_suspend", CAP_BLOCK_SUSPEND },
389#endif
81810dd1 390};
495d2046
SG
391#else
392static struct caps_opt caps_opt[] = {};
393#endif
81810dd1 394
c6d09e15
WB
395static struct limit_opt limit_opt[] = {
396#ifdef RLIMIT_AS
397 { "as", RLIMIT_AS },
398#endif
399#ifdef RLIMIT_CORE
400 { "core", RLIMIT_CORE },
401#endif
402#ifdef RLIMIT_CPU
403 { "cpu", RLIMIT_CPU },
404#endif
405#ifdef RLIMIT_DATA
406 { "data", RLIMIT_DATA },
407#endif
408#ifdef RLIMIT_FSIZE
409 { "fsize", RLIMIT_FSIZE },
410#endif
411#ifdef RLIMIT_LOCKS
412 { "locks", RLIMIT_LOCKS },
413#endif
414#ifdef RLIMIT_MEMLOCK
415 { "memlock", RLIMIT_MEMLOCK },
416#endif
417#ifdef RLIMIT_MSGQUEUE
418 { "msgqueue", RLIMIT_MSGQUEUE },
419#endif
420#ifdef RLIMIT_NICE
421 { "nice", RLIMIT_NICE },
422#endif
423#ifdef RLIMIT_NOFILE
424 { "nofile", RLIMIT_NOFILE },
425#endif
426#ifdef RLIMIT_NPROC
427 { "nproc", RLIMIT_NPROC },
428#endif
429#ifdef RLIMIT_RSS
430 { "rss", RLIMIT_RSS },
431#endif
432#ifdef RLIMIT_RTPRIO
433 { "rtprio", RLIMIT_RTPRIO },
434#endif
435#ifdef RLIMIT_RTTIME
436 { "rttime", RLIMIT_RTTIME },
437#endif
438#ifdef RLIMIT_SIGPENDING
439 { "sigpending", RLIMIT_SIGPENDING },
440#endif
441#ifdef RLIMIT_STACK
442 { "stack", RLIMIT_STACK },
443#endif
444};
445
91c3830e
SH
446static int run_buffer(char *buffer)
447{
ebec9176 448 struct lxc_popen_FILE *f;
91c3830e 449 char *output;
8e7da691 450 int ret;
91c3830e 451
ebec9176 452 f = lxc_popen(buffer);
91c3830e 453 if (!f) {
062b72c6 454 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
455 return -1;
456 }
457
458 output = malloc(LXC_LOG_BUFFER_SIZE);
459 if (!output) {
062b72c6 460 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 461 lxc_pclose(f);
91c3830e
SH
462 return -1;
463 }
464
062b72c6
CB
465 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
466 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
467
468 free(output);
469
ebec9176 470 ret = lxc_pclose(f);
8e7da691 471 if (ret == -1) {
062b72c6 472 SYSERROR("Script exited with error.");
91c3830e 473 return -1;
8e7da691 474 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 475 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
476 return -1;
477 } else if (WIFSIGNALED(ret)) {
062b72c6 478 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 479 return -1;
91c3830e
SH
480 }
481
482 return 0;
483}
484
/*
 * Run a hook script with an argv-style list of extra arguments.
 *
 * Builds the command line "<script> <name> <section> <hook> [args...]" and
 * executes it through run_buffer().
 *
 * @name    container name, passed to the script as first argument
 * @section config section, passed as second argument
 * @script  script to execute
 * @hook    hook name, passed as third argument
 * @lxcpath not used by this function; kept for the callers' sake
 * @argsin  NULL, or a NULL-terminated array of additional arguments
 *
 * Returns the result of run_buffer() (0 or -1), or -1 if the command line
 * could not be built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret = -1;
	int used, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* one leading blank per extra argument */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two blanks between the fixed arguments plus the terminating NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	/*
	 * The length is driven by configuration strings, so allocate on the
	 * heap: alloca() cannot report failure and an oversized request would
	 * overrun the stack.
	 */
	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - used;
		int rc;

		rc = snprintf(buffer + used, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			goto out;
		}
		used += rc;
	}

	ret = run_buffer(buffer);

out:
	free(buffer);
	return ret;
}
535
062b72c6
CB
/*
 * Run a hook script with a variadic list of extra arguments.
 *
 * Builds the command line "<script> <name> <section> [args...]" and executes
 * it through run_buffer().
 *
 * @name    container name, passed to the script as first argument
 * @section config section, passed as second argument
 * @script  script to execute
 * @...     additional "char *" arguments; the list MUST end with a NULL
 *          pointer
 *
 * Returns the result of run_buffer() (0 or -1), or -1 if the command line
 * could not be built.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret = -1;
	int used;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* first pass over the arguments: compute the required length */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two blanks between the fixed arguments plus the terminating NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	/*
	 * The length is driven by configuration strings, so allocate on the
	 * heap: alloca() cannot report failure and an oversized request would
	 * overrun the stack.
	 */
	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (used < 0 || (size_t)used >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	/* second pass: append every argument */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - used;
		int rc;

		rc = snprintf(buffer + used, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* every va_start() needs its va_end(), also on error */
			va_end(ap);
			goto out;
		}
		used += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);

out:
	free(buffer);
	return ret;
}
587
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs read-only on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned.
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	/* a path that cannot be resolved is treated as "nothing to pin" */
	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	/* only directory-backed root filesystems get a pin file */
	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* snprintf() reports an output error as a negative value */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;

	/* the open fd keeps the file alive; the name is no longer needed */
	(void)unlink(absrootfspin);
	return fd;
}
630
e2a7e8dc
SH
/*
 * If we are asking to remount something, make sure the restrictive flags
 * already in effect on it (nosuid, nodev, read-only, noexec) are kept.
 *
 * The flags are looked up with statvfs() on @s, or on @d when @s is NULL.
 * @flags is returned unchanged when it does not contain MS_REMOUNT, when
 * both paths are NULL, when statvfs() fails, or when the build lacks
 * HAVE_STATVFS.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifdef HAVE_STATVFS
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *target = s ? s : d;
	struct statvfs vfs;

	if ((flags & MS_REMOUNT) == 0 || target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	/* carry over whichever of the sticky bits the filesystem reports */
	return flags | (vfs.f_flag & sticky);
#else
	return flags;
#endif
}
667
4fb3cba5 668static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 669{
368bbc02 670 int r;
80e80c40 671 int i;
b06b8511
CS
672 static struct {
673 int match_mask;
674 int match_flag;
675 const char *source;
676 const char *destination;
677 const char *fstype;
678 unsigned long flags;
679 const char *options;
680 } default_mounts[] = {
681 /* Read-only bind-mounting... In older kernels, doing that required
682 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
683 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
684 * kernel 2.6.26 onwards. However, this apparently does not work on
685 * kernel 3.8. Unfortunately, on that very same kernel, doing the
686 * same trick as above doesn't seem to work either, there one needs
687 * to ALSO specify MS_BIND for the remount, otherwise the entire
688 * fs is remounted read-only or the mount fails because it's busy...
689 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
690 * 2.6.32...
368bbc02 691 */
f24a52d5 692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
693 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 697 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
698 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
699 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
700 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
705 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
706 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
707 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
708 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
709 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 710 };
368bbc02 711
b06b8511
CS
712 for (i = 0; default_mounts[i].match_mask; i++) {
713 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
714 char *source = NULL;
715 char *destination = NULL;
716 int saved_errno;
e2a7e8dc 717 unsigned long mflags;
b06b8511
CS
718
719 if (default_mounts[i].source) {
720 /* will act like strdup if %r is not present */
8ede5f4c 721 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
722 if (!source) {
723 SYSERROR("memory allocation error");
724 return -1;
725 }
726 }
cc4fd506
SH
727 if (!default_mounts[i].destination) {
728 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 729 free(source);
cc4fd506
SH
730 return -1;
731 }
732 /* will act like strdup if %r is not present */
733 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
734 if (!destination) {
735 saved_errno = errno;
736 SYSERROR("memory allocation error");
737 free(source);
738 errno = saved_errno;
739 return -1;
b06b8511 740 }
e2a7e8dc
SH
741 mflags = add_required_remount_flags(source, destination,
742 default_mounts[i].flags);
592fd47a 743 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 744 saved_errno = errno;
b88ff9a0
SG
745 if (r < 0 && errno == ENOENT) {
746 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
747 r = 0;
748 }
749 else if (r < 0)
e2a7e8dc 750 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 751
b06b8511
CS
752 free(source);
753 free(destination);
754 if (r < 0) {
b06b8511
CS
755 errno = saved_errno;
756 return -1;
757 }
368bbc02 758 }
368bbc02
CS
759 }
760
b06b8511 761 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
762 int cg_flags;
763
764 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
765 /* If the type of cgroup mount was not specified, it depends on the
766 * container's capabilities as to what makes sense: if we have
767 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
768 * anyway, so we may as well default to read-write; then the admin
769 * will not be given a false sense of security. (And if they really
770 * want mixed r/o r/w, then they can explicitly specify :mixed.)
771 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
772 * :mixed, because then the container can't remount it read-write. */
773 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
774 int has_sys_admin = 0;
b0ee5983
CB
775
776 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 777 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 778 else
0769b82a 779 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
780
781 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 782 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 783 else
0769b82a 784 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
785 }
786
8ede5f4c 787 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 788 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 789 return -1;
368bbc02
CS
790 }
791 }
792
368bbc02 793 return 0;
368bbc02
CS
794}
795
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to do and counts as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
810
69aa6655
DE
/* A symlink to create under /dev: "name" will point at "oldpath". */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* The standard /dev entries that are links into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
822
823static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
824{
825 char path[MAXPATHLEN];
826 int ret,i;
09227be2 827 struct stat s;
69aa6655
DE
828
829
830 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
831 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 832 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
833 if (ret < 0 || ret >= MAXPATHLEN)
834 return -1;
09227be2
MW
835
836 /*
837 * Stat the path first. If we don't get an error
838 * accept it as is and don't try to create it
839 */
840 if (!stat(path, &s)) {
841 continue;
842 }
843
69aa6655 844 ret = symlink(d->oldpath, path);
09227be2 845
69aa6655 846 if (ret && errno != EEXIST) {
09227be2
MW
847 if ( errno == EROFS ) {
848 WARN("Warning: Read Only file system while creating %s", path);
849 } else {
850 SYSERROR("Error creating %s", path);
851 return -1;
852 }
69aa6655
DE
853 }
854 }
855 return 0;
856}
857
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * The first call (with *pp == NULL) allocates "container_ttys=<name>"; later
 * calls grow the string and append " <name>". On allocation failure false is
 * returned and *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (*pp == NULL) {
		char *fresh = malloc(prefix_len + name_len + 1);

		if (fresh == NULL)
			return false;

		memcpy(fresh, prefix, prefix_len);
		/* copies the terminating NUL of name as well */
		memcpy(fresh + prefix_len, name, name_len + 1);
		*pp = fresh;
		return true;
	}

	cur_len = strlen(*pp);
	/* room for the separating blank and the terminating NUL */
	grown = realloc(*pp, cur_len + name_len + 2);
	if (grown == NULL)
		return false;

	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);
	*pp = grown;
	return true;
}
880
9e1045e3 881static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 882{
9e1045e3 883 int i, ret;
393903d1
SH
884 const struct lxc_tty_info *tty_info = &conf->tty_info;
885 char *ttydir = conf->ttydir;
7c6ef2a2 886 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 887
e8bd4e43 888 if (!conf->rootfs.path)
bc9bd0e3
DL
889 return 0;
890
b0a33c1e 891 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 892 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
893
e8bd4e43 894 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 895 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
896 ERROR("pathname too long for ttys");
897 return -1;
898 }
9e1045e3 899
7c6ef2a2
SH
900 if (ttydir) {
901 /* create dev/lxc/tty%d" */
9e1045e3
CB
902 ret = snprintf(lxcpath, sizeof(lxcpath),
903 "/dev/%s/tty%d", ttydir, i + 1);
904 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
905 ERROR("pathname too long for ttys");
906 return -1;
907 }
9e1045e3 908
7c6ef2a2 909 ret = creat(lxcpath, 0660);
9e1045e3
CB
910 if (ret < 0 && errno != EEXIST) {
911 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
912 return -1;
913 }
4d44e274
SH
914 if (ret >= 0)
915 close(ret);
9e1045e3 916
7c6ef2a2 917 ret = unlink(path);
9e1045e3
CB
918 if (ret < 0 && errno != ENOENT) {
919 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
920 return -1;
921 }
b0a33c1e 922
9e1045e3
CB
923 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
924 if (ret < 0) {
925 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
926 pty_info->name, path);
927 continue;
928 }
9e1045e3
CB
929 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
930 path);
13954cce 931
9e1045e3
CB
932 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
933 ttydir, i + 1);
934 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
935 ERROR("tty pathname too long");
936 return -1;
937 }
9e1045e3 938
7c6ef2a2 939 ret = symlink(lxcpath, path);
9e1045e3
CB
940 if (ret < 0) {
941 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
942 path, lxcpath);
7c6ef2a2
SH
943 return -1;
944 }
945 } else {
9e1045e3
CB
946 /* If we populated /dev, then we need to create
947 * /dev/ttyN
948 */
949 ret = access(path, F_OK);
950 if (ret < 0) {
c6883f38 951 ret = creat(path, 0660);
9e1045e3
CB
952 if (ret < 0) {
953 SYSERROR("failed to create \"%s\"", path);
c6883f38 954 /* this isn't fatal, continue */
025ed0f3 955 } else {
c6883f38 956 close(ret);
025ed0f3 957 }
c6883f38 958 }
9e1045e3
CB
959
960 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
961 if (ret < 0) {
e8bd4e43 962 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
963 continue;
964 }
9e1045e3
CB
965
966 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
967 path);
393903d1 968 }
9e1045e3 969
e8bd4e43 970 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
971 ERROR("Error setting up container_ttys string");
972 return -1;
b0a33c1e 973 }
974 }
975
9e1045e3 976 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 977 return 0;
978}
979
59bb8698 980static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 981{
2d489f9e 982 int oldroot = -1, newroot = -1;
bf601689 983
2d489f9e
SH
984 oldroot = open("/", O_DIRECTORY | O_RDONLY);
985 if (oldroot < 0) {
986 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
987 return -1;
988 }
2d489f9e
SH
989 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
990 if (newroot < 0) {
991 SYSERROR("Error opening new-/ for fchdir");
992 goto fail;
c08556c6 993 }
bf601689 994
cc6f6dd7 995 /* change into new root fs */
2d489f9e 996 if (fchdir(newroot)) {
cc6f6dd7 997 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 998 goto fail;
cc6f6dd7
DL
999 }
1000
cc6f6dd7 1001 /* pivot_root into our new root fs */
2d489f9e 1002 if (pivot_root(".", ".")) {
cc6f6dd7 1003 SYSERROR("pivot_root syscall failed");
2d489f9e 1004 goto fail;
bf601689 1005 }
cc6f6dd7 1006
2d489f9e
SH
1007 /*
1008 * at this point the old-root is mounted on top of our new-root
1009 * To unmounted it we must not be chdir'd into it, so escape back
1010 * to old-root
1011 */
1012 if (fchdir(oldroot) < 0) {
1013 SYSERROR("Error entering oldroot");
1014 goto fail;
1015 }
7981ea46 1016 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1017 SYSERROR("Error detaching old root");
1018 goto fail;
cc6f6dd7
DL
1019 }
1020
2d489f9e
SH
1021 if (fchdir(newroot) < 0) {
1022 SYSERROR("Error re-entering newroot");
1023 goto fail;
1024 }
cc6f6dd7 1025
2d489f9e
SH
1026 close(oldroot);
1027 close(newroot);
bf601689 1028
2d489f9e 1029 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1030
bf601689 1031 return 0;
2d489f9e
SH
1032
1033fail:
1034 if (oldroot != -1)
1035 close(oldroot);
1036 if (newroot != -1)
1037 close(newroot);
1038 return -1;
bf601689
MH
1039}
1040
bc6928ff 1041/*
87da4ec3
SH
1042 * Just create a path for /dev under $lxcpath/$name and in rootfs
1043 * If we hit an error, log it but don't fail yet.
91c3830e 1044 */
14221cbb 1045static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1046{
1047 int ret;
87da4ec3
SH
1048 size_t clen;
1049 char *path;
91c3830e 1050
14221cbb 1051 INFO("Mounting container /dev");
bc6928ff 1052
14221cbb 1053 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1054 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1055 path = alloca(clen);
bc6928ff 1056
ec50007f 1057 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1058 if (ret < 0 || ret >= clen)
91c3830e 1059 return -1;
bc6928ff 1060
87da4ec3 1061 if (!dir_exists(path)) {
14221cbb 1062 WARN("No /dev in container.");
87da4ec3
SH
1063 WARN("Proceeding without autodev setup");
1064 return 0;
bc6928ff 1065 }
87da4ec3 1066
1ec0e8e3 1067 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1068 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1069 if (ret != 0) {
87da4ec3 1070 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1071 return -1;
91c3830e 1072 }
87da4ec3
SH
1073
1074 INFO("Mounted tmpfs onto %s", path);
1075
ec50007f 1076 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1077 if (ret < 0 || ret >= clen)
91c3830e 1078 return -1;
87da4ec3 1079
bc6928ff
MW
1080 /*
1081 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1082 * If not, then create it and exit if that fails...
1083 */
87da4ec3 1084 if (!dir_exists(path)) {
bc6928ff
MW
1085 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1086 if (ret) {
1087 SYSERROR("Failed to create /dev/pts in container");
1088 return -1;
1089 }
91c3830e
SH
1090 }
1091
14221cbb 1092 INFO("Mounted container /dev");
91c3830e
SH
1093 return 0;
1094}
1095
/* Description of one device node to provide in the container's /dev. */
struct lxc_devs {
	const char *name;	/* file name below the container's /dev */
	mode_t mode;		/* mode handed to mknod(): file type and permission bits */
	int maj;		/* device major number */
	int min;		/* device minor number */
};
1102
74a3920a 1103static const struct lxc_devs lxc_devs[] = {
c6883f38
SH
1104 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1105 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1106 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1107 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1108 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1109 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
c6883f38
SH
1110};
1111
27245ff7 1112static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1113{
1114 int ret;
c6883f38
SH
1115 char path[MAXPATHLEN];
1116 int i;
3a32201c 1117 mode_t cmask;
c6883f38 1118
ec50007f 1119 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1120 if (ret < 0 || ret >= MAXPATHLEN) {
1121 ERROR("Error calculating container /dev location");
c6883f38 1122 return -1;
f7bee6c6 1123 }
91c3830e 1124
0bbf8572
CB
1125 /* ignore, just don't try to fill in */
1126 if (!dir_exists(path))
9cb4d183
SH
1127 return 0;
1128
0bbf8572 1129 INFO("populating container /dev");
3a32201c 1130 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1131 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1132 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1133
ec50007f 1134 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1135 if (ret < 0 || ret >= MAXPATHLEN)
1136 return -1;
0bbf8572 1137
c6883f38 1138 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1139 if (ret < 0) {
9cb4d183
SH
1140 char hostpath[MAXPATHLEN];
1141 FILE *pathfile;
1142
0bbf8572
CB
1143 if (errno == EEXIST) {
1144 DEBUG("\"%s\" device already existed", path);
1145 continue;
1146 }
1147
1148 /* Unprivileged containers cannot create devices, so
1149 * bind mount the device from the host.
1150 */
9cb4d183
SH
1151 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1152 if (ret < 0 || ret >= MAXPATHLEN)
1153 return -1;
1154 pathfile = fopen(path, "wb");
1155 if (!pathfile) {
1156 SYSERROR("Failed to create device mount target '%s'", path);
1157 return -1;
1158 }
1159 fclose(pathfile);
0bbf8572
CB
1160 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1161 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1162 return -1;
1163 }
0bbf8572
CB
1164 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1165 } else {
1166 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1167 }
1168 }
3a32201c 1169 umask(cmask);
c6883f38 1170
0bbf8572 1171 INFO("populated container /dev");
c6883f38
SH
1172 return 0;
1173}
1174
9aa76a17 1175static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1176{
9aa76a17 1177 int ret;
91c3e281
CB
1178 struct bdev *bdev;
1179 const struct lxc_rootfs *rootfs;
cc28d0b0 1180
91c3e281 1181 rootfs = &conf->rootfs;
a0f379bf 1182 if (!rootfs->path) {
91c3e281
CB
1183 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1184 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1185 return -1;
1186 }
c69bd12f 1187 return 0;
a0f379bf 1188 }
0ad19a3f 1189
12297168 1190 if (access(rootfs->mount, F_OK)) {
91c3e281 1191 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1192 rootfs->mount);
b1789442
DL
1193 return -1;
1194 }
1195
91c3e281 1196 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1197 if (!bdev) {
1198 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1199 rootfs->path, rootfs->mount,
1200 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1201 return -1;
9be53773 1202 }
9aa76a17
CB
1203
1204 ret = bdev->ops->mount(bdev);
1205 bdev_put(bdev);
1206 if (ret < 0) {
91c3e281
CB
1207 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1208 rootfs->path, rootfs->mount,
1209 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1210 return -1;
1211 }
0ad19a3f 1212
91c3e281
CB
1213 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1214 rootfs->path, rootfs->mount,
1215 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1216
ac778708
DL
1217 return 0;
1218}
1219
91e93c71
AV
1220int prepare_ramfs_root(char *root)
1221{
eab15c1e 1222 char buf[LXC_LINELEN], *p;
91e93c71
AV
1223 char nroot[PATH_MAX];
1224 FILE *f;
1225 int i;
1226 char *p2;
1227
1228 if (realpath(root, nroot) == NULL)
39c7b795 1229 return -errno;
91e93c71
AV
1230
1231 if (chdir("/") == -1)
39c7b795 1232 return -errno;
91e93c71
AV
1233
1234 /*
1235 * We could use here MS_MOVE, but in userns this mount is
1236 * locked and can't be moved.
1237 */
39c7b795 1238 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1239 SYSERROR("Failed to move %s into /", root);
39c7b795 1240 return -errno;
91e93c71
AV
1241 }
1242
39c7b795 1243 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1244 SYSERROR("Failed to make . rprivate");
39c7b795 1245 return -errno;
91e93c71
AV
1246 }
1247
1248 /*
1249 * The following code cleans up inhereted mounts which are not
1250 * required for CT.
1251 *
1252 * The mountinfo file shows not all mounts, if a few points have been
1253 * unmounted between read operations from the mountinfo. So we need to
1254 * read mountinfo a few times.
1255 *
1256 * This loop can be skipped if a container uses unserns, because all
1257 * inherited mounts are locked and we should live with all this trash.
1258 */
1259 while (1) {
1260 int progress = 0;
1261
1262 f = fopen("./proc/self/mountinfo", "r");
1263 if (!f) {
1264 SYSERROR("Unable to open /proc/self/mountinfo");
1265 return -1;
1266 }
eab15c1e 1267 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1268 for (p = buf, i=0; p && i < 4; i++)
1269 p = strchr(p+1, ' ');
1270 if (!p)
1271 continue;
1272 p2 = strchr(p+1, ' ');
1273 if (!p2)
1274 continue;
1275
1276 *p2 = '\0';
1277 *p = '.';
1278
1279 if (strcmp(p + 1, "/") == 0)
1280 continue;
1281 if (strcmp(p + 1, "/proc") == 0)
1282 continue;
1283
1284 if (umount2(p, MNT_DETACH) == 0)
1285 progress++;
1286 }
1287 fclose(f);
1288 if (!progress)
1289 break;
1290 }
1291
8bea9fae
PR
1292 /* This also can be skipped if a container uses unserns */
1293 umount2("./proc", MNT_DETACH);
91e93c71
AV
1294
1295 /* It is weird, but chdir("..") moves us in a new root */
1296 if (chdir("..") == -1) {
1297 SYSERROR("Unable to change working directory");
1298 return -1;
1299 }
1300
1301 if (chroot(".") == -1) {
1302 SYSERROR("Unable to chroot");
1303 return -1;
1304 }
1305
1306 return 0;
1307}
1308
74a3920a 1309static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1310{
39c7b795
CB
1311 if (!rootfs->path) {
1312 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1313 return 0;
39c7b795 1314 }
ac778708 1315
91e93c71 1316 if (detect_ramfs_rootfs()) {
39c7b795
CB
1317 DEBUG("detected that container is on ramfs");
1318 if (prepare_ramfs_root(rootfs->mount)) {
1319 ERROR("failed to prepare minimal ramfs root");
91e93c71 1320 return -1;
39c7b795
CB
1321 }
1322
1323 DEBUG("prepared ramfs root for container");
1324 return 0;
1325 }
1326
1327 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1328 ERROR("failed to pivot root");
25368b52 1329 return -1;
c69bd12f
DL
1330 }
1331
39c7b795 1332 DEBUG("finished pivot root");
25368b52 1333 return 0;
0ad19a3f 1334}
1335
/*
 * Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to its
 * ptmx node.
 *
 * Nothing is done when @num_pts is 0. Otherwise an already mounted devpts
 * (detected via /dev/pts/ptmx) is unmounted, /dev/pts is created if needed
 * and a new instance is mounted there. /dev/ptmx is then replaced: first by
 * an empty file onto which /dev/pts/ptmx is bind-mounted, and, if that bind
 * mount fails, by a symlink to /dev/pts/ptmx.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* The mountpoint may or may not exist yet. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new devpts instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Whatever is at /dev/ptmx right now has to go. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file is the target for the bind mount below. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* First choice: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: continue with the symlink fallback. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above is in the way of the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1420
cccc74b5
DL
/*
 * Apply the requested execution domain to the calling process.
 *
 * A @persona of -1 means "leave it alone". When the build does not have
 * HAVE_SYS_PERSONALITY_H set, the function does nothing.
 *
 * Returns 0 on success (or when nothing was done), -1 if personality()
 * failed.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1437
3d7d929a
CB
1438static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1439 const struct lxc_console *console)
6e590161 1440{
63376d7d 1441 char path[MAXPATHLEN];
0728ebf4 1442 int ret, fd;
52e35957 1443
8b1b1210
CB
1444 if (console->path && !strcmp(console->path, "none"))
1445 return 0;
1446
7c6ef2a2 1447 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1448 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1449 return -1;
52e35957 1450
8b1b1210
CB
1451 /* When we are asked to setup a console we remove any previous
1452 * /dev/console bind-mounts.
1453 */
a7ba3c7f
CB
1454 if (file_exists(path)) {
1455 ret = lxc_unstack_mountpoint(path, false);
1456 if (ret < 0) {
8b1b1210 1457 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1458 return -ret;
1459 } else {
1460 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1461 }
1462 ret = unlink(path);
1463 if (ret < 0) {
1464 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1465 return -errno;
1466 }
8b1b1210
CB
1467 }
1468
1469 /* For unprivileged containers autodev or automounts will already have
1470 * taken care of creating /dev/console.
1471 */
0728ebf4
TA
1472 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1473 if (fd < 0) {
1474 if (errno != EEXIST) {
1475 SYSERROR("failed to create console");
3d7d929a 1476 return -errno;
0728ebf4
TA
1477 }
1478 } else {
1479 close(fd);
52e35957
DL
1480 }
1481
0728ebf4 1482 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1483 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1484 return -errno;
63376d7d 1485 }
13954cce 1486
3d7d929a 1487 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1488 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1489 return -1;
1490 }
1491
3d7d929a 1492 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1493 return 0;
1494}
1495
3d7d929a
CB
1496static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1497 const struct lxc_console *console,
1498 char *ttydir)
7c6ef2a2 1499{
7c6ef2a2 1500 int ret;
3d7d929a 1501 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1502
1503 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1504 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1505 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1506 return -1;
3d7d929a 1507
7c6ef2a2
SH
1508 ret = mkdir(path, 0755);
1509 if (ret && errno != EEXIST) {
959aee9c 1510 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1511 return -errno;
7c6ef2a2 1512 }
3d7d929a 1513 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1514
3d7d929a
CB
1515 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1516 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1517 return -1;
1518
7c6ef2a2 1519 ret = creat(lxcpath, 0660);
3d7d929a 1520 if (ret == -1 && errno != EEXIST) {
959aee9c 1521 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1522 return -errno;
7c6ef2a2 1523 }
4d44e274
SH
1524 if (ret >= 0)
1525 close(ret);
7c6ef2a2 1526
2a12fefd
CB
1527 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1528 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1529 return -1;
2a12fefd
CB
1530
1531 /* When we are asked to setup a console we remove any previous
1532 * /dev/console bind-mounts.
1533 */
1534 if (console->path && !strcmp(console->path, "none")) {
1535 struct stat st;
1536 ret = stat(path, &st);
1537 if (ret < 0) {
1538 if (errno == ENOENT)
1539 return 0;
1540 SYSERROR("failed stat() \"%s\"", path);
1541 return -errno;
1542 }
1543
1544 /* /dev/console must be character device with major number 5 and
1545 * minor number 1. If not, give benefit of the doubt and assume
1546 * the user has mounted something else right there on purpose.
1547 */
1548 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1549 return 0;
1550
1551 /* In case the user requested a bind-mount for /dev/console and
1552 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1553 * /dev/<ttydir/console.
1554 * Note, we only move the uppermost mount and clear all other
1555 * mounts underneath for safety.
1556 * If it is a character device created via mknod() we simply
1557 * rename it.
2a12fefd
CB
1558 */
1559 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1560 if (ret < 0) {
1561 if (errno != EINVAL) {
1562 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1563 return -errno;
1564 }
1565 /* path was not a mountpoint */
1566 ret = rename(path, lxcpath);
1567 if (ret < 0) {
1568 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1569 return -errno;
1570 }
1571 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1572 } else {
1573 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1574 }
a7ba3c7f
CB
1575
1576 /* Clear all remaining bind-mounts. */
1577 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1578 if (ret < 0) {
a7ba3c7f
CB
1579 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1580 return -ret;
1581 } else {
1582 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1583 }
1584 } else {
1585 if (file_exists(path)) {
1586 ret = lxc_unstack_mountpoint(path, false);
1587 if (ret < 0) {
2a12fefd 1588 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1589 return -ret;
1590 } else {
1591 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1592 }
2a12fefd
CB
1593 }
1594
1595 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1596 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1597 return -1;
1598 }
1599 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1600 }
1601
2a12fefd 1602 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1603 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1604 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1605 return -1;
3d7d929a 1606
2a12fefd
CB
1607 ret = unlink(path);
1608 if (ret && errno != ENOENT) {
1609 SYSERROR("error unlinking %s", path);
1610 return -errno;
1611 }
1612
7c6ef2a2 1613 ret = symlink(lxcpath, path);
3d7d929a
CB
1614 if (ret < 0) {
1615 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1616 return -1;
1617 }
1618
3d7d929a 1619 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1620 return 0;
1621}
1622
3d7d929a
CB
1623static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1624 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1625{
3d7d929a
CB
1626 /* We don't have a rootfs, /dev/console will be shared. */
1627 if (!rootfs->path) {
1628 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1629 return 0;
3d7d929a
CB
1630 }
1631
7c6ef2a2 1632 if (!ttydir)
3d7d929a 1633 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1634
3d7d929a 1635 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1636}
1637
1bd051a6
SH
1638static int setup_kmsg(const struct lxc_rootfs *rootfs,
1639 const struct lxc_console *console)
1640{
1641 char kpath[MAXPATHLEN];
1642 int ret;
1643
222fea5a
DE
1644 if (!rootfs->path)
1645 return 0;
1bd051a6
SH
1646 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1647 if (ret < 0 || ret >= sizeof(kpath))
1648 return -1;
1649
1650 ret = unlink(kpath);
1651 if (ret && errno != ENOENT) {
959aee9c 1652 SYSERROR("error unlinking %s", kpath);
1bd051a6
SH
1653 return -1;
1654 }
1655
1656 ret = symlink("console", kpath);
1657 if (ret) {
1658 SYSERROR("failed to create symlink for kmsg");
1659 return -1;
1660 }
1661
1662 return 0;
1663}
1664
998ac676
RT
1665static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1666{
1667 struct mount_opt *mo;
1668
1669 /* If opt is found in mount_opt, set or clear flags.
1670 * Otherwise append it to data. */
1671
1672 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1673 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1674 if (mo->clear)
1675 *flags &= ~mo->flag;
1676 else
1677 *flags |= mo->flag;
1678 return;
1679 }
1680 }
1681
1682 if (strlen(*data))
1683 strcat(*data, ",");
1684 strcat(*data, opt);
1685}
1686
/*
 * Split a comma separated mount option string into mount flags and a data
 * string.
 *
 * @mntflags receives the flags collected by parse_mntopt(). @mntdata
 * receives a newly allocated string with the remaining options, or NULL if
 * there are none; the caller has to free it. Both outputs are reset before
 * anything else happens, so they are valid even for a NULL @mntopts or on
 * failure.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the option string. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1725
6fd5e769
SH
/* Terminate @word in place at its first space or tab, if there is one. */
static void null_endofword(char *word)
{
	/* strcspn() yields the index of the first blank or of the final NUL. */
	word[strcspn(word, " \t")] = '\0';
}
1732
/*
 * Skip @nfields blank-terminated fields in @src.
 *
 * Each step moves past the characters up to the next space or tab and past
 * that single separator. Returns a pointer to the resulting position, or to
 * the terminating NUL if the string ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int left = nfields;

	while (left > 0) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		left--;
	}

	return cur;
}
1750
911324ef
DL
/*
 * Perform one mount and, for bind or remount requests, a follow-up remount
 * so that the requested flags take effect.
 *
 * The first mount is done through safe_mount() with MS_REMOUNT masked out.
 * If MS_REMOUNT or MS_BIND was requested, the mount is repeated with
 * MS_REMOUNT set. When statvfs() is available, the nosuid, nodev (unless
 * @dev is set), read-only and noexec flags reported for @fsname are added to
 * the remount flags; for a plain bind mount the remount is skipped if that
 * would not add any flag and read-only was not requested.
 *
 * With @optional set a failed mount is only logged and 0 is returned.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and remounts need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto done;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

#ifdef HAVE_STATVFS
	{
		struct statvfs sb;
		/* Read-only explicitly requested by the caller. */
		unsigned long rqd_flags = mountflags & MS_RDONLY;

		if (statvfs(fsname, &sb) == 0) {
			/* Carry over the restrictions of the source. */
			unsigned long required_flags =
				rqd_flags |
				(sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC));

			if ((sb.f_flag & MS_NODEV) && !dev)
				required_flags |= MS_NODEV;

			DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

			/*
			 * A bind mount request whose flags already cover
			 * everything required, and which did not ask for
			 * read-only, does not need the remount.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) && rqd_flags == 0) {
				DEBUG("mountflags already was %lu, skipping remount",
				      mountflags);
				goto done;
			}

			mountflags |= required_flags;
		}
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

done:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1827
4e4ca161
SH
/*
 * Strip the first occurrence of each of 'create=dir', 'create=file' and
 * 'optional' from mntent->mnt_opts, editing the string in place.
 *
 * An option followed by a comma is removed together with that comma; an
 * option at the end of the string is cut off where it starts.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const lxc_only[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t k;

	for (k = 0; k < sizeof(lxc_only) / sizeof(lxc_only[0]); k++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, lxc_only[k]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* Pull the rest of the list (incl. NUL) forward. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1852
/*
 * Create the mount target @path for @mntent if the entry asks for it.
 *
 * For mount types starting with "overlay" or "aufs" the respective helper is
 * called first to create the directories these filesystems need. After that
 * the "create=dir" option creates @path as a directory (including parents),
 * and "create=file" creates @path as an empty file, together with its parent
 * directory, if @path does not exist yet.
 *
 * Returns 0 on success, -1 if a helper failed or the target could not be
 * created. A failure to create the parent directory of a file target is only
 * logged.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	char *pathcopy = NULL;
	int ret = 0;
	FILE *pathfile = NULL;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage (e.g. "."), so it gets a private copy and
		 * only the pointer returned by strdup() is ever freed.
		 */
		pathcopy = strdup(path);
		if (!pathcopy || mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1893
ec50007f
CB
1894/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1895 * without a rootfs. */
db4aba38 1896static inline int mount_entry_on_generic(struct mntent *mntent,
0a2dddd4
CB
1897 const char* path, const struct lxc_rootfs *rootfs,
1898 const char *lxc_name, const char *lxc_path)
4d5b72a1
NC
1899{
1900 unsigned long mntflags;
1901 char *mntdata;
1902 int ret;
1903 bool optional = hasmntopt(mntent, "optional") != NULL;
ae7a770e 1904 bool dev = hasmntopt(mntent, "dev") != NULL;
4d5b72a1 1905
ec50007f
CB
1906 char *rootfs_path = NULL;
1907 if (rootfs && rootfs->path)
1908 rootfs_path = rootfs->mount;
1909
0a2dddd4 1910 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
34cfffb3 1911
608e3567
SH
1912 if (ret < 0)
1913 return optional ? 0 : -1;
1914
4e4ca161
SH
1915 cull_mntent_opt(mntent);
1916
a17b1e65
SG
1917 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1918 free(mntdata);
1919 return -1;
1920 }
1921
6e46cc0d 1922 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1923 mntdata, optional, dev, rootfs_path);
68c152ef 1924
911324ef 1925 free(mntdata);
911324ef
DL
1926 return ret;
1927}
1928
db4aba38
NC
/*
 * Mount an entry for a container that has no rootfs.
 *
 * For containers created without a rootfs all mounts are treated as absolute
 * paths starting at / on the host, so a leading '/' is added to the mount
 * directory if it is missing.
 *
 * Returns the result of mount_entry_on_generic(), or -1 if the path does not
 * fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char target[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] != '/') ? "/" : "";
	int len;

	len = snprintf(target, sizeof(target), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
1948
4e4ca161 1949static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 1950 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
1951 const char *lxc_name,
1952 const char *lxc_path)
911324ef 1953{
013bd428 1954 char *aux;
59760f5d 1955 char path[MAXPATHLEN];
80a881b2 1956 int r, ret = 0, offset;
67e571de 1957 const char *lxcpath;
0ad19a3f 1958
593e8478 1959 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
1960 if (!lxcpath) {
1961 ERROR("Out of memory");
1962 return -1;
1963 }
1964
80a881b2 1965 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
1966 * use $lxcpath/CN/rootfs as the target prefix */
1967 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
1968 if (r < 0 || r >= MAXPATHLEN)
1969 goto skipvarlib;
1970
1971 aux = strstr(mntent->mnt_dir, path);
1972 if (aux) {
1973 offset = strlen(path);
1974 goto skipabs;
1975 }
1976
1977skipvarlib:
013bd428
DL
1978 aux = strstr(mntent->mnt_dir, rootfs->path);
1979 if (!aux) {
1980 WARN("ignoring mount point '%s'", mntent->mnt_dir);
db4aba38 1981 return ret;
013bd428 1982 }
80a881b2
SH
1983 offset = strlen(rootfs->path);
1984
1985skipabs:
013bd428 1986
9ba8130c 1987 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
1988 aux + offset);
1989 if (r < 0 || r >= MAXPATHLEN) {
1990 WARN("pathnme too long for '%s'", mntent->mnt_dir);
a17b1e65
SG
1991 return -1;
1992 }
1993
0a2dddd4 1994 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 1995}
d330fe7b 1996
4e4ca161 1997static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
1998 const struct lxc_rootfs *rootfs,
1999 const char *lxc_name,
2000 const char *lxc_path)
911324ef
DL
2001{
2002 char path[MAXPATHLEN];
911324ef 2003 int ret;
d330fe7b 2004
34cfffb3 2005 /* relative to root mount point */
6e46cc0d 2006 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2007 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2008 ERROR("path name too long");
2009 return -1;
2010 }
911324ef 2011
0a2dddd4 2012 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2013}
2014
80a881b2 2015static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
0a2dddd4 2016 const char *lxc_name, const char *lxc_path)
911324ef 2017{
aaf901be
AM
2018 struct mntent mntent;
2019 char buf[4096];
911324ef 2020 int ret = -1;
e76b8764 2021
aaf901be 2022 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 2023
911324ef 2024 if (!rootfs->path) {
aaf901be 2025 if (mount_entry_on_systemfs(&mntent))
e76b8764 2026 goto out;
911324ef 2027 continue;
e76b8764
CDC
2028 }
2029
911324ef 2030 /* We have a separate root, mounts are relative to it */
aaf901be 2031 if (mntent.mnt_dir[0] != '/') {
0a2dddd4 2032 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef
DL
2033 goto out;
2034 continue;
2035 }
cd54d859 2036
0a2dddd4 2037 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef 2038 goto out;
0ad19a3f 2039 }
cd54d859 2040
0ad19a3f 2041 ret = 0;
cd54d859
DL
2042
2043 INFO("mount points have been setup");
0ad19a3f 2044out:
e7938e9e
MN
2045 return ret;
2046}
2047
/*
 * Mount all entries listed in the fstab file at @fstab.
 * A NULL @fstab is not an error and yields 0. Returns -1 if the file cannot
 * be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fp;
	int rc = 0;

	if (fstab) {
		fp = setmntent(fstab, "r");
		if (!fp) {
			SYSERROR("failed to use '%s'", fstab);
			return -1;
		}

		rc = mount_file_entries(rootfs, fp, lxc_name, lxc_path);
		endmntent(fp);
	}

	return rc;
}
2068
5ef5c9a3 2069FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2070{
5ef5c9a3 2071 int ret;
e7938e9e 2072 char *mount_entry;
5ef5c9a3
CB
2073 struct lxc_list *iterator;
2074 FILE *file;
2075 int fd = -1;
2076
2077 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2078 if (fd < 0) {
2079 if (errno != ENOSYS)
2080 return NULL;
2081 file = tmpfile();
2082 } else {
2083 file = fdopen(fd, "r+");
2084 }
e7938e9e 2085
e7938e9e 2086 if (!file) {
fad6ef95 2087 int saved_errno = errno;
5ef5c9a3
CB
2088 if (fd != -1)
2089 close(fd);
fad6ef95 2090 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
9fc7f8c0 2091 return NULL;
e7938e9e
MN
2092 }
2093
2094 lxc_list_for_each(iterator, mount) {
2095 mount_entry = iterator->elem;
5ef5c9a3
CB
2096 ret = fprintf(file, "%s\n", mount_entry);
2097 if (ret < strlen(mount_entry))
2098 WARN("Could not write mount entry to anonymous mount file.");
2099 }
2100
2101 if (fseek(file, 0, SEEK_SET) < 0) {
2102 fclose(file);
2103 return NULL;
e7938e9e
MN
2104 }
2105
9fc7f8c0
TA
2106 return file;
2107}
2108
5ef5c9a3
CB
/*
 * Mount the entries held in the @mount list by first serialising them into
 * an anonymous fstab-style file and then processing that file.
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *fp = make_anonymous_mount_file(mount);

	if (fp == NULL)
		return -1;

	rc = mount_file_entries(rootfs, fp, lxc_name, lxc_path);
	fclose(fp);

	return rc;
}
2125
bab88e68
CS
2126static int parse_cap(const char *cap)
2127{
2128 char *ptr = NULL;
84760c11 2129 size_t i;
2130 int capid = -1;
bab88e68 2131
7035407c
DE
2132 if (!strcmp(cap, "none"))
2133 return -2;
2134
bab88e68
CS
2135 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2136
2137 if (strcmp(cap, caps_opt[i].name))
2138 continue;
2139
2140 capid = caps_opt[i].value;
2141 break;
2142 }
2143
2144 if (capid < 0) {
2145 /* try to see if it's numeric, so the user may specify
2146 * capabilities that the running kernel knows about but
2147 * we don't */
2148 errno = 0;
2149 capid = strtol(cap, &ptr, 10);
2150 if (!ptr || *ptr != '\0' || errno != 0)
2151 /* not a valid number */
2152 capid = -1;
2153 else if (capid > lxc_caps_last_cap())
2154 /* we have a number but it's not a valid
2155 * capability */
2156 capid = -1;
2157 }
2158
2159 return capid;
2160}
2161
0769b82a
CS
2162int in_caplist(int cap, struct lxc_list *caps)
2163{
2164 struct lxc_list *iterator;
2165 int capid;
2166
2167 lxc_list_for_each(iterator, caps) {
2168 capid = parse_cap(iterator->elem);
2169 if (capid == cap)
2170 return 1;
2171 }
2172
2173 return 0;
2174}
2175
81810dd1
DL
2176static int setup_caps(struct lxc_list *caps)
2177{
2178 struct lxc_list *iterator;
2179 char *drop_entry;
bab88e68 2180 int capid;
81810dd1
DL
2181
2182 lxc_list_for_each(iterator, caps) {
2183
2184 drop_entry = iterator->elem;
2185
bab88e68 2186 capid = parse_cap(drop_entry);
d55bc1ad 2187
81810dd1 2188 if (capid < 0) {
1e11be34
DL
2189 ERROR("unknown capability %s", drop_entry);
2190 return -1;
81810dd1
DL
2191 }
2192
2193 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2194
2195 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2196 SYSERROR("failed to remove %s capability", drop_entry);
2197 return -1;
2198 }
81810dd1
DL
2199
2200 }
2201
1fb86a7c
SH
2202 DEBUG("capabilities have been setup");
2203
2204 return 0;
2205}
2206
2207static int dropcaps_except(struct lxc_list *caps)
2208{
2209 struct lxc_list *iterator;
2210 char *keep_entry;
1fb86a7c
SH
2211 int i, capid;
2212 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2213 INFO("found %d capabilities", numcaps);
1fb86a7c 2214
2caf9a97
SH
2215 if (numcaps <= 0 || numcaps > 200)
2216 return -1;
2217
1fb86a7c
SH
2218 // caplist[i] is 1 if we keep capability i
2219 int *caplist = alloca(numcaps * sizeof(int));
2220 memset(caplist, 0, numcaps * sizeof(int));
2221
2222 lxc_list_for_each(iterator, caps) {
2223
2224 keep_entry = iterator->elem;
2225
bab88e68 2226 capid = parse_cap(keep_entry);
1fb86a7c 2227
7035407c
DE
2228 if (capid == -2)
2229 continue;
2230
1fb86a7c
SH
2231 if (capid < 0) {
2232 ERROR("unknown capability %s", keep_entry);
2233 return -1;
2234 }
2235
8255688a 2236 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2237
2238 caplist[capid] = 1;
2239 }
2240 for (i=0; i<numcaps; i++) {
2241 if (caplist[i])
2242 continue;
2243 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2244 SYSERROR("failed to remove capability %d", i);
2245 return -1;
2246 }
1fb86a7c
SH
2247 }
2248
2249 DEBUG("capabilities have been setup");
81810dd1
DL
2250
2251 return 0;
2252}
2253
0ad19a3f 2254static int setup_hw_addr(char *hwaddr, const char *ifname)
2255{
2256 struct sockaddr sockaddr;
2257 struct ifreq ifr;
fad6ef95 2258 int ret, fd, saved_errno;
0ad19a3f 2259
3cfc0f3a
MN
2260 ret = lxc_convert_mac(hwaddr, &sockaddr);
2261 if (ret) {
2262 ERROR("mac address '%s' conversion failed : %s",
2263 hwaddr, strerror(-ret));
0ad19a3f 2264 return -1;
2265 }
2266
2267 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2268 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2269 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2270
2271 fd = socket(AF_INET, SOCK_DGRAM, 0);
2272 if (fd < 0) {
3ab87b66 2273 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2274 return -1;
2275 }
2276
2277 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2278 saved_errno = errno;
0ad19a3f 2279 close(fd);
2280 if (ret)
fad6ef95 2281 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2282
5da6aa8c 2283 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2284
0ad19a3f 2285 return ret;
2286}
2287
82d5ae15 2288static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2289{
82d5ae15
DL
2290 struct lxc_list *iterator;
2291 struct lxc_inetdev *inetdev;
3cfc0f3a 2292 int err;
0ad19a3f 2293
82d5ae15
DL
2294 lxc_list_for_each(iterator, ip) {
2295
2296 inetdev = iterator->elem;
2297
0093bb8c
DL
2298 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2299 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2300 if (err) {
2301 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2302 ifindex, strerror(-err));
82d5ae15
DL
2303 return -1;
2304 }
2305 }
2306
2307 return 0;
0ad19a3f 2308}
2309
82d5ae15 2310static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2311{
82d5ae15 2312 struct lxc_list *iterator;
7fa9074f 2313 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2314 int err;
0ad19a3f 2315
82d5ae15
DL
2316 lxc_list_for_each(iterator, ip) {
2317
2318 inet6dev = iterator->elem;
2319
b3df193c 2320 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2321 &inet6dev->mcast, &inet6dev->acast,
2322 inet6dev->prefix);
3cfc0f3a
MN
2323 if (err) {
2324 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2325 ifindex, strerror(-err));
82d5ae15 2326 return -1;
3cfc0f3a 2327 }
82d5ae15
DL
2328 }
2329
2330 return 0;
0ad19a3f 2331}
2332
/*
 * Configure one network device: resolve and rename the interface, set its
 * MAC address, add its IPv4/IPv6 addresses, optionally bring it (and the
 * loopback) up, and finally install the IPv4/IPv6 default gateways.
 *
 * Returns 0 on success (including the early exit for a device without an
 * ifindex that is not of type LXC_NET_VETH), -1 on any failure.
 */
static int setup_netdev(struct lxc_netdev *netdev)
{
	/* ifname and current_ifname refer to the same buffer */
	char ifname[IFNAMSIZ];
	char *current_ifname = ifname;
	int err;

	/* empty network namespace */
	if (!netdev->ifindex) {
		if (netdev->flags & IFF_UP) {
			err = lxc_netdev_up("lo");
			if (err) {
				ERROR("failed to set the loopback up : %s",
				      strerror(-err));
				return -1;
			}
		}
		/* only veth devices continue; their index is looked up by name */
		if (netdev->type != LXC_NET_VETH)
			return 0;
		netdev->ifindex = if_nametoindex(netdev->name);
	}

	/* get the new ifindex in case of physical netdev */
	if (netdev->type == LXC_NET_PHYS) {
		if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
			ERROR("failed to get ifindex for %s",
				netdev->link);
			return -1;
		}
	}

	/* retrieve the name of the interface */
	if (!if_indextoname(netdev->ifindex, current_ifname)) {
		ERROR("no interface corresponding to index '%d'",
		      netdev->ifindex);
		return -1;
	}

	/* default: let the system choose an interface name ("eth%d"), or
	 * reuse the link name for a physical device */
	if (!netdev->name)
		netdev->name = netdev->type == LXC_NET_PHYS ?
			netdev->link : "eth%d";

	/* rename the interface if its current name differs from the wanted one */
	if (strcmp(ifname, netdev->name) != 0) {
		err = lxc_netdev_rename_by_name(ifname, netdev->name);
		if (err) {
			ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
			      strerror(-err));
			return -1;
		}
	}

	/* Re-read the name of the interface because its name has changed
	 * and would be automatically allocated by the system
	 */
	if (!if_indextoname(netdev->ifindex, current_ifname)) {
		ERROR("no interface corresponding to index '%d'",
		      netdev->ifindex);
		return -1;
	}

	/* set a mac address */
	if (netdev->hwaddr) {
		if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
			ERROR("failed to setup hw address for '%s'",
			      current_ifname);
			return -1;
		}
	}

	/* setup ipv4 addresses on the interface */
	if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
		ERROR("failed to setup ip addresses for '%s'",
			      ifname);
		return -1;
	}

	/* setup ipv6 addresses on the interface */
	if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
		ERROR("failed to setup ipv6 addresses for '%s'",
			      ifname);
		return -1;
	}

	/* set the network device up */
	if (netdev->flags & IFF_UP) {
		/* NOTE(review): shadows the function-scope 'err' declared above */
		int err;

		err = lxc_netdev_up(current_ifname);
		if (err) {
			ERROR("failed to set '%s' up : %s", current_ifname,
			      strerror(-err));
			return -1;
		}

		/* the network is up, make the loopback up too */
		err = lxc_netdev_up("lo");
		if (err) {
			ERROR("failed to set the loopback up : %s",
			      strerror(-err));
			return -1;
		}
	}

	/* We can only set up the default routes after bringing
	 * up the interface, since bringing up the interface adds
	 * the link-local routes and we can't add a default
	 * route if the gateway is not reachable. */

	/* setup ipv4 gateway on the interface */
	if (netdev->ipv4_gateway) {
		if (!(netdev->flags & IFF_UP)) {
			ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
			return -1;
		}

		if (lxc_list_empty(&netdev->ipv4)) {
			ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
			return -1;
		}

		err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
		if (err) {
			/* first attempt failed: add a route to the gateway
			 * itself, then retry; a failure of the dest route is
			 * only logged */
			err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
			if (err) {
				ERROR("failed to add ipv4 dest for '%s': %s",
					      ifname, strerror(-err));
			}

			err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
			if (err) {
				ERROR("failed to setup ipv4 gateway for '%s': %s",
					      ifname, strerror(-err));
				if (netdev->ipv4_gateway_auto) {
					char buf[INET_ADDRSTRLEN];
					inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
					ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
				}
				return -1;
			}
		}
	}

	/* setup ipv6 gateway on the interface */
	if (netdev->ipv6_gateway) {
		if (!(netdev->flags & IFF_UP)) {
			ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
			return -1;
		}

		/* a link-local gateway is accepted even without an assigned address */
		if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
			ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
			return -1;
		}

		err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
		if (err) {
			/* same retry scheme as for the ipv4 gateway above */
			err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
			if (err) {
				ERROR("failed to add ipv6 dest for '%s': %s",
				      ifname, strerror(-err));
			}

			err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
			if (err) {
				ERROR("failed to setup ipv6 gateway for '%s': %s",
					      ifname, strerror(-err));
				if (netdev->ipv6_gateway_auto) {
					char buf[INET6_ADDRSTRLEN];
					inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
					ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
				}
				return -1;
			}
		}
	}

	DEBUG("'%s' has been setup", current_ifname);

	return 0;
}
2514
5f4535a3 2515static int setup_network(struct lxc_list *network)
0ad19a3f 2516{
82d5ae15 2517 struct lxc_list *iterator;
82d5ae15 2518 struct lxc_netdev *netdev;
0ad19a3f 2519
5f4535a3 2520 lxc_list_for_each(iterator, network) {
cd54d859 2521
5f4535a3 2522 netdev = iterator->elem;
82d5ae15
DL
2523
2524 if (setup_netdev(netdev)) {
2525 ERROR("failed to setup netdev");
2526 return -1;
2527 }
2528 }
cd54d859 2529
5f4535a3
DL
2530 if (!lxc_list_empty(network))
2531 INFO("network has been setup");
cd54d859
DL
2532
2533 return 0;
0ad19a3f 2534}
2535
c6d09e15
WB
2536static int parse_resource(const char *res) {
2537 size_t i;
2538 int resid = -1;
2539
2540 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2541 if (strcmp(res, limit_opt[i].name) == 0)
2542 return limit_opt[i].value;
2543 }
2544
2545 /* try to see if it's numeric, so the user may specify
2546 * resources that the running kernel knows about but
2547 * we don't */
2548 if (lxc_safe_int(res, &resid) == 0)
2549 return resid;
2550 return -1;
2551}
2552
2553int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2554 struct lxc_list *it;
2555 struct lxc_limit *lim;
2556 int resid;
2557
2558 lxc_list_for_each(it, limits) {
2559 lim = it->elem;
2560
2561 resid = parse_resource(lim->resource);
2562 if (resid < 0) {
2563 ERROR("unknown resource %s", lim->resource);
2564 return -1;
2565 }
2566
2567 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2568 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2569 return -1;
2570 }
2571 }
2572 return 0;
2573}
2574
2af6bd1b 2575/* try to move physical nics to the init netns */
5610055a 2576void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2577{
64d2fcb5 2578 int i, oldfd;
4ec31c52 2579 char ifname[IFNAMSIZ];
2af6bd1b 2580
5610055a 2581 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2582 return;
2583
64d2fcb5 2584 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2585
64d2fcb5
CB
2586 oldfd = lxc_preserve_ns(getpid(), "net");
2587 if (oldfd < 0) {
2588 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2589 return;
2590 }
64d2fcb5 2591
2af6bd1b
SH
2592 if (setns(netnsfd, 0) != 0) {
2593 SYSERROR("Failed to enter container netns to reset nics");
2594 close(oldfd);
2595 return;
2596 }
2597 for (i=0; i<conf->num_savednics; i++) {
2598 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2599 /* retrieve the name of the interface */
2600 if (!if_indextoname(s->ifindex, ifname)) {
2601 WARN("no interface corresponding to index '%d'", s->ifindex);
2602 continue;
2603 }
5610055a 2604 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2605 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2606 free(s->orig_name);
2af6bd1b 2607 }
5610055a
WB
2608 conf->num_savednics = 0;
2609
2af6bd1b
SH
2610 if (setns(oldfd, 0) != 0)
2611 SYSERROR("Failed to re-enter monitor's netns");
2612 close(oldfd);
2613}
2614
ae9242c8
SH
/* Default rootfs mount point; lxc_conf_init() stores a strdup() of it in
 * rootfs.mount. */
static char *default_rootfs_mount = LXCROOTFSMOUNT;
2616
7b379ab3 2617struct lxc_conf *lxc_conf_init(void)
089cd8b8 2618{
7b379ab3 2619 struct lxc_conf *new;
26ddeedd 2620 int i;
7b379ab3
MN
2621
2622 new = malloc(sizeof(*new));
2623 if (!new) {
2624 ERROR("lxc_conf_init : %m");
2625 return NULL;
2626 }
2627 memset(new, 0, sizeof(*new));
2628
b40a606e 2629 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
cccc74b5 2630 new->personality = -1;
124fa0a8 2631 new->autodev = 1;
596a818d
DE
2632 new->console.log_path = NULL;
2633 new->console.log_fd = -1;
28a4b0e5 2634 new->console.path = NULL;
63376d7d 2635 new->console.peer = -1;
b5159817
DE
2636 new->console.peerpty.busy = -1;
2637 new->console.peerpty.master = -1;
2638 new->console.peerpty.slave = -1;
63376d7d
DL
2639 new->console.master = -1;
2640 new->console.slave = -1;
2641 new->console.name[0] = '\0';
d2e30e99 2642 new->maincmd_fd = -1;
76a26f55 2643 new->nbd_idx = -1;
54c30e29 2644 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048
SH
2645 if (!new->rootfs.mount) {
2646 ERROR("lxc_conf_init : %m");
2647 free(new);
2648 return NULL;
2649 }
d89de239 2650 new->kmsg = 0;
858377e4 2651 new->logfd = -1;
7b379ab3
MN
2652 lxc_list_init(&new->cgroup);
2653 lxc_list_init(&new->network);
2654 lxc_list_init(&new->mount_list);
81810dd1 2655 lxc_list_init(&new->caps);
1fb86a7c 2656 lxc_list_init(&new->keepcaps);
f6d3e3e4 2657 lxc_list_init(&new->id_map);
f979ac15 2658 lxc_list_init(&new->includes);
4184c3e1 2659 lxc_list_init(&new->aliens);
7c661726 2660 lxc_list_init(&new->environment);
c6d09e15 2661 lxc_list_init(&new->limits);
26ddeedd
SH
2662 for (i=0; i<NUM_LXC_HOOKS; i++)
2663 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2664 lxc_list_init(&new->groups);
fe4de9a6
DE
2665 new->lsm_aa_profile = NULL;
2666 new->lsm_se_context = NULL;
5112cd70 2667 new->tmp_umount_proc = 0;
7b379ab3 2668
9f30a190
MM
2669 for (i = 0; i < LXC_NS_MAX; i++)
2670 new->inherit_ns_fd[i] = -1;
2671
72bb04e4
PT
2672 /* if running in a new user namespace, init and COMMAND
2673 * default to running as UID/GID 0 when using lxc-execute */
2674 new->init_uid = 0;
2675 new->init_gid = 0;
2676
7b379ab3 2677 return new;
089cd8b8
DL
2678}
2679
a589434e 2680static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2681{
b0ee5983
CB
2682 char *veth1, *veth2;
2683 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
b7b2fde4
CB
2684 int bridge_index, err;
2685 unsigned int mtu = 0;
13954cce 2686
8bee8851 2687 if (netdev->priv.veth_attr.pair) {
e892973e 2688 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2689 if (handler->conf->reboot)
2690 lxc_netdev_delete_by_name(veth1);
2691 } else {
9ba8130c
SH
2692 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2693 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2694 ERROR("veth1 name too long");
2695 return -1;
2696 }
a0265685 2697 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2698 if (!veth1) {
2699 ERROR("failed to allocate a temporary name");
2700 return -1;
2701 }
74a2b586
JK
2702 /* store away for deconf */
2703 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2704 }
82d5ae15 2705
0e391e57 2706 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2707 veth2 = lxc_mkifname(veth2buf);
ad40563e 2708 if (!veth2) {
82d5ae15 2709 ERROR("failed to allocate a temporary name");
ad40563e 2710 goto out_delete;
0ad19a3f 2711 }
2712
3cfc0f3a
MN
2713 err = lxc_veth_create(veth1, veth2);
2714 if (err) {
b0ee5983
CB
2715 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2716 veth2, strerror(-err));
ad40563e 2717 goto out_delete;
0ad19a3f 2718 }
13954cce 2719
49684c0b
CS
2720 /* changing the high byte of the mac address to 0xfe, the bridge interface
2721 * will always keep the host's mac address and not take the mac address
2722 * of a container */
2723 err = setup_private_host_hw_addr(veth1);
2724 if (err) {
b0ee5983
CB
2725 ERROR("failed to change mac address of host interface \"%s\": %s",
2726 veth1, strerror(-err));
49684c0b
CS
2727 goto out_delete;
2728 }
2729
af651aa9
SN
2730 netdev->ifindex = if_nametoindex(veth2);
2731 if (!netdev->ifindex) {
b0ee5983 2732 ERROR("failed to retrieve the index for \"%s\"", veth2);
af651aa9
SN
2733 goto out_delete;
2734 }
2735
82d5ae15 2736 if (netdev->mtu) {
b7b2fde4 2737 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
b0ee5983 2738 WARN("failed to parse mtu from");
b7b2fde4 2739 else
b0ee5983 2740 INFO("retrieved mtu %d", mtu);
e54864d3 2741 } else if (netdev->link) {
e9280f65 2742 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2743 if (bridge_index) {
2744 mtu = netdev_get_mtu(bridge_index);
b0ee5983 2745 INFO("retrieved mtu %d from %s", mtu, netdev->link);
729e8bf6
CB
2746 } else {
2747 mtu = netdev_get_mtu(netdev->ifindex);
b0ee5983 2748 INFO("retrieved mtu %d from %s", mtu, veth2);
729e8bf6 2749 }
e54864d3
NC
2750 }
2751
2752 if (mtu) {
2753 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2754 if (!err)
e54864d3 2755 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2756 if (err) {
b0ee5983
CB
2757 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2758 "and \"%s\": %s",
e54864d3 2759 mtu, veth1, veth2, strerror(-err));
eb14c10a 2760 goto out_delete;
75d09f83
DL
2761 }
2762 }
2763
3cfc0f3a 2764 if (netdev->link) {
c43cbc04 2765 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2766 if (err) {
b0ee5983
CB
2767 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2768 veth1, netdev->link, strerror(-err));
3cfc0f3a
MN
2769 goto out_delete;
2770 }
b0ee5983 2771 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
eb14c10a
DL
2772 }
2773
d472214b 2774 err = lxc_netdev_up(veth1);
6e35af2e 2775 if (err) {
b0ee5983 2776 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
6e35af2e 2777 goto out_delete;
0ad19a3f 2778 }
2779
e3b4c4c4 2780 if (netdev->upscript) {
751d9dcd
DL
2781 err = run_script(handler->name, "net", netdev->upscript, "up",
2782 "veth", veth1, (char*) NULL);
2783 if (err)
e3b4c4c4 2784 goto out_delete;
e3b4c4c4
ST
2785 }
2786
b0ee5983
CB
2787 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2788 netdev->ifindex);
82d5ae15 2789
6ab9ab6d 2790 return 0;
eb14c10a
DL
2791
2792out_delete:
b316d209
CB
2793 if (netdev->ifindex != 0)
2794 lxc_netdev_delete_by_name(veth1);
f10fad2f 2795 if (!netdev->priv.veth_attr.pair)
ad40563e 2796 free(veth1);
f10fad2f 2797 free(veth2);
6ab9ab6d 2798 return -1;
13954cce 2799}
d957ae2d 2800
74a2b586
JK
2801static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2802{
2803 char *veth1;
2804 int err;
2805
2806 if (netdev->priv.veth_attr.pair)
2807 veth1 = netdev->priv.veth_attr.pair;
2808 else
2809 veth1 = netdev->priv.veth_attr.veth1;
2810
2811 if (netdev->downscript) {
2812 err = run_script(handler->name, "net", netdev->downscript,
2813 "down", "veth", veth1, (char*) NULL);
2814 if (err)
2815 return -1;
2816 }
2817 return 0;
2818}
2819
a589434e 2820static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2821{
0e391e57 2822 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2823 int err;
d957ae2d
MT
2824
2825 if (!netdev->link) {
2826 ERROR("no link specified for macvlan netdev");
2827 return -1;
2828 }
13954cce 2829
9ba8130c
SH
2830 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2831 if (err >= sizeof(peerbuf))
2832 return -1;
82d5ae15 2833
a0265685 2834 peer = lxc_mkifname(peerbuf);
ad40563e 2835 if (!peer) {
82d5ae15
DL
2836 ERROR("failed to make a temporary name");
2837 return -1;
0ad19a3f 2838 }
2839
3cfc0f3a
MN
2840 err = lxc_macvlan_create(netdev->link, peer,
2841 netdev->priv.macvlan_attr.mode);
2842 if (err) {
2843 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2844 peer, netdev->link, strerror(-err));
ad40563e 2845 goto out;
0ad19a3f 2846 }
2847
82d5ae15
DL
2848 netdev->ifindex = if_nametoindex(peer);
2849 if (!netdev->ifindex) {
36eb9bde 2850 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2851 goto out;
22ebac19 2852 }
2853
e3b4c4c4 2854 if (netdev->upscript) {
751d9dcd
DL
2855 err = run_script(handler->name, "net", netdev->upscript, "up",
2856 "macvlan", netdev->link, (char*) NULL);
2857 if (err)
ad40563e 2858 goto out;
e3b4c4c4
ST
2859 }
2860
a589434e 2861 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2862 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2863
d957ae2d 2864 return 0;
ad40563e
ÇO
2865out:
2866 lxc_netdev_delete_by_name(peer);
2867 free(peer);
2868 return -1;
0ad19a3f 2869}
2870
74a2b586
JK
2871static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2872{
2873 int err;
2874
2875 if (netdev->downscript) {
2876 err = run_script(handler->name, "net", netdev->downscript,
2877 "down", "macvlan", netdev->link,
2878 (char*) NULL);
2879 if (err)
2880 return -1;
2881 }
2882 return 0;
2883}
2884
a589434e
JN
2885/* XXX: merge with instantiate_macvlan */
2886static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2887{
2888 char peer[IFNAMSIZ];
3cfc0f3a 2889 int err;
82f58d03 2890 static uint16_t vlan_cntr = 0;
b7b2fde4 2891 unsigned int mtu = 0;
26c39028
JHS
2892
2893 if (!netdev->link) {
2894 ERROR("no link specified for vlan netdev");
2895 return -1;
2896 }
2897
82f58d03 2898 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2899 if (err >= sizeof(peer)) {
2900 ERROR("peer name too long");
2901 return -1;
2902 }
26c39028 2903
3cfc0f3a
MN
2904 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2905 if (err) {
2906 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2907 peer, netdev->link, strerror(-err));
26c39028
JHS
2908 return -1;
2909 }
2910
2911 netdev->ifindex = if_nametoindex(peer);
2912 if (!netdev->ifindex) {
2913 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2914 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2915 return -1;
2916 }
2917
a589434e 2918 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 2919 netdev->ifindex);
b4fb7de1 2920 if (netdev->mtu) {
b7b2fde4
CB
2921 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2922 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2923 netdev->ifindex, netdev->name);
2924 return -1;
2925 }
2926 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
2927 if (err) {
2928 ERROR("failed to set mtu '%s' for %s : %s",
2929 netdev->mtu, peer, strerror(-err));
2930 lxc_netdev_delete_by_name(peer);
2931 return -1;
2932 }
2933 }
e892973e 2934
26c39028
JHS
2935 return 0;
2936}
2937
74a2b586
JK
/* Nothing to tear down for a vlan device; always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2942
a589434e 2943static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2944{
6168e99f
DL
2945 if (!netdev->link) {
2946 ERROR("no link specified for the physical interface");
2947 return -1;
2948 }
2949
9d083402 2950 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 2951 if (!netdev->ifindex) {
9d083402 2952 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 2953 return -1;
2954 }
2955
e3b4c4c4
ST
2956 if (netdev->upscript) {
2957 int err;
751d9dcd
DL
2958 err = run_script(handler->name, "net", netdev->upscript,
2959 "up", "phys", netdev->link, (char*) NULL);
2960 if (err)
e3b4c4c4 2961 return -1;
e3b4c4c4
ST
2962 }
2963
82d5ae15 2964 return 0;
0ad19a3f 2965}
2966
74a2b586
JK
2967static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2968{
2969 int err;
2970
2971 if (netdev->downscript) {
2972 err = run_script(handler->name, "net", netdev->downscript,
2973 "down", "phys", netdev->link, (char*) NULL);
2974 if (err)
2975 return -1;
2976 }
2977 return 0;
2978}
2979
a589434e 2980static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
2981{
2982 netdev->ifindex = 0;
2983 return 0;
2984}
2985
a589434e 2986static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2987{
82d5ae15 2988 netdev->ifindex = 0;
e3b4c4c4
ST
2989 if (netdev->upscript) {
2990 int err;
751d9dcd
DL
2991 err = run_script(handler->name, "net", netdev->upscript,
2992 "up", "empty", (char*) NULL);
2993 if (err)
e3b4c4c4 2994 return -1;
e3b4c4c4 2995 }
82d5ae15 2996 return 0;
0ad19a3f 2997}
2998
74a2b586
JK
2999static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3000{
3001 int err;
3002
3003 if (netdev->downscript) {
3004 err = run_script(handler->name, "net", netdev->downscript,
3005 "down", "empty", (char*) NULL);
3006 if (err)
3007 return -1;
3008 }
3009 return 0;
3010}
3011
26b797f3
SH
/* Nothing to tear down for a "none" device; always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3016
3017int lxc_requests_empty_network(struct lxc_handler *handler)
3018{
3019 struct lxc_list *network = &handler->conf->network;
3020 struct lxc_list *iterator;
3021 struct lxc_netdev *netdev;
3022 bool found_none = false, found_nic = false;
3023
3024 if (lxc_list_empty(network))
3025 return 0;
3026
3027 lxc_list_for_each(iterator, network) {
3028
3029 netdev = iterator->elem;
3030
3031 if (netdev->type == LXC_NET_NONE)
3032 found_none = true;
3033 else
3034 found_nic = true;
3035 }
3036 if (found_none && !found_nic)
3037 return 1;
3038 return 0;
3039}
3040
e3b4c4c4 3041int lxc_create_network(struct lxc_handler *handler)
0ad19a3f 3042{
e3b4c4c4 3043 struct lxc_list *network = &handler->conf->network;
82d5ae15 3044 struct lxc_list *iterator;
82d5ae15 3045 struct lxc_netdev *netdev;
cbef6c52
SH
3046 int am_root = (getuid() == 0);
3047
3048 if (!am_root)
3049 return 0;
0ad19a3f 3050
5f4535a3 3051 lxc_list_for_each(iterator, network) {
0ad19a3f 3052
5f4535a3 3053 netdev = iterator->elem;
13954cce 3054
24654103 3055 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
82d5ae15 3056 ERROR("invalid network configuration type '%d'",
5f4535a3 3057 netdev->type);
82d5ae15
DL
3058 return -1;
3059 }
0ad19a3f 3060
e3b4c4c4 3061 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3062 ERROR("failed to create netdev");
3063 return -1;
3064 }
e3b4c4c4 3065
0ad19a3f 3066 }
3067
3068 return 0;
3069}
3070
358daf49 3071bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3072{
e97946ae 3073 int ret;
74a2b586 3074 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3075 struct lxc_list *iterator;
3076 struct lxc_netdev *netdev;
358daf49 3077 bool deleted_all = true;
7fef7a06
DL
3078
3079 lxc_list_for_each(iterator, network) {
3080 netdev = iterator->elem;
d472214b 3081
74a2b586 3082 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3083 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3084 WARN("Failed to rename interface with index %d "
3085 "to its initial name \"%s\".",
3086 netdev->ifindex, netdev->link);
d472214b 3087 continue;
d8f8e352 3088 }
d472214b 3089
74a2b586 3090 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3091 WARN("Failed to destroy netdev");
74a2b586
JK
3092 }
3093
d8f8e352
DL
3094 /* Recent kernel remove the virtual interfaces when the network
3095 * namespace is destroyed but in case we did not moved the
3096 * interface to the network namespace, we have to destroy it
3097 */
e97946ae
CB
3098 if (netdev->ifindex != 0) {
3099 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3100 if (-ret == ENODEV) {
3101 INFO("Interface \"%s\" with index %d already "
3102 "deleted or existing in different network "
3103 "namespace.",
3104 netdev->name ? netdev->name : "(null)",
3105 netdev->ifindex);
3106 } else if (ret < 0) {
3107 deleted_all = false;
3108 WARN("Failed to remove interface \"%s\" with "
3109 "index %d: %s.",
3110 netdev->name ? netdev->name : "(null)",
3111 netdev->ifindex, strerror(-ret));
3112 } else {
3113 INFO("Removed interface \"%s\" with index %d.",
3114 netdev->name ? netdev->name : "(null)",
3115 netdev->ifindex);
3116 }
e97946ae
CB
3117 }
3118
3119 /* Explicitly delete host veth device to prevent lingering
3120 * devices. We had issues in LXD around this.
3121 */
b316d209 3122 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3123 char *hostveth;
3124 if (netdev->priv.veth_attr.pair) {
e97946ae 3125 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3126 ret = lxc_netdev_delete_by_name(hostveth);
3127 if (ret < 0) {
3128 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3129 } else {
3130 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3131 }
3132 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3133 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3134 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3135 if (ret < 0) {
3136 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3137 } else {
3138 INFO("Removed interface \"%s\" from host.", hostveth);
3139 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3140 }
e97946ae
CB
3141 }
3142 }
7fef7a06 3143 }
358daf49
CB
3144
3145 return deleted_all;
7fef7a06
DL
3146}
3147
45e854dc
SG
3148#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3149
fe1f672f 3150/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3151#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3152static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3153 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3154{
3155 pid_t child;
a7242d9a
ÇO
3156 int bytes, pipefd[2];
3157 char *token, *saveptr = NULL;
fe1f672f 3158 char buffer[MAX_BUFFER_SIZE];
091045f8 3159 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3160
3161 if (netdev->type != LXC_NET_VETH) {
3162 ERROR("nic type %d not support for unprivileged use",
091045f8 3163 netdev->type);
cbef6c52
SH
3164 return -1;
3165 }
3166
091045f8 3167 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3168 SYSERROR("pipe failed");
3169 return -1;
3170 }
3171
091045f8
CB
3172 child = fork();
3173 if (child < 0) {
cbef6c52 3174 SYSERROR("fork");
a7242d9a
ÇO
3175 close(pipefd[0]);
3176 close(pipefd[1]);
3177 return -1;
3178 }
3179
3180 if (child == 0) { // child
091045f8
CB
3181 /* Call lxc-user-nic pid type bridge. */
3182 int ret;
3183 char pidstr[LXC_NUMSTRLEN64];
3184
3185 close(pipefd[0]); /* Close the read-end of the pipe. */
3186
3187 /* Redirect stdout to write-end of the pipe. */
3188 ret = dup2(pipefd[1], STDOUT_FILENO);
3189 close(pipefd[1]); /* Close the write-end of the pipe. */
3190 if (ret < 0) {
3191 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3192 exit(EXIT_FAILURE);
3193 }
a7242d9a 3194
091045f8 3195 if (netdev->link)
cff7b5eb 3196 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3197 else
cff7b5eb 3198 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3199
3200 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3201 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3202 exit(EXIT_FAILURE);
3203 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3204
3205 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3206 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3207 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3208 pidstr, "veth", netdev_link, netdev->name, NULL);
3209
3210 SYSERROR("Failed to exec lxc-user-nic.");
3211 exit(EXIT_FAILURE);
a7242d9a
ÇO
3212 }
3213
3214 /* close the write-end of the pipe */
3215 close(pipefd[1]);
3216
fe1f672f 3217 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3218 if (bytes < 0)
3219 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3220 buffer[bytes - 1] = '\0';
3221
3222 if (wait_for_pid(child) != 0) {
3223 close(pipefd[0]);
cbef6c52
SH
3224 return -1;
3225 }
3226
a7242d9a
ÇO
3227 /* close the read-end of the pipe */
3228 close(pipefd[0]);
cbef6c52 3229
a7242d9a
ÇO
3230 /* fill netdev->name field */
3231 token = strtok_r(buffer, ":", &saveptr);
3232 if (!token)
3233 return -1;
091045f8
CB
3234
3235 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3236 if (!netdev->name) {
091045f8 3237 SYSERROR("Failed to allocate memory.");
658979c5
SH
3238 return -1;
3239 }
091045f8 3240 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3241 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3242
3243 /* fill netdev->veth_attr.pair field */
3244 token = strtok_r(NULL, ":", &saveptr);
3245 if (!token)
3246 return -1;
091045f8 3247
a7242d9a 3248 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3249 if (!netdev->priv.veth_attr.pair) {
091045f8 3250 ERROR("Failed to allocate memory.");
658979c5
SH
3251 return -1;
3252 }
45e854dc 3253
a7242d9a 3254 return 0;
cbef6c52
SH
3255}
3256
c43cbc04
SH
3257int lxc_assign_network(const char *lxcpath, char *lxcname,
3258 struct lxc_list *network, pid_t pid)
0ad19a3f 3259{
82d5ae15 3260 struct lxc_list *iterator;
82d5ae15 3261 struct lxc_netdev *netdev;
f2e206ff 3262 char ifname[IFNAMSIZ];
cbef6c52 3263 int am_root = (getuid() == 0);
3cfc0f3a 3264 int err;
0ad19a3f 3265
5f4535a3 3266 lxc_list_for_each(iterator, network) {
82d5ae15 3267
5f4535a3 3268 netdev = iterator->elem;
82d5ae15 3269
fbb16259 3270 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3271 if (netdev->mtu)
3272 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3273 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3274 return -1;
658979c5
SH
3275 // lxc-user-nic has moved the nic to the new ns.
3276 // unpriv_assign_nic() fills in netdev->name.
3277 // netdev->ifindex will be filed in at setup_netdev.
cbef6c52
SH
3278 continue;
3279 }
236087a6 3280
fbb16259
SH
3281 /* empty network namespace, nothing to move */
3282 if (!netdev->ifindex)
3283 continue;
3284
f2e206ff 3285 /* retrieve the name of the interface */
3286 if (!if_indextoname(netdev->ifindex, ifname)) {
3287 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3288 return -1;
3289 }
3290
3291 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3292 if (err) {
3293 ERROR("failed to move '%s' to the container : %s",
3294 netdev->link, strerror(-err));
82d5ae15
DL
3295 return -1;
3296 }
3297
198cbbaa 3298 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3299 }
3300
3301 return 0;
3302}
3303
251d0d2a
DE
3304static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3305 size_t buf_size)
f6d3e3e4 3306{
29053180
CB
3307 char path[MAXPATHLEN];
3308 int fd, ret;
f6d3e3e4 3309
29053180
CB
3310 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3311 idtype == ID_TYPE_UID ? 'u' : 'g');
3312 if (ret < 0 || ret >= MAXPATHLEN) {
3313 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3314 return -E2BIG;
3315 }
29053180
CB
3316
3317 fd = open(path, O_WRONLY);
3318 if (fd < 0) {
3319 SYSERROR("failed to open \"%s\"", path);
3320 return -1;
f6d3e3e4 3321 }
29053180
CB
3322
3323 errno = 0;
3324 ret = lxc_write_nointr(fd, buf, buf_size);
3325 if (ret != buf_size) {
3326 SYSERROR("failed to write %cid mapping to \"%s\"",
3327 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3328 close(fd);
3329 return -1;
3330 }
3331 close(fd);
3332
3333 return 0;
f6d3e3e4
SH
3334}
3335
df6a2945
CB
3336/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both. */
3337static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3338{
3339 char *path;
3340 int ret;
3341 struct stat st;
3342 int fret = 0;
3343
3344 path = on_path(binary, NULL);
3345 if (!path)
3346 return -ENOENT;
3347
3348 ret = stat(path, &st);
3349 if (ret < 0) {
3350 fret = -errno;
3351 goto cleanup;
3352 }
3353
3354 /* Check if the binary is setuid. */
3355 if (st.st_mode & S_ISUID) {
3356 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3357 fret = 1;
3358 goto cleanup;
3359 }
3360
69924fff 3361 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3362 /* Check if it has the CAP_SETUID capability. */
3363 if ((cap & CAP_SETUID) &&
3364 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3365 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3366 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3367 "and CAP_PERMITTED sets.", path);
3368 fret = 1;
3369 goto cleanup;
3370 }
3371
3372 /* Check if it has the CAP_SETGID capability. */
3373 if ((cap & CAP_SETGID) &&
3374 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3375 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3376 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3377 "and CAP_PERMITTED sets.", path);
3378 fret = 1;
3379 goto cleanup;
3380 }
d6018f88 3381 #else
69924fff
CB
3382 /* If we cannot check for file capabilities we need to give the benefit
3383 * of the doubt. Otherwise we might fail even though all the necessary
3384 * file capabilities are set.
3385 */
d6018f88
CB
3386 DEBUG("Cannot check for file capabilites as full capability support is "
3387 "missing. Manual intervention needed.");
3388 fret = 1;
df6a2945
CB
3389 #endif
3390
3391cleanup:
3392 free(path);
3393 return fret;
3394}
3395
986ef930
CB
/*
 * Callback that replaces the current process image with
 * "/bin/sh -c <args>", where @args is the command line as a C string.
 * Only returns (with -1) if the exec failed.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *shell_cmd = args;

	execl("/bin/sh", "sh", "-c", shell_cmd, (char *)NULL);

	return -1;
}
3401
f6d3e3e4
SH
3402int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3403{
f6d3e3e4 3404 struct id_map *map;
4bc3b759 3405 struct lxc_list *iterator;
251d0d2a 3406 enum idtype type;
986ef930 3407 char u_or_g;
4bc3b759 3408 char *pos;
99d43365 3409 int fill, left;
986ef930
CB
3410 char cmd_output[MAXPATHLEN];
3411 /* strlen("new@idmap") = 9
3412 * +
3413 * strlen(" ") = 1
3414 * +
3415 * LXC_NUMSTRLEN64
3416 * +
3417 * strlen(" ") = 1
3418 *
3419 * We add some additional space to make sure that we really have
3420 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3421 */
3422 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3423 int ret = 0, uidmap = 0, gidmap = 0;
3424 bool use_shadow = false, had_entry = false;
df6a2945
CB
3425
3426 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3427 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3428 * will protected it by preventing another user from being handed the
3429 * range by shadow.
3430 */
df6a2945
CB
3431 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3432 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3433 if (uidmap > 0 && gidmap > 0) {
3434 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3435 use_shadow = true;
df6a2945 3436 } else {
99d43365
CB
3437 /* In case unprivileged users run application containers via
3438 * execute() or a start*() there are valid cases where they may
3439 * only want to map their own {g,u}id. Let's not block them from
3440 * doing so by requiring geteuid() == 0.
3441 */
3442 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3443 "write directly with euid %d.", geteuid());
0e6e3a41 3444 }
251d0d2a 3445
986ef930
CB
3446 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3447 type++, u_or_g = 'g') {
3448 pos = mapbuf;
3449
0e6e3a41 3450 if (use_shadow)
986ef930 3451 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3452
cf3ef16d 3453 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3454 /* The kernel only takes <= 4k for writes to
3455 * /proc/<nr>/[ug]id_map
3456 */
251d0d2a 3457 map = iterator->elem;
cf3ef16d
SH
3458 if (map->idtype != type)
3459 continue;
3460
4bc3b759
CB
3461 had_entry = true;
3462
986ef930 3463 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3464 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3465 use_shadow ? " " : "", map->nsid,
3466 map->hostid, map->range,
0e6e3a41 3467 use_shadow ? "" : "\n");
cf3ef16d 3468 if (fill <= 0 || fill >= left)
4bc3b759
CB
3469 SYSERROR("Too many {g,u}id mappings defined.");
3470
cf3ef16d 3471 pos += fill;
251d0d2a 3472 }
cf3ef16d 3473 if (!had_entry)
4f7521b4 3474 continue;
cf3ef16d 3475
986ef930
CB
3476 /* Try to catch the ouput of new{g,u}idmap to make debugging
3477 * easier.
3478 */
3479 if (use_shadow) {
3480 ret = run_command(cmd_output, sizeof(cmd_output),
3481 lxc_map_ids_exec_wrapper,
3482 (void *)mapbuf);
3483 if (ret < 0) {
3484 ERROR("new%cidmap failed to write mapping: %s",
3485 u_or_g, cmd_output);
3486 return -1;
3487 }
d1838f34 3488 } else {
986ef930
CB
3489 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3490 if (ret < 0)
3491 return -1;
d1838f34 3492 }
986ef930
CB
3493
3494 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3495 }
251d0d2a 3496
986ef930 3497 return 0;
f6d3e3e4
SH
3498}
3499
cf3ef16d 3500/*
7b50c609
TS
3501 * return the host uid/gid to which the container root is mapped in
3502 * *val.
0b3a6504 3503 * Return true if id was found, false otherwise.
cf3ef16d 3504 */
2a9a80cb 3505bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3506 unsigned long *val)
cf3ef16d
SH
3507{
3508 struct lxc_list *it;
3509 struct id_map *map;
3510
3511 lxc_list_for_each(it, &conf->id_map) {
3512 map = it->elem;
7b50c609 3513 if (map->idtype != idtype)
cf3ef16d
SH
3514 continue;
3515 if (map->nsid != 0)
3516 continue;
2a9a80cb
SH
3517 *val = map->hostid;
3518 return true;
cf3ef16d 3519 }
2a9a80cb 3520 return false;
cf3ef16d
SH
3521}
3522
2133f58c 3523int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3524{
3525 struct lxc_list *it;
3526 struct id_map *map;
3527 lxc_list_for_each(it, &conf->id_map) {
3528 map = it->elem;
2133f58c 3529 if (map->idtype != idtype)
cf3ef16d
SH
3530 continue;
3531 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3532 return (id - map->hostid) + map->nsid;
cf3ef16d 3533 }
57d116ab 3534 return -1;
cf3ef16d
SH
3535}
3536
339efad9 3537int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3538{
3539 struct lxc_list *it;
3540 struct id_map *map;
2133f58c 3541 unsigned int freeid = 0;
cf3ef16d
SH
3542again:
3543 lxc_list_for_each(it, &conf->id_map) {
3544 map = it->elem;
2133f58c 3545 if (map->idtype != idtype)
cf3ef16d
SH
3546 continue;
3547 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3548 freeid = map->nsid + map->range;
3549 goto again;
3550 }
3551 }
3552 return freeid;
3553}
3554
19a26f82
MK
3555int lxc_find_gateway_addresses(struct lxc_handler *handler)
3556{
3557 struct lxc_list *network = &handler->conf->network;
3558 struct lxc_list *iterator;
3559 struct lxc_netdev *netdev;
3560 int link_index;
3561
3562 lxc_list_for_each(iterator, network) {
3563 netdev = iterator->elem;
3564
3565 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3566 continue;
3567
3568 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3569 ERROR("gateway = auto only supported for "
3570 "veth and macvlan");
3571 return -1;
3572 }
3573
3574 if (!netdev->link) {
3575 ERROR("gateway = auto needs a link interface");
3576 return -1;
3577 }
3578
3579 link_index = if_nametoindex(netdev->link);
3580 if (!link_index)
3581 return -EINVAL;
3582
3583 if (netdev->ipv4_gateway_auto) {
3584 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3585 ERROR("failed to automatically find ipv4 gateway "
3586 "address from link interface '%s'", netdev->link);
3587 return -1;
3588 }
3589 }
3590
3591 if (netdev->ipv6_gateway_auto) {
3592 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3593 ERROR("failed to automatically find ipv6 gateway "
3594 "address from link interface '%s'", netdev->link);
3595 return -1;
3596 }
3597 }
3598 }
3599
3600 return 0;
3601}
3602
5e4a62bf 3603int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3604{
5e4a62bf 3605 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3606 int i, ret;
b0a33c1e 3607
5e4a62bf
DL
3608 /* no tty in the configuration */
3609 if (!conf->tty)
b0a33c1e 3610 return 0;
3611
9e1045e3 3612 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3613 if (!tty_info->pty_info) {
9e1045e3
CB
3614 SYSERROR("failed to allocate struct *pty_info");
3615 return -ENOMEM;
b0a33c1e 3616 }
3617
985d15b1 3618 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3619 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3620
025ed0f3
SH
3621 process_lock();
3622 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3623 pty_info->name, NULL, NULL);
025ed0f3
SH
3624 process_unlock();
3625 if (ret) {
9e1045e3 3626 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3627 tty_info->nbtty = i;
3628 lxc_delete_tty(tty_info);
9e1045e3 3629 return -ENOTTY;
b0a33c1e 3630 }
3631
9e1045e3 3632 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3633 pty_info->name, pty_info->master, pty_info->slave);
3634
3ec1648d 3635 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3636 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3637 if (ret < 0)
3638 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3639 "pty device \"%s\": %s",
3640 pty_info->master, pty_info->name, strerror(errno));
3641
3642 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3643 if (ret < 0)
3644 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3645 "pty device \"%s\": %s",
3646 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3647
b0a33c1e 3648 pty_info->busy = 0;
3649 }
3650
985d15b1 3651 tty_info->nbtty = conf->tty;
1ac470c0 3652
9e1045e3 3653 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3654 return 0;
b0a33c1e 3655}
3656
3657void lxc_delete_tty(struct lxc_tty_info *tty_info)
3658{
3659 int i;
3660
3661 for (i = 0; i < tty_info->nbtty; i++) {
3662 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3663
3664 close(pty_info->master);
3665 close(pty_info->slave);
3666 }
3667
3668 free(tty_info->pty_info);
e00c0242 3669 tty_info->pty_info = NULL;
b0a33c1e 3670 tty_info->nbtty = 0;
3671}
3672
f4f52cb5
CB
3673
/*
 * Callback that replaces the current process image with lxc-usernsexec,
 * using @args as its NULL-terminated argument vector. Only returns (with
 * -1) if the exec failed.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
3679
f6d3e3e4 3680/*
7b50c609
TS
3681 * chown_mapped_root: for an unprivileged user with uid/gid X to
3682 * chown a dir to subuid/subgid Y, he needs to run chown as root
3683 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3684 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3685 * root is privileged with respect to hostuid/hostgid X, allowing
3686 * him to do the chown.
f6d3e3e4 3687 */
c4d10a05 3688int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3689{
f4f52cb5 3690 uid_t rootuid, rootgid;
2a9a80cb 3691 unsigned long val;
a7ef8753 3692 char *chownpath = path;
f4f52cb5
CB
3693 int hostuid, hostgid, ret;
3694 struct stat sb;
3695 char map1[100], map2[100], map3[100], map4[100], map5[100];
3696 char ugid[100];
3697 char *args1[] = {"lxc-usernsexec",
3698 "-m", map1,
3699 "-m", map2,
3700 "-m", map3,
3701 "-m", map5,
3702 "--", "chown", ugid, path,
3703 NULL};
3704 char *args2[] = {"lxc-usernsexec",
3705 "-m", map1,
3706 "-m", map2,
3707 "-m", map3,
3708 "-m", map4,
3709 "-m", map5,
3710 "--", "chown", ugid, path,
3711 NULL};
3712 char cmd_output[MAXPATHLEN];
3713
3714 hostuid = geteuid();
3715 hostgid = getegid();
f6d3e3e4 3716
2a9a80cb 3717 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3718 ERROR("No uid mapping for container root");
c4d10a05 3719 return -1;
f6d3e3e4 3720 }
f4f52cb5 3721 rootuid = (uid_t)val;
7b50c609 3722 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3723 ERROR("No gid mapping for container root");
7b50c609
TS
3724 return -1;
3725 }
f4f52cb5 3726 rootgid = (gid_t)val;
2a9a80cb 3727
a7ef8753 3728 /*
f4f52cb5 3729 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3730 */
1f92162d 3731 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3732 chownpath = strchr(path, ':');
3733 if (!chownpath) {
3734 ERROR("Bad overlay path: %s", path);
3735 return -1;
3736 }
f4f52cb5 3737 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3738 if (!chownpath) {
3739 ERROR("Bad overlay path: %s", path);
3740 return -1;
3741 }
3742 chownpath++;
3743 }
3744 path = chownpath;
f4f52cb5 3745 if (hostuid == 0) {
7b50c609 3746 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3747 ERROR("Error chowning %s", path);
3748 return -1;
3749 }
3750 return 0;
3751 }
f3d7e4ca 3752
f4f52cb5 3753 if (rootuid == hostuid) {
f3d7e4ca
SH
3754 // nothing to do
3755 INFO("%s: container root is our uid; no need to chown" ,__func__);
3756 return 0;
3757 }
3758
f4f52cb5
CB
3759 // save the current gid of "path"
3760 if (stat(path, &sb) < 0) {
3761 ERROR("Error stat %s", path);
f6d3e3e4
SH
3762 return -1;
3763 }
7b50c609 3764
f4f52cb5
CB
3765 /*
3766 * A file has to be group-owned by a gid mapped into the
3767 * container, or the container won't be privileged over it.
3768 */
3769 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3770 if (sb.st_uid == hostuid &&
3771 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3772 chown(path, -1, hostgid) < 0) {
3773 ERROR("Failed chgrping %s", path);
3774 return -1;
3775 }
f6d3e3e4 3776
f4f52cb5
CB
3777 // "u:0:rootuid:1"
3778 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3779 if (ret < 0 || ret >= 100) {
3780 ERROR("Error uid printing map string");
3781 return -1;
3782 }
7b50c609 3783
f4f52cb5
CB
3784 // "u:hostuid:hostuid:1"
3785 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3786 if (ret < 0 || ret >= 100) {
3787 ERROR("Error uid printing map string");
3788 return -1;
3789 }
c4d10a05 3790
f4f52cb5
CB
3791 // "g:0:rootgid:1"
3792 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3793 if (ret < 0 || ret >= 100) {
3794 ERROR("Error gid printing map string");
3795 return -1;
3796 }
98e5ba51 3797
f4f52cb5
CB
3798 // "g:pathgid:rootgid+pathgid:1"
3799 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3800 rootgid + (gid_t)sb.st_gid);
3801 if (ret < 0 || ret >= 100) {
3802 ERROR("Error gid printing map string");
3803 return -1;
3804 }
c4d10a05 3805
f4f52cb5
CB
3806 // "g:hostgid:hostgid:1"
3807 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3808 if (ret < 0 || ret >= 100) {
3809 ERROR("Error gid printing map string");
3810 return -1;
3811 }
7b50c609 3812
f4f52cb5
CB
3813 // "0:pathgid" (chown)
3814 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3815 if (ret < 0 || ret >= 100) {
3816 ERROR("Error owner printing format string for chown");
3817 return -1;
3818 }
7b50c609 3819
f4f52cb5
CB
3820 if (hostgid == sb.st_gid)
3821 ret = run_command(cmd_output, sizeof(cmd_output),
3822 chown_mapped_root_exec_wrapper,
3823 (void *)args1);
3824 else
3825 ret = run_command(cmd_output, sizeof(cmd_output),
3826 chown_mapped_root_exec_wrapper,
3827 (void *)args2);
3828 if (ret < 0)
3829 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3830
f4f52cb5 3831 return ret;
f6d3e3e4
SH
3832}
3833
c4d10a05 3834int ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3835{
c4d10a05 3836 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3837 return 0;
c4d10a05 3838
29b10e4f 3839 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
c4d10a05
SH
3840 ERROR("Failed to chown %s", c->console.name);
3841 return -1;
3842 }
3843
f6d3e3e4
SH
3844 return 0;
3845}
3846
943144d9
CB
3847/* NOTE: Must not be called from inside the container namespace! */
3848int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3849{
3850 int mounted;
3851
943144d9 3852 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3853 if (mounted == -1) {
943144d9 3854 SYSERROR("failed to mount /proc in the container");
01958b1f 3855 /* continue only if there is no rootfs */
943144d9 3856 if (conf->rootfs.path)
01958b1f 3857 return -1;
5112cd70 3858 } else if (mounted == 1) {
943144d9 3859 conf->tmp_umount_proc = 1;
5112cd70 3860 }
943144d9 3861
5112cd70
SH
3862 return 0;
3863}
3864
3865void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3866{
3867 if (lxc_conf->tmp_umount_proc == 1) {
3868 umount("/proc");
3869 lxc_conf->tmp_umount_proc = 0;
3870 }
3871}
3872
/*
 * Walk /proc/self/mountinfo and call mount(MS_SLAVE) on every mount whose
 * options field contains "shared".
 *
 * Failures are logged and otherwise ignored.
 */
void remount_all_slave(void)
{
	FILE *mounts;
	char *line = NULL;
	size_t cap = 0;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, mounts) != -1) {
		char *target = get_field(line, 4);
		char *opts = target ? get_field(target, 2) : NULL;

		if (!opts)
			continue;

		null_endofword(opts);
		if (!strstr(opts, "shared"))
			continue;

		/* Terminate target only after opts has been located, since
		 * opts is found relative to target.
		 */
		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(mounts);
	free(line);
}
3906
2322903b
SH
3907void lxc_execute_bind_init(struct lxc_conf *conf)
3908{
3909 int ret;
9d9c111c
SH
3910 char path[PATH_MAX], destpath[PATH_MAX], *p;
3911
3912 /* If init exists in the container, don't bind mount a static one */
3913 p = choose_init(conf->rootfs.mount);
3914 if (p) {
3915 free(p);
3916 return;
3917 }
2322903b
SH
3918
3919 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3920 if (ret < 0 || ret >= PATH_MAX) {
3921 WARN("Path name too long searching for lxc.init.static");
3922 return;
3923 }
3924
3925 if (!file_exists(path)) {
3926 INFO("%s does not exist on host", path);
3927 return;
3928 }
3929
3930 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3931 if (ret < 0 || ret >= PATH_MAX) {
3932 WARN("Path name too long for container's lxc.init.static");
3933 return;
3934 }
3935
3936 if (!file_exists(destpath)) {
3937 FILE * pathfile = fopen(destpath, "wb");
3938 if (!pathfile) {
3939 SYSERROR("Failed to create mount target '%s'", destpath);
3940 return;
3941 }
3942 fclose(pathfile);
3943 }
3944
592fd47a 3945 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3946 if (ret < 0)
3947 SYSERROR("Failed to bind lxc.init.static into container");
3948 INFO("lxc.init.static bound into container at %s", path);
3949}
3950
35120d9c
SH
3951/*
3952 * This does the work of remounting / if it is shared, calling the
3953 * container pre-mount hooks, and mounting the rootfs.
3954 */
3955int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3956{
35120d9c
SH
3957 if (conf->rootfs_setup) {
3958 /*
3959 * rootfs was set up in another namespace. bind-mount it
3960 * to give us a mount in our own ns so we can pivot_root to it
3961 */
3962 const char *path = conf->rootfs.mount;
3963 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3964 ERROR("Failed to bind-mount container / onto itself");
145832ba 3965 return -1;
35120d9c 3966 }
145832ba 3967 return 0;
35120d9c 3968 }
d4ef7c50 3969
e995d7a2
SH
3970 remount_all_slave();
3971
35120d9c
SH
3972 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3973 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3974 return -1;
3975 }
3976
9aa76a17 3977 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
3978 ERROR("failed to setup rootfs for '%s'", name);
3979 return -1;
3980 }
3981
3982 conf->rootfs_setup = true;
3983 return 0;
3984}
3985
1c1c7051
SH
3986static bool verify_start_hooks(struct lxc_conf *conf)
3987{
3988 struct lxc_list *it;
3989 char path[MAXPATHLEN];
3990 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3991 char *hookname = it->elem;
3992 struct stat st;
3993 int ret;
3994
3995 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 3996 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
3997 if (ret < 0 || ret >= MAXPATHLEN)
3998 return false;
3999 ret = stat(path, &st);
4000 if (ret) {
7b6753e7 4001 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4002 hookname);
4003 return false;
4004 }
6a0c909a 4005 return true;
1c1c7051
SH
4006 }
4007
4008 return true;
4009}
4010
ae467c54 4011static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 4012{
ae467c54
CB
4013 int i;
4014 int *ttyfds;
4015 struct lxc_pty_info *pty_info;
e8bd4e43
SH
4016 struct lxc_conf *conf = handler->conf;
4017 const struct lxc_tty_info *tty_info = &conf->tty_info;
e8bd4e43 4018 int sock = handler->ttysock[0];
ae467c54
CB
4019 int ret = -1;
4020 size_t num_ttyfds = (2 * conf->tty);
e8bd4e43 4021
ae467c54
CB
4022 ttyfds = malloc(num_ttyfds * sizeof(int));
4023 if (!ttyfds)
4024 return -1;
4025
4026 for (i = 0; i < num_ttyfds; i++) {
4027 pty_info = &tty_info->pty_info[i / 2];
4028 ttyfds[i++] = pty_info->slave;
4029 ttyfds[i] = pty_info->master;
4030 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
f07fa8df
CB
4031 "parent",
4032 pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
4033 }
4034
ae467c54
CB
4035 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4036 if (ret < 0)
4037 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4038 strerror(errno));
4039 else
4040 TRACE("sent %d ttys to parent", conf->tty);
4041
e8bd4e43
SH
4042 close(handler->ttysock[0]);
4043 close(handler->ttysock[1]);
4044
ae467c54
CB
4045 for (i = 0; i < num_ttyfds; i++)
4046 close(ttyfds[i]);
e8bd4e43 4047
ae467c54
CB
4048 free(ttyfds);
4049
4050 return ret;
e8bd4e43
SH
4051}
4052
35120d9c
SH
4053int lxc_setup(struct lxc_handler *handler)
4054{
4055 const char *name = handler->name;
4056 struct lxc_conf *lxc_conf = handler->conf;
4057 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4058
4059 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4060 ERROR("Error setting up rootfs mount after spawn");
4061 return -1;
4062 }
4063
6c544cb3
MM
4064 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4065 if (setup_utsname(lxc_conf->utsname)) {
4066 ERROR("failed to setup the utsname for '%s'", name);
4067 return -1;
4068 }
0ad19a3f 4069 }
4070
5f4535a3 4071 if (setup_network(&lxc_conf->network)) {
36eb9bde 4072 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4073 return -1;
0ad19a3f 4074 }
4075
bc6928ff 4076 if (lxc_conf->autodev > 0) {
14221cbb 4077 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4078 ERROR("failed to mount /dev in the container");
c6883f38
SH
4079 return -1;
4080 }
4081 }
4082
368bbc02
CS
4083 /* do automatic mounts (mainly /proc and /sys), but exclude
4084 * those that need to wait until other stuff has finished
4085 */
4fb3cba5 4086 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4087 ERROR("failed to setup the automatic mounts for '%s'", name);
4088 return -1;
4089 }
4090
0a2dddd4 4091 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4092 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4093 return -1;
576f946d 4094 }
4095
0a2dddd4 4096 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4097 ERROR("failed to setup the mount entries for '%s'", name);
4098 return -1;
4099 }
4100
7b6753e7 4101 /* Make sure any start hooks are in the container */
1c1c7051
SH
4102 if (!verify_start_hooks(lxc_conf))
4103 return -1;
4104
2322903b
SH
4105 if (lxc_conf->is_execute)
4106 lxc_execute_bind_init(lxc_conf);
4107
368bbc02
CS
4108 /* now mount only cgroup, if wanted;
4109 * before, /sys could not have been mounted
4110 * (is either mounted automatically or via fstab entries)
4111 */
4fb3cba5 4112 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4113 ERROR("failed to setup the automatic mounts for '%s'", name);
4114 return -1;
4115 }
4116
283678ed 4117 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4118 ERROR("failed to run mount hooks for container '%s'.", name);
4119 return -1;
4120 }
4121
bc6928ff 4122 if (lxc_conf->autodev > 0) {
283678ed 4123 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4124 ERROR("failed to run autodev hooks for container '%s'.", name);
4125 return -1;
4126 }
27245ff7 4127 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4128 ERROR("failed to populate /dev in the container");
4129 return -1;
4130 }
4131 }
368bbc02 4132
3d7d929a 4133 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4134 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4135 return -1;
6e590161 4136 }
4137
7e0e1d94
AV
4138 if (lxc_conf->kmsg) {
4139 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4140 ERROR("failed to setup kmsg for '%s'", name);
4141 }
1bd051a6 4142
69aa6655
DE
4143 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4144 ERROR("failed to setup /dev symlinks for '%s'", name);
4145 return -1;
4146 }
4147
5112cd70 4148 /* mount /proc if it's not already there */
943144d9 4149 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4150 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4151 return -1;
e075f5d9 4152 }
e075f5d9 4153
ac778708 4154 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4155 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4156 return -1;
ed502555 4157 }
4158
70761e5e 4159 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4160 ERROR("failed to setup the new pts instance");
95b5ffaf 4161 return -1;
3c26f34e 4162 }
4163
e8bd4e43
SH
4164 if (lxc_create_tty(name, lxc_conf)) {
4165 ERROR("failed to create the ttys");
4166 return -1;
4167 }
4168
ae467c54 4169 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
4170 ERROR("failure sending console info to parent");
4171 return -1;
4172 }
4173
9e1045e3 4174 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4175 ERROR("failed to setup the ttys for '%s'", name);
4176 return -1;
4177 }
4178
4179 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4180 SYSERROR("failed to set environment variable for container ptys");
4181
4182
cccc74b5
DL
4183 if (setup_personality(lxc_conf->personality)) {
4184 ERROR("failed to setup personality");
4185 return -1;
4186 }
4187
97a8f74f
SG
4188 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4189 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4190 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4191 return -1;
4192 }
97a8f74f
SG
4193 if (dropcaps_except(&lxc_conf->keepcaps)) {
4194 ERROR("failed to keep requested caps");
4195 return -1;
4196 }
4197 } else if (setup_caps(&lxc_conf->caps)) {
4198 ERROR("failed to drop capabilities");
4199 return -1;
81810dd1
DL
4200 }
4201
cd54d859
DL
4202 NOTICE("'%s' is setup.", name);
4203
0ad19a3f 4204 return 0;
4205}
26ddeedd 4206
283678ed
SH
4207int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4208 const char *lxcpath, char *argv[])
26ddeedd
SH
4209{
4210 int which = -1;
4211 struct lxc_list *it;
4212
4213 if (strcmp(hook, "pre-start") == 0)
4214 which = LXCHOOK_PRESTART;
5ea6163a
SH
4215 else if (strcmp(hook, "pre-mount") == 0)
4216 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4217 else if (strcmp(hook, "mount") == 0)
4218 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4219 else if (strcmp(hook, "autodev") == 0)
4220 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4221 else if (strcmp(hook, "start") == 0)
4222 which = LXCHOOK_START;
52492063
WB
4223 else if (strcmp(hook, "stop") == 0)
4224 which = LXCHOOK_STOP;
26ddeedd
SH
4225 else if (strcmp(hook, "post-stop") == 0)
4226 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4227 else if (strcmp(hook, "clone") == 0)
4228 which = LXCHOOK_CLONE;
37cf711b
SY
4229 else if (strcmp(hook, "destroy") == 0)
4230 which = LXCHOOK_DESTROY;
26ddeedd
SH
4231 else
4232 return -1;
4233 lxc_list_for_each(it, &conf->hooks[which]) {
4234 int ret;
4235 char *hookname = it->elem;
283678ed 4236 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4237 if (ret)
4238 return ret;
4239 }
4240 return 0;
4241}
72d0e1cb 4242
427b3a21 4243static void lxc_remove_nic(struct lxc_list *it)
72d0e1cb
SG
4244{
4245 struct lxc_netdev *netdev = it->elem;
9ebb03ad 4246 struct lxc_list *it2,*next;
72d0e1cb
SG
4247
4248 lxc_list_del(it);
4249
f10fad2f
ME
4250 free(netdev->link);
4251 free(netdev->name);
4252 if (netdev->type == LXC_NET_VETH)
c9bb9a85 4253 free(netdev->priv.veth_attr.pair);
f10fad2f
ME
4254 free(netdev->upscript);
4255 free(netdev->hwaddr);
4256 free(netdev->mtu);
4257 free(netdev->ipv4_gateway);
4258 free(netdev->ipv6_gateway);
9ebb03ad 4259 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4260 lxc_list_del(it2);
4261 free(it2->elem);
4262 free(it2);
4263 }
9ebb03ad 4264 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4265 lxc_list_del(it2);
4266 free(it2->elem);
4267 free(it2);
4268 }
d95db067 4269 free(netdev);
72d0e1cb
SG
4270 free(it);
4271}
4272
4273/* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
12a50cc6 4274int lxc_clear_nic(struct lxc_conf *c, const char *key)
72d0e1cb
SG
4275{
4276 char *p1;
4277 int ret, idx, i;
4278 struct lxc_list *it;
4279 struct lxc_netdev *netdev;
4280
46cd2845 4281 p1 = strchr(key, '.');
72d0e1cb
SG
4282 if (!p1 || *(p1+1) == '\0')
4283 p1 = NULL;
4284
4285 ret = sscanf(key, "%d", &idx);
4286 if (ret != 1) return -1;
4287 if (idx < 0)
4288 return -1;
4289
4290 i = 0;
4291 lxc_list_for_each(it, &c->network) {
4292 if (i == idx)
4293 break;
4294 i++;
4295 }
4296 if (i < idx) // we don't have that many nics defined
4297 return -1;
4298
4299 if (!it || !it->elem)
4300 return -1;
4301
4302 netdev = it->elem;
4303
4304 if (!p1) {
4305 lxc_remove_nic(it);
52d21d40 4306 } else if (strcmp(p1, ".ipv4") == 0) {
9ebb03ad
DE
4307 struct lxc_list *it2,*next;
4308 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4309 lxc_list_del(it2);
4310 free(it2->elem);
4311 free(it2);
4312 }
52d21d40 4313 } else if (strcmp(p1, ".ipv6") == 0) {
9ebb03ad
DE
4314 struct lxc_list *it2,*next;
4315 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4316 lxc_list_del(it2);
4317 free(it2->elem);
4318 free(it2);
4319 }
72d0e1cb
SG
4320 }
4321 else return -1;
4322
4323 return 0;
4324}
4325
4326int lxc_clear_config_network(struct lxc_conf *c)
4327{
9ebb03ad
DE
4328 struct lxc_list *it,*next;
4329 lxc_list_for_each_safe(it, &c->network, next) {
72d0e1cb
SG
4330 lxc_remove_nic(it);
4331 }
4332 return 0;
4333}
4334
4335int lxc_clear_config_caps(struct lxc_conf *c)
4336{
9ebb03ad 4337 struct lxc_list *it,*next;
72d0e1cb 4338
9ebb03ad 4339 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4340 lxc_list_del(it);
4341 free(it->elem);
4342 free(it);
4343 }
4344 return 0;
4345}
4346
74a3920a 4347static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4348 struct lxc_list *it, *next;
4349
4355ab5f 4350 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4351 lxc_list_del(it);
4352 free(it->elem);
4353 free(it);
4354 }
4355 return 0;
4356}
4357
4355ab5f
SH
4358int lxc_clear_idmaps(struct lxc_conf *c)
4359{
4360 return lxc_free_idmap(&c->id_map);
4361}
4362
1fb86a7c
SH
4363int lxc_clear_config_keepcaps(struct lxc_conf *c)
4364{
4365 struct lxc_list *it,*next;
4366
4367 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4368 lxc_list_del(it);
4369 free(it->elem);
4370 free(it);
4371 }
4372 return 0;
4373}
4374
12a50cc6 4375int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4376{
9ebb03ad 4377 struct lxc_list *it,*next;
72d0e1cb 4378 bool all = false;
a6390f01 4379 const char *k = NULL;
72d0e1cb
SG
4380
4381 if (strcmp(key, "lxc.cgroup") == 0)
4382 all = true;
a6390f01
WB
4383 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4384 k = key + sizeof("lxc.cgroup.")-1;
4385 else
4386 return -1;
72d0e1cb 4387
9ebb03ad 4388 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4389 struct lxc_cgroup *cg = it->elem;
4390 if (!all && strcmp(cg->subsystem, k) != 0)
4391 continue;
4392 lxc_list_del(it);
4393 free(cg->subsystem);
4394 free(cg->value);
4395 free(cg);
4396 free(it);
4397 }
4398 return 0;
4399}
4400
c6d09e15
WB
4401int lxc_clear_limits(struct lxc_conf *c, const char *key)
4402{
4403 struct lxc_list *it, *next;
4404 bool all = false;
4405 const char *k = NULL;
4406
4407 if (strcmp(key, "lxc.limit") == 0)
4408 all = true;
4409 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4410 k = key + sizeof("lxc.limit.")-1;
4411 else
4412 return -1;
4413
4414 lxc_list_for_each_safe(it, &c->limits, next) {
4415 struct lxc_limit *lim = it->elem;
4416 if (!all && strcmp(lim->resource, k) != 0)
4417 continue;
4418 lxc_list_del(it);
4419 free(lim->resource);
4420 free(lim);
4421 free(it);
4422 }
4423 return 0;
4424}
4425
ee1e7aa0
SG
4426int lxc_clear_groups(struct lxc_conf *c)
4427{
4428 struct lxc_list *it,*next;
4429
4430 lxc_list_for_each_safe(it, &c->groups, next) {
4431 lxc_list_del(it);
4432 free(it->elem);
4433 free(it);
4434 }
4435 return 0;
4436}
4437
ab799c0b
SG
4438int lxc_clear_environment(struct lxc_conf *c)
4439{
4440 struct lxc_list *it,*next;
4441
4442 lxc_list_for_each_safe(it, &c->environment, next) {
4443 lxc_list_del(it);
4444 free(it->elem);
4445 free(it);
4446 }
4447 return 0;
4448}
4449
4450
72d0e1cb
SG
4451int lxc_clear_mount_entries(struct lxc_conf *c)
4452{
9ebb03ad 4453 struct lxc_list *it,*next;
72d0e1cb 4454
9ebb03ad 4455 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4456 lxc_list_del(it);
4457 free(it->elem);
4458 free(it);
4459 }
4460 return 0;
4461}
4462
b099e9e9
SH
4463int lxc_clear_automounts(struct lxc_conf *c)
4464{
4465 c->auto_mounts = 0;
4466 return 0;
4467}
4468
12a50cc6 4469int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4470{
9ebb03ad 4471 struct lxc_list *it,*next;
17ed13a3 4472 bool all = false, done = false;
a6390f01 4473 const char *k = NULL;
72d0e1cb
SG
4474 int i;
4475
17ed13a3
SH
4476 if (strcmp(key, "lxc.hook") == 0)
4477 all = true;
a6390f01
WB
4478 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4479 k = key + sizeof("lxc.hook.")-1;
4480 else
4481 return -1;
17ed13a3 4482
72d0e1cb 4483 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4484 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4485 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4486 lxc_list_del(it);
4487 free(it->elem);
4488 free(it);
4489 }
4490 done = true;
72d0e1cb
SG
4491 }
4492 }
17ed13a3
SH
4493
4494 if (!done) {
4495 ERROR("Invalid hook key: %s", key);
4496 return -1;
4497 }
72d0e1cb
SG
4498 return 0;
4499}
8eb5694b 4500
74a3920a 4501static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4502{
4503 int i;
4504
0cf45501 4505 if (!conf->saved_nics)
7b35f3d6
SH
4506 return;
4507 for (i=0; i < conf->num_savednics; i++)
4508 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4509 free(conf->saved_nics);
4510}
4511
4184c3e1
SH
4512static inline void lxc_clear_aliens(struct lxc_conf *conf)
4513{
4514 struct lxc_list *it,*next;
4515
4516 lxc_list_for_each_safe(it, &conf->aliens, next) {
4517 lxc_list_del(it);
4518 free(it->elem);
4519 free(it);
4520 }
4521}
4522
c7b15d1e 4523void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4524{
4525 struct lxc_list *it,*next;
4526
4527 lxc_list_for_each_safe(it, &conf->includes, next) {
4528 lxc_list_del(it);
4529 free(it->elem);
4530 free(it);
4531 }
4532}
4533
8eb5694b
SH
4534void lxc_conf_free(struct lxc_conf *conf)
4535{
4536 if (!conf)
4537 return;
858377e4
SH
4538 if (current_config == conf)
4539 current_config = NULL;
f10fad2f
ME
4540 free(conf->console.log_path);
4541 free(conf->console.path);
4542 free(conf->rootfs.mount);
b3b8c97f 4543 free(conf->rootfs.bdev_type);
f10fad2f
ME
4544 free(conf->rootfs.options);
4545 free(conf->rootfs.path);
f10fad2f 4546 free(conf->logfile);
858377e4
SH
4547 if (conf->logfd != -1)
4548 close(conf->logfd);
f10fad2f
ME
4549 free(conf->utsname);
4550 free(conf->ttydir);
4551 free(conf->fstab);
4552 free(conf->rcfile);
4553 free(conf->init_cmd);
6b0d5538 4554 free(conf->unexpanded_config);
393903d1 4555 free(conf->pty_names);
76d0127f 4556 free(conf->syslog);
8eb5694b 4557 lxc_clear_config_network(conf);
f10fad2f
ME
4558 free(conf->lsm_aa_profile);
4559 free(conf->lsm_se_context);
769872f9 4560 lxc_seccomp_free(conf);
8eb5694b 4561 lxc_clear_config_caps(conf);
1fb86a7c 4562 lxc_clear_config_keepcaps(conf);
8eb5694b 4563 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4564 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4565 lxc_clear_mount_entries(conf);
7b35f3d6 4566 lxc_clear_saved_nics(conf);
27c27d73 4567 lxc_clear_idmaps(conf);
ee1e7aa0 4568 lxc_clear_groups(conf);
f979ac15 4569 lxc_clear_includes(conf);
761d81ca 4570 lxc_clear_aliens(conf);
ab799c0b 4571 lxc_clear_environment(conf);
c6d09e15 4572 lxc_clear_limits(conf, "lxc.limit");
8eb5694b
SH
4573 free(conf);
4574}
4355ab5f
SH
4575
/* Arguments handed to run_userns_fn(). */
struct userns_fn_data {
	/* Function to call, and the argument passed to it. */
	int (*fn)(void *);
	/* Name of @fn, used for trace logging only; may be NULL. */
	const char *fn_name;
	void *arg;
	/* Pipe fds: p[0] is read from, p[1] is closed before reading. */
	int p[2];
};
4582
4583static int run_userns_fn(void *data)
4584{
4585 struct userns_fn_data *d = data;
4586 char c;
4355ab5f 4587
f8aa4bf3 4588 /* Close write end of the pipe. */
4355ab5f 4589 close(d->p[1]);
f8aa4bf3
CB
4590
4591 /* Wait for parent to finish establishing a new mapping in the user
4592 * namespace we are executing in.
4593 */
4355ab5f
SH
4594 if (read(d->p[0], &c, 1) != 1)
4595 return -1;
f8aa4bf3
CB
4596
4597 /* Close read end of the pipe. */
4355ab5f 4598 close(d->p[0]);
f8aa4bf3 4599
c9b7c33e
CB
4600 if (d->fn_name)
4601 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 4602 /* Call function to run. */
4355ab5f
SH
4603 return d->fn(d->arg);
4604}
4605
339efad9 4606static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
4607 enum idtype idtype)
4608{
4609 struct lxc_list *it;
4610 struct id_map *map;
4611 struct id_map *retmap = NULL;
4612
4613 lxc_list_for_each(it, &conf->id_map) {
4614 map = it->elem;
4615 if (map->idtype != idtype)
4616 continue;
4617
4618 if (id >= map->hostid && id < map->hostid + map->range) {
4619 retmap = map;
4620 break;
4621 }
4622 }
4623
4624 if (!retmap)
4625 return NULL;
4626
4627 retmap = malloc(sizeof(*retmap));
4628 if (!retmap)
4629 return NULL;
4630
4631 memcpy(retmap, map, sizeof(*retmap));
4632 return retmap;
4633}
4634
4355ab5f 4635/*
f8aa4bf3
CB
4636 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4637 * existing one or establish a new one.
4355ab5f 4638 */
28a2d9e7 4639static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 4640{
28a2d9e7 4641 int hostid_mapped;
f8aa4bf3 4642 struct id_map *entry = NULL;
f8aa4bf3 4643
28a2d9e7
CB
4644 /* Reuse existing mapping. */
4645 entry = mapped_hostid_entry(conf, id, type);
4646 if (entry)
4647 return entry;
f8aa4bf3 4648
28a2d9e7
CB
4649 /* Find new mapping. */
4650 hostid_mapped = find_unmapped_nsid(conf, type);
4651 if (hostid_mapped < 0) {
4652 DEBUG("failed to find free mapping for id %d", id);
4653 return NULL;
f8aa4bf3 4654 }
f8aa4bf3 4655
28a2d9e7
CB
4656 entry = malloc(sizeof(*entry));
4657 if (!entry)
4658 return NULL;
4355ab5f 4659
28a2d9e7
CB
4660 entry->idtype = type;
4661 entry->nsid = hostid_mapped;
4662 entry->hostid = (unsigned long)id;
4663 entry->range = 1;
4355ab5f 4664
28a2d9e7 4665 return entry;
4355ab5f
SH
4666}
4667
f8aa4bf3
CB
4668/* Run a function in a new user namespace.
4669 * The caller's euid/egid will be mapped if it is not already.
4670 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4671 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4672 * This means we require only to establish a mapping from:
4673 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4674 * - the container root -> some sub{g,u}id
4675 * The former we add, if the user did not specifiy a mapping. The latter we
4676 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4677 * there to start the container in the first place.
4355ab5f 4678 */
c9b7c33e
CB
4679int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4680 const char *fn_name)
4355ab5f 4681{
f8aa4bf3
CB
4682 pid_t pid;
4683 uid_t euid, egid;
4355ab5f 4684 struct userns_fn_data d;
4355ab5f 4685 int p[2];
f8aa4bf3
CB
4686 struct lxc_list *it;
4687 struct id_map *map;
4688 char c = '1';
4689 int ret = -1;
4690 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4691 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4692 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4693
4355ab5f 4694 ret = pipe(p);
4355ab5f
SH
4695 if (ret < 0) {
4696 SYSERROR("opening pipe");
4697 return -1;
4698 }
4699 d.fn = fn;
c9b7c33e 4700 d.fn_name = fn_name;
4355ab5f
SH
4701 d.arg = data;
4702 d.p[0] = p[0];
4703 d.p[1] = p[1];
f8aa4bf3
CB
4704
4705 /* Clone child in new user namespace. */
4355ab5f 4706 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4707 if (pid < 0) {
4708 ERROR("failed to clone child process in new user namespace");
4709 goto on_error;
4710 }
4711
4355ab5f 4712 close(p[0]);
4355ab5f
SH
4713 p[0] = -1;
4714
f8aa4bf3
CB
4715 /* Find container root. */
4716 lxc_list_for_each(it, &conf->id_map) {
4717 map = it->elem;
4718
4719 if (map->nsid != 0)
4720 continue;
4721
4722 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4723 container_root_uid = malloc(sizeof(*container_root_uid));
4724 if (!container_root_uid)
4725 goto on_error;
4726 container_root_uid->idtype = map->idtype;
4727 container_root_uid->hostid = map->hostid;
4728 container_root_uid->nsid = 0;
4729 container_root_uid->range = map->range;
4730 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4731 container_root_gid = malloc(sizeof(*container_root_gid));
4732 if (!container_root_gid)
4733 goto on_error;
4734 container_root_gid->idtype = map->idtype;
4735 container_root_gid->hostid = map->hostid;
4736 container_root_gid->nsid = 0;
4737 container_root_gid->range = map->range;
4738 }
4739
4740 /* Found container root. */
4741 if (container_root_uid && container_root_gid)
4742 break;
4743 }
4744
4745 /* This is actually checked earlier but it can't hurt. */
4746 if (!container_root_uid || !container_root_gid) {
4747 ERROR("no mapping for container root found");
4748 goto on_error;
4749 }
4750
1d90e064
CB
4751 host_uid_map = container_root_uid;
4752 host_gid_map = container_root_gid;
4753
f8aa4bf3
CB
4754 /* Check whether the {g,u}id of the user has a mapping. */
4755 euid = geteuid();
4756 egid = getegid();
1d90e064 4757 if (euid != container_root_uid->hostid)
28a2d9e7
CB
4758 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4759
1d90e064 4760 if (egid != container_root_gid->hostid)
28a2d9e7
CB
4761 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4762
4763 if (!host_uid_map) {
4764 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4765 goto on_error;
4766 }
4767
28a2d9e7
CB
4768 if (!host_gid_map) {
4769 DEBUG("failed to find mapping for gid %d", egid);
4770 goto on_error;
4771 }
4772
4773 /* Allocate new {g,u}id map list. */
4774 idmap = malloc(sizeof(*idmap));
4775 if (!idmap)
4776 goto on_error;
4777 lxc_list_init(idmap);
4778
f8aa4bf3
CB
4779 /* Add container root to the map. */
4780 tmplist = malloc(sizeof(*tmplist));
4781 if (!tmplist)
4782 goto on_error;
4783 lxc_list_add_elem(tmplist, container_root_uid);
4784 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4785
1d90e064 4786 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4787 /* idmap will now keep track of that memory. */
4788 container_root_uid = NULL;
4789
4790 /* Add container root to the map. */
4791 tmplist = malloc(sizeof(*tmplist));
4792 if (!tmplist)
4793 goto on_error;
4794 lxc_list_add_elem(tmplist, host_uid_map);
4795 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4796 }
1d90e064
CB
4797 /* idmap will now keep track of that memory. */
4798 container_root_uid = NULL;
4799 /* idmap will now keep track of that memory. */
4800 host_uid_map = NULL;
f8aa4bf3
CB
4801
4802 tmplist = malloc(sizeof(*tmplist));
4803 if (!tmplist)
4804 goto on_error;
4805 lxc_list_add_elem(tmplist, container_root_gid);
4806 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4807
1d90e064 4808 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4809 /* idmap will now keep track of that memory. */
4810 container_root_gid = NULL;
4811
4812 tmplist = malloc(sizeof(*tmplist));
4813 if (!tmplist)
4814 goto on_error;
4815 lxc_list_add_elem(tmplist, host_gid_map);
4816 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4817 }
1d90e064
CB
4818 /* idmap will now keep track of that memory. */
4819 container_root_gid = NULL;
4820 /* idmap will now keep track of that memory. */
4821 host_gid_map = NULL;
f8aa4bf3 4822
77803ee7
CB
4823 if (lxc_log_get_level() == LXC_LOG_PRIORITY_TRACE ||
4824 conf->loglevel == LXC_LOG_PRIORITY_TRACE) {
f8aa4bf3
CB
4825 lxc_list_for_each(it, idmap) {
4826 map = it->elem;
4827 TRACE("establishing %cid mapping for \"%d\" in new "
4828 "user namespace: nsuid %lu - hostid %lu - range "
4829 "%lu",
4830 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4831 map->nsid, map->hostid, map->range);
4832 }
4355ab5f
SH
4833 }
4834
f8aa4bf3 4835 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4836 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4837 if (ret < 0) {
4838 ERROR("error setting up {g,u}id mappings for child process "
4839 "\"%d\"",
4840 pid);
4841 goto on_error;
4355ab5f
SH
4842 }
4843
f8aa4bf3 4844 /* Tell child to proceed. */
4355ab5f 4845 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4846 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4847 goto on_error;
4355ab5f
SH
4848 }
4849
f8aa4bf3 4850 /* Wait for child to finish. */
3139aead
SG
4851 ret = wait_for_pid(pid);
4852
f8aa4bf3 4853on_error:
1d90e064
CB
4854 if (idmap)
4855 lxc_free_idmap(idmap);
4856 if (container_root_uid)
4857 free(container_root_uid);
4858 if (container_root_gid)
4859 free(container_root_gid);
4860 if (host_uid_map && (host_uid_map != container_root_uid))
4861 free(host_uid_map);
4862 if (host_gid_map && (host_gid_map != container_root_gid))
4863 free(host_gid_map);
3139aead 4864
4355ab5f
SH
4865 if (p[0] != -1)
4866 close(p[0]);
4867 close(p[1]);
f8aa4bf3
CB
4868
4869 return ret;
4355ab5f 4870}
97e9cfa0 4871
/* Return a newly allocated copy of the name of the effective user, or NULL
 * if the user cannot be looked up or the copy cannot be allocated. The
 * caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4883
/* Return a newly allocated copy of the name of the effective group, or NULL
 * if the group cannot be looked up or the copy cannot be allocated. The
 * caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4895
a96a8e8c 4896/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4897void suggest_default_idmap(void)
4898{
4899 FILE *f;
4900 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4901 char *line = NULL;
4902 char *uname, *gname;
4903 size_t len = 0;
4904
4905 if (!(uname = getuname()))
4906 return;
4907
4908 if (!(gname = getgname())) {
4909 free(uname);
4910 return;
4911 }
4912
4913 f = fopen(subuidfile, "r");
4914 if (!f) {
4915 ERROR("Your system is not configured with subuids");
4916 free(gname);
4917 free(uname);
4918 return;
4919 }
4920 while (getline(&line, &len, f) != -1) {
b7930180 4921 size_t no_newline = 0;
97e9cfa0
SH
4922 char *p = strchr(line, ':'), *p2;
4923 if (*line == '#')
4924 continue;
4925 if (!p)
4926 continue;
4927 *p = '\0';
4928 p++;
4929 if (strcmp(line, uname))
4930 continue;
4931 p2 = strchr(p, ':');
4932 if (!p2)
4933 continue;
4934 *p2 = '\0';
4935 p2++;
4936 if (!*p2)
4937 continue;
b7930180
CB
4938 no_newline = strcspn(p2, "\n");
4939 p2[no_newline] = '\0';
4940
b7b2fde4
CB
4941 if (lxc_safe_uint(p, &uid) < 0)
4942 WARN("Could not parse UID.");
4943 if (lxc_safe_uint(p2, &urange) < 0)
4944 WARN("Could not parse UID range.");
97e9cfa0
SH
4945 }
4946 fclose(f);
4947
6be7389a 4948 f = fopen(subgidfile, "r");
97e9cfa0
SH
4949 if (!f) {
4950 ERROR("Your system is not configured with subgids");
4951 free(gname);
4952 free(uname);
4953 return;
4954 }
4955 while (getline(&line, &len, f) != -1) {
b7930180 4956 size_t no_newline = 0;
97e9cfa0
SH
4957 char *p = strchr(line, ':'), *p2;
4958 if (*line == '#')
4959 continue;
4960 if (!p)
4961 continue;
4962 *p = '\0';
4963 p++;
4964 if (strcmp(line, uname))
4965 continue;
4966 p2 = strchr(p, ':');
4967 if (!p2)
4968 continue;
4969 *p2 = '\0';
4970 p2++;
4971 if (!*p2)
4972 continue;
b7930180
CB
4973 no_newline = strcspn(p2, "\n");
4974 p2[no_newline] = '\0';
4975
b7b2fde4
CB
4976 if (lxc_safe_uint(p, &gid) < 0)
4977 WARN("Could not parse GID.");
4978 if (lxc_safe_uint(p2, &grange) < 0)
4979 WARN("Could not parse GID range.");
97e9cfa0
SH
4980 }
4981 fclose(f);
4982
f10fad2f 4983 free(line);
97e9cfa0
SH
4984
4985 if (!urange || !grange) {
4986 ERROR("You do not have subuids or subgids allocated");
4987 ERROR("Unprivileged containers require subuids and subgids");
4988 return;
4989 }
4990
4991 ERROR("You must either run as root, or define uid mappings");
4992 ERROR("To pass uid mappings to lxc-create, you could create");
4993 ERROR("~/.config/lxc/default.conf:");
4994 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4995 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4996 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4997
4998 free(gname);
4999 free(uname);
5000}
aaf26830 5001
a7307747
SH
5002static void free_cgroup_settings(struct lxc_list *result)
5003{
5004 struct lxc_list *iterator, *next;
5005
5006 lxc_list_for_each_safe(iterator, result, next) {
5007 lxc_list_del(iterator);
5008 free(iterator);
5009 }
5010 free(result);
5011}
5012
aaf26830
KT
5013/*
5014 * Return the list of cgroup_settings sorted according to the following rules
5015 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5016 */
5017struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5018{
5019 struct lxc_list *result;
5020 struct lxc_list *memsw_limit = NULL;
5021 struct lxc_list *it = NULL;
5022 struct lxc_cgroup *cg = NULL;
5023 struct lxc_list *item = NULL;
5024
5025 result = malloc(sizeof(*result));
fac7c663
KT
5026 if (!result) {
5027 ERROR("failed to allocate memory to sort cgroup settings");
5028 return NULL;
5029 }
aaf26830
KT
5030 lxc_list_init(result);
5031
5032 /*Iterate over the cgroup settings and copy them to the output list*/
5033 lxc_list_for_each(it, cgroup_settings) {
5034 item = malloc(sizeof(*item));
fac7c663
KT
5035 if (!item) {
5036 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 5037 free_cgroup_settings(result);
fac7c663
KT
5038 return NULL;
5039 }
aaf26830
KT
5040 item->elem = it->elem;
5041 cg = it->elem;
5042 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5043 /* Store the memsw_limit location */
5044 memsw_limit = item;
5045 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 5046 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
5047 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5048 item->elem = memsw_limit->elem;
5049 memsw_limit->elem = it->elem;
5050 }
5051 lxc_list_add_tail(result, item);
5052 }
5053
5054 return result;
a7307747 5055}