]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
utils: switch to has_fs_type()
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
1ed6ba91 80#include "confile_utils.h"
8f3e280e 81#include "error.h"
1b09f2c0 82#include "log.h"
d8e48992 83#include "lxcaufs.h"
025ed0f3 84#include "lxclock.h"
8f3e280e
CB
85#include "lxcoverlay.h"
86#include "lxcseccomp.h"
4355ab5f 87#include "namespace.h"
8f3e280e
CB
88#include "network.h"
89#include "parse.h"
90#include "utils.h"
fe4de9a6 91#include "lsm/lsm.h"
d0a36f2c 92
e37dda71 93#if HAVE_LIBCAP
495d2046
SG
94#include <sys/capability.h>
95#endif
96
6ff05e18
SG
97#if HAVE_SYS_PERSONALITY_H
98#include <sys/personality.h>
99#endif
100
edaf8b1b
SG
101#if IS_BIONIC
102#include <../include/lxcmntent.h>
a04f5407
CB
103#ifndef HAVE_PRLIMIT
104#include <../include/prlimit.h>
105#endif
edaf8b1b
SG
106#else
107#include <mntent.h>
108#endif
109
36eb9bde 110lxc_log_define(lxc_conf, lxc);
e5bda9ee 111
e37dda71 112#if HAVE_LIBCAP
b09094da
MN
113#ifndef CAP_SETFCAP
114#define CAP_SETFCAP 31
115#endif
116
117#ifndef CAP_MAC_OVERRIDE
118#define CAP_MAC_OVERRIDE 32
119#endif
120
121#ifndef CAP_MAC_ADMIN
122#define CAP_MAC_ADMIN 33
123#endif
495d2046 124#endif
b09094da
MN
125
126#ifndef PR_CAPBSET_DROP
127#define PR_CAPBSET_DROP 24
128#endif
129
9818cae4
SG
130#ifndef LO_FLAGS_AUTOCLEAR
131#define LO_FLAGS_AUTOCLEAR 4
132#endif
133
bc5b27d6
DK
134#ifndef CAP_SETUID
135#define CAP_SETUID 7
136#endif
137
138#ifndef CAP_SETGID
139#define CAP_SETGID 6
140#endif
141
0769b82a
CS
142/* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144#ifndef CAP_SYS_ADMIN
145#define CAP_SYS_ADMIN 21
146#endif
147
2d76d1d7
SG
/*
 * pivot_root() compatibility shim.
 *
 * When the build did not detect a C library wrapper (HAVE_PIVOT_ROOT unset),
 * provide a local one that issues the raw syscall. If the syscall number is
 * not known either, the call fails with errno set to ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/*
 * sethostname() compatibility shim.
 *
 * Only compiled when the build did not detect a C library implementation
 * (HAVE_SETHOSTNAME unset). Falls back to the raw syscall, or fails with
 * ENOSYS when the syscall number is unknown.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
ecec0126
SG
176#ifndef MS_PRIVATE
177#define MS_PRIVATE (1<<18)
178#endif
179
8912711c
CB
180#ifndef MS_LAZYTIME
181#define MS_LAZYTIME (1<<25)
182#endif
183
5ef5c9a3
CB
184/* memfd_create() */
185#ifndef MFD_CLOEXEC
186#define MFD_CLOEXEC 0x0001U
187#endif
188
189#ifndef MFD_ALLOW_SEALING
190#define MFD_ALLOW_SEALING 0x0002U
191#endif
192
/*
 * memfd_create() compatibility shim.
 *
 * When the C library has no wrapper (HAVE_MEMFD_CREATE unset) the raw syscall
 * is used. If the kernel headers do not define __NR_memfd_create, a
 * per-architecture fallback number is supplied below; on architectures not
 * listed the call fails with ENOSYS.
 */
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

/* Fallback syscall numbers for headers that predate memfd_create(). */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#endif
236
2b9ae35a
CB
237char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
72d0e1cb 240
a589434e 241typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 242
998ac676
RT
243struct mount_opt {
244 char *name;
245 int clear;
246 int flag;
247};
248
81810dd1
DL
249struct caps_opt {
250 char *name;
251 int value;
252};
253
c6d09e15
WB
254struct limit_opt {
255 char *name;
256 int value;
257};
258
858377e4
SH
259/*
260 * The lxc_conf of the container currently being worked on in an
261 * API call
262 * This is used in the error calls
263 */
264#ifdef HAVE_TLS
265__thread struct lxc_conf *current_config;
266#else
267struct lxc_conf *current_config;
268#endif
269
0769b82a
CS
270/* Declare this here, since we don't want to reshuffle the whole file. */
271static int in_caplist(int cap, struct lxc_list *caps);
272
a589434e
JN
273static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
274static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
275static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
276static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
277static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
278static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
279
280static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
281 [LXC_NET_VETH] = instantiate_veth,
282 [LXC_NET_MACVLAN] = instantiate_macvlan,
283 [LXC_NET_VLAN] = instantiate_vlan,
284 [LXC_NET_PHYS] = instantiate_phys,
285 [LXC_NET_EMPTY] = instantiate_empty,
286 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 287};
288
74a2b586
JK
289static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
290static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
291static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
292static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
293static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 294static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 295
a589434e 296static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
297 [LXC_NET_VETH] = shutdown_veth,
298 [LXC_NET_MACVLAN] = shutdown_macvlan,
299 [LXC_NET_VLAN] = shutdown_vlan,
300 [LXC_NET_PHYS] = shutdown_phys,
301 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 302 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
303};
304
998ac676 305static struct mount_opt mount_opt[] = {
470b359b
CB
306 { "async", 1, MS_SYNCHRONOUS },
307 { "atime", 1, MS_NOATIME },
308 { "bind", 0, MS_BIND },
88d413d5 309 { "defaults", 0, 0 },
88d413d5 310 { "dev", 1, MS_NODEV },
470b359b 311 { "diratime", 1, MS_NODIRATIME },
88d413d5 312 { "dirsync", 0, MS_DIRSYNC },
470b359b 313 { "exec", 1, MS_NOEXEC },
8912711c 314 { "lazytime", 0, MS_LAZYTIME },
88d413d5 315 { "mand", 0, MS_MANDLOCK },
88d413d5 316 { "noatime", 0, MS_NOATIME },
470b359b 317 { "nodev", 0, MS_NODEV },
88d413d5 318 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
319 { "noexec", 0, MS_NOEXEC },
320 { "nomand", 1, MS_MANDLOCK },
321 { "norelatime", 1, MS_RELATIME },
322 { "nostrictatime", 1, MS_STRICTATIME },
323 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
324 { "rbind", 0, MS_BIND|MS_REC },
325 { "relatime", 0, MS_RELATIME },
470b359b
CB
326 { "remount", 0, MS_REMOUNT },
327 { "ro", 0, MS_RDONLY },
328 { "rw", 1, MS_RDONLY },
88d413d5 329 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
330 { "suid", 1, MS_NOSUID },
331 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 332 { NULL, 0, 0 },
998ac676
RT
333};
334
e37dda71 335#if HAVE_LIBCAP
81810dd1 336static struct caps_opt caps_opt[] = {
a6afdde9 337 { "chown", CAP_CHOWN },
1e11be34
DL
338 { "dac_override", CAP_DAC_OVERRIDE },
339 { "dac_read_search", CAP_DAC_READ_SEARCH },
340 { "fowner", CAP_FOWNER },
341 { "fsetid", CAP_FSETID },
81810dd1
DL
342 { "kill", CAP_KILL },
343 { "setgid", CAP_SETGID },
344 { "setuid", CAP_SETUID },
345 { "setpcap", CAP_SETPCAP },
346 { "linux_immutable", CAP_LINUX_IMMUTABLE },
347 { "net_bind_service", CAP_NET_BIND_SERVICE },
348 { "net_broadcast", CAP_NET_BROADCAST },
349 { "net_admin", CAP_NET_ADMIN },
350 { "net_raw", CAP_NET_RAW },
351 { "ipc_lock", CAP_IPC_LOCK },
352 { "ipc_owner", CAP_IPC_OWNER },
353 { "sys_module", CAP_SYS_MODULE },
354 { "sys_rawio", CAP_SYS_RAWIO },
355 { "sys_chroot", CAP_SYS_CHROOT },
356 { "sys_ptrace", CAP_SYS_PTRACE },
357 { "sys_pacct", CAP_SYS_PACCT },
358 { "sys_admin", CAP_SYS_ADMIN },
359 { "sys_boot", CAP_SYS_BOOT },
360 { "sys_nice", CAP_SYS_NICE },
361 { "sys_resource", CAP_SYS_RESOURCE },
362 { "sys_time", CAP_SYS_TIME },
363 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
364 { "mknod", CAP_MKNOD },
365 { "lease", CAP_LEASE },
57b837e2
CB
366#ifdef CAP_AUDIT_READ
367 { "audit_read", CAP_AUDIT_READ },
368#endif
9527e566 369#ifdef CAP_AUDIT_WRITE
81810dd1 370 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
371#endif
372#ifdef CAP_AUDIT_CONTROL
81810dd1 373 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 374#endif
81810dd1
DL
375 { "setfcap", CAP_SETFCAP },
376 { "mac_override", CAP_MAC_OVERRIDE },
377 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
378#ifdef CAP_SYSLOG
379 { "syslog", CAP_SYSLOG },
380#endif
381#ifdef CAP_WAKE_ALARM
382 { "wake_alarm", CAP_WAKE_ALARM },
383#endif
2b54359b
CB
384#ifdef CAP_BLOCK_SUSPEND
385 { "block_suspend", CAP_BLOCK_SUSPEND },
386#endif
81810dd1 387};
495d2046
SG
388#else
389static struct caps_opt caps_opt[] = {};
390#endif
81810dd1 391
c6d09e15
WB
392static struct limit_opt limit_opt[] = {
393#ifdef RLIMIT_AS
394 { "as", RLIMIT_AS },
395#endif
396#ifdef RLIMIT_CORE
397 { "core", RLIMIT_CORE },
398#endif
399#ifdef RLIMIT_CPU
400 { "cpu", RLIMIT_CPU },
401#endif
402#ifdef RLIMIT_DATA
403 { "data", RLIMIT_DATA },
404#endif
405#ifdef RLIMIT_FSIZE
406 { "fsize", RLIMIT_FSIZE },
407#endif
408#ifdef RLIMIT_LOCKS
409 { "locks", RLIMIT_LOCKS },
410#endif
411#ifdef RLIMIT_MEMLOCK
412 { "memlock", RLIMIT_MEMLOCK },
413#endif
414#ifdef RLIMIT_MSGQUEUE
415 { "msgqueue", RLIMIT_MSGQUEUE },
416#endif
417#ifdef RLIMIT_NICE
418 { "nice", RLIMIT_NICE },
419#endif
420#ifdef RLIMIT_NOFILE
421 { "nofile", RLIMIT_NOFILE },
422#endif
423#ifdef RLIMIT_NPROC
424 { "nproc", RLIMIT_NPROC },
425#endif
426#ifdef RLIMIT_RSS
427 { "rss", RLIMIT_RSS },
428#endif
429#ifdef RLIMIT_RTPRIO
430 { "rtprio", RLIMIT_RTPRIO },
431#endif
432#ifdef RLIMIT_RTTIME
433 { "rttime", RLIMIT_RTTIME },
434#endif
435#ifdef RLIMIT_SIGPENDING
436 { "sigpending", RLIMIT_SIGPENDING },
437#endif
438#ifdef RLIMIT_STACK
439 { "stack", RLIMIT_STACK },
440#endif
441};
442
91c3830e
SH
443static int run_buffer(char *buffer)
444{
ebec9176 445 struct lxc_popen_FILE *f;
91c3830e 446 char *output;
8e7da691 447 int ret;
91c3830e 448
ebec9176 449 f = lxc_popen(buffer);
91c3830e 450 if (!f) {
062b72c6 451 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
452 return -1;
453 }
454
455 output = malloc(LXC_LOG_BUFFER_SIZE);
456 if (!output) {
062b72c6 457 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 458 lxc_pclose(f);
91c3830e
SH
459 return -1;
460 }
461
062b72c6
CB
462 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
463 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
464
465 free(output);
466
ebec9176 467 ret = lxc_pclose(f);
8e7da691 468 if (ret == -1) {
062b72c6 469 SYSERROR("Script exited with error.");
91c3830e 470 return -1;
8e7da691 471 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 472 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
473 return -1;
474 } else if (WIFSIGNALED(ret)) {
062b72c6 475 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 476 return -1;
91c3830e
SH
477 }
478
479 return 0;
480}
481
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and run
 * it via run_buffer().
 *
 * @name, @section, @script, @hook: must be non-NULL strings.
 * @lxcpath: currently unused by this function.
 * @argsin:  optional NULL-terminated array of extra arguments (may be NULL).
 *
 * The command buffer is heap allocated: its size is derived from
 * configuration-supplied strings and is only bounded by INT_MAX, which is far
 * too large for a stack allocation.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	(void)lxcpath;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* One extra byte per argument for the separating space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	/* The snprintf() bookkeeping below is done in int. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto on_error;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			goto on_error;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;

on_error:
	free(buffer);
	return -1;
}
532
062b72c6
CB
/*
 * Build the command line "<script> <name> <section> [args...]" and run it via
 * run_buffer().
 *
 * The variadic arguments are `char *` strings appended in order; the list
 * MUST be terminated by a NULL pointer.
 *
 * The command buffer is heap allocated because its size depends on
 * configuration-supplied strings and is only bounded by INT_MAX. Every
 * va_start() is paired with a va_end() on all exit paths.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: measure. One extra byte per argument for the space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	/* The snprintf() bookkeeping below is done in int. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append each argument. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
584
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep the
 * descriptor open for the duration of the container run, to prevent the
 * container from marking the underlying fs readonly on shutdown. The file is
 * unlinked immediately so that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, path cannot be
 *           resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* snprintf() may also fail outright; treat that like truncation. */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return -1;

	/* The open descriptor is what pins the fs; the name is not needed. */
	(void)unlink(absrootfspin);

	return fd;
}
627
e2a7e8dc
SH
/*
 * When a remount is requested, make sure the restrictive flags already in
 * effect on the existing mount (nosuid, nodev, ro, noexec) are carried over.
 *
 * @s:     mount source; if NULL, @d is inspected instead.
 * @d:     mount destination.
 * @flags: requested mount flags.
 *
 * Returns @flags unchanged when MS_REMOUNT is not set, when neither path is
 * given, when statvfs() fails, or when built without HAVE_STATVFS; otherwise
 * returns @flags with the restrictive flags reported by statvfs() OR-ed in.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *target = s ? s : d;
	struct statvfs sb;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	if (target == NULL || statvfs(target, &sb) < 0)
		return flags;

	return flags | (sb.f_flag & sticky);
#else
	return flags;
#endif
}
664
4fb3cba5 665static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 666{
368bbc02 667 int r;
80e80c40 668 int i;
b06b8511
CS
669 static struct {
670 int match_mask;
671 int match_flag;
672 const char *source;
673 const char *destination;
674 const char *fstype;
675 unsigned long flags;
676 const char *options;
677 } default_mounts[] = {
678 /* Read-only bind-mounting... In older kernels, doing that required
679 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
680 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
681 * kernel 2.6.26 onwards. However, this apparently does not work on
682 * kernel 3.8. Unfortunately, on that very same kernel, doing the
683 * same trick as above doesn't seem to work either, there one needs
684 * to ALSO specify MS_BIND for the remount, otherwise the entire
685 * fs is remounted read-only or the mount fails because it's busy...
686 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
687 * 2.6.32...
368bbc02 688 */
f24a52d5 689 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
690 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
697 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
705 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
706 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 707 };
368bbc02 708
b06b8511
CS
709 for (i = 0; default_mounts[i].match_mask; i++) {
710 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
711 char *source = NULL;
712 char *destination = NULL;
713 int saved_errno;
e2a7e8dc 714 unsigned long mflags;
b06b8511
CS
715
716 if (default_mounts[i].source) {
717 /* will act like strdup if %r is not present */
8ede5f4c 718 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
719 if (!source) {
720 SYSERROR("memory allocation error");
721 return -1;
722 }
723 }
cc4fd506
SH
724 if (!default_mounts[i].destination) {
725 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 726 free(source);
cc4fd506
SH
727 return -1;
728 }
729 /* will act like strdup if %r is not present */
730 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
731 if (!destination) {
732 saved_errno = errno;
733 SYSERROR("memory allocation error");
734 free(source);
735 errno = saved_errno;
736 return -1;
b06b8511 737 }
e2a7e8dc
SH
738 mflags = add_required_remount_flags(source, destination,
739 default_mounts[i].flags);
592fd47a 740 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 741 saved_errno = errno;
b88ff9a0
SG
742 if (r < 0 && errno == ENOENT) {
743 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
744 r = 0;
745 }
746 else if (r < 0)
e2a7e8dc 747 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 748
b06b8511
CS
749 free(source);
750 free(destination);
751 if (r < 0) {
b06b8511
CS
752 errno = saved_errno;
753 return -1;
754 }
368bbc02 755 }
368bbc02
CS
756 }
757
b06b8511 758 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
759 int cg_flags;
760
761 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
762 /* If the type of cgroup mount was not specified, it depends on the
763 * container's capabilities as to what makes sense: if we have
764 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
765 * anyway, so we may as well default to read-write; then the admin
766 * will not be given a false sense of security. (And if they really
767 * want mixed r/o r/w, then they can explicitly specify :mixed.)
768 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
769 * :mixed, because then the container can't remount it read-write. */
770 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
771 int has_sys_admin = 0;
b0ee5983
CB
772
773 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 774 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 775 else
0769b82a 776 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
777
778 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 779 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 780 else
0769b82a 781 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
782 }
783
8ede5f4c 784 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 785 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 786 return -1;
368bbc02
CS
787 }
788 }
789
368bbc02 790 return 0;
368bbc02
CS
791}
792
/*
 * Apply the configured hostname, taken from utsname->nodename.
 *
 * A NULL @utsname means nothing is configured and is treated as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;

	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
807
69aa6655
DE
808struct dev_symlinks {
809 const char *oldpath;
810 const char *name;
811};
812
813static const struct dev_symlinks dev_symlinks[] = {
814 {"/proc/self/fd", "fd"},
815 {"/proc/self/fd/0", "stdin"},
816 {"/proc/self/fd/1", "stdout"},
817 {"/proc/self/fd/2", "stderr"},
818};
819
820static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
821{
822 char path[MAXPATHLEN];
823 int ret,i;
09227be2 824 struct stat s;
69aa6655
DE
825
826
827 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
828 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 829 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
830 if (ret < 0 || ret >= MAXPATHLEN)
831 return -1;
09227be2
MW
832
833 /*
834 * Stat the path first. If we don't get an error
835 * accept it as is and don't try to create it
836 */
837 if (!stat(path, &s)) {
838 continue;
839 }
840
69aa6655 841 ret = symlink(d->oldpath, path);
09227be2 842
69aa6655 843 if (ret && errno != EEXIST) {
09227be2
MW
844 if ( errno == EROFS ) {
845 WARN("Warning: Read Only file system while creating %s", path);
846 } else {
847 SYSERROR("Error creating %s", path);
848 return -1;
849 }
69aa6655
DE
850 }
851 }
852 return 0;
853}
854
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) a new string "container_ttys=<name>" is
 * allocated. On later calls " <name>" is appended to the existing string.
 * Returns false on allocation failure, in which case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t name_len = strlen(name);
	size_t cur_len;
	char *list;

	if (*pp == NULL) {
		list = malloc(name_len + prefix_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		/* name_len + 1 also copies the terminating NUL. */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);
	/* +2: one separating space and the terminating NUL. */
	list = realloc(*pp, cur_len + name_len + 2);
	if (list == NULL)
		return false;

	list[cur_len] = ' ';
	memcpy(list + cur_len + 1, name, name_len + 1);
	*pp = list;
	return true;
}
877
9e1045e3 878static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 879{
9e1045e3 880 int i, ret;
393903d1
SH
881 const struct lxc_tty_info *tty_info = &conf->tty_info;
882 char *ttydir = conf->ttydir;
7c6ef2a2 883 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 884
e8bd4e43 885 if (!conf->rootfs.path)
bc9bd0e3
DL
886 return 0;
887
b0a33c1e 888 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 889 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
890
e8bd4e43 891 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 892 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
893 ERROR("pathname too long for ttys");
894 return -1;
895 }
9e1045e3 896
7c6ef2a2
SH
897 if (ttydir) {
898 /* create dev/lxc/tty%d" */
9e1045e3
CB
899 ret = snprintf(lxcpath, sizeof(lxcpath),
900 "/dev/%s/tty%d", ttydir, i + 1);
901 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
902 ERROR("pathname too long for ttys");
903 return -1;
904 }
9e1045e3 905
7c6ef2a2 906 ret = creat(lxcpath, 0660);
9e1045e3
CB
907 if (ret < 0 && errno != EEXIST) {
908 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
909 return -1;
910 }
4d44e274
SH
911 if (ret >= 0)
912 close(ret);
9e1045e3 913
7c6ef2a2 914 ret = unlink(path);
9e1045e3
CB
915 if (ret < 0 && errno != ENOENT) {
916 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
917 return -1;
918 }
b0a33c1e 919
9e1045e3
CB
920 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
921 if (ret < 0) {
922 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
923 pty_info->name, path);
924 continue;
925 }
9e1045e3
CB
926 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
927 path);
13954cce 928
9e1045e3
CB
929 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
930 ttydir, i + 1);
931 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
932 ERROR("tty pathname too long");
933 return -1;
934 }
9e1045e3 935
7c6ef2a2 936 ret = symlink(lxcpath, path);
9e1045e3
CB
937 if (ret < 0) {
938 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
939 path, lxcpath);
7c6ef2a2
SH
940 return -1;
941 }
942 } else {
9e1045e3
CB
943 /* If we populated /dev, then we need to create
944 * /dev/ttyN
945 */
946 ret = access(path, F_OK);
947 if (ret < 0) {
c6883f38 948 ret = creat(path, 0660);
9e1045e3
CB
949 if (ret < 0) {
950 SYSERROR("failed to create \"%s\"", path);
c6883f38 951 /* this isn't fatal, continue */
025ed0f3 952 } else {
c6883f38 953 close(ret);
025ed0f3 954 }
c6883f38 955 }
9e1045e3
CB
956
957 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
958 if (ret < 0) {
e8bd4e43 959 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
960 continue;
961 }
9e1045e3
CB
962
963 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
964 path);
393903d1 965 }
9e1045e3 966
e8bd4e43 967 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
968 ERROR("Error setting up container_ttys string");
969 return -1;
b0a33c1e 970 }
971 }
972
9e1045e3 973 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 974 return 0;
975}
976
59bb8698 977static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 978{
2d489f9e 979 int oldroot = -1, newroot = -1;
bf601689 980
2d489f9e
SH
981 oldroot = open("/", O_DIRECTORY | O_RDONLY);
982 if (oldroot < 0) {
983 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
984 return -1;
985 }
2d489f9e
SH
986 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
987 if (newroot < 0) {
988 SYSERROR("Error opening new-/ for fchdir");
989 goto fail;
c08556c6 990 }
bf601689 991
cc6f6dd7 992 /* change into new root fs */
2d489f9e 993 if (fchdir(newroot)) {
cc6f6dd7 994 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 995 goto fail;
cc6f6dd7
DL
996 }
997
cc6f6dd7 998 /* pivot_root into our new root fs */
2d489f9e 999 if (pivot_root(".", ".")) {
cc6f6dd7 1000 SYSERROR("pivot_root syscall failed");
2d489f9e 1001 goto fail;
bf601689 1002 }
cc6f6dd7 1003
2d489f9e
SH
1004 /*
1005 * at this point the old-root is mounted on top of our new-root
1006 * To unmounted it we must not be chdir'd into it, so escape back
1007 * to old-root
1008 */
1009 if (fchdir(oldroot) < 0) {
1010 SYSERROR("Error entering oldroot");
1011 goto fail;
1012 }
7981ea46 1013 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1014 SYSERROR("Error detaching old root");
1015 goto fail;
cc6f6dd7
DL
1016 }
1017
2d489f9e
SH
1018 if (fchdir(newroot) < 0) {
1019 SYSERROR("Error re-entering newroot");
1020 goto fail;
1021 }
cc6f6dd7 1022
2d489f9e
SH
1023 close(oldroot);
1024 close(newroot);
bf601689 1025
2d489f9e 1026 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1027
bf601689 1028 return 0;
2d489f9e
SH
1029
1030fail:
1031 if (oldroot != -1)
1032 close(oldroot);
1033 if (newroot != -1)
1034 close(newroot);
1035 return -1;
bf601689
MH
1036}
1037
7133b912
CB
1038/* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1039 * error, log it but don't fail yet.
91c3830e 1040 */
7133b912
CB
1041static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1042 const char *lxcpath)
91c3830e
SH
1043{
1044 int ret;
87da4ec3
SH
1045 size_t clen;
1046 char *path;
91c3830e 1047
7133b912 1048 INFO("Preparing \"/dev\"");
bc6928ff 1049
14221cbb 1050 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1051 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1052 path = alloca(clen);
bc6928ff 1053
ec50007f 1054 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
7133b912 1055 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1056 return -1;
bc6928ff 1057
87da4ec3 1058 if (!dir_exists(path)) {
7133b912
CB
1059 WARN("\"/dev\" directory does not exist. Proceeding without "
1060 "autodev being set up");
87da4ec3 1061 return 0;
bc6928ff 1062 }
87da4ec3 1063
1ec0e8e3 1064 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
7133b912
CB
1065 rootfs->path ? rootfs->mount : NULL);
1066 if (ret < 0) {
1067 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1ec0e8e3 1068 return -1;
91c3830e 1069 }
7133b912 1070 INFO("Mounted tmpfs on \"%s\"", path);
87da4ec3 1071
ec50007f 1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
7133b912 1073 if (ret < 0 || (size_t)ret >= clen)
91c3830e 1074 return -1;
87da4ec3 1075
7133b912 1076 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
bc6928ff
MW
1077 * If not, then create it and exit if that fails...
1078 */
87da4ec3 1079 if (!dir_exists(path)) {
bc6928ff 1080 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
7133b912
CB
1081 if (ret < 0) {
1082 SYSERROR("Failed to create directory \"%s\"", path);
bc6928ff
MW
1083 return -1;
1084 }
91c3830e
SH
1085 }
1086
7133b912 1087 INFO("Prepared \"/dev\"");
91c3830e
SH
1088 return 0;
1089}
1090
/* Description of one device node to provide below the container's /dev. */
struct lxc_devs {
	const char *name;	/* node name relative to /dev */
	mode_t mode;		/* file type and permission bits passed to mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* Character devices that lxc_fill_autodev() creates or bind-mounts. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1106
27245ff7 1107static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1108{
1109 int ret;
c6883f38
SH
1110 char path[MAXPATHLEN];
1111 int i;
3a32201c 1112 mode_t cmask;
c6883f38 1113
ec50007f 1114 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1115 if (ret < 0 || ret >= MAXPATHLEN) {
1116 ERROR("Error calculating container /dev location");
c6883f38 1117 return -1;
f7bee6c6 1118 }
91c3830e 1119
0bbf8572
CB
1120 /* ignore, just don't try to fill in */
1121 if (!dir_exists(path))
9cb4d183
SH
1122 return 0;
1123
0bbf8572 1124 INFO("populating container /dev");
3a32201c 1125 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1126 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1127 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1128
ec50007f 1129 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1130 if (ret < 0 || ret >= MAXPATHLEN)
1131 return -1;
0bbf8572 1132
c6883f38 1133 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1134 if (ret < 0) {
9cb4d183
SH
1135 char hostpath[MAXPATHLEN];
1136 FILE *pathfile;
1137
0bbf8572
CB
1138 if (errno == EEXIST) {
1139 DEBUG("\"%s\" device already existed", path);
1140 continue;
1141 }
1142
1143 /* Unprivileged containers cannot create devices, so
1144 * bind mount the device from the host.
1145 */
9cb4d183
SH
1146 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1147 if (ret < 0 || ret >= MAXPATHLEN)
1148 return -1;
1149 pathfile = fopen(path, "wb");
1150 if (!pathfile) {
1151 SYSERROR("Failed to create device mount target '%s'", path);
1152 return -1;
1153 }
1154 fclose(pathfile);
0bbf8572
CB
1155 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1156 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1157 return -1;
1158 }
0bbf8572
CB
1159 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1160 } else {
1161 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1162 }
1163 }
3a32201c 1164 umask(cmask);
c6883f38 1165
0bbf8572 1166 INFO("populated container /dev");
c6883f38
SH
1167 return 0;
1168}
1169
9aa76a17 1170static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1171{
9aa76a17 1172 int ret;
91c3e281
CB
1173 struct bdev *bdev;
1174 const struct lxc_rootfs *rootfs;
cc28d0b0 1175
91c3e281 1176 rootfs = &conf->rootfs;
a0f379bf 1177 if (!rootfs->path) {
91c3e281
CB
1178 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1179 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1180 return -1;
1181 }
c69bd12f 1182 return 0;
a0f379bf 1183 }
0ad19a3f 1184
12297168 1185 if (access(rootfs->mount, F_OK)) {
91c3e281 1186 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1187 rootfs->mount);
b1789442
DL
1188 return -1;
1189 }
1190
91c3e281 1191 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1192 if (!bdev) {
1193 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1194 rootfs->path, rootfs->mount,
1195 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1196 return -1;
9be53773 1197 }
9aa76a17
CB
1198
1199 ret = bdev->ops->mount(bdev);
1200 bdev_put(bdev);
1201 if (ret < 0) {
91c3e281
CB
1202 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1203 rootfs->path, rootfs->mount,
1204 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1205 return -1;
1206 }
0ad19a3f 1207
91c3e281
CB
1208 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1209 rootfs->path, rootfs->mount,
1210 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1211
ac778708
DL
1212 return 0;
1213}
1214
91e93c71
AV
1215int prepare_ramfs_root(char *root)
1216{
eab15c1e 1217 char buf[LXC_LINELEN], *p;
91e93c71
AV
1218 char nroot[PATH_MAX];
1219 FILE *f;
1220 int i;
1221 char *p2;
1222
1223 if (realpath(root, nroot) == NULL)
39c7b795 1224 return -errno;
91e93c71
AV
1225
1226 if (chdir("/") == -1)
39c7b795 1227 return -errno;
91e93c71
AV
1228
1229 /*
1230 * We could use here MS_MOVE, but in userns this mount is
1231 * locked and can't be moved.
1232 */
39c7b795 1233 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1234 SYSERROR("Failed to move %s into /", root);
39c7b795 1235 return -errno;
91e93c71
AV
1236 }
1237
39c7b795 1238 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1239 SYSERROR("Failed to make . rprivate");
39c7b795 1240 return -errno;
91e93c71
AV
1241 }
1242
1243 /*
1244 * The following code cleans up inhereted mounts which are not
1245 * required for CT.
1246 *
1247 * The mountinfo file shows not all mounts, if a few points have been
1248 * unmounted between read operations from the mountinfo. So we need to
1249 * read mountinfo a few times.
1250 *
1251 * This loop can be skipped if a container uses unserns, because all
1252 * inherited mounts are locked and we should live with all this trash.
1253 */
1254 while (1) {
1255 int progress = 0;
1256
1257 f = fopen("./proc/self/mountinfo", "r");
1258 if (!f) {
1259 SYSERROR("Unable to open /proc/self/mountinfo");
1260 return -1;
1261 }
eab15c1e 1262 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1263 for (p = buf, i=0; p && i < 4; i++)
1264 p = strchr(p+1, ' ');
1265 if (!p)
1266 continue;
1267 p2 = strchr(p+1, ' ');
1268 if (!p2)
1269 continue;
1270
1271 *p2 = '\0';
1272 *p = '.';
1273
1274 if (strcmp(p + 1, "/") == 0)
1275 continue;
1276 if (strcmp(p + 1, "/proc") == 0)
1277 continue;
1278
1279 if (umount2(p, MNT_DETACH) == 0)
1280 progress++;
1281 }
1282 fclose(f);
1283 if (!progress)
1284 break;
1285 }
1286
8bea9fae
PR
1287 /* This also can be skipped if a container uses unserns */
1288 umount2("./proc", MNT_DETACH);
91e93c71
AV
1289
1290 /* It is weird, but chdir("..") moves us in a new root */
1291 if (chdir("..") == -1) {
1292 SYSERROR("Unable to change working directory");
1293 return -1;
1294 }
1295
1296 if (chroot(".") == -1) {
1297 SYSERROR("Unable to chroot");
1298 return -1;
1299 }
1300
1301 return 0;
1302}
1303
74a3920a 1304static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1305{
39c7b795
CB
1306 if (!rootfs->path) {
1307 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1308 return 0;
39c7b795 1309 }
ac778708 1310
91e93c71 1311 if (detect_ramfs_rootfs()) {
39c7b795
CB
1312 DEBUG("detected that container is on ramfs");
1313 if (prepare_ramfs_root(rootfs->mount)) {
1314 ERROR("failed to prepare minimal ramfs root");
91e93c71 1315 return -1;
39c7b795
CB
1316 }
1317
1318 DEBUG("prepared ramfs root for container");
1319 return 0;
1320 }
1321
1322 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1323 ERROR("failed to pivot root");
25368b52 1324 return -1;
c69bd12f
DL
1325 }
1326
39c7b795 1327 DEBUG("finished pivot root");
25368b52 1328 return 0;
0ad19a3f 1329}
1330
/* Mount a new devpts instance on "/dev/pts" and make its ptmx node reachable
 * as "/dev/ptmx", preferably through a bind mount and otherwise through a
 * symlink.
 *
 * @num_pts is passed as the "max=" devpts mount option; when it is 0 no
 * devpts instance is mounted at all.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	char devpts_mntopts[256];
	int ret;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts),
		       "newinstance,ptmxmode=0666,mode=0620,gid=5,max=%d",
		       num_pts);
	if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
		return -1;

	/* Unmount the old devpts instance if one is visible. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create the mountpoint for the devpts instance. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new devpts instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Remove any pre-existing /dev/ptmx file. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	ret = open("/dev/ptmx", O_CREAT, 0666);
	if (ret < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(ret);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal yet: fall through and try to create a symlink. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* Remove the dummy /dev/ptmx file we created above. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1421
cccc74b5
DL
/* Apply the requested execution domain via personality().
 *
 * A @persona of -1 means that nothing was requested. When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset) this is a no-op.
 *
 * Returns 0 on success and -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1438
3d7d929a
CB
1439static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1440 const struct lxc_console *console)
6e590161 1441{
63376d7d 1442 char path[MAXPATHLEN];
0728ebf4 1443 int ret, fd;
52e35957 1444
8b1b1210
CB
1445 if (console->path && !strcmp(console->path, "none"))
1446 return 0;
1447
7c6ef2a2 1448 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1449 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1450 return -1;
52e35957 1451
8b1b1210
CB
1452 /* When we are asked to setup a console we remove any previous
1453 * /dev/console bind-mounts.
1454 */
a7ba3c7f
CB
1455 if (file_exists(path)) {
1456 ret = lxc_unstack_mountpoint(path, false);
1457 if (ret < 0) {
8b1b1210 1458 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1459 return -ret;
1460 } else {
1461 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1462 }
953fe44f 1463
a7ba3c7f
CB
1464 ret = unlink(path);
1465 if (ret < 0) {
1466 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1467 return -errno;
1468 }
8b1b1210
CB
1469 }
1470
1471 /* For unprivileged containers autodev or automounts will already have
1472 * taken care of creating /dev/console.
1473 */
0728ebf4
TA
1474 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1475 if (fd < 0) {
1476 if (errno != EEXIST) {
1477 SYSERROR("failed to create console");
3d7d929a 1478 return -errno;
0728ebf4
TA
1479 }
1480 } else {
1481 close(fd);
52e35957
DL
1482 }
1483
0728ebf4 1484 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1485 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1486 return -errno;
63376d7d 1487 }
13954cce 1488
3d7d929a 1489 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1490 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1491 return -1;
1492 }
1493
3d7d929a 1494 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1495 return 0;
1496}
1497
3d7d929a
CB
1498static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1499 const struct lxc_console *console,
1500 char *ttydir)
7c6ef2a2 1501{
7c6ef2a2 1502 int ret;
3d7d929a 1503 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1504
1505 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1506 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1507 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1508 return -1;
3d7d929a 1509
7c6ef2a2
SH
1510 ret = mkdir(path, 0755);
1511 if (ret && errno != EEXIST) {
959aee9c 1512 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1513 return -errno;
7c6ef2a2 1514 }
3d7d929a 1515 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1516
3d7d929a
CB
1517 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1518 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1519 return -1;
1520
7c6ef2a2 1521 ret = creat(lxcpath, 0660);
3d7d929a 1522 if (ret == -1 && errno != EEXIST) {
959aee9c 1523 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1524 return -errno;
7c6ef2a2 1525 }
4d44e274
SH
1526 if (ret >= 0)
1527 close(ret);
7c6ef2a2 1528
2a12fefd
CB
1529 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1530 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1531 return -1;
2a12fefd
CB
1532
1533 /* When we are asked to setup a console we remove any previous
1534 * /dev/console bind-mounts.
1535 */
1536 if (console->path && !strcmp(console->path, "none")) {
1537 struct stat st;
1538 ret = stat(path, &st);
1539 if (ret < 0) {
1540 if (errno == ENOENT)
1541 return 0;
1542 SYSERROR("failed stat() \"%s\"", path);
1543 return -errno;
1544 }
1545
1546 /* /dev/console must be character device with major number 5 and
1547 * minor number 1. If not, give benefit of the doubt and assume
1548 * the user has mounted something else right there on purpose.
1549 */
1550 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1551 return 0;
1552
1553 /* In case the user requested a bind-mount for /dev/console and
1554 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1555 * /dev/<ttydir/console.
1556 * Note, we only move the uppermost mount and clear all other
1557 * mounts underneath for safety.
1558 * If it is a character device created via mknod() we simply
1559 * rename it.
2a12fefd
CB
1560 */
1561 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1562 if (ret < 0) {
1563 if (errno != EINVAL) {
1564 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1565 return -errno;
1566 }
1567 /* path was not a mountpoint */
1568 ret = rename(path, lxcpath);
1569 if (ret < 0) {
1570 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1571 return -errno;
1572 }
1573 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1574 } else {
1575 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1576 }
a7ba3c7f
CB
1577
1578 /* Clear all remaining bind-mounts. */
1579 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1580 if (ret < 0) {
a7ba3c7f
CB
1581 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1582 return -ret;
1583 } else {
1584 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1585 }
1586 } else {
1587 if (file_exists(path)) {
1588 ret = lxc_unstack_mountpoint(path, false);
1589 if (ret < 0) {
2a12fefd 1590 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1591 return -ret;
1592 } else {
1593 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1594 }
2a12fefd
CB
1595 }
1596
1597 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1598 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1599 return -1;
1600 }
1601 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1602 }
1603
2a12fefd 1604 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1605 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1606 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1607 return -1;
3d7d929a 1608
2a12fefd
CB
1609 ret = unlink(path);
1610 if (ret && errno != ENOENT) {
1611 SYSERROR("error unlinking %s", path);
1612 return -errno;
1613 }
1614
7c6ef2a2 1615 ret = symlink(lxcpath, path);
3d7d929a
CB
1616 if (ret < 0) {
1617 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1618 return -1;
1619 }
1620
3d7d929a 1621 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1622 return 0;
1623}
1624
3d7d929a
CB
1625static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1626 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1627{
3d7d929a
CB
1628 /* We don't have a rootfs, /dev/console will be shared. */
1629 if (!rootfs->path) {
1630 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1631 return 0;
3d7d929a
CB
1632 }
1633
7c6ef2a2 1634 if (!ttydir)
3d7d929a 1635 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1636
3d7d929a 1637 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1638}
1639
998ac676
RT
1640static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1641{
1642 struct mount_opt *mo;
1643
1644 /* If opt is found in mount_opt, set or clear flags.
1645 * Otherwise append it to data. */
1646
1647 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1648 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1649 if (mo->clear)
1650 *flags &= ~mo->flag;
1651 else
1652 *flags |= mo->flag;
1653 return;
1654 }
1655 }
1656
1657 if (strlen(*data))
1658 strcat(*data, ",");
1659 strcat(*data, opt);
1660}
1661
/* Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * @mntflags receives the combined flags (0 when there are none). @mntdata
 * receives a newly allocated string holding the options that are not flags,
 * or NULL when there are none; the caller has to free() it.
 *
 * Returns 0 on success and -1 if memory allocation fails.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *opts_copy, *extra;
	char *token;
	char *saveptr = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	opts_copy = strdup(mntopts);
	if (!opts_copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the option string. */
	extra = malloc(strlen(opts_copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(opts_copy);
		return -1;
	}
	extra[0] = '\0';

	token = strtok_r(opts_copy, ",", &saveptr);
	while (token) {
		parse_mntopt(token, mntflags, &extra);
		token = strtok_r(NULL, ",", &saveptr);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);

	free(opts_copy);
	return 0;
}
1700
6fd5e769
SH
/* Terminate @word at its first space or tab, if it has one. */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1707
/*
 * Skip @nfields fields in @src, where each field ends at a single space or
 * tab. Returns a pointer to the character after the last skipped separator,
 * or to the terminating '\0' if @src has fewer fields.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped;

	for (skipped = 0; skipped < nfields; skipped++) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1725
911324ef
DL
/* Mount @fsname on @target and, for bind or remount requests, follow up with
 * a remount so that the requested flags are applied.
 *
 * The first mount is done without MS_REMOUNT. If MS_REMOUNT or MS_BIND was
 * requested, a second mount() with MS_REMOUNT is issued. When statvfs() is
 * available, the nosuid, nodev (unless @dev is set), read-only and noexec
 * flags reported for @fsname are added to that remount; a plain bind mount
 * which needs none of them skips the remount.
 *
 * If @optional is non-zero a failing mount is only logged and 0 is returned.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	int ret;
	int need_remount = 1;
	unsigned long rqd_flags;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	ret = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	if (mountflags & (MS_REMOUNT | MS_BIND)) {
		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options",
		      fsname ? fsname : "(none)", target ? target : "(none)");

		/* A requested read-only flag always forces the remount. */
		rqd_flags = mountflags & MS_RDONLY;

#ifdef HAVE_STATVFS
		if (statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
			if (!dev)
				required_flags |= sb.f_flag & MS_NODEV;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", fsname, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) && rqd_flags == 0) {
				DEBUG("Mountflags already were %lu, "
				      "skipping remount", mountflags);
				need_remount = 0;
			} else {
				mountflags |= required_flags;
			}
		}
#endif

		if (need_remount) {
			ret = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
			if (ret < 0) {
				if (!optional) {
					SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
					return -1;
				}

				INFO("Failed to mount \"%s\" on \"%s\" "
				     "(optional): %s", fsname, target,
				     strerror(errno));
				return 0;
			}
		}
	}

	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1816
/* Strip the first occurrence of "create=dir", "create=file" and "optional"
 * from @mntent->mnt_opts, editing the string in place.
 *
 * An option followed by a comma is removed together with that comma. An
 * option without a following comma simply truncates the string where the
 * option starts.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {"create=dir", "create=file", "optional"};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* Pull the remaining options (and '\0') forward. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1840
/* Create the mount target @path if the mount entry asks for it.
 *
 * For "overlay" and "aufs" entries the corresponding helper is called first.
 * With the "create=dir" option @path is created as a directory; with
 * "create=file", and if @path does not exist yet, its parent directories and
 * an empty file are created.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;

	if (!strncmp(mntent->mnt_type, "overlay", 7))
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (!strncmp(mntent->mnt_type, "aufs", 4))
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		int fd;
		char *path_copy, *dir;

		/* dirname() may modify its argument, so use a copy. */
		path_copy = strdup(path);
		if (!path_copy)
			return -1;

		dir = dirname(path_copy);

		ret = mkdir_p(dir, 0755);
		if (ret < 0 && errno != EEXIST) {
			/* Check errno and log before free(): free() may
			 * clobber errno, and dir may point into path_copy.
			 */
			SYSERROR("Failed to create directory \"%s\"", dir);
			free(path_copy);
			return -1;
		}
		free(path_copy);

		fd = open(path, O_CREAT, 0644);
		if (fd < 0) {
			SYSERROR("Failed to create file \"%s\"", path);
			return -1;
		}
		close(fd);
	}

	return 0;
}
1889
ec50007f
CB
1890/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1891 * without a rootfs. */
db4aba38 1892static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
1893 const char *path,
1894 const struct lxc_rootfs *rootfs,
1895 const char *lxc_name,
1896 const char *lxc_path)
4d5b72a1 1897{
d8b712bc 1898 int ret;
4d5b72a1
NC
1899 unsigned long mntflags;
1900 char *mntdata;
d8b712bc 1901 bool dev, optional;
ec50007f 1902 char *rootfs_path = NULL;
d8b712bc
CB
1903
1904 optional = hasmntopt(mntent, "optional") != NULL;
1905 dev = hasmntopt(mntent, "dev") != NULL;
1906
ec50007f
CB
1907 if (rootfs && rootfs->path)
1908 rootfs_path = rootfs->mount;
1909
d8b712bc
CB
1910 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1911 lxc_path);
1912 if (ret < 0) {
1913 if (optional)
1914 return 0;
608e3567 1915
d8b712bc
CB
1916 return -1;
1917 }
4e4ca161
SH
1918 cull_mntent_opt(mntent);
1919
d8b712bc
CB
1920 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1921 if (ret < 0)
a17b1e65 1922 return -1;
a17b1e65 1923
6e46cc0d 1924 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1925 mntdata, optional, dev, rootfs_path);
68c152ef 1926
911324ef 1927 free(mntdata);
911324ef
DL
1928 return ret;
1929}
1930
db4aba38
NC
/* Mount an entry for a container that has no rootfs of its own.
 *
 * Returns -1 if the target path does not fit into the buffer, otherwise the
 * result of mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *lead;
	int len;

	/* For containers created without a rootfs all mounts are treated as
	 * absolute paths starting at / on the host.
	 */
	lead = (mntent->mnt_dir[0] == '/') ? "" : "/";

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1948
4e4ca161 1949static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 1950 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
1951 const char *lxc_name,
1952 const char *lxc_path)
911324ef 1953{
bdd2b34c 1954 int offset;
013bd428 1955 char *aux;
67e571de 1956 const char *lxcpath;
bdd2b34c
CB
1957 char path[MAXPATHLEN];
1958 int ret = 0;
0ad19a3f 1959
593e8478 1960 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 1961 if (!lxcpath)
2a59a681 1962 return -1;
2a59a681 1963
bdd2b34c
CB
1964 /* If rootfs->path is a blockdev path, allow container fstab to use
1965 * <lxcpath>/<name>/rootfs" as the target prefix.
1966 */
1967 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1968 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
1969 goto skipvarlib;
1970
1971 aux = strstr(mntent->mnt_dir, path);
1972 if (aux) {
1973 offset = strlen(path);
1974 goto skipabs;
1975 }
1976
1977skipvarlib:
013bd428
DL
1978 aux = strstr(mntent->mnt_dir, rootfs->path);
1979 if (!aux) {
bdd2b34c 1980 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 1981 return ret;
013bd428 1982 }
80a881b2
SH
1983 offset = strlen(rootfs->path);
1984
1985skipabs:
bdd2b34c
CB
1986 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
1987 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 1988 return -1;
a17b1e65 1989
0a2dddd4 1990 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 1991}
d330fe7b 1992
4e4ca161 1993static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
1994 const struct lxc_rootfs *rootfs,
1995 const char *lxc_name,
1996 const char *lxc_path)
911324ef
DL
1997{
1998 char path[MAXPATHLEN];
911324ef 1999 int ret;
d330fe7b 2000
34cfffb3 2001 /* relative to root mount point */
6e46cc0d 2002 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2003 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2004 ERROR("path name too long");
2005 return -1;
2006 }
911324ef 2007
0a2dddd4 2008 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2009}
2010
80a881b2 2011static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2012 const char *lxc_name, const char *lxc_path)
911324ef 2013{
aaf901be
AM
2014 struct mntent mntent;
2015 char buf[4096];
911324ef 2016 int ret = -1;
e76b8764 2017
aaf901be 2018 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
1ae3c19f
CB
2019 if (!rootfs->path)
2020 ret = mount_entry_on_systemfs(&mntent);
2021 else if (mntent.mnt_dir[0] != '/')
2022 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2023 lxc_name, lxc_path);
2024 else
2025 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2026 lxc_name, lxc_path);
2027 if (ret < 0)
2028 return -1;
0ad19a3f 2029 }
2030 ret = 0;
cd54d859 2031
1ae3c19f 2032 INFO("Set up mount entries");
e7938e9e
MN
2033 return ret;
2034}
2035
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab is not an error and returns 0. Returns -1 if the file cannot
 * be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *mounts;
	int rc;

	if (fstab == NULL)
		return 0;

	mounts = setmntent(fstab, "r");
	if (mounts == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, mounts, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(mounts);
	return rc;
}
2058
5ef5c9a3 2059FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2060{
5ef5c9a3 2061 int ret;
e7938e9e 2062 char *mount_entry;
5ef5c9a3 2063 struct lxc_list *iterator;
6bd04140 2064 FILE *f;
5ef5c9a3
CB
2065 int fd = -1;
2066
2067 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2068 if (fd < 0) {
2069 if (errno != ENOSYS)
2070 return NULL;
6bd04140
CB
2071 f = tmpfile();
2072 TRACE("Created temporary mount file");
5ef5c9a3 2073 } else {
6bd04140
CB
2074 f = fdopen(fd, "r+");
2075 TRACE("Created anonymous mount file");
5ef5c9a3 2076 }
e7938e9e 2077
6bd04140
CB
2078 if (!f) {
2079 SYSERROR("Could not create mount file");
5ef5c9a3
CB
2080 if (fd != -1)
2081 close(fd);
9fc7f8c0 2082 return NULL;
e7938e9e
MN
2083 }
2084
2085 lxc_list_for_each(iterator, mount) {
2086 mount_entry = iterator->elem;
6bd04140 2087 ret = fprintf(f, "%s\n", mount_entry);
5ef5c9a3 2088 if (ret < strlen(mount_entry))
6bd04140 2089 WARN("Could not write mount entry to mount file");
5ef5c9a3
CB
2090 }
2091
6bd04140
CB
2092 ret = fseek(f, 0, SEEK_SET);
2093 if (ret < 0) {
2094 SYSERROR("Failed to seek mount file");
2095 fclose(f);
5ef5c9a3 2096 return NULL;
e7938e9e
MN
2097 }
2098
6bd04140 2099 return f;
9fc7f8c0
TA
2100}
2101
/* Mount the entries of the list @mount by writing them to an anonymous
 * fstab-style file and processing that file with mount_file_entries().
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *mounts = make_anonymous_mount_file(mount);

	if (mounts == NULL)
		return -1;

	rc = mount_file_entries(rootfs, mounts, lxc_name, lxc_path);
	fclose(mounts);

	return rc;
}
2118
bab88e68
CS
2119static int parse_cap(const char *cap)
2120{
2121 char *ptr = NULL;
84760c11 2122 size_t i;
2123 int capid = -1;
bab88e68 2124
7035407c
DE
2125 if (!strcmp(cap, "none"))
2126 return -2;
2127
bab88e68
CS
2128 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2129
2130 if (strcmp(cap, caps_opt[i].name))
2131 continue;
2132
2133 capid = caps_opt[i].value;
2134 break;
2135 }
2136
2137 if (capid < 0) {
2138 /* try to see if it's numeric, so the user may specify
2139 * capabilities that the running kernel knows about but
2140 * we don't */
2141 errno = 0;
2142 capid = strtol(cap, &ptr, 10);
2143 if (!ptr || *ptr != '\0' || errno != 0)
2144 /* not a valid number */
2145 capid = -1;
2146 else if (capid > lxc_caps_last_cap())
2147 /* we have a number but it's not a valid
2148 * capability */
2149 capid = -1;
2150 }
2151
2152 return capid;
2153}
2154
0769b82a
CS
2155int in_caplist(int cap, struct lxc_list *caps)
2156{
2157 struct lxc_list *iterator;
2158 int capid;
2159
2160 lxc_list_for_each(iterator, caps) {
2161 capid = parse_cap(iterator->elem);
2162 if (capid == cap)
2163 return 1;
2164 }
2165
2166 return 0;
2167}
2168
81810dd1
DL
2169static int setup_caps(struct lxc_list *caps)
2170{
2171 struct lxc_list *iterator;
2172 char *drop_entry;
bab88e68 2173 int capid;
81810dd1
DL
2174
2175 lxc_list_for_each(iterator, caps) {
2176
2177 drop_entry = iterator->elem;
2178
bab88e68 2179 capid = parse_cap(drop_entry);
d55bc1ad 2180
81810dd1 2181 if (capid < 0) {
1e11be34
DL
2182 ERROR("unknown capability %s", drop_entry);
2183 return -1;
81810dd1
DL
2184 }
2185
2186 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2187
2188 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2189 SYSERROR("failed to remove %s capability", drop_entry);
2190 return -1;
2191 }
81810dd1
DL
2192
2193 }
2194
1fb86a7c
SH
2195 DEBUG("capabilities have been setup");
2196
2197 return 0;
2198}
2199
2200static int dropcaps_except(struct lxc_list *caps)
2201{
2202 struct lxc_list *iterator;
2203 char *keep_entry;
1fb86a7c
SH
2204 int i, capid;
2205 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2206 INFO("found %d capabilities", numcaps);
1fb86a7c 2207
2caf9a97
SH
2208 if (numcaps <= 0 || numcaps > 200)
2209 return -1;
2210
1fb86a7c
SH
2211 // caplist[i] is 1 if we keep capability i
2212 int *caplist = alloca(numcaps * sizeof(int));
2213 memset(caplist, 0, numcaps * sizeof(int));
2214
2215 lxc_list_for_each(iterator, caps) {
2216
2217 keep_entry = iterator->elem;
2218
bab88e68 2219 capid = parse_cap(keep_entry);
1fb86a7c 2220
7035407c
DE
2221 if (capid == -2)
2222 continue;
2223
1fb86a7c
SH
2224 if (capid < 0) {
2225 ERROR("unknown capability %s", keep_entry);
2226 return -1;
2227 }
2228
8255688a 2229 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2230
2231 caplist[capid] = 1;
2232 }
2233 for (i=0; i<numcaps; i++) {
2234 if (caplist[i])
2235 continue;
2236 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2237 SYSERROR("failed to remove capability %d", i);
2238 return -1;
2239 }
1fb86a7c
SH
2240 }
2241
2242 DEBUG("capabilities have been setup");
81810dd1
DL
2243
2244 return 0;
2245}
2246
0ad19a3f 2247static int setup_hw_addr(char *hwaddr, const char *ifname)
2248{
2249 struct sockaddr sockaddr;
2250 struct ifreq ifr;
fad6ef95 2251 int ret, fd, saved_errno;
0ad19a3f 2252
3cfc0f3a
MN
2253 ret = lxc_convert_mac(hwaddr, &sockaddr);
2254 if (ret) {
2255 ERROR("mac address '%s' conversion failed : %s",
2256 hwaddr, strerror(-ret));
0ad19a3f 2257 return -1;
2258 }
2259
2260 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2261 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2262 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2263
2264 fd = socket(AF_INET, SOCK_DGRAM, 0);
2265 if (fd < 0) {
3ab87b66 2266 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2267 return -1;
2268 }
2269
2270 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2271 saved_errno = errno;
0ad19a3f 2272 close(fd);
2273 if (ret)
fad6ef95 2274 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2275
5da6aa8c 2276 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2277
0ad19a3f 2278 return ret;
2279}
2280
82d5ae15 2281static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2282{
82d5ae15
DL
2283 struct lxc_list *iterator;
2284 struct lxc_inetdev *inetdev;
3cfc0f3a 2285 int err;
0ad19a3f 2286
82d5ae15
DL
2287 lxc_list_for_each(iterator, ip) {
2288
2289 inetdev = iterator->elem;
2290
0093bb8c
DL
2291 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2292 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2293 if (err) {
2294 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2295 ifindex, strerror(-err));
82d5ae15
DL
2296 return -1;
2297 }
2298 }
2299
2300 return 0;
0ad19a3f 2301}
2302
82d5ae15 2303static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2304{
82d5ae15 2305 struct lxc_list *iterator;
7fa9074f 2306 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2307 int err;
0ad19a3f 2308
82d5ae15
DL
2309 lxc_list_for_each(iterator, ip) {
2310
2311 inet6dev = iterator->elem;
2312
b3df193c 2313 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2314 &inet6dev->mcast, &inet6dev->acast,
2315 inet6dev->prefix);
3cfc0f3a
MN
2316 if (err) {
2317 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2318 ifindex, strerror(-err));
82d5ae15 2319 return -1;
3cfc0f3a 2320 }
82d5ae15
DL
2321 }
2322
2323 return 0;
0ad19a3f 2324}
2325
e337179a 2326static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
0ad19a3f 2327{
0ad19a3f 2328 char ifname[IFNAMSIZ];
3cfc0f3a 2329 int err;
d1826cf1
CB
2330 const char *net_type_name;
2331 char *current_ifname = ifname;
0ad19a3f 2332
82d5ae15
DL
2333 /* empty network namespace */
2334 if (!netdev->ifindex) {
b0efbac4 2335 if (netdev->flags & IFF_UP) {
d472214b 2336 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2337 if (err) {
2338 ERROR("failed to set the loopback up : %s",
2339 strerror(-err));
82d5ae15
DL
2340 return -1;
2341 }
82d5ae15 2342 }
d1826cf1
CB
2343
2344 if (netdev->type == LXC_NET_EMPTY)
2345 return 0;
2346
2347 if (netdev->type == LXC_NET_NONE)
40790553 2348 return 0;
d1826cf1
CB
2349
2350 if (netdev->type != LXC_NET_VETH) {
2351 net_type_name = lxc_net_type_to_str(netdev->type);
2352 ERROR("%s networks are not supported for containers "
2353 "not setup up by privileged users",
2354 net_type_name);
2355 return -1;
2356 }
2357
40790553 2358 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2359 }
13954cce 2360
b466dc33 2361 /* get the new ifindex in case of physical netdev */
40790553 2362 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2363 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2364 ERROR("failed to get ifindex for %s",
2365 netdev->link);
2366 return -1;
2367 }
40790553 2368 }
b466dc33 2369
82d5ae15
DL
2370 /* retrieve the name of the interface */
2371 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2372 ERROR("no interface corresponding to index '%d'",
82d5ae15 2373 netdev->ifindex);
0ad19a3f 2374 return -1;
2375 }
13954cce 2376
018ef520 2377 /* default: let the system to choose one interface name */
9d083402 2378 if (!netdev->name)
fb6d9b2f
DL
2379 netdev->name = netdev->type == LXC_NET_PHYS ?
2380 netdev->link : "eth%d";
018ef520 2381
82d5ae15 2382 /* rename the interface name */
40790553
SH
2383 if (strcmp(ifname, netdev->name) != 0) {
2384 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2385 if (err) {
2386 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2387 strerror(-err));
2388 return -1;
2389 }
018ef520
DL
2390 }
2391
2392 /* Re-read the name of the interface because its name has changed
2393 * and would be automatically allocated by the system
2394 */
82d5ae15 2395 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2396 ERROR("no interface corresponding to index '%d'",
82d5ae15 2397 netdev->ifindex);
018ef520 2398 return -1;
0ad19a3f 2399 }
2400
82d5ae15
DL
2401 /* set a mac address */
2402 if (netdev->hwaddr) {
2403 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2404 ERROR("failed to setup hw address for '%s'",
82d5ae15 2405 current_ifname);
0ad19a3f 2406 return -1;
2407 }
2408 }
2409
82d5ae15
DL
2410 /* setup ipv4 addresses on the interface */
2411 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2412 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2413 ifname);
2414 return -1;
2415 }
2416
82d5ae15
DL
2417 /* setup ipv6 addresses on the interface */
2418 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2419 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2420 ifname);
2421 return -1;
2422 }
2423
82d5ae15 2424 /* set the network device up */
b0efbac4 2425 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2426 int err;
2427
d472214b 2428 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2429 if (err) {
2430 ERROR("failed to set '%s' up : %s", current_ifname,
2431 strerror(-err));
0ad19a3f 2432 return -1;
2433 }
2434
2435 /* the network is up, make the loopback up too */
d472214b 2436 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2437 if (err) {
2438 ERROR("failed to set the loopback up : %s",
2439 strerror(-err));
0ad19a3f 2440 return -1;
2441 }
2442 }
2443
f8fee0e2
MK
2444 /* We can only set up the default routes after bringing
2445 * up the interface, sine bringing up the interface adds
2446 * the link-local routes and we can't add a default
2447 * route if the gateway is not reachable. */
2448
2449 /* setup ipv4 gateway on the interface */
2450 if (netdev->ipv4_gateway) {
2451 if (!(netdev->flags & IFF_UP)) {
2452 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2453 return -1;
2454 }
2455
2456 if (lxc_list_empty(&netdev->ipv4)) {
2457 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2458 return -1;
2459 }
2460
2461 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2462 if (err) {
fc739df5
SG
2463 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2464 if (err) {
2465 ERROR("failed to add ipv4 dest for '%s': %s",
2466 ifname, strerror(-err));
2467 }
2468
2469 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2470 if (err) {
2471 ERROR("failed to setup ipv4 gateway for '%s': %s",
2472 ifname, strerror(-err));
2473 if (netdev->ipv4_gateway_auto) {
2474 char buf[INET_ADDRSTRLEN];
2475 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2476 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2477 }
2478 return -1;
19a26f82 2479 }
f8fee0e2
MK
2480 }
2481 }
2482
2483 /* setup ipv6 gateway on the interface */
2484 if (netdev->ipv6_gateway) {
2485 if (!(netdev->flags & IFF_UP)) {
2486 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2487 return -1;
2488 }
2489
2490 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2491 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2492 return -1;
2493 }
2494
2495 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2496 if (err) {
fc739df5
SG
2497 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2498 if (err) {
2499 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2500 ifname, strerror(-err));
19a26f82 2501 }
fc739df5
SG
2502
2503 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2504 if (err) {
2505 ERROR("failed to setup ipv6 gateway for '%s': %s",
2506 ifname, strerror(-err));
2507 if (netdev->ipv6_gateway_auto) {
2508 char buf[INET6_ADDRSTRLEN];
2509 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2510 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2511 }
2512 return -1;
2513 }
f8fee0e2
MK
2514 }
2515 }
2516
cd54d859
DL
2517 DEBUG("'%s' has been setup", current_ifname);
2518
0ad19a3f 2519 return 0;
2520}
2521
e337179a
CB
2522static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2523 struct lxc_list *network)
0ad19a3f 2524{
82d5ae15 2525 struct lxc_list *iterator;
82d5ae15 2526 struct lxc_netdev *netdev;
0ad19a3f 2527
c302b476
CB
2528 lxc_log_configured_netdevs(conf);
2529
5f4535a3 2530 lxc_list_for_each(iterator, network) {
5f4535a3 2531 netdev = iterator->elem;
82d5ae15 2532
f9373e40
CB
2533 /* REMOVE in LXC 3.0 */
2534 if (netdev->idx < 0) {
2535 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2536 "networks is DEPRECATED, please switch to using "
2537 "\"lxc.net.[i].* keys\"");
2538 }
2539
e337179a 2540 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
82d5ae15
DL
2541 ERROR("failed to setup netdev");
2542 return -1;
2543 }
2544 }
cd54d859 2545
5f4535a3
DL
2546 if (!lxc_list_empty(network))
2547 INFO("network has been setup");
cd54d859
DL
2548
2549 return 0;
0ad19a3f 2550}
2551
c6d09e15
WB
2552static int parse_resource(const char *res) {
2553 size_t i;
2554 int resid = -1;
2555
2556 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2557 if (strcmp(res, limit_opt[i].name) == 0)
2558 return limit_opt[i].value;
2559 }
2560
2561 /* try to see if it's numeric, so the user may specify
2562 * resources that the running kernel knows about but
2563 * we don't */
2564 if (lxc_safe_int(res, &resid) == 0)
2565 return resid;
2566 return -1;
2567}
2568
2569int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2570 struct lxc_list *it;
2571 struct lxc_limit *lim;
2572 int resid;
2573
2574 lxc_list_for_each(it, limits) {
2575 lim = it->elem;
2576
2577 resid = parse_resource(lim->resource);
2578 if (resid < 0) {
2579 ERROR("unknown resource %s", lim->resource);
2580 return -1;
2581 }
2582
2583 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2584 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2585 return -1;
2586 }
2587 }
2588 return 0;
2589}
2590
2af6bd1b 2591/* try to move physical nics to the init netns */
5610055a 2592void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2593{
64d2fcb5 2594 int i, oldfd;
4ec31c52 2595 char ifname[IFNAMSIZ];
2af6bd1b 2596
5610055a 2597 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2598 return;
2599
64d2fcb5 2600 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2601
64d2fcb5
CB
2602 oldfd = lxc_preserve_ns(getpid(), "net");
2603 if (oldfd < 0) {
2604 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2605 return;
2606 }
64d2fcb5 2607
2af6bd1b
SH
2608 if (setns(netnsfd, 0) != 0) {
2609 SYSERROR("Failed to enter container netns to reset nics");
2610 close(oldfd);
2611 return;
2612 }
2613 for (i=0; i<conf->num_savednics; i++) {
2614 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2615 /* retrieve the name of the interface */
2616 if (!if_indextoname(s->ifindex, ifname)) {
2617 WARN("no interface corresponding to index '%d'", s->ifindex);
2618 continue;
2619 }
5610055a 2620 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2621 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2622 free(s->orig_name);
2af6bd1b 2623 }
5610055a
WB
2624 conf->num_savednics = 0;
2625
2af6bd1b
SH
2626 if (setns(oldfd, 0) != 0)
2627 SYSERROR("Failed to re-enter monitor's netns");
2628 close(oldfd);
2629}
2630
ae9242c8
SH
/* Default rootfs mount point; lxc_conf_init() stores a copy in rootfs.mount. */
static char *default_rootfs_mount = LXCROOTFSMOUNT;
2632
7b379ab3 2633struct lxc_conf *lxc_conf_init(void)
089cd8b8 2634{
7b379ab3 2635 struct lxc_conf *new;
26ddeedd 2636 int i;
7b379ab3 2637
13277ec4 2638 new = malloc(sizeof(*new));
7b379ab3 2639 if (!new) {
13277ec4 2640 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2641 return NULL;
2642 }
2643 memset(new, 0, sizeof(*new));
2644
4b73005c 2645 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2646 new->personality = -1;
124fa0a8 2647 new->autodev = 1;
596a818d
DE
2648 new->console.log_path = NULL;
2649 new->console.log_fd = -1;
28a4b0e5 2650 new->console.path = NULL;
63376d7d 2651 new->console.peer = -1;
b5159817
DE
2652 new->console.peerpty.busy = -1;
2653 new->console.peerpty.master = -1;
2654 new->console.peerpty.slave = -1;
63376d7d
DL
2655 new->console.master = -1;
2656 new->console.slave = -1;
2657 new->console.name[0] = '\0';
d2e30e99 2658 new->maincmd_fd = -1;
76a26f55 2659 new->nbd_idx = -1;
54c30e29 2660 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2661 if (!new->rootfs.mount) {
13277ec4 2662 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2663 free(new);
2664 return NULL;
2665 }
858377e4 2666 new->logfd = -1;
7b379ab3
MN
2667 lxc_list_init(&new->cgroup);
2668 lxc_list_init(&new->network);
2669 lxc_list_init(&new->mount_list);
81810dd1 2670 lxc_list_init(&new->caps);
1fb86a7c 2671 lxc_list_init(&new->keepcaps);
f6d3e3e4 2672 lxc_list_init(&new->id_map);
f979ac15 2673 lxc_list_init(&new->includes);
4184c3e1 2674 lxc_list_init(&new->aliens);
7c661726 2675 lxc_list_init(&new->environment);
c6d09e15 2676 lxc_list_init(&new->limits);
26ddeedd
SH
2677 for (i=0; i<NUM_LXC_HOOKS; i++)
2678 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2679 lxc_list_init(&new->groups);
fe4de9a6
DE
2680 new->lsm_aa_profile = NULL;
2681 new->lsm_se_context = NULL;
5112cd70 2682 new->tmp_umount_proc = 0;
7b379ab3 2683
9f30a190
MM
2684 for (i = 0; i < LXC_NS_MAX; i++)
2685 new->inherit_ns_fd[i] = -1;
2686
72bb04e4
PT
2687 /* if running in a new user namespace, init and COMMAND
2688 * default to running as UID/GID 0 when using lxc-execute */
2689 new->init_uid = 0;
2690 new->init_gid = 0;
2691
7b379ab3 2692 return new;
089cd8b8
DL
2693}
2694
a589434e 2695static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2696{
b0ee5983
CB
2697 char *veth1, *veth2;
2698 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
b7b2fde4
CB
2699 int bridge_index, err;
2700 unsigned int mtu = 0;
13954cce 2701
8bee8851 2702 if (netdev->priv.veth_attr.pair) {
e892973e 2703 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2704 if (handler->conf->reboot)
2705 lxc_netdev_delete_by_name(veth1);
2706 } else {
9ba8130c
SH
2707 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2708 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2709 ERROR("veth1 name too long");
2710 return -1;
2711 }
a0265685 2712 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2713 if (!veth1) {
2714 ERROR("failed to allocate a temporary name");
2715 return -1;
2716 }
74a2b586
JK
2717 /* store away for deconf */
2718 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2719 }
82d5ae15 2720
0e391e57 2721 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2722 veth2 = lxc_mkifname(veth2buf);
ad40563e 2723 if (!veth2) {
82d5ae15 2724 ERROR("failed to allocate a temporary name");
ad40563e 2725 goto out_delete;
0ad19a3f 2726 }
2727
3cfc0f3a
MN
2728 err = lxc_veth_create(veth1, veth2);
2729 if (err) {
b0ee5983
CB
2730 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2731 veth2, strerror(-err));
ad40563e 2732 goto out_delete;
0ad19a3f 2733 }
13954cce 2734
49684c0b
CS
2735 /* changing the high byte of the mac address to 0xfe, the bridge interface
2736 * will always keep the host's mac address and not take the mac address
2737 * of a container */
2738 err = setup_private_host_hw_addr(veth1);
2739 if (err) {
b0ee5983
CB
2740 ERROR("failed to change mac address of host interface \"%s\": %s",
2741 veth1, strerror(-err));
49684c0b
CS
2742 goto out_delete;
2743 }
2744
af651aa9
SN
2745 netdev->ifindex = if_nametoindex(veth2);
2746 if (!netdev->ifindex) {
b0ee5983 2747 ERROR("failed to retrieve the index for \"%s\"", veth2);
af651aa9
SN
2748 goto out_delete;
2749 }
2750
82d5ae15 2751 if (netdev->mtu) {
b7b2fde4 2752 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
b0ee5983 2753 WARN("failed to parse mtu from");
b7b2fde4 2754 else
b0ee5983 2755 INFO("retrieved mtu %d", mtu);
e54864d3 2756 } else if (netdev->link) {
e9280f65 2757 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2758 if (bridge_index) {
2759 mtu = netdev_get_mtu(bridge_index);
b0ee5983 2760 INFO("retrieved mtu %d from %s", mtu, netdev->link);
729e8bf6
CB
2761 } else {
2762 mtu = netdev_get_mtu(netdev->ifindex);
b0ee5983 2763 INFO("retrieved mtu %d from %s", mtu, veth2);
729e8bf6 2764 }
e54864d3
NC
2765 }
2766
2767 if (mtu) {
2768 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2769 if (!err)
e54864d3 2770 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2771 if (err) {
b0ee5983
CB
2772 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2773 "and \"%s\": %s",
e54864d3 2774 mtu, veth1, veth2, strerror(-err));
eb14c10a 2775 goto out_delete;
75d09f83
DL
2776 }
2777 }
2778
3cfc0f3a 2779 if (netdev->link) {
c43cbc04 2780 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2781 if (err) {
b0ee5983
CB
2782 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2783 veth1, netdev->link, strerror(-err));
3cfc0f3a
MN
2784 goto out_delete;
2785 }
b0ee5983 2786 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
eb14c10a
DL
2787 }
2788
d472214b 2789 err = lxc_netdev_up(veth1);
6e35af2e 2790 if (err) {
b0ee5983 2791 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
6e35af2e 2792 goto out_delete;
0ad19a3f 2793 }
2794
e3b4c4c4 2795 if (netdev->upscript) {
751d9dcd
DL
2796 err = run_script(handler->name, "net", netdev->upscript, "up",
2797 "veth", veth1, (char*) NULL);
2798 if (err)
e3b4c4c4 2799 goto out_delete;
e3b4c4c4
ST
2800 }
2801
b0ee5983
CB
2802 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2803 netdev->ifindex);
82d5ae15 2804
6ab9ab6d 2805 return 0;
eb14c10a
DL
2806
2807out_delete:
b316d209
CB
2808 if (netdev->ifindex != 0)
2809 lxc_netdev_delete_by_name(veth1);
f10fad2f 2810 if (!netdev->priv.veth_attr.pair)
ad40563e 2811 free(veth1);
f10fad2f 2812 free(veth2);
6ab9ab6d 2813 return -1;
13954cce 2814}
d957ae2d 2815
74a2b586
JK
2816static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2817{
2818 char *veth1;
2819 int err;
2820
2821 if (netdev->priv.veth_attr.pair)
2822 veth1 = netdev->priv.veth_attr.pair;
2823 else
2824 veth1 = netdev->priv.veth_attr.veth1;
2825
2826 if (netdev->downscript) {
2827 err = run_script(handler->name, "net", netdev->downscript,
2828 "down", "veth", veth1, (char*) NULL);
2829 if (err)
2830 return -1;
2831 }
2832 return 0;
2833}
2834
a589434e 2835static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2836{
0e391e57 2837 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2838 int err;
d957ae2d
MT
2839
2840 if (!netdev->link) {
2841 ERROR("no link specified for macvlan netdev");
2842 return -1;
2843 }
13954cce 2844
9ba8130c
SH
2845 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2846 if (err >= sizeof(peerbuf))
2847 return -1;
82d5ae15 2848
a0265685 2849 peer = lxc_mkifname(peerbuf);
ad40563e 2850 if (!peer) {
82d5ae15
DL
2851 ERROR("failed to make a temporary name");
2852 return -1;
0ad19a3f 2853 }
2854
3cfc0f3a
MN
2855 err = lxc_macvlan_create(netdev->link, peer,
2856 netdev->priv.macvlan_attr.mode);
2857 if (err) {
2858 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2859 peer, netdev->link, strerror(-err));
ad40563e 2860 goto out;
0ad19a3f 2861 }
2862
82d5ae15
DL
2863 netdev->ifindex = if_nametoindex(peer);
2864 if (!netdev->ifindex) {
36eb9bde 2865 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2866 goto out;
22ebac19 2867 }
2868
e3b4c4c4 2869 if (netdev->upscript) {
751d9dcd
DL
2870 err = run_script(handler->name, "net", netdev->upscript, "up",
2871 "macvlan", netdev->link, (char*) NULL);
2872 if (err)
ad40563e 2873 goto out;
e3b4c4c4
ST
2874 }
2875
a589434e 2876 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2877 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2878
d957ae2d 2879 return 0;
ad40563e
ÇO
2880out:
2881 lxc_netdev_delete_by_name(peer);
2882 free(peer);
2883 return -1;
0ad19a3f 2884}
2885
74a2b586
JK
2886static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2887{
2888 int err;
2889
2890 if (netdev->downscript) {
2891 err = run_script(handler->name, "net", netdev->downscript,
2892 "down", "macvlan", netdev->link,
2893 (char*) NULL);
2894 if (err)
2895 return -1;
2896 }
2897 return 0;
2898}
2899
a589434e
JN
2900/* XXX: merge with instantiate_macvlan */
2901static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2902{
2903 char peer[IFNAMSIZ];
3cfc0f3a 2904 int err;
82f58d03 2905 static uint16_t vlan_cntr = 0;
b7b2fde4 2906 unsigned int mtu = 0;
26c39028
JHS
2907
2908 if (!netdev->link) {
2909 ERROR("no link specified for vlan netdev");
2910 return -1;
2911 }
2912
82f58d03 2913 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2914 if (err >= sizeof(peer)) {
2915 ERROR("peer name too long");
2916 return -1;
2917 }
26c39028 2918
3cfc0f3a
MN
2919 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2920 if (err) {
2921 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2922 peer, netdev->link, strerror(-err));
26c39028
JHS
2923 return -1;
2924 }
2925
2926 netdev->ifindex = if_nametoindex(peer);
2927 if (!netdev->ifindex) {
2928 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2929 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2930 return -1;
2931 }
2932
a589434e 2933 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 2934 netdev->ifindex);
b4fb7de1 2935 if (netdev->mtu) {
b7b2fde4
CB
2936 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2937 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2938 netdev->ifindex, netdev->name);
2939 return -1;
2940 }
2941 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
2942 if (err) {
2943 ERROR("failed to set mtu '%s' for %s : %s",
2944 netdev->mtu, peer, strerror(-err));
2945 lxc_netdev_delete_by_name(peer);
2946 return -1;
2947 }
2948 }
e892973e 2949
26c39028
JHS
2950 return 0;
2951}
2952
/* Nothing to do when shutting down a vlan device; always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2957
a589434e 2958static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2959{
6168e99f
DL
2960 if (!netdev->link) {
2961 ERROR("no link specified for the physical interface");
2962 return -1;
2963 }
2964
9d083402 2965 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 2966 if (!netdev->ifindex) {
9d083402 2967 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 2968 return -1;
2969 }
2970
e3b4c4c4
ST
2971 if (netdev->upscript) {
2972 int err;
751d9dcd
DL
2973 err = run_script(handler->name, "net", netdev->upscript,
2974 "up", "phys", netdev->link, (char*) NULL);
2975 if (err)
e3b4c4c4 2976 return -1;
e3b4c4c4
ST
2977 }
2978
82d5ae15 2979 return 0;
0ad19a3f 2980}
2981
74a2b586
JK
2982static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2983{
2984 int err;
2985
2986 if (netdev->downscript) {
2987 err = run_script(handler->name, "net", netdev->downscript,
2988 "down", "phys", netdev->link, (char*) NULL);
2989 if (err)
2990 return -1;
2991 }
2992 return 0;
2993}
2994
a589434e 2995static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
2996{
2997 netdev->ifindex = 0;
2998 return 0;
2999}
3000
a589434e 3001static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3002{
82d5ae15 3003 netdev->ifindex = 0;
e3b4c4c4
ST
3004 if (netdev->upscript) {
3005 int err;
751d9dcd
DL
3006 err = run_script(handler->name, "net", netdev->upscript,
3007 "up", "empty", (char*) NULL);
3008 if (err)
e3b4c4c4 3009 return -1;
e3b4c4c4 3010 }
82d5ae15 3011 return 0;
0ad19a3f 3012}
3013
74a2b586
JK
3014static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3015{
3016 int err;
3017
3018 if (netdev->downscript) {
3019 err = run_script(handler->name, "net", netdev->downscript,
3020 "down", "empty", (char*) NULL);
3021 if (err)
3022 return -1;
3023 }
3024 return 0;
3025}
3026
/* Nothing to do when shutting down a "none" network; always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3031
3032int lxc_requests_empty_network(struct lxc_handler *handler)
3033{
3034 struct lxc_list *network = &handler->conf->network;
3035 struct lxc_list *iterator;
3036 struct lxc_netdev *netdev;
3037 bool found_none = false, found_nic = false;
3038
3039 if (lxc_list_empty(network))
3040 return 0;
3041
3042 lxc_list_for_each(iterator, network) {
3043
3044 netdev = iterator->elem;
3045
3046 if (netdev->type == LXC_NET_NONE)
3047 found_none = true;
3048 else
3049 found_nic = true;
3050 }
3051 if (found_none && !found_nic)
3052 return 1;
3053 return 0;
3054}
3055
e337179a 3056int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
0ad19a3f 3057{
e337179a 3058 bool am_root;
82d5ae15 3059 struct lxc_netdev *netdev;
e337179a
CB
3060 struct lxc_list *iterator;
3061 struct lxc_list *network = &handler->conf->network;
cbef6c52 3062
e337179a
CB
3063 /* We need to be root. */
3064 am_root = (getuid() == 0);
cbef6c52
SH
3065 if (!am_root)
3066 return 0;
0ad19a3f 3067
5f4535a3 3068 lxc_list_for_each(iterator, network) {
5f4535a3 3069 netdev = iterator->elem;
13954cce 3070
e337179a
CB
3071 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3072 ERROR("invalid network configuration type '%d'",
3073 netdev->type);
56637458
CB
3074 return -1;
3075 }
3076
e337179a
CB
3077 if (netdev->type != LXC_NET_MACVLAN &&
3078 netdev->priv.macvlan_attr.mode) {
3079 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
56637458
CB
3080 return -1;
3081 }
3082
e337179a
CB
3083 if (netdev->type != LXC_NET_VETH &&
3084 netdev->priv.veth_attr.pair) {
3085 ERROR("Invalid veth pair for a non-veth netdev");
56637458
CB
3086 return -1;
3087 }
3088
e337179a
CB
3089 if (netdev->type != LXC_NET_VLAN &&
3090 netdev->priv.vlan_attr.vid > 0) {
3091 ERROR("Invalid vlan.id for a non-macvlan netdev");
82d5ae15
DL
3092 return -1;
3093 }
0ad19a3f 3094
e3b4c4c4 3095 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3096 ERROR("failed to create netdev");
3097 return -1;
3098 }
e3b4c4c4 3099
0ad19a3f 3100 }
3101
3102 return 0;
3103}
3104
358daf49 3105bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3106{
e97946ae 3107 int ret;
74a2b586 3108 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3109 struct lxc_list *iterator;
3110 struct lxc_netdev *netdev;
358daf49 3111 bool deleted_all = true;
7fef7a06
DL
3112
3113 lxc_list_for_each(iterator, network) {
3114 netdev = iterator->elem;
d472214b 3115
74a2b586 3116 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3117 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3118 WARN("Failed to rename interface with index %d "
3119 "to its initial name \"%s\".",
3120 netdev->ifindex, netdev->link);
d472214b 3121 continue;
d8f8e352 3122 }
d472214b 3123
74a2b586 3124 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3125 WARN("Failed to destroy netdev");
74a2b586
JK
3126 }
3127
d8f8e352
DL
3128 /* Recent kernel remove the virtual interfaces when the network
3129 * namespace is destroyed but in case we did not moved the
3130 * interface to the network namespace, we have to destroy it
3131 */
e97946ae
CB
3132 if (netdev->ifindex != 0) {
3133 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3134 if (-ret == ENODEV) {
3135 INFO("Interface \"%s\" with index %d already "
3136 "deleted or existing in different network "
3137 "namespace.",
3138 netdev->name ? netdev->name : "(null)",
3139 netdev->ifindex);
3140 } else if (ret < 0) {
3141 deleted_all = false;
3142 WARN("Failed to remove interface \"%s\" with "
3143 "index %d: %s.",
3144 netdev->name ? netdev->name : "(null)",
3145 netdev->ifindex, strerror(-ret));
3146 } else {
3147 INFO("Removed interface \"%s\" with index %d.",
3148 netdev->name ? netdev->name : "(null)",
3149 netdev->ifindex);
3150 }
e97946ae
CB
3151 }
3152
3153 /* Explicitly delete host veth device to prevent lingering
3154 * devices. We had issues in LXD around this.
3155 */
b316d209 3156 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3157 char *hostveth;
3158 if (netdev->priv.veth_attr.pair) {
e97946ae 3159 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3160 ret = lxc_netdev_delete_by_name(hostveth);
3161 if (ret < 0) {
3162 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3163 } else {
3164 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3165 }
3166 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3167 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3168 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3169 if (ret < 0) {
3170 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3171 } else {
3172 INFO("Removed interface \"%s\" from host.", hostveth);
3173 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3174 }
e97946ae
CB
3175 }
3176 }
7fef7a06 3177 }
358daf49
CB
3178
3179 return deleted_all;
7fef7a06
DL
3180}
3181
45e854dc
SG
3182#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3183
fe1f672f 3184/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3185#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3186static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3187 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3188{
3189 pid_t child;
a7242d9a
ÇO
3190 int bytes, pipefd[2];
3191 char *token, *saveptr = NULL;
fe1f672f 3192 char buffer[MAX_BUFFER_SIZE];
091045f8 3193 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3194
3195 if (netdev->type != LXC_NET_VETH) {
3196 ERROR("nic type %d not support for unprivileged use",
091045f8 3197 netdev->type);
cbef6c52
SH
3198 return -1;
3199 }
3200
091045f8 3201 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3202 SYSERROR("pipe failed");
3203 return -1;
3204 }
3205
091045f8
CB
3206 child = fork();
3207 if (child < 0) {
cbef6c52 3208 SYSERROR("fork");
a7242d9a
ÇO
3209 close(pipefd[0]);
3210 close(pipefd[1]);
3211 return -1;
3212 }
3213
3214 if (child == 0) { // child
091045f8
CB
3215 /* Call lxc-user-nic pid type bridge. */
3216 int ret;
3217 char pidstr[LXC_NUMSTRLEN64];
3218
3219 close(pipefd[0]); /* Close the read-end of the pipe. */
3220
3221 /* Redirect stdout to write-end of the pipe. */
3222 ret = dup2(pipefd[1], STDOUT_FILENO);
3223 close(pipefd[1]); /* Close the write-end of the pipe. */
3224 if (ret < 0) {
3225 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3226 exit(EXIT_FAILURE);
3227 }
a7242d9a 3228
091045f8 3229 if (netdev->link)
cff7b5eb 3230 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3231 else
cff7b5eb 3232 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3233
3234 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3235 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3236 exit(EXIT_FAILURE);
3237 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3238
3239 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3240 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3241 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3242 pidstr, "veth", netdev_link, netdev->name, NULL);
3243
3244 SYSERROR("Failed to exec lxc-user-nic.");
3245 exit(EXIT_FAILURE);
a7242d9a
ÇO
3246 }
3247
3248 /* close the write-end of the pipe */
3249 close(pipefd[1]);
3250
fe1f672f 3251 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3252 if (bytes < 0)
3253 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3254 buffer[bytes - 1] = '\0';
3255
3256 if (wait_for_pid(child) != 0) {
3257 close(pipefd[0]);
cbef6c52
SH
3258 return -1;
3259 }
3260
a7242d9a
ÇO
3261 /* close the read-end of the pipe */
3262 close(pipefd[0]);
cbef6c52 3263
a7242d9a
ÇO
3264 /* fill netdev->name field */
3265 token = strtok_r(buffer, ":", &saveptr);
3266 if (!token)
3267 return -1;
091045f8
CB
3268
3269 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3270 if (!netdev->name) {
091045f8 3271 SYSERROR("Failed to allocate memory.");
658979c5
SH
3272 return -1;
3273 }
091045f8 3274 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3275 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3276
3277 /* fill netdev->veth_attr.pair field */
3278 token = strtok_r(NULL, ":", &saveptr);
3279 if (!token)
3280 return -1;
091045f8 3281
a7242d9a 3282 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3283 if (!netdev->priv.veth_attr.pair) {
091045f8 3284 ERROR("Failed to allocate memory.");
658979c5
SH
3285 return -1;
3286 }
45e854dc 3287
a7242d9a 3288 return 0;
cbef6c52
SH
3289}
3290
c43cbc04
SH
3291int lxc_assign_network(const char *lxcpath, char *lxcname,
3292 struct lxc_list *network, pid_t pid)
0ad19a3f 3293{
82d5ae15 3294 struct lxc_list *iterator;
82d5ae15 3295 struct lxc_netdev *netdev;
f2e206ff 3296 char ifname[IFNAMSIZ];
cbef6c52 3297 int am_root = (getuid() == 0);
3cfc0f3a 3298 int err;
0ad19a3f 3299
5f4535a3 3300 lxc_list_for_each(iterator, network) {
82d5ae15 3301
5f4535a3 3302 netdev = iterator->elem;
82d5ae15 3303
fbb16259 3304 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3305 if (netdev->mtu)
3306 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3307 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3308 return -1;
e337179a
CB
3309 /* lxc-user-nic has moved the nic to the new ns.
3310 * unpriv_assign_nic() fills in netdev->name.
3311 * netdev->ifindex will be filed in at
3312 * lxc_setup_netdev_in_child_namespaces.
3313 */
cbef6c52
SH
3314 continue;
3315 }
236087a6 3316
fbb16259
SH
3317 /* empty network namespace, nothing to move */
3318 if (!netdev->ifindex)
3319 continue;
3320
f2e206ff 3321 /* retrieve the name of the interface */
3322 if (!if_indextoname(netdev->ifindex, ifname)) {
3323 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3324 return -1;
3325 }
3326
3327 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3328 if (err) {
3329 ERROR("failed to move '%s' to the container : %s",
3330 netdev->link, strerror(-err));
82d5ae15
DL
3331 return -1;
3332 }
3333
198cbbaa 3334 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3335 }
3336
3337 return 0;
3338}
3339
251d0d2a
DE
3340static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3341 size_t buf_size)
f6d3e3e4 3342{
29053180
CB
3343 char path[MAXPATHLEN];
3344 int fd, ret;
f6d3e3e4 3345
29053180
CB
3346 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3347 idtype == ID_TYPE_UID ? 'u' : 'g');
3348 if (ret < 0 || ret >= MAXPATHLEN) {
3349 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3350 return -E2BIG;
3351 }
29053180
CB
3352
3353 fd = open(path, O_WRONLY);
3354 if (fd < 0) {
3355 SYSERROR("failed to open \"%s\"", path);
3356 return -1;
f6d3e3e4 3357 }
29053180
CB
3358
3359 errno = 0;
3360 ret = lxc_write_nointr(fd, buf, buf_size);
3361 if (ret != buf_size) {
3362 SYSERROR("failed to write %cid mapping to \"%s\"",
3363 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3364 close(fd);
3365 return -1;
3366 }
3367 close(fd);
3368
3369 return 0;
f6d3e3e4
SH
3370}
3371
6e50e704
CB
3372/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3373 *
3374 * @return 1 if functional binary was found
3375 * @return 0 if binary exists but is lacking privilege
3376 * @return -ENOENT if binary does not exist
3377 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3378 *
3379 */
df6a2945
CB
3380static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3381{
3382 char *path;
3383 int ret;
3384 struct stat st;
3385 int fret = 0;
3386
6e50e704
CB
3387 if (cap != CAP_SETUID && cap != CAP_SETGID)
3388 return -EINVAL;
3389
df6a2945
CB
3390 path = on_path(binary, NULL);
3391 if (!path)
3392 return -ENOENT;
3393
3394 ret = stat(path, &st);
3395 if (ret < 0) {
3396 fret = -errno;
3397 goto cleanup;
3398 }
3399
3400 /* Check if the binary is setuid. */
3401 if (st.st_mode & S_ISUID) {
3402 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3403 fret = 1;
3404 goto cleanup;
3405 }
3406
69924fff 3407 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3408 /* Check if it has the CAP_SETUID capability. */
3409 if ((cap & CAP_SETUID) &&
3410 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3411 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3412 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3413 "and CAP_PERMITTED sets.", path);
3414 fret = 1;
3415 goto cleanup;
3416 }
3417
3418 /* Check if it has the CAP_SETGID capability. */
3419 if ((cap & CAP_SETGID) &&
3420 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3421 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3422 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3423 "and CAP_PERMITTED sets.", path);
3424 fret = 1;
3425 goto cleanup;
3426 }
d6018f88 3427 #else
69924fff
CB
3428 /* If we cannot check for file capabilities we need to give the benefit
3429 * of the doubt. Otherwise we might fail even though all the necessary
3430 * file capabilities are set.
3431 */
d6018f88
CB
3432 DEBUG("Cannot check for file capabilites as full capability support is "
3433 "missing. Manual intervention needed.");
3434 fret = 1;
df6a2945
CB
3435 #endif
3436
3437cleanup:
3438 free(path);
3439 return fret;
3440}
3441
986ef930
CB
/* Replace the current process with "/bin/sh -c <args>", where @args is the
 * command line as a C string. Only returns (with -1) if the exec fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
3447
f6d3e3e4
SH
3448int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3449{
f6d3e3e4 3450 struct id_map *map;
4bc3b759 3451 struct lxc_list *iterator;
251d0d2a 3452 enum idtype type;
986ef930 3453 char u_or_g;
4bc3b759 3454 char *pos;
99d43365 3455 int fill, left;
986ef930
CB
3456 char cmd_output[MAXPATHLEN];
3457 /* strlen("new@idmap") = 9
3458 * +
3459 * strlen(" ") = 1
3460 * +
3461 * LXC_NUMSTRLEN64
3462 * +
3463 * strlen(" ") = 1
3464 *
3465 * We add some additional space to make sure that we really have
3466 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3467 */
3468 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3469 int ret = 0, uidmap = 0, gidmap = 0;
3470 bool use_shadow = false, had_entry = false;
df6a2945
CB
3471
3472 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3473 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3474 * will protected it by preventing another user from being handed the
3475 * range by shadow.
3476 */
df6a2945 3477 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
3478 if (uidmap == -ENOENT)
3479 WARN("newuidmap binary is missing");
3480 else if (!uidmap)
3481 WARN("newuidmap is lacking necessary privileges");
3482
df6a2945 3483 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
3484 if (gidmap == -ENOENT)
3485 WARN("newgidmap binary is missing");
3486 else if (!gidmap)
3487 WARN("newgidmap is lacking necessary privileges");
3488
df6a2945
CB
3489 if (uidmap > 0 && gidmap > 0) {
3490 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3491 use_shadow = true;
df6a2945 3492 } else {
99d43365
CB
3493 /* In case unprivileged users run application containers via
3494 * execute() or a start*() there are valid cases where they may
3495 * only want to map their own {g,u}id. Let's not block them from
3496 * doing so by requiring geteuid() == 0.
3497 */
3498 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3499 "write directly with euid %d.", geteuid());
0e6e3a41 3500 }
251d0d2a 3501
986ef930
CB
3502 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3503 type++, u_or_g = 'g') {
3504 pos = mapbuf;
3505
0e6e3a41 3506 if (use_shadow)
986ef930 3507 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3508
cf3ef16d 3509 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3510 /* The kernel only takes <= 4k for writes to
3511 * /proc/<nr>/[ug]id_map
3512 */
251d0d2a 3513 map = iterator->elem;
cf3ef16d
SH
3514 if (map->idtype != type)
3515 continue;
3516
4bc3b759
CB
3517 had_entry = true;
3518
986ef930 3519 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3520 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3521 use_shadow ? " " : "", map->nsid,
3522 map->hostid, map->range,
0e6e3a41 3523 use_shadow ? "" : "\n");
cf3ef16d 3524 if (fill <= 0 || fill >= left)
4bc3b759
CB
3525 SYSERROR("Too many {g,u}id mappings defined.");
3526
cf3ef16d 3527 pos += fill;
251d0d2a 3528 }
cf3ef16d 3529 if (!had_entry)
4f7521b4 3530 continue;
cf3ef16d 3531
986ef930
CB
3532 /* Try to catch the ouput of new{g,u}idmap to make debugging
3533 * easier.
3534 */
3535 if (use_shadow) {
3536 ret = run_command(cmd_output, sizeof(cmd_output),
3537 lxc_map_ids_exec_wrapper,
3538 (void *)mapbuf);
3539 if (ret < 0) {
3540 ERROR("new%cidmap failed to write mapping: %s",
3541 u_or_g, cmd_output);
3542 return -1;
3543 }
d1838f34 3544 } else {
986ef930
CB
3545 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3546 if (ret < 0)
3547 return -1;
d1838f34 3548 }
986ef930
CB
3549
3550 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3551 }
251d0d2a 3552
986ef930 3553 return 0;
f6d3e3e4
SH
3554}
3555
cf3ef16d 3556/*
7b50c609
TS
3557 * return the host uid/gid to which the container root is mapped in
3558 * *val.
0b3a6504 3559 * Return true if id was found, false otherwise.
cf3ef16d 3560 */
2a9a80cb 3561bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3562 unsigned long *val)
cf3ef16d
SH
3563{
3564 struct lxc_list *it;
3565 struct id_map *map;
3566
3567 lxc_list_for_each(it, &conf->id_map) {
3568 map = it->elem;
7b50c609 3569 if (map->idtype != idtype)
cf3ef16d
SH
3570 continue;
3571 if (map->nsid != 0)
3572 continue;
2a9a80cb
SH
3573 *val = map->hostid;
3574 return true;
cf3ef16d 3575 }
2a9a80cb 3576 return false;
cf3ef16d
SH
3577}
3578
2133f58c 3579int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3580{
3581 struct lxc_list *it;
3582 struct id_map *map;
3583 lxc_list_for_each(it, &conf->id_map) {
3584 map = it->elem;
2133f58c 3585 if (map->idtype != idtype)
cf3ef16d
SH
3586 continue;
3587 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3588 return (id - map->hostid) + map->nsid;
cf3ef16d 3589 }
57d116ab 3590 return -1;
cf3ef16d
SH
3591}
3592
339efad9 3593int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3594{
3595 struct lxc_list *it;
3596 struct id_map *map;
2133f58c 3597 unsigned int freeid = 0;
cf3ef16d
SH
3598again:
3599 lxc_list_for_each(it, &conf->id_map) {
3600 map = it->elem;
2133f58c 3601 if (map->idtype != idtype)
cf3ef16d
SH
3602 continue;
3603 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3604 freeid = map->nsid + map->range;
3605 goto again;
3606 }
3607 }
3608 return freeid;
3609}
3610
19a26f82
MK
3611int lxc_find_gateway_addresses(struct lxc_handler *handler)
3612{
3613 struct lxc_list *network = &handler->conf->network;
3614 struct lxc_list *iterator;
3615 struct lxc_netdev *netdev;
3616 int link_index;
3617
3618 lxc_list_for_each(iterator, network) {
3619 netdev = iterator->elem;
3620
3621 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3622 continue;
3623
3624 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3625 ERROR("gateway = auto only supported for "
3626 "veth and macvlan");
3627 return -1;
3628 }
3629
3630 if (!netdev->link) {
3631 ERROR("gateway = auto needs a link interface");
3632 return -1;
3633 }
3634
3635 link_index = if_nametoindex(netdev->link);
3636 if (!link_index)
3637 return -EINVAL;
3638
3639 if (netdev->ipv4_gateway_auto) {
3640 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3641 ERROR("failed to automatically find ipv4 gateway "
3642 "address from link interface '%s'", netdev->link);
3643 return -1;
3644 }
3645 }
3646
3647 if (netdev->ipv6_gateway_auto) {
3648 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3649 ERROR("failed to automatically find ipv6 gateway "
3650 "address from link interface '%s'", netdev->link);
3651 return -1;
3652 }
3653 }
3654 }
3655
3656 return 0;
3657}
3658
5e4a62bf 3659int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3660{
5e4a62bf 3661 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3662 int i, ret;
b0a33c1e 3663
5e4a62bf
DL
3664 /* no tty in the configuration */
3665 if (!conf->tty)
b0a33c1e 3666 return 0;
3667
9e1045e3 3668 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3669 if (!tty_info->pty_info) {
9e1045e3
CB
3670 SYSERROR("failed to allocate struct *pty_info");
3671 return -ENOMEM;
b0a33c1e 3672 }
3673
985d15b1 3674 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3675 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3676
025ed0f3
SH
3677 process_lock();
3678 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3679 pty_info->name, NULL, NULL);
025ed0f3
SH
3680 process_unlock();
3681 if (ret) {
9e1045e3 3682 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3683 tty_info->nbtty = i;
3684 lxc_delete_tty(tty_info);
9e1045e3 3685 return -ENOTTY;
b0a33c1e 3686 }
3687
9e1045e3 3688 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3689 pty_info->name, pty_info->master, pty_info->slave);
3690
3ec1648d 3691 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3692 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3693 if (ret < 0)
3694 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3695 "pty device \"%s\": %s",
3696 pty_info->master, pty_info->name, strerror(errno));
3697
3698 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3699 if (ret < 0)
3700 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3701 "pty device \"%s\": %s",
3702 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3703
b0a33c1e 3704 pty_info->busy = 0;
3705 }
3706
985d15b1 3707 tty_info->nbtty = conf->tty;
1ac470c0 3708
9e1045e3 3709 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3710 return 0;
b0a33c1e 3711}
3712
3713void lxc_delete_tty(struct lxc_tty_info *tty_info)
3714{
3715 int i;
3716
3717 for (i = 0; i < tty_info->nbtty; i++) {
3718 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3719
3720 close(pty_info->master);
3721 close(pty_info->slave);
3722 }
3723
3724 free(tty_info->pty_info);
e00c0242 3725 tty_info->pty_info = NULL;
b0a33c1e 3726 tty_info->nbtty = 0;
3727}
3728
f4f52cb5
CB
3729
/* Replace the current process with lxc-usernsexec, looked up on PATH, using
 * @args as its NULL-terminated argument vector. Only returns (with -1) if the
 * exec fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
3735
f6d3e3e4 3736/*
7b50c609
TS
3737 * chown_mapped_root: for an unprivileged user with uid/gid X to
3738 * chown a dir to subuid/subgid Y, he needs to run chown as root
3739 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3740 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3741 * root is privileged with respect to hostuid/hostgid X, allowing
3742 * him to do the chown.
f6d3e3e4 3743 */
c4d10a05 3744int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3745{
f4f52cb5 3746 uid_t rootuid, rootgid;
2a9a80cb 3747 unsigned long val;
a7ef8753 3748 char *chownpath = path;
f4f52cb5
CB
3749 int hostuid, hostgid, ret;
3750 struct stat sb;
3751 char map1[100], map2[100], map3[100], map4[100], map5[100];
3752 char ugid[100];
3753 char *args1[] = {"lxc-usernsexec",
3754 "-m", map1,
3755 "-m", map2,
3756 "-m", map3,
3757 "-m", map5,
3758 "--", "chown", ugid, path,
3759 NULL};
3760 char *args2[] = {"lxc-usernsexec",
3761 "-m", map1,
3762 "-m", map2,
3763 "-m", map3,
3764 "-m", map4,
3765 "-m", map5,
3766 "--", "chown", ugid, path,
3767 NULL};
3768 char cmd_output[MAXPATHLEN];
3769
3770 hostuid = geteuid();
3771 hostgid = getegid();
f6d3e3e4 3772
2a9a80cb 3773 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3774 ERROR("No uid mapping for container root");
c4d10a05 3775 return -1;
f6d3e3e4 3776 }
f4f52cb5 3777 rootuid = (uid_t)val;
7b50c609 3778 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3779 ERROR("No gid mapping for container root");
7b50c609
TS
3780 return -1;
3781 }
f4f52cb5 3782 rootgid = (gid_t)val;
2a9a80cb 3783
a7ef8753 3784 /*
f4f52cb5 3785 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3786 */
1f92162d 3787 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3788 chownpath = strchr(path, ':');
3789 if (!chownpath) {
3790 ERROR("Bad overlay path: %s", path);
3791 return -1;
3792 }
f4f52cb5 3793 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3794 if (!chownpath) {
3795 ERROR("Bad overlay path: %s", path);
3796 return -1;
3797 }
3798 chownpath++;
3799 }
3800 path = chownpath;
f4f52cb5 3801 if (hostuid == 0) {
7b50c609 3802 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3803 ERROR("Error chowning %s", path);
3804 return -1;
3805 }
3806 return 0;
3807 }
f3d7e4ca 3808
f4f52cb5 3809 if (rootuid == hostuid) {
f3d7e4ca 3810 // nothing to do
b103ceac 3811 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3812 return 0;
3813 }
3814
bbdbf8f0 3815 /* save the current gid of "path" */
f4f52cb5
CB
3816 if (stat(path, &sb) < 0) {
3817 ERROR("Error stat %s", path);
f6d3e3e4
SH
3818 return -1;
3819 }
7b50c609 3820
bbdbf8f0
CB
3821 /* Update the path argument in case this was overlayfs. */
3822 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3823 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3824
f4f52cb5
CB
3825 /*
3826 * A file has to be group-owned by a gid mapped into the
3827 * container, or the container won't be privileged over it.
3828 */
3829 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3830 if (sb.st_uid == hostuid &&
3831 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3832 chown(path, -1, hostgid) < 0) {
3833 ERROR("Failed chgrping %s", path);
3834 return -1;
3835 }
f6d3e3e4 3836
f4f52cb5
CB
3837 // "u:0:rootuid:1"
3838 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3839 if (ret < 0 || ret >= 100) {
3840 ERROR("Error uid printing map string");
3841 return -1;
3842 }
7b50c609 3843
f4f52cb5
CB
3844 // "u:hostuid:hostuid:1"
3845 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3846 if (ret < 0 || ret >= 100) {
3847 ERROR("Error uid printing map string");
3848 return -1;
3849 }
c4d10a05 3850
f4f52cb5
CB
3851 // "g:0:rootgid:1"
3852 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3853 if (ret < 0 || ret >= 100) {
3854 ERROR("Error gid printing map string");
3855 return -1;
3856 }
98e5ba51 3857
f4f52cb5
CB
3858 // "g:pathgid:rootgid+pathgid:1"
3859 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3860 rootgid + (gid_t)sb.st_gid);
3861 if (ret < 0 || ret >= 100) {
3862 ERROR("Error gid printing map string");
3863 return -1;
3864 }
c4d10a05 3865
f4f52cb5
CB
3866 // "g:hostgid:hostgid:1"
3867 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3868 if (ret < 0 || ret >= 100) {
3869 ERROR("Error gid printing map string");
3870 return -1;
3871 }
7b50c609 3872
f4f52cb5
CB
3873 // "0:pathgid" (chown)
3874 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3875 if (ret < 0 || ret >= 100) {
3876 ERROR("Error owner printing format string for chown");
3877 return -1;
3878 }
7b50c609 3879
f4f52cb5
CB
3880 if (hostgid == sb.st_gid)
3881 ret = run_command(cmd_output, sizeof(cmd_output),
3882 chown_mapped_root_exec_wrapper,
3883 (void *)args1);
3884 else
3885 ret = run_command(cmd_output, sizeof(cmd_output),
3886 chown_mapped_root_exec_wrapper,
3887 (void *)args2);
3888 if (ret < 0)
3889 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3890
f4f52cb5 3891 return ret;
f6d3e3e4
SH
3892}
3893
54117de5 3894int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3895{
c4d10a05 3896 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3897 return 0;
c4d10a05 3898
54117de5
CB
3899 if (!strcmp(c->console.name, ""))
3900 return 0;
3901
3902 if (chown_mapped_root(c->console.name, c) < 0) {
3903 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
3904 return -1;
3905 }
3906
54117de5
CB
3907 TRACE("chowned console \"%s\"", c->console.name);
3908
f6d3e3e4
SH
3909 return 0;
3910}
3911
943144d9
CB
3912/* NOTE: Must not be called from inside the container namespace! */
3913int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3914{
3915 int mounted;
3916
943144d9 3917 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3918 if (mounted == -1) {
943144d9 3919 SYSERROR("failed to mount /proc in the container");
01958b1f 3920 /* continue only if there is no rootfs */
943144d9 3921 if (conf->rootfs.path)
01958b1f 3922 return -1;
5112cd70 3923 } else if (mounted == 1) {
943144d9 3924 conf->tmp_umount_proc = 1;
5112cd70 3925 }
943144d9 3926
5112cd70
SH
3927 return 0;
3928}
3929
3930void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3931{
3932 if (lxc_conf->tmp_umount_proc == 1) {
3933 umount("/proc");
3934 lxc_conf->tmp_umount_proc = 0;
3935 }
3936}
3937
/* Walk /proc/self/mountinfo and remount with MS_SLAVE every mount whose
 * option field contains "shared". Failures are logged and do not stop the
 * walk.
 */
void remount_all_slave(void)
{
	char *buf = NULL;
	size_t buflen = 0;
	FILE *mounts;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &buflen, mounts) != -1) {
		char *mntpoint, *optfield;

		mntpoint = get_field(buf, 4);
		if (!mntpoint)
			continue;

		optfield = get_field(mntpoint, 2);
		if (!optfield)
			continue;

		null_endofword(optfield);
		if (strstr(optfield, "shared") == NULL)
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mounts);
	free(buf);
}
3971
2322903b
SH
3972void lxc_execute_bind_init(struct lxc_conf *conf)
3973{
3974 int ret;
9d9c111c
SH
3975 char path[PATH_MAX], destpath[PATH_MAX], *p;
3976
3977 /* If init exists in the container, don't bind mount a static one */
3978 p = choose_init(conf->rootfs.mount);
3979 if (p) {
3980 free(p);
3981 return;
3982 }
2322903b
SH
3983
3984 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3985 if (ret < 0 || ret >= PATH_MAX) {
3986 WARN("Path name too long searching for lxc.init.static");
3987 return;
3988 }
3989
3990 if (!file_exists(path)) {
3991 INFO("%s does not exist on host", path);
3992 return;
3993 }
3994
3995 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3996 if (ret < 0 || ret >= PATH_MAX) {
3997 WARN("Path name too long for container's lxc.init.static");
3998 return;
3999 }
4000
4001 if (!file_exists(destpath)) {
4002 FILE * pathfile = fopen(destpath, "wb");
4003 if (!pathfile) {
4004 SYSERROR("Failed to create mount target '%s'", destpath);
4005 return;
4006 }
4007 fclose(pathfile);
4008 }
4009
592fd47a 4010 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
4011 if (ret < 0)
4012 SYSERROR("Failed to bind lxc.init.static into container");
4013 INFO("lxc.init.static bound into container at %s", path);
4014}
4015
35120d9c
SH
4016/*
4017 * This does the work of remounting / if it is shared, calling the
4018 * container pre-mount hooks, and mounting the rootfs.
4019 */
4020int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 4021{
35120d9c
SH
4022 if (conf->rootfs_setup) {
4023 /*
4024 * rootfs was set up in another namespace. bind-mount it
4025 * to give us a mount in our own ns so we can pivot_root to it
4026 */
4027 const char *path = conf->rootfs.mount;
4028 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4029 ERROR("Failed to bind-mount container / onto itself");
145832ba 4030 return -1;
35120d9c 4031 }
145832ba 4032 return 0;
35120d9c 4033 }
d4ef7c50 4034
e995d7a2
SH
4035 remount_all_slave();
4036
35120d9c
SH
4037 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4038 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4039 return -1;
4040 }
4041
9aa76a17 4042 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
4043 ERROR("failed to setup rootfs for '%s'", name);
4044 return -1;
4045 }
4046
4047 conf->rootfs_setup = true;
4048 return 0;
4049}
4050
1c1c7051
SH
4051static bool verify_start_hooks(struct lxc_conf *conf)
4052{
4053 struct lxc_list *it;
4054 char path[MAXPATHLEN];
4055 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4056 char *hookname = it->elem;
4057 struct stat st;
4058 int ret;
4059
4060 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4061 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4062 if (ret < 0 || ret >= MAXPATHLEN)
4063 return false;
4064 ret = stat(path, &st);
4065 if (ret) {
7b6753e7 4066 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4067 hookname);
4068 return false;
4069 }
6a0c909a 4070 return true;
1c1c7051
SH
4071 }
4072
4073 return true;
4074}
4075
ae467c54 4076static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 4077{
ae467c54
CB
4078 int i;
4079 int *ttyfds;
4080 struct lxc_pty_info *pty_info;
e8bd4e43
SH
4081 struct lxc_conf *conf = handler->conf;
4082 const struct lxc_tty_info *tty_info = &conf->tty_info;
e8bd4e43 4083 int sock = handler->ttysock[0];
ae467c54
CB
4084 int ret = -1;
4085 size_t num_ttyfds = (2 * conf->tty);
e8bd4e43 4086
ae467c54
CB
4087 ttyfds = malloc(num_ttyfds * sizeof(int));
4088 if (!ttyfds)
4089 return -1;
4090
4091 for (i = 0; i < num_ttyfds; i++) {
4092 pty_info = &tty_info->pty_info[i / 2];
4093 ttyfds[i++] = pty_info->slave;
4094 ttyfds[i] = pty_info->master;
4095 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
f07fa8df
CB
4096 "parent",
4097 pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
4098 }
4099
ae467c54
CB
4100 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4101 if (ret < 0)
4102 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4103 strerror(errno));
4104 else
4105 TRACE("sent %d ttys to parent", conf->tty);
4106
e8bd4e43
SH
4107 close(handler->ttysock[0]);
4108 close(handler->ttysock[1]);
4109
ae467c54
CB
4110 for (i = 0; i < num_ttyfds; i++)
4111 close(ttyfds[i]);
e8bd4e43 4112
ae467c54
CB
4113 free(ttyfds);
4114
4115 return ret;
e8bd4e43
SH
4116}
4117
35120d9c
SH
4118int lxc_setup(struct lxc_handler *handler)
4119{
4120 const char *name = handler->name;
4121 struct lxc_conf *lxc_conf = handler->conf;
4122 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4123
4124 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4125 ERROR("Error setting up rootfs mount after spawn");
4126 return -1;
4127 }
4128
6c544cb3
MM
4129 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4130 if (setup_utsname(lxc_conf->utsname)) {
4131 ERROR("failed to setup the utsname for '%s'", name);
4132 return -1;
4133 }
0ad19a3f 4134 }
4135
e337179a
CB
4136 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4137 &lxc_conf->network)) {
36eb9bde 4138 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4139 return -1;
0ad19a3f 4140 }
4141
bc6928ff 4142 if (lxc_conf->autodev > 0) {
14221cbb 4143 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4144 ERROR("failed to mount /dev in the container");
c6883f38
SH
4145 return -1;
4146 }
4147 }
4148
368bbc02
CS
4149 /* do automatic mounts (mainly /proc and /sys), but exclude
4150 * those that need to wait until other stuff has finished
4151 */
4fb3cba5 4152 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4153 ERROR("failed to setup the automatic mounts for '%s'", name);
4154 return -1;
4155 }
4156
0a2dddd4 4157 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4158 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4159 return -1;
576f946d 4160 }
4161
0a2dddd4 4162 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4163 ERROR("failed to setup the mount entries for '%s'", name);
4164 return -1;
4165 }
4166
7b6753e7 4167 /* Make sure any start hooks are in the container */
1c1c7051
SH
4168 if (!verify_start_hooks(lxc_conf))
4169 return -1;
4170
2322903b
SH
4171 if (lxc_conf->is_execute)
4172 lxc_execute_bind_init(lxc_conf);
4173
368bbc02
CS
4174 /* now mount only cgroup, if wanted;
4175 * before, /sys could not have been mounted
4176 * (is either mounted automatically or via fstab entries)
4177 */
4fb3cba5 4178 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4179 ERROR("failed to setup the automatic mounts for '%s'", name);
4180 return -1;
4181 }
4182
283678ed 4183 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4184 ERROR("failed to run mount hooks for container '%s'.", name);
4185 return -1;
4186 }
4187
bc6928ff 4188 if (lxc_conf->autodev > 0) {
283678ed 4189 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4190 ERROR("failed to run autodev hooks for container '%s'.", name);
4191 return -1;
4192 }
27245ff7 4193 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4194 ERROR("failed to populate /dev in the container");
4195 return -1;
4196 }
4197 }
368bbc02 4198
3d7d929a 4199 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4200 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4201 return -1;
6e590161 4202 }
4203
69aa6655
DE
4204 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4205 ERROR("failed to setup /dev symlinks for '%s'", name);
4206 return -1;
4207 }
4208
5112cd70 4209 /* mount /proc if it's not already there */
943144d9 4210 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4211 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4212 return -1;
e075f5d9 4213 }
e075f5d9 4214
ac778708 4215 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4216 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4217 return -1;
ed502555 4218 }
4219
70761e5e 4220 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4221 ERROR("failed to setup the new pts instance");
95b5ffaf 4222 return -1;
3c26f34e 4223 }
4224
e8bd4e43
SH
4225 if (lxc_create_tty(name, lxc_conf)) {
4226 ERROR("failed to create the ttys");
4227 return -1;
4228 }
4229
ae467c54 4230 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
4231 ERROR("failure sending console info to parent");
4232 return -1;
4233 }
4234
9e1045e3 4235 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4236 ERROR("failed to setup the ttys for '%s'", name);
4237 return -1;
4238 }
4239
4240 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4241 SYSERROR("failed to set environment variable for container ptys");
4242
4243
cccc74b5
DL
4244 if (setup_personality(lxc_conf->personality)) {
4245 ERROR("failed to setup personality");
4246 return -1;
4247 }
4248
97a8f74f
SG
4249 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4250 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4251 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4252 return -1;
4253 }
97a8f74f
SG
4254 if (dropcaps_except(&lxc_conf->keepcaps)) {
4255 ERROR("failed to keep requested caps");
4256 return -1;
4257 }
4258 } else if (setup_caps(&lxc_conf->caps)) {
4259 ERROR("failed to drop capabilities");
4260 return -1;
81810dd1
DL
4261 }
4262
f4152036 4263 NOTICE("Container \"%s\" is set up", name);
cd54d859 4264
0ad19a3f 4265 return 0;
4266}
26ddeedd 4267
283678ed
SH
4268int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4269 const char *lxcpath, char *argv[])
26ddeedd
SH
4270{
4271 int which = -1;
4272 struct lxc_list *it;
4273
4274 if (strcmp(hook, "pre-start") == 0)
4275 which = LXCHOOK_PRESTART;
5ea6163a
SH
4276 else if (strcmp(hook, "pre-mount") == 0)
4277 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4278 else if (strcmp(hook, "mount") == 0)
4279 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4280 else if (strcmp(hook, "autodev") == 0)
4281 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4282 else if (strcmp(hook, "start") == 0)
4283 which = LXCHOOK_START;
52492063
WB
4284 else if (strcmp(hook, "stop") == 0)
4285 which = LXCHOOK_STOP;
26ddeedd
SH
4286 else if (strcmp(hook, "post-stop") == 0)
4287 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4288 else if (strcmp(hook, "clone") == 0)
4289 which = LXCHOOK_CLONE;
37cf711b
SY
4290 else if (strcmp(hook, "destroy") == 0)
4291 which = LXCHOOK_DESTROY;
26ddeedd
SH
4292 else
4293 return -1;
4294 lxc_list_for_each(it, &conf->hooks[which]) {
4295 int ret;
4296 char *hookname = it->elem;
283678ed 4297 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4298 if (ret)
4299 return ret;
4300 }
4301 return 0;
4302}
72d0e1cb 4303
72d0e1cb
SG
4304int lxc_clear_config_caps(struct lxc_conf *c)
4305{
9ebb03ad 4306 struct lxc_list *it,*next;
72d0e1cb 4307
9ebb03ad 4308 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4309 lxc_list_del(it);
4310 free(it->elem);
4311 free(it);
4312 }
4313 return 0;
4314}
4315
74a3920a 4316static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4317 struct lxc_list *it, *next;
4318
4355ab5f 4319 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4320 lxc_list_del(it);
4321 free(it->elem);
4322 free(it);
4323 }
4324 return 0;
4325}
4326
4355ab5f
SH
4327int lxc_clear_idmaps(struct lxc_conf *c)
4328{
4329 return lxc_free_idmap(&c->id_map);
4330}
4331
1fb86a7c
SH
4332int lxc_clear_config_keepcaps(struct lxc_conf *c)
4333{
4334 struct lxc_list *it,*next;
4335
4336 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4337 lxc_list_del(it);
4338 free(it->elem);
4339 free(it);
4340 }
4341 return 0;
4342}
4343
12a50cc6 4344int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4345{
9ebb03ad 4346 struct lxc_list *it,*next;
72d0e1cb 4347 bool all = false;
a6390f01 4348 const char *k = NULL;
72d0e1cb
SG
4349
4350 if (strcmp(key, "lxc.cgroup") == 0)
4351 all = true;
a6390f01
WB
4352 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4353 k = key + sizeof("lxc.cgroup.")-1;
4354 else
4355 return -1;
72d0e1cb 4356
9ebb03ad 4357 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4358 struct lxc_cgroup *cg = it->elem;
4359 if (!all && strcmp(cg->subsystem, k) != 0)
4360 continue;
4361 lxc_list_del(it);
4362 free(cg->subsystem);
4363 free(cg->value);
4364 free(cg);
4365 free(it);
4366 }
4367 return 0;
4368}
4369
c6d09e15
WB
4370int lxc_clear_limits(struct lxc_conf *c, const char *key)
4371{
4372 struct lxc_list *it, *next;
4373 bool all = false;
4374 const char *k = NULL;
4375
240d4b74 4376 if (strcmp(key, "lxc.limit") == 0
4377 || strcmp(key, "lxc.prlimit"))
c6d09e15
WB
4378 all = true;
4379 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4380 k = key + sizeof("lxc.limit.")-1;
240d4b74 4381 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4382 k = key + sizeof("lxc.prlimit.")-1;
c6d09e15
WB
4383 else
4384 return -1;
4385
4386 lxc_list_for_each_safe(it, &c->limits, next) {
4387 struct lxc_limit *lim = it->elem;
4388 if (!all && strcmp(lim->resource, k) != 0)
4389 continue;
4390 lxc_list_del(it);
4391 free(lim->resource);
4392 free(lim);
4393 free(it);
4394 }
4395 return 0;
4396}
4397
ee1e7aa0
SG
4398int lxc_clear_groups(struct lxc_conf *c)
4399{
4400 struct lxc_list *it,*next;
4401
4402 lxc_list_for_each_safe(it, &c->groups, next) {
4403 lxc_list_del(it);
4404 free(it->elem);
4405 free(it);
4406 }
4407 return 0;
4408}
4409
ab799c0b
SG
4410int lxc_clear_environment(struct lxc_conf *c)
4411{
4412 struct lxc_list *it,*next;
4413
4414 lxc_list_for_each_safe(it, &c->environment, next) {
4415 lxc_list_del(it);
4416 free(it->elem);
4417 free(it);
4418 }
4419 return 0;
4420}
4421
4422
72d0e1cb
SG
4423int lxc_clear_mount_entries(struct lxc_conf *c)
4424{
9ebb03ad 4425 struct lxc_list *it,*next;
72d0e1cb 4426
9ebb03ad 4427 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4428 lxc_list_del(it);
4429 free(it->elem);
4430 free(it);
4431 }
4432 return 0;
4433}
4434
b099e9e9
SH
4435int lxc_clear_automounts(struct lxc_conf *c)
4436{
4437 c->auto_mounts = 0;
4438 return 0;
4439}
4440
12a50cc6 4441int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4442{
9ebb03ad 4443 struct lxc_list *it,*next;
17ed13a3 4444 bool all = false, done = false;
a6390f01 4445 const char *k = NULL;
72d0e1cb
SG
4446 int i;
4447
17ed13a3
SH
4448 if (strcmp(key, "lxc.hook") == 0)
4449 all = true;
a6390f01
WB
4450 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4451 k = key + sizeof("lxc.hook.")-1;
4452 else
4453 return -1;
17ed13a3 4454
72d0e1cb 4455 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4456 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4457 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4458 lxc_list_del(it);
4459 free(it->elem);
4460 free(it);
4461 }
4462 done = true;
72d0e1cb
SG
4463 }
4464 }
17ed13a3
SH
4465
4466 if (!done) {
4467 ERROR("Invalid hook key: %s", key);
4468 return -1;
4469 }
72d0e1cb
SG
4470 return 0;
4471}
8eb5694b 4472
74a3920a 4473static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4474{
4475 int i;
4476
0cf45501 4477 if (!conf->saved_nics)
7b35f3d6
SH
4478 return;
4479 for (i=0; i < conf->num_savednics; i++)
4480 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4481 free(conf->saved_nics);
4482}
4483
4184c3e1
SH
4484static inline void lxc_clear_aliens(struct lxc_conf *conf)
4485{
4486 struct lxc_list *it,*next;
4487
4488 lxc_list_for_each_safe(it, &conf->aliens, next) {
4489 lxc_list_del(it);
4490 free(it->elem);
4491 free(it);
4492 }
4493}
4494
c7b15d1e 4495void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4496{
4497 struct lxc_list *it,*next;
4498
4499 lxc_list_for_each_safe(it, &conf->includes, next) {
4500 lxc_list_del(it);
4501 free(it->elem);
4502 free(it);
4503 }
4504}
4505
8eb5694b
SH
4506void lxc_conf_free(struct lxc_conf *conf)
4507{
4508 if (!conf)
4509 return;
858377e4
SH
4510 if (current_config == conf)
4511 current_config = NULL;
f10fad2f
ME
4512 free(conf->console.log_path);
4513 free(conf->console.path);
4514 free(conf->rootfs.mount);
b3b8c97f 4515 free(conf->rootfs.bdev_type);
f10fad2f
ME
4516 free(conf->rootfs.options);
4517 free(conf->rootfs.path);
f10fad2f 4518 free(conf->logfile);
858377e4
SH
4519 if (conf->logfd != -1)
4520 close(conf->logfd);
f10fad2f
ME
4521 free(conf->utsname);
4522 free(conf->ttydir);
4523 free(conf->fstab);
4524 free(conf->rcfile);
4525 free(conf->init_cmd);
6b0d5538 4526 free(conf->unexpanded_config);
393903d1 4527 free(conf->pty_names);
76d0127f 4528 free(conf->syslog);
c302b476 4529 lxc_free_networks(&conf->network);
f10fad2f
ME
4530 free(conf->lsm_aa_profile);
4531 free(conf->lsm_se_context);
769872f9 4532 lxc_seccomp_free(conf);
8eb5694b 4533 lxc_clear_config_caps(conf);
1fb86a7c 4534 lxc_clear_config_keepcaps(conf);
8eb5694b 4535 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4536 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4537 lxc_clear_mount_entries(conf);
7b35f3d6 4538 lxc_clear_saved_nics(conf);
27c27d73 4539 lxc_clear_idmaps(conf);
ee1e7aa0 4540 lxc_clear_groups(conf);
f979ac15 4541 lxc_clear_includes(conf);
761d81ca 4542 lxc_clear_aliens(conf);
ab799c0b 4543 lxc_clear_environment(conf);
240d4b74 4544 lxc_clear_limits(conf, "lxc.prlimit");
8eb5694b
SH
4545 free(conf);
4546}
4355ab5f
SH
4547
/* Arguments handed to run_userns_fn() through its void pointer. */
struct userns_fn_data {
	/* Function to call once the parent signalled that it is ready. */
	int (*fn)(void *);
	/* Name of @fn used for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to @fn. */
	void *arg;
	/* Synchronization pipe: p[0] is read by the child, p[1] is closed by it. */
	int p[2];
};
4554
4555static int run_userns_fn(void *data)
4556{
4557 struct userns_fn_data *d = data;
4558 char c;
4355ab5f 4559
f8aa4bf3 4560 /* Close write end of the pipe. */
4355ab5f 4561 close(d->p[1]);
f8aa4bf3
CB
4562
4563 /* Wait for parent to finish establishing a new mapping in the user
4564 * namespace we are executing in.
4565 */
4355ab5f
SH
4566 if (read(d->p[0], &c, 1) != 1)
4567 return -1;
f8aa4bf3
CB
4568
4569 /* Close read end of the pipe. */
4355ab5f 4570 close(d->p[0]);
f8aa4bf3 4571
c9b7c33e
CB
4572 if (d->fn_name)
4573 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 4574 /* Call function to run. */
4355ab5f
SH
4575 return d->fn(d->arg);
4576}
4577
339efad9 4578static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
4579 enum idtype idtype)
4580{
4581 struct lxc_list *it;
4582 struct id_map *map;
4583 struct id_map *retmap = NULL;
4584
4585 lxc_list_for_each(it, &conf->id_map) {
4586 map = it->elem;
4587 if (map->idtype != idtype)
4588 continue;
4589
4590 if (id >= map->hostid && id < map->hostid + map->range) {
4591 retmap = map;
4592 break;
4593 }
4594 }
4595
4596 if (!retmap)
4597 return NULL;
4598
4599 retmap = malloc(sizeof(*retmap));
4600 if (!retmap)
4601 return NULL;
4602
4603 memcpy(retmap, map, sizeof(*retmap));
4604 return retmap;
4605}
4606
4355ab5f 4607/*
f8aa4bf3
CB
4608 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4609 * existing one or establish a new one.
4355ab5f 4610 */
28a2d9e7 4611static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 4612{
28a2d9e7 4613 int hostid_mapped;
f8aa4bf3 4614 struct id_map *entry = NULL;
f8aa4bf3 4615
28a2d9e7
CB
4616 /* Reuse existing mapping. */
4617 entry = mapped_hostid_entry(conf, id, type);
4618 if (entry)
4619 return entry;
f8aa4bf3 4620
28a2d9e7
CB
4621 /* Find new mapping. */
4622 hostid_mapped = find_unmapped_nsid(conf, type);
4623 if (hostid_mapped < 0) {
4624 DEBUG("failed to find free mapping for id %d", id);
4625 return NULL;
f8aa4bf3 4626 }
f8aa4bf3 4627
28a2d9e7
CB
4628 entry = malloc(sizeof(*entry));
4629 if (!entry)
4630 return NULL;
4355ab5f 4631
28a2d9e7
CB
4632 entry->idtype = type;
4633 entry->nsid = hostid_mapped;
4634 entry->hostid = (unsigned long)id;
4635 entry->range = 1;
4355ab5f 4636
28a2d9e7 4637 return entry;
4355ab5f
SH
4638}
4639
f8aa4bf3
CB
4640/* Run a function in a new user namespace.
4641 * The caller's euid/egid will be mapped if it is not already.
4642 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4643 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4644 * This means we require only to establish a mapping from:
4645 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4646 * - the container root -> some sub{g,u}id
4647 * The former we add, if the user did not specifiy a mapping. The latter we
4648 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4649 * there to start the container in the first place.
4355ab5f 4650 */
c9b7c33e
CB
4651int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4652 const char *fn_name)
4355ab5f 4653{
f8aa4bf3
CB
4654 pid_t pid;
4655 uid_t euid, egid;
4355ab5f 4656 struct userns_fn_data d;
4355ab5f 4657 int p[2];
f8aa4bf3
CB
4658 struct lxc_list *it;
4659 struct id_map *map;
4660 char c = '1';
4661 int ret = -1;
4662 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4663 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4664 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4665
4355ab5f 4666 ret = pipe(p);
4355ab5f
SH
4667 if (ret < 0) {
4668 SYSERROR("opening pipe");
4669 return -1;
4670 }
4671 d.fn = fn;
c9b7c33e 4672 d.fn_name = fn_name;
4355ab5f
SH
4673 d.arg = data;
4674 d.p[0] = p[0];
4675 d.p[1] = p[1];
f8aa4bf3
CB
4676
4677 /* Clone child in new user namespace. */
4355ab5f 4678 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4679 if (pid < 0) {
4680 ERROR("failed to clone child process in new user namespace");
4681 goto on_error;
4682 }
4683
4355ab5f 4684 close(p[0]);
4355ab5f
SH
4685 p[0] = -1;
4686
f8aa4bf3
CB
4687 /* Find container root. */
4688 lxc_list_for_each(it, &conf->id_map) {
4689 map = it->elem;
4690
4691 if (map->nsid != 0)
4692 continue;
4693
4694 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4695 container_root_uid = malloc(sizeof(*container_root_uid));
4696 if (!container_root_uid)
4697 goto on_error;
4698 container_root_uid->idtype = map->idtype;
4699 container_root_uid->hostid = map->hostid;
4700 container_root_uid->nsid = 0;
4701 container_root_uid->range = map->range;
4702 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4703 container_root_gid = malloc(sizeof(*container_root_gid));
4704 if (!container_root_gid)
4705 goto on_error;
4706 container_root_gid->idtype = map->idtype;
4707 container_root_gid->hostid = map->hostid;
4708 container_root_gid->nsid = 0;
4709 container_root_gid->range = map->range;
4710 }
4711
4712 /* Found container root. */
4713 if (container_root_uid && container_root_gid)
4714 break;
4715 }
4716
4717 /* This is actually checked earlier but it can't hurt. */
4718 if (!container_root_uid || !container_root_gid) {
4719 ERROR("no mapping for container root found");
4720 goto on_error;
4721 }
4722
1d90e064
CB
4723 host_uid_map = container_root_uid;
4724 host_gid_map = container_root_gid;
4725
f8aa4bf3
CB
4726 /* Check whether the {g,u}id of the user has a mapping. */
4727 euid = geteuid();
4728 egid = getegid();
1d90e064 4729 if (euid != container_root_uid->hostid)
28a2d9e7
CB
4730 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4731
1d90e064 4732 if (egid != container_root_gid->hostid)
28a2d9e7
CB
4733 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4734
4735 if (!host_uid_map) {
4736 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4737 goto on_error;
4738 }
4739
28a2d9e7
CB
4740 if (!host_gid_map) {
4741 DEBUG("failed to find mapping for gid %d", egid);
4742 goto on_error;
4743 }
4744
4745 /* Allocate new {g,u}id map list. */
4746 idmap = malloc(sizeof(*idmap));
4747 if (!idmap)
4748 goto on_error;
4749 lxc_list_init(idmap);
4750
f8aa4bf3
CB
4751 /* Add container root to the map. */
4752 tmplist = malloc(sizeof(*tmplist));
4753 if (!tmplist)
4754 goto on_error;
4755 lxc_list_add_elem(tmplist, container_root_uid);
4756 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4757
1d90e064 4758 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4759 /* idmap will now keep track of that memory. */
4760 container_root_uid = NULL;
4761
4762 /* Add container root to the map. */
4763 tmplist = malloc(sizeof(*tmplist));
4764 if (!tmplist)
4765 goto on_error;
4766 lxc_list_add_elem(tmplist, host_uid_map);
4767 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4768 }
1d90e064
CB
4769 /* idmap will now keep track of that memory. */
4770 container_root_uid = NULL;
4771 /* idmap will now keep track of that memory. */
4772 host_uid_map = NULL;
f8aa4bf3
CB
4773
4774 tmplist = malloc(sizeof(*tmplist));
4775 if (!tmplist)
4776 goto on_error;
4777 lxc_list_add_elem(tmplist, container_root_gid);
4778 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4779
1d90e064 4780 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4781 /* idmap will now keep track of that memory. */
4782 container_root_gid = NULL;
4783
4784 tmplist = malloc(sizeof(*tmplist));
4785 if (!tmplist)
4786 goto on_error;
4787 lxc_list_add_elem(tmplist, host_gid_map);
4788 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4789 }
1d90e064
CB
4790 /* idmap will now keep track of that memory. */
4791 container_root_gid = NULL;
4792 /* idmap will now keep track of that memory. */
4793 host_gid_map = NULL;
f8aa4bf3 4794
4b73005c
CB
4795 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4796 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
4797 lxc_list_for_each(it, idmap) {
4798 map = it->elem;
4799 TRACE("establishing %cid mapping for \"%d\" in new "
4800 "user namespace: nsuid %lu - hostid %lu - range "
4801 "%lu",
4802 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4803 map->nsid, map->hostid, map->range);
4804 }
4355ab5f
SH
4805 }
4806
f8aa4bf3 4807 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4808 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4809 if (ret < 0) {
4810 ERROR("error setting up {g,u}id mappings for child process "
4811 "\"%d\"",
4812 pid);
4813 goto on_error;
4355ab5f
SH
4814 }
4815
f8aa4bf3 4816 /* Tell child to proceed. */
4355ab5f 4817 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4818 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4819 goto on_error;
4355ab5f
SH
4820 }
4821
f8aa4bf3 4822 /* Wait for child to finish. */
3139aead
SG
4823 ret = wait_for_pid(pid);
4824
f8aa4bf3 4825on_error:
1d90e064
CB
4826 if (idmap)
4827 lxc_free_idmap(idmap);
4828 if (container_root_uid)
4829 free(container_root_uid);
4830 if (container_root_gid)
4831 free(container_root_gid);
4832 if (host_uid_map && (host_uid_map != container_root_uid))
4833 free(host_uid_map);
4834 if (host_gid_map && (host_gid_map != container_root_gid))
4835 free(host_gid_map);
3139aead 4836
4355ab5f
SH
4837 if (p[0] != -1)
4838 close(p[0]);
4839 close(p[1]);
f8aa4bf3
CB
4840
4841 return ret;
4355ab5f 4842}
97e9cfa0 4843
/*
 * Return a strdup()ed copy of the login name belonging to the effective uid,
 * or NULL when getpwuid() finds no entry. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pwent = getpwuid(geteuid());

	return pwent ? strdup(pwent->pw_name) : NULL;
}
4855
/*
 * Return a strdup()ed copy of the group name belonging to the effective gid,
 * or NULL when getgrgid() finds no entry. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *grent = getgrgid(getegid());

	return grent ? strdup(grent->gr_name) : NULL;
}
4867
a96a8e8c 4868/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4869void suggest_default_idmap(void)
4870{
4871 FILE *f;
4872 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4873 char *line = NULL;
4874 char *uname, *gname;
4875 size_t len = 0;
4876
4877 if (!(uname = getuname()))
4878 return;
4879
4880 if (!(gname = getgname())) {
4881 free(uname);
4882 return;
4883 }
4884
4885 f = fopen(subuidfile, "r");
4886 if (!f) {
4887 ERROR("Your system is not configured with subuids");
4888 free(gname);
4889 free(uname);
4890 return;
4891 }
4892 while (getline(&line, &len, f) != -1) {
b7930180 4893 size_t no_newline = 0;
97e9cfa0
SH
4894 char *p = strchr(line, ':'), *p2;
4895 if (*line == '#')
4896 continue;
4897 if (!p)
4898 continue;
4899 *p = '\0';
4900 p++;
4901 if (strcmp(line, uname))
4902 continue;
4903 p2 = strchr(p, ':');
4904 if (!p2)
4905 continue;
4906 *p2 = '\0';
4907 p2++;
4908 if (!*p2)
4909 continue;
b7930180
CB
4910 no_newline = strcspn(p2, "\n");
4911 p2[no_newline] = '\0';
4912
b7b2fde4
CB
4913 if (lxc_safe_uint(p, &uid) < 0)
4914 WARN("Could not parse UID.");
4915 if (lxc_safe_uint(p2, &urange) < 0)
4916 WARN("Could not parse UID range.");
97e9cfa0
SH
4917 }
4918 fclose(f);
4919
6be7389a 4920 f = fopen(subgidfile, "r");
97e9cfa0
SH
4921 if (!f) {
4922 ERROR("Your system is not configured with subgids");
4923 free(gname);
4924 free(uname);
4925 return;
4926 }
4927 while (getline(&line, &len, f) != -1) {
b7930180 4928 size_t no_newline = 0;
97e9cfa0
SH
4929 char *p = strchr(line, ':'), *p2;
4930 if (*line == '#')
4931 continue;
4932 if (!p)
4933 continue;
4934 *p = '\0';
4935 p++;
4936 if (strcmp(line, uname))
4937 continue;
4938 p2 = strchr(p, ':');
4939 if (!p2)
4940 continue;
4941 *p2 = '\0';
4942 p2++;
4943 if (!*p2)
4944 continue;
b7930180
CB
4945 no_newline = strcspn(p2, "\n");
4946 p2[no_newline] = '\0';
4947
b7b2fde4
CB
4948 if (lxc_safe_uint(p, &gid) < 0)
4949 WARN("Could not parse GID.");
4950 if (lxc_safe_uint(p2, &grange) < 0)
4951 WARN("Could not parse GID range.");
97e9cfa0
SH
4952 }
4953 fclose(f);
4954
f10fad2f 4955 free(line);
97e9cfa0
SH
4956
4957 if (!urange || !grange) {
4958 ERROR("You do not have subuids or subgids allocated");
4959 ERROR("Unprivileged containers require subuids and subgids");
4960 return;
4961 }
4962
4963 ERROR("You must either run as root, or define uid mappings");
4964 ERROR("To pass uid mappings to lxc-create, you could create");
4965 ERROR("~/.config/lxc/default.conf:");
4966 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4967 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4968 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4969
4970 free(gname);
4971 free(uname);
4972}
aaf26830 4973
a7307747
SH
4974static void free_cgroup_settings(struct lxc_list *result)
4975{
4976 struct lxc_list *iterator, *next;
4977
4978 lxc_list_for_each_safe(iterator, result, next) {
4979 lxc_list_del(iterator);
4980 free(iterator);
4981 }
4982 free(result);
4983}
4984
aaf26830
KT
4985/*
4986 * Return the list of cgroup_settings sorted according to the following rules
4987 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4988 */
4989struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4990{
4991 struct lxc_list *result;
4992 struct lxc_list *memsw_limit = NULL;
4993 struct lxc_list *it = NULL;
4994 struct lxc_cgroup *cg = NULL;
4995 struct lxc_list *item = NULL;
4996
4997 result = malloc(sizeof(*result));
fac7c663
KT
4998 if (!result) {
4999 ERROR("failed to allocate memory to sort cgroup settings");
5000 return NULL;
5001 }
aaf26830
KT
5002 lxc_list_init(result);
5003
5004 /*Iterate over the cgroup settings and copy them to the output list*/
5005 lxc_list_for_each(it, cgroup_settings) {
5006 item = malloc(sizeof(*item));
fac7c663
KT
5007 if (!item) {
5008 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 5009 free_cgroup_settings(result);
fac7c663
KT
5010 return NULL;
5011 }
aaf26830
KT
5012 item->elem = it->elem;
5013 cg = it->elem;
5014 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5015 /* Store the memsw_limit location */
5016 memsw_limit = item;
5017 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 5018 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
5019 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5020 item->elem = memsw_limit->elem;
5021 memsw_limit->elem = it->elem;
5022 }
5023 lxc_list_add_tail(result, item);
5024 }
5025
5026 return result;
a7307747 5027}