]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
af_unix: abstract lxc_abstract_unix_{send,recv}_fd
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
d8e48992 82#include "lxcaufs.h"
025ed0f3 83#include "lxclock.h"
8f3e280e
CB
84#include "lxcoverlay.h"
85#include "lxcseccomp.h"
4355ab5f 86#include "namespace.h"
8f3e280e
CB
87#include "network.h"
88#include "parse.h"
89#include "utils.h"
fe4de9a6 90#include "lsm/lsm.h"
d0a36f2c 91
e37dda71 92#if HAVE_LIBCAP
495d2046
SG
93#include <sys/capability.h>
94#endif
95
6ff05e18
SG
96#if HAVE_SYS_PERSONALITY_H
97#include <sys/personality.h>
98#endif
99
edaf8b1b
SG
100#if IS_BIONIC
101#include <../include/lxcmntent.h>
a04f5407
CB
102#ifndef HAVE_PRLIMIT
103#include <../include/prlimit.h>
104#endif
edaf8b1b
SG
105#else
106#include <mntent.h>
107#endif
108
36eb9bde 109lxc_log_define(lxc_conf, lxc);
e5bda9ee 110
e37dda71 111#if HAVE_LIBCAP
b09094da
MN
112#ifndef CAP_SETFCAP
113#define CAP_SETFCAP 31
114#endif
115
116#ifndef CAP_MAC_OVERRIDE
117#define CAP_MAC_OVERRIDE 32
118#endif
119
120#ifndef CAP_MAC_ADMIN
121#define CAP_MAC_ADMIN 33
122#endif
495d2046 123#endif
b09094da
MN
124
125#ifndef PR_CAPBSET_DROP
126#define PR_CAPBSET_DROP 24
127#endif
128
9818cae4
SG
129#ifndef LO_FLAGS_AUTOCLEAR
130#define LO_FLAGS_AUTOCLEAR 4
131#endif
132
bc5b27d6
DK
133#ifndef CAP_SETUID
134#define CAP_SETUID 7
135#endif
136
137#ifndef CAP_SETGID
138#define CAP_SETGID 6
139#endif
140
0769b82a
CS
141/* needed for cgroup automount checks, regardless of whether we
142 * have included linux/capability.h or not */
143#ifndef CAP_SYS_ADMIN
144#define CAP_SYS_ADMIN 21
145#endif
146
2d76d1d7
SG
/*
 * pivot_root(2) wrapper. When the build did not detect one in the C library
 * (HAVE_PIVOT_ROOT unset) a private version is provided that issues the raw
 * syscall, or fails with ENOSYS if no syscall number is known.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
161
/*
 * sethostname(2) wrapper, only compiled when the C library lacks one
 * (HAVE_SETHOSTNAME unset). Issues the raw syscall, or fails with ENOSYS if
 * no syscall number is known.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
174
72f919c4
SG
175/* Define __S_ISTYPE if missing from the C library */
176#ifndef __S_ISTYPE
177#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
178#endif
179
ecec0126
SG
180#ifndef MS_PRIVATE
181#define MS_PRIVATE (1<<18)
182#endif
183
8912711c
CB
184#ifndef MS_LAZYTIME
185#define MS_LAZYTIME (1<<25)
186#endif
187
5ef5c9a3
CB
188/* memfd_create() */
189#ifndef MFD_CLOEXEC
190#define MFD_CLOEXEC 0x0001U
191#endif
192
193#ifndef MFD_ALLOW_SEALING
194#define MFD_ALLOW_SEALING 0x0002U
195#endif
196
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

/*
 * Per-architecture syscall numbers for memfd_create(), used only when the
 * system headers do not already define __NR_memfd_create.
 */
#ifndef __NR_memfd_create
#	if defined __i386__
#		define __NR_memfd_create 356
#	elif defined __x86_64__
#		define __NR_memfd_create 319
#	elif defined __arm__
#		define __NR_memfd_create 385
#	elif defined __aarch64__
#		define __NR_memfd_create 279
#	elif defined __s390__
#		define __NR_memfd_create 350
#	elif defined __powerpc__
#		define __NR_memfd_create 360
#	elif defined __sparc__
#		define __NR_memfd_create 348
#	elif defined __blackfin__
#		define __NR_memfd_create 390
#	elif defined __ia64__
#		define __NR_memfd_create 1340
#	elif defined _MIPS_SIM
#		if _MIPS_SIM == _MIPS_SIM_ABI32
#			define __NR_memfd_create 4354
#		endif
#		if _MIPS_SIM == _MIPS_SIM_NABI32
#			define __NR_memfd_create 6318
#		endif
#		if _MIPS_SIM == _MIPS_SIM_ABI64
#			define __NR_memfd_create 5314
#		endif
#	endif
#endif

/*
 * Fallback memfd_create() for C libraries without a wrapper: issue the raw
 * syscall, or fail with ENOSYS when no syscall number is known.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#endif
240
72d0e1cb 241char *lxchook_names[NUM_LXC_HOOKS] = {
52492063 242 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 243
a589434e 244typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 245
998ac676
RT
246struct mount_opt {
247 char *name;
248 int clear;
249 int flag;
250};
251
81810dd1
DL
252struct caps_opt {
253 char *name;
254 int value;
255};
256
c6d09e15
WB
257struct limit_opt {
258 char *name;
259 int value;
260};
261
858377e4
SH
262/*
263 * The lxc_conf of the container currently being worked on in an
264 * API call
265 * This is used in the error calls
266 */
267#ifdef HAVE_TLS
268__thread struct lxc_conf *current_config;
269#else
270struct lxc_conf *current_config;
271#endif
272
0769b82a
CS
273/* Declare this here, since we don't want to reshuffle the whole file. */
274static int in_caplist(int cap, struct lxc_list *caps);
275
a589434e
JN
276static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
277static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
278static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
279static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
280static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
281static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
282
283static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
284 [LXC_NET_VETH] = instantiate_veth,
285 [LXC_NET_MACVLAN] = instantiate_macvlan,
286 [LXC_NET_VLAN] = instantiate_vlan,
287 [LXC_NET_PHYS] = instantiate_phys,
288 [LXC_NET_EMPTY] = instantiate_empty,
289 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 290};
291
74a2b586
JK
292static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
293static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
294static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
295static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
296static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 297static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 298
a589434e 299static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
300 [LXC_NET_VETH] = shutdown_veth,
301 [LXC_NET_MACVLAN] = shutdown_macvlan,
302 [LXC_NET_VLAN] = shutdown_vlan,
303 [LXC_NET_PHYS] = shutdown_phys,
304 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 305 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
306};
307
998ac676 308static struct mount_opt mount_opt[] = {
470b359b
CB
309 { "async", 1, MS_SYNCHRONOUS },
310 { "atime", 1, MS_NOATIME },
311 { "bind", 0, MS_BIND },
88d413d5 312 { "defaults", 0, 0 },
88d413d5 313 { "dev", 1, MS_NODEV },
470b359b 314 { "diratime", 1, MS_NODIRATIME },
88d413d5 315 { "dirsync", 0, MS_DIRSYNC },
470b359b 316 { "exec", 1, MS_NOEXEC },
8912711c 317 { "lazytime", 0, MS_LAZYTIME },
88d413d5 318 { "mand", 0, MS_MANDLOCK },
88d413d5 319 { "noatime", 0, MS_NOATIME },
470b359b 320 { "nodev", 0, MS_NODEV },
88d413d5 321 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
322 { "noexec", 0, MS_NOEXEC },
323 { "nomand", 1, MS_MANDLOCK },
324 { "norelatime", 1, MS_RELATIME },
325 { "nostrictatime", 1, MS_STRICTATIME },
326 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
327 { "rbind", 0, MS_BIND|MS_REC },
328 { "relatime", 0, MS_RELATIME },
470b359b
CB
329 { "remount", 0, MS_REMOUNT },
330 { "ro", 0, MS_RDONLY },
331 { "rw", 1, MS_RDONLY },
88d413d5 332 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
333 { "suid", 1, MS_NOSUID },
334 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 335 { NULL, 0, 0 },
998ac676
RT
336};
337
e37dda71 338#if HAVE_LIBCAP
81810dd1 339static struct caps_opt caps_opt[] = {
a6afdde9 340 { "chown", CAP_CHOWN },
1e11be34
DL
341 { "dac_override", CAP_DAC_OVERRIDE },
342 { "dac_read_search", CAP_DAC_READ_SEARCH },
343 { "fowner", CAP_FOWNER },
344 { "fsetid", CAP_FSETID },
81810dd1
DL
345 { "kill", CAP_KILL },
346 { "setgid", CAP_SETGID },
347 { "setuid", CAP_SETUID },
348 { "setpcap", CAP_SETPCAP },
349 { "linux_immutable", CAP_LINUX_IMMUTABLE },
350 { "net_bind_service", CAP_NET_BIND_SERVICE },
351 { "net_broadcast", CAP_NET_BROADCAST },
352 { "net_admin", CAP_NET_ADMIN },
353 { "net_raw", CAP_NET_RAW },
354 { "ipc_lock", CAP_IPC_LOCK },
355 { "ipc_owner", CAP_IPC_OWNER },
356 { "sys_module", CAP_SYS_MODULE },
357 { "sys_rawio", CAP_SYS_RAWIO },
358 { "sys_chroot", CAP_SYS_CHROOT },
359 { "sys_ptrace", CAP_SYS_PTRACE },
360 { "sys_pacct", CAP_SYS_PACCT },
361 { "sys_admin", CAP_SYS_ADMIN },
362 { "sys_boot", CAP_SYS_BOOT },
363 { "sys_nice", CAP_SYS_NICE },
364 { "sys_resource", CAP_SYS_RESOURCE },
365 { "sys_time", CAP_SYS_TIME },
366 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
367 { "mknod", CAP_MKNOD },
368 { "lease", CAP_LEASE },
57b837e2
CB
369#ifdef CAP_AUDIT_READ
370 { "audit_read", CAP_AUDIT_READ },
371#endif
9527e566 372#ifdef CAP_AUDIT_WRITE
81810dd1 373 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
374#endif
375#ifdef CAP_AUDIT_CONTROL
81810dd1 376 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 377#endif
81810dd1
DL
378 { "setfcap", CAP_SETFCAP },
379 { "mac_override", CAP_MAC_OVERRIDE },
380 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
381#ifdef CAP_SYSLOG
382 { "syslog", CAP_SYSLOG },
383#endif
384#ifdef CAP_WAKE_ALARM
385 { "wake_alarm", CAP_WAKE_ALARM },
386#endif
2b54359b
CB
387#ifdef CAP_BLOCK_SUSPEND
388 { "block_suspend", CAP_BLOCK_SUSPEND },
389#endif
81810dd1 390};
495d2046
SG
391#else
392static struct caps_opt caps_opt[] = {};
393#endif
81810dd1 394
c6d09e15
WB
395static struct limit_opt limit_opt[] = {
396#ifdef RLIMIT_AS
397 { "as", RLIMIT_AS },
398#endif
399#ifdef RLIMIT_CORE
400 { "core", RLIMIT_CORE },
401#endif
402#ifdef RLIMIT_CPU
403 { "cpu", RLIMIT_CPU },
404#endif
405#ifdef RLIMIT_DATA
406 { "data", RLIMIT_DATA },
407#endif
408#ifdef RLIMIT_FSIZE
409 { "fsize", RLIMIT_FSIZE },
410#endif
411#ifdef RLIMIT_LOCKS
412 { "locks", RLIMIT_LOCKS },
413#endif
414#ifdef RLIMIT_MEMLOCK
415 { "memlock", RLIMIT_MEMLOCK },
416#endif
417#ifdef RLIMIT_MSGQUEUE
418 { "msgqueue", RLIMIT_MSGQUEUE },
419#endif
420#ifdef RLIMIT_NICE
421 { "nice", RLIMIT_NICE },
422#endif
423#ifdef RLIMIT_NOFILE
424 { "nofile", RLIMIT_NOFILE },
425#endif
426#ifdef RLIMIT_NPROC
427 { "nproc", RLIMIT_NPROC },
428#endif
429#ifdef RLIMIT_RSS
430 { "rss", RLIMIT_RSS },
431#endif
432#ifdef RLIMIT_RTPRIO
433 { "rtprio", RLIMIT_RTPRIO },
434#endif
435#ifdef RLIMIT_RTTIME
436 { "rttime", RLIMIT_RTTIME },
437#endif
438#ifdef RLIMIT_SIGPENDING
439 { "sigpending", RLIMIT_SIGPENDING },
440#endif
441#ifdef RLIMIT_STACK
442 { "stack", RLIMIT_STACK },
443#endif
444};
445
91c3830e
SH
446static int run_buffer(char *buffer)
447{
ebec9176 448 struct lxc_popen_FILE *f;
91c3830e 449 char *output;
8e7da691 450 int ret;
91c3830e 451
ebec9176 452 f = lxc_popen(buffer);
91c3830e 453 if (!f) {
062b72c6 454 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
455 return -1;
456 }
457
458 output = malloc(LXC_LOG_BUFFER_SIZE);
459 if (!output) {
062b72c6 460 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 461 lxc_pclose(f);
91c3830e
SH
462 return -1;
463 }
464
062b72c6
CB
465 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
466 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
467
468 free(output);
469
ebec9176 470 ret = lxc_pclose(f);
8e7da691 471 if (ret == -1) {
062b72c6 472 SYSERROR("Script exited with error.");
91c3830e 473 return -1;
8e7da691 474 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 475 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
476 return -1;
477 } else if (WIFSIGNALED(ret)) {
062b72c6 478 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 479 return -1;
91c3830e
SH
480 }
481
482 return 0;
483}
484
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" on the
 * stack and execute it with run_buffer().
 *
 * @argsin may be NULL; otherwise it is a NULL-terminated array of extra
 * arguments appended in order. @lxcpath is accepted but not used here.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	char *cmd;
	char **arg;
	int used;
	size_t cmdlen;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Four fixed words, three separating blanks and the trailing NUL. */
	cmdlen = strlen(script) + strlen(name) + strlen(section) + strlen(hook) + 4;

	/* Each extra argument is preceded by one blank. */
	for (arg = argsin; arg && *arg; arg++)
		cmdlen += strlen(*arg) + 1;

	if (cmdlen > INT_MAX)
		return -1;

	cmd = alloca(cmdlen);
	if (!cmd) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(cmd, cmdlen, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= cmdlen) {
		ERROR("Script name too long.");
		return -1;
	}

	for (arg = argsin; arg && *arg; arg++) {
		int room = cmdlen - used;
		int n = snprintf(cmd + used, room, " %s", *arg);

		if (n < 0 || n >= room) {
			ERROR("Script args too long.");
			return -1;
		}
		used += n;
	}

	return run_buffer(cmd);
}
535
062b72c6
CB
/*
 * Build the command line "<script> <name> <section> [args...]" on the stack
 * and execute it with run_buffer().
 *
 * The variadic arguments are `char *` strings appended in order; the list
 * must be terminated by a NULL pointer.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the arguments: one blank plus the text for each. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	/* Three fixed words, two separating blanks and the trailing NUL. */
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = alloca(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		return -1;
	}

	/* Second pass: append every argument to the command line. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() must be paired with va_end(). */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
587
/*
 * Recursively bind-mount the directory @rootfs onto @target, adding the mount
 * flags and data parsed from @options by parse_mntopts().
 *
 * Returns the result of mount(2), or -1 if the options could not be parsed.
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	/*
	 * Initialised so the free() on the error path is well-defined even if
	 * parse_mntopts() fails before storing its outputs.
	 */
	unsigned long mntflags = 0;
	char *mntdata = NULL;
	int ret;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);
	free(mntdata);

	return ret;
}
605
c6868a1f 606static int lxc_mount_rootfs_file(const char *rootfs, const char *target,
d435aae1 607 const char *options)
78ae2fcc 608{
c6868a1f 609 int ret, loopfd;
a6afdde9 610 char path[MAXPATHLEN];
78ae2fcc 611
c6868a1f
CB
612 loopfd = lxc_prepare_loop_dev(rootfs, path, LO_FLAGS_AUTOCLEAR);
613 if (loopfd < 0)
78ae2fcc 614 return -1;
c6868a1f 615 DEBUG("prepared loop device \"%s\"", path);
a6afdde9 616
c6868a1f
CB
617 ret = mount_unknown_fs(path, target, options);
618 close(loopfd);
a6afdde9 619
c6868a1f 620 DEBUG("mounted rootfs \"%s\" on loop device \"%s\" via loop device \"%s\"", rootfs, target, path);
a6afdde9
DL
621
622 return ret;
78ae2fcc 623}
624
a17b1e65
SG
/*
 * Mount the block device @rootfs on @target by handing it straight to
 * mount_unknown_fs() together with @options; its result is returned as is.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int ret;

	ret = mount_unknown_fs(rootfs, target, options);

	return ret;
}
630
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and hand the
 * open fd back to the caller. The intent is to keep that fd open for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path cannot
 *           be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || rootfs[0] == '\0')
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Reject both an output error and a truncated path. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd stays valid after the name is removed. */
	(void)unlink(absrootfspin);
	return fd;
}
673
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the nosuid/nodev/ro/noexec flags
 * that statvfs() reports for the mount, so that they are honored.
 *
 * @s is the path inspected; @d is used instead when @s is NULL. @flags is
 * returned unchanged if MS_REMOUNT is not set, if neither path is given, if
 * statvfs() fails, or if the build has no statvfs() (HAVE_STATVFS unset).
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *path = s ? s : d;
	struct statvfs sb;
	size_t i;

	if (!(flags & MS_REMOUNT) || !path)
		return flags;

	if (statvfs(path, &sb) < 0)
		return flags;

	for (i = 0; i < sizeof(sticky) / sizeof(sticky[0]); i++)
		if (sb.f_flag & sticky[i])
			flags |= sticky[i];

	return flags;
#else
	return flags;
#endif
}
710
4fb3cba5 711static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 712{
368bbc02 713 int r;
80e80c40 714 int i;
b06b8511
CS
715 static struct {
716 int match_mask;
717 int match_flag;
718 const char *source;
719 const char *destination;
720 const char *fstype;
721 unsigned long flags;
722 const char *options;
723 } default_mounts[] = {
724 /* Read-only bind-mounting... In older kernels, doing that required
725 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
726 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
727 * kernel 2.6.26 onwards. However, this apparently does not work on
728 * kernel 3.8. Unfortunately, on that very same kernel, doing the
729 * same trick as above doesn't seem to work either, there one needs
730 * to ALSO specify MS_BIND for the remount, otherwise the entire
731 * fs is remounted read-only or the mount fails because it's busy...
732 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
733 * 2.6.32...
368bbc02 734 */
f24a52d5 735 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
736 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
737 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
738 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
739 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 740 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
741 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
742 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
743 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
744 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
745 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
746 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
747 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
748 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
749 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
750 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
751 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
752 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 753 };
368bbc02 754
b06b8511
CS
755 for (i = 0; default_mounts[i].match_mask; i++) {
756 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
757 char *source = NULL;
758 char *destination = NULL;
759 int saved_errno;
e2a7e8dc 760 unsigned long mflags;
b06b8511
CS
761
762 if (default_mounts[i].source) {
763 /* will act like strdup if %r is not present */
8ede5f4c 764 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
765 if (!source) {
766 SYSERROR("memory allocation error");
767 return -1;
768 }
769 }
cc4fd506
SH
770 if (!default_mounts[i].destination) {
771 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 772 free(source);
cc4fd506
SH
773 return -1;
774 }
775 /* will act like strdup if %r is not present */
776 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
777 if (!destination) {
778 saved_errno = errno;
779 SYSERROR("memory allocation error");
780 free(source);
781 errno = saved_errno;
782 return -1;
b06b8511 783 }
e2a7e8dc
SH
784 mflags = add_required_remount_flags(source, destination,
785 default_mounts[i].flags);
592fd47a 786 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 787 saved_errno = errno;
b88ff9a0
SG
788 if (r < 0 && errno == ENOENT) {
789 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
790 r = 0;
791 }
792 else if (r < 0)
e2a7e8dc 793 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 794
b06b8511
CS
795 free(source);
796 free(destination);
797 if (r < 0) {
b06b8511
CS
798 errno = saved_errno;
799 return -1;
800 }
368bbc02 801 }
368bbc02
CS
802 }
803
b06b8511 804 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
805 int cg_flags;
806
807 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
808 /* If the type of cgroup mount was not specified, it depends on the
809 * container's capabilities as to what makes sense: if we have
810 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
811 * anyway, so we may as well default to read-write; then the admin
812 * will not be given a false sense of security. (And if they really
813 * want mixed r/o r/w, then they can explicitly specify :mixed.)
814 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
815 * :mixed, because then the container can't remount it read-write. */
816 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
817 int has_sys_admin = 0;
b0ee5983
CB
818
819 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 820 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 821 else
0769b82a 822 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
823
824 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 825 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 826 else
0769b82a 827 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
828 }
829
8ede5f4c 830 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 831 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 832 return -1;
368bbc02
CS
833 }
834 }
835
368bbc02 836 return 0;
368bbc02
CS
837}
838
a17b1e65 839static int mount_rootfs(const char *rootfs, const char *target, const char *options)
0ad19a3f 840{
b09ef133 841 char absrootfs[MAXPATHLEN];
78ae2fcc 842 struct stat s;
a6afdde9 843 int i;
78ae2fcc 844
a17b1e65 845 typedef int (*rootfs_cb)(const char *, const char *, const char *);
78ae2fcc 846
847 struct rootfs_type {
848 int type;
849 rootfs_cb cb;
850 } rtfs_type[] = {
2656d231
DL
851 { S_IFDIR, mount_rootfs_dir },
852 { S_IFBLK, mount_rootfs_block },
c6868a1f 853 { S_IFREG, lxc_mount_rootfs_file },
78ae2fcc 854 };
0ad19a3f 855
4c8ab83b 856 if (!realpath(rootfs, absrootfs)) {
91c3e281 857 SYSERROR("Failed to get real path for \"%s\".", rootfs);
4c8ab83b 858 return -1;
859 }
b09ef133 860
b09ef133 861 if (access(absrootfs, F_OK)) {
d26582c1 862 SYSERROR("The rootfs \"%s\" is not accessible.", absrootfs);
b09ef133 863 return -1;
864 }
865
78ae2fcc 866 if (stat(absrootfs, &s)) {
91c3e281 867 SYSERROR("Failed to stat the rootfs \"%s\".", absrootfs);
9b0f0477 868 return -1;
869 }
870
78ae2fcc 871 for (i = 0; i < sizeof(rtfs_type)/sizeof(rtfs_type[0]); i++) {
78ae2fcc 872 if (!__S_ISTYPE(s.st_mode, rtfs_type[i].type))
873 continue;
9b0f0477 874
a17b1e65 875 return rtfs_type[i].cb(absrootfs, target, options);
78ae2fcc 876 }
9b0f0477 877
91c3e281 878 ERROR("Unsupported rootfs type for rootfs \"%s\".", absrootfs);
78ae2fcc 879 return -1;
0ad19a3f 880}
881
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to configure and counts as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);

	return 0;
}
896
69aa6655
DE
897struct dev_symlinks {
898 const char *oldpath;
899 const char *name;
900};
901
902static const struct dev_symlinks dev_symlinks[] = {
903 {"/proc/self/fd", "fd"},
904 {"/proc/self/fd/0", "stdin"},
905 {"/proc/self/fd/1", "stdout"},
906 {"/proc/self/fd/2", "stderr"},
907};
908
909static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
910{
911 char path[MAXPATHLEN];
912 int ret,i;
09227be2 913 struct stat s;
69aa6655
DE
914
915
916 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
917 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 918 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
919 if (ret < 0 || ret >= MAXPATHLEN)
920 return -1;
09227be2
MW
921
922 /*
923 * Stat the path first. If we don't get an error
924 * accept it as is and don't try to create it
925 */
926 if (!stat(path, &s)) {
927 continue;
928 }
929
69aa6655 930 ret = symlink(d->oldpath, path);
09227be2 931
69aa6655 932 if (ret && errno != EEXIST) {
09227be2
MW
933 if ( errno == EROFS ) {
934 WARN("Warning: Read Only file system while creating %s", path);
935 } else {
936 SYSERROR("Error creating %s", path);
937 return -1;
938 }
69aa6655
DE
939 }
940 }
941 return 0;
942}
943
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * If *pp is NULL a new heap string "container_ttys=<name>" is stored in it;
 * otherwise " <name>" is appended to the existing string, growing it with
 * realloc(). Returns false on allocation failure, in which case *pp is left
 * as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefixlen = sizeof(prefix) - 1;
	const size_t namelen = strlen(name);
	char *list = *pp;
	size_t oldlen;

	if (list == NULL) {
		list = malloc(prefixlen + namelen + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefixlen);
		/* Copies the terminating NUL of name as well. */
		memcpy(list + prefixlen, name, namelen + 1);
		*pp = list;
		return true;
	}

	oldlen = strlen(list);
	/* Room for the old text, one blank, the new name and the NUL. */
	list = realloc(list, oldlen + namelen + 2);
	if (list == NULL)
		return false;

	list[oldlen] = ' ';
	memcpy(list + oldlen + 1, name, namelen + 1);
	*pp = list;
	return true;
}
966
9e1045e3 967static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 968{
9e1045e3 969 int i, ret;
393903d1
SH
970 const struct lxc_tty_info *tty_info = &conf->tty_info;
971 char *ttydir = conf->ttydir;
7c6ef2a2 972 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 973
e8bd4e43 974 if (!conf->rootfs.path)
bc9bd0e3
DL
975 return 0;
976
b0a33c1e 977 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 978 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
979
e8bd4e43 980 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 981 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
982 ERROR("pathname too long for ttys");
983 return -1;
984 }
9e1045e3 985
7c6ef2a2
SH
986 if (ttydir) {
987 /* create dev/lxc/tty%d" */
9e1045e3
CB
988 ret = snprintf(lxcpath, sizeof(lxcpath),
989 "/dev/%s/tty%d", ttydir, i + 1);
990 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
991 ERROR("pathname too long for ttys");
992 return -1;
993 }
9e1045e3 994
7c6ef2a2 995 ret = creat(lxcpath, 0660);
9e1045e3
CB
996 if (ret < 0 && errno != EEXIST) {
997 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
998 return -1;
999 }
4d44e274
SH
1000 if (ret >= 0)
1001 close(ret);
9e1045e3 1002
7c6ef2a2 1003 ret = unlink(path);
9e1045e3
CB
1004 if (ret < 0 && errno != ENOENT) {
1005 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
1006 return -1;
1007 }
b0a33c1e 1008
9e1045e3
CB
1009 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
1010 if (ret < 0) {
1011 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
1012 pty_info->name, path);
1013 continue;
1014 }
9e1045e3
CB
1015 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
1016 path);
13954cce 1017
9e1045e3
CB
1018 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
1019 ttydir, i + 1);
1020 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
1021 ERROR("tty pathname too long");
1022 return -1;
1023 }
9e1045e3 1024
7c6ef2a2 1025 ret = symlink(lxcpath, path);
9e1045e3
CB
1026 if (ret < 0) {
1027 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
1028 path, lxcpath);
7c6ef2a2
SH
1029 return -1;
1030 }
1031 } else {
9e1045e3
CB
1032 /* If we populated /dev, then we need to create
1033 * /dev/ttyN
1034 */
1035 ret = access(path, F_OK);
1036 if (ret < 0) {
c6883f38 1037 ret = creat(path, 0660);
9e1045e3
CB
1038 if (ret < 0) {
1039 SYSERROR("failed to create \"%s\"", path);
c6883f38 1040 /* this isn't fatal, continue */
025ed0f3 1041 } else {
c6883f38 1042 close(ret);
025ed0f3 1043 }
c6883f38 1044 }
9e1045e3
CB
1045
1046 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
1047 if (ret < 0) {
e8bd4e43 1048 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
1049 continue;
1050 }
9e1045e3
CB
1051
1052 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
1053 path);
393903d1 1054 }
9e1045e3 1055
e8bd4e43 1056 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
1057 ERROR("Error setting up container_ttys string");
1058 return -1;
b0a33c1e 1059 }
1060 }
1061
9e1045e3 1062 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 1063 return 0;
1064}
1065
59bb8698 1066static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1067{
2d489f9e 1068 int oldroot = -1, newroot = -1;
bf601689 1069
2d489f9e
SH
1070 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1071 if (oldroot < 0) {
1072 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
1073 return -1;
1074 }
2d489f9e
SH
1075 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1076 if (newroot < 0) {
1077 SYSERROR("Error opening new-/ for fchdir");
1078 goto fail;
c08556c6 1079 }
bf601689 1080
cc6f6dd7 1081 /* change into new root fs */
2d489f9e 1082 if (fchdir(newroot)) {
cc6f6dd7 1083 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 1084 goto fail;
cc6f6dd7
DL
1085 }
1086
cc6f6dd7 1087 /* pivot_root into our new root fs */
2d489f9e 1088 if (pivot_root(".", ".")) {
cc6f6dd7 1089 SYSERROR("pivot_root syscall failed");
2d489f9e 1090 goto fail;
bf601689 1091 }
cc6f6dd7 1092
2d489f9e
SH
1093 /*
1094 * at this point the old-root is mounted on top of our new-root
1095 * To unmounted it we must not be chdir'd into it, so escape back
1096 * to old-root
1097 */
1098 if (fchdir(oldroot) < 0) {
1099 SYSERROR("Error entering oldroot");
1100 goto fail;
1101 }
7981ea46 1102 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1103 SYSERROR("Error detaching old root");
1104 goto fail;
cc6f6dd7
DL
1105 }
1106
2d489f9e
SH
1107 if (fchdir(newroot) < 0) {
1108 SYSERROR("Error re-entering newroot");
1109 goto fail;
1110 }
cc6f6dd7 1111
2d489f9e
SH
1112 close(oldroot);
1113 close(newroot);
bf601689 1114
2d489f9e 1115 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1116
bf601689 1117 return 0;
2d489f9e
SH
1118
1119fail:
1120 if (oldroot != -1)
1121 close(oldroot);
1122 if (newroot != -1)
1123 close(newroot);
1124 return -1;
bf601689
MH
1125}
1126
bc6928ff 1127/*
87da4ec3
SH
1128 * Just create a path for /dev under $lxcpath/$name and in rootfs
1129 * If we hit an error, log it but don't fail yet.
91c3830e 1130 */
14221cbb 1131static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1132{
1133 int ret;
87da4ec3
SH
1134 size_t clen;
1135 char *path;
91c3830e 1136
14221cbb 1137 INFO("Mounting container /dev");
bc6928ff 1138
14221cbb 1139 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1140 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1141 path = alloca(clen);
bc6928ff 1142
ec50007f 1143 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1144 if (ret < 0 || ret >= clen)
91c3830e 1145 return -1;
bc6928ff 1146
87da4ec3 1147 if (!dir_exists(path)) {
14221cbb 1148 WARN("No /dev in container.");
87da4ec3
SH
1149 WARN("Proceeding without autodev setup");
1150 return 0;
bc6928ff 1151 }
87da4ec3 1152
1ec0e8e3 1153 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1154 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1155 if (ret != 0) {
87da4ec3 1156 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1157 return -1;
91c3830e 1158 }
87da4ec3
SH
1159
1160 INFO("Mounted tmpfs onto %s", path);
1161
ec50007f 1162 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1163 if (ret < 0 || ret >= clen)
91c3830e 1164 return -1;
87da4ec3 1165
bc6928ff
MW
1166 /*
1167 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1168 * If not, then create it and exit if that fails...
1169 */
87da4ec3 1170 if (!dir_exists(path)) {
bc6928ff
MW
1171 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1172 if (ret) {
1173 SYSERROR("Failed to create /dev/pts in container");
1174 return -1;
1175 }
91c3830e
SH
1176 }
1177
14221cbb 1178 INFO("Mounted container /dev");
91c3830e
SH
1179 return 0;
1180}
1181
/* One device node to provide in the container's /dev. */
struct lxc_devs {
	const char *name;	/* file name below /dev */
	mode_t mode;		/* mode handed to mknod() */
	int maj, min;		/* device major and minor number */
};
1188
74a3920a 1189static const struct lxc_devs lxc_devs[] = {
c6883f38
SH
1190 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1191 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1192 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1193 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1194 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1195 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
c6883f38
SH
1196};
1197
27245ff7 1198static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1199{
1200 int ret;
c6883f38
SH
1201 char path[MAXPATHLEN];
1202 int i;
3a32201c 1203 mode_t cmask;
c6883f38 1204
ec50007f 1205 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1206 if (ret < 0 || ret >= MAXPATHLEN) {
1207 ERROR("Error calculating container /dev location");
c6883f38 1208 return -1;
f7bee6c6 1209 }
91c3830e 1210
0bbf8572
CB
1211 /* ignore, just don't try to fill in */
1212 if (!dir_exists(path))
9cb4d183
SH
1213 return 0;
1214
0bbf8572 1215 INFO("populating container /dev");
3a32201c 1216 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1217 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1218 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1219
ec50007f 1220 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1221 if (ret < 0 || ret >= MAXPATHLEN)
1222 return -1;
0bbf8572 1223
c6883f38 1224 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1225 if (ret < 0) {
9cb4d183
SH
1226 char hostpath[MAXPATHLEN];
1227 FILE *pathfile;
1228
0bbf8572
CB
1229 if (errno == EEXIST) {
1230 DEBUG("\"%s\" device already existed", path);
1231 continue;
1232 }
1233
1234 /* Unprivileged containers cannot create devices, so
1235 * bind mount the device from the host.
1236 */
9cb4d183
SH
1237 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1238 if (ret < 0 || ret >= MAXPATHLEN)
1239 return -1;
1240 pathfile = fopen(path, "wb");
1241 if (!pathfile) {
1242 SYSERROR("Failed to create device mount target '%s'", path);
1243 return -1;
1244 }
1245 fclose(pathfile);
0bbf8572
CB
1246 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1247 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1248 return -1;
1249 }
0bbf8572
CB
1250 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1251 } else {
1252 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1253 }
1254 }
3a32201c 1255 umask(cmask);
c6883f38 1256
0bbf8572 1257 INFO("populated container /dev");
c6883f38
SH
1258 return 0;
1259}
1260
cc28d0b0 1261static int setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1262{
91c3e281
CB
1263 struct bdev *bdev;
1264 const struct lxc_rootfs *rootfs;
cc28d0b0 1265
91c3e281 1266 rootfs = &conf->rootfs;
a0f379bf 1267 if (!rootfs->path) {
91c3e281
CB
1268 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1269 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1270 return -1;
1271 }
c69bd12f 1272 return 0;
a0f379bf 1273 }
0ad19a3f 1274
12297168 1275 if (access(rootfs->mount, F_OK)) {
91c3e281 1276 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1277 rootfs->mount);
b1789442
DL
1278 return -1;
1279 }
1280
91c3e281
CB
1281 /* First try mounting rootfs using a bdev. */
1282 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1283 if (bdev && !bdev->ops->mount(bdev)) {
59d66af2 1284 bdev_put(bdev);
91c3e281
CB
1285 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1286 rootfs->path, rootfs->mount,
1287 rootfs->options ? rootfs->options : "(null)");
9be53773
SH
1288 return 0;
1289 }
59d66af2
SH
1290 if (bdev)
1291 bdev_put(bdev);
a17b1e65 1292 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
91c3e281
CB
1293 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1294 rootfs->path, rootfs->mount,
1295 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1296 return -1;
1297 }
0ad19a3f 1298
91c3e281
CB
1299 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1300 rootfs->path, rootfs->mount,
1301 rootfs->options ? rootfs->options : "(null)");
ac778708
DL
1302 return 0;
1303}
1304
91e93c71
AV
1305int prepare_ramfs_root(char *root)
1306{
eab15c1e 1307 char buf[LXC_LINELEN], *p;
91e93c71
AV
1308 char nroot[PATH_MAX];
1309 FILE *f;
1310 int i;
1311 char *p2;
1312
1313 if (realpath(root, nroot) == NULL)
39c7b795 1314 return -errno;
91e93c71
AV
1315
1316 if (chdir("/") == -1)
39c7b795 1317 return -errno;
91e93c71
AV
1318
1319 /*
1320 * We could use here MS_MOVE, but in userns this mount is
1321 * locked and can't be moved.
1322 */
39c7b795 1323 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1324 SYSERROR("Failed to move %s into /", root);
39c7b795 1325 return -errno;
91e93c71
AV
1326 }
1327
39c7b795 1328 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1329 SYSERROR("Failed to make . rprivate");
39c7b795 1330 return -errno;
91e93c71
AV
1331 }
1332
1333 /*
1334 * The following code cleans up inhereted mounts which are not
1335 * required for CT.
1336 *
1337 * The mountinfo file shows not all mounts, if a few points have been
1338 * unmounted between read operations from the mountinfo. So we need to
1339 * read mountinfo a few times.
1340 *
1341 * This loop can be skipped if a container uses unserns, because all
1342 * inherited mounts are locked and we should live with all this trash.
1343 */
1344 while (1) {
1345 int progress = 0;
1346
1347 f = fopen("./proc/self/mountinfo", "r");
1348 if (!f) {
1349 SYSERROR("Unable to open /proc/self/mountinfo");
1350 return -1;
1351 }
eab15c1e 1352 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1353 for (p = buf, i=0; p && i < 4; i++)
1354 p = strchr(p+1, ' ');
1355 if (!p)
1356 continue;
1357 p2 = strchr(p+1, ' ');
1358 if (!p2)
1359 continue;
1360
1361 *p2 = '\0';
1362 *p = '.';
1363
1364 if (strcmp(p + 1, "/") == 0)
1365 continue;
1366 if (strcmp(p + 1, "/proc") == 0)
1367 continue;
1368
1369 if (umount2(p, MNT_DETACH) == 0)
1370 progress++;
1371 }
1372 fclose(f);
1373 if (!progress)
1374 break;
1375 }
1376
8bea9fae
PR
1377 /* This also can be skipped if a container uses unserns */
1378 umount2("./proc", MNT_DETACH);
91e93c71
AV
1379
1380 /* It is weird, but chdir("..") moves us in a new root */
1381 if (chdir("..") == -1) {
1382 SYSERROR("Unable to change working directory");
1383 return -1;
1384 }
1385
1386 if (chroot(".") == -1) {
1387 SYSERROR("Unable to chroot");
1388 return -1;
1389 }
1390
1391 return 0;
1392}
1393
74a3920a 1394static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1395{
39c7b795
CB
1396 if (!rootfs->path) {
1397 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1398 return 0;
39c7b795 1399 }
ac778708 1400
91e93c71 1401 if (detect_ramfs_rootfs()) {
39c7b795
CB
1402 DEBUG("detected that container is on ramfs");
1403 if (prepare_ramfs_root(rootfs->mount)) {
1404 ERROR("failed to prepare minimal ramfs root");
91e93c71 1405 return -1;
39c7b795
CB
1406 }
1407
1408 DEBUG("prepared ramfs root for container");
1409 return 0;
1410 }
1411
1412 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1413 ERROR("failed to pivot root");
25368b52 1414 return -1;
c69bd12f
DL
1415 }
1416
39c7b795 1417 DEBUG("finished pivot root");
25368b52 1418 return 0;
0ad19a3f 1419}
1420
/*
 * Give the container its own devpts instance on /dev/pts and make /dev/ptmx
 * refer to that instance's ptmx.
 *
 * Nothing is done when @num_pts is 0. /dev/ptmx is preferably a bind mount
 * of /dev/pts/ptmx; if the bind mount fails, a symlink is created instead.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *opts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int rc;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts devices are requested");
		return 0;
	}

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	rc = mkdir("/dev/pts", 0755);
	if (rc < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, opts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", opts);

	/* Drop whatever currently sits at /dev/ptmx. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty file serves as target for bind-mounting /dev/pts/ptmx. */
	rc = open("/dev/ptmx", O_CREAT, 0666);
	if (rc < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(rc);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: carry on and try the symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above is in the way of the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1505
cccc74b5
DL
/*
 * Apply the execution domain @persona via personality().
 *
 * A value of -1 means "leave it alone". When the build lacks
 * HAVE_SYS_PERSONALITY_H the function does nothing.
 *
 * Returns 0 on success (or when nothing was done), -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1522
3d7d929a
CB
1523static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1524 const struct lxc_console *console)
6e590161 1525{
63376d7d 1526 char path[MAXPATHLEN];
0728ebf4 1527 int ret, fd;
52e35957 1528
8b1b1210
CB
1529 if (console->path && !strcmp(console->path, "none"))
1530 return 0;
1531
7c6ef2a2 1532 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1533 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1534 return -1;
52e35957 1535
8b1b1210
CB
1536 /* When we are asked to setup a console we remove any previous
1537 * /dev/console bind-mounts.
1538 */
a7ba3c7f
CB
1539 if (file_exists(path)) {
1540 ret = lxc_unstack_mountpoint(path, false);
1541 if (ret < 0) {
8b1b1210 1542 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1543 return -ret;
1544 } else {
1545 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1546 }
1547 ret = unlink(path);
1548 if (ret < 0) {
1549 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1550 return -errno;
1551 }
8b1b1210
CB
1552 }
1553
1554 /* For unprivileged containers autodev or automounts will already have
1555 * taken care of creating /dev/console.
1556 */
0728ebf4
TA
1557 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1558 if (fd < 0) {
1559 if (errno != EEXIST) {
1560 SYSERROR("failed to create console");
3d7d929a 1561 return -errno;
0728ebf4
TA
1562 }
1563 } else {
1564 close(fd);
52e35957
DL
1565 }
1566
0728ebf4 1567 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1568 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1569 return -errno;
63376d7d 1570 }
13954cce 1571
3d7d929a 1572 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1573 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1574 return -1;
1575 }
1576
3d7d929a 1577 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1578 return 0;
1579}
1580
3d7d929a
CB
1581static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1582 const struct lxc_console *console,
1583 char *ttydir)
7c6ef2a2 1584{
7c6ef2a2 1585 int ret;
3d7d929a 1586 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1587
1588 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1589 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1590 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1591 return -1;
3d7d929a 1592
7c6ef2a2
SH
1593 ret = mkdir(path, 0755);
1594 if (ret && errno != EEXIST) {
959aee9c 1595 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1596 return -errno;
7c6ef2a2 1597 }
3d7d929a 1598 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1599
3d7d929a
CB
1600 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1601 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1602 return -1;
1603
7c6ef2a2 1604 ret = creat(lxcpath, 0660);
3d7d929a 1605 if (ret == -1 && errno != EEXIST) {
959aee9c 1606 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1607 return -errno;
7c6ef2a2 1608 }
4d44e274
SH
1609 if (ret >= 0)
1610 close(ret);
7c6ef2a2 1611
2a12fefd
CB
1612 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1613 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1614 return -1;
2a12fefd
CB
1615
1616 /* When we are asked to setup a console we remove any previous
1617 * /dev/console bind-mounts.
1618 */
1619 if (console->path && !strcmp(console->path, "none")) {
1620 struct stat st;
1621 ret = stat(path, &st);
1622 if (ret < 0) {
1623 if (errno == ENOENT)
1624 return 0;
1625 SYSERROR("failed stat() \"%s\"", path);
1626 return -errno;
1627 }
1628
1629 /* /dev/console must be character device with major number 5 and
1630 * minor number 1. If not, give benefit of the doubt and assume
1631 * the user has mounted something else right there on purpose.
1632 */
1633 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1634 return 0;
1635
1636 /* In case the user requested a bind-mount for /dev/console and
1637 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1638 * /dev/<ttydir/console.
1639 * Note, we only move the uppermost mount and clear all other
1640 * mounts underneath for safety.
1641 * If it is a character device created via mknod() we simply
1642 * rename it.
2a12fefd
CB
1643 */
1644 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1645 if (ret < 0) {
1646 if (errno != EINVAL) {
1647 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1648 return -errno;
1649 }
1650 /* path was not a mountpoint */
1651 ret = rename(path, lxcpath);
1652 if (ret < 0) {
1653 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1654 return -errno;
1655 }
1656 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1657 } else {
1658 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1659 }
a7ba3c7f
CB
1660
1661 /* Clear all remaining bind-mounts. */
1662 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1663 if (ret < 0) {
a7ba3c7f
CB
1664 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1665 return -ret;
1666 } else {
1667 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1668 }
1669 } else {
1670 if (file_exists(path)) {
1671 ret = lxc_unstack_mountpoint(path, false);
1672 if (ret < 0) {
2a12fefd 1673 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1674 return -ret;
1675 } else {
1676 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1677 }
2a12fefd
CB
1678 }
1679
1680 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1681 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1682 return -1;
1683 }
1684 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1685 }
1686
2a12fefd 1687 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1688 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1689 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1690 return -1;
3d7d929a 1691
2a12fefd
CB
1692 ret = unlink(path);
1693 if (ret && errno != ENOENT) {
1694 SYSERROR("error unlinking %s", path);
1695 return -errno;
1696 }
1697
7c6ef2a2 1698 ret = symlink(lxcpath, path);
3d7d929a
CB
1699 if (ret < 0) {
1700 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1701 return -1;
1702 }
1703
3d7d929a 1704 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1705 return 0;
1706}
1707
3d7d929a
CB
1708static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1709 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1710{
3d7d929a
CB
1711 /* We don't have a rootfs, /dev/console will be shared. */
1712 if (!rootfs->path) {
1713 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1714 return 0;
3d7d929a
CB
1715 }
1716
7c6ef2a2 1717 if (!ttydir)
3d7d929a 1718 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1719
3d7d929a 1720 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1721}
1722
1bd051a6
SH
1723static int setup_kmsg(const struct lxc_rootfs *rootfs,
1724 const struct lxc_console *console)
1725{
1726 char kpath[MAXPATHLEN];
1727 int ret;
1728
222fea5a
DE
1729 if (!rootfs->path)
1730 return 0;
1bd051a6
SH
1731 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1732 if (ret < 0 || ret >= sizeof(kpath))
1733 return -1;
1734
1735 ret = unlink(kpath);
1736 if (ret && errno != ENOENT) {
959aee9c 1737 SYSERROR("error unlinking %s", kpath);
1bd051a6
SH
1738 return -1;
1739 }
1740
1741 ret = symlink("console", kpath);
1742 if (ret) {
1743 SYSERROR("failed to create symlink for kmsg");
1744 return -1;
1745 }
1746
1747 return 0;
1748}
1749
998ac676
RT
1750static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1751{
1752 struct mount_opt *mo;
1753
1754 /* If opt is found in mount_opt, set or clear flags.
1755 * Otherwise append it to data. */
1756
1757 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1758 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1759 if (mo->clear)
1760 *flags &= ~mo->flag;
1761 else
1762 *flags |= mo->flag;
1763 return;
1764 }
1765 }
1766
1767 if (strlen(*data))
1768 strcat(*data, ",");
1769 strcat(*data, opt);
1770}
1771
/*
 * Split the comma-separated option string @mntopts into mount flags and a
 * filesystem specific data string.
 *
 * *mntflags receives the accumulated flags (0 if there are none). *mntdata
 * receives a newly allocated string holding the remaining options, or NULL
 * when there are none; the caller has to free() it.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *collected;
	char *tok;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* the data string can never get longer than the input */
	collected = malloc(strlen(copy) + 1);
	if (!collected) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	collected[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &collected);
		tok = strtok_r(NULL, ",", &state);
	}

	if (collected[0] != '\0')
		*mntdata = collected;
	else
		free(collected);
	free(copy);

	return 0;
}
1810
6fd5e769
SH
/*
 * Terminate @word in place at its first blank or tab. A string without
 * either is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1817
/*
 * Skip @nfields blank- or tab-terminated fields in @src.
 *
 * Exactly one separator character is consumed per field. Returns a pointer
 * to what follows the last skipped separator, or to the terminating NUL if
 * @src has fewer fields than requested.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;

	while (nfields-- > 0) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1835
911324ef
DL
/*
 * Mount @fsname on @target and, for bind or remount requests, follow up with
 * a remount so that the requested flags take effect.
 *
 * The first mount goes through safe_mount() with MS_REMOUNT masked out. If
 * MS_REMOUNT or MS_BIND was requested, a second mount() call with
 * MS_REMOUNT is issued. With HAVE_STATVFS the flags reported by statvfs()
 * for @fsname (nosuid, nodev unless @dev is set, rdonly, noexec) are added
 * to that remount, and a plain bind mount that needs no additional flags
 * skips the remount altogether.
 *
 * A failing mount is only logged and reported as success when @optional is
 * set.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
	unsigned long rdonly_req;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* only bind and remount requests need the second pass */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto done;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rdonly_req = mountflags & MS_RDONLY;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long extra = rdonly_req;

		extra |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
		if ((sb.f_flag & MS_NODEV) && !dev)
			extra |= MS_NODEV;

		DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, extra);

		/*
		 * A plain bind mount whose flags already contain everything
		 * that is required, and for which read-only was not
		 * requested, does not need to be remounted.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(extra & ~mountflags) && rdonly_req == 0) {
			DEBUG("mountflags already was %lu, skipping remount",
			      mountflags);
			goto done;
		}

		mountflags |= extra;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

done:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1912
4e4ca161
SH
/*
 * Strip the options 'create=dir', 'create=file' and 'optional' from
 * mntent->mnt_opts in place.
 *
 * Only the first occurrence of each is handled. An occurrence followed by a
 * comma is cut out together with that comma; an occurrence at the end of
 * the string simply truncates the string at that position.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* pull the remaining options forward, NUL included */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* it was the last option, so just chop it here */
			*hit = '\0';
	}
}
1937
/*
 * Create the mount target @path for @mntent when the entry asks for it.
 *
 * For "overlay" and "aufs" entries the matching *_mkdir() helper is run
 * first. With the "create=dir" option @path is created as a directory; with
 * "create=file" (and @path not existing yet) its parent directory and an
 * empty file are created.
 *
 * Returns 0 on success, -1 if the target could not be created.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;
	FILE *pathfile = NULL;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		char *pathcopy;
		char *pathdirname;

		/* dirname() may modify its argument, so give it a copy */
		pathcopy = strdup(path);
		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}

		/* dirname() may return a pointer into static storage, so
		 * its result must never be handed to free(); only the copy
		 * made above is freed.
		 */
		pathdirname = dirname(pathcopy);
		if (mkdir_p(pathdirname, 0755) < 0) {
			WARN("Failed to create target directory");
		}
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1978
ec50007f
CB
1979/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1980 * without a rootfs. */
db4aba38 1981static inline int mount_entry_on_generic(struct mntent *mntent,
0a2dddd4
CB
1982 const char* path, const struct lxc_rootfs *rootfs,
1983 const char *lxc_name, const char *lxc_path)
4d5b72a1
NC
1984{
1985 unsigned long mntflags;
1986 char *mntdata;
1987 int ret;
1988 bool optional = hasmntopt(mntent, "optional") != NULL;
ae7a770e 1989 bool dev = hasmntopt(mntent, "dev") != NULL;
4d5b72a1 1990
ec50007f
CB
1991 char *rootfs_path = NULL;
1992 if (rootfs && rootfs->path)
1993 rootfs_path = rootfs->mount;
1994
0a2dddd4 1995 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
34cfffb3 1996
608e3567
SH
1997 if (ret < 0)
1998 return optional ? 0 : -1;
1999
4e4ca161
SH
2000 cull_mntent_opt(mntent);
2001
a17b1e65
SG
2002 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
2003 free(mntdata);
2004 return -1;
2005 }
2006
6e46cc0d 2007 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 2008 mntdata, optional, dev, rootfs_path);
68c152ef 2009
911324ef 2010 free(mntdata);
911324ef
DL
2011 return ret;
2012}
2013
db4aba38
NC
/* Mount an entry for a container that has no rootfs of its own.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to
 * the mount directory when it is missing.
 *
 * Returns -1 if the resulting path does not fit in MAXPATHLEN, otherwise the
 * result of mount_entry_on_generic(). */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2033
4e4ca161 2034static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2035 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2036 const char *lxc_name,
2037 const char *lxc_path)
911324ef 2038{
013bd428 2039 char *aux;
59760f5d 2040 char path[MAXPATHLEN];
80a881b2 2041 int r, ret = 0, offset;
67e571de 2042 const char *lxcpath;
0ad19a3f 2043
593e8478 2044 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
2045 if (!lxcpath) {
2046 ERROR("Out of memory");
2047 return -1;
2048 }
2049
80a881b2 2050 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
2051 * use $lxcpath/CN/rootfs as the target prefix */
2052 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
2053 if (r < 0 || r >= MAXPATHLEN)
2054 goto skipvarlib;
2055
2056 aux = strstr(mntent->mnt_dir, path);
2057 if (aux) {
2058 offset = strlen(path);
2059 goto skipabs;
2060 }
2061
2062skipvarlib:
013bd428
DL
2063 aux = strstr(mntent->mnt_dir, rootfs->path);
2064 if (!aux) {
2065 WARN("ignoring mount point '%s'", mntent->mnt_dir);
db4aba38 2066 return ret;
013bd428 2067 }
80a881b2
SH
2068 offset = strlen(rootfs->path);
2069
2070skipabs:
013bd428 2071
9ba8130c 2072 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
2073 aux + offset);
2074 if (r < 0 || r >= MAXPATHLEN) {
2075 WARN("pathnme too long for '%s'", mntent->mnt_dir);
a17b1e65
SG
2076 return -1;
2077 }
2078
0a2dddd4 2079 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2080}
d330fe7b 2081
4e4ca161 2082static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2083 const struct lxc_rootfs *rootfs,
2084 const char *lxc_name,
2085 const char *lxc_path)
911324ef
DL
2086{
2087 char path[MAXPATHLEN];
911324ef 2088 int ret;
d330fe7b 2089
34cfffb3 2090 /* relative to root mount point */
6e46cc0d 2091 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2092 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2093 ERROR("path name too long");
2094 return -1;
2095 }
911324ef 2096
0a2dddd4 2097 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2098}
2099
80a881b2 2100static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
0a2dddd4 2101 const char *lxc_name, const char *lxc_path)
911324ef 2102{
aaf901be
AM
2103 struct mntent mntent;
2104 char buf[4096];
911324ef 2105 int ret = -1;
e76b8764 2106
aaf901be 2107 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 2108
911324ef 2109 if (!rootfs->path) {
aaf901be 2110 if (mount_entry_on_systemfs(&mntent))
e76b8764 2111 goto out;
911324ef 2112 continue;
e76b8764
CDC
2113 }
2114
911324ef 2115 /* We have a separate root, mounts are relative to it */
aaf901be 2116 if (mntent.mnt_dir[0] != '/') {
0a2dddd4 2117 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef
DL
2118 goto out;
2119 continue;
2120 }
cd54d859 2121
0a2dddd4 2122 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef 2123 goto out;
0ad19a3f 2124 }
cd54d859 2125
0ad19a3f 2126 ret = 0;
cd54d859
DL
2127
2128 INFO("mount points have been setup");
0ad19a3f 2129out:
e7938e9e
MN
2130 return ret;
2131}
2132
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab is not an error and returns 0. Returns -1 if the file cannot
 * be opened, otherwise the result of mount_file_entries(). */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int rc;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	endmntent(entries);

	return rc;
}
2153
5ef5c9a3 2154FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2155{
5ef5c9a3 2156 int ret;
e7938e9e 2157 char *mount_entry;
5ef5c9a3
CB
2158 struct lxc_list *iterator;
2159 FILE *file;
2160 int fd = -1;
2161
2162 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2163 if (fd < 0) {
2164 if (errno != ENOSYS)
2165 return NULL;
2166 file = tmpfile();
2167 } else {
2168 file = fdopen(fd, "r+");
2169 }
e7938e9e 2170
e7938e9e 2171 if (!file) {
fad6ef95 2172 int saved_errno = errno;
5ef5c9a3
CB
2173 if (fd != -1)
2174 close(fd);
fad6ef95 2175 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
9fc7f8c0 2176 return NULL;
e7938e9e
MN
2177 }
2178
2179 lxc_list_for_each(iterator, mount) {
2180 mount_entry = iterator->elem;
5ef5c9a3
CB
2181 ret = fprintf(file, "%s\n", mount_entry);
2182 if (ret < strlen(mount_entry))
2183 WARN("Could not write mount entry to anonymous mount file.");
2184 }
2185
2186 if (fseek(file, 0, SEEK_SET) < 0) {
2187 fclose(file);
2188 return NULL;
e7938e9e
MN
2189 }
2190
9fc7f8c0
TA
2191 return file;
2192}
2193
5ef5c9a3
CB
/* Mount all entries of the @mount list.
 *
 * The entries are first written to an anonymous file by
 * make_anonymous_mount_file() and then processed by mount_file_entries().
 *
 * Returns -1 if the anonymous file cannot be created, otherwise the result
 * of mount_file_entries(). */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	rc = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2210
bab88e68
CS
2211static int parse_cap(const char *cap)
2212{
2213 char *ptr = NULL;
84760c11 2214 size_t i;
2215 int capid = -1;
bab88e68 2216
7035407c
DE
2217 if (!strcmp(cap, "none"))
2218 return -2;
2219
bab88e68
CS
2220 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2221
2222 if (strcmp(cap, caps_opt[i].name))
2223 continue;
2224
2225 capid = caps_opt[i].value;
2226 break;
2227 }
2228
2229 if (capid < 0) {
2230 /* try to see if it's numeric, so the user may specify
2231 * capabilities that the running kernel knows about but
2232 * we don't */
2233 errno = 0;
2234 capid = strtol(cap, &ptr, 10);
2235 if (!ptr || *ptr != '\0' || errno != 0)
2236 /* not a valid number */
2237 capid = -1;
2238 else if (capid > lxc_caps_last_cap())
2239 /* we have a number but it's not a valid
2240 * capability */
2241 capid = -1;
2242 }
2243
2244 return capid;
2245}
2246
0769b82a
CS
2247int in_caplist(int cap, struct lxc_list *caps)
2248{
2249 struct lxc_list *iterator;
2250 int capid;
2251
2252 lxc_list_for_each(iterator, caps) {
2253 capid = parse_cap(iterator->elem);
2254 if (capid == cap)
2255 return 1;
2256 }
2257
2258 return 0;
2259}
2260
81810dd1
DL
2261static int setup_caps(struct lxc_list *caps)
2262{
2263 struct lxc_list *iterator;
2264 char *drop_entry;
bab88e68 2265 int capid;
81810dd1
DL
2266
2267 lxc_list_for_each(iterator, caps) {
2268
2269 drop_entry = iterator->elem;
2270
bab88e68 2271 capid = parse_cap(drop_entry);
d55bc1ad 2272
81810dd1 2273 if (capid < 0) {
1e11be34
DL
2274 ERROR("unknown capability %s", drop_entry);
2275 return -1;
81810dd1
DL
2276 }
2277
2278 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2279
2280 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2281 SYSERROR("failed to remove %s capability", drop_entry);
2282 return -1;
2283 }
81810dd1
DL
2284
2285 }
2286
1fb86a7c
SH
2287 DEBUG("capabilities have been setup");
2288
2289 return 0;
2290}
2291
2292static int dropcaps_except(struct lxc_list *caps)
2293{
2294 struct lxc_list *iterator;
2295 char *keep_entry;
1fb86a7c
SH
2296 int i, capid;
2297 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2298 INFO("found %d capabilities", numcaps);
1fb86a7c 2299
2caf9a97
SH
2300 if (numcaps <= 0 || numcaps > 200)
2301 return -1;
2302
1fb86a7c
SH
2303 // caplist[i] is 1 if we keep capability i
2304 int *caplist = alloca(numcaps * sizeof(int));
2305 memset(caplist, 0, numcaps * sizeof(int));
2306
2307 lxc_list_for_each(iterator, caps) {
2308
2309 keep_entry = iterator->elem;
2310
bab88e68 2311 capid = parse_cap(keep_entry);
1fb86a7c 2312
7035407c
DE
2313 if (capid == -2)
2314 continue;
2315
1fb86a7c
SH
2316 if (capid < 0) {
2317 ERROR("unknown capability %s", keep_entry);
2318 return -1;
2319 }
2320
8255688a 2321 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2322
2323 caplist[capid] = 1;
2324 }
2325 for (i=0; i<numcaps; i++) {
2326 if (caplist[i])
2327 continue;
2328 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2329 SYSERROR("failed to remove capability %d", i);
2330 return -1;
2331 }
1fb86a7c
SH
2332 }
2333
2334 DEBUG("capabilities have been setup");
81810dd1
DL
2335
2336 return 0;
2337}
2338
0ad19a3f 2339static int setup_hw_addr(char *hwaddr, const char *ifname)
2340{
2341 struct sockaddr sockaddr;
2342 struct ifreq ifr;
fad6ef95 2343 int ret, fd, saved_errno;
0ad19a3f 2344
3cfc0f3a
MN
2345 ret = lxc_convert_mac(hwaddr, &sockaddr);
2346 if (ret) {
2347 ERROR("mac address '%s' conversion failed : %s",
2348 hwaddr, strerror(-ret));
0ad19a3f 2349 return -1;
2350 }
2351
2352 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2353 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2354 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2355
2356 fd = socket(AF_INET, SOCK_DGRAM, 0);
2357 if (fd < 0) {
3ab87b66 2358 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2359 return -1;
2360 }
2361
2362 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2363 saved_errno = errno;
0ad19a3f 2364 close(fd);
2365 if (ret)
fad6ef95 2366 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2367
5da6aa8c 2368 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2369
0ad19a3f 2370 return ret;
2371}
2372
82d5ae15 2373static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2374{
82d5ae15
DL
2375 struct lxc_list *iterator;
2376 struct lxc_inetdev *inetdev;
3cfc0f3a 2377 int err;
0ad19a3f 2378
82d5ae15
DL
2379 lxc_list_for_each(iterator, ip) {
2380
2381 inetdev = iterator->elem;
2382
0093bb8c
DL
2383 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2384 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2385 if (err) {
2386 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2387 ifindex, strerror(-err));
82d5ae15
DL
2388 return -1;
2389 }
2390 }
2391
2392 return 0;
0ad19a3f 2393}
2394
82d5ae15 2395static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2396{
82d5ae15 2397 struct lxc_list *iterator;
7fa9074f 2398 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2399 int err;
0ad19a3f 2400
82d5ae15
DL
2401 lxc_list_for_each(iterator, ip) {
2402
2403 inet6dev = iterator->elem;
2404
b3df193c 2405 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2406 &inet6dev->mcast, &inet6dev->acast,
2407 inet6dev->prefix);
3cfc0f3a
MN
2408 if (err) {
2409 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2410 ifindex, strerror(-err));
82d5ae15 2411 return -1;
3cfc0f3a 2412 }
82d5ae15
DL
2413 }
2414
2415 return 0;
0ad19a3f 2416}
2417
82d5ae15 2418static int setup_netdev(struct lxc_netdev *netdev)
0ad19a3f 2419{
0ad19a3f 2420 char ifname[IFNAMSIZ];
0ad19a3f 2421 char *current_ifname = ifname;
3cfc0f3a 2422 int err;
0ad19a3f 2423
82d5ae15
DL
2424 /* empty network namespace */
2425 if (!netdev->ifindex) {
b0efbac4 2426 if (netdev->flags & IFF_UP) {
d472214b 2427 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2428 if (err) {
2429 ERROR("failed to set the loopback up : %s",
2430 strerror(-err));
82d5ae15
DL
2431 return -1;
2432 }
82d5ae15 2433 }
40790553
SH
2434 if (netdev->type != LXC_NET_VETH)
2435 return 0;
2436 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2437 }
13954cce 2438
b466dc33 2439 /* get the new ifindex in case of physical netdev */
40790553 2440 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2441 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2442 ERROR("failed to get ifindex for %s",
2443 netdev->link);
2444 return -1;
2445 }
40790553 2446 }
b466dc33 2447
82d5ae15
DL
2448 /* retrieve the name of the interface */
2449 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2450 ERROR("no interface corresponding to index '%d'",
82d5ae15 2451 netdev->ifindex);
0ad19a3f 2452 return -1;
2453 }
13954cce 2454
018ef520 2455 /* default: let the system to choose one interface name */
9d083402 2456 if (!netdev->name)
fb6d9b2f
DL
2457 netdev->name = netdev->type == LXC_NET_PHYS ?
2458 netdev->link : "eth%d";
018ef520 2459
82d5ae15 2460 /* rename the interface name */
40790553
SH
2461 if (strcmp(ifname, netdev->name) != 0) {
2462 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2463 if (err) {
2464 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2465 strerror(-err));
2466 return -1;
2467 }
018ef520
DL
2468 }
2469
2470 /* Re-read the name of the interface because its name has changed
2471 * and would be automatically allocated by the system
2472 */
82d5ae15 2473 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2474 ERROR("no interface corresponding to index '%d'",
82d5ae15 2475 netdev->ifindex);
018ef520 2476 return -1;
0ad19a3f 2477 }
2478
82d5ae15
DL
2479 /* set a mac address */
2480 if (netdev->hwaddr) {
2481 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2482 ERROR("failed to setup hw address for '%s'",
82d5ae15 2483 current_ifname);
0ad19a3f 2484 return -1;
2485 }
2486 }
2487
82d5ae15
DL
2488 /* setup ipv4 addresses on the interface */
2489 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2490 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2491 ifname);
2492 return -1;
2493 }
2494
82d5ae15
DL
2495 /* setup ipv6 addresses on the interface */
2496 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2497 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2498 ifname);
2499 return -1;
2500 }
2501
82d5ae15 2502 /* set the network device up */
b0efbac4 2503 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2504 int err;
2505
d472214b 2506 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2507 if (err) {
2508 ERROR("failed to set '%s' up : %s", current_ifname,
2509 strerror(-err));
0ad19a3f 2510 return -1;
2511 }
2512
2513 /* the network is up, make the loopback up too */
d472214b 2514 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2515 if (err) {
2516 ERROR("failed to set the loopback up : %s",
2517 strerror(-err));
0ad19a3f 2518 return -1;
2519 }
2520 }
2521
f8fee0e2
MK
2522 /* We can only set up the default routes after bringing
2523 * up the interface, sine bringing up the interface adds
2524 * the link-local routes and we can't add a default
2525 * route if the gateway is not reachable. */
2526
2527 /* setup ipv4 gateway on the interface */
2528 if (netdev->ipv4_gateway) {
2529 if (!(netdev->flags & IFF_UP)) {
2530 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2531 return -1;
2532 }
2533
2534 if (lxc_list_empty(&netdev->ipv4)) {
2535 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2536 return -1;
2537 }
2538
2539 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2540 if (err) {
fc739df5
SG
2541 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2542 if (err) {
2543 ERROR("failed to add ipv4 dest for '%s': %s",
2544 ifname, strerror(-err));
2545 }
2546
2547 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2548 if (err) {
2549 ERROR("failed to setup ipv4 gateway for '%s': %s",
2550 ifname, strerror(-err));
2551 if (netdev->ipv4_gateway_auto) {
2552 char buf[INET_ADDRSTRLEN];
2553 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2554 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2555 }
2556 return -1;
19a26f82 2557 }
f8fee0e2
MK
2558 }
2559 }
2560
2561 /* setup ipv6 gateway on the interface */
2562 if (netdev->ipv6_gateway) {
2563 if (!(netdev->flags & IFF_UP)) {
2564 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2565 return -1;
2566 }
2567
2568 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2569 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2570 return -1;
2571 }
2572
2573 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2574 if (err) {
fc739df5
SG
2575 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2576 if (err) {
2577 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2578 ifname, strerror(-err));
19a26f82 2579 }
fc739df5
SG
2580
2581 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2582 if (err) {
2583 ERROR("failed to setup ipv6 gateway for '%s': %s",
2584 ifname, strerror(-err));
2585 if (netdev->ipv6_gateway_auto) {
2586 char buf[INET6_ADDRSTRLEN];
2587 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2588 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2589 }
2590 return -1;
2591 }
f8fee0e2
MK
2592 }
2593 }
2594
cd54d859
DL
2595 DEBUG("'%s' has been setup", current_ifname);
2596
0ad19a3f 2597 return 0;
2598}
2599
5f4535a3 2600static int setup_network(struct lxc_list *network)
0ad19a3f 2601{
82d5ae15 2602 struct lxc_list *iterator;
82d5ae15 2603 struct lxc_netdev *netdev;
0ad19a3f 2604
5f4535a3 2605 lxc_list_for_each(iterator, network) {
cd54d859 2606
5f4535a3 2607 netdev = iterator->elem;
82d5ae15
DL
2608
2609 if (setup_netdev(netdev)) {
2610 ERROR("failed to setup netdev");
2611 return -1;
2612 }
2613 }
cd54d859 2614
5f4535a3
DL
2615 if (!lxc_list_empty(network))
2616 INFO("network has been setup");
cd54d859
DL
2617
2618 return 0;
0ad19a3f 2619}
2620
c6d09e15
WB
2621static int parse_resource(const char *res) {
2622 size_t i;
2623 int resid = -1;
2624
2625 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2626 if (strcmp(res, limit_opt[i].name) == 0)
2627 return limit_opt[i].value;
2628 }
2629
2630 /* try to see if it's numeric, so the user may specify
2631 * resources that the running kernel knows about but
2632 * we don't */
2633 if (lxc_safe_int(res, &resid) == 0)
2634 return resid;
2635 return -1;
2636}
2637
2638int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2639 struct lxc_list *it;
2640 struct lxc_limit *lim;
2641 int resid;
2642
2643 lxc_list_for_each(it, limits) {
2644 lim = it->elem;
2645
2646 resid = parse_resource(lim->resource);
2647 if (resid < 0) {
2648 ERROR("unknown resource %s", lim->resource);
2649 return -1;
2650 }
2651
2652 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2653 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2654 return -1;
2655 }
2656 }
2657 return 0;
2658}
2659
2af6bd1b 2660/* try to move physical nics to the init netns */
5610055a 2661void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2662{
64d2fcb5 2663 int i, oldfd;
4ec31c52 2664 char ifname[IFNAMSIZ];
2af6bd1b 2665
5610055a 2666 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2667 return;
2668
64d2fcb5 2669 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2670
64d2fcb5
CB
2671 oldfd = lxc_preserve_ns(getpid(), "net");
2672 if (oldfd < 0) {
2673 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2674 return;
2675 }
64d2fcb5 2676
2af6bd1b
SH
2677 if (setns(netnsfd, 0) != 0) {
2678 SYSERROR("Failed to enter container netns to reset nics");
2679 close(oldfd);
2680 return;
2681 }
2682 for (i=0; i<conf->num_savednics; i++) {
2683 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2684 /* retrieve the name of the interface */
2685 if (!if_indextoname(s->ifindex, ifname)) {
2686 WARN("no interface corresponding to index '%d'", s->ifindex);
2687 continue;
2688 }
5610055a 2689 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2690 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2691 free(s->orig_name);
2af6bd1b 2692 }
5610055a
WB
2693 conf->num_savednics = 0;
2694
2af6bd1b
SH
2695 if (setns(oldfd, 0) != 0)
2696 SYSERROR("Failed to re-enter monitor's netns");
2697 close(oldfd);
2698}
2699
ae9242c8
SH
2700static char *default_rootfs_mount = LXCROOTFSMOUNT;
2701
7b379ab3 2702struct lxc_conf *lxc_conf_init(void)
089cd8b8 2703{
7b379ab3 2704 struct lxc_conf *new;
26ddeedd 2705 int i;
7b379ab3
MN
2706
2707 new = malloc(sizeof(*new));
2708 if (!new) {
2709 ERROR("lxc_conf_init : %m");
2710 return NULL;
2711 }
2712 memset(new, 0, sizeof(*new));
2713
b40a606e 2714 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
cccc74b5 2715 new->personality = -1;
124fa0a8 2716 new->autodev = 1;
596a818d
DE
2717 new->console.log_path = NULL;
2718 new->console.log_fd = -1;
28a4b0e5 2719 new->console.path = NULL;
63376d7d 2720 new->console.peer = -1;
b5159817
DE
2721 new->console.peerpty.busy = -1;
2722 new->console.peerpty.master = -1;
2723 new->console.peerpty.slave = -1;
63376d7d
DL
2724 new->console.master = -1;
2725 new->console.slave = -1;
2726 new->console.name[0] = '\0';
d2e30e99 2727 new->maincmd_fd = -1;
76a26f55 2728 new->nbd_idx = -1;
54c30e29 2729 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048
SH
2730 if (!new->rootfs.mount) {
2731 ERROR("lxc_conf_init : %m");
2732 free(new);
2733 return NULL;
2734 }
d89de239 2735 new->kmsg = 0;
858377e4 2736 new->logfd = -1;
7b379ab3
MN
2737 lxc_list_init(&new->cgroup);
2738 lxc_list_init(&new->network);
2739 lxc_list_init(&new->mount_list);
81810dd1 2740 lxc_list_init(&new->caps);
1fb86a7c 2741 lxc_list_init(&new->keepcaps);
f6d3e3e4 2742 lxc_list_init(&new->id_map);
f979ac15 2743 lxc_list_init(&new->includes);
4184c3e1 2744 lxc_list_init(&new->aliens);
7c661726 2745 lxc_list_init(&new->environment);
c6d09e15 2746 lxc_list_init(&new->limits);
26ddeedd
SH
2747 for (i=0; i<NUM_LXC_HOOKS; i++)
2748 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2749 lxc_list_init(&new->groups);
fe4de9a6
DE
2750 new->lsm_aa_profile = NULL;
2751 new->lsm_se_context = NULL;
5112cd70 2752 new->tmp_umount_proc = 0;
7b379ab3 2753
9f30a190
MM
2754 for (i = 0; i < LXC_NS_MAX; i++)
2755 new->inherit_ns_fd[i] = -1;
2756
72bb04e4
PT
2757 /* if running in a new user namespace, init and COMMAND
2758 * default to running as UID/GID 0 when using lxc-execute */
2759 new->init_uid = 0;
2760 new->init_gid = 0;
2761
7b379ab3 2762 return new;
089cd8b8
DL
2763}
2764
a589434e 2765static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2766{
b0ee5983
CB
2767 char *veth1, *veth2;
2768 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
b7b2fde4
CB
2769 int bridge_index, err;
2770 unsigned int mtu = 0;
13954cce 2771
8bee8851 2772 if (netdev->priv.veth_attr.pair) {
e892973e 2773 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2774 if (handler->conf->reboot)
2775 lxc_netdev_delete_by_name(veth1);
2776 } else {
9ba8130c
SH
2777 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2778 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2779 ERROR("veth1 name too long");
2780 return -1;
2781 }
a0265685 2782 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2783 if (!veth1) {
2784 ERROR("failed to allocate a temporary name");
2785 return -1;
2786 }
74a2b586
JK
2787 /* store away for deconf */
2788 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2789 }
82d5ae15 2790
0e391e57 2791 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2792 veth2 = lxc_mkifname(veth2buf);
ad40563e 2793 if (!veth2) {
82d5ae15 2794 ERROR("failed to allocate a temporary name");
ad40563e 2795 goto out_delete;
0ad19a3f 2796 }
2797
3cfc0f3a
MN
2798 err = lxc_veth_create(veth1, veth2);
2799 if (err) {
b0ee5983
CB
2800 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2801 veth2, strerror(-err));
ad40563e 2802 goto out_delete;
0ad19a3f 2803 }
13954cce 2804
49684c0b
CS
2805 /* changing the high byte of the mac address to 0xfe, the bridge interface
2806 * will always keep the host's mac address and not take the mac address
2807 * of a container */
2808 err = setup_private_host_hw_addr(veth1);
2809 if (err) {
b0ee5983
CB
2810 ERROR("failed to change mac address of host interface \"%s\": %s",
2811 veth1, strerror(-err));
49684c0b
CS
2812 goto out_delete;
2813 }
2814
af651aa9
SN
2815 netdev->ifindex = if_nametoindex(veth2);
2816 if (!netdev->ifindex) {
b0ee5983 2817 ERROR("failed to retrieve the index for \"%s\"", veth2);
af651aa9
SN
2818 goto out_delete;
2819 }
2820
82d5ae15 2821 if (netdev->mtu) {
b7b2fde4 2822 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
b0ee5983 2823 WARN("failed to parse mtu from");
b7b2fde4 2824 else
b0ee5983 2825 INFO("retrieved mtu %d", mtu);
e54864d3 2826 } else if (netdev->link) {
e9280f65 2827 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2828 if (bridge_index) {
2829 mtu = netdev_get_mtu(bridge_index);
b0ee5983 2830 INFO("retrieved mtu %d from %s", mtu, netdev->link);
729e8bf6
CB
2831 } else {
2832 mtu = netdev_get_mtu(netdev->ifindex);
b0ee5983 2833 INFO("retrieved mtu %d from %s", mtu, veth2);
729e8bf6 2834 }
e54864d3
NC
2835 }
2836
2837 if (mtu) {
2838 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2839 if (!err)
e54864d3 2840 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2841 if (err) {
b0ee5983
CB
2842 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2843 "and \"%s\": %s",
e54864d3 2844 mtu, veth1, veth2, strerror(-err));
eb14c10a 2845 goto out_delete;
75d09f83
DL
2846 }
2847 }
2848
3cfc0f3a 2849 if (netdev->link) {
c43cbc04 2850 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2851 if (err) {
b0ee5983
CB
2852 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2853 veth1, netdev->link, strerror(-err));
3cfc0f3a
MN
2854 goto out_delete;
2855 }
b0ee5983 2856 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
eb14c10a
DL
2857 }
2858
d472214b 2859 err = lxc_netdev_up(veth1);
6e35af2e 2860 if (err) {
b0ee5983 2861 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
6e35af2e 2862 goto out_delete;
0ad19a3f 2863 }
2864
e3b4c4c4 2865 if (netdev->upscript) {
751d9dcd
DL
2866 err = run_script(handler->name, "net", netdev->upscript, "up",
2867 "veth", veth1, (char*) NULL);
2868 if (err)
e3b4c4c4 2869 goto out_delete;
e3b4c4c4
ST
2870 }
2871
b0ee5983
CB
2872 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2873 netdev->ifindex);
82d5ae15 2874
6ab9ab6d 2875 return 0;
eb14c10a
DL
2876
2877out_delete:
b316d209
CB
2878 if (netdev->ifindex != 0)
2879 lxc_netdev_delete_by_name(veth1);
f10fad2f 2880 if (!netdev->priv.veth_attr.pair)
ad40563e 2881 free(veth1);
f10fad2f 2882 free(veth2);
6ab9ab6d 2883 return -1;
13954cce 2884}
d957ae2d 2885
74a2b586
JK
2886static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2887{
2888 char *veth1;
2889 int err;
2890
2891 if (netdev->priv.veth_attr.pair)
2892 veth1 = netdev->priv.veth_attr.pair;
2893 else
2894 veth1 = netdev->priv.veth_attr.veth1;
2895
2896 if (netdev->downscript) {
2897 err = run_script(handler->name, "net", netdev->downscript,
2898 "down", "veth", veth1, (char*) NULL);
2899 if (err)
2900 return -1;
2901 }
2902 return 0;
2903}
2904
a589434e 2905static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2906{
0e391e57 2907 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2908 int err;
d957ae2d
MT
2909
2910 if (!netdev->link) {
2911 ERROR("no link specified for macvlan netdev");
2912 return -1;
2913 }
13954cce 2914
9ba8130c
SH
2915 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2916 if (err >= sizeof(peerbuf))
2917 return -1;
82d5ae15 2918
a0265685 2919 peer = lxc_mkifname(peerbuf);
ad40563e 2920 if (!peer) {
82d5ae15
DL
2921 ERROR("failed to make a temporary name");
2922 return -1;
0ad19a3f 2923 }
2924
3cfc0f3a
MN
2925 err = lxc_macvlan_create(netdev->link, peer,
2926 netdev->priv.macvlan_attr.mode);
2927 if (err) {
2928 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2929 peer, netdev->link, strerror(-err));
ad40563e 2930 goto out;
0ad19a3f 2931 }
2932
82d5ae15
DL
2933 netdev->ifindex = if_nametoindex(peer);
2934 if (!netdev->ifindex) {
36eb9bde 2935 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2936 goto out;
22ebac19 2937 }
2938
e3b4c4c4 2939 if (netdev->upscript) {
751d9dcd
DL
2940 err = run_script(handler->name, "net", netdev->upscript, "up",
2941 "macvlan", netdev->link, (char*) NULL);
2942 if (err)
ad40563e 2943 goto out;
e3b4c4c4
ST
2944 }
2945
a589434e 2946 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2947 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2948
d957ae2d 2949 return 0;
ad40563e
ÇO
2950out:
2951 lxc_netdev_delete_by_name(peer);
2952 free(peer);
2953 return -1;
0ad19a3f 2954}
2955
74a2b586
JK
2956static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2957{
2958 int err;
2959
2960 if (netdev->downscript) {
2961 err = run_script(handler->name, "net", netdev->downscript,
2962 "down", "macvlan", netdev->link,
2963 (char*) NULL);
2964 if (err)
2965 return -1;
2966 }
2967 return 0;
2968}
2969
a589434e
JN
2970/* XXX: merge with instantiate_macvlan */
2971static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2972{
2973 char peer[IFNAMSIZ];
3cfc0f3a 2974 int err;
82f58d03 2975 static uint16_t vlan_cntr = 0;
b7b2fde4 2976 unsigned int mtu = 0;
26c39028
JHS
2977
2978 if (!netdev->link) {
2979 ERROR("no link specified for vlan netdev");
2980 return -1;
2981 }
2982
82f58d03 2983 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2984 if (err >= sizeof(peer)) {
2985 ERROR("peer name too long");
2986 return -1;
2987 }
26c39028 2988
3cfc0f3a
MN
2989 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2990 if (err) {
2991 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2992 peer, netdev->link, strerror(-err));
26c39028
JHS
2993 return -1;
2994 }
2995
2996 netdev->ifindex = if_nametoindex(peer);
2997 if (!netdev->ifindex) {
2998 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2999 lxc_netdev_delete_by_name(peer);
26c39028
JHS
3000 return -1;
3001 }
3002
a589434e 3003 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 3004 netdev->ifindex);
b4fb7de1 3005 if (netdev->mtu) {
b7b2fde4
CB
3006 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
3007 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3008 netdev->ifindex, netdev->name);
3009 return -1;
3010 }
3011 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
3012 if (err) {
3013 ERROR("failed to set mtu '%s' for %s : %s",
3014 netdev->mtu, peer, strerror(-err));
3015 lxc_netdev_delete_by_name(peer);
3016 return -1;
3017 }
3018 }
e892973e 3019
26c39028
JHS
3020 return 0;
3021}
3022
74a2b586
JK
/* Shutdown handler for vlan devices: nothing to do, always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3027
a589434e 3028static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3029{
6168e99f
DL
3030 if (!netdev->link) {
3031 ERROR("no link specified for the physical interface");
3032 return -1;
3033 }
3034
9d083402 3035 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 3036 if (!netdev->ifindex) {
9d083402 3037 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 3038 return -1;
3039 }
3040
e3b4c4c4
ST
3041 if (netdev->upscript) {
3042 int err;
751d9dcd
DL
3043 err = run_script(handler->name, "net", netdev->upscript,
3044 "up", "phys", netdev->link, (char*) NULL);
3045 if (err)
e3b4c4c4 3046 return -1;
e3b4c4c4
ST
3047 }
3048
82d5ae15 3049 return 0;
0ad19a3f 3050}
3051
74a2b586
JK
3052static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3053{
3054 int err;
3055
3056 if (netdev->downscript) {
3057 err = run_script(handler->name, "net", netdev->downscript,
3058 "down", "phys", netdev->link, (char*) NULL);
3059 if (err)
3060 return -1;
3061 }
3062 return 0;
3063}
3064
a589434e 3065static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
3066{
3067 netdev->ifindex = 0;
3068 return 0;
3069}
3070
a589434e 3071static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3072{
82d5ae15 3073 netdev->ifindex = 0;
e3b4c4c4
ST
3074 if (netdev->upscript) {
3075 int err;
751d9dcd
DL
3076 err = run_script(handler->name, "net", netdev->upscript,
3077 "up", "empty", (char*) NULL);
3078 if (err)
e3b4c4c4 3079 return -1;
e3b4c4c4 3080 }
82d5ae15 3081 return 0;
0ad19a3f 3082}
3083
74a2b586
JK
3084static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3085{
3086 int err;
3087
3088 if (netdev->downscript) {
3089 err = run_script(handler->name, "net", netdev->downscript,
3090 "down", "empty", (char*) NULL);
3091 if (err)
3092 return -1;
3093 }
3094 return 0;
3095}
3096
26b797f3
SH
/* Teardown callback for the "none" network type: nothing was created, so
 * there is nothing to undo.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3101
3102int lxc_requests_empty_network(struct lxc_handler *handler)
3103{
3104 struct lxc_list *network = &handler->conf->network;
3105 struct lxc_list *iterator;
3106 struct lxc_netdev *netdev;
3107 bool found_none = false, found_nic = false;
3108
3109 if (lxc_list_empty(network))
3110 return 0;
3111
3112 lxc_list_for_each(iterator, network) {
3113
3114 netdev = iterator->elem;
3115
3116 if (netdev->type == LXC_NET_NONE)
3117 found_none = true;
3118 else
3119 found_nic = true;
3120 }
3121 if (found_none && !found_nic)
3122 return 1;
3123 return 0;
3124}
3125
e3b4c4c4 3126int lxc_create_network(struct lxc_handler *handler)
0ad19a3f 3127{
e3b4c4c4 3128 struct lxc_list *network = &handler->conf->network;
82d5ae15 3129 struct lxc_list *iterator;
82d5ae15 3130 struct lxc_netdev *netdev;
cbef6c52
SH
3131 int am_root = (getuid() == 0);
3132
3133 if (!am_root)
3134 return 0;
0ad19a3f 3135
5f4535a3 3136 lxc_list_for_each(iterator, network) {
0ad19a3f 3137
5f4535a3 3138 netdev = iterator->elem;
13954cce 3139
24654103 3140 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
82d5ae15 3141 ERROR("invalid network configuration type '%d'",
5f4535a3 3142 netdev->type);
82d5ae15
DL
3143 return -1;
3144 }
0ad19a3f 3145
e3b4c4c4 3146 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3147 ERROR("failed to create netdev");
3148 return -1;
3149 }
e3b4c4c4 3150
0ad19a3f 3151 }
3152
3153 return 0;
3154}
3155
358daf49 3156bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3157{
e97946ae 3158 int ret;
74a2b586 3159 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3160 struct lxc_list *iterator;
3161 struct lxc_netdev *netdev;
358daf49 3162 bool deleted_all = true;
7fef7a06
DL
3163
3164 lxc_list_for_each(iterator, network) {
3165 netdev = iterator->elem;
d472214b 3166
74a2b586 3167 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3168 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3169 WARN("Failed to rename interface with index %d "
3170 "to its initial name \"%s\".",
3171 netdev->ifindex, netdev->link);
d472214b 3172 continue;
d8f8e352 3173 }
d472214b 3174
74a2b586 3175 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3176 WARN("Failed to destroy netdev");
74a2b586
JK
3177 }
3178
d8f8e352
DL
3179 /* Recent kernel remove the virtual interfaces when the network
3180 * namespace is destroyed but in case we did not moved the
3181 * interface to the network namespace, we have to destroy it
3182 */
e97946ae
CB
3183 if (netdev->ifindex != 0) {
3184 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3185 if (-ret == ENODEV) {
3186 INFO("Interface \"%s\" with index %d already "
3187 "deleted or existing in different network "
3188 "namespace.",
3189 netdev->name ? netdev->name : "(null)",
3190 netdev->ifindex);
3191 } else if (ret < 0) {
3192 deleted_all = false;
3193 WARN("Failed to remove interface \"%s\" with "
3194 "index %d: %s.",
3195 netdev->name ? netdev->name : "(null)",
3196 netdev->ifindex, strerror(-ret));
3197 } else {
3198 INFO("Removed interface \"%s\" with index %d.",
3199 netdev->name ? netdev->name : "(null)",
3200 netdev->ifindex);
3201 }
e97946ae
CB
3202 }
3203
3204 /* Explicitly delete host veth device to prevent lingering
3205 * devices. We had issues in LXD around this.
3206 */
b316d209 3207 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3208 char *hostveth;
3209 if (netdev->priv.veth_attr.pair) {
e97946ae 3210 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3211 ret = lxc_netdev_delete_by_name(hostveth);
3212 if (ret < 0) {
3213 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3214 } else {
3215 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3216 }
3217 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3218 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3219 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3220 if (ret < 0) {
3221 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3222 } else {
3223 INFO("Removed interface \"%s\" from host.", hostveth);
3224 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3225 }
e97946ae
CB
3226 }
3227 }
7fef7a06 3228 }
358daf49
CB
3229
3230 return deleted_all;
7fef7a06
DL
3231}
3232
45e854dc
SG
3233#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3234
fe1f672f 3235/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3236#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3237static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3238 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3239{
3240 pid_t child;
a7242d9a
ÇO
3241 int bytes, pipefd[2];
3242 char *token, *saveptr = NULL;
fe1f672f 3243 char buffer[MAX_BUFFER_SIZE];
091045f8 3244 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3245
3246 if (netdev->type != LXC_NET_VETH) {
3247 ERROR("nic type %d not support for unprivileged use",
091045f8 3248 netdev->type);
cbef6c52
SH
3249 return -1;
3250 }
3251
091045f8 3252 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3253 SYSERROR("pipe failed");
3254 return -1;
3255 }
3256
091045f8
CB
3257 child = fork();
3258 if (child < 0) {
cbef6c52 3259 SYSERROR("fork");
a7242d9a
ÇO
3260 close(pipefd[0]);
3261 close(pipefd[1]);
3262 return -1;
3263 }
3264
3265 if (child == 0) { // child
091045f8
CB
3266 /* Call lxc-user-nic pid type bridge. */
3267 int ret;
3268 char pidstr[LXC_NUMSTRLEN64];
3269
3270 close(pipefd[0]); /* Close the read-end of the pipe. */
3271
3272 /* Redirect stdout to write-end of the pipe. */
3273 ret = dup2(pipefd[1], STDOUT_FILENO);
3274 close(pipefd[1]); /* Close the write-end of the pipe. */
3275 if (ret < 0) {
3276 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3277 exit(EXIT_FAILURE);
3278 }
a7242d9a 3279
091045f8 3280 if (netdev->link)
cff7b5eb 3281 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3282 else
cff7b5eb 3283 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3284
3285 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3286 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3287 exit(EXIT_FAILURE);
3288 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3289
3290 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3291 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3292 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3293 pidstr, "veth", netdev_link, netdev->name, NULL);
3294
3295 SYSERROR("Failed to exec lxc-user-nic.");
3296 exit(EXIT_FAILURE);
a7242d9a
ÇO
3297 }
3298
3299 /* close the write-end of the pipe */
3300 close(pipefd[1]);
3301
fe1f672f 3302 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3303 if (bytes < 0)
3304 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3305 buffer[bytes - 1] = '\0';
3306
3307 if (wait_for_pid(child) != 0) {
3308 close(pipefd[0]);
cbef6c52
SH
3309 return -1;
3310 }
3311
a7242d9a
ÇO
3312 /* close the read-end of the pipe */
3313 close(pipefd[0]);
cbef6c52 3314
a7242d9a
ÇO
3315 /* fill netdev->name field */
3316 token = strtok_r(buffer, ":", &saveptr);
3317 if (!token)
3318 return -1;
091045f8
CB
3319
3320 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3321 if (!netdev->name) {
091045f8 3322 SYSERROR("Failed to allocate memory.");
658979c5
SH
3323 return -1;
3324 }
091045f8 3325 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3326 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3327
3328 /* fill netdev->veth_attr.pair field */
3329 token = strtok_r(NULL, ":", &saveptr);
3330 if (!token)
3331 return -1;
091045f8 3332
a7242d9a 3333 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3334 if (!netdev->priv.veth_attr.pair) {
091045f8 3335 ERROR("Failed to allocate memory.");
658979c5
SH
3336 return -1;
3337 }
45e854dc 3338
a7242d9a 3339 return 0;
cbef6c52
SH
3340}
3341
c43cbc04
SH
3342int lxc_assign_network(const char *lxcpath, char *lxcname,
3343 struct lxc_list *network, pid_t pid)
0ad19a3f 3344{
82d5ae15 3345 struct lxc_list *iterator;
82d5ae15 3346 struct lxc_netdev *netdev;
f2e206ff 3347 char ifname[IFNAMSIZ];
cbef6c52 3348 int am_root = (getuid() == 0);
3cfc0f3a 3349 int err;
0ad19a3f 3350
5f4535a3 3351 lxc_list_for_each(iterator, network) {
82d5ae15 3352
5f4535a3 3353 netdev = iterator->elem;
82d5ae15 3354
fbb16259 3355 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3356 if (netdev->mtu)
3357 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3358 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3359 return -1;
658979c5
SH
3360 // lxc-user-nic has moved the nic to the new ns.
3361 // unpriv_assign_nic() fills in netdev->name.
3362 // netdev->ifindex will be filed in at setup_netdev.
cbef6c52
SH
3363 continue;
3364 }
236087a6 3365
fbb16259
SH
3366 /* empty network namespace, nothing to move */
3367 if (!netdev->ifindex)
3368 continue;
3369
f2e206ff 3370 /* retrieve the name of the interface */
3371 if (!if_indextoname(netdev->ifindex, ifname)) {
3372 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3373 return -1;
3374 }
3375
3376 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3377 if (err) {
3378 ERROR("failed to move '%s' to the container : %s",
3379 netdev->link, strerror(-err));
82d5ae15
DL
3380 return -1;
3381 }
3382
198cbbaa 3383 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3384 }
3385
3386 return 0;
3387}
3388
251d0d2a
DE
3389static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3390 size_t buf_size)
f6d3e3e4 3391{
29053180
CB
3392 char path[MAXPATHLEN];
3393 int fd, ret;
f6d3e3e4 3394
29053180
CB
3395 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3396 idtype == ID_TYPE_UID ? 'u' : 'g');
3397 if (ret < 0 || ret >= MAXPATHLEN) {
3398 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3399 return -E2BIG;
3400 }
29053180
CB
3401
3402 fd = open(path, O_WRONLY);
3403 if (fd < 0) {
3404 SYSERROR("failed to open \"%s\"", path);
3405 return -1;
f6d3e3e4 3406 }
29053180
CB
3407
3408 errno = 0;
3409 ret = lxc_write_nointr(fd, buf, buf_size);
3410 if (ret != buf_size) {
3411 SYSERROR("failed to write %cid mapping to \"%s\"",
3412 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3413 close(fd);
3414 return -1;
3415 }
3416 close(fd);
3417
3418 return 0;
f6d3e3e4
SH
3419}
3420
df6a2945
CB
3421/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both. */
3422static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3423{
3424 char *path;
3425 int ret;
3426 struct stat st;
3427 int fret = 0;
3428
3429 path = on_path(binary, NULL);
3430 if (!path)
3431 return -ENOENT;
3432
3433 ret = stat(path, &st);
3434 if (ret < 0) {
3435 fret = -errno;
3436 goto cleanup;
3437 }
3438
3439 /* Check if the binary is setuid. */
3440 if (st.st_mode & S_ISUID) {
3441 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3442 fret = 1;
3443 goto cleanup;
3444 }
3445
69924fff 3446 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3447 /* Check if it has the CAP_SETUID capability. */
3448 if ((cap & CAP_SETUID) &&
3449 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3450 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3451 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3452 "and CAP_PERMITTED sets.", path);
3453 fret = 1;
3454 goto cleanup;
3455 }
3456
3457 /* Check if it has the CAP_SETGID capability. */
3458 if ((cap & CAP_SETGID) &&
3459 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3460 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3461 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3462 "and CAP_PERMITTED sets.", path);
3463 fret = 1;
3464 goto cleanup;
3465 }
d6018f88 3466 #else
69924fff
CB
3467 /* If we cannot check for file capabilities we need to give the benefit
3468 * of the doubt. Otherwise we might fail even though all the necessary
3469 * file capabilities are set.
3470 */
d6018f88
CB
3471 DEBUG("Cannot check for file capabilites as full capability support is "
3472 "missing. Manual intervention needed.");
3473 fret = 1;
df6a2945
CB
3474 #endif
3475
3476cleanup:
3477 free(path);
3478 return fret;
3479}
3480
986ef930
CB
/* Exec callback: runs the command line passed in @args (a C string) through
 * "/bin/sh -c". Only returns, with -1, when the exec itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
3486
f6d3e3e4
SH
3487int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3488{
f6d3e3e4 3489 struct id_map *map;
4bc3b759 3490 struct lxc_list *iterator;
251d0d2a 3491 enum idtype type;
986ef930 3492 char u_or_g;
4bc3b759 3493 char *pos;
99d43365 3494 int fill, left;
986ef930
CB
3495 char cmd_output[MAXPATHLEN];
3496 /* strlen("new@idmap") = 9
3497 * +
3498 * strlen(" ") = 1
3499 * +
3500 * LXC_NUMSTRLEN64
3501 * +
3502 * strlen(" ") = 1
3503 *
3504 * We add some additional space to make sure that we really have
3505 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3506 */
3507 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3508 int ret = 0, uidmap = 0, gidmap = 0;
3509 bool use_shadow = false, had_entry = false;
df6a2945
CB
3510
3511 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3512 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3513 * will protected it by preventing another user from being handed the
3514 * range by shadow.
3515 */
df6a2945
CB
3516 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3517 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3518 if (uidmap > 0 && gidmap > 0) {
3519 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3520 use_shadow = true;
df6a2945 3521 } else {
99d43365
CB
3522 /* In case unprivileged users run application containers via
3523 * execute() or a start*() there are valid cases where they may
3524 * only want to map their own {g,u}id. Let's not block them from
3525 * doing so by requiring geteuid() == 0.
3526 */
3527 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3528 "write directly with euid %d.", geteuid());
0e6e3a41 3529 }
251d0d2a 3530
986ef930
CB
3531 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3532 type++, u_or_g = 'g') {
3533 pos = mapbuf;
3534
0e6e3a41 3535 if (use_shadow)
986ef930 3536 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3537
cf3ef16d 3538 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3539 /* The kernel only takes <= 4k for writes to
3540 * /proc/<nr>/[ug]id_map
3541 */
251d0d2a 3542 map = iterator->elem;
cf3ef16d
SH
3543 if (map->idtype != type)
3544 continue;
3545
4bc3b759
CB
3546 had_entry = true;
3547
986ef930 3548 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3549 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3550 use_shadow ? " " : "", map->nsid,
3551 map->hostid, map->range,
0e6e3a41 3552 use_shadow ? "" : "\n");
cf3ef16d 3553 if (fill <= 0 || fill >= left)
4bc3b759
CB
3554 SYSERROR("Too many {g,u}id mappings defined.");
3555
cf3ef16d 3556 pos += fill;
251d0d2a 3557 }
cf3ef16d 3558 if (!had_entry)
4f7521b4 3559 continue;
cf3ef16d 3560
986ef930
CB
3561 /* Try to catch the ouput of new{g,u}idmap to make debugging
3562 * easier.
3563 */
3564 if (use_shadow) {
3565 ret = run_command(cmd_output, sizeof(cmd_output),
3566 lxc_map_ids_exec_wrapper,
3567 (void *)mapbuf);
3568 if (ret < 0) {
3569 ERROR("new%cidmap failed to write mapping: %s",
3570 u_or_g, cmd_output);
3571 return -1;
3572 }
d1838f34 3573 } else {
986ef930
CB
3574 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3575 if (ret < 0)
3576 return -1;
d1838f34 3577 }
986ef930
CB
3578
3579 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3580 }
251d0d2a 3581
986ef930 3582 return 0;
f6d3e3e4
SH
3583}
3584
cf3ef16d 3585/*
7b50c609
TS
3586 * return the host uid/gid to which the container root is mapped in
3587 * *val.
0b3a6504 3588 * Return true if id was found, false otherwise.
cf3ef16d 3589 */
2a9a80cb 3590bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3591 unsigned long *val)
cf3ef16d
SH
3592{
3593 struct lxc_list *it;
3594 struct id_map *map;
3595
3596 lxc_list_for_each(it, &conf->id_map) {
3597 map = it->elem;
7b50c609 3598 if (map->idtype != idtype)
cf3ef16d
SH
3599 continue;
3600 if (map->nsid != 0)
3601 continue;
2a9a80cb
SH
3602 *val = map->hostid;
3603 return true;
cf3ef16d 3604 }
2a9a80cb 3605 return false;
cf3ef16d
SH
3606}
3607
2133f58c 3608int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3609{
3610 struct lxc_list *it;
3611 struct id_map *map;
3612 lxc_list_for_each(it, &conf->id_map) {
3613 map = it->elem;
2133f58c 3614 if (map->idtype != idtype)
cf3ef16d
SH
3615 continue;
3616 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3617 return (id - map->hostid) + map->nsid;
cf3ef16d 3618 }
57d116ab 3619 return -1;
cf3ef16d
SH
3620}
3621
339efad9 3622int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3623{
3624 struct lxc_list *it;
3625 struct id_map *map;
2133f58c 3626 unsigned int freeid = 0;
cf3ef16d
SH
3627again:
3628 lxc_list_for_each(it, &conf->id_map) {
3629 map = it->elem;
2133f58c 3630 if (map->idtype != idtype)
cf3ef16d
SH
3631 continue;
3632 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3633 freeid = map->nsid + map->range;
3634 goto again;
3635 }
3636 }
3637 return freeid;
3638}
3639
19a26f82
MK
3640int lxc_find_gateway_addresses(struct lxc_handler *handler)
3641{
3642 struct lxc_list *network = &handler->conf->network;
3643 struct lxc_list *iterator;
3644 struct lxc_netdev *netdev;
3645 int link_index;
3646
3647 lxc_list_for_each(iterator, network) {
3648 netdev = iterator->elem;
3649
3650 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3651 continue;
3652
3653 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3654 ERROR("gateway = auto only supported for "
3655 "veth and macvlan");
3656 return -1;
3657 }
3658
3659 if (!netdev->link) {
3660 ERROR("gateway = auto needs a link interface");
3661 return -1;
3662 }
3663
3664 link_index = if_nametoindex(netdev->link);
3665 if (!link_index)
3666 return -EINVAL;
3667
3668 if (netdev->ipv4_gateway_auto) {
3669 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3670 ERROR("failed to automatically find ipv4 gateway "
3671 "address from link interface '%s'", netdev->link);
3672 return -1;
3673 }
3674 }
3675
3676 if (netdev->ipv6_gateway_auto) {
3677 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3678 ERROR("failed to automatically find ipv6 gateway "
3679 "address from link interface '%s'", netdev->link);
3680 return -1;
3681 }
3682 }
3683 }
3684
3685 return 0;
3686}
3687
5e4a62bf 3688int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3689{
5e4a62bf 3690 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3691 int i, ret;
b0a33c1e 3692
5e4a62bf
DL
3693 /* no tty in the configuration */
3694 if (!conf->tty)
b0a33c1e 3695 return 0;
3696
9e1045e3 3697 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3698 if (!tty_info->pty_info) {
9e1045e3
CB
3699 SYSERROR("failed to allocate struct *pty_info");
3700 return -ENOMEM;
b0a33c1e 3701 }
3702
985d15b1 3703 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3704 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3705
025ed0f3
SH
3706 process_lock();
3707 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3708 pty_info->name, NULL, NULL);
025ed0f3
SH
3709 process_unlock();
3710 if (ret) {
9e1045e3 3711 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3712 tty_info->nbtty = i;
3713 lxc_delete_tty(tty_info);
9e1045e3 3714 return -ENOTTY;
b0a33c1e 3715 }
3716
9e1045e3 3717 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3718 pty_info->name, pty_info->master, pty_info->slave);
3719
3ec1648d 3720 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3721 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3722 if (ret < 0)
3723 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3724 "pty device \"%s\": %s",
3725 pty_info->master, pty_info->name, strerror(errno));
3726
3727 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3728 if (ret < 0)
3729 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3730 "pty device \"%s\": %s",
3731 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3732
b0a33c1e 3733 pty_info->busy = 0;
3734 }
3735
985d15b1 3736 tty_info->nbtty = conf->tty;
1ac470c0 3737
9e1045e3 3738 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3739 return 0;
b0a33c1e 3740}
3741
3742void lxc_delete_tty(struct lxc_tty_info *tty_info)
3743{
3744 int i;
3745
3746 for (i = 0; i < tty_info->nbtty; i++) {
3747 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3748
3749 close(pty_info->master);
3750 close(pty_info->slave);
3751 }
3752
3753 free(tty_info->pty_info);
e00c0242 3754 tty_info->pty_info = NULL;
b0a33c1e 3755 tty_info->nbtty = 0;
3756}
3757
f4f52cb5
CB
3758
/* Exec callback: runs lxc-usernsexec with the NULL-terminated argument
 * vector passed in @args. Only returns, with -1, when the exec itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
3764
f6d3e3e4 3765/*
7b50c609
TS
3766 * chown_mapped_root: for an unprivileged user with uid/gid X to
3767 * chown a dir to subuid/subgid Y, he needs to run chown as root
3768 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3769 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3770 * root is privileged with respect to hostuid/hostgid X, allowing
3771 * him to do the chown.
f6d3e3e4 3772 */
c4d10a05 3773int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3774{
f4f52cb5 3775 uid_t rootuid, rootgid;
2a9a80cb 3776 unsigned long val;
a7ef8753 3777 char *chownpath = path;
f4f52cb5
CB
3778 int hostuid, hostgid, ret;
3779 struct stat sb;
3780 char map1[100], map2[100], map3[100], map4[100], map5[100];
3781 char ugid[100];
3782 char *args1[] = {"lxc-usernsexec",
3783 "-m", map1,
3784 "-m", map2,
3785 "-m", map3,
3786 "-m", map5,
3787 "--", "chown", ugid, path,
3788 NULL};
3789 char *args2[] = {"lxc-usernsexec",
3790 "-m", map1,
3791 "-m", map2,
3792 "-m", map3,
3793 "-m", map4,
3794 "-m", map5,
3795 "--", "chown", ugid, path,
3796 NULL};
3797 char cmd_output[MAXPATHLEN];
3798
3799 hostuid = geteuid();
3800 hostgid = getegid();
f6d3e3e4 3801
2a9a80cb 3802 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3803 ERROR("No uid mapping for container root");
c4d10a05 3804 return -1;
f6d3e3e4 3805 }
f4f52cb5 3806 rootuid = (uid_t)val;
7b50c609 3807 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3808 ERROR("No gid mapping for container root");
7b50c609
TS
3809 return -1;
3810 }
f4f52cb5 3811 rootgid = (gid_t)val;
2a9a80cb 3812
a7ef8753 3813 /*
f4f52cb5 3814 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3815 */
1f92162d 3816 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3817 chownpath = strchr(path, ':');
3818 if (!chownpath) {
3819 ERROR("Bad overlay path: %s", path);
3820 return -1;
3821 }
f4f52cb5 3822 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3823 if (!chownpath) {
3824 ERROR("Bad overlay path: %s", path);
3825 return -1;
3826 }
3827 chownpath++;
3828 }
3829 path = chownpath;
f4f52cb5 3830 if (hostuid == 0) {
7b50c609 3831 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3832 ERROR("Error chowning %s", path);
3833 return -1;
3834 }
3835 return 0;
3836 }
f3d7e4ca 3837
f4f52cb5 3838 if (rootuid == hostuid) {
f3d7e4ca
SH
3839 // nothing to do
3840 INFO("%s: container root is our uid; no need to chown" ,__func__);
3841 return 0;
3842 }
3843
f4f52cb5
CB
3844 // save the current gid of "path"
3845 if (stat(path, &sb) < 0) {
3846 ERROR("Error stat %s", path);
f6d3e3e4
SH
3847 return -1;
3848 }
7b50c609 3849
f4f52cb5
CB
3850 /*
3851 * A file has to be group-owned by a gid mapped into the
3852 * container, or the container won't be privileged over it.
3853 */
3854 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3855 if (sb.st_uid == hostuid &&
3856 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3857 chown(path, -1, hostgid) < 0) {
3858 ERROR("Failed chgrping %s", path);
3859 return -1;
3860 }
f6d3e3e4 3861
f4f52cb5
CB
3862 // "u:0:rootuid:1"
3863 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3864 if (ret < 0 || ret >= 100) {
3865 ERROR("Error uid printing map string");
3866 return -1;
3867 }
7b50c609 3868
f4f52cb5
CB
3869 // "u:hostuid:hostuid:1"
3870 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3871 if (ret < 0 || ret >= 100) {
3872 ERROR("Error uid printing map string");
3873 return -1;
3874 }
c4d10a05 3875
f4f52cb5
CB
3876 // "g:0:rootgid:1"
3877 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3878 if (ret < 0 || ret >= 100) {
3879 ERROR("Error gid printing map string");
3880 return -1;
3881 }
98e5ba51 3882
f4f52cb5
CB
3883 // "g:pathgid:rootgid+pathgid:1"
3884 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3885 rootgid + (gid_t)sb.st_gid);
3886 if (ret < 0 || ret >= 100) {
3887 ERROR("Error gid printing map string");
3888 return -1;
3889 }
c4d10a05 3890
f4f52cb5
CB
3891 // "g:hostgid:hostgid:1"
3892 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3893 if (ret < 0 || ret >= 100) {
3894 ERROR("Error gid printing map string");
3895 return -1;
3896 }
7b50c609 3897
f4f52cb5
CB
3898 // "0:pathgid" (chown)
3899 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3900 if (ret < 0 || ret >= 100) {
3901 ERROR("Error owner printing format string for chown");
3902 return -1;
3903 }
7b50c609 3904
f4f52cb5
CB
3905 if (hostgid == sb.st_gid)
3906 ret = run_command(cmd_output, sizeof(cmd_output),
3907 chown_mapped_root_exec_wrapper,
3908 (void *)args1);
3909 else
3910 ret = run_command(cmd_output, sizeof(cmd_output),
3911 chown_mapped_root_exec_wrapper,
3912 (void *)args2);
3913 if (ret < 0)
3914 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3915
f4f52cb5 3916 return ret;
f6d3e3e4
SH
3917}
3918
c4d10a05 3919int ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3920{
c4d10a05 3921 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3922 return 0;
c4d10a05 3923
29b10e4f 3924 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
c4d10a05
SH
3925 ERROR("Failed to chown %s", c->console.name);
3926 return -1;
3927 }
3928
f6d3e3e4
SH
3929 return 0;
3930}
3931
943144d9
CB
3932/* NOTE: Must not be called from inside the container namespace! */
3933int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3934{
3935 int mounted;
3936
943144d9 3937 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3938 if (mounted == -1) {
943144d9 3939 SYSERROR("failed to mount /proc in the container");
01958b1f 3940 /* continue only if there is no rootfs */
943144d9 3941 if (conf->rootfs.path)
01958b1f 3942 return -1;
5112cd70 3943 } else if (mounted == 1) {
943144d9 3944 conf->tmp_umount_proc = 1;
5112cd70 3945 }
943144d9 3946
5112cd70
SH
3947 return 0;
3948}
3949
3950void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3951{
3952 if (lxc_conf->tmp_umount_proc == 1) {
3953 umount("/proc");
3954 lxc_conf->tmp_umount_proc = 0;
3955 }
3956}
3957
/* Walk /proc/self/mountinfo and call mount() with MS_SLAVE on every entry
 * whose options field contains "shared".
 *
 * All failures are logged and otherwise ignored so that container startup
 * can continue.
 */
void remount_all_slave(void)
{
	size_t cap = 0;
	char *buf = NULL;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &cap, mountinfo) != -1) {
		char *mntpoint, *options;

		/* Field selection is delegated to get_field(); the indices
		 * follow the original code.
		 */
		mntpoint = get_field(buf, 4);
		if (mntpoint == NULL)
			continue;

		options = get_field(mntpoint, 2);
		if (options == NULL)
			continue;

		null_endofword(options);
		if (strstr(options, "shared") == NULL)
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3991
2322903b
SH
3992void lxc_execute_bind_init(struct lxc_conf *conf)
3993{
3994 int ret;
9d9c111c
SH
3995 char path[PATH_MAX], destpath[PATH_MAX], *p;
3996
3997 /* If init exists in the container, don't bind mount a static one */
3998 p = choose_init(conf->rootfs.mount);
3999 if (p) {
4000 free(p);
4001 return;
4002 }
2322903b
SH
4003
4004 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4005 if (ret < 0 || ret >= PATH_MAX) {
4006 WARN("Path name too long searching for lxc.init.static");
4007 return;
4008 }
4009
4010 if (!file_exists(path)) {
4011 INFO("%s does not exist on host", path);
4012 return;
4013 }
4014
4015 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4016 if (ret < 0 || ret >= PATH_MAX) {
4017 WARN("Path name too long for container's lxc.init.static");
4018 return;
4019 }
4020
4021 if (!file_exists(destpath)) {
4022 FILE * pathfile = fopen(destpath, "wb");
4023 if (!pathfile) {
4024 SYSERROR("Failed to create mount target '%s'", destpath);
4025 return;
4026 }
4027 fclose(pathfile);
4028 }
4029
592fd47a 4030 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
4031 if (ret < 0)
4032 SYSERROR("Failed to bind lxc.init.static into container");
4033 INFO("lxc.init.static bound into container at %s", path);
4034}
4035
35120d9c
SH
4036/*
4037 * This does the work of remounting / if it is shared, calling the
4038 * container pre-mount hooks, and mounting the rootfs.
4039 */
4040int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 4041{
35120d9c
SH
4042 if (conf->rootfs_setup) {
4043 /*
4044 * rootfs was set up in another namespace. bind-mount it
4045 * to give us a mount in our own ns so we can pivot_root to it
4046 */
4047 const char *path = conf->rootfs.mount;
4048 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4049 ERROR("Failed to bind-mount container / onto itself");
145832ba 4050 return -1;
35120d9c 4051 }
145832ba 4052 return 0;
35120d9c 4053 }
d4ef7c50 4054
e995d7a2
SH
4055 remount_all_slave();
4056
35120d9c
SH
4057 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4058 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4059 return -1;
4060 }
4061
4062 if (setup_rootfs(conf)) {
4063 ERROR("failed to setup rootfs for '%s'", name);
4064 return -1;
4065 }
4066
4067 conf->rootfs_setup = true;
4068 return 0;
4069}
4070
1c1c7051
SH
4071static bool verify_start_hooks(struct lxc_conf *conf)
4072{
4073 struct lxc_list *it;
4074 char path[MAXPATHLEN];
4075 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4076 char *hookname = it->elem;
4077 struct stat st;
4078 int ret;
4079
4080 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4081 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4082 if (ret < 0 || ret >= MAXPATHLEN)
4083 return false;
4084 ret = stat(path, &st);
4085 if (ret) {
7b6753e7 4086 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4087 hookname);
4088 return false;
4089 }
6a0c909a 4090 return true;
1c1c7051
SH
4091 }
4092
4093 return true;
4094}
4095
ae467c54 4096static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 4097{
ae467c54
CB
4098 int i;
4099 int *ttyfds;
4100 struct lxc_pty_info *pty_info;
e8bd4e43
SH
4101 struct lxc_conf *conf = handler->conf;
4102 const struct lxc_tty_info *tty_info = &conf->tty_info;
e8bd4e43 4103 int sock = handler->ttysock[0];
ae467c54
CB
4104 int ret = -1;
4105 size_t num_ttyfds = (2 * conf->tty);
e8bd4e43 4106
ae467c54
CB
4107 ttyfds = malloc(num_ttyfds * sizeof(int));
4108 if (!ttyfds)
4109 return -1;
4110
4111 for (i = 0; i < num_ttyfds; i++) {
4112 pty_info = &tty_info->pty_info[i / 2];
4113 ttyfds[i++] = pty_info->slave;
4114 ttyfds[i] = pty_info->master;
4115 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
f07fa8df
CB
4116 "parent",
4117 pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
4118 }
4119
ae467c54
CB
4120 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4121 if (ret < 0)
4122 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4123 strerror(errno));
4124 else
4125 TRACE("sent %d ttys to parent", conf->tty);
4126
e8bd4e43
SH
4127 close(handler->ttysock[0]);
4128 close(handler->ttysock[1]);
4129
ae467c54
CB
4130 for (i = 0; i < num_ttyfds; i++)
4131 close(ttyfds[i]);
e8bd4e43 4132
ae467c54
CB
4133 free(ttyfds);
4134
4135 return ret;
e8bd4e43
SH
4136}
4137
35120d9c
SH
4138int lxc_setup(struct lxc_handler *handler)
4139{
4140 const char *name = handler->name;
4141 struct lxc_conf *lxc_conf = handler->conf;
4142 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4143
4144 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4145 ERROR("Error setting up rootfs mount after spawn");
4146 return -1;
4147 }
4148
6c544cb3
MM
4149 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4150 if (setup_utsname(lxc_conf->utsname)) {
4151 ERROR("failed to setup the utsname for '%s'", name);
4152 return -1;
4153 }
0ad19a3f 4154 }
4155
5f4535a3 4156 if (setup_network(&lxc_conf->network)) {
36eb9bde 4157 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4158 return -1;
0ad19a3f 4159 }
4160
bc6928ff 4161 if (lxc_conf->autodev > 0) {
14221cbb 4162 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4163 ERROR("failed to mount /dev in the container");
c6883f38
SH
4164 return -1;
4165 }
4166 }
4167
368bbc02
CS
4168 /* do automatic mounts (mainly /proc and /sys), but exclude
4169 * those that need to wait until other stuff has finished
4170 */
4fb3cba5 4171 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4172 ERROR("failed to setup the automatic mounts for '%s'", name);
4173 return -1;
4174 }
4175
0a2dddd4 4176 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4177 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4178 return -1;
576f946d 4179 }
4180
0a2dddd4 4181 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4182 ERROR("failed to setup the mount entries for '%s'", name);
4183 return -1;
4184 }
4185
7b6753e7 4186 /* Make sure any start hooks are in the container */
1c1c7051
SH
4187 if (!verify_start_hooks(lxc_conf))
4188 return -1;
4189
2322903b
SH
4190 if (lxc_conf->is_execute)
4191 lxc_execute_bind_init(lxc_conf);
4192
368bbc02
CS
4193 /* now mount only cgroup, if wanted;
4194 * before, /sys could not have been mounted
4195 * (is either mounted automatically or via fstab entries)
4196 */
4fb3cba5 4197 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4198 ERROR("failed to setup the automatic mounts for '%s'", name);
4199 return -1;
4200 }
4201
283678ed 4202 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4203 ERROR("failed to run mount hooks for container '%s'.", name);
4204 return -1;
4205 }
4206
bc6928ff 4207 if (lxc_conf->autodev > 0) {
283678ed 4208 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4209 ERROR("failed to run autodev hooks for container '%s'.", name);
4210 return -1;
4211 }
27245ff7 4212 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4213 ERROR("failed to populate /dev in the container");
4214 return -1;
4215 }
4216 }
368bbc02 4217
3d7d929a 4218 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4219 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4220 return -1;
6e590161 4221 }
4222
7e0e1d94
AV
4223 if (lxc_conf->kmsg) {
4224 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4225 ERROR("failed to setup kmsg for '%s'", name);
4226 }
1bd051a6 4227
69aa6655
DE
4228 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4229 ERROR("failed to setup /dev symlinks for '%s'", name);
4230 return -1;
4231 }
4232
5112cd70 4233 /* mount /proc if it's not already there */
943144d9 4234 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4235 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4236 return -1;
e075f5d9 4237 }
e075f5d9 4238
ac778708 4239 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4240 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4241 return -1;
ed502555 4242 }
4243
70761e5e 4244 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4245 ERROR("failed to setup the new pts instance");
95b5ffaf 4246 return -1;
3c26f34e 4247 }
4248
e8bd4e43
SH
4249 if (lxc_create_tty(name, lxc_conf)) {
4250 ERROR("failed to create the ttys");
4251 return -1;
4252 }
4253
ae467c54 4254 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
4255 ERROR("failure sending console info to parent");
4256 return -1;
4257 }
4258
9e1045e3 4259 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4260 ERROR("failed to setup the ttys for '%s'", name);
4261 return -1;
4262 }
4263
4264 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4265 SYSERROR("failed to set environment variable for container ptys");
4266
4267
cccc74b5
DL
4268 if (setup_personality(lxc_conf->personality)) {
4269 ERROR("failed to setup personality");
4270 return -1;
4271 }
4272
97a8f74f
SG
4273 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4274 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4275 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4276 return -1;
4277 }
97a8f74f
SG
4278 if (dropcaps_except(&lxc_conf->keepcaps)) {
4279 ERROR("failed to keep requested caps");
4280 return -1;
4281 }
4282 } else if (setup_caps(&lxc_conf->caps)) {
4283 ERROR("failed to drop capabilities");
4284 return -1;
81810dd1
DL
4285 }
4286
cd54d859
DL
4287 NOTICE("'%s' is setup.", name);
4288
0ad19a3f 4289 return 0;
4290}
26ddeedd 4291
283678ed
SH
4292int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4293 const char *lxcpath, char *argv[])
26ddeedd
SH
4294{
4295 int which = -1;
4296 struct lxc_list *it;
4297
4298 if (strcmp(hook, "pre-start") == 0)
4299 which = LXCHOOK_PRESTART;
5ea6163a
SH
4300 else if (strcmp(hook, "pre-mount") == 0)
4301 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4302 else if (strcmp(hook, "mount") == 0)
4303 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4304 else if (strcmp(hook, "autodev") == 0)
4305 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4306 else if (strcmp(hook, "start") == 0)
4307 which = LXCHOOK_START;
52492063
WB
4308 else if (strcmp(hook, "stop") == 0)
4309 which = LXCHOOK_STOP;
26ddeedd
SH
4310 else if (strcmp(hook, "post-stop") == 0)
4311 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4312 else if (strcmp(hook, "clone") == 0)
4313 which = LXCHOOK_CLONE;
37cf711b
SY
4314 else if (strcmp(hook, "destroy") == 0)
4315 which = LXCHOOK_DESTROY;
26ddeedd
SH
4316 else
4317 return -1;
4318 lxc_list_for_each(it, &conf->hooks[which]) {
4319 int ret;
4320 char *hookname = it->elem;
283678ed 4321 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4322 if (ret)
4323 return ret;
4324 }
4325 return 0;
4326}
72d0e1cb 4327
427b3a21 4328static void lxc_remove_nic(struct lxc_list *it)
72d0e1cb
SG
4329{
4330 struct lxc_netdev *netdev = it->elem;
9ebb03ad 4331 struct lxc_list *it2,*next;
72d0e1cb
SG
4332
4333 lxc_list_del(it);
4334
f10fad2f
ME
4335 free(netdev->link);
4336 free(netdev->name);
4337 if (netdev->type == LXC_NET_VETH)
c9bb9a85 4338 free(netdev->priv.veth_attr.pair);
f10fad2f
ME
4339 free(netdev->upscript);
4340 free(netdev->hwaddr);
4341 free(netdev->mtu);
4342 free(netdev->ipv4_gateway);
4343 free(netdev->ipv6_gateway);
9ebb03ad 4344 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4345 lxc_list_del(it2);
4346 free(it2->elem);
4347 free(it2);
4348 }
9ebb03ad 4349 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4350 lxc_list_del(it2);
4351 free(it2->elem);
4352 free(it2);
4353 }
d95db067 4354 free(netdev);
72d0e1cb
SG
4355 free(it);
4356}
4357
4358/* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
12a50cc6 4359int lxc_clear_nic(struct lxc_conf *c, const char *key)
72d0e1cb
SG
4360{
4361 char *p1;
4362 int ret, idx, i;
4363 struct lxc_list *it;
4364 struct lxc_netdev *netdev;
4365
46cd2845 4366 p1 = strchr(key, '.');
72d0e1cb
SG
4367 if (!p1 || *(p1+1) == '\0')
4368 p1 = NULL;
4369
4370 ret = sscanf(key, "%d", &idx);
4371 if (ret != 1) return -1;
4372 if (idx < 0)
4373 return -1;
4374
4375 i = 0;
4376 lxc_list_for_each(it, &c->network) {
4377 if (i == idx)
4378 break;
4379 i++;
4380 }
4381 if (i < idx) // we don't have that many nics defined
4382 return -1;
4383
4384 if (!it || !it->elem)
4385 return -1;
4386
4387 netdev = it->elem;
4388
4389 if (!p1) {
4390 lxc_remove_nic(it);
52d21d40 4391 } else if (strcmp(p1, ".ipv4") == 0) {
9ebb03ad
DE
4392 struct lxc_list *it2,*next;
4393 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4394 lxc_list_del(it2);
4395 free(it2->elem);
4396 free(it2);
4397 }
52d21d40 4398 } else if (strcmp(p1, ".ipv6") == 0) {
9ebb03ad
DE
4399 struct lxc_list *it2,*next;
4400 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4401 lxc_list_del(it2);
4402 free(it2->elem);
4403 free(it2);
4404 }
72d0e1cb
SG
4405 }
4406 else return -1;
4407
4408 return 0;
4409}
4410
4411int lxc_clear_config_network(struct lxc_conf *c)
4412{
9ebb03ad
DE
4413 struct lxc_list *it,*next;
4414 lxc_list_for_each_safe(it, &c->network, next) {
72d0e1cb
SG
4415 lxc_remove_nic(it);
4416 }
4417 return 0;
4418}
4419
4420int lxc_clear_config_caps(struct lxc_conf *c)
4421{
9ebb03ad 4422 struct lxc_list *it,*next;
72d0e1cb 4423
9ebb03ad 4424 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4425 lxc_list_del(it);
4426 free(it->elem);
4427 free(it);
4428 }
4429 return 0;
4430}
4431
74a3920a 4432static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4433 struct lxc_list *it, *next;
4434
4355ab5f 4435 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4436 lxc_list_del(it);
4437 free(it->elem);
4438 free(it);
4439 }
4440 return 0;
4441}
4442
4355ab5f
SH
4443int lxc_clear_idmaps(struct lxc_conf *c)
4444{
4445 return lxc_free_idmap(&c->id_map);
4446}
4447
1fb86a7c
SH
4448int lxc_clear_config_keepcaps(struct lxc_conf *c)
4449{
4450 struct lxc_list *it,*next;
4451
4452 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4453 lxc_list_del(it);
4454 free(it->elem);
4455 free(it);
4456 }
4457 return 0;
4458}
4459
12a50cc6 4460int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4461{
9ebb03ad 4462 struct lxc_list *it,*next;
72d0e1cb 4463 bool all = false;
a6390f01 4464 const char *k = NULL;
72d0e1cb
SG
4465
4466 if (strcmp(key, "lxc.cgroup") == 0)
4467 all = true;
a6390f01
WB
4468 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4469 k = key + sizeof("lxc.cgroup.")-1;
4470 else
4471 return -1;
72d0e1cb 4472
9ebb03ad 4473 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4474 struct lxc_cgroup *cg = it->elem;
4475 if (!all && strcmp(cg->subsystem, k) != 0)
4476 continue;
4477 lxc_list_del(it);
4478 free(cg->subsystem);
4479 free(cg->value);
4480 free(cg);
4481 free(it);
4482 }
4483 return 0;
4484}
4485
c6d09e15
WB
4486int lxc_clear_limits(struct lxc_conf *c, const char *key)
4487{
4488 struct lxc_list *it, *next;
4489 bool all = false;
4490 const char *k = NULL;
4491
4492 if (strcmp(key, "lxc.limit") == 0)
4493 all = true;
4494 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4495 k = key + sizeof("lxc.limit.")-1;
4496 else
4497 return -1;
4498
4499 lxc_list_for_each_safe(it, &c->limits, next) {
4500 struct lxc_limit *lim = it->elem;
4501 if (!all && strcmp(lim->resource, k) != 0)
4502 continue;
4503 lxc_list_del(it);
4504 free(lim->resource);
4505 free(lim);
4506 free(it);
4507 }
4508 return 0;
4509}
4510
ee1e7aa0
SG
4511int lxc_clear_groups(struct lxc_conf *c)
4512{
4513 struct lxc_list *it,*next;
4514
4515 lxc_list_for_each_safe(it, &c->groups, next) {
4516 lxc_list_del(it);
4517 free(it->elem);
4518 free(it);
4519 }
4520 return 0;
4521}
4522
ab799c0b
SG
4523int lxc_clear_environment(struct lxc_conf *c)
4524{
4525 struct lxc_list *it,*next;
4526
4527 lxc_list_for_each_safe(it, &c->environment, next) {
4528 lxc_list_del(it);
4529 free(it->elem);
4530 free(it);
4531 }
4532 return 0;
4533}
4534
4535
72d0e1cb
SG
4536int lxc_clear_mount_entries(struct lxc_conf *c)
4537{
9ebb03ad 4538 struct lxc_list *it,*next;
72d0e1cb 4539
9ebb03ad 4540 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4541 lxc_list_del(it);
4542 free(it->elem);
4543 free(it);
4544 }
4545 return 0;
4546}
4547
b099e9e9
SH
4548int lxc_clear_automounts(struct lxc_conf *c)
4549{
4550 c->auto_mounts = 0;
4551 return 0;
4552}
4553
12a50cc6 4554int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4555{
9ebb03ad 4556 struct lxc_list *it,*next;
17ed13a3 4557 bool all = false, done = false;
a6390f01 4558 const char *k = NULL;
72d0e1cb
SG
4559 int i;
4560
17ed13a3
SH
4561 if (strcmp(key, "lxc.hook") == 0)
4562 all = true;
a6390f01
WB
4563 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4564 k = key + sizeof("lxc.hook.")-1;
4565 else
4566 return -1;
17ed13a3 4567
72d0e1cb 4568 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4569 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4570 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4571 lxc_list_del(it);
4572 free(it->elem);
4573 free(it);
4574 }
4575 done = true;
72d0e1cb
SG
4576 }
4577 }
17ed13a3
SH
4578
4579 if (!done) {
4580 ERROR("Invalid hook key: %s", key);
4581 return -1;
4582 }
72d0e1cb
SG
4583 return 0;
4584}
8eb5694b 4585
74a3920a 4586static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4587{
4588 int i;
4589
0cf45501 4590 if (!conf->saved_nics)
7b35f3d6
SH
4591 return;
4592 for (i=0; i < conf->num_savednics; i++)
4593 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4594 free(conf->saved_nics);
4595}
4596
4184c3e1
SH
4597static inline void lxc_clear_aliens(struct lxc_conf *conf)
4598{
4599 struct lxc_list *it,*next;
4600
4601 lxc_list_for_each_safe(it, &conf->aliens, next) {
4602 lxc_list_del(it);
4603 free(it->elem);
4604 free(it);
4605 }
4606}
4607
c7b15d1e 4608void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4609{
4610 struct lxc_list *it,*next;
4611
4612 lxc_list_for_each_safe(it, &conf->includes, next) {
4613 lxc_list_del(it);
4614 free(it->elem);
4615 free(it);
4616 }
4617}
4618
8eb5694b
SH
4619void lxc_conf_free(struct lxc_conf *conf)
4620{
4621 if (!conf)
4622 return;
858377e4
SH
4623 if (current_config == conf)
4624 current_config = NULL;
f10fad2f
ME
4625 free(conf->console.log_path);
4626 free(conf->console.path);
4627 free(conf->rootfs.mount);
b3b8c97f 4628 free(conf->rootfs.bdev_type);
f10fad2f
ME
4629 free(conf->rootfs.options);
4630 free(conf->rootfs.path);
f10fad2f 4631 free(conf->logfile);
858377e4
SH
4632 if (conf->logfd != -1)
4633 close(conf->logfd);
f10fad2f
ME
4634 free(conf->utsname);
4635 free(conf->ttydir);
4636 free(conf->fstab);
4637 free(conf->rcfile);
4638 free(conf->init_cmd);
6b0d5538 4639 free(conf->unexpanded_config);
393903d1 4640 free(conf->pty_names);
76d0127f 4641 free(conf->syslog);
8eb5694b 4642 lxc_clear_config_network(conf);
f10fad2f
ME
4643 free(conf->lsm_aa_profile);
4644 free(conf->lsm_se_context);
769872f9 4645 lxc_seccomp_free(conf);
8eb5694b 4646 lxc_clear_config_caps(conf);
1fb86a7c 4647 lxc_clear_config_keepcaps(conf);
8eb5694b 4648 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4649 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4650 lxc_clear_mount_entries(conf);
7b35f3d6 4651 lxc_clear_saved_nics(conf);
27c27d73 4652 lxc_clear_idmaps(conf);
ee1e7aa0 4653 lxc_clear_groups(conf);
f979ac15 4654 lxc_clear_includes(conf);
761d81ca 4655 lxc_clear_aliens(conf);
ab799c0b 4656 lxc_clear_environment(conf);
c6d09e15 4657 lxc_clear_limits(conf, "lxc.limit");
8eb5694b
SH
4658 free(conf);
4659}
4355ab5f
SH
4660
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function to call once the parent has signalled to proceed. */
	int (*fn)(void *);
	/* Optional name of fn, used for trace logging only; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to fn. */
	void *arg;
	/* Synchronization pipe: p[0] is read by the child, p[1] is the write end. */
	int p[2];
};
4667
4668static int run_userns_fn(void *data)
4669{
4670 struct userns_fn_data *d = data;
4671 char c;
4355ab5f 4672
f8aa4bf3 4673 /* Close write end of the pipe. */
4355ab5f 4674 close(d->p[1]);
f8aa4bf3
CB
4675
4676 /* Wait for parent to finish establishing a new mapping in the user
4677 * namespace we are executing in.
4678 */
4355ab5f
SH
4679 if (read(d->p[0], &c, 1) != 1)
4680 return -1;
f8aa4bf3
CB
4681
4682 /* Close read end of the pipe. */
4355ab5f 4683 close(d->p[0]);
f8aa4bf3 4684
c9b7c33e
CB
4685 if (d->fn_name)
4686 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 4687 /* Call function to run. */
4355ab5f
SH
4688 return d->fn(d->arg);
4689}
4690
339efad9 4691static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
4692 enum idtype idtype)
4693{
4694 struct lxc_list *it;
4695 struct id_map *map;
4696 struct id_map *retmap = NULL;
4697
4698 lxc_list_for_each(it, &conf->id_map) {
4699 map = it->elem;
4700 if (map->idtype != idtype)
4701 continue;
4702
4703 if (id >= map->hostid && id < map->hostid + map->range) {
4704 retmap = map;
4705 break;
4706 }
4707 }
4708
4709 if (!retmap)
4710 return NULL;
4711
4712 retmap = malloc(sizeof(*retmap));
4713 if (!retmap)
4714 return NULL;
4715
4716 memcpy(retmap, map, sizeof(*retmap));
4717 return retmap;
4718}
4719
4355ab5f 4720/*
f8aa4bf3
CB
4721 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4722 * existing one or establish a new one.
4355ab5f 4723 */
28a2d9e7 4724static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 4725{
28a2d9e7 4726 int hostid_mapped;
f8aa4bf3 4727 struct id_map *entry = NULL;
f8aa4bf3 4728
28a2d9e7
CB
4729 /* Reuse existing mapping. */
4730 entry = mapped_hostid_entry(conf, id, type);
4731 if (entry)
4732 return entry;
f8aa4bf3 4733
28a2d9e7
CB
4734 /* Find new mapping. */
4735 hostid_mapped = find_unmapped_nsid(conf, type);
4736 if (hostid_mapped < 0) {
4737 DEBUG("failed to find free mapping for id %d", id);
4738 return NULL;
f8aa4bf3 4739 }
f8aa4bf3 4740
28a2d9e7
CB
4741 entry = malloc(sizeof(*entry));
4742 if (!entry)
4743 return NULL;
4355ab5f 4744
28a2d9e7
CB
4745 entry->idtype = type;
4746 entry->nsid = hostid_mapped;
4747 entry->hostid = (unsigned long)id;
4748 entry->range = 1;
4355ab5f 4749
28a2d9e7 4750 return entry;
4355ab5f
SH
4751}
4752
f8aa4bf3
CB
4753/* Run a function in a new user namespace.
4754 * The caller's euid/egid will be mapped if it is not already.
4755 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4756 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4757 * This means we require only to establish a mapping from:
4758 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4759 * - the container root -> some sub{g,u}id
4760 * The former we add, if the user did not specifiy a mapping. The latter we
4761 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4762 * there to start the container in the first place.
4355ab5f 4763 */
c9b7c33e
CB
4764int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4765 const char *fn_name)
4355ab5f 4766{
f8aa4bf3
CB
4767 pid_t pid;
4768 uid_t euid, egid;
4355ab5f 4769 struct userns_fn_data d;
4355ab5f 4770 int p[2];
f8aa4bf3
CB
4771 struct lxc_list *it;
4772 struct id_map *map;
4773 char c = '1';
4774 int ret = -1;
4775 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4776 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4777 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4778
4355ab5f 4779 ret = pipe(p);
4355ab5f
SH
4780 if (ret < 0) {
4781 SYSERROR("opening pipe");
4782 return -1;
4783 }
4784 d.fn = fn;
c9b7c33e 4785 d.fn_name = fn_name;
4355ab5f
SH
4786 d.arg = data;
4787 d.p[0] = p[0];
4788 d.p[1] = p[1];
f8aa4bf3
CB
4789
4790 /* Clone child in new user namespace. */
4355ab5f 4791 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4792 if (pid < 0) {
4793 ERROR("failed to clone child process in new user namespace");
4794 goto on_error;
4795 }
4796
4355ab5f 4797 close(p[0]);
4355ab5f
SH
4798 p[0] = -1;
4799
f8aa4bf3
CB
4800 /* Find container root. */
4801 lxc_list_for_each(it, &conf->id_map) {
4802 map = it->elem;
4803
4804 if (map->nsid != 0)
4805 continue;
4806
4807 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4808 container_root_uid = malloc(sizeof(*container_root_uid));
4809 if (!container_root_uid)
4810 goto on_error;
4811 container_root_uid->idtype = map->idtype;
4812 container_root_uid->hostid = map->hostid;
4813 container_root_uid->nsid = 0;
4814 container_root_uid->range = map->range;
4815 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4816 container_root_gid = malloc(sizeof(*container_root_gid));
4817 if (!container_root_gid)
4818 goto on_error;
4819 container_root_gid->idtype = map->idtype;
4820 container_root_gid->hostid = map->hostid;
4821 container_root_gid->nsid = 0;
4822 container_root_gid->range = map->range;
4823 }
4824
4825 /* Found container root. */
4826 if (container_root_uid && container_root_gid)
4827 break;
4828 }
4829
4830 /* This is actually checked earlier but it can't hurt. */
4831 if (!container_root_uid || !container_root_gid) {
4832 ERROR("no mapping for container root found");
4833 goto on_error;
4834 }
4835
1d90e064
CB
4836 host_uid_map = container_root_uid;
4837 host_gid_map = container_root_gid;
4838
f8aa4bf3
CB
4839 /* Check whether the {g,u}id of the user has a mapping. */
4840 euid = geteuid();
4841 egid = getegid();
1d90e064 4842 if (euid != container_root_uid->hostid)
28a2d9e7
CB
4843 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4844
1d90e064 4845 if (egid != container_root_gid->hostid)
28a2d9e7
CB
4846 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4847
4848 if (!host_uid_map) {
4849 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4850 goto on_error;
4851 }
4852
28a2d9e7
CB
4853 if (!host_gid_map) {
4854 DEBUG("failed to find mapping for gid %d", egid);
4855 goto on_error;
4856 }
4857
4858 /* Allocate new {g,u}id map list. */
4859 idmap = malloc(sizeof(*idmap));
4860 if (!idmap)
4861 goto on_error;
4862 lxc_list_init(idmap);
4863
f8aa4bf3
CB
4864 /* Add container root to the map. */
4865 tmplist = malloc(sizeof(*tmplist));
4866 if (!tmplist)
4867 goto on_error;
4868 lxc_list_add_elem(tmplist, container_root_uid);
4869 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4870
1d90e064 4871 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4872 /* idmap will now keep track of that memory. */
4873 container_root_uid = NULL;
4874
4875 /* Add container root to the map. */
4876 tmplist = malloc(sizeof(*tmplist));
4877 if (!tmplist)
4878 goto on_error;
4879 lxc_list_add_elem(tmplist, host_uid_map);
4880 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4881 }
1d90e064
CB
4882 /* idmap will now keep track of that memory. */
4883 container_root_uid = NULL;
4884 /* idmap will now keep track of that memory. */
4885 host_uid_map = NULL;
f8aa4bf3
CB
4886
4887 tmplist = malloc(sizeof(*tmplist));
4888 if (!tmplist)
4889 goto on_error;
4890 lxc_list_add_elem(tmplist, container_root_gid);
4891 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4892
1d90e064 4893 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4894 /* idmap will now keep track of that memory. */
4895 container_root_gid = NULL;
4896
4897 tmplist = malloc(sizeof(*tmplist));
4898 if (!tmplist)
4899 goto on_error;
4900 lxc_list_add_elem(tmplist, host_gid_map);
4901 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4902 }
1d90e064
CB
4903 /* idmap will now keep track of that memory. */
4904 container_root_gid = NULL;
4905 /* idmap will now keep track of that memory. */
4906 host_gid_map = NULL;
f8aa4bf3 4907
77803ee7
CB
4908 if (lxc_log_get_level() == LXC_LOG_PRIORITY_TRACE ||
4909 conf->loglevel == LXC_LOG_PRIORITY_TRACE) {
f8aa4bf3
CB
4910 lxc_list_for_each(it, idmap) {
4911 map = it->elem;
4912 TRACE("establishing %cid mapping for \"%d\" in new "
4913 "user namespace: nsuid %lu - hostid %lu - range "
4914 "%lu",
4915 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4916 map->nsid, map->hostid, map->range);
4917 }
4355ab5f
SH
4918 }
4919
f8aa4bf3 4920 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4921 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4922 if (ret < 0) {
4923 ERROR("error setting up {g,u}id mappings for child process "
4924 "\"%d\"",
4925 pid);
4926 goto on_error;
4355ab5f
SH
4927 }
4928
f8aa4bf3 4929 /* Tell child to proceed. */
4355ab5f 4930 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4931 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4932 goto on_error;
4355ab5f
SH
4933 }
4934
f8aa4bf3 4935 /* Wait for child to finish. */
3139aead
SG
4936 ret = wait_for_pid(pid);
4937
f8aa4bf3 4938on_error:
1d90e064
CB
4939 if (idmap)
4940 lxc_free_idmap(idmap);
4941 if (container_root_uid)
4942 free(container_root_uid);
4943 if (container_root_gid)
4944 free(container_root_gid);
4945 if (host_uid_map && (host_uid_map != container_root_uid))
4946 free(host_uid_map);
4947 if (host_gid_map && (host_gid_map != container_root_gid))
4948 free(host_gid_map);
3139aead 4949
4355ab5f
SH
4950 if (p[0] != -1)
4951 close(p[0]);
4952 close(p[1]);
f8aa4bf3
CB
4953
4954 return ret;
4355ab5f 4955}
97e9cfa0 4956
/*
 * Return a newly allocated copy of the login name for the effective uid, or
 * NULL if the lookup or the allocation fails.  The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4968
/*
 * Return a newly allocated copy of the group name for the effective gid, or
 * NULL if the lookup or the allocation fails.  The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4980
a96a8e8c 4981/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4982void suggest_default_idmap(void)
4983{
4984 FILE *f;
4985 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4986 char *line = NULL;
4987 char *uname, *gname;
4988 size_t len = 0;
4989
4990 if (!(uname = getuname()))
4991 return;
4992
4993 if (!(gname = getgname())) {
4994 free(uname);
4995 return;
4996 }
4997
4998 f = fopen(subuidfile, "r");
4999 if (!f) {
5000 ERROR("Your system is not configured with subuids");
5001 free(gname);
5002 free(uname);
5003 return;
5004 }
5005 while (getline(&line, &len, f) != -1) {
b7930180 5006 size_t no_newline = 0;
97e9cfa0
SH
5007 char *p = strchr(line, ':'), *p2;
5008 if (*line == '#')
5009 continue;
5010 if (!p)
5011 continue;
5012 *p = '\0';
5013 p++;
5014 if (strcmp(line, uname))
5015 continue;
5016 p2 = strchr(p, ':');
5017 if (!p2)
5018 continue;
5019 *p2 = '\0';
5020 p2++;
5021 if (!*p2)
5022 continue;
b7930180
CB
5023 no_newline = strcspn(p2, "\n");
5024 p2[no_newline] = '\0';
5025
b7b2fde4
CB
5026 if (lxc_safe_uint(p, &uid) < 0)
5027 WARN("Could not parse UID.");
5028 if (lxc_safe_uint(p2, &urange) < 0)
5029 WARN("Could not parse UID range.");
97e9cfa0
SH
5030 }
5031 fclose(f);
5032
6be7389a 5033 f = fopen(subgidfile, "r");
97e9cfa0
SH
5034 if (!f) {
5035 ERROR("Your system is not configured with subgids");
5036 free(gname);
5037 free(uname);
5038 return;
5039 }
5040 while (getline(&line, &len, f) != -1) {
b7930180 5041 size_t no_newline = 0;
97e9cfa0
SH
5042 char *p = strchr(line, ':'), *p2;
5043 if (*line == '#')
5044 continue;
5045 if (!p)
5046 continue;
5047 *p = '\0';
5048 p++;
5049 if (strcmp(line, uname))
5050 continue;
5051 p2 = strchr(p, ':');
5052 if (!p2)
5053 continue;
5054 *p2 = '\0';
5055 p2++;
5056 if (!*p2)
5057 continue;
b7930180
CB
5058 no_newline = strcspn(p2, "\n");
5059 p2[no_newline] = '\0';
5060
b7b2fde4
CB
5061 if (lxc_safe_uint(p, &gid) < 0)
5062 WARN("Could not parse GID.");
5063 if (lxc_safe_uint(p2, &grange) < 0)
5064 WARN("Could not parse GID range.");
97e9cfa0
SH
5065 }
5066 fclose(f);
5067
f10fad2f 5068 free(line);
97e9cfa0
SH
5069
5070 if (!urange || !grange) {
5071 ERROR("You do not have subuids or subgids allocated");
5072 ERROR("Unprivileged containers require subuids and subgids");
5073 return;
5074 }
5075
5076 ERROR("You must either run as root, or define uid mappings");
5077 ERROR("To pass uid mappings to lxc-create, you could create");
5078 ERROR("~/.config/lxc/default.conf:");
5079 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5080 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5081 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5082
5083 free(gname);
5084 free(uname);
5085}
aaf26830 5086
a7307747
SH
5087static void free_cgroup_settings(struct lxc_list *result)
5088{
5089 struct lxc_list *iterator, *next;
5090
5091 lxc_list_for_each_safe(iterator, result, next) {
5092 lxc_list_del(iterator);
5093 free(iterator);
5094 }
5095 free(result);
5096}
5097
aaf26830
KT
5098/*
5099 * Return the list of cgroup_settings sorted according to the following rules
5100 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5101 */
5102struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5103{
5104 struct lxc_list *result;
5105 struct lxc_list *memsw_limit = NULL;
5106 struct lxc_list *it = NULL;
5107 struct lxc_cgroup *cg = NULL;
5108 struct lxc_list *item = NULL;
5109
5110 result = malloc(sizeof(*result));
fac7c663
KT
5111 if (!result) {
5112 ERROR("failed to allocate memory to sort cgroup settings");
5113 return NULL;
5114 }
aaf26830
KT
5115 lxc_list_init(result);
5116
5117 /*Iterate over the cgroup settings and copy them to the output list*/
5118 lxc_list_for_each(it, cgroup_settings) {
5119 item = malloc(sizeof(*item));
fac7c663
KT
5120 if (!item) {
5121 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 5122 free_cgroup_settings(result);
fac7c663
KT
5123 return NULL;
5124 }
aaf26830
KT
5125 item->elem = it->elem;
5126 cg = it->elem;
5127 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5128 /* Store the memsw_limit location */
5129 memsw_limit = item;
5130 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 5131 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
5132 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5133 item->elem = memsw_limit->elem;
5134 memsw_limit->elem = it->elem;
5135 }
5136 lxc_list_add_tail(result, item);
5137 }
5138
5139 return result;
a7307747 5140}