]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
Merge pull request #1575 from brauner/2017-05-18/fix_tmp_mount_proc
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
d8e48992 82#include "lxcaufs.h"
025ed0f3 83#include "lxclock.h"
8f3e280e
CB
84#include "lxcoverlay.h"
85#include "lxcseccomp.h"
4355ab5f 86#include "namespace.h"
8f3e280e
CB
87#include "network.h"
88#include "parse.h"
89#include "utils.h"
fe4de9a6 90#include "lsm/lsm.h"
d0a36f2c 91
e37dda71 92#if HAVE_LIBCAP
495d2046
SG
93#include <sys/capability.h>
94#endif
95
6ff05e18
SG
96#if HAVE_SYS_PERSONALITY_H
97#include <sys/personality.h>
98#endif
99
edaf8b1b
SG
100#if IS_BIONIC
101#include <../include/lxcmntent.h>
a04f5407
CB
102#ifndef HAVE_PRLIMIT
103#include <../include/prlimit.h>
104#endif
edaf8b1b
SG
105#else
106#include <mntent.h>
107#endif
108
36eb9bde 109lxc_log_define(lxc_conf, lxc);
e5bda9ee 110
e37dda71 111#if HAVE_LIBCAP
b09094da
MN
112#ifndef CAP_SETFCAP
113#define CAP_SETFCAP 31
114#endif
115
116#ifndef CAP_MAC_OVERRIDE
117#define CAP_MAC_OVERRIDE 32
118#endif
119
120#ifndef CAP_MAC_ADMIN
121#define CAP_MAC_ADMIN 33
122#endif
495d2046 123#endif
b09094da
MN
124
125#ifndef PR_CAPBSET_DROP
126#define PR_CAPBSET_DROP 24
127#endif
128
9818cae4
SG
129#ifndef LO_FLAGS_AUTOCLEAR
130#define LO_FLAGS_AUTOCLEAR 4
131#endif
132
bc5b27d6
DK
133#ifndef CAP_SETUID
134#define CAP_SETUID 7
135#endif
136
137#ifndef CAP_SETGID
138#define CAP_SETGID 6
139#endif
140
0769b82a
CS
141/* needed for cgroup automount checks, regardless of whether we
142 * have included linux/capability.h or not */
143#ifndef CAP_SYS_ADMIN
144#define CAP_SYS_ADMIN 21
145#endif
146
2d76d1d7
SG
/*
 * pivot_root(): use the C library's version when the build system found one,
 * otherwise provide a thin wrapper around the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number known for this architecture. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
161
162/* Define sethostname() if missing from the C library */
163#ifndef HAVE_SETHOSTNAME
164static int sethostname(const char * name, size_t len)
165{
166#ifdef __NR_sethostname
8f3e280e 167 return syscall(__NR_sethostname, name, len);
2d76d1d7 168#else
8f3e280e
CB
169 errno = ENOSYS;
170 return -1;
2d76d1d7
SG
171#endif
172}
173#endif
174
72f919c4
SG
175/* Define __S_ISTYPE if missing from the C library */
176#ifndef __S_ISTYPE
177#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
178#endif
179
ecec0126
SG
180#ifndef MS_PRIVATE
181#define MS_PRIVATE (1<<18)
182#endif
183
8912711c
CB
184#ifndef MS_LAZYTIME
185#define MS_LAZYTIME (1<<25)
186#endif
187
5ef5c9a3
CB
188/* memfd_create() */
189#ifndef MFD_CLOEXEC
190#define MFD_CLOEXEC 0x0001U
191#endif
192
193#ifndef MFD_ALLOW_SEALING
194#define MFD_ALLOW_SEALING 0x0002U
195#endif
196
#ifndef HAVE_MEMFD_CREATE
/*
 * Per-architecture syscall numbers for memfd_create(), used only when the
 * system headers do not define __NR_memfd_create themselves.
 */
#ifndef __NR_memfd_create
# if defined __i386__
#  define __NR_memfd_create 356
# elif defined __x86_64__
#  define __NR_memfd_create 319
# elif defined __arm__
#  define __NR_memfd_create 385
# elif defined __aarch64__
#  define __NR_memfd_create 279
# elif defined __s390__
#  define __NR_memfd_create 350
# elif defined __powerpc__
#  define __NR_memfd_create 360
# elif defined __sparc__
#  define __NR_memfd_create 348
# elif defined __blackfin__
#  define __NR_memfd_create 390
# elif defined __ia64__
#  define __NR_memfd_create 1340
# elif defined _MIPS_SIM
#  if _MIPS_SIM == _MIPS_SIM_ABI32
#   define __NR_memfd_create 4354
#  endif
#  if _MIPS_SIM == _MIPS_SIM_NABI32
#   define __NR_memfd_create 6318
#  endif
#  if _MIPS_SIM == _MIPS_SIM_ABI64
#   define __NR_memfd_create 5314
#  endif
# endif
#endif

/*
 * Fallback memfd_create() for C libraries that lack it. Fails with ENOSYS
 * when no syscall number is available for the target architecture.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
240
72d0e1cb 241char *lxchook_names[NUM_LXC_HOOKS] = {
52492063 242 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 243
a589434e 244typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 245
998ac676
RT
/* One entry of the mount option name -> mount flag table. */
struct mount_opt {
	char *name;	/* option name as written in a mount option string */
	int clear;	/* 0 or 1 in the table below; presumably 1 means "clear the flag" - confirm against parser */
	int flag;	/* MS_* flag(s) associated with the option */
};
251
81810dd1
DL
/* One entry of the capability name -> CAP_* value table. */
struct caps_opt {
	char *name;	/* lower-case capability name without the "CAP_" prefix */
	int value;	/* matching CAP_* constant */
};
256
c6d09e15
WB
/* One entry of the resource limit name -> RLIMIT_* value table. */
struct limit_opt {
	char *name;	/* lower-case limit name without the "RLIMIT_" prefix */
	int value;	/* matching RLIMIT_* constant */
};
261
858377e4
SH
262/*
263 * The lxc_conf of the container currently being worked on in an
264 * API call
265 * This is used in the error calls
266 */
267#ifdef HAVE_TLS
268__thread struct lxc_conf *current_config;
269#else
270struct lxc_conf *current_config;
271#endif
272
0769b82a
CS
273/* Declare this here, since we don't want to reshuffle the whole file. */
274static int in_caplist(int cap, struct lxc_list *caps);
275
a589434e
JN
276static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
277static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
278static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
279static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
280static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
281static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
282
283static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
284 [LXC_NET_VETH] = instantiate_veth,
285 [LXC_NET_MACVLAN] = instantiate_macvlan,
286 [LXC_NET_VLAN] = instantiate_vlan,
287 [LXC_NET_PHYS] = instantiate_phys,
288 [LXC_NET_EMPTY] = instantiate_empty,
289 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 290};
291
74a2b586
JK
292static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
293static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
294static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
295static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
296static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 297static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 298
a589434e 299static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
300 [LXC_NET_VETH] = shutdown_veth,
301 [LXC_NET_MACVLAN] = shutdown_macvlan,
302 [LXC_NET_VLAN] = shutdown_vlan,
303 [LXC_NET_PHYS] = shutdown_phys,
304 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 305 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
306};
307
998ac676 308static struct mount_opt mount_opt[] = {
470b359b
CB
309 { "async", 1, MS_SYNCHRONOUS },
310 { "atime", 1, MS_NOATIME },
311 { "bind", 0, MS_BIND },
88d413d5 312 { "defaults", 0, 0 },
88d413d5 313 { "dev", 1, MS_NODEV },
470b359b 314 { "diratime", 1, MS_NODIRATIME },
88d413d5 315 { "dirsync", 0, MS_DIRSYNC },
470b359b 316 { "exec", 1, MS_NOEXEC },
8912711c 317 { "lazytime", 0, MS_LAZYTIME },
88d413d5 318 { "mand", 0, MS_MANDLOCK },
88d413d5 319 { "noatime", 0, MS_NOATIME },
470b359b 320 { "nodev", 0, MS_NODEV },
88d413d5 321 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
322 { "noexec", 0, MS_NOEXEC },
323 { "nomand", 1, MS_MANDLOCK },
324 { "norelatime", 1, MS_RELATIME },
325 { "nostrictatime", 1, MS_STRICTATIME },
326 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
327 { "rbind", 0, MS_BIND|MS_REC },
328 { "relatime", 0, MS_RELATIME },
470b359b
CB
329 { "remount", 0, MS_REMOUNT },
330 { "ro", 0, MS_RDONLY },
331 { "rw", 1, MS_RDONLY },
88d413d5 332 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
333 { "suid", 1, MS_NOSUID },
334 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 335 { NULL, 0, 0 },
998ac676
RT
336};
337
e37dda71 338#if HAVE_LIBCAP
81810dd1 339static struct caps_opt caps_opt[] = {
a6afdde9 340 { "chown", CAP_CHOWN },
1e11be34
DL
341 { "dac_override", CAP_DAC_OVERRIDE },
342 { "dac_read_search", CAP_DAC_READ_SEARCH },
343 { "fowner", CAP_FOWNER },
344 { "fsetid", CAP_FSETID },
81810dd1
DL
345 { "kill", CAP_KILL },
346 { "setgid", CAP_SETGID },
347 { "setuid", CAP_SETUID },
348 { "setpcap", CAP_SETPCAP },
349 { "linux_immutable", CAP_LINUX_IMMUTABLE },
350 { "net_bind_service", CAP_NET_BIND_SERVICE },
351 { "net_broadcast", CAP_NET_BROADCAST },
352 { "net_admin", CAP_NET_ADMIN },
353 { "net_raw", CAP_NET_RAW },
354 { "ipc_lock", CAP_IPC_LOCK },
355 { "ipc_owner", CAP_IPC_OWNER },
356 { "sys_module", CAP_SYS_MODULE },
357 { "sys_rawio", CAP_SYS_RAWIO },
358 { "sys_chroot", CAP_SYS_CHROOT },
359 { "sys_ptrace", CAP_SYS_PTRACE },
360 { "sys_pacct", CAP_SYS_PACCT },
361 { "sys_admin", CAP_SYS_ADMIN },
362 { "sys_boot", CAP_SYS_BOOT },
363 { "sys_nice", CAP_SYS_NICE },
364 { "sys_resource", CAP_SYS_RESOURCE },
365 { "sys_time", CAP_SYS_TIME },
366 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
367 { "mknod", CAP_MKNOD },
368 { "lease", CAP_LEASE },
57b837e2
CB
369#ifdef CAP_AUDIT_READ
370 { "audit_read", CAP_AUDIT_READ },
371#endif
9527e566 372#ifdef CAP_AUDIT_WRITE
81810dd1 373 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
374#endif
375#ifdef CAP_AUDIT_CONTROL
81810dd1 376 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 377#endif
81810dd1
DL
378 { "setfcap", CAP_SETFCAP },
379 { "mac_override", CAP_MAC_OVERRIDE },
380 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
381#ifdef CAP_SYSLOG
382 { "syslog", CAP_SYSLOG },
383#endif
384#ifdef CAP_WAKE_ALARM
385 { "wake_alarm", CAP_WAKE_ALARM },
386#endif
2b54359b
CB
387#ifdef CAP_BLOCK_SUSPEND
388 { "block_suspend", CAP_BLOCK_SUSPEND },
389#endif
81810dd1 390};
495d2046
SG
391#else
392static struct caps_opt caps_opt[] = {};
393#endif
81810dd1 394
c6d09e15
WB
395static struct limit_opt limit_opt[] = {
396#ifdef RLIMIT_AS
397 { "as", RLIMIT_AS },
398#endif
399#ifdef RLIMIT_CORE
400 { "core", RLIMIT_CORE },
401#endif
402#ifdef RLIMIT_CPU
403 { "cpu", RLIMIT_CPU },
404#endif
405#ifdef RLIMIT_DATA
406 { "data", RLIMIT_DATA },
407#endif
408#ifdef RLIMIT_FSIZE
409 { "fsize", RLIMIT_FSIZE },
410#endif
411#ifdef RLIMIT_LOCKS
412 { "locks", RLIMIT_LOCKS },
413#endif
414#ifdef RLIMIT_MEMLOCK
415 { "memlock", RLIMIT_MEMLOCK },
416#endif
417#ifdef RLIMIT_MSGQUEUE
418 { "msgqueue", RLIMIT_MSGQUEUE },
419#endif
420#ifdef RLIMIT_NICE
421 { "nice", RLIMIT_NICE },
422#endif
423#ifdef RLIMIT_NOFILE
424 { "nofile", RLIMIT_NOFILE },
425#endif
426#ifdef RLIMIT_NPROC
427 { "nproc", RLIMIT_NPROC },
428#endif
429#ifdef RLIMIT_RSS
430 { "rss", RLIMIT_RSS },
431#endif
432#ifdef RLIMIT_RTPRIO
433 { "rtprio", RLIMIT_RTPRIO },
434#endif
435#ifdef RLIMIT_RTTIME
436 { "rttime", RLIMIT_RTTIME },
437#endif
438#ifdef RLIMIT_SIGPENDING
439 { "sigpending", RLIMIT_SIGPENDING },
440#endif
441#ifdef RLIMIT_STACK
442 { "stack", RLIMIT_STACK },
443#endif
444};
445
91c3830e
SH
446static int run_buffer(char *buffer)
447{
ebec9176 448 struct lxc_popen_FILE *f;
91c3830e 449 char *output;
8e7da691 450 int ret;
91c3830e 451
ebec9176 452 f = lxc_popen(buffer);
91c3830e 453 if (!f) {
062b72c6 454 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
455 return -1;
456 }
457
458 output = malloc(LXC_LOG_BUFFER_SIZE);
459 if (!output) {
062b72c6 460 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 461 lxc_pclose(f);
91c3830e
SH
462 return -1;
463 }
464
062b72c6
CB
465 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
466 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
467
468 free(output);
469
ebec9176 470 ret = lxc_pclose(f);
8e7da691 471 if (ret == -1) {
062b72c6 472 SYSERROR("Script exited with error.");
91c3830e 473 return -1;
8e7da691 474 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 475 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
476 return -1;
477 } else if (WIFSIGNALED(ret)) {
062b72c6 478 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 479 return -1;
91c3830e
SH
480 }
481
482 return 0;
483}
484
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it with run_buffer().
 *
 * @name:    container name, passed as first argument to the script
 * @section: config section, passed as second argument
 * @script:  the script (command) to run
 * @hook:    hook name, passed as third argument
 * @lxcpath: currently unused by this function
 * @argsin:  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * The command line is assembled in a heap buffer: its length depends on
 * caller-supplied strings and is only bounded by INT_MAX, which is far too
 * large for a stack allocation.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	int fret = -1;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Each extra argument is preceded by one space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	/* The hook name plus the terminating NUL byte. */
	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces in "%s %s %s %s". */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			goto out;
		}
		ret += rc;
	}

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
535
062b72c6
CB
/*
 * Build the command line "<script> <name> <section> [args...]" and execute
 * it with run_buffer().
 *
 * @name:    container name, passed as first argument to the script
 * @section: config section, passed as second argument
 * @script:  the script (command) to run
 * @...:     extra arguments as a list of char * terminated by a NULL pointer
 *
 * The command line is assembled in a heap buffer: its length depends on
 * caller-supplied strings and is only bounded by INT_MAX, which is far too
 * large for a stack allocation.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the variadic arguments: compute the needed size. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces in "%s %s %s" plus the terminating NUL byte. */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	/* Second pass: append every extra argument, separated by a space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() must be paired with va_end(). */
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
587
/*
 * Bind-mount the directory @rootfs recursively onto @target.
 *
 * @options is parsed with parse_mntopts(); the resulting flags are OR-ed
 * with MS_BIND | MS_REC and the remaining data string is passed to mount().
 *
 * Returns -1 if the options could not be parsed, otherwise the return value
 * of mount().
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	unsigned long mntflags = 0;
	/*
	 * Start from NULL so that the free() on the error path is safe even
	 * if parse_mntopts() fails before storing anything in mntdata.
	 */
	char *mntdata = NULL;
	int ret;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);
	free(mntdata);

	return ret;
}
605
c6868a1f 606static int lxc_mount_rootfs_file(const char *rootfs, const char *target,
d435aae1 607 const char *options)
78ae2fcc 608{
c6868a1f 609 int ret, loopfd;
a6afdde9 610 char path[MAXPATHLEN];
78ae2fcc 611
c6868a1f
CB
612 loopfd = lxc_prepare_loop_dev(rootfs, path, LO_FLAGS_AUTOCLEAR);
613 if (loopfd < 0)
78ae2fcc 614 return -1;
c6868a1f 615 DEBUG("prepared loop device \"%s\"", path);
a6afdde9 616
c6868a1f
CB
617 ret = mount_unknown_fs(path, target, options);
618 close(loopfd);
a6afdde9 619
c6868a1f 620 DEBUG("mounted rootfs \"%s\" on loop device \"%s\" via loop device \"%s\"", rootfs, target, path);
a6afdde9
DL
621
622 return ret;
78ae2fcc 623}
624
a17b1e65
SG
/*
 * Mount the block device @rootfs onto @target by delegating to
 * mount_unknown_fs(); its return value is passed through unchanged.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int rc;

	rc = mount_unknown_fs(rootfs, target, options);
	return rc;
}
630
0c547523
SH
/*
 * pin_rootfs
 * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
 * the duration of the container run, to prevent the container from marking
 * the underlying fs readonly on shutdown. unlink the file immediately so
 * no name pollution happens.
 * return -1 on error.
 * return -2 if nothing needed to be pinned.
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	/* A path that cannot be resolved is treated as "nothing to pin". */
	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	/* Only directory-backed root filesystems are pinned. */
	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd keeps the file alive; the name is no longer needed. */
	(void)unlink(absrootfspin);
	return fd;
}
673
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the nosuid/nodev/ro/noexec flags
 * that are already set on the existing mount so they stay honored.
 *
 * @s:     mount source; when NULL, @d is inspected instead
 * @d:     mount destination
 * @flags: requested mount flags
 *
 * Returns @flags unchanged if MS_REMOUNT is not requested, if neither path is
 * given, if statvfs() fails, or if the build lacks statvfs(). Otherwise
 * returns @flags with the restrictive flags found on the mount added.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *probe = s ? s : d;
	struct statvfs vfs;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	if (probe == NULL)
		return flags;

	if (statvfs(probe, &vfs) < 0)
		return flags;

	return flags | (vfs.f_flag & sticky);
#endif
}
710
4fb3cba5 711static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 712{
368bbc02 713 int r;
80e80c40 714 int i;
b06b8511
CS
715 static struct {
716 int match_mask;
717 int match_flag;
718 const char *source;
719 const char *destination;
720 const char *fstype;
721 unsigned long flags;
722 const char *options;
723 } default_mounts[] = {
724 /* Read-only bind-mounting... In older kernels, doing that required
725 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
726 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
727 * kernel 2.6.26 onwards. However, this apparently does not work on
728 * kernel 3.8. Unfortunately, on that very same kernel, doing the
729 * same trick as above doesn't seem to work either, there one needs
730 * to ALSO specify MS_BIND for the remount, otherwise the entire
731 * fs is remounted read-only or the mount fails because it's busy...
732 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
733 * 2.6.32...
368bbc02 734 */
f24a52d5 735 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
736 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
737 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
738 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
739 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 740 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
741 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
742 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
743 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
744 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
745 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
746 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
747 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
748 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
749 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
750 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
751 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
752 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 753 };
368bbc02 754
b06b8511
CS
755 for (i = 0; default_mounts[i].match_mask; i++) {
756 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
757 char *source = NULL;
758 char *destination = NULL;
759 int saved_errno;
e2a7e8dc 760 unsigned long mflags;
b06b8511
CS
761
762 if (default_mounts[i].source) {
763 /* will act like strdup if %r is not present */
8ede5f4c 764 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
765 if (!source) {
766 SYSERROR("memory allocation error");
767 return -1;
768 }
769 }
cc4fd506
SH
770 if (!default_mounts[i].destination) {
771 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 772 free(source);
cc4fd506
SH
773 return -1;
774 }
775 /* will act like strdup if %r is not present */
776 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
777 if (!destination) {
778 saved_errno = errno;
779 SYSERROR("memory allocation error");
780 free(source);
781 errno = saved_errno;
782 return -1;
b06b8511 783 }
e2a7e8dc
SH
784 mflags = add_required_remount_flags(source, destination,
785 default_mounts[i].flags);
592fd47a 786 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 787 saved_errno = errno;
b88ff9a0
SG
788 if (r < 0 && errno == ENOENT) {
789 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
790 r = 0;
791 }
792 else if (r < 0)
e2a7e8dc 793 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 794
b06b8511
CS
795 free(source);
796 free(destination);
797 if (r < 0) {
b06b8511
CS
798 errno = saved_errno;
799 return -1;
800 }
368bbc02 801 }
368bbc02
CS
802 }
803
b06b8511 804 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
805 int cg_flags;
806
807 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
808 /* If the type of cgroup mount was not specified, it depends on the
809 * container's capabilities as to what makes sense: if we have
810 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
811 * anyway, so we may as well default to read-write; then the admin
812 * will not be given a false sense of security. (And if they really
813 * want mixed r/o r/w, then they can explicitly specify :mixed.)
814 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
815 * :mixed, because then the container can't remount it read-write. */
816 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
817 int has_sys_admin = 0;
818 if (!lxc_list_empty(&conf->keepcaps)) {
819 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
820 } else {
821 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
822 }
823 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC) {
824 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
825 } else {
826 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
827 }
828 }
829
8ede5f4c 830 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 831 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 832 return -1;
368bbc02
CS
833 }
834 }
835
368bbc02 836 return 0;
368bbc02
CS
837}
838
/*
 * Mount @rootfs onto @target, choosing the strategy from the file type of
 * the resolved rootfs path: directory, block device or regular file.
 *
 * Returns -1 if the path cannot be resolved, accessed or stat'ed, or if its
 * file type is not one of the three above; otherwise the return value of the
 * selected mount helper.
 */
static int mount_rootfs(const char *rootfs, const char *target, const char *options)
{
	char resolved[MAXPATHLEN];
	struct stat sb;

	if (!realpath(rootfs, resolved)) {
		SYSERROR("Failed to get real path for \"%s\".", rootfs);
		return -1;
	}

	if (access(resolved, F_OK)) {
		SYSERROR("The rootfs \"%s\" is not accessible.", resolved);
		return -1;
	}

	if (stat(resolved, &sb)) {
		SYSERROR("Failed to stat the rootfs \"%s\".", resolved);
		return -1;
	}

	switch (sb.st_mode & S_IFMT) {
	case S_IFDIR:
		return mount_rootfs_dir(resolved, target, options);
	case S_IFBLK:
		return mount_rootfs_block(resolved, target, options);
	case S_IFREG:
		return lxc_mount_rootfs_file(resolved, target, options);
	default:
		break;
	}

	ERROR("Unsupported rootfs type for rootfs \"%s\".", resolved);
	return -1;
}
881
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to configure and counts as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
896
69aa6655
DE
897struct dev_symlinks {
898 const char *oldpath;
899 const char *name;
900};
901
902static const struct dev_symlinks dev_symlinks[] = {
903 {"/proc/self/fd", "fd"},
904 {"/proc/self/fd/0", "stdin"},
905 {"/proc/self/fd/1", "stdout"},
906 {"/proc/self/fd/2", "stderr"},
907};
908
909static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
910{
911 char path[MAXPATHLEN];
912 int ret,i;
09227be2 913 struct stat s;
69aa6655
DE
914
915
916 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
917 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 918 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
919 if (ret < 0 || ret >= MAXPATHLEN)
920 return -1;
09227be2
MW
921
922 /*
923 * Stat the path first. If we don't get an error
924 * accept it as is and don't try to create it
925 */
926 if (!stat(path, &s)) {
927 continue;
928 }
929
69aa6655 930 ret = symlink(d->oldpath, path);
09227be2 931
69aa6655 932 if (ret && errno != EEXIST) {
09227be2
MW
933 if ( errno == EROFS ) {
934 WARN("Warning: Read Only file system while creating %s", path);
935 } else {
936 SYSERROR("Error creating %s", path);
937 return -1;
938 }
69aa6655
DE
939 }
940 }
941 return 0;
942}
943
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * If *pp is NULL a new string "container_ttys=<name>" is allocated,
 * otherwise " <name>" is appended to the existing string, growing it with
 * realloc(). Returns false (leaving *pp untouched) when memory runs out,
 * true otherwise.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefixlen = sizeof(prefix) - 1;
	const size_t namelen = strlen(name);
	size_t curlen;
	char *grown;

	if (*pp == NULL) {
		char *fresh = malloc(namelen + prefixlen + 1);

		if (fresh == NULL)
			return false;

		memcpy(fresh, prefix, prefixlen);
		/* Copies the terminating NUL byte of name as well. */
		memcpy(fresh + prefixlen, name, namelen + 1);
		*pp = fresh;
		return true;
	}

	curlen = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	grown = realloc(*pp, curlen + namelen + 2);
	if (grown == NULL)
		return false;

	grown[curlen] = ' ';
	memcpy(grown + curlen + 1, name, namelen + 1);
	*pp = grown;
	return true;
}
966
967static int setup_tty(struct lxc_conf *conf)
968{
393903d1
SH
969 const struct lxc_tty_info *tty_info = &conf->tty_info;
970 char *ttydir = conf->ttydir;
7c6ef2a2
SH
971 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
972 int i, ret;
b0a33c1e 973
e8bd4e43 974 if (!conf->rootfs.path)
bc9bd0e3
DL
975 return 0;
976
b0a33c1e 977 for (i = 0; i < tty_info->nbtty; i++) {
978
979 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
980
e8bd4e43 981 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
7c6ef2a2
SH
982 if (ret >= sizeof(path)) {
983 ERROR("pathname too long for ttys");
984 return -1;
985 }
986 if (ttydir) {
987 /* create dev/lxc/tty%d" */
e8bd4e43 988 ret = snprintf(lxcpath, sizeof(lxcpath), "/dev/%s/tty%d", ttydir, i + 1);
7c6ef2a2
SH
989 if (ret >= sizeof(lxcpath)) {
990 ERROR("pathname too long for ttys");
991 return -1;
992 }
993 ret = creat(lxcpath, 0660);
994 if (ret==-1 && errno != EEXIST) {
959aee9c 995 SYSERROR("error creating %s", lxcpath);
7c6ef2a2
SH
996 return -1;
997 }
4d44e274
SH
998 if (ret >= 0)
999 close(ret);
7c6ef2a2
SH
1000 ret = unlink(path);
1001 if (ret && errno != ENOENT) {
959aee9c 1002 SYSERROR("error unlinking %s", path);
7c6ef2a2
SH
1003 return -1;
1004 }
b0a33c1e 1005
7c6ef2a2
SH
1006 if (mount(pty_info->name, lxcpath, "none", MS_BIND, 0)) {
1007 WARN("failed to mount '%s'->'%s'",
1008 pty_info->name, path);
1009 continue;
1010 }
13954cce 1011
9ba8130c
SH
1012 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d", ttydir, i+1);
1013 if (ret >= sizeof(lxcpath)) {
1014 ERROR("tty pathname too long");
1015 return -1;
1016 }
7c6ef2a2
SH
1017 ret = symlink(lxcpath, path);
1018 if (ret) {
959aee9c 1019 SYSERROR("failed to create symlink for tty %d", i+1);
7c6ef2a2
SH
1020 return -1;
1021 }
1022 } else {
c6883f38
SH
1023 /* If we populated /dev, then we need to create /dev/ttyN */
1024 if (access(path, F_OK)) {
1025 ret = creat(path, 0660);
1026 if (ret==-1) {
959aee9c 1027 SYSERROR("error creating %s", path);
c6883f38 1028 /* this isn't fatal, continue */
025ed0f3 1029 } else {
c6883f38 1030 close(ret);
025ed0f3 1031 }
c6883f38 1032 }
7c6ef2a2 1033 if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
e8bd4e43 1034 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
1035 continue;
1036 }
393903d1 1037 }
e8bd4e43 1038 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
1039 ERROR("Error setting up container_ttys string");
1040 return -1;
b0a33c1e 1041 }
1042 }
1043
cd54d859
DL
1044 INFO("%d tty(s) has been setup", tty_info->nbtty);
1045
b0a33c1e 1046 return 0;
1047}
1048
bf601689 1049
59bb8698 1050static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 1051{
2d489f9e 1052 int oldroot = -1, newroot = -1;
bf601689 1053
2d489f9e
SH
1054 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1055 if (oldroot < 0) {
1056 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
1057 return -1;
1058 }
2d489f9e
SH
1059 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1060 if (newroot < 0) {
1061 SYSERROR("Error opening new-/ for fchdir");
1062 goto fail;
c08556c6 1063 }
bf601689 1064
cc6f6dd7 1065 /* change into new root fs */
2d489f9e 1066 if (fchdir(newroot)) {
cc6f6dd7 1067 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 1068 goto fail;
cc6f6dd7
DL
1069 }
1070
cc6f6dd7 1071 /* pivot_root into our new root fs */
2d489f9e 1072 if (pivot_root(".", ".")) {
cc6f6dd7 1073 SYSERROR("pivot_root syscall failed");
2d489f9e 1074 goto fail;
bf601689 1075 }
cc6f6dd7 1076
2d489f9e
SH
1077 /*
1078 * at this point the old-root is mounted on top of our new-root
1079 * To unmounted it we must not be chdir'd into it, so escape back
1080 * to old-root
1081 */
1082 if (fchdir(oldroot) < 0) {
1083 SYSERROR("Error entering oldroot");
1084 goto fail;
1085 }
7981ea46 1086 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1087 SYSERROR("Error detaching old root");
1088 goto fail;
cc6f6dd7
DL
1089 }
1090
2d489f9e
SH
1091 if (fchdir(newroot) < 0) {
1092 SYSERROR("Error re-entering newroot");
1093 goto fail;
1094 }
cc6f6dd7 1095
2d489f9e
SH
1096 close(oldroot);
1097 close(newroot);
bf601689 1098
2d489f9e 1099 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1100
bf601689 1101 return 0;
2d489f9e
SH
1102
1103fail:
1104 if (oldroot != -1)
1105 close(oldroot);
1106 if (newroot != -1)
1107 close(newroot);
1108 return -1;
bf601689
MH
1109}
1110
bc6928ff 1111/*
87da4ec3
SH
1112 * Just create a path for /dev under $lxcpath/$name and in rootfs
1113 * If we hit an error, log it but don't fail yet.
91c3830e 1114 */
14221cbb 1115static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1116{
1117 int ret;
87da4ec3
SH
1118 size_t clen;
1119 char *path;
91c3830e 1120
14221cbb 1121 INFO("Mounting container /dev");
bc6928ff 1122
14221cbb 1123 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1124 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1125 path = alloca(clen);
bc6928ff 1126
ec50007f 1127 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1128 if (ret < 0 || ret >= clen)
91c3830e 1129 return -1;
bc6928ff 1130
87da4ec3 1131 if (!dir_exists(path)) {
14221cbb 1132 WARN("No /dev in container.");
87da4ec3
SH
1133 WARN("Proceeding without autodev setup");
1134 return 0;
bc6928ff 1135 }
87da4ec3 1136
1ec0e8e3 1137 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1138 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1139 if (ret != 0) {
87da4ec3 1140 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1141 return -1;
91c3830e 1142 }
87da4ec3
SH
1143
1144 INFO("Mounted tmpfs onto %s", path);
1145
ec50007f 1146 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1147 if (ret < 0 || ret >= clen)
91c3830e 1148 return -1;
87da4ec3 1149
bc6928ff
MW
1150 /*
1151 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1152 * If not, then create it and exit if that fails...
1153 */
87da4ec3 1154 if (!dir_exists(path)) {
bc6928ff
MW
1155 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1156 if (ret) {
1157 SYSERROR("Failed to create /dev/pts in container");
1158 return -1;
1159 }
91c3830e
SH
1160 }
1161
14221cbb 1162 INFO("Mounted container /dev");
91c3830e
SH
1163 return 0;
1164}
1165
/* Description of one device node below the container's /dev. */
struct lxc_devs {
	const char *name;	/* file name below /dev */
	mode_t mode;		/* file type and permission bits */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* Character devices provided in an automatically populated /dev. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1181
27245ff7 1182static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1183{
1184 int ret;
c6883f38
SH
1185 char path[MAXPATHLEN];
1186 int i;
3a32201c 1187 mode_t cmask;
c6883f38 1188
ec50007f 1189 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1190 if (ret < 0 || ret >= MAXPATHLEN) {
1191 ERROR("Error calculating container /dev location");
c6883f38 1192 return -1;
f7bee6c6 1193 }
91c3830e 1194
0bbf8572
CB
1195 /* ignore, just don't try to fill in */
1196 if (!dir_exists(path))
9cb4d183
SH
1197 return 0;
1198
0bbf8572 1199 INFO("populating container /dev");
3a32201c 1200 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1201 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1202 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1203
ec50007f 1204 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1205 if (ret < 0 || ret >= MAXPATHLEN)
1206 return -1;
0bbf8572 1207
c6883f38 1208 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1209 if (ret < 0) {
9cb4d183
SH
1210 char hostpath[MAXPATHLEN];
1211 FILE *pathfile;
1212
0bbf8572
CB
1213 if (errno == EEXIST) {
1214 DEBUG("\"%s\" device already existed", path);
1215 continue;
1216 }
1217
1218 /* Unprivileged containers cannot create devices, so
1219 * bind mount the device from the host.
1220 */
9cb4d183
SH
1221 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1222 if (ret < 0 || ret >= MAXPATHLEN)
1223 return -1;
1224 pathfile = fopen(path, "wb");
1225 if (!pathfile) {
1226 SYSERROR("Failed to create device mount target '%s'", path);
1227 return -1;
1228 }
1229 fclose(pathfile);
0bbf8572
CB
1230 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1231 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1232 return -1;
1233 }
0bbf8572
CB
1234 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1235 } else {
1236 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1237 }
1238 }
3a32201c 1239 umask(cmask);
c6883f38 1240
0bbf8572 1241 INFO("populated container /dev");
c6883f38
SH
1242 return 0;
1243}
1244
cc28d0b0 1245static int setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1246{
91c3e281
CB
1247 struct bdev *bdev;
1248 const struct lxc_rootfs *rootfs;
cc28d0b0 1249
91c3e281 1250 rootfs = &conf->rootfs;
a0f379bf 1251 if (!rootfs->path) {
91c3e281
CB
1252 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1253 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1254 return -1;
1255 }
c69bd12f 1256 return 0;
a0f379bf 1257 }
0ad19a3f 1258
12297168 1259 if (access(rootfs->mount, F_OK)) {
91c3e281 1260 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1261 rootfs->mount);
b1789442
DL
1262 return -1;
1263 }
1264
91c3e281
CB
1265 /* First try mounting rootfs using a bdev. */
1266 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1267 if (bdev && !bdev->ops->mount(bdev)) {
59d66af2 1268 bdev_put(bdev);
91c3e281
CB
1269 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1270 rootfs->path, rootfs->mount,
1271 rootfs->options ? rootfs->options : "(null)");
9be53773
SH
1272 return 0;
1273 }
59d66af2
SH
1274 if (bdev)
1275 bdev_put(bdev);
a17b1e65 1276 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
91c3e281
CB
1277 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1278 rootfs->path, rootfs->mount,
1279 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1280 return -1;
1281 }
0ad19a3f 1282
91c3e281
CB
1283 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1284 rootfs->path, rootfs->mount,
1285 rootfs->options ? rootfs->options : "(null)");
ac778708
DL
1286 return 0;
1287}
1288
91e93c71
AV
1289int prepare_ramfs_root(char *root)
1290{
eab15c1e 1291 char buf[LXC_LINELEN], *p;
91e93c71
AV
1292 char nroot[PATH_MAX];
1293 FILE *f;
1294 int i;
1295 char *p2;
1296
1297 if (realpath(root, nroot) == NULL)
39c7b795 1298 return -errno;
91e93c71
AV
1299
1300 if (chdir("/") == -1)
39c7b795 1301 return -errno;
91e93c71
AV
1302
1303 /*
1304 * We could use here MS_MOVE, but in userns this mount is
1305 * locked and can't be moved.
1306 */
39c7b795 1307 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1308 SYSERROR("Failed to move %s into /", root);
39c7b795 1309 return -errno;
91e93c71
AV
1310 }
1311
39c7b795 1312 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1313 SYSERROR("Failed to make . rprivate");
39c7b795 1314 return -errno;
91e93c71
AV
1315 }
1316
1317 /*
1318 * The following code cleans up inhereted mounts which are not
1319 * required for CT.
1320 *
1321 * The mountinfo file shows not all mounts, if a few points have been
1322 * unmounted between read operations from the mountinfo. So we need to
1323 * read mountinfo a few times.
1324 *
1325 * This loop can be skipped if a container uses unserns, because all
1326 * inherited mounts are locked and we should live with all this trash.
1327 */
1328 while (1) {
1329 int progress = 0;
1330
1331 f = fopen("./proc/self/mountinfo", "r");
1332 if (!f) {
1333 SYSERROR("Unable to open /proc/self/mountinfo");
1334 return -1;
1335 }
eab15c1e 1336 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1337 for (p = buf, i=0; p && i < 4; i++)
1338 p = strchr(p+1, ' ');
1339 if (!p)
1340 continue;
1341 p2 = strchr(p+1, ' ');
1342 if (!p2)
1343 continue;
1344
1345 *p2 = '\0';
1346 *p = '.';
1347
1348 if (strcmp(p + 1, "/") == 0)
1349 continue;
1350 if (strcmp(p + 1, "/proc") == 0)
1351 continue;
1352
1353 if (umount2(p, MNT_DETACH) == 0)
1354 progress++;
1355 }
1356 fclose(f);
1357 if (!progress)
1358 break;
1359 }
1360
8bea9fae
PR
1361 /* This also can be skipped if a container uses unserns */
1362 umount2("./proc", MNT_DETACH);
91e93c71
AV
1363
1364 /* It is weird, but chdir("..") moves us in a new root */
1365 if (chdir("..") == -1) {
1366 SYSERROR("Unable to change working directory");
1367 return -1;
1368 }
1369
1370 if (chroot(".") == -1) {
1371 SYSERROR("Unable to chroot");
1372 return -1;
1373 }
1374
1375 return 0;
1376}
1377
74a3920a 1378static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1379{
39c7b795
CB
1380 if (!rootfs->path) {
1381 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1382 return 0;
39c7b795 1383 }
ac778708 1384
91e93c71 1385 if (detect_ramfs_rootfs()) {
39c7b795
CB
1386 DEBUG("detected that container is on ramfs");
1387 if (prepare_ramfs_root(rootfs->mount)) {
1388 ERROR("failed to prepare minimal ramfs root");
91e93c71 1389 return -1;
39c7b795
CB
1390 }
1391
1392 DEBUG("prepared ramfs root for container");
1393 return 0;
1394 }
1395
1396 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1397 ERROR("failed to pivot root");
25368b52 1398 return -1;
c69bd12f
DL
1399 }
1400
39c7b795 1401 DEBUG("finished pivot root");
25368b52 1402 return 0;
0ad19a3f 1403}
1404
/*
 * Give the container its own devpts instance on /dev/pts and make /dev/ptmx
 * refer to that instance's ptmx.
 *
 * Nothing is done when @num_pts is 0. Otherwise an existing devpts on
 * /dev/pts is unmounted, a new instance is mounted, and /dev/ptmx is
 * recreated: preferably as a bind mount of /dev/pts/ptmx, and as a symlink
 * to it when the bind mount fails.
 *
 * Returns 0 on success, -1 on error.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (!num_pts) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Unmount old devpts instance. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create mountpoint for devpts instance. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount new devpts instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}

	/* Remove any pre-existing /dev/ptmx file. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: continue below and try a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* Remove the dummy /dev/ptmx file we created above. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1488
cccc74b5
DL
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset) the function does
 * nothing.
 *
 * Returns 0 on success or when nothing was done, -1 if personality() failed.
 */
static int setup_personality(int persona)
{
	#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
	#endif

	return 0;
}
1505
3d7d929a
CB
1506static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1507 const struct lxc_console *console)
6e590161 1508{
63376d7d 1509 char path[MAXPATHLEN];
0728ebf4 1510 int ret, fd;
52e35957 1511
8b1b1210
CB
1512 if (console->path && !strcmp(console->path, "none"))
1513 return 0;
1514
7c6ef2a2 1515 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1516 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1517 return -1;
52e35957 1518
8b1b1210
CB
1519 /* When we are asked to setup a console we remove any previous
1520 * /dev/console bind-mounts.
1521 */
a7ba3c7f
CB
1522 if (file_exists(path)) {
1523 ret = lxc_unstack_mountpoint(path, false);
1524 if (ret < 0) {
8b1b1210 1525 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1526 return -ret;
1527 } else {
1528 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1529 }
1530 ret = unlink(path);
1531 if (ret < 0) {
1532 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1533 return -errno;
1534 }
8b1b1210
CB
1535 }
1536
1537 /* For unprivileged containers autodev or automounts will already have
1538 * taken care of creating /dev/console.
1539 */
0728ebf4
TA
1540 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1541 if (fd < 0) {
1542 if (errno != EEXIST) {
1543 SYSERROR("failed to create console");
3d7d929a 1544 return -errno;
0728ebf4
TA
1545 }
1546 } else {
1547 close(fd);
52e35957
DL
1548 }
1549
0728ebf4 1550 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1551 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1552 return -errno;
63376d7d 1553 }
13954cce 1554
3d7d929a 1555 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1556 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1557 return -1;
1558 }
1559
3d7d929a 1560 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1561 return 0;
1562}
1563
3d7d929a
CB
1564static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1565 const struct lxc_console *console,
1566 char *ttydir)
7c6ef2a2 1567{
7c6ef2a2 1568 int ret;
3d7d929a 1569 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1570
1571 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1572 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1573 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1574 return -1;
3d7d929a 1575
7c6ef2a2
SH
1576 ret = mkdir(path, 0755);
1577 if (ret && errno != EEXIST) {
959aee9c 1578 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1579 return -errno;
7c6ef2a2 1580 }
3d7d929a 1581 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1582
3d7d929a
CB
1583 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1584 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1585 return -1;
1586
7c6ef2a2 1587 ret = creat(lxcpath, 0660);
3d7d929a 1588 if (ret == -1 && errno != EEXIST) {
959aee9c 1589 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1590 return -errno;
7c6ef2a2 1591 }
4d44e274
SH
1592 if (ret >= 0)
1593 close(ret);
7c6ef2a2 1594
2a12fefd
CB
1595 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1596 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1597 return -1;
2a12fefd
CB
1598
1599 /* When we are asked to setup a console we remove any previous
1600 * /dev/console bind-mounts.
1601 */
1602 if (console->path && !strcmp(console->path, "none")) {
1603 struct stat st;
1604 ret = stat(path, &st);
1605 if (ret < 0) {
1606 if (errno == ENOENT)
1607 return 0;
1608 SYSERROR("failed stat() \"%s\"", path);
1609 return -errno;
1610 }
1611
1612 /* /dev/console must be character device with major number 5 and
1613 * minor number 1. If not, give benefit of the doubt and assume
1614 * the user has mounted something else right there on purpose.
1615 */
1616 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1617 return 0;
1618
1619 /* In case the user requested a bind-mount for /dev/console and
1620 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1621 * /dev/<ttydir/console.
1622 * Note, we only move the uppermost mount and clear all other
1623 * mounts underneath for safety.
1624 * If it is a character device created via mknod() we simply
1625 * rename it.
2a12fefd
CB
1626 */
1627 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1628 if (ret < 0) {
1629 if (errno != EINVAL) {
1630 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1631 return -errno;
1632 }
1633 /* path was not a mountpoint */
1634 ret = rename(path, lxcpath);
1635 if (ret < 0) {
1636 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1637 return -errno;
1638 }
1639 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1640 } else {
1641 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1642 }
a7ba3c7f
CB
1643
1644 /* Clear all remaining bind-mounts. */
1645 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1646 if (ret < 0) {
a7ba3c7f
CB
1647 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1648 return -ret;
1649 } else {
1650 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1651 }
1652 } else {
1653 if (file_exists(path)) {
1654 ret = lxc_unstack_mountpoint(path, false);
1655 if (ret < 0) {
2a12fefd 1656 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1657 return -ret;
1658 } else {
1659 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1660 }
2a12fefd
CB
1661 }
1662
1663 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1664 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1665 return -1;
1666 }
1667 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1668 }
1669
2a12fefd 1670 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1671 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1672 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1673 return -1;
3d7d929a 1674
2a12fefd
CB
1675 ret = unlink(path);
1676 if (ret && errno != ENOENT) {
1677 SYSERROR("error unlinking %s", path);
1678 return -errno;
1679 }
1680
7c6ef2a2 1681 ret = symlink(lxcpath, path);
3d7d929a
CB
1682 if (ret < 0) {
1683 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1684 return -1;
1685 }
1686
3d7d929a 1687 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1688 return 0;
1689}
1690
3d7d929a
CB
1691static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1692 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1693{
3d7d929a
CB
1694 /* We don't have a rootfs, /dev/console will be shared. */
1695 if (!rootfs->path) {
1696 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1697 return 0;
3d7d929a
CB
1698 }
1699
7c6ef2a2 1700 if (!ttydir)
3d7d929a 1701 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1702
3d7d929a 1703 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1704}
1705
1bd051a6
SH
1706static int setup_kmsg(const struct lxc_rootfs *rootfs,
1707 const struct lxc_console *console)
1708{
1709 char kpath[MAXPATHLEN];
1710 int ret;
1711
222fea5a
DE
1712 if (!rootfs->path)
1713 return 0;
1bd051a6
SH
1714 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1715 if (ret < 0 || ret >= sizeof(kpath))
1716 return -1;
1717
1718 ret = unlink(kpath);
1719 if (ret && errno != ENOENT) {
959aee9c 1720 SYSERROR("error unlinking %s", kpath);
1bd051a6
SH
1721 return -1;
1722 }
1723
1724 ret = symlink("console", kpath);
1725 if (ret) {
1726 SYSERROR("failed to create symlink for kmsg");
1727 return -1;
1728 }
1729
1730 return 0;
1731}
1732
998ac676
RT
1733static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1734{
1735 struct mount_opt *mo;
1736
1737 /* If opt is found in mount_opt, set or clear flags.
1738 * Otherwise append it to data. */
1739
1740 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1741 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1742 if (mo->clear)
1743 *flags &= ~mo->flag;
1744 else
1745 *flags |= mo->flag;
1746 return;
1747 }
1748 }
1749
1750 if (strlen(*data))
1751 strcat(*data, ",");
1752 strcat(*data, opt);
1753}
1754
/*
 * Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags and *@mntdata are always reset first (0 and NULL). Options that
 * parse_mntopt() recognises end up in *@mntflags; the rest is collected in a
 * newly allocated string stored in *@mntdata, which the caller has to
 * free(). *@mntdata stays NULL when no such options exist or when @mntopts
 * is NULL.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *collected;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() writes into its input, so work on a copy */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* the data string can never be longer than the option string */
	collected = malloc(strlen(copy) + 1);
	if (!collected) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	*collected = 0;

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &collected);
		tok = strtok_r(NULL, ",", &state);
	}

	if (*collected)
		*mntdata = collected;
	else
		free(collected);
	free(copy);

	return 0;
}
1793
6fd5e769
SH
/*
 * Cut @word off at its first blank or tab, so that only the leading word
 * remains. A string without blanks or tabs is left as it is.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1800
/*
 * Skip @nfields blank/tab separated fields in @src.
 *
 * Each step moves past the characters up to the next blank or tab and then
 * past that single separator. Returns a pointer to what follows, or to the
 * terminating NUL when the string ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		skipped++;
	}

	return cur;
}
1818
911324ef
DL
/*
 * Mount @fsname on @target and, for bind mounts and remount requests,
 * follow up with a remount so that the requested flags take effect.
 *
 * The first mount is done through safe_mount() with MS_REMOUNT masked out.
 * When statvfs() is available, flags found on @fsname (nosuid, nodev unless
 * @dev is set, read-only, noexec) are added to the remount; a plain bind
 * mount that needs none of them and did not ask for read-only skips the
 * remount.
 *
 * With @optional set, a failed mount is logged and 0 is returned.
 *
 * Returns 0 on success, -1 on error.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif
	unsigned long rqd_flags;

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	if (mountflags & (MS_REMOUNT | MS_BIND)) {
		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname ? fsname : "(none)", target ? target : "(none)");

		/* read-only is the one flag taken straight from the request */
		rqd_flags = mountflags & MS_RDONLY;

#ifdef HAVE_STATVFS
		if (statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
			if (!dev)
				required_flags |= sb.f_flag & MS_NODEV;

			DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

			/*
			 * A bind mount request whose flags already cover
			 * everything required, and which did not ask for
			 * read-only, does not need the remount.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) && rqd_flags == 0) {
				DEBUG("mountflags already was %lu, skipping remount",
				      mountflags);
				goto skipremount;
			}

			mountflags |= required_flags;
		}
#endif

		if (mount(fsname, target, fstype,
			  mountflags | MS_REMOUNT, data) < 0) {
			if (!optional) {
				SYSERROR("failed to mount '%s' on '%s'",
					 fsname, target);
				return -1;
			}

			INFO("failed to mount '%s' on '%s' (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}
	}

#ifdef HAVE_STATVFS
skipremount:
#endif
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1895
4e4ca161
SH
/*
 * Remove 'create=dir', 'create=file' and 'optional' from mntent->mnt_opts.
 *
 * The option string is edited in place. The first occurrence of each of the
 * three options is cut out together with the comma that follows it. When the
 * option is the last one in the list, the comma in front of it is dropped as
 * well, so that no dangling separator is left behind ("ro,optional" becomes
 * "ro", not "ro,").
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t i;

	for (i = 0; i < sizeof(culled) / sizeof(culled[0]); i++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, culled[i]);
		if (!start)
			continue;

		comma = strchr(start, ',');
		if (comma) {
			/* pull the remaining options over the culled one */
			memmove(start, comma + 1, strlen(comma + 1) + 1);
			continue;
		}

		/* last option: chop it off along with its leading comma */
		if (start > mntent->mnt_opts && start[-1] == ',')
			start--;
		*start = '\0';
	}
}
1920
/*
 * Create the mount target @path as requested by the options of @mntent.
 *
 * For "overlay*" and "aufs*" mount types the type specific mkdir helper is
 * run first. "create=dir" creates @path as a directory (with parents).
 * "create=file" creates the parent directories and an empty file at @path,
 * unless @path already exists.
 *
 * Returns 0 on success and -1 when a target could not be created.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		FILE *pathfile;
		char *pathcopy;

		/* dirname() may modify its argument, so hand it a copy */
		pathcopy = strdup(path);
		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}

		/*
		 * dirname() may return a pointer to static storage instead of
		 * the buffer it was given, so its result must never be
		 * free()d; only the strdup() copy is.
		 */
		if (mkdir_p(dirname(pathcopy), 0755) < 0) {
			WARN("Failed to create target directory");
		}
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1961
ec50007f
CB
1962/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1963 * without a rootfs. */
db4aba38 1964static inline int mount_entry_on_generic(struct mntent *mntent,
0a2dddd4
CB
1965 const char* path, const struct lxc_rootfs *rootfs,
1966 const char *lxc_name, const char *lxc_path)
4d5b72a1
NC
1967{
1968 unsigned long mntflags;
1969 char *mntdata;
1970 int ret;
1971 bool optional = hasmntopt(mntent, "optional") != NULL;
ae7a770e 1972 bool dev = hasmntopt(mntent, "dev") != NULL;
4d5b72a1 1973
ec50007f
CB
1974 char *rootfs_path = NULL;
1975 if (rootfs && rootfs->path)
1976 rootfs_path = rootfs->mount;
1977
0a2dddd4 1978 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
34cfffb3 1979
608e3567
SH
1980 if (ret < 0)
1981 return optional ? 0 : -1;
1982
4e4ca161
SH
1983 cull_mntent_opt(mntent);
1984
a17b1e65
SG
1985 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1986 free(mntdata);
1987 return -1;
1988 }
1989
6e46cc0d 1990 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1991 mntdata, optional, dev, rootfs_path);
68c152ef 1992
911324ef 1993 free(mntdata);
911324ef
DL
1994 return ret;
1995}
1996
db4aba38
NC
/*
 * Mount @mntent for a container that has no rootfs.
 *
 * For such containers every mount target is treated as an absolute path on
 * the host, so a leading '/' is added to mnt_dir when it lacks one.
 *
 * Returns the result of mount_entry_on_generic(), or -1 when the path does
 * not fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char target[MAXPATHLEN];
	int len;

	if (mntent->mnt_dir[0] == '/')
		len = snprintf(target, sizeof(target), "%s", mntent->mnt_dir);
	else
		len = snprintf(target, sizeof(target), "/%s", mntent->mnt_dir);

	if (len < 0 || (size_t)len >= sizeof(target)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
2016
4e4ca161 2017static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 2018 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
2019 const char *lxc_name,
2020 const char *lxc_path)
911324ef 2021{
013bd428 2022 char *aux;
59760f5d 2023 char path[MAXPATHLEN];
80a881b2 2024 int r, ret = 0, offset;
67e571de 2025 const char *lxcpath;
0ad19a3f 2026
593e8478 2027 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
2028 if (!lxcpath) {
2029 ERROR("Out of memory");
2030 return -1;
2031 }
2032
80a881b2 2033 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
2034 * use $lxcpath/CN/rootfs as the target prefix */
2035 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
2036 if (r < 0 || r >= MAXPATHLEN)
2037 goto skipvarlib;
2038
2039 aux = strstr(mntent->mnt_dir, path);
2040 if (aux) {
2041 offset = strlen(path);
2042 goto skipabs;
2043 }
2044
2045skipvarlib:
013bd428
DL
2046 aux = strstr(mntent->mnt_dir, rootfs->path);
2047 if (!aux) {
2048 WARN("ignoring mount point '%s'", mntent->mnt_dir);
db4aba38 2049 return ret;
013bd428 2050 }
80a881b2
SH
2051 offset = strlen(rootfs->path);
2052
2053skipabs:
013bd428 2054
9ba8130c 2055 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
2056 aux + offset);
2057 if (r < 0 || r >= MAXPATHLEN) {
2058 WARN("pathnme too long for '%s'", mntent->mnt_dir);
a17b1e65
SG
2059 return -1;
2060 }
2061
0a2dddd4 2062 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 2063}
d330fe7b 2064
4e4ca161 2065static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
2066 const struct lxc_rootfs *rootfs,
2067 const char *lxc_name,
2068 const char *lxc_path)
911324ef
DL
2069{
2070 char path[MAXPATHLEN];
911324ef 2071 int ret;
d330fe7b 2072
34cfffb3 2073 /* relative to root mount point */
6e46cc0d 2074 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2075 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2076 ERROR("path name too long");
2077 return -1;
2078 }
911324ef 2079
0a2dddd4 2080 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2081}
2082
80a881b2 2083static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
0a2dddd4 2084 const char *lxc_name, const char *lxc_path)
911324ef 2085{
aaf901be
AM
2086 struct mntent mntent;
2087 char buf[4096];
911324ef 2088 int ret = -1;
e76b8764 2089
aaf901be 2090 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 2091
911324ef 2092 if (!rootfs->path) {
aaf901be 2093 if (mount_entry_on_systemfs(&mntent))
e76b8764 2094 goto out;
911324ef 2095 continue;
e76b8764
CDC
2096 }
2097
911324ef 2098 /* We have a separate root, mounts are relative to it */
aaf901be 2099 if (mntent.mnt_dir[0] != '/') {
0a2dddd4 2100 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef
DL
2101 goto out;
2102 continue;
2103 }
cd54d859 2104
0a2dddd4 2105 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef 2106 goto out;
0ad19a3f 2107 }
cd54d859 2108
0ad19a3f 2109 ret = 0;
cd54d859
DL
2110
2111 INFO("mount points have been setup");
0ad19a3f 2112out:
e7938e9e
MN
2113 return ret;
2114}
2115
/* Mount everything listed in the fstab-format file `fstab`.
 *
 * A NULL fstab is not an error and yields 0. Returns -1 if the file cannot
 * be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *table;
	int rc;

	if (!fstab)
		return 0;

	table = setmntent(fstab, "r");
	if (!table) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, table, lxc_name, lxc_path);
	endmntent(table);

	return rc;
}
2136
5ef5c9a3 2137FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2138{
5ef5c9a3 2139 int ret;
e7938e9e 2140 char *mount_entry;
5ef5c9a3
CB
2141 struct lxc_list *iterator;
2142 FILE *file;
2143 int fd = -1;
2144
2145 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2146 if (fd < 0) {
2147 if (errno != ENOSYS)
2148 return NULL;
2149 file = tmpfile();
2150 } else {
2151 file = fdopen(fd, "r+");
2152 }
e7938e9e 2153
e7938e9e 2154 if (!file) {
fad6ef95 2155 int saved_errno = errno;
5ef5c9a3
CB
2156 if (fd != -1)
2157 close(fd);
fad6ef95 2158 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
9fc7f8c0 2159 return NULL;
e7938e9e
MN
2160 }
2161
2162 lxc_list_for_each(iterator, mount) {
2163 mount_entry = iterator->elem;
5ef5c9a3
CB
2164 ret = fprintf(file, "%s\n", mount_entry);
2165 if (ret < strlen(mount_entry))
2166 WARN("Could not write mount entry to anonymous mount file.");
2167 }
2168
2169 if (fseek(file, 0, SEEK_SET) < 0) {
2170 fclose(file);
2171 return NULL;
e7938e9e
MN
2172 }
2173
9fc7f8c0
TA
2174 return file;
2175}
2176
5ef5c9a3
CB
/* Mount the entries held in the `mount` list by first serialising them into
 * an anonymous mount file and then processing that file.
 *
 * Returns -1 if the anonymous file cannot be created, otherwise the result
 * of mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *table = make_anonymous_mount_file(mount);

	if (!table)
		return -1;

	rc = mount_file_entries(rootfs, table, lxc_name, lxc_path);
	fclose(table);

	return rc;
}
2193
bab88e68
CS
2194static int parse_cap(const char *cap)
2195{
2196 char *ptr = NULL;
84760c11 2197 size_t i;
2198 int capid = -1;
bab88e68 2199
7035407c
DE
2200 if (!strcmp(cap, "none"))
2201 return -2;
2202
bab88e68
CS
2203 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2204
2205 if (strcmp(cap, caps_opt[i].name))
2206 continue;
2207
2208 capid = caps_opt[i].value;
2209 break;
2210 }
2211
2212 if (capid < 0) {
2213 /* try to see if it's numeric, so the user may specify
2214 * capabilities that the running kernel knows about but
2215 * we don't */
2216 errno = 0;
2217 capid = strtol(cap, &ptr, 10);
2218 if (!ptr || *ptr != '\0' || errno != 0)
2219 /* not a valid number */
2220 capid = -1;
2221 else if (capid > lxc_caps_last_cap())
2222 /* we have a number but it's not a valid
2223 * capability */
2224 capid = -1;
2225 }
2226
2227 return capid;
2228}
2229
0769b82a
CS
2230int in_caplist(int cap, struct lxc_list *caps)
2231{
2232 struct lxc_list *iterator;
2233 int capid;
2234
2235 lxc_list_for_each(iterator, caps) {
2236 capid = parse_cap(iterator->elem);
2237 if (capid == cap)
2238 return 1;
2239 }
2240
2241 return 0;
2242}
2243
81810dd1
DL
2244static int setup_caps(struct lxc_list *caps)
2245{
2246 struct lxc_list *iterator;
2247 char *drop_entry;
bab88e68 2248 int capid;
81810dd1
DL
2249
2250 lxc_list_for_each(iterator, caps) {
2251
2252 drop_entry = iterator->elem;
2253
bab88e68 2254 capid = parse_cap(drop_entry);
d55bc1ad 2255
81810dd1 2256 if (capid < 0) {
1e11be34
DL
2257 ERROR("unknown capability %s", drop_entry);
2258 return -1;
81810dd1
DL
2259 }
2260
2261 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2262
2263 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2264 SYSERROR("failed to remove %s capability", drop_entry);
2265 return -1;
2266 }
81810dd1
DL
2267
2268 }
2269
1fb86a7c
SH
2270 DEBUG("capabilities have been setup");
2271
2272 return 0;
2273}
2274
2275static int dropcaps_except(struct lxc_list *caps)
2276{
2277 struct lxc_list *iterator;
2278 char *keep_entry;
1fb86a7c
SH
2279 int i, capid;
2280 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2281 INFO("found %d capabilities", numcaps);
1fb86a7c 2282
2caf9a97
SH
2283 if (numcaps <= 0 || numcaps > 200)
2284 return -1;
2285
1fb86a7c
SH
2286 // caplist[i] is 1 if we keep capability i
2287 int *caplist = alloca(numcaps * sizeof(int));
2288 memset(caplist, 0, numcaps * sizeof(int));
2289
2290 lxc_list_for_each(iterator, caps) {
2291
2292 keep_entry = iterator->elem;
2293
bab88e68 2294 capid = parse_cap(keep_entry);
1fb86a7c 2295
7035407c
DE
2296 if (capid == -2)
2297 continue;
2298
1fb86a7c
SH
2299 if (capid < 0) {
2300 ERROR("unknown capability %s", keep_entry);
2301 return -1;
2302 }
2303
8255688a 2304 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2305
2306 caplist[capid] = 1;
2307 }
2308 for (i=0; i<numcaps; i++) {
2309 if (caplist[i])
2310 continue;
2311 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2312 SYSERROR("failed to remove capability %d", i);
2313 return -1;
2314 }
1fb86a7c
SH
2315 }
2316
2317 DEBUG("capabilities have been setup");
81810dd1
DL
2318
2319 return 0;
2320}
2321
0ad19a3f 2322static int setup_hw_addr(char *hwaddr, const char *ifname)
2323{
2324 struct sockaddr sockaddr;
2325 struct ifreq ifr;
fad6ef95 2326 int ret, fd, saved_errno;
0ad19a3f 2327
3cfc0f3a
MN
2328 ret = lxc_convert_mac(hwaddr, &sockaddr);
2329 if (ret) {
2330 ERROR("mac address '%s' conversion failed : %s",
2331 hwaddr, strerror(-ret));
0ad19a3f 2332 return -1;
2333 }
2334
2335 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2336 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2337 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2338
2339 fd = socket(AF_INET, SOCK_DGRAM, 0);
2340 if (fd < 0) {
3ab87b66 2341 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2342 return -1;
2343 }
2344
2345 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2346 saved_errno = errno;
0ad19a3f 2347 close(fd);
2348 if (ret)
fad6ef95 2349 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2350
5da6aa8c 2351 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2352
0ad19a3f 2353 return ret;
2354}
2355
82d5ae15 2356static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2357{
82d5ae15
DL
2358 struct lxc_list *iterator;
2359 struct lxc_inetdev *inetdev;
3cfc0f3a 2360 int err;
0ad19a3f 2361
82d5ae15
DL
2362 lxc_list_for_each(iterator, ip) {
2363
2364 inetdev = iterator->elem;
2365
0093bb8c
DL
2366 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2367 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2368 if (err) {
2369 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2370 ifindex, strerror(-err));
82d5ae15
DL
2371 return -1;
2372 }
2373 }
2374
2375 return 0;
0ad19a3f 2376}
2377
82d5ae15 2378static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2379{
82d5ae15 2380 struct lxc_list *iterator;
7fa9074f 2381 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2382 int err;
0ad19a3f 2383
82d5ae15
DL
2384 lxc_list_for_each(iterator, ip) {
2385
2386 inet6dev = iterator->elem;
2387
b3df193c 2388 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2389 &inet6dev->mcast, &inet6dev->acast,
2390 inet6dev->prefix);
3cfc0f3a
MN
2391 if (err) {
2392 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2393 ifindex, strerror(-err));
82d5ae15 2394 return -1;
3cfc0f3a 2395 }
82d5ae15
DL
2396 }
2397
2398 return 0;
0ad19a3f 2399}
2400
82d5ae15 2401static int setup_netdev(struct lxc_netdev *netdev)
0ad19a3f 2402{
0ad19a3f 2403 char ifname[IFNAMSIZ];
0ad19a3f 2404 char *current_ifname = ifname;
3cfc0f3a 2405 int err;
0ad19a3f 2406
82d5ae15
DL
2407 /* empty network namespace */
2408 if (!netdev->ifindex) {
b0efbac4 2409 if (netdev->flags & IFF_UP) {
d472214b 2410 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2411 if (err) {
2412 ERROR("failed to set the loopback up : %s",
2413 strerror(-err));
82d5ae15
DL
2414 return -1;
2415 }
82d5ae15 2416 }
40790553
SH
2417 if (netdev->type != LXC_NET_VETH)
2418 return 0;
2419 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2420 }
13954cce 2421
b466dc33 2422 /* get the new ifindex in case of physical netdev */
40790553 2423 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2424 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2425 ERROR("failed to get ifindex for %s",
2426 netdev->link);
2427 return -1;
2428 }
40790553 2429 }
b466dc33 2430
82d5ae15
DL
2431 /* retrieve the name of the interface */
2432 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2433 ERROR("no interface corresponding to index '%d'",
82d5ae15 2434 netdev->ifindex);
0ad19a3f 2435 return -1;
2436 }
13954cce 2437
018ef520 2438 /* default: let the system to choose one interface name */
9d083402 2439 if (!netdev->name)
fb6d9b2f
DL
2440 netdev->name = netdev->type == LXC_NET_PHYS ?
2441 netdev->link : "eth%d";
018ef520 2442
82d5ae15 2443 /* rename the interface name */
40790553
SH
2444 if (strcmp(ifname, netdev->name) != 0) {
2445 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2446 if (err) {
2447 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2448 strerror(-err));
2449 return -1;
2450 }
018ef520
DL
2451 }
2452
2453 /* Re-read the name of the interface because its name has changed
2454 * and would be automatically allocated by the system
2455 */
82d5ae15 2456 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2457 ERROR("no interface corresponding to index '%d'",
82d5ae15 2458 netdev->ifindex);
018ef520 2459 return -1;
0ad19a3f 2460 }
2461
82d5ae15
DL
2462 /* set a mac address */
2463 if (netdev->hwaddr) {
2464 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2465 ERROR("failed to setup hw address for '%s'",
82d5ae15 2466 current_ifname);
0ad19a3f 2467 return -1;
2468 }
2469 }
2470
82d5ae15
DL
2471 /* setup ipv4 addresses on the interface */
2472 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2473 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2474 ifname);
2475 return -1;
2476 }
2477
82d5ae15
DL
2478 /* setup ipv6 addresses on the interface */
2479 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2480 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2481 ifname);
2482 return -1;
2483 }
2484
82d5ae15 2485 /* set the network device up */
b0efbac4 2486 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2487 int err;
2488
d472214b 2489 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2490 if (err) {
2491 ERROR("failed to set '%s' up : %s", current_ifname,
2492 strerror(-err));
0ad19a3f 2493 return -1;
2494 }
2495
2496 /* the network is up, make the loopback up too */
d472214b 2497 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2498 if (err) {
2499 ERROR("failed to set the loopback up : %s",
2500 strerror(-err));
0ad19a3f 2501 return -1;
2502 }
2503 }
2504
f8fee0e2
MK
2505 /* We can only set up the default routes after bringing
2506 * up the interface, sine bringing up the interface adds
2507 * the link-local routes and we can't add a default
2508 * route if the gateway is not reachable. */
2509
2510 /* setup ipv4 gateway on the interface */
2511 if (netdev->ipv4_gateway) {
2512 if (!(netdev->flags & IFF_UP)) {
2513 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2514 return -1;
2515 }
2516
2517 if (lxc_list_empty(&netdev->ipv4)) {
2518 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2519 return -1;
2520 }
2521
2522 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2523 if (err) {
fc739df5
SG
2524 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2525 if (err) {
2526 ERROR("failed to add ipv4 dest for '%s': %s",
2527 ifname, strerror(-err));
2528 }
2529
2530 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2531 if (err) {
2532 ERROR("failed to setup ipv4 gateway for '%s': %s",
2533 ifname, strerror(-err));
2534 if (netdev->ipv4_gateway_auto) {
2535 char buf[INET_ADDRSTRLEN];
2536 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2537 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2538 }
2539 return -1;
19a26f82 2540 }
f8fee0e2
MK
2541 }
2542 }
2543
2544 /* setup ipv6 gateway on the interface */
2545 if (netdev->ipv6_gateway) {
2546 if (!(netdev->flags & IFF_UP)) {
2547 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2548 return -1;
2549 }
2550
2551 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2552 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2553 return -1;
2554 }
2555
2556 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2557 if (err) {
fc739df5
SG
2558 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2559 if (err) {
2560 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2561 ifname, strerror(-err));
19a26f82 2562 }
fc739df5
SG
2563
2564 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2565 if (err) {
2566 ERROR("failed to setup ipv6 gateway for '%s': %s",
2567 ifname, strerror(-err));
2568 if (netdev->ipv6_gateway_auto) {
2569 char buf[INET6_ADDRSTRLEN];
2570 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2571 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2572 }
2573 return -1;
2574 }
f8fee0e2
MK
2575 }
2576 }
2577
cd54d859
DL
2578 DEBUG("'%s' has been setup", current_ifname);
2579
0ad19a3f 2580 return 0;
2581}
2582
5f4535a3 2583static int setup_network(struct lxc_list *network)
0ad19a3f 2584{
82d5ae15 2585 struct lxc_list *iterator;
82d5ae15 2586 struct lxc_netdev *netdev;
0ad19a3f 2587
5f4535a3 2588 lxc_list_for_each(iterator, network) {
cd54d859 2589
5f4535a3 2590 netdev = iterator->elem;
82d5ae15
DL
2591
2592 if (setup_netdev(netdev)) {
2593 ERROR("failed to setup netdev");
2594 return -1;
2595 }
2596 }
cd54d859 2597
5f4535a3
DL
2598 if (!lxc_list_empty(network))
2599 INFO("network has been setup");
cd54d859
DL
2600
2601 return 0;
0ad19a3f 2602}
2603
c6d09e15
WB
2604static int parse_resource(const char *res) {
2605 size_t i;
2606 int resid = -1;
2607
2608 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2609 if (strcmp(res, limit_opt[i].name) == 0)
2610 return limit_opt[i].value;
2611 }
2612
2613 /* try to see if it's numeric, so the user may specify
2614 * resources that the running kernel knows about but
2615 * we don't */
2616 if (lxc_safe_int(res, &resid) == 0)
2617 return resid;
2618 return -1;
2619}
2620
2621int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2622 struct lxc_list *it;
2623 struct lxc_limit *lim;
2624 int resid;
2625
2626 lxc_list_for_each(it, limits) {
2627 lim = it->elem;
2628
2629 resid = parse_resource(lim->resource);
2630 if (resid < 0) {
2631 ERROR("unknown resource %s", lim->resource);
2632 return -1;
2633 }
2634
2635 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2636 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2637 return -1;
2638 }
2639 }
2640 return 0;
2641}
2642
2af6bd1b 2643/* try to move physical nics to the init netns */
5610055a 2644void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2645{
64d2fcb5 2646 int i, oldfd;
4ec31c52 2647 char ifname[IFNAMSIZ];
2af6bd1b 2648
5610055a 2649 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2650 return;
2651
64d2fcb5 2652 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2653
64d2fcb5
CB
2654 oldfd = lxc_preserve_ns(getpid(), "net");
2655 if (oldfd < 0) {
2656 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2657 return;
2658 }
64d2fcb5 2659
2af6bd1b
SH
2660 if (setns(netnsfd, 0) != 0) {
2661 SYSERROR("Failed to enter container netns to reset nics");
2662 close(oldfd);
2663 return;
2664 }
2665 for (i=0; i<conf->num_savednics; i++) {
2666 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2667 /* retrieve the name of the interface */
2668 if (!if_indextoname(s->ifindex, ifname)) {
2669 WARN("no interface corresponding to index '%d'", s->ifindex);
2670 continue;
2671 }
5610055a 2672 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2673 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2674 free(s->orig_name);
2af6bd1b 2675 }
5610055a
WB
2676 conf->num_savednics = 0;
2677
2af6bd1b
SH
2678 if (setns(oldfd, 0) != 0)
2679 SYSERROR("Failed to re-enter monitor's netns");
2680 close(oldfd);
2681}
2682
ae9242c8
SH
2683static char *default_rootfs_mount = LXCROOTFSMOUNT;
2684
7b379ab3 2685struct lxc_conf *lxc_conf_init(void)
089cd8b8 2686{
7b379ab3 2687 struct lxc_conf *new;
26ddeedd 2688 int i;
7b379ab3
MN
2689
2690 new = malloc(sizeof(*new));
2691 if (!new) {
2692 ERROR("lxc_conf_init : %m");
2693 return NULL;
2694 }
2695 memset(new, 0, sizeof(*new));
2696
b40a606e 2697 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
cccc74b5 2698 new->personality = -1;
124fa0a8 2699 new->autodev = 1;
596a818d
DE
2700 new->console.log_path = NULL;
2701 new->console.log_fd = -1;
28a4b0e5 2702 new->console.path = NULL;
63376d7d 2703 new->console.peer = -1;
b5159817
DE
2704 new->console.peerpty.busy = -1;
2705 new->console.peerpty.master = -1;
2706 new->console.peerpty.slave = -1;
63376d7d
DL
2707 new->console.master = -1;
2708 new->console.slave = -1;
2709 new->console.name[0] = '\0';
d2e30e99 2710 new->maincmd_fd = -1;
76a26f55 2711 new->nbd_idx = -1;
54c30e29 2712 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048
SH
2713 if (!new->rootfs.mount) {
2714 ERROR("lxc_conf_init : %m");
2715 free(new);
2716 return NULL;
2717 }
d89de239 2718 new->kmsg = 0;
858377e4 2719 new->logfd = -1;
7b379ab3
MN
2720 lxc_list_init(&new->cgroup);
2721 lxc_list_init(&new->network);
2722 lxc_list_init(&new->mount_list);
81810dd1 2723 lxc_list_init(&new->caps);
1fb86a7c 2724 lxc_list_init(&new->keepcaps);
f6d3e3e4 2725 lxc_list_init(&new->id_map);
f979ac15 2726 lxc_list_init(&new->includes);
4184c3e1 2727 lxc_list_init(&new->aliens);
7c661726 2728 lxc_list_init(&new->environment);
c6d09e15 2729 lxc_list_init(&new->limits);
26ddeedd
SH
2730 for (i=0; i<NUM_LXC_HOOKS; i++)
2731 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2732 lxc_list_init(&new->groups);
fe4de9a6
DE
2733 new->lsm_aa_profile = NULL;
2734 new->lsm_se_context = NULL;
5112cd70 2735 new->tmp_umount_proc = 0;
7b379ab3 2736
9f30a190
MM
2737 for (i = 0; i < LXC_NS_MAX; i++)
2738 new->inherit_ns_fd[i] = -1;
2739
72bb04e4
PT
2740 /* if running in a new user namespace, init and COMMAND
2741 * default to running as UID/GID 0 when using lxc-execute */
2742 new->init_uid = 0;
2743 new->init_gid = 0;
2744
7b379ab3 2745 return new;
089cd8b8
DL
2746}
2747
a589434e 2748static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2749{
8634bc19 2750 char veth1buf[IFNAMSIZ], *veth1;
0e391e57 2751 char veth2buf[IFNAMSIZ], *veth2;
b7b2fde4
CB
2752 int bridge_index, err;
2753 unsigned int mtu = 0;
13954cce 2754
8bee8851 2755 if (netdev->priv.veth_attr.pair) {
e892973e 2756 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2757 if (handler->conf->reboot)
2758 lxc_netdev_delete_by_name(veth1);
2759 } else {
9ba8130c
SH
2760 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2761 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2762 ERROR("veth1 name too long");
2763 return -1;
2764 }
a0265685 2765 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2766 if (!veth1) {
2767 ERROR("failed to allocate a temporary name");
2768 return -1;
2769 }
74a2b586
JK
2770 /* store away for deconf */
2771 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2772 }
82d5ae15 2773
0e391e57 2774 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2775 veth2 = lxc_mkifname(veth2buf);
ad40563e 2776 if (!veth2) {
82d5ae15 2777 ERROR("failed to allocate a temporary name");
ad40563e 2778 goto out_delete;
0ad19a3f 2779 }
2780
3cfc0f3a
MN
2781 err = lxc_veth_create(veth1, veth2);
2782 if (err) {
2e2d6a7b 2783 ERROR("failed to create veth pair (%s and %s): %s", veth1, veth2,
3cfc0f3a 2784 strerror(-err));
ad40563e 2785 goto out_delete;
0ad19a3f 2786 }
13954cce 2787
49684c0b
CS
2788 /* changing the high byte of the mac address to 0xfe, the bridge interface
2789 * will always keep the host's mac address and not take the mac address
2790 * of a container */
2791 err = setup_private_host_hw_addr(veth1);
2792 if (err) {
2e2d6a7b 2793 ERROR("failed to change mac address of host interface '%s': %s",
49684c0b
CS
2794 veth1, strerror(-err));
2795 goto out_delete;
2796 }
2797
af651aa9
SN
2798 netdev->ifindex = if_nametoindex(veth2);
2799 if (!netdev->ifindex) {
2800 ERROR("failed to retrieve the index for %s", veth2);
2801 goto out_delete;
2802 }
2803
82d5ae15 2804 if (netdev->mtu) {
b7b2fde4
CB
2805 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2806 WARN("Failed to parse mtu from.");
2807 else
2808 INFO("Retrieved mtu %d", mtu);
e54864d3 2809 } else if (netdev->link) {
e9280f65 2810 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2811 if (bridge_index) {
2812 mtu = netdev_get_mtu(bridge_index);
2813 INFO("Retrieved mtu %d from %s", mtu, netdev->link);
2814 } else {
2815 mtu = netdev_get_mtu(netdev->ifindex);
2816 INFO("Retrieved mtu %d from %s", mtu, veth2);
2817 }
e54864d3
NC
2818 }
2819
2820 if (mtu) {
2821 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2822 if (!err)
e54864d3 2823 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2824 if (err) {
e54864d3
NC
2825 ERROR("failed to set mtu '%i' for veth pair (%s and %s): %s",
2826 mtu, veth1, veth2, strerror(-err));
eb14c10a 2827 goto out_delete;
75d09f83
DL
2828 }
2829 }
2830
3cfc0f3a 2831 if (netdev->link) {
c43cbc04 2832 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2833 if (err) {
2e2d6a7b 2834 ERROR("failed to attach '%s' to the bridge '%s': %s",
3cfc0f3a
MN
2835 veth1, netdev->link, strerror(-err));
2836 goto out_delete;
2837 }
738d0deb 2838 INFO("Attached '%s': to the bridge '%s': ", veth1, netdev->link);
eb14c10a
DL
2839 }
2840
d472214b 2841 err = lxc_netdev_up(veth1);
6e35af2e
DL
2842 if (err) {
2843 ERROR("failed to set %s up : %s", veth1, strerror(-err));
2844 goto out_delete;
0ad19a3f 2845 }
2846
e3b4c4c4 2847 if (netdev->upscript) {
751d9dcd
DL
2848 err = run_script(handler->name, "net", netdev->upscript, "up",
2849 "veth", veth1, (char*) NULL);
2850 if (err)
e3b4c4c4 2851 goto out_delete;
e3b4c4c4
ST
2852 }
2853
a589434e 2854 DEBUG("instantiated veth '%s/%s', index is '%d'",
82d5ae15
DL
2855 veth1, veth2, netdev->ifindex);
2856
6ab9ab6d 2857 return 0;
eb14c10a
DL
2858
2859out_delete:
b84f58b9 2860 lxc_netdev_delete_by_name(veth1);
f10fad2f 2861 if (!netdev->priv.veth_attr.pair)
ad40563e 2862 free(veth1);
f10fad2f 2863 free(veth2);
6ab9ab6d 2864 return -1;
13954cce 2865}
d957ae2d 2866
74a2b586
JK
2867static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2868{
2869 char *veth1;
2870 int err;
2871
2872 if (netdev->priv.veth_attr.pair)
2873 veth1 = netdev->priv.veth_attr.pair;
2874 else
2875 veth1 = netdev->priv.veth_attr.veth1;
2876
2877 if (netdev->downscript) {
2878 err = run_script(handler->name, "net", netdev->downscript,
2879 "down", "veth", veth1, (char*) NULL);
2880 if (err)
2881 return -1;
2882 }
2883 return 0;
2884}
2885
a589434e 2886static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2887{
0e391e57 2888 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2889 int err;
d957ae2d
MT
2890
2891 if (!netdev->link) {
2892 ERROR("no link specified for macvlan netdev");
2893 return -1;
2894 }
13954cce 2895
9ba8130c
SH
2896 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2897 if (err >= sizeof(peerbuf))
2898 return -1;
82d5ae15 2899
a0265685 2900 peer = lxc_mkifname(peerbuf);
ad40563e 2901 if (!peer) {
82d5ae15
DL
2902 ERROR("failed to make a temporary name");
2903 return -1;
0ad19a3f 2904 }
2905
3cfc0f3a
MN
2906 err = lxc_macvlan_create(netdev->link, peer,
2907 netdev->priv.macvlan_attr.mode);
2908 if (err) {
2909 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2910 peer, netdev->link, strerror(-err));
ad40563e 2911 goto out;
0ad19a3f 2912 }
2913
82d5ae15
DL
2914 netdev->ifindex = if_nametoindex(peer);
2915 if (!netdev->ifindex) {
36eb9bde 2916 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2917 goto out;
22ebac19 2918 }
2919
e3b4c4c4 2920 if (netdev->upscript) {
751d9dcd
DL
2921 err = run_script(handler->name, "net", netdev->upscript, "up",
2922 "macvlan", netdev->link, (char*) NULL);
2923 if (err)
ad40563e 2924 goto out;
e3b4c4c4
ST
2925 }
2926
a589434e 2927 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2928 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2929
d957ae2d 2930 return 0;
ad40563e
ÇO
2931out:
2932 lxc_netdev_delete_by_name(peer);
2933 free(peer);
2934 return -1;
0ad19a3f 2935}
2936
74a2b586
JK
2937static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2938{
2939 int err;
2940
2941 if (netdev->downscript) {
2942 err = run_script(handler->name, "net", netdev->downscript,
2943 "down", "macvlan", netdev->link,
2944 (char*) NULL);
2945 if (err)
2946 return -1;
2947 }
2948 return 0;
2949}
2950
a589434e
JN
2951/* XXX: merge with instantiate_macvlan */
2952static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2953{
2954 char peer[IFNAMSIZ];
3cfc0f3a 2955 int err;
82f58d03 2956 static uint16_t vlan_cntr = 0;
b7b2fde4 2957 unsigned int mtu = 0;
26c39028
JHS
2958
2959 if (!netdev->link) {
2960 ERROR("no link specified for vlan netdev");
2961 return -1;
2962 }
2963
82f58d03 2964 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2965 if (err >= sizeof(peer)) {
2966 ERROR("peer name too long");
2967 return -1;
2968 }
26c39028 2969
3cfc0f3a
MN
2970 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2971 if (err) {
2972 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2973 peer, netdev->link, strerror(-err));
26c39028
JHS
2974 return -1;
2975 }
2976
2977 netdev->ifindex = if_nametoindex(peer);
2978 if (!netdev->ifindex) {
2979 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2980 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2981 return -1;
2982 }
2983
a589434e 2984 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 2985 netdev->ifindex);
b4fb7de1 2986 if (netdev->mtu) {
b7b2fde4
CB
2987 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2988 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2989 netdev->ifindex, netdev->name);
2990 return -1;
2991 }
2992 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
2993 if (err) {
2994 ERROR("failed to set mtu '%s' for %s : %s",
2995 netdev->mtu, peer, strerror(-err));
2996 lxc_netdev_delete_by_name(peer);
2997 return -1;
2998 }
2999 }
e892973e 3000
26c39028
JHS
3001 return 0;
3002}
3003
74a2b586
JK
/* Nothing is done when a vlan device is shut down; always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3008
a589434e 3009static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3010{
6168e99f
DL
3011 if (!netdev->link) {
3012 ERROR("no link specified for the physical interface");
3013 return -1;
3014 }
3015
9d083402 3016 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 3017 if (!netdev->ifindex) {
9d083402 3018 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 3019 return -1;
3020 }
3021
e3b4c4c4
ST
3022 if (netdev->upscript) {
3023 int err;
751d9dcd
DL
3024 err = run_script(handler->name, "net", netdev->upscript,
3025 "up", "phys", netdev->link, (char*) NULL);
3026 if (err)
e3b4c4c4 3027 return -1;
e3b4c4c4
ST
3028 }
3029
82d5ae15 3030 return 0;
0ad19a3f 3031}
3032
74a2b586
JK
3033static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3034{
3035 int err;
3036
3037 if (netdev->downscript) {
3038 err = run_script(handler->name, "net", netdev->downscript,
3039 "down", "phys", netdev->link, (char*) NULL);
3040 if (err)
3041 return -1;
3042 }
3043 return 0;
3044}
3045
a589434e 3046static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
3047{
3048 netdev->ifindex = 0;
3049 return 0;
3050}
3051
a589434e 3052static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3053{
82d5ae15 3054 netdev->ifindex = 0;
e3b4c4c4
ST
3055 if (netdev->upscript) {
3056 int err;
751d9dcd
DL
3057 err = run_script(handler->name, "net", netdev->upscript,
3058 "up", "empty", (char*) NULL);
3059 if (err)
e3b4c4c4 3060 return -1;
e3b4c4c4 3061 }
82d5ae15 3062 return 0;
0ad19a3f 3063}
3064
74a2b586
JK
3065static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3066{
3067 int err;
3068
3069 if (netdev->downscript) {
3070 err = run_script(handler->name, "net", netdev->downscript,
3071 "down", "empty", (char*) NULL);
3072 if (err)
3073 return -1;
3074 }
3075 return 0;
3076}
3077
26b797f3
SH
/* Tear-down hook for the "none" network type: nothing to do. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3082
3083int lxc_requests_empty_network(struct lxc_handler *handler)
3084{
3085 struct lxc_list *network = &handler->conf->network;
3086 struct lxc_list *iterator;
3087 struct lxc_netdev *netdev;
3088 bool found_none = false, found_nic = false;
3089
3090 if (lxc_list_empty(network))
3091 return 0;
3092
3093 lxc_list_for_each(iterator, network) {
3094
3095 netdev = iterator->elem;
3096
3097 if (netdev->type == LXC_NET_NONE)
3098 found_none = true;
3099 else
3100 found_nic = true;
3101 }
3102 if (found_none && !found_nic)
3103 return 1;
3104 return 0;
3105}
3106
e3b4c4c4 3107int lxc_create_network(struct lxc_handler *handler)
0ad19a3f 3108{
e3b4c4c4 3109 struct lxc_list *network = &handler->conf->network;
82d5ae15 3110 struct lxc_list *iterator;
82d5ae15 3111 struct lxc_netdev *netdev;
cbef6c52
SH
3112 int am_root = (getuid() == 0);
3113
3114 if (!am_root)
3115 return 0;
0ad19a3f 3116
5f4535a3 3117 lxc_list_for_each(iterator, network) {
0ad19a3f 3118
5f4535a3 3119 netdev = iterator->elem;
13954cce 3120
24654103 3121 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
82d5ae15 3122 ERROR("invalid network configuration type '%d'",
5f4535a3 3123 netdev->type);
82d5ae15
DL
3124 return -1;
3125 }
0ad19a3f 3126
e3b4c4c4 3127 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3128 ERROR("failed to create netdev");
3129 return -1;
3130 }
e3b4c4c4 3131
0ad19a3f 3132 }
3133
3134 return 0;
3135}
3136
358daf49 3137bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3138{
e97946ae 3139 int ret;
74a2b586 3140 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3141 struct lxc_list *iterator;
3142 struct lxc_netdev *netdev;
358daf49 3143 bool deleted_all = true;
7fef7a06
DL
3144
3145 lxc_list_for_each(iterator, network) {
3146 netdev = iterator->elem;
d472214b 3147
74a2b586 3148 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3149 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3150 WARN("Failed to rename interface with index %d "
3151 "to its initial name \"%s\".",
3152 netdev->ifindex, netdev->link);
d472214b 3153 continue;
d8f8e352 3154 }
d472214b 3155
74a2b586 3156 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3157 WARN("Failed to destroy netdev");
74a2b586
JK
3158 }
3159
d8f8e352
DL
3160 /* Recent kernel remove the virtual interfaces when the network
3161 * namespace is destroyed but in case we did not moved the
3162 * interface to the network namespace, we have to destroy it
3163 */
e97946ae
CB
3164 if (netdev->ifindex != 0) {
3165 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3166 if (-ret == ENODEV) {
3167 INFO("Interface \"%s\" with index %d already "
3168 "deleted or existing in different network "
3169 "namespace.",
3170 netdev->name ? netdev->name : "(null)",
3171 netdev->ifindex);
3172 } else if (ret < 0) {
3173 deleted_all = false;
3174 WARN("Failed to remove interface \"%s\" with "
3175 "index %d: %s.",
3176 netdev->name ? netdev->name : "(null)",
3177 netdev->ifindex, strerror(-ret));
3178 } else {
3179 INFO("Removed interface \"%s\" with index %d.",
3180 netdev->name ? netdev->name : "(null)",
3181 netdev->ifindex);
3182 }
e97946ae
CB
3183 }
3184
3185 /* Explicitly delete host veth device to prevent lingering
3186 * devices. We had issues in LXD around this.
3187 */
9aaaad30 3188 if (netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3189 char *hostveth;
3190 if (netdev->priv.veth_attr.pair) {
e97946ae 3191 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3192 ret = lxc_netdev_delete_by_name(hostveth);
3193 if (ret < 0) {
3194 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3195 } else {
3196 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3197 }
3198 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3199 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3200 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3201 if (ret < 0) {
3202 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3203 } else {
3204 INFO("Removed interface \"%s\" from host.", hostveth);
3205 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3206 }
e97946ae
CB
3207 }
3208 }
7fef7a06 3209 }
358daf49
CB
3210
3211 return deleted_all;
7fef7a06
DL
3212}
3213
45e854dc
SG
3214#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3215
fe1f672f 3216/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3217#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3218static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3219 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3220{
3221 pid_t child;
a7242d9a
ÇO
3222 int bytes, pipefd[2];
3223 char *token, *saveptr = NULL;
fe1f672f 3224 char buffer[MAX_BUFFER_SIZE];
091045f8 3225 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3226
3227 if (netdev->type != LXC_NET_VETH) {
3228 ERROR("nic type %d not support for unprivileged use",
091045f8 3229 netdev->type);
cbef6c52
SH
3230 return -1;
3231 }
3232
091045f8 3233 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3234 SYSERROR("pipe failed");
3235 return -1;
3236 }
3237
091045f8
CB
3238 child = fork();
3239 if (child < 0) {
cbef6c52 3240 SYSERROR("fork");
a7242d9a
ÇO
3241 close(pipefd[0]);
3242 close(pipefd[1]);
3243 return -1;
3244 }
3245
3246 if (child == 0) { // child
091045f8
CB
3247 /* Call lxc-user-nic pid type bridge. */
3248 int ret;
3249 char pidstr[LXC_NUMSTRLEN64];
3250
3251 close(pipefd[0]); /* Close the read-end of the pipe. */
3252
3253 /* Redirect stdout to write-end of the pipe. */
3254 ret = dup2(pipefd[1], STDOUT_FILENO);
3255 close(pipefd[1]); /* Close the write-end of the pipe. */
3256 if (ret < 0) {
3257 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3258 exit(EXIT_FAILURE);
3259 }
a7242d9a 3260
091045f8 3261 if (netdev->link)
cff7b5eb 3262 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3263 else
cff7b5eb 3264 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3265
3266 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3267 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3268 exit(EXIT_FAILURE);
3269 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3270
3271 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3272 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3273 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3274 pidstr, "veth", netdev_link, netdev->name, NULL);
3275
3276 SYSERROR("Failed to exec lxc-user-nic.");
3277 exit(EXIT_FAILURE);
a7242d9a
ÇO
3278 }
3279
3280 /* close the write-end of the pipe */
3281 close(pipefd[1]);
3282
fe1f672f 3283 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3284 if (bytes < 0)
3285 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3286 buffer[bytes - 1] = '\0';
3287
3288 if (wait_for_pid(child) != 0) {
3289 close(pipefd[0]);
cbef6c52
SH
3290 return -1;
3291 }
3292
a7242d9a
ÇO
3293 /* close the read-end of the pipe */
3294 close(pipefd[0]);
cbef6c52 3295
a7242d9a
ÇO
3296 /* fill netdev->name field */
3297 token = strtok_r(buffer, ":", &saveptr);
3298 if (!token)
3299 return -1;
091045f8
CB
3300
3301 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3302 if (!netdev->name) {
091045f8 3303 SYSERROR("Failed to allocate memory.");
658979c5
SH
3304 return -1;
3305 }
091045f8 3306 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3307 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3308
3309 /* fill netdev->veth_attr.pair field */
3310 token = strtok_r(NULL, ":", &saveptr);
3311 if (!token)
3312 return -1;
091045f8 3313
a7242d9a 3314 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3315 if (!netdev->priv.veth_attr.pair) {
091045f8 3316 ERROR("Failed to allocate memory.");
658979c5
SH
3317 return -1;
3318 }
45e854dc 3319
a7242d9a 3320 return 0;
cbef6c52
SH
3321}
3322
c43cbc04
SH
3323int lxc_assign_network(const char *lxcpath, char *lxcname,
3324 struct lxc_list *network, pid_t pid)
0ad19a3f 3325{
82d5ae15 3326 struct lxc_list *iterator;
82d5ae15 3327 struct lxc_netdev *netdev;
f2e206ff 3328 char ifname[IFNAMSIZ];
cbef6c52 3329 int am_root = (getuid() == 0);
3cfc0f3a 3330 int err;
0ad19a3f 3331
5f4535a3 3332 lxc_list_for_each(iterator, network) {
82d5ae15 3333
5f4535a3 3334 netdev = iterator->elem;
82d5ae15 3335
fbb16259 3336 if (netdev->type == LXC_NET_VETH && !am_root) {
c43cbc04 3337 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3338 return -1;
658979c5
SH
3339 // lxc-user-nic has moved the nic to the new ns.
3340 // unpriv_assign_nic() fills in netdev->name.
3341 // netdev->ifindex will be filed in at setup_netdev.
cbef6c52
SH
3342 continue;
3343 }
236087a6 3344
fbb16259
SH
3345 /* empty network namespace, nothing to move */
3346 if (!netdev->ifindex)
3347 continue;
3348
f2e206ff 3349 /* retrieve the name of the interface */
3350 if (!if_indextoname(netdev->ifindex, ifname)) {
3351 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3352 return -1;
3353 }
3354
3355 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3356 if (err) {
3357 ERROR("failed to move '%s' to the container : %s",
3358 netdev->link, strerror(-err));
82d5ae15
DL
3359 return -1;
3360 }
3361
198cbbaa 3362 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3363 }
3364
3365 return 0;
3366}
3367
251d0d2a
DE
3368static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3369 size_t buf_size)
f6d3e3e4
SH
3370{
3371 char path[PATH_MAX];
e4ccd113 3372 int ret, closeret;
f6d3e3e4
SH
3373 FILE *f;
3374
3375 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
3376 if (ret < 0 || ret >= PATH_MAX) {
03fadd16 3377 fprintf(stderr, "%s: path name too long\n", __func__);
f6d3e3e4
SH
3378 return -E2BIG;
3379 }
3380 f = fopen(path, "w");
3381 if (!f) {
3382 perror("open");
3383 return -EINVAL;
3384 }
251d0d2a 3385 ret = fwrite(buf, buf_size, 1, f);
f6d3e3e4 3386 if (ret < 0)
e4ccd113
SH
3387 SYSERROR("writing id mapping");
3388 closeret = fclose(f);
3389 if (closeret)
3390 SYSERROR("writing id mapping");
3391 return ret < 0 ? ret : closeret;
f6d3e3e4
SH
3392}
3393
df6a2945
CB
3394/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both. */
3395static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3396{
3397 char *path;
3398 int ret;
3399 struct stat st;
3400 int fret = 0;
3401
3402 path = on_path(binary, NULL);
3403 if (!path)
3404 return -ENOENT;
3405
3406 ret = stat(path, &st);
3407 if (ret < 0) {
3408 fret = -errno;
3409 goto cleanup;
3410 }
3411
3412 /* Check if the binary is setuid. */
3413 if (st.st_mode & S_ISUID) {
3414 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3415 fret = 1;
3416 goto cleanup;
3417 }
3418
69924fff 3419 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3420 /* Check if it has the CAP_SETUID capability. */
3421 if ((cap & CAP_SETUID) &&
3422 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3423 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3424 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3425 "and CAP_PERMITTED sets.", path);
3426 fret = 1;
3427 goto cleanup;
3428 }
3429
3430 /* Check if it has the CAP_SETGID capability. */
3431 if ((cap & CAP_SETGID) &&
3432 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3433 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3434 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3435 "and CAP_PERMITTED sets.", path);
3436 fret = 1;
3437 goto cleanup;
3438 }
d6018f88 3439 #else
69924fff
CB
3440 /* If we cannot check for file capabilities we need to give the benefit
3441 * of the doubt. Otherwise we might fail even though all the necessary
3442 * file capabilities are set.
3443 */
d6018f88
CB
3444 DEBUG("Cannot check for file capabilites as full capability support is "
3445 "missing. Manual intervention needed.");
3446 fret = 1;
df6a2945
CB
3447 #endif
3448
3449cleanup:
3450 free(path);
3451 return fret;
3452}
3453
f6d3e3e4
SH
3454int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3455{
f6d3e3e4 3456 struct id_map *map;
4bc3b759 3457 struct lxc_list *iterator;
251d0d2a 3458 enum idtype type;
4bc3b759 3459 char *pos;
df6a2945
CB
3460 int euid;
3461 int ret = 0, use_shadow = 0;
3462 int uidmap = 0, gidmap = 0;
3463 char *buf = NULL;
8afb3e61 3464
df6a2945
CB
3465 euid = geteuid();
3466
3467 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3468 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3469 * will protected it by preventing another user from being handed the
3470 * range by shadow.
3471 */
df6a2945
CB
3472 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3473 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3474 if (uidmap > 0 && gidmap > 0) {
3475 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3476 use_shadow = true;
df6a2945
CB
3477 } else if (uidmap == -ENOENT && gidmap == -ENOENT && !euid) {
3478 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3479 "write directly with euid 0.");
3480 use_shadow = false;
3481 } else {
3482 DEBUG("Either one or both of the newuidmap and newgidmap "
3483 "binaries do not exist or are missing necessary "
3484 "privilege.");
0e6e3a41
SG
3485 return -1;
3486 }
251d0d2a 3487
4bc3b759 3488 for (type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
4f7521b4 3489 int left, fill;
4bc3b759 3490 bool had_entry = false;
cf3ef16d 3491 if (!buf) {
4bc3b759 3492 buf = pos = malloc(LXC_IDMAPLEN);
4f7521b4
SH
3493 if (!buf)
3494 return -ENOMEM;
cf3ef16d
SH
3495 }
3496 pos = buf;
0e6e3a41 3497 if (use_shadow)
4bc3b759 3498 pos += sprintf(buf, "new%cidmap %d", type == ID_TYPE_UID ? 'u' : 'g', pid);
4f7521b4 3499
cf3ef16d 3500 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3501 /* The kernel only takes <= 4k for writes to
3502 * /proc/<nr>/[ug]id_map
3503 */
251d0d2a 3504 map = iterator->elem;
cf3ef16d
SH
3505 if (map->idtype != type)
3506 continue;
3507
4bc3b759
CB
3508 had_entry = true;
3509
3510 left = LXC_IDMAPLEN - (pos - buf);
d1838f34 3511 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3512 use_shadow ? " " : "", map->nsid,
3513 map->hostid, map->range,
0e6e3a41 3514 use_shadow ? "" : "\n");
cf3ef16d 3515 if (fill <= 0 || fill >= left)
4bc3b759
CB
3516 SYSERROR("Too many {g,u}id mappings defined.");
3517
cf3ef16d 3518 pos += fill;
251d0d2a 3519 }
cf3ef16d 3520 if (!had_entry)
4f7521b4 3521 continue;
cf3ef16d 3522
0e6e3a41 3523 if (!use_shadow) {
4bc3b759 3524 ret = write_id_mapping(type, pid, buf, pos - buf);
d1838f34 3525 } else {
4bc3b759 3526 left = LXC_IDMAPLEN - (pos - buf);
d1838f34
MS
3527 fill = snprintf(pos, left, "\n");
3528 if (fill <= 0 || fill >= left)
4bc3b759 3529 SYSERROR("Too many {g,u}id mappings defined.");
d1838f34 3530 pos += fill;
cf3ef16d 3531 ret = system(buf);
d1838f34 3532 }
f6d3e3e4
SH
3533 if (ret)
3534 break;
3535 }
251d0d2a 3536
f10fad2f 3537 free(buf);
f6d3e3e4
SH
3538 return ret;
3539}
3540
cf3ef16d 3541/*
7b50c609
TS
3542 * return the host uid/gid to which the container root is mapped in
3543 * *val.
0b3a6504 3544 * Return true if id was found, false otherwise.
cf3ef16d 3545 */
2a9a80cb 3546bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3547 unsigned long *val)
cf3ef16d
SH
3548{
3549 struct lxc_list *it;
3550 struct id_map *map;
3551
3552 lxc_list_for_each(it, &conf->id_map) {
3553 map = it->elem;
7b50c609 3554 if (map->idtype != idtype)
cf3ef16d
SH
3555 continue;
3556 if (map->nsid != 0)
3557 continue;
2a9a80cb
SH
3558 *val = map->hostid;
3559 return true;
cf3ef16d 3560 }
2a9a80cb 3561 return false;
cf3ef16d
SH
3562}
3563
2133f58c 3564int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3565{
3566 struct lxc_list *it;
3567 struct id_map *map;
3568 lxc_list_for_each(it, &conf->id_map) {
3569 map = it->elem;
2133f58c 3570 if (map->idtype != idtype)
cf3ef16d
SH
3571 continue;
3572 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3573 return (id - map->hostid) + map->nsid;
cf3ef16d 3574 }
57d116ab 3575 return -1;
cf3ef16d
SH
3576}
3577
2133f58c 3578int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3579{
3580 struct lxc_list *it;
3581 struct id_map *map;
2133f58c 3582 unsigned int freeid = 0;
cf3ef16d
SH
3583again:
3584 lxc_list_for_each(it, &conf->id_map) {
3585 map = it->elem;
2133f58c 3586 if (map->idtype != idtype)
cf3ef16d
SH
3587 continue;
3588 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3589 freeid = map->nsid + map->range;
3590 goto again;
3591 }
3592 }
3593 return freeid;
3594}
3595
19a26f82
MK
3596int lxc_find_gateway_addresses(struct lxc_handler *handler)
3597{
3598 struct lxc_list *network = &handler->conf->network;
3599 struct lxc_list *iterator;
3600 struct lxc_netdev *netdev;
3601 int link_index;
3602
3603 lxc_list_for_each(iterator, network) {
3604 netdev = iterator->elem;
3605
3606 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3607 continue;
3608
3609 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3610 ERROR("gateway = auto only supported for "
3611 "veth and macvlan");
3612 return -1;
3613 }
3614
3615 if (!netdev->link) {
3616 ERROR("gateway = auto needs a link interface");
3617 return -1;
3618 }
3619
3620 link_index = if_nametoindex(netdev->link);
3621 if (!link_index)
3622 return -EINVAL;
3623
3624 if (netdev->ipv4_gateway_auto) {
3625 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3626 ERROR("failed to automatically find ipv4 gateway "
3627 "address from link interface '%s'", netdev->link);
3628 return -1;
3629 }
3630 }
3631
3632 if (netdev->ipv6_gateway_auto) {
3633 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3634 ERROR("failed to automatically find ipv6 gateway "
3635 "address from link interface '%s'", netdev->link);
3636 return -1;
3637 }
3638 }
3639 }
3640
3641 return 0;
3642}
3643
5e4a62bf 3644int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3645{
5e4a62bf 3646 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3647 int i, ret;
b0a33c1e 3648
5e4a62bf
DL
3649 /* no tty in the configuration */
3650 if (!conf->tty)
b0a33c1e 3651 return 0;
3652
13954cce 3653 tty_info->pty_info =
e4e7d59d 3654 malloc(sizeof(*tty_info->pty_info)*conf->tty);
b0a33c1e 3655 if (!tty_info->pty_info) {
36eb9bde 3656 SYSERROR("failed to allocate pty_info");
985d15b1 3657 return -1;
b0a33c1e 3658 }
3659
985d15b1 3660 for (i = 0; i < conf->tty; i++) {
13954cce 3661
b0a33c1e 3662 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3663
025ed0f3
SH
3664 process_lock();
3665 ret = openpty(&pty_info->master, &pty_info->slave,
3666 pty_info->name, NULL, NULL);
3667 process_unlock();
3668 if (ret) {
36eb9bde 3669 SYSERROR("failed to create pty #%d", i);
985d15b1
MT
3670 tty_info->nbtty = i;
3671 lxc_delete_tty(tty_info);
3672 return -1;
b0a33c1e 3673 }
3674
5332bb84
DL
3675 DEBUG("allocated pty '%s' (%d/%d)",
3676 pty_info->name, pty_info->master, pty_info->slave);
3677
3ec1648d 3678 /* Prevent leaking the file descriptors to the container */
b035ad62
MS
3679 fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3680 fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3681
b0a33c1e 3682 pty_info->busy = 0;
3683 }
3684
985d15b1 3685 tty_info->nbtty = conf->tty;
1ac470c0
DL
3686
3687 INFO("tty's configured");
3688
985d15b1 3689 return 0;
b0a33c1e 3690}
3691
3692void lxc_delete_tty(struct lxc_tty_info *tty_info)
3693{
3694 int i;
3695
3696 for (i = 0; i < tty_info->nbtty; i++) {
3697 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3698
3699 close(pty_info->master);
3700 close(pty_info->slave);
3701 }
3702
3703 free(tty_info->pty_info);
e00c0242 3704 tty_info->pty_info = NULL;
b0a33c1e 3705 tty_info->nbtty = 0;
3706}
3707
f6d3e3e4 3708/*
7b50c609
TS
3709 * chown_mapped_root: for an unprivileged user with uid/gid X to
3710 * chown a dir to subuid/subgid Y, he needs to run chown as root
3711 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3712 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3713 * root is privileged with respect to hostuid/hostgid X, allowing
3714 * him to do the chown.
f6d3e3e4 3715 */
c4d10a05 3716int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3717{
7b50c609
TS
3718 uid_t rootuid;
3719 gid_t rootgid;
c4d10a05 3720 pid_t pid;
2a9a80cb 3721 unsigned long val;
a7ef8753 3722 char *chownpath = path;
f6d3e3e4 3723
2a9a80cb 3724 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
c4d10a05
SH
3725 ERROR("No mapping for container root");
3726 return -1;
f6d3e3e4 3727 }
7b50c609
TS
3728 rootuid = (uid_t) val;
3729 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3730 ERROR("No mapping for container root");
3731 return -1;
3732 }
3733 rootgid = (gid_t) val;
2a9a80cb 3734
a7ef8753
SH
3735 /*
3736 * In case of overlay, we want only the writeable layer
3737 * to be chowned
3738 */
1f92162d 3739 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3740 chownpath = strchr(path, ':');
3741 if (!chownpath) {
3742 ERROR("Bad overlay path: %s", path);
3743 return -1;
3744 }
3745 chownpath = strchr(chownpath+1, ':');
3746 if (!chownpath) {
3747 ERROR("Bad overlay path: %s", path);
3748 return -1;
3749 }
3750 chownpath++;
3751 }
3752 path = chownpath;
c4d10a05 3753 if (geteuid() == 0) {
7b50c609 3754 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3755 ERROR("Error chowning %s", path);
3756 return -1;
3757 }
3758 return 0;
3759 }
f3d7e4ca 3760
7b50c609 3761 if (rootuid == geteuid()) {
f3d7e4ca
SH
3762 // nothing to do
3763 INFO("%s: container root is our uid; no need to chown" ,__func__);
3764 return 0;
3765 }
3766
c4d10a05
SH
3767 pid = fork();
3768 if (pid < 0) {
3769 SYSERROR("Failed forking");
f6d3e3e4
SH
3770 return -1;
3771 }
c4d10a05 3772 if (!pid) {
7b50c609
TS
3773 int hostuid = geteuid(), hostgid = getegid(), ret;
3774 struct stat sb;
3775 char map1[100], map2[100], map3[100], map4[100], map5[100];
3776 char ugid[100];
3777 char *args1[] = { "lxc-usernsexec", "-m", map1, "-m", map2,
3778 "-m", map3, "-m", map5,
3779 "--", "chown", ugid, path, NULL };
3780 char *args2[] = { "lxc-usernsexec", "-m", map1, "-m", map2,
3781 "-m", map3, "-m", map4, "-m", map5,
3782 "--", "chown", ugid, path, NULL };
3783
3784 // save the current gid of "path"
3785 if (stat(path, &sb) < 0) {
3786 ERROR("Error stat %s", path);
3787 return -1;
3788 }
f6d3e3e4 3789
9a7c2aba
SH
3790 /*
3791 * A file has to be group-owned by a gid mapped into the
3792 * container, or the container won't be privileged over it.
3793 */
3794 if (sb.st_uid == geteuid() &&
3795 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3796 chown(path, -1, hostgid) < 0) {
3797 ERROR("Failed chgrping %s", path);
7b50c609
TS
3798 return -1;
3799 }
3800
3801 // "u:0:rootuid:1"
3802 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
c4d10a05
SH
3803 if (ret < 0 || ret >= 100) {
3804 ERROR("Error uid printing map string");
f6d3e3e4
SH
3805 return -1;
3806 }
c4d10a05 3807
98e5ba51
SH
3808 // "u:hostuid:hostuid:1"
3809 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3810 if (ret < 0 || ret >= 100) {
3811 ERROR("Error uid printing map string");
3812 return -1;
3813 }
3814
7b50c609
TS
3815 // "g:0:rootgid:1"
3816 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
c4d10a05 3817 if (ret < 0 || ret >= 100) {
7b50c609 3818 ERROR("Error gid printing map string");
c4d10a05
SH
3819 return -1;
3820 }
3821
7b50c609 3822 // "g:pathgid:rootgid+pathgid:1"
b4c1e35d
SG
3823 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3824 rootgid + (gid_t)sb.st_gid);
7b50c609
TS
3825 if (ret < 0 || ret >= 100) {
3826 ERROR("Error gid printing map string");
3827 return -1;
3828 }
3829
3830 // "g:hostgid:hostgid:1"
3831 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3832 if (ret < 0 || ret >= 100) {
3833 ERROR("Error gid printing map string");
3834 return -1;
3835 }
3836
3837 // "0:pathgid" (chown)
b4c1e35d 3838 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
7b50c609
TS
3839 if (ret < 0 || ret >= 100) {
3840 ERROR("Error owner printing format string for chown");
3841 return -1;
3842 }
3843
3844 if (hostgid == sb.st_gid)
3845 ret = execvp("lxc-usernsexec", args1);
3846 else
3847 ret = execvp("lxc-usernsexec", args2);
c4d10a05
SH
3848 SYSERROR("Failed executing usernsexec");
3849 exit(1);
f6d3e3e4 3850 }
c4d10a05 3851 return wait_for_pid(pid);
f6d3e3e4
SH
3852}
3853
c4d10a05 3854int ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3855{
c4d10a05 3856 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3857 return 0;
c4d10a05 3858
29b10e4f 3859 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
c4d10a05
SH
3860 ERROR("Failed to chown %s", c->console.name);
3861 return -1;
3862 }
3863
f6d3e3e4
SH
3864 return 0;
3865}
3866
943144d9
CB
3867/* NOTE: Must not be called from inside the container namespace! */
3868int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3869{
3870 int mounted;
3871
943144d9 3872 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3873 if (mounted == -1) {
943144d9 3874 SYSERROR("failed to mount /proc in the container");
01958b1f 3875 /* continue only if there is no rootfs */
943144d9 3876 if (conf->rootfs.path)
01958b1f 3877 return -1;
5112cd70 3878 } else if (mounted == 1) {
943144d9 3879 conf->tmp_umount_proc = 1;
5112cd70 3880 }
943144d9 3881
5112cd70
SH
3882 return 0;
3883}
3884
3885void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3886{
3887 if (lxc_conf->tmp_umount_proc == 1) {
3888 umount("/proc");
3889 lxc_conf->tmp_umount_proc = 0;
3890 }
3891}
3892
/*
 * Walk /proc/self/mountinfo and remount with MS_SLAVE every mount point
 * whose options field contains "shared".
 *
 * Failures are logged but never abort: container startup continues.
 */
void remount_all_slave(void)
{
	FILE *mountinfo;
	char *line = NULL;
	size_t cap = 0;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, mountinfo) != -1) {
		char *target = get_field(line, 4);
		char *opts = target ? get_field(target, 2) : NULL;

		if (!opts)
			continue;

		/* Terminate opts first; target is only cut once it is
		 * actually needed, after opts has been inspected.
		 */
		null_endofword(opts);
		if (strstr(opts, "shared") == NULL)
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(line);
}
3926
2322903b
SH
3927void lxc_execute_bind_init(struct lxc_conf *conf)
3928{
3929 int ret;
9d9c111c
SH
3930 char path[PATH_MAX], destpath[PATH_MAX], *p;
3931
3932 /* If init exists in the container, don't bind mount a static one */
3933 p = choose_init(conf->rootfs.mount);
3934 if (p) {
3935 free(p);
3936 return;
3937 }
2322903b
SH
3938
3939 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3940 if (ret < 0 || ret >= PATH_MAX) {
3941 WARN("Path name too long searching for lxc.init.static");
3942 return;
3943 }
3944
3945 if (!file_exists(path)) {
3946 INFO("%s does not exist on host", path);
3947 return;
3948 }
3949
3950 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3951 if (ret < 0 || ret >= PATH_MAX) {
3952 WARN("Path name too long for container's lxc.init.static");
3953 return;
3954 }
3955
3956 if (!file_exists(destpath)) {
3957 FILE * pathfile = fopen(destpath, "wb");
3958 if (!pathfile) {
3959 SYSERROR("Failed to create mount target '%s'", destpath);
3960 return;
3961 }
3962 fclose(pathfile);
3963 }
3964
592fd47a 3965 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3966 if (ret < 0)
3967 SYSERROR("Failed to bind lxc.init.static into container");
3968 INFO("lxc.init.static bound into container at %s", path);
3969}
3970
35120d9c
SH
3971/*
3972 * This does the work of remounting / if it is shared, calling the
3973 * container pre-mount hooks, and mounting the rootfs.
3974 */
3975int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3976{
35120d9c
SH
3977 if (conf->rootfs_setup) {
3978 /*
3979 * rootfs was set up in another namespace. bind-mount it
3980 * to give us a mount in our own ns so we can pivot_root to it
3981 */
3982 const char *path = conf->rootfs.mount;
3983 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3984 ERROR("Failed to bind-mount container / onto itself");
145832ba 3985 return -1;
35120d9c 3986 }
145832ba 3987 return 0;
35120d9c 3988 }
d4ef7c50 3989
e995d7a2
SH
3990 remount_all_slave();
3991
35120d9c
SH
3992 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3993 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3994 return -1;
3995 }
3996
3997 if (setup_rootfs(conf)) {
3998 ERROR("failed to setup rootfs for '%s'", name);
3999 return -1;
4000 }
4001
4002 conf->rootfs_setup = true;
4003 return 0;
4004}
4005
1c1c7051
SH
4006static bool verify_start_hooks(struct lxc_conf *conf)
4007{
4008 struct lxc_list *it;
4009 char path[MAXPATHLEN];
4010 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4011 char *hookname = it->elem;
4012 struct stat st;
4013 int ret;
4014
4015 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4016 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4017 if (ret < 0 || ret >= MAXPATHLEN)
4018 return false;
4019 ret = stat(path, &st);
4020 if (ret) {
7b6753e7 4021 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4022 hookname);
4023 return false;
4024 }
6a0c909a 4025 return true;
1c1c7051
SH
4026 }
4027
4028 return true;
4029}
4030
e8bd4e43
SH
4031static int send_fd(int sock, int fd)
4032{
4033 int ret = lxc_abstract_unix_send_fd(sock, fd, NULL, 0);
4034
4035
4036 if (ret < 0) {
4037 SYSERROR("Error sending tty fd to parent");
4038 return -1;
4039 }
4040
4041 return 0;
4042}
4043
4044static int send_ttys_to_parent(struct lxc_handler *handler)
4045{
4046 struct lxc_conf *conf = handler->conf;
4047 const struct lxc_tty_info *tty_info = &conf->tty_info;
4048 int i;
4049 int sock = handler->ttysock[0];
4050
4051 for (i = 0; i < tty_info->nbtty; i++) {
4052 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
4053 if (send_fd(sock, pty_info->slave) < 0)
4054 goto bad;
4055 close(pty_info->slave);
4056 pty_info->slave = -1;
4057 if (send_fd(sock, pty_info->master) < 0)
4058 goto bad;
4059 close(pty_info->master);
4060 pty_info->master = -1;
4061 }
4062
4063 close(handler->ttysock[0]);
4064 close(handler->ttysock[1]);
4065
4066 return 0;
4067
4068bad:
4069 ERROR("Error writing tty fd to parent");
4070 return -1;
4071}
4072
35120d9c
SH
4073int lxc_setup(struct lxc_handler *handler)
4074{
4075 const char *name = handler->name;
4076 struct lxc_conf *lxc_conf = handler->conf;
4077 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4078
4079 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4080 ERROR("Error setting up rootfs mount after spawn");
4081 return -1;
4082 }
4083
6c544cb3
MM
4084 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4085 if (setup_utsname(lxc_conf->utsname)) {
4086 ERROR("failed to setup the utsname for '%s'", name);
4087 return -1;
4088 }
0ad19a3f 4089 }
4090
5f4535a3 4091 if (setup_network(&lxc_conf->network)) {
36eb9bde 4092 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4093 return -1;
0ad19a3f 4094 }
4095
bc6928ff 4096 if (lxc_conf->autodev > 0) {
14221cbb 4097 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4098 ERROR("failed to mount /dev in the container");
c6883f38
SH
4099 return -1;
4100 }
4101 }
4102
368bbc02
CS
4103 /* do automatic mounts (mainly /proc and /sys), but exclude
4104 * those that need to wait until other stuff has finished
4105 */
4fb3cba5 4106 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4107 ERROR("failed to setup the automatic mounts for '%s'", name);
4108 return -1;
4109 }
4110
0a2dddd4 4111 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4112 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4113 return -1;
576f946d 4114 }
4115
0a2dddd4 4116 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4117 ERROR("failed to setup the mount entries for '%s'", name);
4118 return -1;
4119 }
4120
7b6753e7 4121 /* Make sure any start hooks are in the container */
1c1c7051
SH
4122 if (!verify_start_hooks(lxc_conf))
4123 return -1;
4124
2322903b
SH
4125 if (lxc_conf->is_execute)
4126 lxc_execute_bind_init(lxc_conf);
4127
368bbc02
CS
4128 /* now mount only cgroup, if wanted;
4129 * before, /sys could not have been mounted
4130 * (is either mounted automatically or via fstab entries)
4131 */
4fb3cba5 4132 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4133 ERROR("failed to setup the automatic mounts for '%s'", name);
4134 return -1;
4135 }
4136
283678ed 4137 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4138 ERROR("failed to run mount hooks for container '%s'.", name);
4139 return -1;
4140 }
4141
bc6928ff 4142 if (lxc_conf->autodev > 0) {
283678ed 4143 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4144 ERROR("failed to run autodev hooks for container '%s'.", name);
4145 return -1;
4146 }
27245ff7 4147 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4148 ERROR("failed to populate /dev in the container");
4149 return -1;
4150 }
4151 }
368bbc02 4152
3d7d929a 4153 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4154 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4155 return -1;
6e590161 4156 }
4157
7e0e1d94
AV
4158 if (lxc_conf->kmsg) {
4159 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4160 ERROR("failed to setup kmsg for '%s'", name);
4161 }
1bd051a6 4162
69aa6655
DE
4163 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4164 ERROR("failed to setup /dev symlinks for '%s'", name);
4165 return -1;
4166 }
4167
5112cd70 4168 /* mount /proc if it's not already there */
943144d9 4169 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4170 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4171 return -1;
e075f5d9 4172 }
e075f5d9 4173
ac778708 4174 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4175 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4176 return -1;
ed502555 4177 }
4178
70761e5e 4179 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4180 ERROR("failed to setup the new pts instance");
95b5ffaf 4181 return -1;
3c26f34e 4182 }
4183
e8bd4e43
SH
4184 if (lxc_create_tty(name, lxc_conf)) {
4185 ERROR("failed to create the ttys");
4186 return -1;
4187 }
4188
4189 if (send_ttys_to_parent(handler) < 0) {
4190 ERROR("failure sending console info to parent");
4191 return -1;
4192 }
4193
4194
4195 if (!lxc_conf->is_execute && setup_tty(lxc_conf)) {
4196 ERROR("failed to setup the ttys for '%s'", name);
4197 return -1;
4198 }
4199
4200 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4201 SYSERROR("failed to set environment variable for container ptys");
4202
4203
cccc74b5
DL
4204 if (setup_personality(lxc_conf->personality)) {
4205 ERROR("failed to setup personality");
4206 return -1;
4207 }
4208
97a8f74f
SG
4209 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4210 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4211 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4212 return -1;
4213 }
97a8f74f
SG
4214 if (dropcaps_except(&lxc_conf->keepcaps)) {
4215 ERROR("failed to keep requested caps");
4216 return -1;
4217 }
4218 } else if (setup_caps(&lxc_conf->caps)) {
4219 ERROR("failed to drop capabilities");
4220 return -1;
81810dd1
DL
4221 }
4222
cd54d859
DL
4223 NOTICE("'%s' is setup.", name);
4224
0ad19a3f 4225 return 0;
4226}
26ddeedd 4227
283678ed
SH
4228int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4229 const char *lxcpath, char *argv[])
26ddeedd
SH
4230{
4231 int which = -1;
4232 struct lxc_list *it;
4233
4234 if (strcmp(hook, "pre-start") == 0)
4235 which = LXCHOOK_PRESTART;
5ea6163a
SH
4236 else if (strcmp(hook, "pre-mount") == 0)
4237 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4238 else if (strcmp(hook, "mount") == 0)
4239 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4240 else if (strcmp(hook, "autodev") == 0)
4241 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4242 else if (strcmp(hook, "start") == 0)
4243 which = LXCHOOK_START;
52492063
WB
4244 else if (strcmp(hook, "stop") == 0)
4245 which = LXCHOOK_STOP;
26ddeedd
SH
4246 else if (strcmp(hook, "post-stop") == 0)
4247 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4248 else if (strcmp(hook, "clone") == 0)
4249 which = LXCHOOK_CLONE;
37cf711b
SY
4250 else if (strcmp(hook, "destroy") == 0)
4251 which = LXCHOOK_DESTROY;
26ddeedd
SH
4252 else
4253 return -1;
4254 lxc_list_for_each(it, &conf->hooks[which]) {
4255 int ret;
4256 char *hookname = it->elem;
283678ed 4257 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4258 if (ret)
4259 return ret;
4260 }
4261 return 0;
4262}
72d0e1cb 4263
427b3a21 4264static void lxc_remove_nic(struct lxc_list *it)
72d0e1cb
SG
4265{
4266 struct lxc_netdev *netdev = it->elem;
9ebb03ad 4267 struct lxc_list *it2,*next;
72d0e1cb
SG
4268
4269 lxc_list_del(it);
4270
f10fad2f
ME
4271 free(netdev->link);
4272 free(netdev->name);
4273 if (netdev->type == LXC_NET_VETH)
c9bb9a85 4274 free(netdev->priv.veth_attr.pair);
f10fad2f
ME
4275 free(netdev->upscript);
4276 free(netdev->hwaddr);
4277 free(netdev->mtu);
4278 free(netdev->ipv4_gateway);
4279 free(netdev->ipv6_gateway);
9ebb03ad 4280 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4281 lxc_list_del(it2);
4282 free(it2->elem);
4283 free(it2);
4284 }
9ebb03ad 4285 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4286 lxc_list_del(it2);
4287 free(it2->elem);
4288 free(it2);
4289 }
d95db067 4290 free(netdev);
72d0e1cb
SG
4291 free(it);
4292}
4293
4294/* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
12a50cc6 4295int lxc_clear_nic(struct lxc_conf *c, const char *key)
72d0e1cb
SG
4296{
4297 char *p1;
4298 int ret, idx, i;
4299 struct lxc_list *it;
4300 struct lxc_netdev *netdev;
4301
46cd2845 4302 p1 = strchr(key, '.');
72d0e1cb
SG
4303 if (!p1 || *(p1+1) == '\0')
4304 p1 = NULL;
4305
4306 ret = sscanf(key, "%d", &idx);
4307 if (ret != 1) return -1;
4308 if (idx < 0)
4309 return -1;
4310
4311 i = 0;
4312 lxc_list_for_each(it, &c->network) {
4313 if (i == idx)
4314 break;
4315 i++;
4316 }
4317 if (i < idx) // we don't have that many nics defined
4318 return -1;
4319
4320 if (!it || !it->elem)
4321 return -1;
4322
4323 netdev = it->elem;
4324
4325 if (!p1) {
4326 lxc_remove_nic(it);
52d21d40 4327 } else if (strcmp(p1, ".ipv4") == 0) {
9ebb03ad
DE
4328 struct lxc_list *it2,*next;
4329 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4330 lxc_list_del(it2);
4331 free(it2->elem);
4332 free(it2);
4333 }
52d21d40 4334 } else if (strcmp(p1, ".ipv6") == 0) {
9ebb03ad
DE
4335 struct lxc_list *it2,*next;
4336 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4337 lxc_list_del(it2);
4338 free(it2->elem);
4339 free(it2);
4340 }
72d0e1cb
SG
4341 }
4342 else return -1;
4343
4344 return 0;
4345}
4346
4347int lxc_clear_config_network(struct lxc_conf *c)
4348{
9ebb03ad
DE
4349 struct lxc_list *it,*next;
4350 lxc_list_for_each_safe(it, &c->network, next) {
72d0e1cb
SG
4351 lxc_remove_nic(it);
4352 }
4353 return 0;
4354}
4355
4356int lxc_clear_config_caps(struct lxc_conf *c)
4357{
9ebb03ad 4358 struct lxc_list *it,*next;
72d0e1cb 4359
9ebb03ad 4360 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4361 lxc_list_del(it);
4362 free(it->elem);
4363 free(it);
4364 }
4365 return 0;
4366}
4367
74a3920a 4368static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4369 struct lxc_list *it, *next;
4370
4355ab5f 4371 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4372 lxc_list_del(it);
4373 free(it->elem);
4374 free(it);
4375 }
4376 return 0;
4377}
4378
4355ab5f
SH
4379int lxc_clear_idmaps(struct lxc_conf *c)
4380{
4381 return lxc_free_idmap(&c->id_map);
4382}
4383
1fb86a7c
SH
4384int lxc_clear_config_keepcaps(struct lxc_conf *c)
4385{
4386 struct lxc_list *it,*next;
4387
4388 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4389 lxc_list_del(it);
4390 free(it->elem);
4391 free(it);
4392 }
4393 return 0;
4394}
4395
12a50cc6 4396int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4397{
9ebb03ad 4398 struct lxc_list *it,*next;
72d0e1cb 4399 bool all = false;
a6390f01 4400 const char *k = NULL;
72d0e1cb
SG
4401
4402 if (strcmp(key, "lxc.cgroup") == 0)
4403 all = true;
a6390f01
WB
4404 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4405 k = key + sizeof("lxc.cgroup.")-1;
4406 else
4407 return -1;
72d0e1cb 4408
9ebb03ad 4409 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4410 struct lxc_cgroup *cg = it->elem;
4411 if (!all && strcmp(cg->subsystem, k) != 0)
4412 continue;
4413 lxc_list_del(it);
4414 free(cg->subsystem);
4415 free(cg->value);
4416 free(cg);
4417 free(it);
4418 }
4419 return 0;
4420}
4421
c6d09e15
WB
4422int lxc_clear_limits(struct lxc_conf *c, const char *key)
4423{
4424 struct lxc_list *it, *next;
4425 bool all = false;
4426 const char *k = NULL;
4427
4428 if (strcmp(key, "lxc.limit") == 0)
4429 all = true;
4430 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4431 k = key + sizeof("lxc.limit.")-1;
4432 else
4433 return -1;
4434
4435 lxc_list_for_each_safe(it, &c->limits, next) {
4436 struct lxc_limit *lim = it->elem;
4437 if (!all && strcmp(lim->resource, k) != 0)
4438 continue;
4439 lxc_list_del(it);
4440 free(lim->resource);
4441 free(lim);
4442 free(it);
4443 }
4444 return 0;
4445}
4446
ee1e7aa0
SG
4447int lxc_clear_groups(struct lxc_conf *c)
4448{
4449 struct lxc_list *it,*next;
4450
4451 lxc_list_for_each_safe(it, &c->groups, next) {
4452 lxc_list_del(it);
4453 free(it->elem);
4454 free(it);
4455 }
4456 return 0;
4457}
4458
ab799c0b
SG
4459int lxc_clear_environment(struct lxc_conf *c)
4460{
4461 struct lxc_list *it,*next;
4462
4463 lxc_list_for_each_safe(it, &c->environment, next) {
4464 lxc_list_del(it);
4465 free(it->elem);
4466 free(it);
4467 }
4468 return 0;
4469}
4470
4471
72d0e1cb
SG
4472int lxc_clear_mount_entries(struct lxc_conf *c)
4473{
9ebb03ad 4474 struct lxc_list *it,*next;
72d0e1cb 4475
9ebb03ad 4476 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4477 lxc_list_del(it);
4478 free(it->elem);
4479 free(it);
4480 }
4481 return 0;
4482}
4483
b099e9e9
SH
4484int lxc_clear_automounts(struct lxc_conf *c)
4485{
4486 c->auto_mounts = 0;
4487 return 0;
4488}
4489
12a50cc6 4490int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4491{
9ebb03ad 4492 struct lxc_list *it,*next;
17ed13a3 4493 bool all = false, done = false;
a6390f01 4494 const char *k = NULL;
72d0e1cb
SG
4495 int i;
4496
17ed13a3
SH
4497 if (strcmp(key, "lxc.hook") == 0)
4498 all = true;
a6390f01
WB
4499 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4500 k = key + sizeof("lxc.hook.")-1;
4501 else
4502 return -1;
17ed13a3 4503
72d0e1cb 4504 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4505 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4506 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4507 lxc_list_del(it);
4508 free(it->elem);
4509 free(it);
4510 }
4511 done = true;
72d0e1cb
SG
4512 }
4513 }
17ed13a3
SH
4514
4515 if (!done) {
4516 ERROR("Invalid hook key: %s", key);
4517 return -1;
4518 }
72d0e1cb
SG
4519 return 0;
4520}
8eb5694b 4521
74a3920a 4522static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4523{
4524 int i;
4525
0cf45501 4526 if (!conf->saved_nics)
7b35f3d6
SH
4527 return;
4528 for (i=0; i < conf->num_savednics; i++)
4529 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4530 free(conf->saved_nics);
4531}
4532
4184c3e1
SH
4533static inline void lxc_clear_aliens(struct lxc_conf *conf)
4534{
4535 struct lxc_list *it,*next;
4536
4537 lxc_list_for_each_safe(it, &conf->aliens, next) {
4538 lxc_list_del(it);
4539 free(it->elem);
4540 free(it);
4541 }
4542}
4543
f979ac15
SH
4544static inline void lxc_clear_includes(struct lxc_conf *conf)
4545{
4546 struct lxc_list *it,*next;
4547
4548 lxc_list_for_each_safe(it, &conf->includes, next) {
4549 lxc_list_del(it);
4550 free(it->elem);
4551 free(it);
4552 }
4553}
4554
8eb5694b
SH
4555void lxc_conf_free(struct lxc_conf *conf)
4556{
4557 if (!conf)
4558 return;
858377e4
SH
4559 if (current_config == conf)
4560 current_config = NULL;
f10fad2f
ME
4561 free(conf->console.log_path);
4562 free(conf->console.path);
4563 free(conf->rootfs.mount);
b3b8c97f 4564 free(conf->rootfs.bdev_type);
f10fad2f
ME
4565 free(conf->rootfs.options);
4566 free(conf->rootfs.path);
f10fad2f 4567 free(conf->logfile);
858377e4
SH
4568 if (conf->logfd != -1)
4569 close(conf->logfd);
f10fad2f
ME
4570 free(conf->utsname);
4571 free(conf->ttydir);
4572 free(conf->fstab);
4573 free(conf->rcfile);
4574 free(conf->init_cmd);
6b0d5538 4575 free(conf->unexpanded_config);
393903d1 4576 free(conf->pty_names);
76d0127f 4577 free(conf->syslog);
8eb5694b 4578 lxc_clear_config_network(conf);
f10fad2f
ME
4579 free(conf->lsm_aa_profile);
4580 free(conf->lsm_se_context);
769872f9 4581 lxc_seccomp_free(conf);
8eb5694b 4582 lxc_clear_config_caps(conf);
1fb86a7c 4583 lxc_clear_config_keepcaps(conf);
8eb5694b 4584 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4585 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4586 lxc_clear_mount_entries(conf);
7b35f3d6 4587 lxc_clear_saved_nics(conf);
27c27d73 4588 lxc_clear_idmaps(conf);
ee1e7aa0 4589 lxc_clear_groups(conf);
f979ac15 4590 lxc_clear_includes(conf);
761d81ca 4591 lxc_clear_aliens(conf);
ab799c0b 4592 lxc_clear_environment(conf);
c6d09e15 4593 lxc_clear_limits(conf, "lxc.limit");
8eb5694b
SH
4594 free(conf);
4595}
4355ab5f
SH
4596
/* Arguments handed to the function that runs in a new user namespace. */
struct userns_fn_data {
	/* callback to invoke, and the argument passed to it */
	int (*fn)(void *);
	void *arg;
	/* pipe fds: the reader uses p[0], the writer p[1] */
	int p[2];
};
4602
4603static int run_userns_fn(void *data)
4604{
4605 struct userns_fn_data *d = data;
4606 char c;
4607 // we're not sharing with the parent any more, if it was a thread
4608
4609 close(d->p[1]);
4610 if (read(d->p[0], &c, 1) != 1)
4611 return -1;
4612 close(d->p[0]);
4613 return d->fn(d->arg);
4614}
4615
4616/*
8b227008
TS
4617 * Add ID_TYPE_UID/ID_TYPE_GID entries to an existing lxc_conf,
4618 * if they are not already there.
4355ab5f 4619 */
8b227008
TS
4620static struct lxc_list *idmap_add_id(struct lxc_conf *conf,
4621 uid_t uid, gid_t gid)
4355ab5f 4622{
8b227008
TS
4623 int hostuid_mapped = mapped_hostid(uid, conf, ID_TYPE_UID);
4624 int hostgid_mapped = mapped_hostid(gid, conf, ID_TYPE_GID);
4355ab5f
SH
4625 struct lxc_list *new = NULL, *tmp, *it, *next;
4626 struct id_map *entry;
4627
3ec1648d
SH
4628 new = malloc(sizeof(*new));
4629 if (!new) {
4630 ERROR("Out of memory building id map");
4631 return NULL;
4632 }
4633 lxc_list_init(new);
4634
8b227008
TS
4635 if (hostuid_mapped < 0) {
4636 hostuid_mapped = find_unmapped_nsuid(conf, ID_TYPE_UID);
4637 if (hostuid_mapped < 0)
3ec1648d
SH
4638 goto err;
4639 tmp = malloc(sizeof(*tmp));
4640 if (!tmp)
4641 goto err;
4355ab5f
SH
4642 entry = malloc(sizeof(*entry));
4643 if (!entry) {
3ec1648d
SH
4644 free(tmp);
4645 goto err;
4355ab5f 4646 }
3ec1648d 4647 tmp->elem = entry;
4355ab5f 4648 entry->idtype = ID_TYPE_UID;
8b227008
TS
4649 entry->nsid = hostuid_mapped;
4650 entry->hostid = (unsigned long) uid;
4651 entry->range = 1;
4652 lxc_list_add_tail(new, tmp);
4653 }
4654 if (hostgid_mapped < 0) {
4655 hostgid_mapped = find_unmapped_nsuid(conf, ID_TYPE_GID);
4656 if (hostgid_mapped < 0)
4657 goto err;
4658 tmp = malloc(sizeof(*tmp));
4659 if (!tmp)
4660 goto err;
4661 entry = malloc(sizeof(*entry));
4662 if (!entry) {
4663 free(tmp);
4664 goto err;
4665 }
4666 tmp->elem = entry;
4667 entry->idtype = ID_TYPE_GID;
4668 entry->nsid = hostgid_mapped;
4669 entry->hostid = (unsigned long) gid;
4355ab5f 4670 entry->range = 1;
3ec1648d 4671 lxc_list_add_tail(new, tmp);
4355ab5f
SH
4672 }
4673 lxc_list_for_each_safe(it, &conf->id_map, next) {
4674 tmp = malloc(sizeof(*tmp));
4675 if (!tmp)
4676 goto err;
4677 entry = malloc(sizeof(*entry));
4678 if (!entry) {
4679 free(tmp);
4680 goto err;
4681 }
4682 memset(entry, 0, sizeof(*entry));
4683 memcpy(entry, it->elem, sizeof(*entry));
4684 tmp->elem = entry;
3ec1648d 4685 lxc_list_add_tail(new, tmp);
4355ab5f
SH
4686 }
4687
4688 return new;
4689
4690err:
8b227008 4691 ERROR("Out of memory building a new uid/gid map");
908fde6a
SH
4692 if (new)
4693 lxc_free_idmap(new);
c30ac545 4694 free(new);
4355ab5f
SH
4695 return NULL;
4696}
4697
4698/*
4699 * Run a function in a new user namespace.
8b227008 4700 * The caller's euid/egid will be mapped in if it is not already.
4355ab5f
SH
4701 */
4702int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data)
4703{
4704 int ret, pid;
4705 struct userns_fn_data d;
4706 char c = '1';
4707 int p[2];
4708 struct lxc_list *idmap;
4709
4355ab5f 4710 ret = pipe(p);
4355ab5f
SH
4711 if (ret < 0) {
4712 SYSERROR("opening pipe");
4713 return -1;
4714 }
4715 d.fn = fn;
4716 d.arg = data;
4717 d.p[0] = p[0];
4718 d.p[1] = p[1];
4719 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4720 if (pid < 0)
4721 goto err;
4355ab5f 4722 close(p[0]);
4355ab5f
SH
4723 p[0] = -1;
4724
8b227008
TS
4725 if ((idmap = idmap_add_id(conf, geteuid(), getegid())) == NULL) {
4726 ERROR("Error adding self to container uid/gid map");
4355ab5f
SH
4727 goto err;
4728 }
4729
4730 ret = lxc_map_ids(idmap, pid);
4731 lxc_free_idmap(idmap);
88dd66fc 4732 free(idmap);
565e571c 4733 if (ret) {
4355ab5f
SH
4734 ERROR("Error setting up child mappings");
4735 goto err;
4736 }
4737
4738 // kick the child
4739 if (write(p[1], &c, 1) != 1) {
4740 SYSERROR("writing to pipe to child");
4741 goto err;
4742 }
4743
3139aead
SG
4744 ret = wait_for_pid(pid);
4745
4746 close(p[1]);
4747 return ret;
4748
4355ab5f 4749err:
4355ab5f
SH
4750 if (p[0] != -1)
4751 close(p[0]);
4752 close(p[1]);
4355ab5f
SH
4753 return -1;
4754}
97e9cfa0 4755
/*
 * Return a malloc'ed copy of the login name belonging to the effective uid,
 * or NULL when the passwd lookup (or the copy) fails. The caller frees it.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4767
/*
 * Return a malloc'ed copy of the group name belonging to the effective gid,
 * or NULL when the group lookup (or the copy) fails. The caller frees it.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4779
a96a8e8c 4780/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4781void suggest_default_idmap(void)
4782{
4783 FILE *f;
4784 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4785 char *line = NULL;
4786 char *uname, *gname;
4787 size_t len = 0;
4788
4789 if (!(uname = getuname()))
4790 return;
4791
4792 if (!(gname = getgname())) {
4793 free(uname);
4794 return;
4795 }
4796
4797 f = fopen(subuidfile, "r");
4798 if (!f) {
4799 ERROR("Your system is not configured with subuids");
4800 free(gname);
4801 free(uname);
4802 return;
4803 }
4804 while (getline(&line, &len, f) != -1) {
b7930180 4805 size_t no_newline = 0;
97e9cfa0
SH
4806 char *p = strchr(line, ':'), *p2;
4807 if (*line == '#')
4808 continue;
4809 if (!p)
4810 continue;
4811 *p = '\0';
4812 p++;
4813 if (strcmp(line, uname))
4814 continue;
4815 p2 = strchr(p, ':');
4816 if (!p2)
4817 continue;
4818 *p2 = '\0';
4819 p2++;
4820 if (!*p2)
4821 continue;
b7930180
CB
4822 no_newline = strcspn(p2, "\n");
4823 p2[no_newline] = '\0';
4824
b7b2fde4
CB
4825 if (lxc_safe_uint(p, &uid) < 0)
4826 WARN("Could not parse UID.");
4827 if (lxc_safe_uint(p2, &urange) < 0)
4828 WARN("Could not parse UID range.");
97e9cfa0
SH
4829 }
4830 fclose(f);
4831
6be7389a 4832 f = fopen(subgidfile, "r");
97e9cfa0
SH
4833 if (!f) {
4834 ERROR("Your system is not configured with subgids");
4835 free(gname);
4836 free(uname);
4837 return;
4838 }
4839 while (getline(&line, &len, f) != -1) {
b7930180 4840 size_t no_newline = 0;
97e9cfa0
SH
4841 char *p = strchr(line, ':'), *p2;
4842 if (*line == '#')
4843 continue;
4844 if (!p)
4845 continue;
4846 *p = '\0';
4847 p++;
4848 if (strcmp(line, uname))
4849 continue;
4850 p2 = strchr(p, ':');
4851 if (!p2)
4852 continue;
4853 *p2 = '\0';
4854 p2++;
4855 if (!*p2)
4856 continue;
b7930180
CB
4857 no_newline = strcspn(p2, "\n");
4858 p2[no_newline] = '\0';
4859
b7b2fde4
CB
4860 if (lxc_safe_uint(p, &gid) < 0)
4861 WARN("Could not parse GID.");
4862 if (lxc_safe_uint(p2, &grange) < 0)
4863 WARN("Could not parse GID range.");
97e9cfa0
SH
4864 }
4865 fclose(f);
4866
f10fad2f 4867 free(line);
97e9cfa0
SH
4868
4869 if (!urange || !grange) {
4870 ERROR("You do not have subuids or subgids allocated");
4871 ERROR("Unprivileged containers require subuids and subgids");
4872 return;
4873 }
4874
4875 ERROR("You must either run as root, or define uid mappings");
4876 ERROR("To pass uid mappings to lxc-create, you could create");
4877 ERROR("~/.config/lxc/default.conf:");
4878 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4879 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4880 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4881
4882 free(gname);
4883 free(uname);
4884}
aaf26830 4885
a7307747
SH
4886static void free_cgroup_settings(struct lxc_list *result)
4887{
4888 struct lxc_list *iterator, *next;
4889
4890 lxc_list_for_each_safe(iterator, result, next) {
4891 lxc_list_del(iterator);
4892 free(iterator);
4893 }
4894 free(result);
4895}
4896
aaf26830
KT
4897/*
4898 * Return the list of cgroup_settings sorted according to the following rules
4899 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4900 */
4901struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4902{
4903 struct lxc_list *result;
4904 struct lxc_list *memsw_limit = NULL;
4905 struct lxc_list *it = NULL;
4906 struct lxc_cgroup *cg = NULL;
4907 struct lxc_list *item = NULL;
4908
4909 result = malloc(sizeof(*result));
fac7c663
KT
4910 if (!result) {
4911 ERROR("failed to allocate memory to sort cgroup settings");
4912 return NULL;
4913 }
aaf26830
KT
4914 lxc_list_init(result);
4915
4916 /*Iterate over the cgroup settings and copy them to the output list*/
4917 lxc_list_for_each(it, cgroup_settings) {
4918 item = malloc(sizeof(*item));
fac7c663
KT
4919 if (!item) {
4920 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 4921 free_cgroup_settings(result);
fac7c663
KT
4922 return NULL;
4923 }
aaf26830
KT
4924 item->elem = it->elem;
4925 cg = it->elem;
4926 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4927 /* Store the memsw_limit location */
4928 memsw_limit = item;
4929 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 4930 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
4931 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4932 item->elem = memsw_limit->elem;
4933 memsw_limit->elem = it->elem;
4934 }
4935 lxc_list_add_tail(result, item);
4936 }
4937
4938 return result;
a7307747 4939}
78625a5e
CB
4940
4941int lxc_clear_simple_config_item(struct lxc_conf *c, const char *key)
4942{
4943 if (strcmp(key, "lxc.utsname") == 0) {
4944 free(c->utsname);
4945 c->utsname = NULL;
4946 } else if (strcmp(key, "lxc.arch") == 0) {
4947 c->personality = -1;
4948 } else if (strcmp(key, "lxc.haltsignal") == 0) {
4949 c->haltsignal = 0;
4950 } else if (strcmp(key, "lxc.rebootsignal") == 0) {
4951 c->rebootsignal = 0;
4952 } else if (strcmp(key, "lxc.stopsignal") == 0) {
4953 c->stopsignal = 0;
4954 } else if (strcmp(key, "lxc.init_cmd") == 0) {
4955 free(c->init_cmd);
4956 c->init_cmd = NULL;
4957 } else if (strcmp(key, "lxc.init_uid") == 0) {
4958 c->init_uid = 0;
4959 } else if (strcmp(key, "lxc.init_gid") == 0) {
4960 c->init_gid = 0;
4961 } else if (strcmp(key, "lxc.ephemeral") == 0) {
4962 c->ephemeral = 0;
4963 } else if (strcmp(key, "lxc.console.logfile") == 0) {
4964 free(c->console.log_path);
4965 c->console.log_path = NULL;
4966 } else if (strcmp(key, "lxc.console") == 0) {
4967 free(c->console.path);
4968 c->console.path = NULL;
4969 } else if (strcmp(key, "lxc.tty") == 0) {
4970 c->tty = 0;
4971 } else if (strcmp(key, "lxc.devttydir") == 0) {
4972 free(c->ttydir);
4973 c->ttydir = NULL;
4974 } else if (strcmp(key, "lxc.autodev") == 0) {
4975 c->autodev = 1;
4976 } else if (strcmp(key, "lxc.kmsg") == 0) {
4977 c->kmsg = 0;
4978 } else if (strcmp(key, "lxc.mount") == 0) {
4979 free(c->fstab);
4980 c->fstab = NULL;
4981 } else if (strcmp(key, "lxc.rootfs") == 0) {
4982 free(c->rootfs.path);
4983 c->rootfs.path = NULL;
4984 } else if (strcmp(key, "lxc.rootfs.mount") == 0) {
4985 free(c->rootfs.mount);
4986 c->rootfs.mount = NULL;
4987 } else if (strcmp(key, "lxc.rootfs.options") == 0) {
4988 free(c->rootfs.options);
4989 c->rootfs.options = NULL;
4990 } else if (strcmp(key, "lxc.rootfs.backend") == 0) {
4991 free(c->rootfs.bdev_type);
4992 c->rootfs.bdev_type = NULL;
4993 } else if (strcmp(key, "lxc.aa_profile") == 0) {
4994 free(c->lsm_aa_profile);
4995 c->lsm_aa_profile = NULL;
4996 } else if (strcmp(key, "lxc.aa_allow_incomplete") == 0) {
4997 c->lsm_aa_allow_incomplete = 0;
4998 } else if (strcmp(key, "lxc.se_context") == 0) {
4999 free(c->lsm_se_context);
5000 c->lsm_se_context = NULL;
5001 } else if (strcmp(key, "lxc.seccomp") == 0) {
5002 free(c->seccomp);
5003 c->seccomp = NULL;
5004 } else if (strcmp(key, "lxc.loglevel") == 0) {
5005 c->loglevel = LXC_LOG_PRIORITY_NOTSET;
5006 } else if (strcmp(key, "lxc.logfile") == 0) {
5007 free(c->logfile);
5008 c->logfile = NULL;
5009 } else if (strcmp(key, "lxc.monitor.unshare") == 0) {
5010 c->monitor_unshare = 0;
5011 } else if (strcmp(key, "lxc.pts") == 0) {
5012 c->pts = 0;
5013 } else {
5014 return -1;
5015 }
5016
5017 return 0;
5018}