]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
lxccontainer: cleanup + bugfixes
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
8f3e280e 80#include "error.h"
1b09f2c0 81#include "log.h"
d8e48992 82#include "lxcaufs.h"
025ed0f3 83#include "lxclock.h"
8f3e280e
CB
84#include "lxcoverlay.h"
85#include "lxcseccomp.h"
4355ab5f 86#include "namespace.h"
8f3e280e
CB
87#include "network.h"
88#include "parse.h"
89#include "utils.h"
fe4de9a6 90#include "lsm/lsm.h"
d0a36f2c 91
e37dda71 92#if HAVE_LIBCAP
495d2046
SG
93#include <sys/capability.h>
94#endif
95
6ff05e18
SG
96#if HAVE_SYS_PERSONALITY_H
97#include <sys/personality.h>
98#endif
99
edaf8b1b
SG
100#if IS_BIONIC
101#include <../include/lxcmntent.h>
a04f5407
CB
102#ifndef HAVE_PRLIMIT
103#include <../include/prlimit.h>
104#endif
edaf8b1b
SG
105#else
106#include <mntent.h>
107#endif
108
36eb9bde 109lxc_log_define(lxc_conf, lxc);
e5bda9ee 110
e37dda71 111#if HAVE_LIBCAP
b09094da
MN
112#ifndef CAP_SETFCAP
113#define CAP_SETFCAP 31
114#endif
115
116#ifndef CAP_MAC_OVERRIDE
117#define CAP_MAC_OVERRIDE 32
118#endif
119
120#ifndef CAP_MAC_ADMIN
121#define CAP_MAC_ADMIN 33
122#endif
495d2046 123#endif
b09094da
MN
124
125#ifndef PR_CAPBSET_DROP
126#define PR_CAPBSET_DROP 24
127#endif
128
9818cae4
SG
129#ifndef LO_FLAGS_AUTOCLEAR
130#define LO_FLAGS_AUTOCLEAR 4
131#endif
132
bc5b27d6
DK
133#ifndef CAP_SETUID
134#define CAP_SETUID 7
135#endif
136
137#ifndef CAP_SETGID
138#define CAP_SETGID 6
139#endif
140
0769b82a
CS
141/* needed for cgroup automount checks, regardless of whether we
142 * have included linux/capability.h or not */
143#ifndef CAP_SYS_ADMIN
144#define CAP_SYS_ADMIN 21
145#endif
146
2d76d1d7
SG
/*
 * pivot_root(): use the C library's version when HAVE_PIVOT_ROOT is set,
 * otherwise provide a local wrapper around the raw syscall. When the syscall
 * number is unknown as well, the wrapper fails with errno set to ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
161
/*
 * sethostname(): local wrapper around the raw syscall for C libraries that
 * do not provide one (HAVE_SETHOSTNAME unset). Without a known syscall
 * number the wrapper fails with errno set to ENOSYS.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
174
ecec0126
SG
175#ifndef MS_PRIVATE
176#define MS_PRIVATE (1<<18)
177#endif
178
8912711c
CB
179#ifndef MS_LAZYTIME
180#define MS_LAZYTIME (1<<25)
181#endif
182
5ef5c9a3
CB
183/* memfd_create() */
184#ifndef MFD_CLOEXEC
185#define MFD_CLOEXEC 0x0001U
186#endif
187
188#ifndef MFD_ALLOW_SEALING
189#define MFD_ALLOW_SEALING 0x0002U
190#endif
191
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

/*
 * Fallback syscall numbers for memfd_create(), used only when the system
 * headers do not already define __NR_memfd_create.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/*
 * Local memfd_create() for C libraries that lack it: issues the raw syscall,
 * or fails with ENOSYS when no syscall number is known for this architecture.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifdef __NR_memfd_create
	return syscall(__NR_memfd_create, name, flags);
#else
	errno = ENOSYS;
	return -1;
#endif
}

#endif
235
72d0e1cb 236char *lxchook_names[NUM_LXC_HOOKS] = {
52492063 237 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 238
/*
 * Signature shared by the per-network-type callbacks stored in the
 * netdev_conf and netdev_deconf tables.
 */
typedef int (*instantiate_cb)(struct lxc_handler *handler,
			      struct lxc_netdev *netdev);
0ad19a3f 240
998ac676
RT
/* One entry of the mount option table (see mount_opt[]). */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* NOTE(review): presumably non-zero means "clear the flag" rather than set it - confirm against the option parser */
	int flag;	/* MS_* bit(s) the keyword refers to */
};
246
81810dd1
DL
/* One entry of the capability table: keyword plus its CAP_* number. */
struct caps_opt {
	char *name;	/* capability keyword, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
251
c6d09e15
WB
/* One entry of the resource limit table: keyword plus its RLIMIT_* number. */
struct limit_opt {
	char *name;	/* limit keyword, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
256
858377e4
SH
/*
 * Configuration of the container an API call is currently operating on.
 * It is consulted by the error reporting calls. The variable is
 * thread-local when the toolchain supports TLS (HAVE_TLS) and a plain
 * global otherwise.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
267
0769b82a
CS
268/* Declare this here, since we don't want to reshuffle the whole file. */
269static int in_caplist(int cap, struct lxc_list *caps);
270
a589434e
JN
271static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
272static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
273static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
274static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
275static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
276static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
277
278static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
279 [LXC_NET_VETH] = instantiate_veth,
280 [LXC_NET_MACVLAN] = instantiate_macvlan,
281 [LXC_NET_VLAN] = instantiate_vlan,
282 [LXC_NET_PHYS] = instantiate_phys,
283 [LXC_NET_EMPTY] = instantiate_empty,
284 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 285};
286
74a2b586
JK
287static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
288static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
289static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
290static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
291static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 292static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 293
a589434e 294static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
295 [LXC_NET_VETH] = shutdown_veth,
296 [LXC_NET_MACVLAN] = shutdown_macvlan,
297 [LXC_NET_VLAN] = shutdown_vlan,
298 [LXC_NET_PHYS] = shutdown_phys,
299 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 300 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
301};
302
998ac676 303static struct mount_opt mount_opt[] = {
470b359b
CB
304 { "async", 1, MS_SYNCHRONOUS },
305 { "atime", 1, MS_NOATIME },
306 { "bind", 0, MS_BIND },
88d413d5 307 { "defaults", 0, 0 },
88d413d5 308 { "dev", 1, MS_NODEV },
470b359b 309 { "diratime", 1, MS_NODIRATIME },
88d413d5 310 { "dirsync", 0, MS_DIRSYNC },
470b359b 311 { "exec", 1, MS_NOEXEC },
8912711c 312 { "lazytime", 0, MS_LAZYTIME },
88d413d5 313 { "mand", 0, MS_MANDLOCK },
88d413d5 314 { "noatime", 0, MS_NOATIME },
470b359b 315 { "nodev", 0, MS_NODEV },
88d413d5 316 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
317 { "noexec", 0, MS_NOEXEC },
318 { "nomand", 1, MS_MANDLOCK },
319 { "norelatime", 1, MS_RELATIME },
320 { "nostrictatime", 1, MS_STRICTATIME },
321 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
322 { "rbind", 0, MS_BIND|MS_REC },
323 { "relatime", 0, MS_RELATIME },
470b359b
CB
324 { "remount", 0, MS_REMOUNT },
325 { "ro", 0, MS_RDONLY },
326 { "rw", 1, MS_RDONLY },
88d413d5 327 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
328 { "suid", 1, MS_NOSUID },
329 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 330 { NULL, 0, 0 },
998ac676
RT
331};
332
e37dda71 333#if HAVE_LIBCAP
81810dd1 334static struct caps_opt caps_opt[] = {
a6afdde9 335 { "chown", CAP_CHOWN },
1e11be34
DL
336 { "dac_override", CAP_DAC_OVERRIDE },
337 { "dac_read_search", CAP_DAC_READ_SEARCH },
338 { "fowner", CAP_FOWNER },
339 { "fsetid", CAP_FSETID },
81810dd1
DL
340 { "kill", CAP_KILL },
341 { "setgid", CAP_SETGID },
342 { "setuid", CAP_SETUID },
343 { "setpcap", CAP_SETPCAP },
344 { "linux_immutable", CAP_LINUX_IMMUTABLE },
345 { "net_bind_service", CAP_NET_BIND_SERVICE },
346 { "net_broadcast", CAP_NET_BROADCAST },
347 { "net_admin", CAP_NET_ADMIN },
348 { "net_raw", CAP_NET_RAW },
349 { "ipc_lock", CAP_IPC_LOCK },
350 { "ipc_owner", CAP_IPC_OWNER },
351 { "sys_module", CAP_SYS_MODULE },
352 { "sys_rawio", CAP_SYS_RAWIO },
353 { "sys_chroot", CAP_SYS_CHROOT },
354 { "sys_ptrace", CAP_SYS_PTRACE },
355 { "sys_pacct", CAP_SYS_PACCT },
356 { "sys_admin", CAP_SYS_ADMIN },
357 { "sys_boot", CAP_SYS_BOOT },
358 { "sys_nice", CAP_SYS_NICE },
359 { "sys_resource", CAP_SYS_RESOURCE },
360 { "sys_time", CAP_SYS_TIME },
361 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
362 { "mknod", CAP_MKNOD },
363 { "lease", CAP_LEASE },
57b837e2
CB
364#ifdef CAP_AUDIT_READ
365 { "audit_read", CAP_AUDIT_READ },
366#endif
9527e566 367#ifdef CAP_AUDIT_WRITE
81810dd1 368 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
369#endif
370#ifdef CAP_AUDIT_CONTROL
81810dd1 371 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 372#endif
81810dd1
DL
373 { "setfcap", CAP_SETFCAP },
374 { "mac_override", CAP_MAC_OVERRIDE },
375 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
376#ifdef CAP_SYSLOG
377 { "syslog", CAP_SYSLOG },
378#endif
379#ifdef CAP_WAKE_ALARM
380 { "wake_alarm", CAP_WAKE_ALARM },
381#endif
2b54359b
CB
382#ifdef CAP_BLOCK_SUSPEND
383 { "block_suspend", CAP_BLOCK_SUSPEND },
384#endif
81810dd1 385};
495d2046
SG
386#else
387static struct caps_opt caps_opt[] = {};
388#endif
81810dd1 389
c6d09e15
WB
390static struct limit_opt limit_opt[] = {
391#ifdef RLIMIT_AS
392 { "as", RLIMIT_AS },
393#endif
394#ifdef RLIMIT_CORE
395 { "core", RLIMIT_CORE },
396#endif
397#ifdef RLIMIT_CPU
398 { "cpu", RLIMIT_CPU },
399#endif
400#ifdef RLIMIT_DATA
401 { "data", RLIMIT_DATA },
402#endif
403#ifdef RLIMIT_FSIZE
404 { "fsize", RLIMIT_FSIZE },
405#endif
406#ifdef RLIMIT_LOCKS
407 { "locks", RLIMIT_LOCKS },
408#endif
409#ifdef RLIMIT_MEMLOCK
410 { "memlock", RLIMIT_MEMLOCK },
411#endif
412#ifdef RLIMIT_MSGQUEUE
413 { "msgqueue", RLIMIT_MSGQUEUE },
414#endif
415#ifdef RLIMIT_NICE
416 { "nice", RLIMIT_NICE },
417#endif
418#ifdef RLIMIT_NOFILE
419 { "nofile", RLIMIT_NOFILE },
420#endif
421#ifdef RLIMIT_NPROC
422 { "nproc", RLIMIT_NPROC },
423#endif
424#ifdef RLIMIT_RSS
425 { "rss", RLIMIT_RSS },
426#endif
427#ifdef RLIMIT_RTPRIO
428 { "rtprio", RLIMIT_RTPRIO },
429#endif
430#ifdef RLIMIT_RTTIME
431 { "rttime", RLIMIT_RTTIME },
432#endif
433#ifdef RLIMIT_SIGPENDING
434 { "sigpending", RLIMIT_SIGPENDING },
435#endif
436#ifdef RLIMIT_STACK
437 { "stack", RLIMIT_STACK },
438#endif
439};
440
91c3830e
SH
441static int run_buffer(char *buffer)
442{
ebec9176 443 struct lxc_popen_FILE *f;
91c3830e 444 char *output;
8e7da691 445 int ret;
91c3830e 446
ebec9176 447 f = lxc_popen(buffer);
91c3830e 448 if (!f) {
062b72c6 449 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
450 return -1;
451 }
452
453 output = malloc(LXC_LOG_BUFFER_SIZE);
454 if (!output) {
062b72c6 455 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 456 lxc_pclose(f);
91c3830e
SH
457 return -1;
458 }
459
062b72c6
CB
460 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
461 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
462
463 free(output);
464
ebec9176 465 ret = lxc_pclose(f);
8e7da691 466 if (ret == -1) {
062b72c6 467 SYSERROR("Script exited with error.");
91c3830e 468 return -1;
8e7da691 469 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 470 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
471 return -1;
472 } else if (WIFSIGNALED(ret)) {
062b72c6 473 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 474 return -1;
91c3830e
SH
475 }
476
477 return 0;
478}
479
/*
 * run_script_argv: build the command line
 *     "<script> <name> <section> <hook> [args...]"
 * and execute it through run_buffer().
 *
 * @name:    container name, passed as first argument to the script
 * @section: config section, passed as second argument
 * @script:  script to execute
 * @hook:    hook name, passed as third argument
 * @lxcpath: accepted for interface compatibility; not used here
 * @argsin:  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * Returns the result of run_buffer(), or -1 if the command line cannot be
 * assembled.
 *
 * The command line is built on the heap: its length is driven by the
 * caller-supplied strings, so a stack allocation could overflow the stack
 * (and alloca() cannot report failure).
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret = -1;
	int len, i;
	char *buffer;
	size_t size = 0;

	(void)lxcpath;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* One byte per extra argument for the separating space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	/* snprintf() reports lengths as int; refuse anything larger. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	len = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (len < 0 || (size_t)len >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		size_t remaining = size - (size_t)len;
		int rc;

		rc = snprintf(buffer + len, remaining, " %s", argsin[i]);
		if (rc < 0 || (size_t)rc >= remaining) {
			ERROR("Script args too long.");
			goto out;
		}
		len += rc;
	}

	ret = run_buffer(buffer);

out:
	free(buffer);
	return ret;
}
530
062b72c6
CB
/*
 * run_script: build the command line "<script> <name> <section> [args...]"
 * and execute it through run_buffer().
 *
 * @name:    container name, passed as first argument to the script
 * @section: config section, passed as second argument
 * @script:  script to execute
 * @...:     extra arguments as a list of char * terminated by a NULL pointer
 *
 * Returns the result of run_buffer(), or -1 if the command line cannot be
 * assembled.
 *
 * The command line is built on the heap because its length depends on the
 * caller-supplied strings, and every va_start() is paired with a va_end()
 * on all paths, including the error ones.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret = -1;
	int len;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the arguments: compute the required length. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	/* snprintf() reports lengths as int; refuse anything larger. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	len = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (len < 0 || (size_t)len >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	/* Second pass: append each argument, separated by a space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t remaining = size - (size_t)len;
		int rc;

		rc = snprintf(buffer + len, remaining, " %s", p);
		if (rc < 0 || (size_t)rc >= remaining) {
			ERROR("Script args too long.");
			va_end(ap);
			goto out;
		}
		len += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);

out:
	free(buffer);
	return ret;
}
582
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep it
 * open for the duration of the container run, to prevent the container from
 * marking the underlying fs readonly on shutdown. The file is unlinked
 * immediately so that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path cannot
 *           be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || rootfs[0] == '\0')
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* Reject both output errors and truncation. */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open descriptor keeps the pin alive after the name is gone. */
	(void)unlink(absrootfspin);
	return fd;
}
625
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the nosuid/nodev/ro/noexec
 * restrictions already in force on the filesystem so the remount cannot
 * drop them.
 *
 * @s:     mount source; when NULL, @d is inspected instead
 * @d:     mount destination
 * @flags: requested mount flags
 *
 * Returns @flags unchanged when this is not a remount, when neither path is
 * given, when statvfs() fails or is unavailable; otherwise @flags with the
 * restrictions found on the filesystem added.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target = s ? s : d;
	struct statvfs vfs;
	unsigned long extra = 0;
	size_t k;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	if (target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	for (k = 0; k < sizeof(sticky) / sizeof(sticky[0]); k++) {
		if (vfs.f_flag & sticky[k])
			extra |= sticky[k];
	}

	return flags | extra;
#endif
}
662
4fb3cba5 663static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 664{
368bbc02 665 int r;
80e80c40 666 int i;
b06b8511
CS
667 static struct {
668 int match_mask;
669 int match_flag;
670 const char *source;
671 const char *destination;
672 const char *fstype;
673 unsigned long flags;
674 const char *options;
675 } default_mounts[] = {
676 /* Read-only bind-mounting... In older kernels, doing that required
677 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
678 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
679 * kernel 2.6.26 onwards. However, this apparently does not work on
680 * kernel 3.8. Unfortunately, on that very same kernel, doing the
681 * same trick as above doesn't seem to work either, there one needs
682 * to ALSO specify MS_BIND for the remount, otherwise the entire
683 * fs is remounted read-only or the mount fails because it's busy...
684 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
685 * 2.6.32...
368bbc02 686 */
f24a52d5 687 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
688 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
689 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
690 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
696 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
697 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
704 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 705 };
368bbc02 706
b06b8511
CS
707 for (i = 0; default_mounts[i].match_mask; i++) {
708 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
709 char *source = NULL;
710 char *destination = NULL;
711 int saved_errno;
e2a7e8dc 712 unsigned long mflags;
b06b8511
CS
713
714 if (default_mounts[i].source) {
715 /* will act like strdup if %r is not present */
8ede5f4c 716 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
717 if (!source) {
718 SYSERROR("memory allocation error");
719 return -1;
720 }
721 }
cc4fd506
SH
722 if (!default_mounts[i].destination) {
723 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 724 free(source);
cc4fd506
SH
725 return -1;
726 }
727 /* will act like strdup if %r is not present */
728 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
729 if (!destination) {
730 saved_errno = errno;
731 SYSERROR("memory allocation error");
732 free(source);
733 errno = saved_errno;
734 return -1;
b06b8511 735 }
e2a7e8dc
SH
736 mflags = add_required_remount_flags(source, destination,
737 default_mounts[i].flags);
592fd47a 738 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 739 saved_errno = errno;
b88ff9a0
SG
740 if (r < 0 && errno == ENOENT) {
741 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
742 r = 0;
743 }
744 else if (r < 0)
e2a7e8dc 745 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 746
b06b8511
CS
747 free(source);
748 free(destination);
749 if (r < 0) {
b06b8511
CS
750 errno = saved_errno;
751 return -1;
752 }
368bbc02 753 }
368bbc02
CS
754 }
755
b06b8511 756 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
757 int cg_flags;
758
759 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
760 /* If the type of cgroup mount was not specified, it depends on the
761 * container's capabilities as to what makes sense: if we have
762 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
763 * anyway, so we may as well default to read-write; then the admin
764 * will not be given a false sense of security. (And if they really
765 * want mixed r/o r/w, then they can explicitly specify :mixed.)
766 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
767 * :mixed, because then the container can't remount it read-write. */
768 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
769 int has_sys_admin = 0;
b0ee5983
CB
770
771 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 772 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 773 else
0769b82a 774 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
775
776 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 777 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 778 else
0769b82a 779 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
780 }
781
8ede5f4c 782 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 783 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 784 return -1;
368bbc02
CS
785 }
786 }
787
368bbc02 788 return 0;
368bbc02
CS
789}
790
/*
 * setup_utsname: set the hostname to utsname->nodename.
 *
 * Returns 0 when @utsname is NULL (nothing to do) or the hostname was set,
 * and -1 when sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
805
69aa6655
DE
/* A symlink to create under the container's /dev. */
struct dev_symlinks {
	const char *oldpath;	/* link target */
	const char *name;	/* link name, relative to /dev */
};

/* Standard /dev links pointing into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
817
818static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
819{
820 char path[MAXPATHLEN];
821 int ret,i;
09227be2 822 struct stat s;
69aa6655
DE
823
824
825 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
826 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 827 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
828 if (ret < 0 || ret >= MAXPATHLEN)
829 return -1;
09227be2
MW
830
831 /*
832 * Stat the path first. If we don't get an error
833 * accept it as is and don't try to create it
834 */
835 if (!stat(path, &s)) {
836 continue;
837 }
838
69aa6655 839 ret = symlink(d->oldpath, path);
09227be2 840
69aa6655 841 if (ret && errno != EEXIST) {
09227be2
MW
842 if ( errno == EROFS ) {
843 WARN("Warning: Read Only file system while creating %s", path);
844 } else {
845 SYSERROR("Error creating %s", path);
846 return -1;
847 }
69aa6655
DE
848 }
849 }
850 return 0;
851}
852
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) the string "container_ttys=<name>" is
 * allocated; on later calls " <name>" is appended to the existing string.
 * Returns false when memory cannot be obtained, in which case *pp is left
 * as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	size_t name_len = strlen(name);
	size_t old_len;
	char *buf;

	if (*pp == NULL) {
		size_t total = name_len + strlen("container_ttys=") + 1;

		buf = malloc(total);
		if (buf == NULL)
			return false;

		snprintf(buf, total, "container_ttys=%s", name);
		*pp = buf;
		return true;
	}

	/* Room for the old text, a space, the new name and the NUL. */
	old_len = strlen(*pp);
	buf = realloc(*pp, old_len + name_len + 2);
	if (buf == NULL)
		return false;

	buf[old_len] = ' ';
	memcpy(buf + old_len + 1, name, name_len + 1);
	*pp = buf;
	return true;
}
875
9e1045e3 876static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 877{
9e1045e3 878 int i, ret;
393903d1
SH
879 const struct lxc_tty_info *tty_info = &conf->tty_info;
880 char *ttydir = conf->ttydir;
7c6ef2a2 881 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 882
e8bd4e43 883 if (!conf->rootfs.path)
bc9bd0e3
DL
884 return 0;
885
b0a33c1e 886 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 887 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
888
e8bd4e43 889 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 890 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
891 ERROR("pathname too long for ttys");
892 return -1;
893 }
9e1045e3 894
7c6ef2a2
SH
895 if (ttydir) {
896 /* create dev/lxc/tty%d" */
9e1045e3
CB
897 ret = snprintf(lxcpath, sizeof(lxcpath),
898 "/dev/%s/tty%d", ttydir, i + 1);
899 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
900 ERROR("pathname too long for ttys");
901 return -1;
902 }
9e1045e3 903
7c6ef2a2 904 ret = creat(lxcpath, 0660);
9e1045e3
CB
905 if (ret < 0 && errno != EEXIST) {
906 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
907 return -1;
908 }
4d44e274
SH
909 if (ret >= 0)
910 close(ret);
9e1045e3 911
7c6ef2a2 912 ret = unlink(path);
9e1045e3
CB
913 if (ret < 0 && errno != ENOENT) {
914 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
915 return -1;
916 }
b0a33c1e 917
9e1045e3
CB
918 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
919 if (ret < 0) {
920 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
921 pty_info->name, path);
922 continue;
923 }
9e1045e3
CB
924 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
925 path);
13954cce 926
9e1045e3
CB
927 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
928 ttydir, i + 1);
929 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
930 ERROR("tty pathname too long");
931 return -1;
932 }
9e1045e3 933
7c6ef2a2 934 ret = symlink(lxcpath, path);
9e1045e3
CB
935 if (ret < 0) {
936 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
937 path, lxcpath);
7c6ef2a2
SH
938 return -1;
939 }
940 } else {
9e1045e3
CB
941 /* If we populated /dev, then we need to create
942 * /dev/ttyN
943 */
944 ret = access(path, F_OK);
945 if (ret < 0) {
c6883f38 946 ret = creat(path, 0660);
9e1045e3
CB
947 if (ret < 0) {
948 SYSERROR("failed to create \"%s\"", path);
c6883f38 949 /* this isn't fatal, continue */
025ed0f3 950 } else {
c6883f38 951 close(ret);
025ed0f3 952 }
c6883f38 953 }
9e1045e3
CB
954
955 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
956 if (ret < 0) {
e8bd4e43 957 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
958 continue;
959 }
9e1045e3
CB
960
961 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
962 path);
393903d1 963 }
9e1045e3 964
e8bd4e43 965 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
966 ERROR("Error setting up container_ttys string");
967 return -1;
b0a33c1e 968 }
969 }
970
9e1045e3 971 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 972 return 0;
973}
974
59bb8698 975static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 976{
2d489f9e 977 int oldroot = -1, newroot = -1;
bf601689 978
2d489f9e
SH
979 oldroot = open("/", O_DIRECTORY | O_RDONLY);
980 if (oldroot < 0) {
981 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
982 return -1;
983 }
2d489f9e
SH
984 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
985 if (newroot < 0) {
986 SYSERROR("Error opening new-/ for fchdir");
987 goto fail;
c08556c6 988 }
bf601689 989
cc6f6dd7 990 /* change into new root fs */
2d489f9e 991 if (fchdir(newroot)) {
cc6f6dd7 992 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 993 goto fail;
cc6f6dd7
DL
994 }
995
cc6f6dd7 996 /* pivot_root into our new root fs */
2d489f9e 997 if (pivot_root(".", ".")) {
cc6f6dd7 998 SYSERROR("pivot_root syscall failed");
2d489f9e 999 goto fail;
bf601689 1000 }
cc6f6dd7 1001
2d489f9e
SH
1002 /*
1003 * at this point the old-root is mounted on top of our new-root
1004 * To unmounted it we must not be chdir'd into it, so escape back
1005 * to old-root
1006 */
1007 if (fchdir(oldroot) < 0) {
1008 SYSERROR("Error entering oldroot");
1009 goto fail;
1010 }
7981ea46 1011 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1012 SYSERROR("Error detaching old root");
1013 goto fail;
cc6f6dd7
DL
1014 }
1015
2d489f9e
SH
1016 if (fchdir(newroot) < 0) {
1017 SYSERROR("Error re-entering newroot");
1018 goto fail;
1019 }
cc6f6dd7 1020
2d489f9e
SH
1021 close(oldroot);
1022 close(newroot);
bf601689 1023
2d489f9e 1024 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1025
bf601689 1026 return 0;
2d489f9e
SH
1027
1028fail:
1029 if (oldroot != -1)
1030 close(oldroot);
1031 if (newroot != -1)
1032 close(newroot);
1033 return -1;
bf601689
MH
1034}
1035
bc6928ff 1036/*
87da4ec3
SH
1037 * Just create a path for /dev under $lxcpath/$name and in rootfs
1038 * If we hit an error, log it but don't fail yet.
91c3830e 1039 */
14221cbb 1040static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1041{
1042 int ret;
87da4ec3
SH
1043 size_t clen;
1044 char *path;
91c3830e 1045
14221cbb 1046 INFO("Mounting container /dev");
bc6928ff 1047
14221cbb 1048 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1049 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1050 path = alloca(clen);
bc6928ff 1051
ec50007f 1052 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1053 if (ret < 0 || ret >= clen)
91c3830e 1054 return -1;
bc6928ff 1055
87da4ec3 1056 if (!dir_exists(path)) {
14221cbb 1057 WARN("No /dev in container.");
87da4ec3
SH
1058 WARN("Proceeding without autodev setup");
1059 return 0;
bc6928ff 1060 }
87da4ec3 1061
1ec0e8e3 1062 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1063 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1064 if (ret != 0) {
87da4ec3 1065 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1066 return -1;
91c3830e 1067 }
87da4ec3
SH
1068
1069 INFO("Mounted tmpfs onto %s", path);
1070
ec50007f 1071 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1072 if (ret < 0 || ret >= clen)
91c3830e 1073 return -1;
87da4ec3 1074
bc6928ff
MW
1075 /*
1076 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1077 * If not, then create it and exit if that fails...
1078 */
87da4ec3 1079 if (!dir_exists(path)) {
bc6928ff
MW
1080 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1081 if (ret) {
1082 SYSERROR("Failed to create /dev/pts in container");
1083 return -1;
1084 }
91c3830e
SH
1085 }
1086
14221cbb 1087 INFO("Mounted container /dev");
91c3830e
SH
1088 return 0;
1089}
1090
/* Description of one device node: its name, mode and device numbers. */
struct lxc_devs {
	const char *name;	/* file name of the node, e.g. "null" */
	mode_t mode;		/* file type and permission bits */
	int maj;		/* major device number */
	int min;		/* minor device number */
};
1097
74a3920a 1098static const struct lxc_devs lxc_devs[] = {
c6883f38
SH
1099 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1100 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1101 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1102 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1103 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1104 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
c6883f38
SH
1105};
1106
27245ff7 1107static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1108{
1109 int ret;
c6883f38
SH
1110 char path[MAXPATHLEN];
1111 int i;
3a32201c 1112 mode_t cmask;
c6883f38 1113
ec50007f 1114 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1115 if (ret < 0 || ret >= MAXPATHLEN) {
1116 ERROR("Error calculating container /dev location");
c6883f38 1117 return -1;
f7bee6c6 1118 }
91c3830e 1119
0bbf8572
CB
1120 /* ignore, just don't try to fill in */
1121 if (!dir_exists(path))
9cb4d183
SH
1122 return 0;
1123
0bbf8572 1124 INFO("populating container /dev");
3a32201c 1125 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1126 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1127 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1128
ec50007f 1129 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1130 if (ret < 0 || ret >= MAXPATHLEN)
1131 return -1;
0bbf8572 1132
c6883f38 1133 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1134 if (ret < 0) {
9cb4d183
SH
1135 char hostpath[MAXPATHLEN];
1136 FILE *pathfile;
1137
0bbf8572
CB
1138 if (errno == EEXIST) {
1139 DEBUG("\"%s\" device already existed", path);
1140 continue;
1141 }
1142
1143 /* Unprivileged containers cannot create devices, so
1144 * bind mount the device from the host.
1145 */
9cb4d183
SH
1146 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1147 if (ret < 0 || ret >= MAXPATHLEN)
1148 return -1;
1149 pathfile = fopen(path, "wb");
1150 if (!pathfile) {
1151 SYSERROR("Failed to create device mount target '%s'", path);
1152 return -1;
1153 }
1154 fclose(pathfile);
0bbf8572
CB
1155 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1156 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1157 return -1;
1158 }
0bbf8572
CB
1159 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1160 } else {
1161 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1162 }
1163 }
3a32201c 1164 umask(cmask);
c6883f38 1165
0bbf8572 1166 INFO("populated container /dev");
c6883f38
SH
1167 return 0;
1168}
1169
9aa76a17 1170static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1171{
9aa76a17 1172 int ret;
91c3e281
CB
1173 struct bdev *bdev;
1174 const struct lxc_rootfs *rootfs;
cc28d0b0 1175
91c3e281 1176 rootfs = &conf->rootfs;
a0f379bf 1177 if (!rootfs->path) {
91c3e281
CB
1178 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1179 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1180 return -1;
1181 }
c69bd12f 1182 return 0;
a0f379bf 1183 }
0ad19a3f 1184
12297168 1185 if (access(rootfs->mount, F_OK)) {
91c3e281 1186 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1187 rootfs->mount);
b1789442
DL
1188 return -1;
1189 }
1190
91c3e281 1191 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1192 if (!bdev) {
1193 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1194 rootfs->path, rootfs->mount,
1195 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1196 return -1;
9be53773 1197 }
9aa76a17
CB
1198
1199 ret = bdev->ops->mount(bdev);
1200 bdev_put(bdev);
1201 if (ret < 0) {
91c3e281
CB
1202 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1203 rootfs->path, rootfs->mount,
1204 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1205 return -1;
1206 }
0ad19a3f 1207
91c3e281
CB
1208 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1209 rootfs->path, rootfs->mount,
1210 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1211
ac778708
DL
1212 return 0;
1213}
1214
91e93c71
AV
1215int prepare_ramfs_root(char *root)
1216{
eab15c1e 1217 char buf[LXC_LINELEN], *p;
91e93c71
AV
1218 char nroot[PATH_MAX];
1219 FILE *f;
1220 int i;
1221 char *p2;
1222
1223 if (realpath(root, nroot) == NULL)
39c7b795 1224 return -errno;
91e93c71
AV
1225
1226 if (chdir("/") == -1)
39c7b795 1227 return -errno;
91e93c71
AV
1228
1229 /*
1230 * We could use here MS_MOVE, but in userns this mount is
1231 * locked and can't be moved.
1232 */
39c7b795 1233 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1234 SYSERROR("Failed to move %s into /", root);
39c7b795 1235 return -errno;
91e93c71
AV
1236 }
1237
39c7b795 1238 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1239 SYSERROR("Failed to make . rprivate");
39c7b795 1240 return -errno;
91e93c71
AV
1241 }
1242
1243 /*
1244 * The following code cleans up inhereted mounts which are not
1245 * required for CT.
1246 *
1247 * The mountinfo file shows not all mounts, if a few points have been
1248 * unmounted between read operations from the mountinfo. So we need to
1249 * read mountinfo a few times.
1250 *
1251 * This loop can be skipped if a container uses unserns, because all
1252 * inherited mounts are locked and we should live with all this trash.
1253 */
1254 while (1) {
1255 int progress = 0;
1256
1257 f = fopen("./proc/self/mountinfo", "r");
1258 if (!f) {
1259 SYSERROR("Unable to open /proc/self/mountinfo");
1260 return -1;
1261 }
eab15c1e 1262 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1263 for (p = buf, i=0; p && i < 4; i++)
1264 p = strchr(p+1, ' ');
1265 if (!p)
1266 continue;
1267 p2 = strchr(p+1, ' ');
1268 if (!p2)
1269 continue;
1270
1271 *p2 = '\0';
1272 *p = '.';
1273
1274 if (strcmp(p + 1, "/") == 0)
1275 continue;
1276 if (strcmp(p + 1, "/proc") == 0)
1277 continue;
1278
1279 if (umount2(p, MNT_DETACH) == 0)
1280 progress++;
1281 }
1282 fclose(f);
1283 if (!progress)
1284 break;
1285 }
1286
8bea9fae
PR
1287 /* This also can be skipped if a container uses unserns */
1288 umount2("./proc", MNT_DETACH);
91e93c71
AV
1289
1290 /* It is weird, but chdir("..") moves us in a new root */
1291 if (chdir("..") == -1) {
1292 SYSERROR("Unable to change working directory");
1293 return -1;
1294 }
1295
1296 if (chroot(".") == -1) {
1297 SYSERROR("Unable to chroot");
1298 return -1;
1299 }
1300
1301 return 0;
1302}
1303
74a3920a 1304static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1305{
39c7b795
CB
1306 if (!rootfs->path) {
1307 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1308 return 0;
39c7b795 1309 }
ac778708 1310
91e93c71 1311 if (detect_ramfs_rootfs()) {
39c7b795
CB
1312 DEBUG("detected that container is on ramfs");
1313 if (prepare_ramfs_root(rootfs->mount)) {
1314 ERROR("failed to prepare minimal ramfs root");
91e93c71 1315 return -1;
39c7b795
CB
1316 }
1317
1318 DEBUG("prepared ramfs root for container");
1319 return 0;
1320 }
1321
1322 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1323 ERROR("failed to pivot root");
25368b52 1324 return -1;
c69bd12f
DL
1325 }
1326
39c7b795 1327 DEBUG("finished pivot root");
25368b52 1328 return 0;
0ad19a3f 1329}
1330
/*
 * Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to its
 * ptmx node.
 *
 * /dev/ptmx is preferably a bind mount of /dev/pts/ptmx; if the bind mount
 * fails, a symlink to /dev/pts/ptmx is created instead. Nothing is done when
 * @num_pts is 0.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	static const char mntopts[] = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (!num_pts) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Unmount a devpts instance that is already present. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", mntopts);

	/* Get rid of whatever currently sits at /dev/ptmx. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file serves as the bind mount target. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* The bind mount failed: fall through to the symlink fallback. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to go before the symlink can take its place. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1415
/*
 * Apply the execution domain @persona with personality().
 *
 * A value of -1 means that no personality was requested. When the build has
 * no <sys/personality.h> support (HAVE_SYS_PERSONALITY_H evaluates to 0) the
 * function does nothing.
 *
 * Returns 0 on success and -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1432
3d7d929a
CB
1433static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1434 const struct lxc_console *console)
6e590161 1435{
63376d7d 1436 char path[MAXPATHLEN];
0728ebf4 1437 int ret, fd;
52e35957 1438
8b1b1210
CB
1439 if (console->path && !strcmp(console->path, "none"))
1440 return 0;
1441
7c6ef2a2 1442 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1443 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1444 return -1;
52e35957 1445
8b1b1210
CB
1446 /* When we are asked to setup a console we remove any previous
1447 * /dev/console bind-mounts.
1448 */
a7ba3c7f
CB
1449 if (file_exists(path)) {
1450 ret = lxc_unstack_mountpoint(path, false);
1451 if (ret < 0) {
8b1b1210 1452 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1453 return -ret;
1454 } else {
1455 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1456 }
1457 ret = unlink(path);
1458 if (ret < 0) {
1459 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1460 return -errno;
1461 }
8b1b1210
CB
1462 }
1463
1464 /* For unprivileged containers autodev or automounts will already have
1465 * taken care of creating /dev/console.
1466 */
0728ebf4
TA
1467 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1468 if (fd < 0) {
1469 if (errno != EEXIST) {
1470 SYSERROR("failed to create console");
3d7d929a 1471 return -errno;
0728ebf4
TA
1472 }
1473 } else {
1474 close(fd);
52e35957
DL
1475 }
1476
0728ebf4 1477 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1478 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1479 return -errno;
63376d7d 1480 }
13954cce 1481
3d7d929a 1482 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1483 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1484 return -1;
1485 }
1486
3d7d929a 1487 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1488 return 0;
1489}
1490
3d7d929a
CB
1491static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1492 const struct lxc_console *console,
1493 char *ttydir)
7c6ef2a2 1494{
7c6ef2a2 1495 int ret;
3d7d929a 1496 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1497
1498 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1499 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1500 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1501 return -1;
3d7d929a 1502
7c6ef2a2
SH
1503 ret = mkdir(path, 0755);
1504 if (ret && errno != EEXIST) {
959aee9c 1505 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1506 return -errno;
7c6ef2a2 1507 }
3d7d929a 1508 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1509
3d7d929a
CB
1510 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1511 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1512 return -1;
1513
7c6ef2a2 1514 ret = creat(lxcpath, 0660);
3d7d929a 1515 if (ret == -1 && errno != EEXIST) {
959aee9c 1516 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1517 return -errno;
7c6ef2a2 1518 }
4d44e274
SH
1519 if (ret >= 0)
1520 close(ret);
7c6ef2a2 1521
2a12fefd
CB
1522 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1523 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1524 return -1;
2a12fefd
CB
1525
1526 /* When we are asked to setup a console we remove any previous
1527 * /dev/console bind-mounts.
1528 */
1529 if (console->path && !strcmp(console->path, "none")) {
1530 struct stat st;
1531 ret = stat(path, &st);
1532 if (ret < 0) {
1533 if (errno == ENOENT)
1534 return 0;
1535 SYSERROR("failed stat() \"%s\"", path);
1536 return -errno;
1537 }
1538
1539 /* /dev/console must be character device with major number 5 and
1540 * minor number 1. If not, give benefit of the doubt and assume
1541 * the user has mounted something else right there on purpose.
1542 */
1543 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1544 return 0;
1545
1546 /* In case the user requested a bind-mount for /dev/console and
1547 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1548 * /dev/<ttydir/console.
1549 * Note, we only move the uppermost mount and clear all other
1550 * mounts underneath for safety.
1551 * If it is a character device created via mknod() we simply
1552 * rename it.
2a12fefd
CB
1553 */
1554 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1555 if (ret < 0) {
1556 if (errno != EINVAL) {
1557 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1558 return -errno;
1559 }
1560 /* path was not a mountpoint */
1561 ret = rename(path, lxcpath);
1562 if (ret < 0) {
1563 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1564 return -errno;
1565 }
1566 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1567 } else {
1568 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1569 }
a7ba3c7f
CB
1570
1571 /* Clear all remaining bind-mounts. */
1572 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1573 if (ret < 0) {
a7ba3c7f
CB
1574 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1575 return -ret;
1576 } else {
1577 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1578 }
1579 } else {
1580 if (file_exists(path)) {
1581 ret = lxc_unstack_mountpoint(path, false);
1582 if (ret < 0) {
2a12fefd 1583 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1584 return -ret;
1585 } else {
1586 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1587 }
2a12fefd
CB
1588 }
1589
1590 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1591 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1592 return -1;
1593 }
1594 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1595 }
1596
2a12fefd 1597 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1598 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1599 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1600 return -1;
3d7d929a 1601
2a12fefd
CB
1602 ret = unlink(path);
1603 if (ret && errno != ENOENT) {
1604 SYSERROR("error unlinking %s", path);
1605 return -errno;
1606 }
1607
7c6ef2a2 1608 ret = symlink(lxcpath, path);
3d7d929a
CB
1609 if (ret < 0) {
1610 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1611 return -1;
1612 }
1613
3d7d929a 1614 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1615 return 0;
1616}
1617
3d7d929a
CB
1618static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1619 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1620{
3d7d929a
CB
1621 /* We don't have a rootfs, /dev/console will be shared. */
1622 if (!rootfs->path) {
1623 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1624 return 0;
3d7d929a
CB
1625 }
1626
7c6ef2a2 1627 if (!ttydir)
3d7d929a 1628 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1629
3d7d929a 1630 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1631}
1632
1bd051a6
SH
1633static int setup_kmsg(const struct lxc_rootfs *rootfs,
1634 const struct lxc_console *console)
1635{
1636 char kpath[MAXPATHLEN];
1637 int ret;
1638
222fea5a
DE
1639 if (!rootfs->path)
1640 return 0;
1bd051a6
SH
1641 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1642 if (ret < 0 || ret >= sizeof(kpath))
1643 return -1;
1644
1645 ret = unlink(kpath);
1646 if (ret && errno != ENOENT) {
959aee9c 1647 SYSERROR("error unlinking %s", kpath);
1bd051a6
SH
1648 return -1;
1649 }
1650
1651 ret = symlink("console", kpath);
1652 if (ret) {
1653 SYSERROR("failed to create symlink for kmsg");
1654 return -1;
1655 }
1656
1657 return 0;
1658}
1659
998ac676
RT
1660static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1661{
1662 struct mount_opt *mo;
1663
1664 /* If opt is found in mount_opt, set or clear flags.
1665 * Otherwise append it to data. */
1666
1667 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1668 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1669 if (mo->clear)
1670 *flags &= ~mo->flag;
1671 else
1672 *flags |= mo->flag;
1673 return;
1674 }
1675 }
1676
1677 if (strlen(*data))
1678 strcat(*data, ",");
1679 strcat(*data, opt);
1680}
1681
/*
 * Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags receives the flags found by parse_mntopt(). *@mntdata receives a
 * newly allocated string holding the remaining options, or NULL if there are
 * none; a non-NULL result has to be freed by the caller.
 *
 * Returns 0 on success (also when @mntopts is NULL) and -1 if memory could
 * not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the option string. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] == '\0')
		free(extra);
	else
		*mntdata = extra;
	free(copy);

	return 0;
}
1720
/*
 * Terminate @word at its first space or tab. A string without either is left
 * unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1727
/*
 * Skip @nfields space or tab separated fields in @src.
 *
 * Returns a pointer to the character following the @nfields-th separator, or
 * a pointer to the terminating '\0' if @src has fewer separators than that.
 * Every single separator ends a field; runs of separators are not merged.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1745
/*
 * Mount @fsname on @target and, for bind or remount requests, remount it
 * afterwards so that the requested flags are applied.
 *
 * The first mount is done through safe_mount() with MS_REMOUNT masked out.
 * When statvfs() support is compiled in, the nosuid, nodev (unless @dev is
 * set), ro and noexec flags reported for @fsname are added to the remount
 * flags; a pure bind mount that needs none of them and is not read-only
 * skips the remount.
 *
 * Returns 0 on success, and also when a mount fails while @optional is set;
 * returns -1 on failure otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
	unsigned long rqd_flags;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and explicit remounts need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto done;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rqd_flags = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		if (sb.f_flag & MS_NOSUID)
			required_flags |= MS_NOSUID;
		if ((sb.f_flag & MS_NODEV) && !dev)
			required_flags |= MS_NODEV;
		if (sb.f_flag & MS_RDONLY)
			required_flags |= MS_RDONLY;
		if (sb.f_flag & MS_NOEXEC)
			required_flags |= MS_NOEXEC;
		DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

		/*
		 * If this was a bind mount request, and required_flags
		 * does not have any flags which are not already in
		 * mountflags, then skip the remount.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("mountflags already was %lu, skipping remount",
			      mountflags);
			goto done;
		}

		mountflags |= required_flags;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

done:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1822
/*
 * Remove 'create=dir', 'create=file' and 'optional' from mntent->mnt_opts.
 *
 * The option string is edited in place. For each of the three names only the
 * first occurrence is removed. When the removed option was the last one in
 * the string, the string is cut at the start of that option, so a comma in
 * front of it is kept.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *next;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		next = strchr(hit, ',');
		if (next)
			/* Pull the remaining options over the removed one. */
			memmove(hit, next + 1, strlen(next + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1847
/*
 * Create the mount target @path if the mount entry asks for it.
 *
 * For "overlay" and "aufs" entries the respective helper is called first.
 * "create=dir" creates @path as a directory. "create=file" creates the parent
 * directory of @path and then @path itself as an empty file, but only if
 * @path does not exist yet.
 *
 * Returns 0 on success and -1 if a requested target could not be created.
 * Failing to create the parent directory for "create=file" is only logged.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		char *pathcopy;
		FILE *pathfile;

		/*
		 * dirname() may modify its argument and may return a pointer
		 * that is not the one passed in, so work on a copy and free
		 * the copy itself rather than dirname()'s return value.
		 */
		pathcopy = strdup(path);
		if (!pathcopy) {
			WARN("Failed to create target directory");
		} else {
			if (mkdir_p(dirname(pathcopy), 0755) < 0)
				WARN("Failed to create target directory");
			free(pathcopy);
		}

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1888
ec50007f
CB
1889/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1890 * without a rootfs. */
db4aba38 1891static inline int mount_entry_on_generic(struct mntent *mntent,
0a2dddd4
CB
1892 const char* path, const struct lxc_rootfs *rootfs,
1893 const char *lxc_name, const char *lxc_path)
4d5b72a1
NC
1894{
1895 unsigned long mntflags;
1896 char *mntdata;
1897 int ret;
1898 bool optional = hasmntopt(mntent, "optional") != NULL;
ae7a770e 1899 bool dev = hasmntopt(mntent, "dev") != NULL;
4d5b72a1 1900
ec50007f
CB
1901 char *rootfs_path = NULL;
1902 if (rootfs && rootfs->path)
1903 rootfs_path = rootfs->mount;
1904
0a2dddd4 1905 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
34cfffb3 1906
608e3567
SH
1907 if (ret < 0)
1908 return optional ? 0 : -1;
1909
4e4ca161
SH
1910 cull_mntent_opt(mntent);
1911
a17b1e65
SG
1912 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1913 free(mntdata);
1914 return -1;
1915 }
1916
6e46cc0d 1917 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1918 mntdata, optional, dev, rootfs_path);
68c152ef 1919
911324ef 1920 free(mntdata);
911324ef
DL
1921 return ret;
1922}
1923
/*
 * Mount @mntent for a container that has no rootfs.
 *
 * All mount targets are treated as absolute paths starting at / on the host,
 * so a target that does not begin with '/' gets one prepended.
 *
 * Returns -1 if the path is too long, otherwise the result of
 * mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1943
4e4ca161 1944static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 1945 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
1946 const char *lxc_name,
1947 const char *lxc_path)
911324ef 1948{
013bd428 1949 char *aux;
59760f5d 1950 char path[MAXPATHLEN];
80a881b2 1951 int r, ret = 0, offset;
67e571de 1952 const char *lxcpath;
0ad19a3f 1953
593e8478 1954 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
1955 if (!lxcpath) {
1956 ERROR("Out of memory");
1957 return -1;
1958 }
1959
80a881b2 1960 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
1961 * use $lxcpath/CN/rootfs as the target prefix */
1962 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
1963 if (r < 0 || r >= MAXPATHLEN)
1964 goto skipvarlib;
1965
1966 aux = strstr(mntent->mnt_dir, path);
1967 if (aux) {
1968 offset = strlen(path);
1969 goto skipabs;
1970 }
1971
1972skipvarlib:
013bd428
DL
1973 aux = strstr(mntent->mnt_dir, rootfs->path);
1974 if (!aux) {
1975 WARN("ignoring mount point '%s'", mntent->mnt_dir);
db4aba38 1976 return ret;
013bd428 1977 }
80a881b2
SH
1978 offset = strlen(rootfs->path);
1979
1980skipabs:
013bd428 1981
9ba8130c 1982 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
1983 aux + offset);
1984 if (r < 0 || r >= MAXPATHLEN) {
1985 WARN("pathnme too long for '%s'", mntent->mnt_dir);
a17b1e65
SG
1986 return -1;
1987 }
1988
0a2dddd4 1989 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 1990}
d330fe7b 1991
4e4ca161 1992static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
1993 const struct lxc_rootfs *rootfs,
1994 const char *lxc_name,
1995 const char *lxc_path)
911324ef
DL
1996{
1997 char path[MAXPATHLEN];
911324ef 1998 int ret;
d330fe7b 1999
34cfffb3 2000 /* relative to root mount point */
6e46cc0d 2001 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 2002 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
2003 ERROR("path name too long");
2004 return -1;
2005 }
911324ef 2006
0a2dddd4 2007 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
2008}
2009
80a881b2 2010static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
0a2dddd4 2011 const char *lxc_name, const char *lxc_path)
911324ef 2012{
aaf901be
AM
2013 struct mntent mntent;
2014 char buf[4096];
911324ef 2015 int ret = -1;
e76b8764 2016
aaf901be 2017 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 2018
911324ef 2019 if (!rootfs->path) {
aaf901be 2020 if (mount_entry_on_systemfs(&mntent))
e76b8764 2021 goto out;
911324ef 2022 continue;
e76b8764
CDC
2023 }
2024
911324ef 2025 /* We have a separate root, mounts are relative to it */
aaf901be 2026 if (mntent.mnt_dir[0] != '/') {
0a2dddd4 2027 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef
DL
2028 goto out;
2029 continue;
2030 }
cd54d859 2031
0a2dddd4 2032 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef 2033 goto out;
0ad19a3f 2034 }
cd54d859 2035
0ad19a3f 2036 ret = 0;
cd54d859
DL
2037
2038 INFO("mount points have been setup");
0ad19a3f 2039out:
e7938e9e
MN
2040 return ret;
2041}
2042
/*
 * Mount all entries listed in the fstab-style file named by fstab.
 *
 * A NULL fstab means there is nothing to do and yields 0. Returns -1 if the
 * file cannot be opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *table;
	int rc;

	if (fstab == NULL)
		return 0;

	table = setmntent(fstab, "r");
	if (table == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, table, lxc_name, lxc_path);
	endmntent(table);

	return rc;
}
2063
5ef5c9a3 2064FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2065{
5ef5c9a3 2066 int ret;
e7938e9e 2067 char *mount_entry;
5ef5c9a3
CB
2068 struct lxc_list *iterator;
2069 FILE *file;
2070 int fd = -1;
2071
2072 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2073 if (fd < 0) {
2074 if (errno != ENOSYS)
2075 return NULL;
2076 file = tmpfile();
2077 } else {
2078 file = fdopen(fd, "r+");
2079 }
e7938e9e 2080
e7938e9e 2081 if (!file) {
fad6ef95 2082 int saved_errno = errno;
5ef5c9a3
CB
2083 if (fd != -1)
2084 close(fd);
fad6ef95 2085 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
9fc7f8c0 2086 return NULL;
e7938e9e
MN
2087 }
2088
2089 lxc_list_for_each(iterator, mount) {
2090 mount_entry = iterator->elem;
5ef5c9a3
CB
2091 ret = fprintf(file, "%s\n", mount_entry);
2092 if (ret < strlen(mount_entry))
2093 WARN("Could not write mount entry to anonymous mount file.");
2094 }
2095
2096 if (fseek(file, 0, SEEK_SET) < 0) {
2097 fclose(file);
2098 return NULL;
e7938e9e
MN
2099 }
2100
9fc7f8c0
TA
2101 return file;
2102}
2103
/*
 * Mount the entries held in the in-memory mount list.
 *
 * The list is first written to an anonymous file by
 * make_anonymous_mount_file(), which is then processed by
 * mount_file_entries(). Returns -1 if the file cannot be created,
 * otherwise the result of mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	rc = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2120
bab88e68
CS
2121static int parse_cap(const char *cap)
2122{
2123 char *ptr = NULL;
84760c11 2124 size_t i;
2125 int capid = -1;
bab88e68 2126
7035407c
DE
2127 if (!strcmp(cap, "none"))
2128 return -2;
2129
bab88e68
CS
2130 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2131
2132 if (strcmp(cap, caps_opt[i].name))
2133 continue;
2134
2135 capid = caps_opt[i].value;
2136 break;
2137 }
2138
2139 if (capid < 0) {
2140 /* try to see if it's numeric, so the user may specify
2141 * capabilities that the running kernel knows about but
2142 * we don't */
2143 errno = 0;
2144 capid = strtol(cap, &ptr, 10);
2145 if (!ptr || *ptr != '\0' || errno != 0)
2146 /* not a valid number */
2147 capid = -1;
2148 else if (capid > lxc_caps_last_cap())
2149 /* we have a number but it's not a valid
2150 * capability */
2151 capid = -1;
2152 }
2153
2154 return capid;
2155}
2156
0769b82a
CS
2157int in_caplist(int cap, struct lxc_list *caps)
2158{
2159 struct lxc_list *iterator;
2160 int capid;
2161
2162 lxc_list_for_each(iterator, caps) {
2163 capid = parse_cap(iterator->elem);
2164 if (capid == cap)
2165 return 1;
2166 }
2167
2168 return 0;
2169}
2170
81810dd1
DL
2171static int setup_caps(struct lxc_list *caps)
2172{
2173 struct lxc_list *iterator;
2174 char *drop_entry;
bab88e68 2175 int capid;
81810dd1
DL
2176
2177 lxc_list_for_each(iterator, caps) {
2178
2179 drop_entry = iterator->elem;
2180
bab88e68 2181 capid = parse_cap(drop_entry);
d55bc1ad 2182
81810dd1 2183 if (capid < 0) {
1e11be34
DL
2184 ERROR("unknown capability %s", drop_entry);
2185 return -1;
81810dd1
DL
2186 }
2187
2188 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2189
2190 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2191 SYSERROR("failed to remove %s capability", drop_entry);
2192 return -1;
2193 }
81810dd1
DL
2194
2195 }
2196
1fb86a7c
SH
2197 DEBUG("capabilities have been setup");
2198
2199 return 0;
2200}
2201
2202static int dropcaps_except(struct lxc_list *caps)
2203{
2204 struct lxc_list *iterator;
2205 char *keep_entry;
1fb86a7c
SH
2206 int i, capid;
2207 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2208 INFO("found %d capabilities", numcaps);
1fb86a7c 2209
2caf9a97
SH
2210 if (numcaps <= 0 || numcaps > 200)
2211 return -1;
2212
1fb86a7c
SH
2213 // caplist[i] is 1 if we keep capability i
2214 int *caplist = alloca(numcaps * sizeof(int));
2215 memset(caplist, 0, numcaps * sizeof(int));
2216
2217 lxc_list_for_each(iterator, caps) {
2218
2219 keep_entry = iterator->elem;
2220
bab88e68 2221 capid = parse_cap(keep_entry);
1fb86a7c 2222
7035407c
DE
2223 if (capid == -2)
2224 continue;
2225
1fb86a7c
SH
2226 if (capid < 0) {
2227 ERROR("unknown capability %s", keep_entry);
2228 return -1;
2229 }
2230
8255688a 2231 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2232
2233 caplist[capid] = 1;
2234 }
2235 for (i=0; i<numcaps; i++) {
2236 if (caplist[i])
2237 continue;
2238 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2239 SYSERROR("failed to remove capability %d", i);
2240 return -1;
2241 }
1fb86a7c
SH
2242 }
2243
2244 DEBUG("capabilities have been setup");
81810dd1
DL
2245
2246 return 0;
2247}
2248
0ad19a3f 2249static int setup_hw_addr(char *hwaddr, const char *ifname)
2250{
2251 struct sockaddr sockaddr;
2252 struct ifreq ifr;
fad6ef95 2253 int ret, fd, saved_errno;
0ad19a3f 2254
3cfc0f3a
MN
2255 ret = lxc_convert_mac(hwaddr, &sockaddr);
2256 if (ret) {
2257 ERROR("mac address '%s' conversion failed : %s",
2258 hwaddr, strerror(-ret));
0ad19a3f 2259 return -1;
2260 }
2261
2262 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2263 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2264 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2265
2266 fd = socket(AF_INET, SOCK_DGRAM, 0);
2267 if (fd < 0) {
3ab87b66 2268 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2269 return -1;
2270 }
2271
2272 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2273 saved_errno = errno;
0ad19a3f 2274 close(fd);
2275 if (ret)
fad6ef95 2276 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2277
5da6aa8c 2278 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2279
0ad19a3f 2280 return ret;
2281}
2282
82d5ae15 2283static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2284{
82d5ae15
DL
2285 struct lxc_list *iterator;
2286 struct lxc_inetdev *inetdev;
3cfc0f3a 2287 int err;
0ad19a3f 2288
82d5ae15
DL
2289 lxc_list_for_each(iterator, ip) {
2290
2291 inetdev = iterator->elem;
2292
0093bb8c
DL
2293 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2294 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2295 if (err) {
2296 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2297 ifindex, strerror(-err));
82d5ae15
DL
2298 return -1;
2299 }
2300 }
2301
2302 return 0;
0ad19a3f 2303}
2304
82d5ae15 2305static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2306{
82d5ae15 2307 struct lxc_list *iterator;
7fa9074f 2308 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2309 int err;
0ad19a3f 2310
82d5ae15
DL
2311 lxc_list_for_each(iterator, ip) {
2312
2313 inet6dev = iterator->elem;
2314
b3df193c 2315 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2316 &inet6dev->mcast, &inet6dev->acast,
2317 inet6dev->prefix);
3cfc0f3a
MN
2318 if (err) {
2319 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2320 ifindex, strerror(-err));
82d5ae15 2321 return -1;
3cfc0f3a 2322 }
82d5ae15
DL
2323 }
2324
2325 return 0;
0ad19a3f 2326}
2327
82d5ae15 2328static int setup_netdev(struct lxc_netdev *netdev)
0ad19a3f 2329{
0ad19a3f 2330 char ifname[IFNAMSIZ];
0ad19a3f 2331 char *current_ifname = ifname;
3cfc0f3a 2332 int err;
0ad19a3f 2333
82d5ae15
DL
2334 /* empty network namespace */
2335 if (!netdev->ifindex) {
b0efbac4 2336 if (netdev->flags & IFF_UP) {
d472214b 2337 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2338 if (err) {
2339 ERROR("failed to set the loopback up : %s",
2340 strerror(-err));
82d5ae15
DL
2341 return -1;
2342 }
82d5ae15 2343 }
40790553
SH
2344 if (netdev->type != LXC_NET_VETH)
2345 return 0;
2346 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2347 }
13954cce 2348
b466dc33 2349 /* get the new ifindex in case of physical netdev */
40790553 2350 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2351 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2352 ERROR("failed to get ifindex for %s",
2353 netdev->link);
2354 return -1;
2355 }
40790553 2356 }
b466dc33 2357
82d5ae15
DL
2358 /* retrieve the name of the interface */
2359 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2360 ERROR("no interface corresponding to index '%d'",
82d5ae15 2361 netdev->ifindex);
0ad19a3f 2362 return -1;
2363 }
13954cce 2364
018ef520 2365 /* default: let the system to choose one interface name */
9d083402 2366 if (!netdev->name)
fb6d9b2f
DL
2367 netdev->name = netdev->type == LXC_NET_PHYS ?
2368 netdev->link : "eth%d";
018ef520 2369
82d5ae15 2370 /* rename the interface name */
40790553
SH
2371 if (strcmp(ifname, netdev->name) != 0) {
2372 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2373 if (err) {
2374 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2375 strerror(-err));
2376 return -1;
2377 }
018ef520
DL
2378 }
2379
2380 /* Re-read the name of the interface because its name has changed
2381 * and would be automatically allocated by the system
2382 */
82d5ae15 2383 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2384 ERROR("no interface corresponding to index '%d'",
82d5ae15 2385 netdev->ifindex);
018ef520 2386 return -1;
0ad19a3f 2387 }
2388
82d5ae15
DL
2389 /* set a mac address */
2390 if (netdev->hwaddr) {
2391 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2392 ERROR("failed to setup hw address for '%s'",
82d5ae15 2393 current_ifname);
0ad19a3f 2394 return -1;
2395 }
2396 }
2397
82d5ae15
DL
2398 /* setup ipv4 addresses on the interface */
2399 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2400 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2401 ifname);
2402 return -1;
2403 }
2404
82d5ae15
DL
2405 /* setup ipv6 addresses on the interface */
2406 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2407 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2408 ifname);
2409 return -1;
2410 }
2411
82d5ae15 2412 /* set the network device up */
b0efbac4 2413 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2414 int err;
2415
d472214b 2416 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2417 if (err) {
2418 ERROR("failed to set '%s' up : %s", current_ifname,
2419 strerror(-err));
0ad19a3f 2420 return -1;
2421 }
2422
2423 /* the network is up, make the loopback up too */
d472214b 2424 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2425 if (err) {
2426 ERROR("failed to set the loopback up : %s",
2427 strerror(-err));
0ad19a3f 2428 return -1;
2429 }
2430 }
2431
f8fee0e2
MK
2432 /* We can only set up the default routes after bringing
2433 * up the interface, sine bringing up the interface adds
2434 * the link-local routes and we can't add a default
2435 * route if the gateway is not reachable. */
2436
2437 /* setup ipv4 gateway on the interface */
2438 if (netdev->ipv4_gateway) {
2439 if (!(netdev->flags & IFF_UP)) {
2440 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2441 return -1;
2442 }
2443
2444 if (lxc_list_empty(&netdev->ipv4)) {
2445 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2446 return -1;
2447 }
2448
2449 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2450 if (err) {
fc739df5
SG
2451 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2452 if (err) {
2453 ERROR("failed to add ipv4 dest for '%s': %s",
2454 ifname, strerror(-err));
2455 }
2456
2457 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2458 if (err) {
2459 ERROR("failed to setup ipv4 gateway for '%s': %s",
2460 ifname, strerror(-err));
2461 if (netdev->ipv4_gateway_auto) {
2462 char buf[INET_ADDRSTRLEN];
2463 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2464 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2465 }
2466 return -1;
19a26f82 2467 }
f8fee0e2
MK
2468 }
2469 }
2470
2471 /* setup ipv6 gateway on the interface */
2472 if (netdev->ipv6_gateway) {
2473 if (!(netdev->flags & IFF_UP)) {
2474 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2475 return -1;
2476 }
2477
2478 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2479 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2480 return -1;
2481 }
2482
2483 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2484 if (err) {
fc739df5
SG
2485 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2486 if (err) {
2487 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2488 ifname, strerror(-err));
19a26f82 2489 }
fc739df5
SG
2490
2491 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2492 if (err) {
2493 ERROR("failed to setup ipv6 gateway for '%s': %s",
2494 ifname, strerror(-err));
2495 if (netdev->ipv6_gateway_auto) {
2496 char buf[INET6_ADDRSTRLEN];
2497 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2498 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2499 }
2500 return -1;
2501 }
f8fee0e2
MK
2502 }
2503 }
2504
cd54d859
DL
2505 DEBUG("'%s' has been setup", current_ifname);
2506
0ad19a3f 2507 return 0;
2508}
2509
5f4535a3 2510static int setup_network(struct lxc_list *network)
0ad19a3f 2511{
82d5ae15 2512 struct lxc_list *iterator;
82d5ae15 2513 struct lxc_netdev *netdev;
0ad19a3f 2514
5f4535a3 2515 lxc_list_for_each(iterator, network) {
cd54d859 2516
5f4535a3 2517 netdev = iterator->elem;
82d5ae15
DL
2518
2519 if (setup_netdev(netdev)) {
2520 ERROR("failed to setup netdev");
2521 return -1;
2522 }
2523 }
cd54d859 2524
5f4535a3
DL
2525 if (!lxc_list_empty(network))
2526 INFO("network has been setup");
cd54d859
DL
2527
2528 return 0;
0ad19a3f 2529}
2530
c6d09e15
WB
2531static int parse_resource(const char *res) {
2532 size_t i;
2533 int resid = -1;
2534
2535 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2536 if (strcmp(res, limit_opt[i].name) == 0)
2537 return limit_opt[i].value;
2538 }
2539
2540 /* try to see if it's numeric, so the user may specify
2541 * resources that the running kernel knows about but
2542 * we don't */
2543 if (lxc_safe_int(res, &resid) == 0)
2544 return resid;
2545 return -1;
2546}
2547
2548int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2549 struct lxc_list *it;
2550 struct lxc_limit *lim;
2551 int resid;
2552
2553 lxc_list_for_each(it, limits) {
2554 lim = it->elem;
2555
2556 resid = parse_resource(lim->resource);
2557 if (resid < 0) {
2558 ERROR("unknown resource %s", lim->resource);
2559 return -1;
2560 }
2561
2562 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2563 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2564 return -1;
2565 }
2566 }
2567 return 0;
2568}
2569
2af6bd1b 2570/* try to move physical nics to the init netns */
5610055a 2571void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2572{
64d2fcb5 2573 int i, oldfd;
4ec31c52 2574 char ifname[IFNAMSIZ];
2af6bd1b 2575
5610055a 2576 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2577 return;
2578
64d2fcb5 2579 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2580
64d2fcb5
CB
2581 oldfd = lxc_preserve_ns(getpid(), "net");
2582 if (oldfd < 0) {
2583 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2584 return;
2585 }
64d2fcb5 2586
2af6bd1b
SH
2587 if (setns(netnsfd, 0) != 0) {
2588 SYSERROR("Failed to enter container netns to reset nics");
2589 close(oldfd);
2590 return;
2591 }
2592 for (i=0; i<conf->num_savednics; i++) {
2593 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2594 /* retrieve the name of the interface */
2595 if (!if_indextoname(s->ifindex, ifname)) {
2596 WARN("no interface corresponding to index '%d'", s->ifindex);
2597 continue;
2598 }
5610055a 2599 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2600 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2601 free(s->orig_name);
2af6bd1b 2602 }
5610055a
WB
2603 conf->num_savednics = 0;
2604
2af6bd1b
SH
2605 if (setns(oldfd, 0) != 0)
2606 SYSERROR("Failed to re-enter monitor's netns");
2607 close(oldfd);
2608}
2609
ae9242c8
SH
2610static char *default_rootfs_mount = LXCROOTFSMOUNT;
2611
7b379ab3 2612struct lxc_conf *lxc_conf_init(void)
089cd8b8 2613{
7b379ab3 2614 struct lxc_conf *new;
26ddeedd 2615 int i;
7b379ab3
MN
2616
2617 new = malloc(sizeof(*new));
2618 if (!new) {
2619 ERROR("lxc_conf_init : %m");
2620 return NULL;
2621 }
2622 memset(new, 0, sizeof(*new));
2623
b40a606e 2624 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
cccc74b5 2625 new->personality = -1;
124fa0a8 2626 new->autodev = 1;
596a818d
DE
2627 new->console.log_path = NULL;
2628 new->console.log_fd = -1;
28a4b0e5 2629 new->console.path = NULL;
63376d7d 2630 new->console.peer = -1;
b5159817
DE
2631 new->console.peerpty.busy = -1;
2632 new->console.peerpty.master = -1;
2633 new->console.peerpty.slave = -1;
63376d7d
DL
2634 new->console.master = -1;
2635 new->console.slave = -1;
2636 new->console.name[0] = '\0';
d2e30e99 2637 new->maincmd_fd = -1;
76a26f55 2638 new->nbd_idx = -1;
54c30e29 2639 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048
SH
2640 if (!new->rootfs.mount) {
2641 ERROR("lxc_conf_init : %m");
2642 free(new);
2643 return NULL;
2644 }
d89de239 2645 new->kmsg = 0;
858377e4 2646 new->logfd = -1;
7b379ab3
MN
2647 lxc_list_init(&new->cgroup);
2648 lxc_list_init(&new->network);
2649 lxc_list_init(&new->mount_list);
81810dd1 2650 lxc_list_init(&new->caps);
1fb86a7c 2651 lxc_list_init(&new->keepcaps);
f6d3e3e4 2652 lxc_list_init(&new->id_map);
f979ac15 2653 lxc_list_init(&new->includes);
4184c3e1 2654 lxc_list_init(&new->aliens);
7c661726 2655 lxc_list_init(&new->environment);
c6d09e15 2656 lxc_list_init(&new->limits);
26ddeedd
SH
2657 for (i=0; i<NUM_LXC_HOOKS; i++)
2658 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2659 lxc_list_init(&new->groups);
fe4de9a6
DE
2660 new->lsm_aa_profile = NULL;
2661 new->lsm_se_context = NULL;
5112cd70 2662 new->tmp_umount_proc = 0;
7b379ab3 2663
9f30a190
MM
2664 for (i = 0; i < LXC_NS_MAX; i++)
2665 new->inherit_ns_fd[i] = -1;
2666
72bb04e4
PT
2667 /* if running in a new user namespace, init and COMMAND
2668 * default to running as UID/GID 0 when using lxc-execute */
2669 new->init_uid = 0;
2670 new->init_gid = 0;
2671
7b379ab3 2672 return new;
089cd8b8
DL
2673}
2674
a589434e 2675static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2676{
b0ee5983
CB
2677 char *veth1, *veth2;
2678 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
b7b2fde4
CB
2679 int bridge_index, err;
2680 unsigned int mtu = 0;
13954cce 2681
8bee8851 2682 if (netdev->priv.veth_attr.pair) {
e892973e 2683 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2684 if (handler->conf->reboot)
2685 lxc_netdev_delete_by_name(veth1);
2686 } else {
9ba8130c
SH
2687 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2688 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2689 ERROR("veth1 name too long");
2690 return -1;
2691 }
a0265685 2692 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2693 if (!veth1) {
2694 ERROR("failed to allocate a temporary name");
2695 return -1;
2696 }
74a2b586
JK
2697 /* store away for deconf */
2698 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2699 }
82d5ae15 2700
0e391e57 2701 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2702 veth2 = lxc_mkifname(veth2buf);
ad40563e 2703 if (!veth2) {
82d5ae15 2704 ERROR("failed to allocate a temporary name");
ad40563e 2705 goto out_delete;
0ad19a3f 2706 }
2707
3cfc0f3a
MN
2708 err = lxc_veth_create(veth1, veth2);
2709 if (err) {
b0ee5983
CB
2710 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2711 veth2, strerror(-err));
ad40563e 2712 goto out_delete;
0ad19a3f 2713 }
13954cce 2714
49684c0b
CS
2715 /* changing the high byte of the mac address to 0xfe, the bridge interface
2716 * will always keep the host's mac address and not take the mac address
2717 * of a container */
2718 err = setup_private_host_hw_addr(veth1);
2719 if (err) {
b0ee5983
CB
2720 ERROR("failed to change mac address of host interface \"%s\": %s",
2721 veth1, strerror(-err));
49684c0b
CS
2722 goto out_delete;
2723 }
2724
af651aa9
SN
2725 netdev->ifindex = if_nametoindex(veth2);
2726 if (!netdev->ifindex) {
b0ee5983 2727 ERROR("failed to retrieve the index for \"%s\"", veth2);
af651aa9
SN
2728 goto out_delete;
2729 }
2730
82d5ae15 2731 if (netdev->mtu) {
b7b2fde4 2732 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
b0ee5983 2733 WARN("failed to parse mtu from");
b7b2fde4 2734 else
b0ee5983 2735 INFO("retrieved mtu %d", mtu);
e54864d3 2736 } else if (netdev->link) {
e9280f65 2737 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2738 if (bridge_index) {
2739 mtu = netdev_get_mtu(bridge_index);
b0ee5983 2740 INFO("retrieved mtu %d from %s", mtu, netdev->link);
729e8bf6
CB
2741 } else {
2742 mtu = netdev_get_mtu(netdev->ifindex);
b0ee5983 2743 INFO("retrieved mtu %d from %s", mtu, veth2);
729e8bf6 2744 }
e54864d3
NC
2745 }
2746
2747 if (mtu) {
2748 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2749 if (!err)
e54864d3 2750 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2751 if (err) {
b0ee5983
CB
2752 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2753 "and \"%s\": %s",
e54864d3 2754 mtu, veth1, veth2, strerror(-err));
eb14c10a 2755 goto out_delete;
75d09f83
DL
2756 }
2757 }
2758
3cfc0f3a 2759 if (netdev->link) {
c43cbc04 2760 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2761 if (err) {
b0ee5983
CB
2762 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2763 veth1, netdev->link, strerror(-err));
3cfc0f3a
MN
2764 goto out_delete;
2765 }
b0ee5983 2766 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
eb14c10a
DL
2767 }
2768
d472214b 2769 err = lxc_netdev_up(veth1);
6e35af2e 2770 if (err) {
b0ee5983 2771 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
6e35af2e 2772 goto out_delete;
0ad19a3f 2773 }
2774
e3b4c4c4 2775 if (netdev->upscript) {
751d9dcd
DL
2776 err = run_script(handler->name, "net", netdev->upscript, "up",
2777 "veth", veth1, (char*) NULL);
2778 if (err)
e3b4c4c4 2779 goto out_delete;
e3b4c4c4
ST
2780 }
2781
b0ee5983
CB
2782 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2783 netdev->ifindex);
82d5ae15 2784
6ab9ab6d 2785 return 0;
eb14c10a
DL
2786
2787out_delete:
b316d209
CB
2788 if (netdev->ifindex != 0)
2789 lxc_netdev_delete_by_name(veth1);
f10fad2f 2790 if (!netdev->priv.veth_attr.pair)
ad40563e 2791 free(veth1);
f10fad2f 2792 free(veth2);
6ab9ab6d 2793 return -1;
13954cce 2794}
d957ae2d 2795
74a2b586
JK
2796static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2797{
2798 char *veth1;
2799 int err;
2800
2801 if (netdev->priv.veth_attr.pair)
2802 veth1 = netdev->priv.veth_attr.pair;
2803 else
2804 veth1 = netdev->priv.veth_attr.veth1;
2805
2806 if (netdev->downscript) {
2807 err = run_script(handler->name, "net", netdev->downscript,
2808 "down", "veth", veth1, (char*) NULL);
2809 if (err)
2810 return -1;
2811 }
2812 return 0;
2813}
2814
a589434e 2815static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2816{
0e391e57 2817 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2818 int err;
d957ae2d
MT
2819
2820 if (!netdev->link) {
2821 ERROR("no link specified for macvlan netdev");
2822 return -1;
2823 }
13954cce 2824
9ba8130c
SH
2825 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2826 if (err >= sizeof(peerbuf))
2827 return -1;
82d5ae15 2828
a0265685 2829 peer = lxc_mkifname(peerbuf);
ad40563e 2830 if (!peer) {
82d5ae15
DL
2831 ERROR("failed to make a temporary name");
2832 return -1;
0ad19a3f 2833 }
2834
3cfc0f3a
MN
2835 err = lxc_macvlan_create(netdev->link, peer,
2836 netdev->priv.macvlan_attr.mode);
2837 if (err) {
2838 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2839 peer, netdev->link, strerror(-err));
ad40563e 2840 goto out;
0ad19a3f 2841 }
2842
82d5ae15
DL
2843 netdev->ifindex = if_nametoindex(peer);
2844 if (!netdev->ifindex) {
36eb9bde 2845 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2846 goto out;
22ebac19 2847 }
2848
e3b4c4c4 2849 if (netdev->upscript) {
751d9dcd
DL
2850 err = run_script(handler->name, "net", netdev->upscript, "up",
2851 "macvlan", netdev->link, (char*) NULL);
2852 if (err)
ad40563e 2853 goto out;
e3b4c4c4
ST
2854 }
2855
a589434e 2856 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2857 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2858
d957ae2d 2859 return 0;
ad40563e
ÇO
2860out:
2861 lxc_netdev_delete_by_name(peer);
2862 free(peer);
2863 return -1;
0ad19a3f 2864}
2865
74a2b586
JK
2866static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2867{
2868 int err;
2869
2870 if (netdev->downscript) {
2871 err = run_script(handler->name, "net", netdev->downscript,
2872 "down", "macvlan", netdev->link,
2873 (char*) NULL);
2874 if (err)
2875 return -1;
2876 }
2877 return 0;
2878}
2879
a589434e
JN
2880/* XXX: merge with instantiate_macvlan */
2881static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2882{
2883 char peer[IFNAMSIZ];
3cfc0f3a 2884 int err;
82f58d03 2885 static uint16_t vlan_cntr = 0;
b7b2fde4 2886 unsigned int mtu = 0;
26c39028
JHS
2887
2888 if (!netdev->link) {
2889 ERROR("no link specified for vlan netdev");
2890 return -1;
2891 }
2892
82f58d03 2893 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2894 if (err >= sizeof(peer)) {
2895 ERROR("peer name too long");
2896 return -1;
2897 }
26c39028 2898
3cfc0f3a
MN
2899 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2900 if (err) {
2901 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2902 peer, netdev->link, strerror(-err));
26c39028
JHS
2903 return -1;
2904 }
2905
2906 netdev->ifindex = if_nametoindex(peer);
2907 if (!netdev->ifindex) {
2908 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2909 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2910 return -1;
2911 }
2912
a589434e 2913 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 2914 netdev->ifindex);
b4fb7de1 2915 if (netdev->mtu) {
b7b2fde4
CB
2916 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2917 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2918 netdev->ifindex, netdev->name);
2919 return -1;
2920 }
2921 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
2922 if (err) {
2923 ERROR("failed to set mtu '%s' for %s : %s",
2924 netdev->mtu, peer, strerror(-err));
2925 lxc_netdev_delete_by_name(peer);
2926 return -1;
2927 }
2928 }
e892973e 2929
26c39028
JHS
2930 return 0;
2931}
2932
/*
 * Teardown hook for vlan devices: there is nothing to do, so this always
 * reports success.
 */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2937
a589434e 2938static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2939{
6168e99f
DL
2940 if (!netdev->link) {
2941 ERROR("no link specified for the physical interface");
2942 return -1;
2943 }
2944
9d083402 2945 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 2946 if (!netdev->ifindex) {
9d083402 2947 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 2948 return -1;
2949 }
2950
e3b4c4c4
ST
2951 if (netdev->upscript) {
2952 int err;
751d9dcd
DL
2953 err = run_script(handler->name, "net", netdev->upscript,
2954 "up", "phys", netdev->link, (char*) NULL);
2955 if (err)
e3b4c4c4 2956 return -1;
e3b4c4c4
ST
2957 }
2958
82d5ae15 2959 return 0;
0ad19a3f 2960}
2961
74a2b586
JK
2962static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2963{
2964 int err;
2965
2966 if (netdev->downscript) {
2967 err = run_script(handler->name, "net", netdev->downscript,
2968 "down", "phys", netdev->link, (char*) NULL);
2969 if (err)
2970 return -1;
2971 }
2972 return 0;
2973}
2974
a589434e 2975static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
2976{
2977 netdev->ifindex = 0;
2978 return 0;
2979}
2980
a589434e 2981static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2982{
82d5ae15 2983 netdev->ifindex = 0;
e3b4c4c4
ST
2984 if (netdev->upscript) {
2985 int err;
751d9dcd
DL
2986 err = run_script(handler->name, "net", netdev->upscript,
2987 "up", "empty", (char*) NULL);
2988 if (err)
e3b4c4c4 2989 return -1;
e3b4c4c4 2990 }
82d5ae15 2991 return 0;
0ad19a3f 2992}
2993
74a2b586
JK
2994static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2995{
2996 int err;
2997
2998 if (netdev->downscript) {
2999 err = run_script(handler->name, "net", netdev->downscript,
3000 "down", "empty", (char*) NULL);
3001 if (err)
3002 return -1;
3003 }
3004 return 0;
3005}
3006
/*
 * Teardown hook for the "none" network type: there is nothing to do, so
 * this always reports success.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3011
3012int lxc_requests_empty_network(struct lxc_handler *handler)
3013{
3014 struct lxc_list *network = &handler->conf->network;
3015 struct lxc_list *iterator;
3016 struct lxc_netdev *netdev;
3017 bool found_none = false, found_nic = false;
3018
3019 if (lxc_list_empty(network))
3020 return 0;
3021
3022 lxc_list_for_each(iterator, network) {
3023
3024 netdev = iterator->elem;
3025
3026 if (netdev->type == LXC_NET_NONE)
3027 found_none = true;
3028 else
3029 found_nic = true;
3030 }
3031 if (found_none && !found_nic)
3032 return 1;
3033 return 0;
3034}
3035
e3b4c4c4 3036int lxc_create_network(struct lxc_handler *handler)
0ad19a3f 3037{
e3b4c4c4 3038 struct lxc_list *network = &handler->conf->network;
82d5ae15 3039 struct lxc_list *iterator;
82d5ae15 3040 struct lxc_netdev *netdev;
cbef6c52
SH
3041 int am_root = (getuid() == 0);
3042
3043 if (!am_root)
3044 return 0;
0ad19a3f 3045
5f4535a3 3046 lxc_list_for_each(iterator, network) {
0ad19a3f 3047
5f4535a3 3048 netdev = iterator->elem;
13954cce 3049
24654103 3050 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
82d5ae15 3051 ERROR("invalid network configuration type '%d'",
5f4535a3 3052 netdev->type);
82d5ae15
DL
3053 return -1;
3054 }
0ad19a3f 3055
e3b4c4c4 3056 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3057 ERROR("failed to create netdev");
3058 return -1;
3059 }
e3b4c4c4 3060
0ad19a3f 3061 }
3062
3063 return 0;
3064}
3065
358daf49 3066bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3067{
e97946ae 3068 int ret;
74a2b586 3069 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3070 struct lxc_list *iterator;
3071 struct lxc_netdev *netdev;
358daf49 3072 bool deleted_all = true;
7fef7a06
DL
3073
3074 lxc_list_for_each(iterator, network) {
3075 netdev = iterator->elem;
d472214b 3076
74a2b586 3077 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3078 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3079 WARN("Failed to rename interface with index %d "
3080 "to its initial name \"%s\".",
3081 netdev->ifindex, netdev->link);
d472214b 3082 continue;
d8f8e352 3083 }
d472214b 3084
74a2b586 3085 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3086 WARN("Failed to destroy netdev");
74a2b586
JK
3087 }
3088
d8f8e352
DL
3089 /* Recent kernel remove the virtual interfaces when the network
3090 * namespace is destroyed but in case we did not moved the
3091 * interface to the network namespace, we have to destroy it
3092 */
e97946ae
CB
3093 if (netdev->ifindex != 0) {
3094 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3095 if (-ret == ENODEV) {
3096 INFO("Interface \"%s\" with index %d already "
3097 "deleted or existing in different network "
3098 "namespace.",
3099 netdev->name ? netdev->name : "(null)",
3100 netdev->ifindex);
3101 } else if (ret < 0) {
3102 deleted_all = false;
3103 WARN("Failed to remove interface \"%s\" with "
3104 "index %d: %s.",
3105 netdev->name ? netdev->name : "(null)",
3106 netdev->ifindex, strerror(-ret));
3107 } else {
3108 INFO("Removed interface \"%s\" with index %d.",
3109 netdev->name ? netdev->name : "(null)",
3110 netdev->ifindex);
3111 }
e97946ae
CB
3112 }
3113
3114 /* Explicitly delete host veth device to prevent lingering
3115 * devices. We had issues in LXD around this.
3116 */
b316d209 3117 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3118 char *hostveth;
3119 if (netdev->priv.veth_attr.pair) {
e97946ae 3120 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3121 ret = lxc_netdev_delete_by_name(hostveth);
3122 if (ret < 0) {
3123 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3124 } else {
3125 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3126 }
3127 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3128 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3129 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3130 if (ret < 0) {
3131 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3132 } else {
3133 INFO("Removed interface \"%s\" from host.", hostveth);
3134 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3135 }
e97946ae
CB
3136 }
3137 }
7fef7a06 3138 }
358daf49
CB
3139
3140 return deleted_all;
7fef7a06
DL
3141}
3142
45e854dc
SG
3143#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3144
fe1f672f 3145/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3146#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3147static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3148 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3149{
3150 pid_t child;
a7242d9a
ÇO
3151 int bytes, pipefd[2];
3152 char *token, *saveptr = NULL;
fe1f672f 3153 char buffer[MAX_BUFFER_SIZE];
091045f8 3154 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3155
3156 if (netdev->type != LXC_NET_VETH) {
3157 ERROR("nic type %d not support for unprivileged use",
091045f8 3158 netdev->type);
cbef6c52
SH
3159 return -1;
3160 }
3161
091045f8 3162 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3163 SYSERROR("pipe failed");
3164 return -1;
3165 }
3166
091045f8
CB
3167 child = fork();
3168 if (child < 0) {
cbef6c52 3169 SYSERROR("fork");
a7242d9a
ÇO
3170 close(pipefd[0]);
3171 close(pipefd[1]);
3172 return -1;
3173 }
3174
3175 if (child == 0) { // child
091045f8
CB
3176 /* Call lxc-user-nic pid type bridge. */
3177 int ret;
3178 char pidstr[LXC_NUMSTRLEN64];
3179
3180 close(pipefd[0]); /* Close the read-end of the pipe. */
3181
3182 /* Redirect stdout to write-end of the pipe. */
3183 ret = dup2(pipefd[1], STDOUT_FILENO);
3184 close(pipefd[1]); /* Close the write-end of the pipe. */
3185 if (ret < 0) {
3186 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3187 exit(EXIT_FAILURE);
3188 }
a7242d9a 3189
091045f8 3190 if (netdev->link)
cff7b5eb 3191 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3192 else
cff7b5eb 3193 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3194
3195 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3196 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3197 exit(EXIT_FAILURE);
3198 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3199
3200 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3201 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3202 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3203 pidstr, "veth", netdev_link, netdev->name, NULL);
3204
3205 SYSERROR("Failed to exec lxc-user-nic.");
3206 exit(EXIT_FAILURE);
a7242d9a
ÇO
3207 }
3208
3209 /* close the write-end of the pipe */
3210 close(pipefd[1]);
3211
fe1f672f 3212 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3213 if (bytes < 0)
3214 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3215 buffer[bytes - 1] = '\0';
3216
3217 if (wait_for_pid(child) != 0) {
3218 close(pipefd[0]);
cbef6c52
SH
3219 return -1;
3220 }
3221
a7242d9a
ÇO
3222 /* close the read-end of the pipe */
3223 close(pipefd[0]);
cbef6c52 3224
a7242d9a
ÇO
3225 /* fill netdev->name field */
3226 token = strtok_r(buffer, ":", &saveptr);
3227 if (!token)
3228 return -1;
091045f8
CB
3229
3230 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3231 if (!netdev->name) {
091045f8 3232 SYSERROR("Failed to allocate memory.");
658979c5
SH
3233 return -1;
3234 }
091045f8 3235 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3236 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3237
3238 /* fill netdev->veth_attr.pair field */
3239 token = strtok_r(NULL, ":", &saveptr);
3240 if (!token)
3241 return -1;
091045f8 3242
a7242d9a 3243 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3244 if (!netdev->priv.veth_attr.pair) {
091045f8 3245 ERROR("Failed to allocate memory.");
658979c5
SH
3246 return -1;
3247 }
45e854dc 3248
a7242d9a 3249 return 0;
cbef6c52
SH
3250}
3251
c43cbc04
SH
3252int lxc_assign_network(const char *lxcpath, char *lxcname,
3253 struct lxc_list *network, pid_t pid)
0ad19a3f 3254{
82d5ae15 3255 struct lxc_list *iterator;
82d5ae15 3256 struct lxc_netdev *netdev;
f2e206ff 3257 char ifname[IFNAMSIZ];
cbef6c52 3258 int am_root = (getuid() == 0);
3cfc0f3a 3259 int err;
0ad19a3f 3260
5f4535a3 3261 lxc_list_for_each(iterator, network) {
82d5ae15 3262
5f4535a3 3263 netdev = iterator->elem;
82d5ae15 3264
fbb16259 3265 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3266 if (netdev->mtu)
3267 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3268 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3269 return -1;
658979c5
SH
3270 // lxc-user-nic has moved the nic to the new ns.
3271 // unpriv_assign_nic() fills in netdev->name.
3272 // netdev->ifindex will be filed in at setup_netdev.
cbef6c52
SH
3273 continue;
3274 }
236087a6 3275
fbb16259
SH
3276 /* empty network namespace, nothing to move */
3277 if (!netdev->ifindex)
3278 continue;
3279
f2e206ff 3280 /* retrieve the name of the interface */
3281 if (!if_indextoname(netdev->ifindex, ifname)) {
3282 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3283 return -1;
3284 }
3285
3286 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3287 if (err) {
3288 ERROR("failed to move '%s' to the container : %s",
3289 netdev->link, strerror(-err));
82d5ae15
DL
3290 return -1;
3291 }
3292
198cbbaa 3293 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3294 }
3295
3296 return 0;
3297}
3298
251d0d2a
DE
3299static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3300 size_t buf_size)
f6d3e3e4 3301{
29053180
CB
3302 char path[MAXPATHLEN];
3303 int fd, ret;
f6d3e3e4 3304
29053180
CB
3305 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3306 idtype == ID_TYPE_UID ? 'u' : 'g');
3307 if (ret < 0 || ret >= MAXPATHLEN) {
3308 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3309 return -E2BIG;
3310 }
29053180
CB
3311
3312 fd = open(path, O_WRONLY);
3313 if (fd < 0) {
3314 SYSERROR("failed to open \"%s\"", path);
3315 return -1;
f6d3e3e4 3316 }
29053180
CB
3317
3318 errno = 0;
3319 ret = lxc_write_nointr(fd, buf, buf_size);
3320 if (ret != buf_size) {
3321 SYSERROR("failed to write %cid mapping to \"%s\"",
3322 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3323 close(fd);
3324 return -1;
3325 }
3326 close(fd);
3327
3328 return 0;
f6d3e3e4
SH
3329}
3330
6e50e704
CB
3331/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3332 *
3333 * @return 1 if functional binary was found
3334 * @return 0 if binary exists but is lacking privilege
3335 * @return -ENOENT if binary does not exist
3336 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3337 *
3338 */
df6a2945
CB
3339static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3340{
3341 char *path;
3342 int ret;
3343 struct stat st;
3344 int fret = 0;
3345
6e50e704
CB
3346 if (cap != CAP_SETUID && cap != CAP_SETGID)
3347 return -EINVAL;
3348
df6a2945
CB
3349 path = on_path(binary, NULL);
3350 if (!path)
3351 return -ENOENT;
3352
3353 ret = stat(path, &st);
3354 if (ret < 0) {
3355 fret = -errno;
3356 goto cleanup;
3357 }
3358
3359 /* Check if the binary is setuid. */
3360 if (st.st_mode & S_ISUID) {
3361 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3362 fret = 1;
3363 goto cleanup;
3364 }
3365
69924fff 3366 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3367 /* Check if it has the CAP_SETUID capability. */
3368 if ((cap & CAP_SETUID) &&
3369 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3370 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3371 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3372 "and CAP_PERMITTED sets.", path);
3373 fret = 1;
3374 goto cleanup;
3375 }
3376
3377 /* Check if it has the CAP_SETGID capability. */
3378 if ((cap & CAP_SETGID) &&
3379 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3380 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3381 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3382 "and CAP_PERMITTED sets.", path);
3383 fret = 1;
3384 goto cleanup;
3385 }
d6018f88 3386 #else
69924fff
CB
3387 /* If we cannot check for file capabilities we need to give the benefit
3388 * of the doubt. Otherwise we might fail even though all the necessary
3389 * file capabilities are set.
3390 */
d6018f88
CB
3391 DEBUG("Cannot check for file capabilites as full capability support is "
3392 "missing. Manual intervention needed.");
3393 fret = 1;
df6a2945
CB
3394 #endif
3395
3396cleanup:
3397 free(path);
3398 return fret;
3399}
3400
986ef930
CB
/* Callback that replaces the current process image with
 * "/bin/sh -c <args>", where @args is the command line string.
 *
 * Returns -1; that statement is only reached if execl() returns.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
3406
f6d3e3e4
SH
3407int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3408{
f6d3e3e4 3409 struct id_map *map;
4bc3b759 3410 struct lxc_list *iterator;
251d0d2a 3411 enum idtype type;
986ef930 3412 char u_or_g;
4bc3b759 3413 char *pos;
99d43365 3414 int fill, left;
986ef930
CB
3415 char cmd_output[MAXPATHLEN];
3416 /* strlen("new@idmap") = 9
3417 * +
3418 * strlen(" ") = 1
3419 * +
3420 * LXC_NUMSTRLEN64
3421 * +
3422 * strlen(" ") = 1
3423 *
3424 * We add some additional space to make sure that we really have
3425 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3426 */
3427 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3428 int ret = 0, uidmap = 0, gidmap = 0;
3429 bool use_shadow = false, had_entry = false;
df6a2945
CB
3430
3431 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3432 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3433 * will protected it by preventing another user from being handed the
3434 * range by shadow.
3435 */
df6a2945 3436 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
3437 if (uidmap == -ENOENT)
3438 WARN("newuidmap binary is missing");
3439 else if (!uidmap)
3440 WARN("newuidmap is lacking necessary privileges");
3441
df6a2945 3442 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
3443 if (gidmap == -ENOENT)
3444 WARN("newgidmap binary is missing");
3445 else if (!gidmap)
3446 WARN("newgidmap is lacking necessary privileges");
3447
df6a2945
CB
3448 if (uidmap > 0 && gidmap > 0) {
3449 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3450 use_shadow = true;
df6a2945 3451 } else {
99d43365
CB
3452 /* In case unprivileged users run application containers via
3453 * execute() or a start*() there are valid cases where they may
3454 * only want to map their own {g,u}id. Let's not block them from
3455 * doing so by requiring geteuid() == 0.
3456 */
3457 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3458 "write directly with euid %d.", geteuid());
0e6e3a41 3459 }
251d0d2a 3460
986ef930
CB
3461 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3462 type++, u_or_g = 'g') {
3463 pos = mapbuf;
3464
0e6e3a41 3465 if (use_shadow)
986ef930 3466 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3467
cf3ef16d 3468 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3469 /* The kernel only takes <= 4k for writes to
3470 * /proc/<nr>/[ug]id_map
3471 */
251d0d2a 3472 map = iterator->elem;
cf3ef16d
SH
3473 if (map->idtype != type)
3474 continue;
3475
4bc3b759
CB
3476 had_entry = true;
3477
986ef930 3478 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3479 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3480 use_shadow ? " " : "", map->nsid,
3481 map->hostid, map->range,
0e6e3a41 3482 use_shadow ? "" : "\n");
cf3ef16d 3483 if (fill <= 0 || fill >= left)
4bc3b759
CB
3484 SYSERROR("Too many {g,u}id mappings defined.");
3485
cf3ef16d 3486 pos += fill;
251d0d2a 3487 }
cf3ef16d 3488 if (!had_entry)
4f7521b4 3489 continue;
cf3ef16d 3490
986ef930
CB
3491 /* Try to catch the ouput of new{g,u}idmap to make debugging
3492 * easier.
3493 */
3494 if (use_shadow) {
3495 ret = run_command(cmd_output, sizeof(cmd_output),
3496 lxc_map_ids_exec_wrapper,
3497 (void *)mapbuf);
3498 if (ret < 0) {
3499 ERROR("new%cidmap failed to write mapping: %s",
3500 u_or_g, cmd_output);
3501 return -1;
3502 }
d1838f34 3503 } else {
986ef930
CB
3504 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3505 if (ret < 0)
3506 return -1;
d1838f34 3507 }
986ef930
CB
3508
3509 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3510 }
251d0d2a 3511
986ef930 3512 return 0;
f6d3e3e4
SH
3513}
3514
cf3ef16d 3515/*
7b50c609
TS
3516 * return the host uid/gid to which the container root is mapped in
3517 * *val.
0b3a6504 3518 * Return true if id was found, false otherwise.
cf3ef16d 3519 */
2a9a80cb 3520bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3521 unsigned long *val)
cf3ef16d
SH
3522{
3523 struct lxc_list *it;
3524 struct id_map *map;
3525
3526 lxc_list_for_each(it, &conf->id_map) {
3527 map = it->elem;
7b50c609 3528 if (map->idtype != idtype)
cf3ef16d
SH
3529 continue;
3530 if (map->nsid != 0)
3531 continue;
2a9a80cb
SH
3532 *val = map->hostid;
3533 return true;
cf3ef16d 3534 }
2a9a80cb 3535 return false;
cf3ef16d
SH
3536}
3537
2133f58c 3538int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3539{
3540 struct lxc_list *it;
3541 struct id_map *map;
3542 lxc_list_for_each(it, &conf->id_map) {
3543 map = it->elem;
2133f58c 3544 if (map->idtype != idtype)
cf3ef16d
SH
3545 continue;
3546 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3547 return (id - map->hostid) + map->nsid;
cf3ef16d 3548 }
57d116ab 3549 return -1;
cf3ef16d
SH
3550}
3551
339efad9 3552int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3553{
3554 struct lxc_list *it;
3555 struct id_map *map;
2133f58c 3556 unsigned int freeid = 0;
cf3ef16d
SH
3557again:
3558 lxc_list_for_each(it, &conf->id_map) {
3559 map = it->elem;
2133f58c 3560 if (map->idtype != idtype)
cf3ef16d
SH
3561 continue;
3562 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3563 freeid = map->nsid + map->range;
3564 goto again;
3565 }
3566 }
3567 return freeid;
3568}
3569
19a26f82
MK
3570int lxc_find_gateway_addresses(struct lxc_handler *handler)
3571{
3572 struct lxc_list *network = &handler->conf->network;
3573 struct lxc_list *iterator;
3574 struct lxc_netdev *netdev;
3575 int link_index;
3576
3577 lxc_list_for_each(iterator, network) {
3578 netdev = iterator->elem;
3579
3580 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3581 continue;
3582
3583 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3584 ERROR("gateway = auto only supported for "
3585 "veth and macvlan");
3586 return -1;
3587 }
3588
3589 if (!netdev->link) {
3590 ERROR("gateway = auto needs a link interface");
3591 return -1;
3592 }
3593
3594 link_index = if_nametoindex(netdev->link);
3595 if (!link_index)
3596 return -EINVAL;
3597
3598 if (netdev->ipv4_gateway_auto) {
3599 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3600 ERROR("failed to automatically find ipv4 gateway "
3601 "address from link interface '%s'", netdev->link);
3602 return -1;
3603 }
3604 }
3605
3606 if (netdev->ipv6_gateway_auto) {
3607 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3608 ERROR("failed to automatically find ipv6 gateway "
3609 "address from link interface '%s'", netdev->link);
3610 return -1;
3611 }
3612 }
3613 }
3614
3615 return 0;
3616}
3617
5e4a62bf 3618int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3619{
5e4a62bf 3620 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3621 int i, ret;
b0a33c1e 3622
5e4a62bf
DL
3623 /* no tty in the configuration */
3624 if (!conf->tty)
b0a33c1e 3625 return 0;
3626
9e1045e3 3627 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3628 if (!tty_info->pty_info) {
9e1045e3
CB
3629 SYSERROR("failed to allocate struct *pty_info");
3630 return -ENOMEM;
b0a33c1e 3631 }
3632
985d15b1 3633 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3634 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3635
025ed0f3
SH
3636 process_lock();
3637 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3638 pty_info->name, NULL, NULL);
025ed0f3
SH
3639 process_unlock();
3640 if (ret) {
9e1045e3 3641 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3642 tty_info->nbtty = i;
3643 lxc_delete_tty(tty_info);
9e1045e3 3644 return -ENOTTY;
b0a33c1e 3645 }
3646
9e1045e3 3647 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3648 pty_info->name, pty_info->master, pty_info->slave);
3649
3ec1648d 3650 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3651 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3652 if (ret < 0)
3653 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3654 "pty device \"%s\": %s",
3655 pty_info->master, pty_info->name, strerror(errno));
3656
3657 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3658 if (ret < 0)
3659 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3660 "pty device \"%s\": %s",
3661 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3662
b0a33c1e 3663 pty_info->busy = 0;
3664 }
3665
985d15b1 3666 tty_info->nbtty = conf->tty;
1ac470c0 3667
9e1045e3 3668 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3669 return 0;
b0a33c1e 3670}
3671
3672void lxc_delete_tty(struct lxc_tty_info *tty_info)
3673{
3674 int i;
3675
3676 for (i = 0; i < tty_info->nbtty; i++) {
3677 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3678
3679 close(pty_info->master);
3680 close(pty_info->slave);
3681 }
3682
3683 free(tty_info->pty_info);
e00c0242 3684 tty_info->pty_info = NULL;
b0a33c1e 3685 tty_info->nbtty = 0;
3686}
3687
f4f52cb5
CB
3688
/* Callback that execs "lxc-usernsexec" (looked up via PATH) with @args as
 * its NULL-terminated argument vector.
 *
 * Returns -1; that statement is only reached if execvp() returns.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
3694
f6d3e3e4 3695/*
7b50c609
TS
3696 * chown_mapped_root: for an unprivileged user with uid/gid X to
3697 * chown a dir to subuid/subgid Y, he needs to run chown as root
3698 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3699 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3700 * root is privileged with respect to hostuid/hostgid X, allowing
3701 * him to do the chown.
f6d3e3e4 3702 */
c4d10a05 3703int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3704{
f4f52cb5 3705 uid_t rootuid, rootgid;
2a9a80cb 3706 unsigned long val;
a7ef8753 3707 char *chownpath = path;
f4f52cb5
CB
3708 int hostuid, hostgid, ret;
3709 struct stat sb;
3710 char map1[100], map2[100], map3[100], map4[100], map5[100];
3711 char ugid[100];
3712 char *args1[] = {"lxc-usernsexec",
3713 "-m", map1,
3714 "-m", map2,
3715 "-m", map3,
3716 "-m", map5,
3717 "--", "chown", ugid, path,
3718 NULL};
3719 char *args2[] = {"lxc-usernsexec",
3720 "-m", map1,
3721 "-m", map2,
3722 "-m", map3,
3723 "-m", map4,
3724 "-m", map5,
3725 "--", "chown", ugid, path,
3726 NULL};
3727 char cmd_output[MAXPATHLEN];
3728
3729 hostuid = geteuid();
3730 hostgid = getegid();
f6d3e3e4 3731
2a9a80cb 3732 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3733 ERROR("No uid mapping for container root");
c4d10a05 3734 return -1;
f6d3e3e4 3735 }
f4f52cb5 3736 rootuid = (uid_t)val;
7b50c609 3737 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3738 ERROR("No gid mapping for container root");
7b50c609
TS
3739 return -1;
3740 }
f4f52cb5 3741 rootgid = (gid_t)val;
2a9a80cb 3742
a7ef8753 3743 /*
f4f52cb5 3744 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3745 */
1f92162d 3746 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3747 chownpath = strchr(path, ':');
3748 if (!chownpath) {
3749 ERROR("Bad overlay path: %s", path);
3750 return -1;
3751 }
f4f52cb5 3752 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3753 if (!chownpath) {
3754 ERROR("Bad overlay path: %s", path);
3755 return -1;
3756 }
3757 chownpath++;
3758 }
3759 path = chownpath;
f4f52cb5 3760 if (hostuid == 0) {
7b50c609 3761 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3762 ERROR("Error chowning %s", path);
3763 return -1;
3764 }
3765 return 0;
3766 }
f3d7e4ca 3767
f4f52cb5 3768 if (rootuid == hostuid) {
f3d7e4ca
SH
3769 // nothing to do
3770 INFO("%s: container root is our uid; no need to chown" ,__func__);
3771 return 0;
3772 }
3773
f4f52cb5
CB
3774 // save the current gid of "path"
3775 if (stat(path, &sb) < 0) {
3776 ERROR("Error stat %s", path);
f6d3e3e4
SH
3777 return -1;
3778 }
7b50c609 3779
f4f52cb5
CB
3780 /*
3781 * A file has to be group-owned by a gid mapped into the
3782 * container, or the container won't be privileged over it.
3783 */
3784 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3785 if (sb.st_uid == hostuid &&
3786 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3787 chown(path, -1, hostgid) < 0) {
3788 ERROR("Failed chgrping %s", path);
3789 return -1;
3790 }
f6d3e3e4 3791
f4f52cb5
CB
3792 // "u:0:rootuid:1"
3793 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3794 if (ret < 0 || ret >= 100) {
3795 ERROR("Error uid printing map string");
3796 return -1;
3797 }
7b50c609 3798
f4f52cb5
CB
3799 // "u:hostuid:hostuid:1"
3800 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3801 if (ret < 0 || ret >= 100) {
3802 ERROR("Error uid printing map string");
3803 return -1;
3804 }
c4d10a05 3805
f4f52cb5
CB
3806 // "g:0:rootgid:1"
3807 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3808 if (ret < 0 || ret >= 100) {
3809 ERROR("Error gid printing map string");
3810 return -1;
3811 }
98e5ba51 3812
f4f52cb5
CB
3813 // "g:pathgid:rootgid+pathgid:1"
3814 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3815 rootgid + (gid_t)sb.st_gid);
3816 if (ret < 0 || ret >= 100) {
3817 ERROR("Error gid printing map string");
3818 return -1;
3819 }
c4d10a05 3820
f4f52cb5
CB
3821 // "g:hostgid:hostgid:1"
3822 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3823 if (ret < 0 || ret >= 100) {
3824 ERROR("Error gid printing map string");
3825 return -1;
3826 }
7b50c609 3827
f4f52cb5
CB
3828 // "0:pathgid" (chown)
3829 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3830 if (ret < 0 || ret >= 100) {
3831 ERROR("Error owner printing format string for chown");
3832 return -1;
3833 }
7b50c609 3834
f4f52cb5
CB
3835 if (hostgid == sb.st_gid)
3836 ret = run_command(cmd_output, sizeof(cmd_output),
3837 chown_mapped_root_exec_wrapper,
3838 (void *)args1);
3839 else
3840 ret = run_command(cmd_output, sizeof(cmd_output),
3841 chown_mapped_root_exec_wrapper,
3842 (void *)args2);
3843 if (ret < 0)
3844 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3845
f4f52cb5 3846 return ret;
f6d3e3e4
SH
3847}
3848
54117de5 3849int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3850{
c4d10a05 3851 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3852 return 0;
c4d10a05 3853
54117de5
CB
3854 if (!strcmp(c->console.name, ""))
3855 return 0;
3856
3857 if (chown_mapped_root(c->console.name, c) < 0) {
3858 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
3859 return -1;
3860 }
3861
54117de5
CB
3862 TRACE("chowned console \"%s\"", c->console.name);
3863
f6d3e3e4
SH
3864 return 0;
3865}
3866
943144d9
CB
3867/* NOTE: Must not be called from inside the container namespace! */
3868int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3869{
3870 int mounted;
3871
943144d9 3872 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3873 if (mounted == -1) {
943144d9 3874 SYSERROR("failed to mount /proc in the container");
01958b1f 3875 /* continue only if there is no rootfs */
943144d9 3876 if (conf->rootfs.path)
01958b1f 3877 return -1;
5112cd70 3878 } else if (mounted == 1) {
943144d9 3879 conf->tmp_umount_proc = 1;
5112cd70 3880 }
943144d9 3881
5112cd70
SH
3882 return 0;
3883}
3884
3885void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3886{
3887 if (lxc_conf->tmp_umount_proc == 1) {
3888 umount("/proc");
3889 lxc_conf->tmp_umount_proc = 0;
3890 }
3891}
3892
/* Walk /proc/self/mountinfo and remount with MS_SLAVE every entry whose
 * options field contains "shared".
 *
 * All failures are logged and otherwise ignored, so container startup
 * continues.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *mounts;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	for (;;) {
		char *target, *opts;

		if (getline(&line, &cap, mounts) == -1)
			break;

		target = get_field(line, 4);
		if (!target)
			continue;

		/* The options field is looked up relative to the target. */
		opts = get_field(target, 2);
		if (!opts)
			continue;

		null_endofword(opts);
		if (!strstr(opts, "shared"))
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(mounts);
	free(line);
}
3926
2322903b
SH
3927void lxc_execute_bind_init(struct lxc_conf *conf)
3928{
3929 int ret;
9d9c111c
SH
3930 char path[PATH_MAX], destpath[PATH_MAX], *p;
3931
3932 /* If init exists in the container, don't bind mount a static one */
3933 p = choose_init(conf->rootfs.mount);
3934 if (p) {
3935 free(p);
3936 return;
3937 }
2322903b
SH
3938
3939 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3940 if (ret < 0 || ret >= PATH_MAX) {
3941 WARN("Path name too long searching for lxc.init.static");
3942 return;
3943 }
3944
3945 if (!file_exists(path)) {
3946 INFO("%s does not exist on host", path);
3947 return;
3948 }
3949
3950 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3951 if (ret < 0 || ret >= PATH_MAX) {
3952 WARN("Path name too long for container's lxc.init.static");
3953 return;
3954 }
3955
3956 if (!file_exists(destpath)) {
3957 FILE * pathfile = fopen(destpath, "wb");
3958 if (!pathfile) {
3959 SYSERROR("Failed to create mount target '%s'", destpath);
3960 return;
3961 }
3962 fclose(pathfile);
3963 }
3964
592fd47a 3965 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3966 if (ret < 0)
3967 SYSERROR("Failed to bind lxc.init.static into container");
3968 INFO("lxc.init.static bound into container at %s", path);
3969}
3970
35120d9c
SH
3971/*
3972 * This does the work of remounting / if it is shared, calling the
3973 * container pre-mount hooks, and mounting the rootfs.
3974 */
3975int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3976{
35120d9c
SH
3977 if (conf->rootfs_setup) {
3978 /*
3979 * rootfs was set up in another namespace. bind-mount it
3980 * to give us a mount in our own ns so we can pivot_root to it
3981 */
3982 const char *path = conf->rootfs.mount;
3983 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3984 ERROR("Failed to bind-mount container / onto itself");
145832ba 3985 return -1;
35120d9c 3986 }
145832ba 3987 return 0;
35120d9c 3988 }
d4ef7c50 3989
e995d7a2
SH
3990 remount_all_slave();
3991
35120d9c
SH
3992 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3993 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3994 return -1;
3995 }
3996
9aa76a17 3997 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
3998 ERROR("failed to setup rootfs for '%s'", name);
3999 return -1;
4000 }
4001
4002 conf->rootfs_setup = true;
4003 return 0;
4004}
4005
1c1c7051
SH
4006static bool verify_start_hooks(struct lxc_conf *conf)
4007{
4008 struct lxc_list *it;
4009 char path[MAXPATHLEN];
4010 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4011 char *hookname = it->elem;
4012 struct stat st;
4013 int ret;
4014
4015 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4016 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4017 if (ret < 0 || ret >= MAXPATHLEN)
4018 return false;
4019 ret = stat(path, &st);
4020 if (ret) {
7b6753e7 4021 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4022 hookname);
4023 return false;
4024 }
6a0c909a 4025 return true;
1c1c7051
SH
4026 }
4027
4028 return true;
4029}
4030
ae467c54 4031static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 4032{
ae467c54
CB
4033 int i;
4034 int *ttyfds;
4035 struct lxc_pty_info *pty_info;
e8bd4e43
SH
4036 struct lxc_conf *conf = handler->conf;
4037 const struct lxc_tty_info *tty_info = &conf->tty_info;
e8bd4e43 4038 int sock = handler->ttysock[0];
ae467c54
CB
4039 int ret = -1;
4040 size_t num_ttyfds = (2 * conf->tty);
e8bd4e43 4041
ae467c54
CB
4042 ttyfds = malloc(num_ttyfds * sizeof(int));
4043 if (!ttyfds)
4044 return -1;
4045
4046 for (i = 0; i < num_ttyfds; i++) {
4047 pty_info = &tty_info->pty_info[i / 2];
4048 ttyfds[i++] = pty_info->slave;
4049 ttyfds[i] = pty_info->master;
4050 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
f07fa8df
CB
4051 "parent",
4052 pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
4053 }
4054
ae467c54
CB
4055 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4056 if (ret < 0)
4057 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4058 strerror(errno));
4059 else
4060 TRACE("sent %d ttys to parent", conf->tty);
4061
e8bd4e43
SH
4062 close(handler->ttysock[0]);
4063 close(handler->ttysock[1]);
4064
ae467c54
CB
4065 for (i = 0; i < num_ttyfds; i++)
4066 close(ttyfds[i]);
e8bd4e43 4067
ae467c54
CB
4068 free(ttyfds);
4069
4070 return ret;
e8bd4e43
SH
4071}
4072
35120d9c
SH
4073int lxc_setup(struct lxc_handler *handler)
4074{
4075 const char *name = handler->name;
4076 struct lxc_conf *lxc_conf = handler->conf;
4077 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4078
4079 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4080 ERROR("Error setting up rootfs mount after spawn");
4081 return -1;
4082 }
4083
6c544cb3
MM
4084 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4085 if (setup_utsname(lxc_conf->utsname)) {
4086 ERROR("failed to setup the utsname for '%s'", name);
4087 return -1;
4088 }
0ad19a3f 4089 }
4090
5f4535a3 4091 if (setup_network(&lxc_conf->network)) {
36eb9bde 4092 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4093 return -1;
0ad19a3f 4094 }
4095
bc6928ff 4096 if (lxc_conf->autodev > 0) {
14221cbb 4097 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4098 ERROR("failed to mount /dev in the container");
c6883f38
SH
4099 return -1;
4100 }
4101 }
4102
368bbc02
CS
4103 /* do automatic mounts (mainly /proc and /sys), but exclude
4104 * those that need to wait until other stuff has finished
4105 */
4fb3cba5 4106 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4107 ERROR("failed to setup the automatic mounts for '%s'", name);
4108 return -1;
4109 }
4110
0a2dddd4 4111 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4112 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4113 return -1;
576f946d 4114 }
4115
0a2dddd4 4116 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4117 ERROR("failed to setup the mount entries for '%s'", name);
4118 return -1;
4119 }
4120
7b6753e7 4121 /* Make sure any start hooks are in the container */
1c1c7051
SH
4122 if (!verify_start_hooks(lxc_conf))
4123 return -1;
4124
2322903b
SH
4125 if (lxc_conf->is_execute)
4126 lxc_execute_bind_init(lxc_conf);
4127
368bbc02
CS
4128 /* now mount only cgroup, if wanted;
4129 * before, /sys could not have been mounted
4130 * (is either mounted automatically or via fstab entries)
4131 */
4fb3cba5 4132 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4133 ERROR("failed to setup the automatic mounts for '%s'", name);
4134 return -1;
4135 }
4136
283678ed 4137 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4138 ERROR("failed to run mount hooks for container '%s'.", name);
4139 return -1;
4140 }
4141
bc6928ff 4142 if (lxc_conf->autodev > 0) {
283678ed 4143 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4144 ERROR("failed to run autodev hooks for container '%s'.", name);
4145 return -1;
4146 }
27245ff7 4147 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4148 ERROR("failed to populate /dev in the container");
4149 return -1;
4150 }
4151 }
368bbc02 4152
3d7d929a 4153 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4154 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4155 return -1;
6e590161 4156 }
4157
7e0e1d94
AV
4158 if (lxc_conf->kmsg) {
4159 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4160 ERROR("failed to setup kmsg for '%s'", name);
4161 }
1bd051a6 4162
69aa6655
DE
4163 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4164 ERROR("failed to setup /dev symlinks for '%s'", name);
4165 return -1;
4166 }
4167
5112cd70 4168 /* mount /proc if it's not already there */
943144d9 4169 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4170 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4171 return -1;
e075f5d9 4172 }
e075f5d9 4173
ac778708 4174 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4175 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4176 return -1;
ed502555 4177 }
4178
70761e5e 4179 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4180 ERROR("failed to setup the new pts instance");
95b5ffaf 4181 return -1;
3c26f34e 4182 }
4183
e8bd4e43
SH
4184 if (lxc_create_tty(name, lxc_conf)) {
4185 ERROR("failed to create the ttys");
4186 return -1;
4187 }
4188
ae467c54 4189 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
4190 ERROR("failure sending console info to parent");
4191 return -1;
4192 }
4193
9e1045e3 4194 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4195 ERROR("failed to setup the ttys for '%s'", name);
4196 return -1;
4197 }
4198
4199 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4200 SYSERROR("failed to set environment variable for container ptys");
4201
4202
cccc74b5
DL
4203 if (setup_personality(lxc_conf->personality)) {
4204 ERROR("failed to setup personality");
4205 return -1;
4206 }
4207
97a8f74f
SG
4208 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4209 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4210 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4211 return -1;
4212 }
97a8f74f
SG
4213 if (dropcaps_except(&lxc_conf->keepcaps)) {
4214 ERROR("failed to keep requested caps");
4215 return -1;
4216 }
4217 } else if (setup_caps(&lxc_conf->caps)) {
4218 ERROR("failed to drop capabilities");
4219 return -1;
81810dd1
DL
4220 }
4221
cd54d859
DL
4222 NOTICE("'%s' is setup.", name);
4223
0ad19a3f 4224 return 0;
4225}
26ddeedd 4226
283678ed
SH
4227int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4228 const char *lxcpath, char *argv[])
26ddeedd
SH
4229{
4230 int which = -1;
4231 struct lxc_list *it;
4232
4233 if (strcmp(hook, "pre-start") == 0)
4234 which = LXCHOOK_PRESTART;
5ea6163a
SH
4235 else if (strcmp(hook, "pre-mount") == 0)
4236 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4237 else if (strcmp(hook, "mount") == 0)
4238 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4239 else if (strcmp(hook, "autodev") == 0)
4240 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4241 else if (strcmp(hook, "start") == 0)
4242 which = LXCHOOK_START;
52492063
WB
4243 else if (strcmp(hook, "stop") == 0)
4244 which = LXCHOOK_STOP;
26ddeedd
SH
4245 else if (strcmp(hook, "post-stop") == 0)
4246 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4247 else if (strcmp(hook, "clone") == 0)
4248 which = LXCHOOK_CLONE;
37cf711b
SY
4249 else if (strcmp(hook, "destroy") == 0)
4250 which = LXCHOOK_DESTROY;
26ddeedd
SH
4251 else
4252 return -1;
4253 lxc_list_for_each(it, &conf->hooks[which]) {
4254 int ret;
4255 char *hookname = it->elem;
283678ed 4256 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4257 if (ret)
4258 return ret;
4259 }
4260 return 0;
4261}
72d0e1cb 4262
427b3a21 4263static void lxc_remove_nic(struct lxc_list *it)
72d0e1cb
SG
4264{
4265 struct lxc_netdev *netdev = it->elem;
9ebb03ad 4266 struct lxc_list *it2,*next;
72d0e1cb
SG
4267
4268 lxc_list_del(it);
4269
f10fad2f
ME
4270 free(netdev->link);
4271 free(netdev->name);
4272 if (netdev->type == LXC_NET_VETH)
c9bb9a85 4273 free(netdev->priv.veth_attr.pair);
f10fad2f
ME
4274 free(netdev->upscript);
4275 free(netdev->hwaddr);
4276 free(netdev->mtu);
4277 free(netdev->ipv4_gateway);
4278 free(netdev->ipv6_gateway);
9ebb03ad 4279 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4280 lxc_list_del(it2);
4281 free(it2->elem);
4282 free(it2);
4283 }
9ebb03ad 4284 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4285 lxc_list_del(it2);
4286 free(it2->elem);
4287 free(it2);
4288 }
d95db067 4289 free(netdev);
72d0e1cb
SG
4290 free(it);
4291}
4292
4293/* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
12a50cc6 4294int lxc_clear_nic(struct lxc_conf *c, const char *key)
72d0e1cb
SG
4295{
4296 char *p1;
4297 int ret, idx, i;
4298 struct lxc_list *it;
4299 struct lxc_netdev *netdev;
4300
46cd2845 4301 p1 = strchr(key, '.');
72d0e1cb
SG
4302 if (!p1 || *(p1+1) == '\0')
4303 p1 = NULL;
4304
4305 ret = sscanf(key, "%d", &idx);
4306 if (ret != 1) return -1;
4307 if (idx < 0)
4308 return -1;
4309
4310 i = 0;
4311 lxc_list_for_each(it, &c->network) {
4312 if (i == idx)
4313 break;
4314 i++;
4315 }
4316 if (i < idx) // we don't have that many nics defined
4317 return -1;
4318
4319 if (!it || !it->elem)
4320 return -1;
4321
4322 netdev = it->elem;
4323
4324 if (!p1) {
4325 lxc_remove_nic(it);
52d21d40 4326 } else if (strcmp(p1, ".ipv4") == 0) {
9ebb03ad
DE
4327 struct lxc_list *it2,*next;
4328 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4329 lxc_list_del(it2);
4330 free(it2->elem);
4331 free(it2);
4332 }
52d21d40 4333 } else if (strcmp(p1, ".ipv6") == 0) {
9ebb03ad
DE
4334 struct lxc_list *it2,*next;
4335 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4336 lxc_list_del(it2);
4337 free(it2->elem);
4338 free(it2);
4339 }
72d0e1cb
SG
4340 }
4341 else return -1;
4342
4343 return 0;
4344}
4345
4346int lxc_clear_config_network(struct lxc_conf *c)
4347{
9ebb03ad
DE
4348 struct lxc_list *it,*next;
4349 lxc_list_for_each_safe(it, &c->network, next) {
72d0e1cb
SG
4350 lxc_remove_nic(it);
4351 }
4352 return 0;
4353}
4354
4355int lxc_clear_config_caps(struct lxc_conf *c)
4356{
9ebb03ad 4357 struct lxc_list *it,*next;
72d0e1cb 4358
9ebb03ad 4359 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4360 lxc_list_del(it);
4361 free(it->elem);
4362 free(it);
4363 }
4364 return 0;
4365}
4366
74a3920a 4367static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4368 struct lxc_list *it, *next;
4369
4355ab5f 4370 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4371 lxc_list_del(it);
4372 free(it->elem);
4373 free(it);
4374 }
4375 return 0;
4376}
4377
4355ab5f
SH
4378int lxc_clear_idmaps(struct lxc_conf *c)
4379{
4380 return lxc_free_idmap(&c->id_map);
4381}
4382
1fb86a7c
SH
4383int lxc_clear_config_keepcaps(struct lxc_conf *c)
4384{
4385 struct lxc_list *it,*next;
4386
4387 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4388 lxc_list_del(it);
4389 free(it->elem);
4390 free(it);
4391 }
4392 return 0;
4393}
4394
12a50cc6 4395int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4396{
9ebb03ad 4397 struct lxc_list *it,*next;
72d0e1cb 4398 bool all = false;
a6390f01 4399 const char *k = NULL;
72d0e1cb
SG
4400
4401 if (strcmp(key, "lxc.cgroup") == 0)
4402 all = true;
a6390f01
WB
4403 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4404 k = key + sizeof("lxc.cgroup.")-1;
4405 else
4406 return -1;
72d0e1cb 4407
9ebb03ad 4408 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4409 struct lxc_cgroup *cg = it->elem;
4410 if (!all && strcmp(cg->subsystem, k) != 0)
4411 continue;
4412 lxc_list_del(it);
4413 free(cg->subsystem);
4414 free(cg->value);
4415 free(cg);
4416 free(it);
4417 }
4418 return 0;
4419}
4420
c6d09e15
WB
4421int lxc_clear_limits(struct lxc_conf *c, const char *key)
4422{
4423 struct lxc_list *it, *next;
4424 bool all = false;
4425 const char *k = NULL;
4426
4427 if (strcmp(key, "lxc.limit") == 0)
4428 all = true;
4429 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4430 k = key + sizeof("lxc.limit.")-1;
4431 else
4432 return -1;
4433
4434 lxc_list_for_each_safe(it, &c->limits, next) {
4435 struct lxc_limit *lim = it->elem;
4436 if (!all && strcmp(lim->resource, k) != 0)
4437 continue;
4438 lxc_list_del(it);
4439 free(lim->resource);
4440 free(lim);
4441 free(it);
4442 }
4443 return 0;
4444}
4445
ee1e7aa0
SG
4446int lxc_clear_groups(struct lxc_conf *c)
4447{
4448 struct lxc_list *it,*next;
4449
4450 lxc_list_for_each_safe(it, &c->groups, next) {
4451 lxc_list_del(it);
4452 free(it->elem);
4453 free(it);
4454 }
4455 return 0;
4456}
4457
ab799c0b
SG
4458int lxc_clear_environment(struct lxc_conf *c)
4459{
4460 struct lxc_list *it,*next;
4461
4462 lxc_list_for_each_safe(it, &c->environment, next) {
4463 lxc_list_del(it);
4464 free(it->elem);
4465 free(it);
4466 }
4467 return 0;
4468}
4469
4470
72d0e1cb
SG
4471int lxc_clear_mount_entries(struct lxc_conf *c)
4472{
9ebb03ad 4473 struct lxc_list *it,*next;
72d0e1cb 4474
9ebb03ad 4475 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4476 lxc_list_del(it);
4477 free(it->elem);
4478 free(it);
4479 }
4480 return 0;
4481}
4482
b099e9e9
SH
4483int lxc_clear_automounts(struct lxc_conf *c)
4484{
4485 c->auto_mounts = 0;
4486 return 0;
4487}
4488
12a50cc6 4489int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4490{
9ebb03ad 4491 struct lxc_list *it,*next;
17ed13a3 4492 bool all = false, done = false;
a6390f01 4493 const char *k = NULL;
72d0e1cb
SG
4494 int i;
4495
17ed13a3
SH
4496 if (strcmp(key, "lxc.hook") == 0)
4497 all = true;
a6390f01
WB
4498 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4499 k = key + sizeof("lxc.hook.")-1;
4500 else
4501 return -1;
17ed13a3 4502
72d0e1cb 4503 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4504 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4505 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4506 lxc_list_del(it);
4507 free(it->elem);
4508 free(it);
4509 }
4510 done = true;
72d0e1cb
SG
4511 }
4512 }
17ed13a3
SH
4513
4514 if (!done) {
4515 ERROR("Invalid hook key: %s", key);
4516 return -1;
4517 }
72d0e1cb
SG
4518 return 0;
4519}
8eb5694b 4520
74a3920a 4521static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4522{
4523 int i;
4524
0cf45501 4525 if (!conf->saved_nics)
7b35f3d6
SH
4526 return;
4527 for (i=0; i < conf->num_savednics; i++)
4528 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4529 free(conf->saved_nics);
4530}
4531
4184c3e1
SH
4532static inline void lxc_clear_aliens(struct lxc_conf *conf)
4533{
4534 struct lxc_list *it,*next;
4535
4536 lxc_list_for_each_safe(it, &conf->aliens, next) {
4537 lxc_list_del(it);
4538 free(it->elem);
4539 free(it);
4540 }
4541}
4542
c7b15d1e 4543void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4544{
4545 struct lxc_list *it,*next;
4546
4547 lxc_list_for_each_safe(it, &conf->includes, next) {
4548 lxc_list_del(it);
4549 free(it->elem);
4550 free(it);
4551 }
4552}
4553
8eb5694b
SH
4554void lxc_conf_free(struct lxc_conf *conf)
4555{
4556 if (!conf)
4557 return;
858377e4
SH
4558 if (current_config == conf)
4559 current_config = NULL;
f10fad2f
ME
4560 free(conf->console.log_path);
4561 free(conf->console.path);
4562 free(conf->rootfs.mount);
b3b8c97f 4563 free(conf->rootfs.bdev_type);
f10fad2f
ME
4564 free(conf->rootfs.options);
4565 free(conf->rootfs.path);
f10fad2f 4566 free(conf->logfile);
858377e4
SH
4567 if (conf->logfd != -1)
4568 close(conf->logfd);
f10fad2f
ME
4569 free(conf->utsname);
4570 free(conf->ttydir);
4571 free(conf->fstab);
4572 free(conf->rcfile);
4573 free(conf->init_cmd);
6b0d5538 4574 free(conf->unexpanded_config);
393903d1 4575 free(conf->pty_names);
76d0127f 4576 free(conf->syslog);
8eb5694b 4577 lxc_clear_config_network(conf);
f10fad2f
ME
4578 free(conf->lsm_aa_profile);
4579 free(conf->lsm_se_context);
769872f9 4580 lxc_seccomp_free(conf);
8eb5694b 4581 lxc_clear_config_caps(conf);
1fb86a7c 4582 lxc_clear_config_keepcaps(conf);
8eb5694b 4583 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4584 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4585 lxc_clear_mount_entries(conf);
7b35f3d6 4586 lxc_clear_saved_nics(conf);
27c27d73 4587 lxc_clear_idmaps(conf);
ee1e7aa0 4588 lxc_clear_groups(conf);
f979ac15 4589 lxc_clear_includes(conf);
761d81ca 4590 lxc_clear_aliens(conf);
ab799c0b 4591 lxc_clear_environment(conf);
c6d09e15 4592 lxc_clear_limits(conf, "lxc.limit");
8eb5694b
SH
4593 free(conf);
4594}
4355ab5f
SH
4595
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function the child calls once it has been told to proceed. */
	int (*fn)(void *);
	/* Name of @fn for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to @fn. */
	void *arg;
	/* Pipe used for synchronization: p[0] is read by the child,
	 * p[1] is written by the parent.
	 */
	int p[2];
};
4602
4603static int run_userns_fn(void *data)
4604{
4605 struct userns_fn_data *d = data;
4606 char c;
4355ab5f 4607
f8aa4bf3 4608 /* Close write end of the pipe. */
4355ab5f 4609 close(d->p[1]);
f8aa4bf3
CB
4610
4611 /* Wait for parent to finish establishing a new mapping in the user
4612 * namespace we are executing in.
4613 */
4355ab5f
SH
4614 if (read(d->p[0], &c, 1) != 1)
4615 return -1;
f8aa4bf3
CB
4616
4617 /* Close read end of the pipe. */
4355ab5f 4618 close(d->p[0]);
f8aa4bf3 4619
c9b7c33e
CB
4620 if (d->fn_name)
4621 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 4622 /* Call function to run. */
4355ab5f
SH
4623 return d->fn(d->arg);
4624}
4625
339efad9 4626static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
4627 enum idtype idtype)
4628{
4629 struct lxc_list *it;
4630 struct id_map *map;
4631 struct id_map *retmap = NULL;
4632
4633 lxc_list_for_each(it, &conf->id_map) {
4634 map = it->elem;
4635 if (map->idtype != idtype)
4636 continue;
4637
4638 if (id >= map->hostid && id < map->hostid + map->range) {
4639 retmap = map;
4640 break;
4641 }
4642 }
4643
4644 if (!retmap)
4645 return NULL;
4646
4647 retmap = malloc(sizeof(*retmap));
4648 if (!retmap)
4649 return NULL;
4650
4651 memcpy(retmap, map, sizeof(*retmap));
4652 return retmap;
4653}
4654
4355ab5f 4655/*
f8aa4bf3
CB
4656 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4657 * existing one or establish a new one.
4355ab5f 4658 */
28a2d9e7 4659static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 4660{
28a2d9e7 4661 int hostid_mapped;
f8aa4bf3 4662 struct id_map *entry = NULL;
f8aa4bf3 4663
28a2d9e7
CB
4664 /* Reuse existing mapping. */
4665 entry = mapped_hostid_entry(conf, id, type);
4666 if (entry)
4667 return entry;
f8aa4bf3 4668
28a2d9e7
CB
4669 /* Find new mapping. */
4670 hostid_mapped = find_unmapped_nsid(conf, type);
4671 if (hostid_mapped < 0) {
4672 DEBUG("failed to find free mapping for id %d", id);
4673 return NULL;
f8aa4bf3 4674 }
f8aa4bf3 4675
28a2d9e7
CB
4676 entry = malloc(sizeof(*entry));
4677 if (!entry)
4678 return NULL;
4355ab5f 4679
28a2d9e7
CB
4680 entry->idtype = type;
4681 entry->nsid = hostid_mapped;
4682 entry->hostid = (unsigned long)id;
4683 entry->range = 1;
4355ab5f 4684
28a2d9e7 4685 return entry;
4355ab5f
SH
4686}
4687
f8aa4bf3
CB
4688/* Run a function in a new user namespace.
4689 * The caller's euid/egid will be mapped if it is not already.
4690 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4691 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4692 * This means we require only to establish a mapping from:
4693 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4694 * - the container root -> some sub{g,u}id
4695 * The former we add, if the user did not specifiy a mapping. The latter we
4696 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4697 * there to start the container in the first place.
4355ab5f 4698 */
c9b7c33e
CB
4699int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4700 const char *fn_name)
4355ab5f 4701{
f8aa4bf3
CB
4702 pid_t pid;
4703 uid_t euid, egid;
4355ab5f 4704 struct userns_fn_data d;
4355ab5f 4705 int p[2];
f8aa4bf3
CB
4706 struct lxc_list *it;
4707 struct id_map *map;
4708 char c = '1';
4709 int ret = -1;
4710 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4711 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4712 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4713
4355ab5f 4714 ret = pipe(p);
4355ab5f
SH
4715 if (ret < 0) {
4716 SYSERROR("opening pipe");
4717 return -1;
4718 }
4719 d.fn = fn;
c9b7c33e 4720 d.fn_name = fn_name;
4355ab5f
SH
4721 d.arg = data;
4722 d.p[0] = p[0];
4723 d.p[1] = p[1];
f8aa4bf3
CB
4724
4725 /* Clone child in new user namespace. */
4355ab5f 4726 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4727 if (pid < 0) {
4728 ERROR("failed to clone child process in new user namespace");
4729 goto on_error;
4730 }
4731
4355ab5f 4732 close(p[0]);
4355ab5f
SH
4733 p[0] = -1;
4734
f8aa4bf3
CB
4735 /* Find container root. */
4736 lxc_list_for_each(it, &conf->id_map) {
4737 map = it->elem;
4738
4739 if (map->nsid != 0)
4740 continue;
4741
4742 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4743 container_root_uid = malloc(sizeof(*container_root_uid));
4744 if (!container_root_uid)
4745 goto on_error;
4746 container_root_uid->idtype = map->idtype;
4747 container_root_uid->hostid = map->hostid;
4748 container_root_uid->nsid = 0;
4749 container_root_uid->range = map->range;
4750 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4751 container_root_gid = malloc(sizeof(*container_root_gid));
4752 if (!container_root_gid)
4753 goto on_error;
4754 container_root_gid->idtype = map->idtype;
4755 container_root_gid->hostid = map->hostid;
4756 container_root_gid->nsid = 0;
4757 container_root_gid->range = map->range;
4758 }
4759
4760 /* Found container root. */
4761 if (container_root_uid && container_root_gid)
4762 break;
4763 }
4764
4765 /* This is actually checked earlier but it can't hurt. */
4766 if (!container_root_uid || !container_root_gid) {
4767 ERROR("no mapping for container root found");
4768 goto on_error;
4769 }
4770
1d90e064
CB
4771 host_uid_map = container_root_uid;
4772 host_gid_map = container_root_gid;
4773
f8aa4bf3
CB
4774 /* Check whether the {g,u}id of the user has a mapping. */
4775 euid = geteuid();
4776 egid = getegid();
1d90e064 4777 if (euid != container_root_uid->hostid)
28a2d9e7
CB
4778 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4779
1d90e064 4780 if (egid != container_root_gid->hostid)
28a2d9e7
CB
4781 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4782
4783 if (!host_uid_map) {
4784 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4785 goto on_error;
4786 }
4787
28a2d9e7
CB
4788 if (!host_gid_map) {
4789 DEBUG("failed to find mapping for gid %d", egid);
4790 goto on_error;
4791 }
4792
4793 /* Allocate new {g,u}id map list. */
4794 idmap = malloc(sizeof(*idmap));
4795 if (!idmap)
4796 goto on_error;
4797 lxc_list_init(idmap);
4798
f8aa4bf3
CB
4799 /* Add container root to the map. */
4800 tmplist = malloc(sizeof(*tmplist));
4801 if (!tmplist)
4802 goto on_error;
4803 lxc_list_add_elem(tmplist, container_root_uid);
4804 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4805
1d90e064 4806 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4807 /* idmap will now keep track of that memory. */
4808 container_root_uid = NULL;
4809
4810 /* Add container root to the map. */
4811 tmplist = malloc(sizeof(*tmplist));
4812 if (!tmplist)
4813 goto on_error;
4814 lxc_list_add_elem(tmplist, host_uid_map);
4815 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4816 }
1d90e064
CB
4817 /* idmap will now keep track of that memory. */
4818 container_root_uid = NULL;
4819 /* idmap will now keep track of that memory. */
4820 host_uid_map = NULL;
f8aa4bf3
CB
4821
4822 tmplist = malloc(sizeof(*tmplist));
4823 if (!tmplist)
4824 goto on_error;
4825 lxc_list_add_elem(tmplist, container_root_gid);
4826 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4827
1d90e064 4828 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4829 /* idmap will now keep track of that memory. */
4830 container_root_gid = NULL;
4831
4832 tmplist = malloc(sizeof(*tmplist));
4833 if (!tmplist)
4834 goto on_error;
4835 lxc_list_add_elem(tmplist, host_gid_map);
4836 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4837 }
1d90e064
CB
4838 /* idmap will now keep track of that memory. */
4839 container_root_gid = NULL;
4840 /* idmap will now keep track of that memory. */
4841 host_gid_map = NULL;
f8aa4bf3 4842
77803ee7
CB
4843 if (lxc_log_get_level() == LXC_LOG_PRIORITY_TRACE ||
4844 conf->loglevel == LXC_LOG_PRIORITY_TRACE) {
f8aa4bf3
CB
4845 lxc_list_for_each(it, idmap) {
4846 map = it->elem;
4847 TRACE("establishing %cid mapping for \"%d\" in new "
4848 "user namespace: nsuid %lu - hostid %lu - range "
4849 "%lu",
4850 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4851 map->nsid, map->hostid, map->range);
4852 }
4355ab5f
SH
4853 }
4854
f8aa4bf3 4855 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4856 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4857 if (ret < 0) {
4858 ERROR("error setting up {g,u}id mappings for child process "
4859 "\"%d\"",
4860 pid);
4861 goto on_error;
4355ab5f
SH
4862 }
4863
f8aa4bf3 4864 /* Tell child to proceed. */
4355ab5f 4865 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4866 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4867 goto on_error;
4355ab5f
SH
4868 }
4869
f8aa4bf3 4870 /* Wait for child to finish. */
3139aead
SG
4871 ret = wait_for_pid(pid);
4872
f8aa4bf3 4873on_error:
1d90e064
CB
4874 if (idmap)
4875 lxc_free_idmap(idmap);
4876 if (container_root_uid)
4877 free(container_root_uid);
4878 if (container_root_gid)
4879 free(container_root_gid);
4880 if (host_uid_map && (host_uid_map != container_root_uid))
4881 free(host_uid_map);
4882 if (host_gid_map && (host_gid_map != container_root_gid))
4883 free(host_gid_map);
3139aead 4884
4355ab5f
SH
4885 if (p[0] != -1)
4886 close(p[0]);
4887 close(p[1]);
f8aa4bf3
CB
4888
4889 return ret;
4355ab5f 4890}
97e9cfa0 4891
/*
 * Return a strdup()ed copy of the login name belonging to the effective
 * uid, or NULL when there is no passwd entry for it. The caller frees the
 * result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4903
/*
 * Return a strdup()ed copy of the group name belonging to the effective
 * gid, or NULL when there is no group entry for it. The caller frees the
 * result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4915
a96a8e8c 4916/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4917void suggest_default_idmap(void)
4918{
4919 FILE *f;
4920 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4921 char *line = NULL;
4922 char *uname, *gname;
4923 size_t len = 0;
4924
4925 if (!(uname = getuname()))
4926 return;
4927
4928 if (!(gname = getgname())) {
4929 free(uname);
4930 return;
4931 }
4932
4933 f = fopen(subuidfile, "r");
4934 if (!f) {
4935 ERROR("Your system is not configured with subuids");
4936 free(gname);
4937 free(uname);
4938 return;
4939 }
4940 while (getline(&line, &len, f) != -1) {
b7930180 4941 size_t no_newline = 0;
97e9cfa0
SH
4942 char *p = strchr(line, ':'), *p2;
4943 if (*line == '#')
4944 continue;
4945 if (!p)
4946 continue;
4947 *p = '\0';
4948 p++;
4949 if (strcmp(line, uname))
4950 continue;
4951 p2 = strchr(p, ':');
4952 if (!p2)
4953 continue;
4954 *p2 = '\0';
4955 p2++;
4956 if (!*p2)
4957 continue;
b7930180
CB
4958 no_newline = strcspn(p2, "\n");
4959 p2[no_newline] = '\0';
4960
b7b2fde4
CB
4961 if (lxc_safe_uint(p, &uid) < 0)
4962 WARN("Could not parse UID.");
4963 if (lxc_safe_uint(p2, &urange) < 0)
4964 WARN("Could not parse UID range.");
97e9cfa0
SH
4965 }
4966 fclose(f);
4967
6be7389a 4968 f = fopen(subgidfile, "r");
97e9cfa0
SH
4969 if (!f) {
4970 ERROR("Your system is not configured with subgids");
4971 free(gname);
4972 free(uname);
4973 return;
4974 }
4975 while (getline(&line, &len, f) != -1) {
b7930180 4976 size_t no_newline = 0;
97e9cfa0
SH
4977 char *p = strchr(line, ':'), *p2;
4978 if (*line == '#')
4979 continue;
4980 if (!p)
4981 continue;
4982 *p = '\0';
4983 p++;
4984 if (strcmp(line, uname))
4985 continue;
4986 p2 = strchr(p, ':');
4987 if (!p2)
4988 continue;
4989 *p2 = '\0';
4990 p2++;
4991 if (!*p2)
4992 continue;
b7930180
CB
4993 no_newline = strcspn(p2, "\n");
4994 p2[no_newline] = '\0';
4995
b7b2fde4
CB
4996 if (lxc_safe_uint(p, &gid) < 0)
4997 WARN("Could not parse GID.");
4998 if (lxc_safe_uint(p2, &grange) < 0)
4999 WARN("Could not parse GID range.");
97e9cfa0
SH
5000 }
5001 fclose(f);
5002
f10fad2f 5003 free(line);
97e9cfa0
SH
5004
5005 if (!urange || !grange) {
5006 ERROR("You do not have subuids or subgids allocated");
5007 ERROR("Unprivileged containers require subuids and subgids");
5008 return;
5009 }
5010
5011 ERROR("You must either run as root, or define uid mappings");
5012 ERROR("To pass uid mappings to lxc-create, you could create");
5013 ERROR("~/.config/lxc/default.conf:");
5014 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5015 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5016 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5017
5018 free(gname);
5019 free(uname);
5020}
aaf26830 5021
a7307747
SH
5022static void free_cgroup_settings(struct lxc_list *result)
5023{
5024 struct lxc_list *iterator, *next;
5025
5026 lxc_list_for_each_safe(iterator, result, next) {
5027 lxc_list_del(iterator);
5028 free(iterator);
5029 }
5030 free(result);
5031}
5032
aaf26830
KT
5033/*
5034 * Return the list of cgroup_settings sorted according to the following rules
5035 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5036 */
5037struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5038{
5039 struct lxc_list *result;
5040 struct lxc_list *memsw_limit = NULL;
5041 struct lxc_list *it = NULL;
5042 struct lxc_cgroup *cg = NULL;
5043 struct lxc_list *item = NULL;
5044
5045 result = malloc(sizeof(*result));
fac7c663
KT
5046 if (!result) {
5047 ERROR("failed to allocate memory to sort cgroup settings");
5048 return NULL;
5049 }
aaf26830
KT
5050 lxc_list_init(result);
5051
5052 /*Iterate over the cgroup settings and copy them to the output list*/
5053 lxc_list_for_each(it, cgroup_settings) {
5054 item = malloc(sizeof(*item));
fac7c663
KT
5055 if (!item) {
5056 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 5057 free_cgroup_settings(result);
fac7c663
KT
5058 return NULL;
5059 }
aaf26830
KT
5060 item->elem = it->elem;
5061 cg = it->elem;
5062 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5063 /* Store the memsw_limit location */
5064 memsw_limit = item;
5065 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 5066 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
5067 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5068 item->elem = memsw_limit->elem;
5069 memsw_limit->elem = it->elem;
5070 }
5071 lxc_list_add_tail(result, item);
5072 }
5073
5074 return result;
a7307747 5075}